From df7b136f245cf6beee7572eeca2f9f5cdc82ad7c Mon Sep 17 00:00:00 2001 From: Danny Robson Date: Fri, 26 Apr 2019 09:25:41 +1000 Subject: [PATCH] tools/compare: record the source of the matched blocks --- tools/compare.cpp | 59 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/tools/compare.cpp b/tools/compare.cpp index 66a2c44..a9ab558 100644 --- a/tools/compare.cpp +++ b/tools/compare.cpp @@ -32,7 +32,7 @@ namespace emory { static_hash::digest_t digest; struct digest_equality { - bool operator() (chunk const &a, chunk const &b) + bool operator() (chunk const &a, chunk const &b) const { return a.digest == b.digest; } @@ -40,7 +40,7 @@ namespace emory { struct digest_ordering { - bool operator() (chunk const &a, chunk const &b) + bool operator() (chunk const &a, chunk const &b) const { return a.digest < b.digest; } @@ -50,23 +50,35 @@ namespace emory { template struct match { - struct { + struct side { IdT id; chunk data; } src, dst; + struct src_equality { - bool operator() (match const &a, match const &b) + bool operator() (match const &a, match const &b) const { return a.src == b.src; } }; - struct src_ordering { - bool operator() (match const &a, match const &b) + struct dst_equality { + bool operator() (match const &a, match const &b) const { - return a.src < b.src; + return chunk::digest_equality {} ( + a.dst.data, + b.dst.data + ); + } + }; + + + struct src_ordering { + bool operator() (match const &a, match const &b) const + { + return chunk::digest_ordering {} (a.src.data, b.src.data); } }; }; @@ -189,7 +201,7 @@ int main (int argc, char const **argv) emory::map target (cruft::mapped_file (argv[ARGS_TARGET]), p); std::sort (target.elements.begin (), target.elements.end (), emory::chunk::digest_ordering {}); - std::vector found; + std::vector> found; for (int i = ARGS_SOURCE; i != argc; ++i) { auto const path = argv[i]; @@ -205,21 +217,39 @@ int main (int argc, char const **argv) std::begin (source_matches), std::end (source_matches), std::back_inserter (found), - [] (auto const &i) { return i.src.data; } + [&] (auto const &j) -> emory::match + { + return { + .src = { i, j.src.data }, + .dst = { ARGS_TARGET, j.dst.data }, + }; + }); + + std::sort ( + std::begin (found), + std::end (found), + emory::match::src_ordering {} ); - std::sort (std::begin (found), std::end (found), emory::chunk::digest_ordering {}); found.erase ( - std::unique (std::begin (found), std::end (found), emory::chunk::digest_equality {}), + std::unique ( + std::begin (found), + std::end (found), + emory::match::dst_equality {} + ), found.end () ); } + std::clog << "Finalising\n"; std::size_t matching = 0; + std::vector source_bytes (argc - ARGS_SOURCE, 0); for (auto const &i: found) { - std::cout << i << '\n'; - matching += i.offset.second - i.offset.first; + std::cout << i.dst.data << '\n'; + auto const size = i.dst.data.offset.second - i.dst.data.offset.first; + matching += size; + source_bytes[i.src.id - ARGS_SOURCE] += size; } std::size_t const total = std::accumulate ( @@ -230,4 +260,7 @@ int main (int argc, char const **argv) ); std::cout << "Found " << found.size () << " chunks. " << matching << "/" << total << " bytes for a factor of " << float (matching) / total << "\n"; + + for (int i = ARGS_SOURCE; i != argc; ++i) + std::cerr << argv[i] << " contributed: " << source_bytes[i - ARGS_SOURCE] << '\n'; }