From f063bdfd8ef8d27a9bc0651aa19da5fb2c10eb76 Mon Sep 17 00:00:00 2001 From: Danny Robson Date: Fri, 26 Apr 2019 09:08:03 +1000 Subject: [PATCH] tools/compare: allow multiple sources --- tools/compare.cpp | 102 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 76 insertions(+), 26 deletions(-) diff --git a/tools/compare.cpp b/tools/compare.cpp index 8d0c689..66a2c44 100644 --- a/tools/compare.cpp +++ b/tools/compare.cpp @@ -39,7 +39,7 @@ namespace emory { }; - struct digest_comparator { + struct digest_ordering { bool operator() (chunk const &a, chunk const &b) { return a.digest < b.digest; @@ -48,6 +48,30 @@ namespace emory { }; + template + struct match { + struct { + IdT id; + chunk data; + } src, dst; + + struct src_equality { + bool operator() (match const &a, match const &b) + { + return a.src == b.src; + } + }; + + + struct src_ordering { + bool operator() (match const &a, match const &b) + { + return a.src < b.src; + } + }; + }; + + std::ostream& operator<< (std::ostream &os, chunk const &val) { return os << "{ first: 0x" << std::setw (8) << std::setfill ('0') << std::hex << val.offset.first @@ -62,7 +86,7 @@ namespace emory { }; - std::vector common (map const &a, map const &b); + std::vector> common (map const &a, map const &b); } @@ -100,31 +124,36 @@ emory::map::map (cruft::view data, const emory::params &p) } -std::vector -emory::common (emory::map const &a, emory::map const &b) +std::vector< + emory::match +> +emory::common (emory::map const &src, emory::map const &dst) { - CHECK (std::is_sorted (a.elements.begin (), a.elements.end (), emory::chunk::digest_comparator {})); - CHECK (std::is_sorted (b.elements.begin (), b.elements.end (), emory::chunk::digest_comparator {})); + CHECK (std::is_sorted (src.elements.begin (), src.elements.end (), emory::chunk::digest_ordering {})); + CHECK (std::is_sorted (dst.elements.begin (), dst.elements.end (), emory::chunk::digest_ordering {})); - std::vector res; + std::vector> res; - for (auto a_cursor = a.elements.begin (), b_cursor = b.elements.begin (); - a_cursor != a.elements.end () && b_cursor != b.elements.end (); + for (auto src_cursor = src.elements.begin (), dst_cursor = dst.elements.begin (); + src_cursor != src.elements.end () && dst_cursor != dst.elements.end (); /* nothing */) { - if (a_cursor->digest < b_cursor->digest) { - ++a_cursor; + if (src_cursor->digest < dst_cursor->digest) { + ++src_cursor; continue; } - if (b_cursor->digest < a_cursor->digest) { - ++b_cursor; + if (dst_cursor->digest < src_cursor->digest) { + ++dst_cursor; continue; } - res.push_back (*a_cursor); - ++a_cursor; - ++b_cursor; + res.push_back ({ + .src = { .id = &src, .data = *src_cursor }, + .dst = { .id = &dst, .data = *dst_cursor }, + }); + ++src_cursor; + ++dst_cursor; } return res; @@ -145,8 +174,8 @@ enum { int main (int argc, char const **argv) { - if (argc != NUM_ARGS) { - std::cerr << "usage: " << argv[ARG_SELF] << " \n"; + if (argc < NUM_ARGS) { + std::cerr << "usage: " << argv[ARG_SELF] << " [...]\n"; return EXIT_FAILURE; } @@ -156,16 +185,37 @@ int main (int argc, char const **argv) .minimum = cruft::parse::from_string (argv[ARGS_MINIMUM]), }; - emory::map target (cruft::mapped_file (argv[ARGS_TARGET]), p); - emory::map source (cruft::mapped_file (argv[ARGS_SOURCE]), p); - std::clog << "Hashing target\n"; - std::sort (target.elements.begin (), target.elements.end (), emory::chunk::digest_comparator {}); - std::clog << "Hashing source\n"; - std::sort (source.elements.begin (), source.elements.end (), emory::chunk::digest_comparator {}); + emory::map target (cruft::mapped_file (argv[ARGS_TARGET]), p); + std::sort (target.elements.begin (), target.elements.end (), emory::chunk::digest_ordering {}); - std::clog << "Finding common\n"; - auto const &found = common (target, source); + std::vector found; + + for (int i = ARGS_SOURCE; i != argc; ++i) { + auto const path = argv[i]; + std::clog << "Hashing source: " << path << '\n'; + emory::map source (cruft::mapped_file (path), p); + std::sort (source.elements.begin (), source.elements.end (), emory::chunk::digest_ordering {}); + + std::clog << "Finding common\n"; + auto const &source_matches = common (target, source); + std::clog << "Discovered " << source_matches.size () << " blocks\n"; + + std::transform ( + std::begin (source_matches), + std::end (source_matches), + std::back_inserter (found), + [] (auto const &i) { return i.src.data; } + ); + + std::sort (std::begin (found), std::end (found), emory::chunk::digest_ordering {}); + found.erase ( + std::unique (std::begin (found), std::end (found), emory::chunk::digest_equality {}), + found.end () + ); + } + + std::clog << "Finalising\n"; std::size_t matching = 0; for (auto const &i: found) { std::cout << i << '\n';