/* * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * Copyright 2013 Danny Robson */ #include #include #include #include #include #include #include #include namespace emory { using static_hash = cruft::crypto::hash::SHA1; struct params { std::size_t bits; std::size_t window; std::ptrdiff_t minimum; }; struct chunk { std::pair offset; static_hash::digest_t digest; struct digest_equality { bool operator() (chunk const &a, chunk const &b) { return a.digest == b.digest; } }; struct digest_comparator { bool operator() (chunk const &a, chunk const &b) { return a.digest < b.digest; } }; }; std::ostream& operator<< (std::ostream &os, chunk const &val) { return os << "{ first: 0x" << std::setw (8) << std::setfill ('0') << std::hex << val.offset.first << ", second: 0x" << std::setw (8) << std::setfill ('0') << std::hex << val.offset.second << " }" << std::dec; } struct map { map (cruft::view data, params const&); std::vector elements; }; std::vector common (map const &a, map const &b); } emory::map::map (cruft::view data, const emory::params &p) { using hash_type = cruft::hash::buzhash; hash_type h (p.window, data); auto remain = data.consume (p.window); using digest_type = hash_type::digest_type ; digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits); for (u08 const *cursor = remain.begin (), *start = data.begin (); cursor != remain.end (); cursor++) { auto const digest = h (cursor); if (std::distance (start, cursor) < p.minimum) continue; if (unlikely (digest & mask)) continue; cruft::view const region { start, cursor }; start = cursor + 1; elements.push_back ({ .offset = { std::pair { std::distance (data.begin (), region.begin ()), std::distance (data.begin (), region.end ()) }, }, .digest = static_hash {} (region), }); } } std::vector emory::common (emory::map const &a, emory::map const &b) { CHECK (std::is_sorted (a.elements.begin (), a.elements.end (), emory::chunk::digest_comparator {})); CHECK (std::is_sorted (b.elements.begin (), b.elements.end (), emory::chunk::digest_comparator {})); std::vector res; for (auto a_cursor = a.elements.begin (), b_cursor = b.elements.begin (); a_cursor != a.elements.end () && b_cursor != b.elements.end (); /* nothing */) { if (a_cursor->digest < b_cursor->digest) { ++a_cursor; continue; } if (b_cursor->digest < a_cursor->digest) { ++b_cursor; continue; } res.push_back (*a_cursor); ++a_cursor; ++b_cursor; } return res; } enum { ARG_SELF, ARG_BITS, ARG_WINDOW, ARGS_MINIMUM, ARGS_TARGET, ARGS_SOURCE, NUM_ARGS, }; int main (int argc, char const **argv) { if (argc != NUM_ARGS) { std::cerr << "usage: " << argv[ARG_SELF] << " \n"; return EXIT_FAILURE; } emory::params const p { .bits = cruft::parse::from_string (argv[ARG_BITS ]), .window = cruft::parse::from_string (argv[ARG_WINDOW]), .minimum = cruft::parse::from_string (argv[ARGS_MINIMUM]), }; emory::map target (cruft::mapped_file (argv[ARGS_TARGET]), p); emory::map source (cruft::mapped_file (argv[ARGS_SOURCE]), p); std::clog << "Hashing target\n"; std::sort (target.elements.begin (), target.elements.end (), emory::chunk::digest_comparator {}); std::clog << "Hashing source\n"; std::sort (source.elements.begin (), source.elements.end (), emory::chunk::digest_comparator {}); std::clog << "Finding common\n"; auto const &found = common (target, source); std::size_t matching = 0; for (auto const &i: found) { std::cout << i << '\n'; matching += i.offset.second - i.offset.first; } std::size_t const total = std::accumulate ( target.elements.begin (), target.elements.end (), 0u, [] (auto const &a, auto const &b) { return a + b.offset.second - b.offset.first; } ); std::cout << "Found " << found.size () << " chunks of " << matching << " bytes for a factor of " << float (matching) / total << "\n"; }