/* * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * Copyright 2013 Danny Robson */ #include #include #include #include #include #include #include #include namespace emory { using static_hash = cruft::crypto::hash::SHA1; struct params { std::size_t bits; std::size_t window; std::ptrdiff_t minimum; }; struct chunk { std::pair offset; static_hash::digest_t digest; struct digest_equality { bool operator() (chunk const &a, chunk const &b) const { return a.digest == b.digest; } }; struct digest_ordering { bool operator() (chunk const &a, chunk const &b) const { return a.digest < b.digest; } }; }; template struct match { struct side { IdT id; chunk data; }; side src, dst; struct src_equality { bool operator() (match const &a, match const &b) const { return a.src == b.src; } }; struct dst_equality { bool operator() (match const &a, match const &b) const { return chunk::digest_equality {} ( a.dst.data, b.dst.data ); } }; struct src_ordering { bool operator() (match const &a, match const &b) const { return chunk::digest_ordering {} (a.src.data, b.src.data); } }; }; std::ostream& operator<< (std::ostream &os, chunk const &val) { return os << "{ first: 0x" << std::setw (8) << std::setfill ('0') << std::hex << val.offset.first << ", second: 0x" << std::setw (8) << std::setfill ('0') << std::hex << val.offset.second << " }" << std::dec; } struct map { map (cruft::view data, params const&); std::vector elements; }; template std::ostream& operator<< (std::ostream &os, match const &val) { return os << "{ src: { id: " << val.src.id << ", data: " << val.src.data << " }" << ", dst: { id: " << val.dst.id << ", data: " << val.dst.data << " }" << " }"; } std::vector> common (map const &a, map const &b); } emory::map::map (cruft::view data, const emory::params &p) { using hash_type = cruft::hash::buzhash; hash_type h (p.window, data); auto remain = data.consume (p.window); using digest_type = hash_type::digest_type ; digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits); for (u08 const *cursor = remain.begin (), *start = data.begin (); cursor != remain.end (); cursor++) { auto const digest = h (cursor); if (std::distance (start, cursor) < p.minimum) continue; if (unlikely (digest & mask)) continue; cruft::view const region { start, cursor }; start = cursor + 1; elements.push_back ({ .offset = { std::pair { std::distance (data.begin (), region.begin ()), std::distance (data.begin (), region.end ()) }, }, .digest = static_hash {} (region), }); } } std::vector< emory::match > emory::common (emory::map const &src, emory::map const &dst) { CHECK (std::is_sorted (src.elements.begin (), src.elements.end (), emory::chunk::digest_ordering {})); CHECK (std::is_sorted (dst.elements.begin (), dst.elements.end (), emory::chunk::digest_ordering {})); std::vector> res; for (auto src_cursor = src.elements.begin (), dst_cursor = dst.elements.begin (); src_cursor != src.elements.end () && dst_cursor != dst.elements.end (); /* nothing */) { if (src_cursor->digest < dst_cursor->digest) { ++src_cursor; continue; } if (dst_cursor->digest < src_cursor->digest) { ++dst_cursor; continue; } res.push_back ({ .src = { .id = &src, .data = *src_cursor }, .dst = { .id = &dst, .data = *dst_cursor }, }); ++src_cursor; ++dst_cursor; } return res; } enum { ARG_SELF, ARG_BITS, ARG_WINDOW, ARGS_MINIMUM, ARGS_TARGET, ARGS_SOURCE, NUM_ARGS, }; int main (int argc, char const **argv) { if (argc < NUM_ARGS) { std::cerr << "usage: " << argv[ARG_SELF] << " [...]\n"; return EXIT_FAILURE; } emory::params const p { .bits = cruft::parse::from_string (argv[ARG_BITS ]), .window = cruft::parse::from_string (argv[ARG_WINDOW]), .minimum = cruft::parse::from_string (argv[ARGS_MINIMUM]), }; std::clog << "Hashing target\n"; emory::map target (cruft::mapped_file (argv[ARGS_TARGET]), p); std::sort (target.elements.begin (), target.elements.end (), emory::chunk::digest_ordering {}); std::cout << "Found " << target.elements.size () << " chunks\n"; std::vector> found; for (int i = ARGS_SOURCE; i != argc; ++i) { auto const path = argv[i]; std::clog << "Hashing source: " << path << '\n'; emory::map source (cruft::mapped_file (path), p); std::sort (source.elements.begin (), source.elements.end (), emory::chunk::digest_ordering {}); std::clog << "Finding common\n"; auto const &source_matches = common (target, source); std::clog << "Discovered " << source_matches.size () << " blocks\n"; std::transform ( std::begin (source_matches), std::end (source_matches), std::back_inserter (found), [&] (auto const &j) -> emory::match { return { .src = { i, j.src.data }, .dst = { ARGS_TARGET, j.dst.data }, }; }); std::sort ( std::begin (found), std::end (found), emory::match::src_ordering {} ); found.erase ( std::unique ( std::begin (found), std::end (found), emory::match::dst_equality {} ), found.end () ); } std::clog << "Finalising\n"; std::size_t matching = 0; std::vector source_bytes (argc - ARGS_SOURCE, 0); for (auto const &i: found) { std::cout << i << '\n'; auto const size = i.dst.data.offset.second - i.dst.data.offset.first; matching += size; source_bytes[i.src.id - ARGS_SOURCE] += size; } std::size_t const total = std::accumulate ( target.elements.begin (), target.elements.end (), 0u, [] (auto const &a, auto const &b) { return a + b.offset.second - b.offset.first; } ); std::cout << "Found " << found.size () << " chunks. " << matching << "/" << total << " bytes for a factor of " << float (matching) / total << "\n"; for (int i = ARGS_SOURCE; i != argc; ++i) std::cerr << argv[i] << " contributed: " << source_bytes[i - ARGS_SOURCE] << '\n'; }