From af35a7ffe0bc5b20ec2a1fd97b75d1db563a59cb Mon Sep 17 00:00:00 2001
From: Danny Robson
Date: Tue, 23 Apr 2019 21:34:01 +1000
Subject: [PATCH] emory-compare: add basic duplication estimator

---
Review notes (text between "---" and the first "diff --git" is ignored by
git-am):

 * This copy of the patch had lost its line breaks, and text in angle
   brackets appears to have been stripped. Layout and leading whitespace
   have been reconstructed, so the context and removed lines of the
   CMakeLists.txt hunk may need --ignore-whitespace to apply.
 * The blob id on the emory-compare.cpp "index" line predates the review
   changes listed here.
 * emory-compare.cpp: the #include targets and the buzhash template
   argument could not be recovered and are flagged with NOTE(review).
 * emory-compare.cpp: designated initialisers for emory::params now follow
   the declaration order; the boundary mask no longer shifts by an
   out-of-range count; the usage message lists its arguments; progress
   messages are printed before the work they describe; operator<< restores
   the stream state; comparator call operators are const; enumerators use a
   single ARG_ prefix.

 CMakeLists.txt    |  11 ++-
 emory-compare.cpp | 247 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 251 insertions(+), 7 deletions(-)
 create mode 100644 emory-compare.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1102277..6125b98 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,10 +23,7 @@ add_subdirectory(cruft/crypto)
 
 
 ###############################################################################
-add_executable (emory
-    emory.cpp
-)
-
-
-##-----------------------------------------------------------------------------
-target_link_libraries(emory cruft-crypto cruft-util)
+foreach (t emory emory-compare)
+    add_executable ("${t}" "${t}.cpp")
+    target_link_libraries("${t}" cruft-crypto cruft-util)
+endforeach()
diff --git a/emory-compare.cpp b/emory-compare.cpp
new file mode 100644
index 0000000..c4def70
--- /dev/null
+++ b/emory-compare.cpp
@@ -0,0 +1,247 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2013 Danny Robson
+ */
+
+// NOTE(review): the header names on the #include lines below are missing
+// from this copy of the patch. Restore them from the original commit: the
+// file needs declarations for cruft::view, cruft::hash::buzhash,
+// cruft::crypto::hash::SHA1, cruft::parse::from_string, cruft::mapped_file,
+// CHECK, u08 and the std:: facilities used below.
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+
+namespace emory {
+    /// Digest used to identify the contents of a chunk.
+    using static_hash = cruft::crypto::hash::SHA1;
+
+    /// Tunables for the chunking done by the `map` constructor.
+    struct params {
+        /// Number of low bits of the buzhash value that must be zero at a
+        /// chunk boundary.
+        std::size_t bits;
+        /// Window size given to the buzhash and consumed from the front of
+        /// the data before scanning starts.
+        std::size_t window;
+        /// Minimum distance in bytes between the start of a chunk and the
+        /// position at which it may be closed.
+        std::ptrdiff_t minimum;
+    };
+
+    /// A region of the input and the digest of its contents.
+    struct chunk {
+        /// Offsets of the region's begin and end from the start of the
+        /// input.
+        /// NOTE(review): element type inferred from the std::distance calls
+        /// that initialise it -- confirm against the original commit.
+        std::pair<std::ptrdiff_t, std::ptrdiff_t> offset;
+        static_hash::digest_t digest;
+
+        /// Equality on the digest only; offsets are ignored.
+        struct digest_equality {
+            bool operator() (chunk const &a, chunk const &b) const
+            {
+                return a.digest == b.digest;
+            }
+        };
+
+
+        /// Ordering on the digest only; offsets are ignored.
+        struct digest_comparator {
+            bool operator() (chunk const &a, chunk const &b) const
+            {
+                return a.digest < b.digest;
+            }
+        };
+    };
+
+
+    /// Writes both offsets as zero-padded hexadecimal. The stream's fill
+    /// character and format flags are restored before returning.
+    std::ostream& operator<< (std::ostream &os, chunk const &val)
+    {
+        auto const flags = os.flags ();
+        auto const fill  = os.fill ('0');
+
+        os << std::hex
+           << "{ first: 0x"  << std::setw (8) << val.offset.first
+           << ", second: 0x" << std::setw (8) << val.offset.second
+           << " }";
+
+        os.fill (fill);
+        os.flags (flags);
+        return os;
+    }
+
+
+    /// The chunks found in one input.
+    struct map {
+        /// Splits `data` into chunks and records each with its digest.
+        /// NOTE(review): the view's template argument is inferred from the
+        /// `u08 const*` cursors used in the definition -- confirm.
+        map (cruft::view<u08 const*> data, params const&);
+        std::vector<chunk> elements;
+    };
+
+
+    /// Returns the chunks of `a` whose digest also occurs in `b`. Both
+    /// element lists must already be sorted with chunk::digest_comparator.
+    std::vector<chunk> common (map const &a, map const &b);
+}
+
+
+/// Walks `data`, feeding each position past the first window to a buzhash,
+/// and closes a chunk wherever the low `p.bits` bits of the hash value are
+/// all zero and at least `p.minimum` bytes separate it from the chunk start.
+/// Each closed chunk is stored with its offsets and its static_hash digest.
+/// Data after the last boundary is not stored.
+emory::map::map (cruft::view<u08 const*> data, const emory::params &p)
+{
+    // NOTE(review): buzhash probably took a template argument that is
+    // missing from this copy of the patch -- restore it from the original.
+    using hash_type = cruft::hash::buzhash;
+    hash_type h (p.window, data);
+    auto remain = data.consume (p.window);
+
+    using digest_type = hash_type::digest_type;
+    constexpr std::size_t digest_bits = sizeof (digest_type) * 8;
+
+    // `bits` comes straight from the command line. Shifting by the full
+    // width of the type (bits == 0) or by a wrapped-around count (bits
+    // larger than the digest) is undefined, so handle both ends explicitly.
+    digest_type mask = ~digest_type (0);
+    if (p.bits == 0)
+        mask = 0;
+    else if (p.bits < digest_bits)
+        mask = ~digest_type (0) >> (digest_bits - p.bits);
+
+    for (u08 const *cursor = remain.begin (), *start = data.begin (); cursor != remain.end (); ++cursor) {
+        auto const digest = h (cursor);
+
+        if (std::distance (start, cursor) < p.minimum)
+            continue;
+
+        // A boundary needs every masked bit to be clear, which is the rare
+        // case, so this branch carries no `unlikely` hint.
+        if (digest & mask)
+            continue;
+
+        cruft::view const region { start, cursor };
+        // NOTE(review): the region ends at `cursor` but the next one starts
+        // at `cursor + 1`; if the view's end is exclusive the byte at
+        // `cursor` lands in no chunk -- confirm this is intended.
+        start = cursor + 1;
+
+        elements.push_back ({
+            .offset = {
+                std::pair {
+                    std::distance (data.begin (), region.begin ()),
+                    std::distance (data.begin (), region.end ())
+                },
+            },
+            .digest = static_hash {} (region),
+        });
+    }
+}
+
+
+/// Merge-walks the two sorted element lists and collects, from `a`, one
+/// chunk for every pair of equal digests.
+std::vector<emory::chunk>
+emory::common (emory::map const &a, emory::map const &b)
+{
+    CHECK (std::is_sorted (a.elements.begin (), a.elements.end (), emory::chunk::digest_comparator {}));
+    CHECK (std::is_sorted (b.elements.begin (), b.elements.end (), emory::chunk::digest_comparator {}));
+
+    std::vector<emory::chunk> res;
+
+    auto a_cursor = a.elements.begin ();
+    auto b_cursor = b.elements.begin ();
+
+    while (a_cursor != a.elements.end () && b_cursor != b.elements.end ()) {
+        if (a_cursor->digest < b_cursor->digest) {
+            ++a_cursor;
+            continue;
+        }
+
+        if (b_cursor->digest < a_cursor->digest) {
+            ++b_cursor;
+            continue;
+        }
+
+        // Equal digests: report the chunk from `a` and step past both so
+        // that repeated digests are matched pairwise.
+        res.push_back (*a_cursor);
+        ++a_cursor;
+        ++b_cursor;
+    }
+
+    return res;
+}
+
+
+// Positions of the command line arguments.
+enum {
+    ARG_SELF,
+    ARG_BITS,
+    ARG_WINDOW,
+    ARG_MINIMUM,
+    ARG_TARGET,
+    ARG_SOURCE,
+
+    NUM_ARGS,
+};
+
+
+int main (int argc, char const **argv)
+{
+    if (argc != NUM_ARGS) {
+        // argv[0] may be null when argc is zero, so do not stream it blindly.
+        std::cerr << "usage: " << (argc > 0 ? argv[ARG_SELF] : "emory-compare")
+                  << " <bits> <window> <minimum> <target> <source>\n";
+        return EXIT_FAILURE;
+    }
+
+    // Designated initialisers must follow the declaration order of
+    // emory::params: bits, window, minimum.
+    // NOTE(review): the from_string template arguments are inferred from the
+    // member types -- confirm against the original commit.
+    emory::params const p {
+        .bits    = cruft::parse::from_string<std::size_t> (argv[ARG_BITS]),
+        .window  = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]),
+        .minimum = cruft::parse::from_string<std::ptrdiff_t> (argv[ARG_MINIMUM]),
+    };
+
+    // The map constructor does the chunking and hashing, so announce each
+    // input before it is constructed.
+    std::clog << "Hashing target\n";
+    emory::map target (cruft::mapped_file (argv[ARG_TARGET]), p);
+    std::clog << "Hashing source\n";
+    emory::map source (cruft::mapped_file (argv[ARG_SOURCE]), p);
+
+    // common() requires both lists to be ordered by digest.
+    std::sort (target.elements.begin (), target.elements.end (), emory::chunk::digest_comparator {});
+    std::sort (source.elements.begin (), source.elements.end (), emory::chunk::digest_comparator {});
+
+    std::clog << "Finding common\n";
+    auto const found = common (target, source);
+    std::size_t total = 0;
+    for (auto const &i: found) {
+        std::cout << i << '\n';
+        total += i.offset.second - i.offset.first;
+    }
+
+    std::cout << "Found " << found.size () << " chunks of " << total << " bytes\n";
+}