/*
 * Review notes (not part of the patch payload).
 *
 * What this is: a git format-patch that bumps the cruft/util submodule and
 * rewrites main in emory.cpp. The removed body tallied, for each of BITS
 * successively narrower masks, how many window hashes had every masked bit
 * clear, and printed one tally per line. The added body estimates how much a
 * file would shrink under content-defined deduplication:
 *
 *   - Arguments are indexed through the anonymous enum, so the expected order
 *     is program, bits, window, input. Any other argc prints the usage line
 *     to std::cerr and returns EXIT_FAILURE.
 *   - A buzhash is constructed from `window` and the mapped file's bytes and
 *     then called once per remaining position. A position where
 *     `res & mask` is zero closes a chunk: the region { start, cursor } is
 *     stored together with its SHA1 digest and `start` moves to cursor + 1.
 *   - Chunks are sorted by digest. Each run of equal digests is printed as
 *     "N duplicates of S bytes" and adds (N - 1) * S to `reduction`.
 *   - Finally the chunk total and reduction divided by src.size () are
 *     printed.
 *
 * NOTE(review): the text looks extraction-damaged. Line breaks are collapsed
 * and everything that sat between angle brackets seems to be missing (include
 * targets, template arguments such as the element type of `chunks`, the
 * placeholders in the usage string, the author address). Treat it as
 * read-only reference; regenerate from the repository before applying.
 *
 * NOTE(review): in the duplicate-run loop, `cursor = last_match;` is followed
 * by the for-statement's `++cursor`. When the final run reaches
 * chunks.end () the iterator is incremented past the end before the loop
 * condition is tested. When it does not, the first element of the next run
 * is skipped, so that run is under-counted or missed.
 *
 * NOTE(review): bytes after the last boundary are never pushed as a chunk,
 * so `chunks.size ()` and the reduction ignore the file's tail.
 *
 * NOTE(review): `start = cursor + 1` while the region ends at `cursor`. If
 * cruft::view treats { first, last } as half-open, the byte at every
 * boundary belongs to no chunk - confirm against cruft::view.
 *
 * NOTE(review): the mask shift is `sizeof (digest_type) * 8 - bits` with
 * `bits` taken from argv unchecked. If digest_type is a built-in unsigned
 * integer, bits == 0 shifts by the full width and a value above the width
 * wraps or goes negative - both need rejecting before the shift.
 */
From f6840ba6c173bf9ff32cba12c3445db53d6a554d Mon Sep 17 00:00:00 2001 From: Danny Robson Date: Tue, 23 Apr 2019 08:55:16 +1000 Subject: [PATCH] emory: compute possible size reduction from deduplication --- cruft/util | 2 +- emory.cpp | 119 +++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 98 insertions(+), 23 deletions(-) diff --git a/cruft/util b/cruft/util index e1d5639..cc9b9b1 160000 --- a/cruft/util +++ b/cruft/util @@ -1 +1 @@ -Subproject commit e1d56395f6b16cb70f8dbb942a117dc01fd7e74c +Subproject commit cc9b9b19c1a9a7002248d7f62053d49002a8b297 diff --git a/emory.cpp b/emory.cpp index 363fd70..33de024 100644 --- a/emory.cpp +++ b/emory.cpp @@ -8,33 +8,108 @@ #include #include +#include +#include + +#include #include + +struct chunk { + cruft::view data; + cruft::crypto::hash::SHA1::digest_t digest; +}; + +struct digest_equality { + bool operator() (chunk const &a, chunk const &b) + { + return a.digest == b.digest; + } +}; + + +enum { + ARG_SELF, + ARG_BITS, + ARG_WINDOW, + ARGS_INPUT, + + NUM_ARGS, +}; + + int main (int argc, char const **argv) { - (void)argc; - - cruft::mapped_file src (argv[1]); - cruft::view bytes (src); - - static constexpr std::size_t BITS = 16; - std::vector counts (BITS, 0); - - static constexpr std::size_t WINDOW = 48; - cruft::hash::buzhash h (WINDOW, bytes); - - for (auto const &val: bytes.consume (WINDOW)) { - auto const res = h (&val); - - std::size_t mask = ~u64(0) >> (64 - BITS); - for (std::size_t i = 0; i < BITS; ++i) { - if ((res & mask) == 0) - counts[i]++; - mask >>= 1; - } + if (argc != NUM_ARGS) { + std::cerr << "usage: " << argv[ARG_SELF] << " \n"; + return EXIT_FAILURE; } - for (auto const &i: counts) - std::cout << i << '\n'; + auto const window = cruft::parse::from_string (argv[ARG_WINDOW]); + auto const bits = cruft::parse::from_string (argv[ARG_BITS ]); + + cruft::mapped_file src (argv[ARGS_INPUT]); + cruft::view bytes (src); + + using hash_type = cruft::hash::buzhash; + hash_type h (window, 
bytes); + bytes = bytes.consume (window); + + using digest_type = hash_type::digest_type ; + digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - bits); + + std::vector chunks; + + for (u08 const *cursor = bytes.begin (), *start = src.data (); cursor != bytes.end (); cursor++) { + auto const res = h (cursor); + + if (res & mask) + continue; + + cruft::view const region { start, cursor }; + start = cursor + 1; + + chunks.push_back ({ + .data = region, + .digest = cruft::crypto::hash::SHA1 {} (region), + }); + } + + std::sort ( + std::begin (chunks), + std::end (chunks), + [] (auto const &a, auto const &b) + { + return a.digest < b.digest; + }); + + std::size_t reduction = 0; + for (auto cursor = chunks.begin (); cursor != chunks.end (); ++cursor) { + auto first_match = std::adjacent_find ( + cursor, chunks.end (), + digest_equality {} + ); + + if (first_match == chunks.end ()) + break; + + auto last_match = std::find_if_not ( + first_match, + chunks.end (), + [&first_match] (auto const &i) + { + return i.digest == first_match->digest; + }); + + auto const count = std::distance (first_match, last_match); + auto const size = first_match->data.size (); + std::cout << count << " duplicates of " << size << " bytes\n"; + reduction += (count - 1) * size; + + cursor = last_match; + } + + std::cout << chunks.size () << " chunks found\n"; + std::cout << float (reduction) / src.size () << " reduction\n"; }