/* * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * Copyright 2013 Danny Robson */ #include #include #include #include #include #include struct chunk { cruft::view data; cruft::crypto::hash::SHA1::digest_t digest; }; struct digest_equality { bool operator() (chunk const &a, chunk const &b) { return a.digest == b.digest; } }; enum { ARG_SELF, ARG_BITS, ARG_WINDOW, ARGS_MINIMUM, ARGS_INPUT, NUM_ARGS, }; int main (int argc, char const **argv) { if (argc != NUM_ARGS) { std::cerr << "usage: " << argv[ARG_SELF] << " \n"; return EXIT_FAILURE; } auto const window = cruft::parse::from_string (argv[ARG_WINDOW]); auto const bits = cruft::parse::from_string (argv[ARG_BITS ]); auto const minimum_size = cruft::parse::from_string (argv[ARGS_MINIMUM]); cruft::mapped_file src (argv[ARGS_INPUT]); cruft::view bytes (src); using hash_type = cruft::hash::buzhash; hash_type h (window, bytes); bytes = bytes.consume (window); using digest_type = hash_type::digest_type ; digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - bits); std::vector chunks; for (u08 const *cursor = bytes.begin (), *start = src.data (); cursor != bytes.end (); cursor++) { auto const res = h (cursor); if (std::distance (start, cursor) < minimum_size) continue; if (unlikely (res & mask)) continue; cruft::view const region { start, cursor }; start = cursor + 1; chunks.push_back ({ .data = region, .digest = cruft::crypto::hash::SHA1 {} (region), }); } std::sort ( std::begin (chunks), std::end (chunks), [] (auto const &a, auto const &b) { return a.digest < b.digest; }); std::size_t reduction = 0; for (auto cursor = chunks.begin (); cursor != chunks.end (); ++cursor) { auto first_match = std::adjacent_find ( cursor, chunks.end (), digest_equality {} ); if (first_match == chunks.end ()) break; auto last_match = std::find_if_not ( first_match, chunks.end (), [&first_match] (auto const &i) { return i.digest == first_match->digest; }); auto const count = std::distance (first_match, last_match); auto const size = first_match->data.size (); std::cout << count << " duplicates of " << size << " bytes\n"; reduction += (count - 1) * size; cursor = last_match; } std::cout << chunks.size () << " chunks found\n"; std::cout << float (reduction) / src.size () << " reduction\n"; }