diff --git a/emory/chunk/map.hpp b/emory/chunk/map.hpp index 1c136ae..afb35f0 100644 --- a/emory/chunk/map.hpp +++ b/emory/chunk/map.hpp @@ -20,6 +20,9 @@ namespace emory::chunk { struct map { map (cruft::view data, params const&); + + std::size_t size (void) const noexcept { return elements.size (); } + std::vector elements; }; } diff --git a/tools/analyse.cpp b/tools/analyse.cpp index 2de3816..7a1fbf3 100644 --- a/tools/analyse.cpp +++ b/tools/analyse.cpp @@ -3,32 +3,20 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * - * Copyright 2013 Danny Robson + * Copyright 2019 Danny Robson */ -#include +#include "emory/chunk/map.hpp" +#include "emory/chunk/params.hpp" + #include +#include #include -#include - -#include #include -struct chunk { - cruft::view data; - cruft::crypto::hash::SHA1::digest_t digest; -}; - -struct digest_equality { - bool operator() (chunk const &a, chunk const &b) - { - return a.digest == b.digest; - } -}; - - +/////////////////////////////////////////////////////////////////////////////// enum { ARG_SELF, ARG_BITS, @@ -40,6 +28,7 @@ enum { }; +//----------------------------------------------------------------------------- int main (int argc, char const **argv) { if (argc != NUM_ARGS) { @@ -47,74 +36,14 @@ int main (int argc, char const **argv) return EXIT_FAILURE; } - auto const window = cruft::parse::from_string (argv[ARG_WINDOW]); - auto const bits = cruft::parse::from_string (argv[ARG_BITS ]); - auto const minimum_size = cruft::parse::from_string (argv[ARGS_MINIMUM]); + emory::chunk::params const p { + .bits = cruft::parse::from_string (argv[ARG_BITS ]), + .window = cruft::parse::from_string (argv[ARG_WINDOW]), + .minimum = cruft::parse::from_string (argv[ARGS_MINIMUM]), + }; - cruft::mapped_file src (argv[ARGS_INPUT]); - cruft::view bytes (src); + cruft::mapped_file data (argv[ARGS_INPUT]); + emory::chunk::map src (data, p); - using hash_type = cruft::hash::buzhash; - hash_type h (window, bytes); - bytes = bytes.consume (window); - - using digest_type = hash_type::digest_type ; - digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - bits); - - std::vector chunks; - - for (u08 const *cursor = bytes.begin (), *start = src.data (); cursor != bytes.end (); cursor++) { - auto const res = h (cursor); - - if (std::distance (start, cursor) < minimum_size) - continue; - - if (unlikely (res & mask)) - continue; - - cruft::view const region { start, cursor }; - start = cursor + 1; - - chunks.push_back ({ - .data = region, - .digest = cruft::crypto::hash::SHA1 {} (region), - }); - } - - std::sort ( - std::begin (chunks), - std::end (chunks), - [] (auto const &a, auto const &b) - { - return a.digest < b.digest; - }); - - std::size_t reduction = 0; - for (auto cursor = chunks.begin (); cursor != chunks.end (); ++cursor) { - auto first_match = std::adjacent_find ( - cursor, chunks.end (), - digest_equality {} - ); - - if (first_match == chunks.end ()) - break; - - auto last_match = std::find_if_not ( - first_match, - chunks.end (), - [&first_match] (auto const &i) - { - return i.digest == first_match->digest; - }); - - auto const count = std::distance (first_match, last_match); - auto const size = first_match->data.size (); - std::cout << count << " duplicates of " << size << " bytes\n"; - reduction += (count - 1) * size; - - cursor = last_match; - } - - std::cout << chunks.size () << " chunks found\n"; - std::cout << float (reduction) / src.size () << " reduction\n"; + std::cout << src.size () << " chunks\n"; }