From b2731f0968b1cf538c3d47eee88f63d3b745bc57 Mon Sep 17 00:00:00 2001 From: Danny Robson Date: Sun, 6 Dec 2020 06:21:35 +1000 Subject: [PATCH] chunk/map: add a windowed buzhash --- emory/chunk/map.cpp | 53 +++++++++++++++++++++++++++++------------- emory/chunk/region.hpp | 6 +++++ 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/emory/chunk/map.cpp b/emory/chunk/map.cpp index 6e3c36c..bdcebf9 100644 --- a/emory/chunk/map.cpp +++ b/emory/chunk/map.cpp @@ -17,36 +17,57 @@ using emory::chunk::map; /////////////////////////////////////////////////////////////////////////////// map::map ( - cruft::view data, + cruft::view src, emory::chunk::params const &p ) { using hash_type = cruft::hash::buzhash; - hash_type h (p.window, data); - auto remain = data.consume (p.window); + if (src.size () < p.window) + return; using digest_type = hash_type::digest_type ; digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits); - for (u08 const *cursor = remain.begin (), *start = data.begin (); cursor != remain.end (); cursor++) { - auto const digest = h (cursor); + u64 hash_state = 0; + auto start = src.begin (); - if (std::distance (start, cursor) < p.minimum) - continue; + for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) { + hash_state = 0; + cursor += p.minimum - p.window; + for (std::size_t i = 0; i < p.window; ++i) + hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++; - if (unlikely (digest & mask)) - continue; + for ( ; cursor < src.end () - p.window; ++cursor) { + if (likely (hash_state & mask)) { + hash_state = cruft::rotatel (hash_state, 1) + ^ cruft::rotatel (u64 (*(cursor - p.window)), p.window) + ^ *cursor; + continue; + } - cruft::view const region { start, cursor }; - start = cursor + 1; + cruft::view const region { start, cursor }; + + elements.push_back ({ + .offset = { + std::distance (src.begin (), start), + std::distance (src.begin (), cursor) + }, + .digest = static_hash {} (region) + }); + + start = cursor; + break; + } + } + + if (start != src.end ()) { + cruft::view const region { start, src.end () }; elements.push_back ({ .offset = { - std::pair { - std::distance (data.begin (), region.begin ()), - std::distance (data.begin (), region.end ()) - }, + std::distance (src.begin (), start), + std::distance (src.begin (), src.end ()) }, - .digest = static_hash {} (region), + .digest = static_hash {} (region) }); } } diff --git a/emory/chunk/region.hpp b/emory/chunk/region.hpp index 956c0df..1dfc3a3 100644 --- a/emory/chunk/region.hpp +++ b/emory/chunk/region.hpp @@ -20,6 +20,12 @@ namespace emory::chunk { std::pair offset; static_hash::digest_t digest; + off64_t size (void) const + { + CHECK_LE (offset.first, offset.second); + return offset.second - offset.first; + } + struct digest_equality { bool operator() (region const &a, region const &b) const {