chunk/map: add a windowed buzhash

This commit is contained in:
Danny Robson 2020-12-06 06:21:35 +10:00
parent f4aab7f2c2
commit b2731f0968
2 changed files with 43 additions and 16 deletions

View File

@ -17,36 +17,57 @@ using emory::chunk::map;
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
map::map ( map::map (
cruft::view<u08 const *> data, cruft::view<u08 const *> src,
emory::chunk::params const &p emory::chunk::params const &p
) { ) {
using hash_type = cruft::hash::buzhash<u64>; using hash_type = cruft::hash::buzhash<u64>;
hash_type h (p.window, data); if (src.size () < p.window)
auto remain = data.consume (p.window); return;
using digest_type = hash_type::digest_type ; using digest_type = hash_type::digest_type ;
digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits); digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits);
for (u08 const *cursor = remain.begin (), *start = data.begin (); cursor != remain.end (); cursor++) { u64 hash_state = 0;
auto const digest = h (cursor); auto start = src.begin ();
if (std::distance (start, cursor) < p.minimum) for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) {
continue; hash_state = 0;
cursor += p.minimum - p.window;
for (std::size_t i = 0; i < p.window; ++i)
hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++;
if (unlikely (digest & mask)) for ( ; cursor < src.end () - p.window; ++cursor) {
if (likely (hash_state & mask)) {
hash_state = cruft::rotatel (hash_state, 1)
^ cruft::rotatel (u64 (*(cursor - p.window)), p.window)
^ *cursor;
continue; continue;
}
cruft::view<u08 const*> const region { start, cursor }; cruft::view<u08 const*> const region { start, cursor };
start = cursor + 1;
elements.push_back ({ elements.push_back ({
.offset = { .offset = {
std::pair<std::size_t,std::size_t> { std::distance (src.begin (), start),
std::distance (data.begin (), region.begin ()), std::distance (src.begin (), cursor)
std::distance (data.begin (), region.end ())
}, },
.digest = static_hash {} (region)
});
start = cursor;
break;
}
}
if (start != src.end ()) {
cruft::view<u08 const*> const region { start, src.end () };
elements.push_back ({
.offset = {
std::distance (src.begin (), start),
std::distance (src.begin (), src.end ())
}, },
.digest = static_hash {} (region), .digest = static_hash {} (region)
}); });
} }
} }

View File

@ -20,6 +20,12 @@ namespace emory::chunk {
std::pair<off64_t, off64_t> offset; std::pair<off64_t, off64_t> offset;
static_hash::digest_t digest; static_hash::digest_t digest;
off64_t size (void) const
{
CHECK_LE (offset.first, offset.second);
return offset.second - offset.first;
}
struct digest_equality { struct digest_equality {
bool operator() (region const &a, region const &b) const bool operator() (region const &a, region const &b) const
{ {