chunk/map: add a windowed buzhash
This commit is contained in:
parent
f4aab7f2c2
commit
b2731f0968
@ -17,36 +17,57 @@ using emory::chunk::map;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Constructor: walks a byte view and appends one entry (byte offsets plus a
// static_hash digest) to `elements` for every region it cuts, followed by a
// final entry for whatever trails the last cut.
//
// NOTE(review): this listing is a rendered diff whose +/- markers were lost,
// so lines from the pre- and post-commit revisions appear interleaved (both
// `data` and `src` parameters, two declarations of `start` and of `region`,
// two `.digest` initialisers). Confirm against the repository which lines
// belong to which revision before relying on any single statement.
map::map (
|
||||
cruft::view<u08 const *> data,
|
||||
cruft::view<u08 const *> src,
|
||||
emory::chunk::params const &p
|
||||
) {
|
||||
using hash_type = cruft::hash::buzhash<u64>;
|
||||
hash_type h (p.window, data);
|
||||
auto remain = data.consume (p.window);
|
||||
// Input shorter than one hash window: nothing is appended to `elements`.
if (src.size () < p.window)
|
||||
return;
|
||||
|
||||
using digest_type = hash_type::digest_type ;
|
||||
// All-ones shifted right by (bit width - p.bits); for an unsigned digest_type
// this leaves the low p.bits bits set. A cut is taken where the masked hash
// bits are all zero.
digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits);
|
||||
|
||||
for (u08 const *cursor = remain.begin (), *start = data.begin (); cursor != remain.end (); cursor++) {
|
||||
auto const digest = h (cursor);
|
||||
u64 hash_state = 0;
|
||||
// `start` marks the first byte of the region currently being accumulated.
auto start = src.begin ();
|
||||
|
||||
if (std::distance (start, cursor) < p.minimum)
|
||||
continue;
|
||||
// Outer loop: one pass per emitted region. The cursor is advanced inside the
// body, hence the empty increment clause.
for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) {
|
||||
hash_state = 0;
|
||||
// Skip ahead so that, together with the p.window priming reads below, the
// cursor has moved p.minimum bytes in this pass before any cut is considered.
cursor += p.minimum - p.window;
|
||||
// Prime the hash state with p.window consecutive bytes.
for (std::size_t i = 0; i < p.window; ++i)
|
||||
hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++;
|
||||
|
||||
if (unlikely (digest & mask))
|
||||
continue;
|
||||
for ( ; cursor < src.end () - p.window; ++cursor) {
|
||||
// Masked bits non-zero: not a cut point. Roll the hash forward by rotating
// the state, XORing out the byte p.window positions behind the cursor
// (rotated by p.window), and XORing in the byte at the cursor.
if (likely (hash_state & mask)) {
|
||||
hash_state = cruft::rotatel (hash_state, 1)
|
||||
^ cruft::rotatel (u64 (*(cursor - p.window)), p.window)
|
||||
^ *cursor;
|
||||
continue;
|
||||
}
|
||||
|
||||
cruft::view<u08 const*> const region { start, cursor };
|
||||
start = cursor + 1;
|
||||
// Masked bits all zero: [start, cursor) becomes a region.
cruft::view<u08 const*> const region { start, cursor };
|
||||
|
||||
elements.push_back ({
|
||||
.offset = {
|
||||
std::distance (src.begin (), start),
|
||||
std::distance (src.begin (), cursor)
|
||||
},
|
||||
.digest = static_hash {} (region)
|
||||
});
|
||||
|
||||
// The next region begins at the cut; leave the scan so the outer loop
// re-primes the hash state.
start = cursor;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Emit whatever lies between the last cut and the end of the input as a
// final region.
if (start != src.end ()) {
|
||||
cruft::view<u08 const*> const region { start, src.end () };
|
||||
|
||||
elements.push_back ({
|
||||
.offset = {
|
||||
std::pair<std::size_t,std::size_t> {
|
||||
std::distance (data.begin (), region.begin ()),
|
||||
std::distance (data.begin (), region.end ())
|
||||
},
|
||||
std::distance (src.begin (), start),
|
||||
std::distance (src.begin (), src.end ())
|
||||
},
|
||||
.digest = static_hash {} (region),
|
||||
.digest = static_hash {} (region)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
@ -20,6 +20,12 @@ namespace emory::chunk {
|
||||
std::pair<off64_t, off64_t> offset;
|
||||
static_hash::digest_t digest;
|
||||
|
||||
// Returns the length of the region, computed as offset.second - offset.first.
// NOTE(review): CHECK_LE is defined elsewhere; presumably it asserts that
// offset.first <= offset.second before the subtraction — confirm.
off64_t size (void) const
|
||||
{
|
||||
CHECK_LE (offset.first, offset.second);
|
||||
return offset.second - offset.first;
|
||||
}
|
||||
|
||||
struct digest_equality {
|
||||
bool operator() (region const &a, region const &b) const
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user