chunk/map: add a windowed buzhash
This commit is contained in:
parent
f4aab7f2c2
commit
b2731f0968
@ -17,36 +17,57 @@ using emory::chunk::map;
|
|||||||
|
|
||||||
// Side-by-side rendering of the chunk-map constructor as changed by this
// commit. Rows appear to be laid out as: `||||||`, the pre-commit line (if
// any), `|`, the post-commit line (if any), `|`.
//
// Post-commit behaviour, as visible below:
//   * returns without recording any element when `src` holds fewer than
//     `p.window` bytes;
//   * `mask` keeps the low `p.bits` bits of `digest_type` (assuming that type
//     is unsigned -- it is declared elsewhere);
//   * each outer-loop pass zeroes `hash_state`, advances `cursor` by
//     `p.minimum - p.window`, then folds the next `p.window` bytes into the
//     state (rotate left by one, XOR the byte);
//   * the inner loop rolls the state forward one byte at a time while
//     `hash_state & mask` is non-zero; when the masked bits are all zero it
//     appends [start, cursor) to `elements`, moves `start` to `cursor`, and
//     breaks back to the outer loop;
//   * whatever remains between `start` and `src.end ()` is appended as a
//     final element.
//
// NOTE(review): input shorter than `p.window` yields no elements at all, not
// even a single tail element -- confirm this is intended.
// NOTE(review): `p.minimum - p.window` and `src.end () - p.minimum` presume
// `p.window <= p.minimum <= src.size ()` -- TODO confirm callers guarantee it.
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
map::map (
|
map::map (
|
||||||
cruft::view<u08 const *> data,
|
cruft::view<u08 const *> src,
|
||||||
emory::chunk::params const &p
|
emory::chunk::params const &p
|
||||||
) {
|
) {
|
||||||
using hash_type = cruft::hash::buzhash<u64>;
|
using hash_type = cruft::hash::buzhash<u64>;
|
||||||
hash_type h (p.window, data);
|
// (new) Too little data to fill one hash window: leave `elements` untouched.
if (src.size () < p.window)
|
||||||
auto remain = data.consume (p.window);
|
return;
|
||||||
|
|
||||||
using digest_type = hash_type::digest_type ;
|
using digest_type = hash_type::digest_type ;
|
||||||
digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits);
|
digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits);
|
||||||
|
|
||||||
for (u08 const *cursor = remain.begin (), *start = data.begin (); cursor != remain.end (); cursor++) {
|
u64 hash_state = 0;
|
||||||
auto const digest = h (cursor);
|
// (new) `start` marks the first byte of the chunk currently being grown.
auto start = src.begin ();
|
||||||
|
|
||||||
if (std::distance (start, cursor) < p.minimum)
|
for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) {
|
||||||
continue;
|
hash_state = 0;
|
||||||
|
// (new) Skip ahead so that, once the window below has been consumed,
// `cursor` sits `p.minimum` bytes past where this pass began.
cursor += p.minimum - p.window;
|
||||||
|
for (std::size_t i = 0; i < p.window; ++i)
|
||||||
|
hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++;
|
||||||
|
|
||||||
if (unlikely (digest & mask))
|
for ( ; cursor < src.end () - p.window; ++cursor) {
|
||||||
|
// (new) Non-zero masked bits: not a boundary. Roll the window forward by
// rotating the state, XOR-ing out the byte `p.window` positions back
// (rotated by `p.window`) and XOR-ing in the byte at `cursor`.
if (likely (hash_state & mask)) {
|
||||||
|
hash_state = cruft::rotatel (hash_state, 1)
|
||||||
|
^ cruft::rotatel (u64 (*(cursor - p.window)), p.window)
|
||||||
|
^ *cursor;
|
||||||
continue;
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
cruft::view<u08 const*> const region { start, cursor };
|
cruft::view<u08 const*> const region { start, cursor };
|
||||||
start = cursor + 1;
|
|
||||||
|
|
||||||
elements.push_back ({
|
elements.push_back ({
|
||||||
.offset = {
|
.offset = {
|
||||||
std::pair<std::size_t,std::size_t> {
|
std::distance (src.begin (), start),
|
||||||
std::distance (data.begin (), region.begin ()),
|
std::distance (src.begin (), cursor)
|
||||||
std::distance (data.begin (), region.end ())
|
|
||||||
},
|
},
|
||||||
|
.digest = static_hash {} (region)
|
||||||
|
});
|
||||||
|
|
||||||
|
// (new) The next chunk begins exactly at the boundary just recorded.
start = cursor;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// (new) Record the trailing bytes that were never closed off by a boundary.
if (start != src.end ()) {
|
||||||
|
cruft::view<u08 const*> const region { start, src.end () };
|
||||||
|
|
||||||
|
elements.push_back ({
|
||||||
|
.offset = {
|
||||||
|
std::distance (src.begin (), start),
|
||||||
|
std::distance (src.begin (), src.end ())
|
||||||
},
|
},
|
||||||
.digest = static_hash {} (region),
|
.digest = static_hash {} (region)
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -20,6 +20,12 @@ namespace emory::chunk {
|
|||||||
std::pair<off64_t, off64_t> offset;
|
std::pair<off64_t, off64_t> offset;
|
||||||
static_hash::digest_t digest;
|
static_hash::digest_t digest;
|
||||||
|
|
||||||
|
// Span length: the difference `offset.second - offset.first`.
// CHECK_LE is applied to the two offsets first; presumably it asserts
// `offset.first <= offset.second` (the macro is defined elsewhere -- confirm
// what it does on failure).
off64_t size (void) const
|
||||||
|
{
|
||||||
|
CHECK_LE (offset.first, offset.second);
|
||||||
|
return offset.second - offset.first;
|
||||||
|
}
|
||||||
|
|
||||||
struct digest_equality {
|
struct digest_equality {
|
||||||
bool operator() (region const &a, region const &b) const
|
bool operator() (region const &a, region const &b) const
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user