chunk/param: add a maximum chunk size param

This commit is contained in:
Danny Robson 2020-12-29 08:01:40 +10:00
parent 3ad55453f7
commit c172ee2c40
5 changed files with 25 additions and 10 deletions

View File

@ -9,6 +9,7 @@
#include "params.hpp" #include "params.hpp"
#include <cruft/util/hash/buzhash.hpp> #include <cruft/util/hash/buzhash.hpp>
#include <cruft/util/debug/assert.hpp>
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@ -37,14 +38,18 @@ namespace emory::chunk {
hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++; hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++;
for ( ; cursor < src.end () - p.window; ++cursor) { for ( ; cursor < src.end () - p.window; ++cursor) {
if (cursor < start + p.maximum) {
if (likely (hash_state & mask)) { if (likely (hash_state & mask)) {
hash_state = cruft::rotatel (hash_state, 1) hash_state = cruft::rotatel (hash_state, 1)
^ cruft::rotatel (u64 (*(cursor - p.window)), p.window) ^ cruft::rotatel (u64 (*(cursor - p.window)), p.window)
^ *cursor; ^ *cursor;
continue; continue;
} }
}
cruft::view<u08 const*> const region { start, cursor }; cruft::view<u08 const*> const region { start, cursor };
CHECK_GE (cursor - start, p.minimum);
CHECK_LE (cursor - start, p.maximum);
*dst = { *dst = {
.offset = { .offset = {

View File

@ -23,6 +23,7 @@ emory::chunk::operator<< (std::ostream &os, params const &val)
return os << "{ bits: " << val.bits return os << "{ bits: " << val.bits
<< ", window: " << val.window << ", window: " << val.window
<< ", minimum: " << val.minimum << ", minimum: " << val.minimum
<< ", maximum: " << val.maximum
<< " }"; << " }";
} }

View File

@ -18,11 +18,14 @@ namespace emory::chunk {
std::size_t window; std::size_t window;
/// The minimum number of bytes for a matching region. /// The minimum number of bytes for a matching region.
std::ptrdiff_t minimum; std::ptrdiff_t minimum;
/// The maximum number of bytes for a matching region.
std::ptrdiff_t maximum;
}; };
constexpr params DEFAULT_PARAMS { constexpr params DEFAULT_PARAMS {
.bits = 12, .bits = 12,
.window = 8, .window = 8,
.minimum = 4096, .minimum = 4096,
.maximum = 4 * 1024 * 1024,
}; };
} }

View File

@ -154,6 +154,7 @@ enum {
ARG_BITS, ARG_BITS,
ARG_WINDOW, ARG_WINDOW,
ARG_MINIMUM, ARG_MINIMUM,
ARG_MAXIMUM,
NUM_ARGS, NUM_ARGS,
NUM_ARGS_REQUIRED = 3, NUM_ARGS_REQUIRED = 3,
@ -164,10 +165,11 @@ enum {
int main (int argc, char const **argv) int main (int argc, char const **argv)
{ {
if (argc < NUM_ARGS_REQUIRED) { if (argc < NUM_ARGS_REQUIRED) {
std::cerr << "usage: " << argv[ARG_SELF] << " <input> <output> [bits] [window] [minimum]\n" std::cerr << "usage: " << argv[ARG_SELF] << " <input> <output> [bits] [window] [minimum] [maximum]\n"
<< "default bits = " << emory::chunk::DEFAULT_PARAMS.bits << '\n' << "default bits = " << emory::chunk::DEFAULT_PARAMS.bits << '\n'
<< "default window = " << emory::chunk::DEFAULT_PARAMS.window << '\n' << "default window = " << emory::chunk::DEFAULT_PARAMS.window << '\n'
<< "default minimum = " << emory::chunk::DEFAULT_PARAMS.minimum << '\n'; << "default minimum = " << emory::chunk::DEFAULT_PARAMS.minimum << '\n'
<< "default maximum = " << emory::chunk::DEFAULT_PARAMS.maximum << '\n';
return EXIT_FAILURE; return EXIT_FAILURE;
} }
@ -178,6 +180,8 @@ int main (int argc, char const **argv)
p.window = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]); p.window = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]);
if (argc > ARG_BITS + 1) if (argc > ARG_BITS + 1)
p.minimum = cruft::parse::from_string<std::size_t> (argv[ARG_MINIMUM]); p.minimum = cruft::parse::from_string<std::size_t> (argv[ARG_MINIMUM]);
if (argc > ARG_MAXIMUM + 1)
p.maximum = cruft::parse::from_string<std::size_t> (argv[ARG_MAXIMUM]);
std::cerr << p << '\n'; std::cerr << p << '\n';

View File

@ -24,6 +24,7 @@ enum {
ARG_BITS, ARG_BITS,
ARG_WINDOW, ARG_WINDOW,
ARGS_MINIMUM, ARGS_MINIMUM,
ARGS_MAXIMUM,
ARGS_TARGET, ARGS_TARGET,
ARGS_SOURCE, ARGS_SOURCE,
@ -34,7 +35,7 @@ enum {
int main (int argc, char const **argv) int main (int argc, char const **argv)
{ {
if (argc < NUM_ARGS) { if (argc < NUM_ARGS) {
std::cerr << "usage: " << argv[ARG_SELF] << " <bits> <window> <minimum> <target> <source> [...]\n"; std::cerr << "usage: " << argv[ARG_SELF] << " <bits> <window> <minimum> <maximum> <target> <source> [...]\n";
return EXIT_FAILURE; return EXIT_FAILURE;
} }
@ -42,6 +43,7 @@ int main (int argc, char const **argv)
.bits = cruft::parse::from_string<std::size_t> (argv[ARG_BITS ]), .bits = cruft::parse::from_string<std::size_t> (argv[ARG_BITS ]),
.window = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]), .window = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]),
.minimum = cruft::parse::from_string<std::ptrdiff_t> (argv[ARGS_MINIMUM]), .minimum = cruft::parse::from_string<std::ptrdiff_t> (argv[ARGS_MINIMUM]),
.maximum = cruft::parse::from_string<std::ptrdiff_t> (argv[ARGS_MAXIMUM]),
}; };
std::clog << "Hashing target\n"; std::clog << "Hashing target\n";