comments
This commit is contained in:
parent
f083ff0f64
commit
aa90c7ef70
@ -14,7 +14,15 @@
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
namespace emory::chunk {
|
||||
template <typename HashT, typename OutputT>
|
||||
/// Scan a memory range and write chunk data to a supplied output
|
||||
/// iterator.
|
||||
///
|
||||
/// \tparam HashT The strong final hash function type
|
||||
/// \tparam OutputT The output iterator type
|
||||
template <
|
||||
typename HashT,
|
||||
typename OutputT
|
||||
>
|
||||
OutputT
|
||||
find (
|
||||
OutputT &&dst,
|
||||
@ -31,22 +39,29 @@ namespace emory::chunk {
|
||||
u64 hash_state = 0;
|
||||
auto start = src.begin ();
|
||||
|
||||
// Scan the entire source memory region
|
||||
for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) {
|
||||
// Initialise the hash state. We can jump ahead to just before the
|
||||
// rolling hash window starts because the preceding data would be
|
||||
// rotated out anyway.
|
||||
hash_state = 0;
|
||||
cursor += p.minimum - p.window;
|
||||
for (std::size_t i = 0; i < p.window; ++i)
|
||||
hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++;
|
||||
|
||||
// Scan until the last point that satisfies our chunk size constraints.
|
||||
for ( ; cursor < src.end () - p.window; ++cursor) {
|
||||
if (cursor < start + p.maximum) {
|
||||
// Rotate the buzhash state if we haven't hit a marker.
|
||||
if (likely (hash_state & mask)) {
|
||||
hash_state = cruft::rotatel (hash_state, 1)
|
||||
^ cruft::rotatel (u64 (*(cursor - p.window)), p.window)
|
||||
^ *cursor;
|
||||
^ cruft::rotatel (u64 (*(cursor - p.window)), p.window)
|
||||
^ *cursor;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Record the matching chunk
|
||||
cruft::view<u08 const*> const region { start, cursor };
|
||||
CHECK_GE (cursor - start, p.minimum);
|
||||
CHECK_LE (cursor - start, p.maximum);
|
||||
@ -64,6 +79,7 @@ namespace emory::chunk {
|
||||
}
|
||||
}
|
||||
|
||||
// Create a final chunk from the tail data
|
||||
if (start != src.end ()) {
|
||||
cruft::view<u08 const*> const region { start, src.end () };
|
||||
|
||||
|
@ -27,6 +27,8 @@
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
/// Provides _some_ consistent ordering for regions. The meaning isn't well
|
||||
/// defined. The function is provided only so that we can identify duplicates.
|
||||
static
|
||||
std::strong_ordering
|
||||
region_ordering (
|
||||
@ -42,25 +44,24 @@ region_ordering (
|
||||
}
|
||||
|
||||
|
||||
static bool region_less (emory::chunk::region const &a, emory::chunk::region const &b)
|
||||
{
|
||||
//-----------------------------------------------------------------------------
|
||||
static bool region_less (
|
||||
emory::chunk::region const &a,
|
||||
emory::chunk::region const &b
|
||||
) {
|
||||
return region_ordering (a, b) < 0;
|
||||
}
|
||||
|
||||
|
||||
static bool region_equal (emory::chunk::region const &a, emory::chunk::region const &b)
|
||||
{
|
||||
//-----------------------------------------------------------------------------
|
||||
static bool region_equal (
|
||||
emory::chunk::region const &a,
|
||||
emory::chunk::region const &b
|
||||
) {
|
||||
return region_ordering (a, b) == 0;
|
||||
}
|
||||
|
||||
|
||||
//static bool overlap (emory::chunk::region const &a, emory::chunk::region const &b)
|
||||
//{
|
||||
// return a.offset.first < b.offset.second &&
|
||||
// b.offset.first < a.offset.second;
|
||||
//}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
static void find_path_chunks (
|
||||
std::vector<emory::chunk::region> &res,
|
||||
@ -69,7 +70,8 @@ static void find_path_chunks (
|
||||
);
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
///----------------------------------------------------------------------------
|
||||
/// Scan chunks in the regular file at the provided path.
|
||||
static
|
||||
void
|
||||
find_regular_chunks (
|
||||
@ -89,7 +91,8 @@ find_regular_chunks (
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
///----------------------------------------------------------------------------
|
||||
/// Scan chunks in the directory by recursing into all children.
|
||||
static
|
||||
void
|
||||
find_directory_chunks (
|
||||
@ -104,7 +107,9 @@ find_directory_chunks (
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
///----------------------------------------------------------------------------
|
||||
/// Scan chunks from a given path by dispatching to `find_foo_chunks` style
|
||||
/// functions depending on the file type.
|
||||
static void find_path_chunks (
|
||||
std::vector<emory::chunk::region> &res,
|
||||
std::filesystem::path const &src,
|
||||
@ -133,7 +138,8 @@ static void find_path_chunks (
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
///----------------------------------------------------------------------------
|
||||
/// Find all regions in a path and return a vector of the regions.
|
||||
static
|
||||
std::vector<emory::chunk::region>
|
||||
find_chunks (std::filesystem::path const &src, emory::chunk::params const &p)
|
||||
@ -164,6 +170,7 @@ enum {
|
||||
//-----------------------------------------------------------------------------
|
||||
int main (int argc, char const **argv)
|
||||
{
|
||||
// Extract commandline arguments
|
||||
if (argc < NUM_ARGS_REQUIRED) {
|
||||
std::cerr << "usage: " << argv[ARG_SELF] << " <input> <output> [bits] [window] [minimum] [maximum]\n"
|
||||
<< "default bits = " << emory::chunk::DEFAULT_PARAMS.bits << '\n'
|
||||
@ -188,6 +195,7 @@ int main (int argc, char const **argv)
|
||||
std::ofstream output (argv[ARGS_OUTPUT], std::ios::out | std::ios::trunc);
|
||||
output.exceptions (std::ios::badbit | std::ios::eofbit | std::ios::failbit);
|
||||
|
||||
// Find all the chunks and prepare them for output
|
||||
std::cout << "processing\n";
|
||||
std::vector<emory::chunk::region> src = find_chunks (argv[ARGS_INPUT], p);
|
||||
|
||||
@ -199,6 +207,7 @@ int main (int argc, char const **argv)
|
||||
region_less
|
||||
);
|
||||
|
||||
// Write all chunks to the output file
|
||||
output << "params: " << p << '\n';
|
||||
for (auto const &chunk: src) {
|
||||
output << chunk.size() << ' ';
|
||||
@ -217,6 +226,7 @@ int main (int argc, char const **argv)
|
||||
auto const myaccum = std::accumulate (std::begin (sizes), std::end (sizes), 0);
|
||||
std::cout << myaccum << '\n';
|
||||
|
||||
// Find the total and unique byte counts
|
||||
auto const total_bytes = std::accumulate (
|
||||
src.begin (),
|
||||
src.end (),
|
||||
|
Loading…
Reference in New Issue
Block a user