This commit is contained in:
Danny Robson 2020-12-29 08:36:21 +10:00
parent f083ff0f64
commit aa90c7ef70
2 changed files with 44 additions and 18 deletions

View File

@ -14,7 +14,15 @@
///////////////////////////////////////////////////////////////////////////////
namespace emory::chunk {
template <typename HashT, typename OutputT>
/// Scan a memory range and write chunk data to a supplied output
/// iterator.
///
/// \tparam HashT The strong final hash function type
/// \tparam OutputT The output iterator type
template <
typename HashT,
typename OutputT
>
OutputT
find (
OutputT &&dst,
@ -31,14 +39,20 @@ namespace emory::chunk {
u64 hash_state = 0;
auto start = src.begin ();
// Scan the entire source memory region
for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) {
// Initialise the hash state. We can jump ahead to just before the
// rolling hash window starts because the preceding data would be
// rotated out anyway.
hash_state = 0;
cursor += p.minimum - p.window;
for (std::size_t i = 0; i < p.window; ++i)
hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++;
// Scan until the last point that satisfies our chunk size constraints.
for ( ; cursor < src.end () - p.window; ++cursor) {
if (cursor < start + p.maximum) {
// Rotate the buzhash state if we haven't hit a marker.
if (likely (hash_state & mask)) {
hash_state = cruft::rotatel (hash_state, 1)
^ cruft::rotatel (u64 (*(cursor - p.window)), p.window)
@ -47,6 +61,7 @@ namespace emory::chunk {
}
}
// Record the matching chunk
cruft::view<u08 const*> const region { start, cursor };
CHECK_GE (cursor - start, p.minimum);
CHECK_LE (cursor - start, p.maximum);
@ -64,6 +79,7 @@ namespace emory::chunk {
}
}
// Create a final chunk from the tail data
if (start != src.end ()) {
cruft::view<u08 const*> const region { start, src.end () };

View File

@ -27,6 +27,8 @@
///////////////////////////////////////////////////////////////////////////////
/// Provides _some_ consistent ordering for regions. The meaning isn't well
/// defined. The function is provided only so that we can identify duplicates.
static
std::strong_ordering
region_ordering (
@ -42,25 +44,24 @@ region_ordering (
}
//-----------------------------------------------------------------------------
/// Ordering predicate for regions: true when `region_ordering` places `a`
/// strictly before `b`.
///
/// The ordering carries no particular meaning beyond being consistent; it is
/// whatever `region_ordering` reports.
///
/// \param a  The left-hand region
/// \param b  The right-hand region
/// \return   `true` iff `region_ordering (a, b)` compares less than zero
static bool region_less (
    emory::chunk::region const &a,
    emory::chunk::region const &b
) {
    return region_ordering (a, b) < 0;
}
//-----------------------------------------------------------------------------
/// Equivalence predicate for regions: true when `region_ordering` reports
/// that neither region orders before the other.
///
/// \param a  The left-hand region
/// \param b  The right-hand region
/// \return   `true` iff `region_ordering (a, b)` compares equal to zero
static bool region_equal (
    emory::chunk::region const &a,
    emory::chunk::region const &b
) {
    return region_ordering (a, b) == 0;
}
//static bool overlap (emory::chunk::region const &a, emory::chunk::region const &b)
//{
// return a.offset.first < b.offset.second &&
// b.offset.first < a.offset.second;
//}
///////////////////////////////////////////////////////////////////////////////
static void find_path_chunks (
std::vector<emory::chunk::region> &res,
@ -69,7 +70,8 @@ static void find_path_chunks (
);
//-----------------------------------------------------------------------------
///----------------------------------------------------------------------------
/// Scan for chunks in the regular file at the provided path.
static
void
find_regular_chunks (
@ -89,7 +91,8 @@ find_regular_chunks (
}
//-----------------------------------------------------------------------------
///----------------------------------------------------------------------------
/// Scan chunks in the directory by recursing into all children.
static
void
find_directory_chunks (
@ -104,7 +107,9 @@ find_directory_chunks (
}
//-----------------------------------------------------------------------------
///----------------------------------------------------------------------------
/// Scan chunks from a given path by dispatching to `find_foo_chunks` style
/// functions depending on the file type.
static void find_path_chunks (
std::vector<emory::chunk::region> &res,
std::filesystem::path const &src,
@ -133,7 +138,8 @@ static void find_path_chunks (
}
//-----------------------------------------------------------------------------
///----------------------------------------------------------------------------
/// Find all regions in a path and return a vector of the regions.
static
std::vector<emory::chunk::region>
find_chunks (std::filesystem::path const &src, emory::chunk::params const &p)
@ -164,6 +170,7 @@ enum {
//-----------------------------------------------------------------------------
int main (int argc, char const **argv)
{
// Extract commandline arguments
if (argc < NUM_ARGS_REQUIRED) {
std::cerr << "usage: " << argv[ARG_SELF] << " <input> <output> [bits] [window] [minimum] [maximum]\n"
<< "default bits = " << emory::chunk::DEFAULT_PARAMS.bits << '\n'
@ -188,6 +195,7 @@ int main (int argc, char const **argv)
std::ofstream output (argv[ARGS_OUTPUT], std::ios::out | std::ios::trunc);
output.exceptions (std::ios::badbit | std::ios::eofbit | std::ios::failbit);
// Find all the chunks and prepare them for output
std::cout << "processing\n";
std::vector<emory::chunk::region> src = find_chunks (argv[ARGS_INPUT], p);
@ -199,6 +207,7 @@ int main (int argc, char const **argv)
region_less
);
// Write all chunks to the output file
output << "params: " << p << '\n';
for (auto const &chunk: src) {
output << chunk.size() << ' ';
@ -217,6 +226,7 @@ int main (int argc, char const **argv)
auto const myaccum = std::accumulate (std::begin (sizes), std::end (sizes), 0);
std::cout << myaccum << '\n';
// Find the total and unique byte counts
auto const total_bytes = std::accumulate (
src.begin (),
src.end (),