From aa90c7ef708f0e269b332a053abc4ec8ebf79f27 Mon Sep 17 00:00:00 2001 From: Danny Robson Date: Tue, 29 Dec 2020 08:36:21 +1000 Subject: [PATCH] comments --- emory/chunk/find.hpp | 22 +++++++++++++++++++--- tools/analyse.cpp | 40 +++++++++++++++++++++++++--------------- 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/emory/chunk/find.hpp b/emory/chunk/find.hpp index cfa3640..64f4aaf 100644 --- a/emory/chunk/find.hpp +++ b/emory/chunk/find.hpp @@ -14,7 +14,15 @@ /////////////////////////////////////////////////////////////////////////////// namespace emory::chunk { - template + /// Scan a memory range and write chunk data to a supplied output + /// iterator. + /// + /// \tparam HashT The strong final hash function type + /// \tparam OutputT The output iterator type + template < + typename HashT, + typename OutputT + > OutputT find ( OutputT &&dst, @@ -31,22 +39,29 @@ namespace emory::chunk { u64 hash_state = 0; auto start = src.begin (); + // Scan the entire source memory region for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) { + // Initialise the hash state. We can jump ahead to just before the + // rolling hash window starts because the preceding data would be + // rotated out anyway. hash_state = 0; cursor += p.minimum - p.window; for (std::size_t i = 0; i < p.window; ++i) hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++; + // Scan until the last point that satisfies our chunk size constraints. for ( ; cursor < src.end () - p.window; ++cursor) { if (cursor < start + p.maximum) { + // Rotate the buzhash state if we haven't hit a marker. 
if (likely (hash_state & mask)) { hash_state = cruft::rotatel (hash_state, 1) - ^ cruft::rotatel (u64 (*(cursor - p.window)), p.window) - ^ *cursor; + ^ cruft::rotatel (u64 (*(cursor - p.window)), p.window) + ^ *cursor; continue; } } + // Record the matching chunk cruft::view const region { start, cursor }; CHECK_GE (cursor - start, p.minimum); CHECK_LE (cursor - start, p.maximum); @@ -64,6 +79,7 @@ namespace emory::chunk { } } + // Create a final chunk from the tail data if (start != src.end ()) { cruft::view const region { start, src.end () }; diff --git a/tools/analyse.cpp b/tools/analyse.cpp index f1821fa..97bef41 100644 --- a/tools/analyse.cpp +++ b/tools/analyse.cpp @@ -27,6 +27,8 @@ /////////////////////////////////////////////////////////////////////////////// +/// Provides _some_ consistent ordering for regions. The meaning isn't well +/// defined. The function is provided only so that we can identify duplicates. static std::strong_ordering region_ordering ( @@ -42,25 +44,24 @@ region_ordering ( } -static bool region_less (emory::chunk::region const &a, emory::chunk::region const &b) -{ +//----------------------------------------------------------------------------- +static bool region_less ( + emory::chunk::region const &a, + emory::chunk::region const &b +) { return region_ordering (a, b) < 0; } -static bool region_equal (emory::chunk::region const &a, emory::chunk::region const &b) -{ +//----------------------------------------------------------------------------- +static bool region_equal ( + emory::chunk::region const &a, + emory::chunk::region const &b +) { return region_ordering (a, b) == 0; } -//static bool overlap (emory::chunk::region const &a, emory::chunk::region const &b) -//{ -// return a.offset.first < b.offset.second && -// b.offset.first < a.offset.second; -//} - - /////////////////////////////////////////////////////////////////////////////// static void find_path_chunks ( std::vector &res, @@ -69,7 +70,8 @@ static void find_path_chunks ( 
); -//----------------------------------------------------------------------------- +///---------------------------------------------------------------------------- +/// Scan chunks in the regular file at the provided path. static void find_regular_chunks ( @@ -89,7 +91,8 @@ find_regular_chunks ( } -//----------------------------------------------------------------------------- +///---------------------------------------------------------------------------- +/// Scan chunks in the directory by recursing into all children. static void find_directory_chunks ( @@ -104,7 +107,9 @@ find_directory_chunks ( } -//----------------------------------------------------------------------------- +///---------------------------------------------------------------------------- +/// Scan chunks from a given path by dispatching to `find_foo_chunks` style +/// functions depending on the file type. static void find_path_chunks ( std::vector &res, std::filesystem::path const &src, @@ -133,7 +138,8 @@ static void find_path_chunks ( } -//----------------------------------------------------------------------------- +///---------------------------------------------------------------------------- +/// Find all regions in a path and return a vector of the regions. 
static std::vector find_chunks (std::filesystem::path const &src, emory::chunk::params const &p) @@ -164,6 +170,7 @@ enum { //----------------------------------------------------------------------------- int main (int argc, char const **argv) { + // Extract commandline arguments if (argc < NUM_ARGS_REQUIRED) { std::cerr << "usage: " << argv[ARG_SELF] << " [bits] [window] [minimum] [maximum]\n" << "default bits = " << emory::chunk::DEFAULT_PARAMS.bits << '\n' @@ -188,6 +195,7 @@ int main (int argc, char const **argv) std::ofstream output (argv[ARGS_OUTPUT], std::ios::out | std::ios::trunc); output.exceptions (std::ios::badbit | std::ios::eofbit | std::ios::failbit); + // Find all the chunks and prepare them for output std::cout << "processing\n"; std::vector src = find_chunks (argv[ARGS_INPUT], p); @@ -199,6 +207,7 @@ int main (int argc, char const **argv) region_less ); + // Write all chunks to the output file output << "params: " << p << '\n'; for (auto const &chunk: src) { output << chunk.size() << ' '; @@ -217,6 +226,7 @@ int main (int argc, char const **argv) auto const myaccum = std::accumulate (std::begin (sizes), std::end (sizes), 0); std::cout << myaccum << '\n'; + // Find the total and unique byte counts auto const total_bytes = std::accumulate ( src.begin (), src.end (),