This commit is contained in:
Danny Robson 2020-12-29 08:36:21 +10:00
parent f083ff0f64
commit aa90c7ef70
2 changed files with 44 additions and 18 deletions

View File

@ -14,7 +14,15 @@
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
namespace emory::chunk { namespace emory::chunk {
template <typename HashT, typename OutputT> /// Scan a memory range and write chunk data to a supplied output
/// iterator.
///
/// \tparam HashT The strong final hash function type
/// \tparam OutputT The output iterator type
template <
typename HashT,
typename OutputT
>
OutputT OutputT
find ( find (
OutputT &&dst, OutputT &&dst,
@ -31,14 +39,20 @@ namespace emory::chunk {
u64 hash_state = 0; u64 hash_state = 0;
auto start = src.begin (); auto start = src.begin ();
// Scan the entire source memory region
for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) { for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) {
// Initialise the hash state. We can jump ahead to just before the
// rolling hash window starts because the preceding data would be
// rotated out anyway.
hash_state = 0; hash_state = 0;
cursor += p.minimum - p.window; cursor += p.minimum - p.window;
for (std::size_t i = 0; i < p.window; ++i) for (std::size_t i = 0; i < p.window; ++i)
hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++; hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++;
// Scan until the last point that satisfies our chunk size constraints.
for ( ; cursor < src.end () - p.window; ++cursor) { for ( ; cursor < src.end () - p.window; ++cursor) {
if (cursor < start + p.maximum) { if (cursor < start + p.maximum) {
// Rotate the buzhash state if we haven't hit a marker.
if (likely (hash_state & mask)) { if (likely (hash_state & mask)) {
hash_state = cruft::rotatel (hash_state, 1) hash_state = cruft::rotatel (hash_state, 1)
^ cruft::rotatel (u64 (*(cursor - p.window)), p.window) ^ cruft::rotatel (u64 (*(cursor - p.window)), p.window)
@ -47,6 +61,7 @@ namespace emory::chunk {
} }
} }
// Record the matching chunk
cruft::view<u08 const*> const region { start, cursor }; cruft::view<u08 const*> const region { start, cursor };
CHECK_GE (cursor - start, p.minimum); CHECK_GE (cursor - start, p.minimum);
CHECK_LE (cursor - start, p.maximum); CHECK_LE (cursor - start, p.maximum);
@ -64,6 +79,7 @@ namespace emory::chunk {
} }
} }
// Create a final chunk from the tail data
if (start != src.end ()) { if (start != src.end ()) {
cruft::view<u08 const*> const region { start, src.end () }; cruft::view<u08 const*> const region { start, src.end () };

View File

@ -27,6 +27,8 @@
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
/// Provides _some_ consistent ordering for regions. The ordering itself has no
/// particular meaning; it exists only so that we can identify duplicates.
static static
std::strong_ordering std::strong_ordering
region_ordering ( region_ordering (
@ -42,25 +44,24 @@ region_ordering (
} }
static bool region_less (emory::chunk::region const &a, emory::chunk::region const &b) //-----------------------------------------------------------------------------
{ static bool region_less (
emory::chunk::region const &a,
emory::chunk::region const &b
) {
return region_ordering (a, b) < 0; return region_ordering (a, b) < 0;
} }
static bool region_equal (emory::chunk::region const &a, emory::chunk::region const &b) //-----------------------------------------------------------------------------
{ static bool region_equal (
emory::chunk::region const &a,
emory::chunk::region const &b
) {
return region_ordering (a, b) == 0; return region_ordering (a, b) == 0;
} }
//static bool overlap (emory::chunk::region const &a, emory::chunk::region const &b)
//{
// return a.offset.first < b.offset.second &&
// b.offset.first < a.offset.second;
//}
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
static void find_path_chunks ( static void find_path_chunks (
std::vector<emory::chunk::region> &res, std::vector<emory::chunk::region> &res,
@ -69,7 +70,8 @@ static void find_path_chunks (
); );
//----------------------------------------------------------------------------- ///----------------------------------------------------------------------------
/// Scan for chunks in the regular file at the provided path.
static static
void void
find_regular_chunks ( find_regular_chunks (
@ -89,7 +91,8 @@ find_regular_chunks (
} }
//----------------------------------------------------------------------------- ///----------------------------------------------------------------------------
/// Scan chunks in the directory by recursing into all children.
static static
void void
find_directory_chunks ( find_directory_chunks (
@ -104,7 +107,9 @@ find_directory_chunks (
} }
//----------------------------------------------------------------------------- ///----------------------------------------------------------------------------
/// Scan chunks from a given path by dispatching to the matching
/// `find_*_chunks` function (e.g. `find_regular_chunks` or
/// `find_directory_chunks`) depending on the file type.
static void find_path_chunks ( static void find_path_chunks (
std::vector<emory::chunk::region> &res, std::vector<emory::chunk::region> &res,
std::filesystem::path const &src, std::filesystem::path const &src,
@ -133,7 +138,8 @@ static void find_path_chunks (
} }
//----------------------------------------------------------------------------- ///----------------------------------------------------------------------------
/// Find all chunk regions in the given path and return them as a vector.
static static
std::vector<emory::chunk::region> std::vector<emory::chunk::region>
find_chunks (std::filesystem::path const &src, emory::chunk::params const &p) find_chunks (std::filesystem::path const &src, emory::chunk::params const &p)
@ -164,6 +170,7 @@ enum {
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
int main (int argc, char const **argv) int main (int argc, char const **argv)
{ {
// Extract commandline arguments
if (argc < NUM_ARGS_REQUIRED) { if (argc < NUM_ARGS_REQUIRED) {
std::cerr << "usage: " << argv[ARG_SELF] << " <input> <output> [bits] [window] [minimum] [maximum]\n" std::cerr << "usage: " << argv[ARG_SELF] << " <input> <output> [bits] [window] [minimum] [maximum]\n"
<< "default bits = " << emory::chunk::DEFAULT_PARAMS.bits << '\n' << "default bits = " << emory::chunk::DEFAULT_PARAMS.bits << '\n'
@ -188,6 +195,7 @@ int main (int argc, char const **argv)
std::ofstream output (argv[ARGS_OUTPUT], std::ios::out | std::ios::trunc); std::ofstream output (argv[ARGS_OUTPUT], std::ios::out | std::ios::trunc);
output.exceptions (std::ios::badbit | std::ios::eofbit | std::ios::failbit); output.exceptions (std::ios::badbit | std::ios::eofbit | std::ios::failbit);
// Find all the chunks and prepare them for output
std::cout << "processing\n"; std::cout << "processing\n";
std::vector<emory::chunk::region> src = find_chunks (argv[ARGS_INPUT], p); std::vector<emory::chunk::region> src = find_chunks (argv[ARGS_INPUT], p);
@ -199,6 +207,7 @@ int main (int argc, char const **argv)
region_less region_less
); );
// Write all chunks to the output file
output << "params: " << p << '\n'; output << "params: " << p << '\n';
for (auto const &chunk: src) { for (auto const &chunk: src) {
output << chunk.size() << ' '; output << chunk.size() << ' ';
@ -217,6 +226,7 @@ int main (int argc, char const **argv)
auto const myaccum = std::accumulate (std::begin (sizes), std::end (sizes), 0); auto const myaccum = std::accumulate (std::begin (sizes), std::end (sizes), 0);
std::cout << myaccum << '\n'; std::cout << myaccum << '\n';
// Find the total and unique byte counts
auto const total_bytes = std::accumulate ( auto const total_bytes = std::accumulate (
src.begin (), src.begin (),
src.end (), src.end (),