comments
This commit is contained in:
parent
f083ff0f64
commit
aa90c7ef70
@ -14,7 +14,15 @@
|
|||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
namespace emory::chunk {
|
namespace emory::chunk {
|
||||||
template <typename HashT, typename OutputT>
|
/// Scan a memory range and write chunk data to a supplied output
|
||||||
|
/// iterator.
|
||||||
|
///
|
||||||
|
/// \tparam HashT The strong final hash function type
|
||||||
|
/// \tparam OutputT The output iterator type
|
||||||
|
template <
|
||||||
|
typename HashT,
|
||||||
|
typename OutputT
|
||||||
|
>
|
||||||
OutputT
|
OutputT
|
||||||
find (
|
find (
|
||||||
OutputT &&dst,
|
OutputT &&dst,
|
||||||
@ -31,22 +39,29 @@ namespace emory::chunk {
|
|||||||
u64 hash_state = 0;
|
u64 hash_state = 0;
|
||||||
auto start = src.begin ();
|
auto start = src.begin ();
|
||||||
|
|
||||||
|
// Scan the entire source memory region
|
||||||
for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) {
|
for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) {
|
||||||
|
// Initialise the hash state. We can jump ahead to just before the
|
||||||
|
// rolling hash window starts because the preceding data would be
|
||||||
|
// rotated out anyway.
|
||||||
hash_state = 0;
|
hash_state = 0;
|
||||||
cursor += p.minimum - p.window;
|
cursor += p.minimum - p.window;
|
||||||
for (std::size_t i = 0; i < p.window; ++i)
|
for (std::size_t i = 0; i < p.window; ++i)
|
||||||
hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++;
|
hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++;
|
||||||
|
|
||||||
|
// Scan until the last point that satisfies our chunk size constraints.
|
||||||
for ( ; cursor < src.end () - p.window; ++cursor) {
|
for ( ; cursor < src.end () - p.window; ++cursor) {
|
||||||
if (cursor < start + p.maximum) {
|
if (cursor < start + p.maximum) {
|
||||||
|
// Rotate the buzhash state if we haven't hit a marker.
|
||||||
if (likely (hash_state & mask)) {
|
if (likely (hash_state & mask)) {
|
||||||
hash_state = cruft::rotatel (hash_state, 1)
|
hash_state = cruft::rotatel (hash_state, 1)
|
||||||
^ cruft::rotatel (u64 (*(cursor - p.window)), p.window)
|
^ cruft::rotatel (u64 (*(cursor - p.window)), p.window)
|
||||||
^ *cursor;
|
^ *cursor;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Record the matching chunk
|
||||||
cruft::view<u08 const*> const region { start, cursor };
|
cruft::view<u08 const*> const region { start, cursor };
|
||||||
CHECK_GE (cursor - start, p.minimum);
|
CHECK_GE (cursor - start, p.minimum);
|
||||||
CHECK_LE (cursor - start, p.maximum);
|
CHECK_LE (cursor - start, p.maximum);
|
||||||
@ -64,6 +79,7 @@ namespace emory::chunk {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Create a final chunk from the tail data
|
||||||
if (start != src.end ()) {
|
if (start != src.end ()) {
|
||||||
cruft::view<u08 const*> const region { start, src.end () };
|
cruft::view<u08 const*> const region { start, src.end () };
|
||||||
|
|
||||||
|
@ -27,6 +27,8 @@
|
|||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
/// Provides _some_ consistent ordering for regions. The meaning isn't well
|
||||||
|
/// defined. The function is provided only so that we can identify duplicates.
|
||||||
static
|
static
|
||||||
std::strong_ordering
|
std::strong_ordering
|
||||||
region_ordering (
|
region_ordering (
|
||||||
@ -42,25 +44,24 @@ region_ordering (
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static bool region_less (emory::chunk::region const &a, emory::chunk::region const &b)
|
//-----------------------------------------------------------------------------
|
||||||
{
|
static bool region_less (
|
||||||
|
emory::chunk::region const &a,
|
||||||
|
emory::chunk::region const &b
|
||||||
|
) {
|
||||||
return region_ordering (a, b) < 0;
|
return region_ordering (a, b) < 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static bool region_equal (emory::chunk::region const &a, emory::chunk::region const &b)
|
//-----------------------------------------------------------------------------
|
||||||
{
|
static bool region_equal (
|
||||||
|
emory::chunk::region const &a,
|
||||||
|
emory::chunk::region const &b
|
||||||
|
) {
|
||||||
return region_ordering (a, b) == 0;
|
return region_ordering (a, b) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//static bool overlap (emory::chunk::region const &a, emory::chunk::region const &b)
|
|
||||||
//{
|
|
||||||
// return a.offset.first < b.offset.second &&
|
|
||||||
// b.offset.first < a.offset.second;
|
|
||||||
//}
|
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
static void find_path_chunks (
|
static void find_path_chunks (
|
||||||
std::vector<emory::chunk::region> &res,
|
std::vector<emory::chunk::region> &res,
|
||||||
@ -69,7 +70,8 @@ static void find_path_chunks (
|
|||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
///----------------------------------------------------------------------------
|
||||||
|
/// Scan for chunks in the regular file at the provided path.
|
||||||
static
|
static
|
||||||
void
|
void
|
||||||
find_regular_chunks (
|
find_regular_chunks (
|
||||||
@ -89,7 +91,8 @@ find_regular_chunks (
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
///----------------------------------------------------------------------------
|
||||||
|
/// Scan chunks in the directory by recursing into all children.
|
||||||
static
|
static
|
||||||
void
|
void
|
||||||
find_directory_chunks (
|
find_directory_chunks (
|
||||||
@ -104,7 +107,9 @@ find_directory_chunks (
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
///----------------------------------------------------------------------------
|
||||||
|
/// Scan chunks from a given path by dispatching to `find_foo_chunks` style
|
||||||
|
/// functions depending on the file type.
|
||||||
static void find_path_chunks (
|
static void find_path_chunks (
|
||||||
std::vector<emory::chunk::region> &res,
|
std::vector<emory::chunk::region> &res,
|
||||||
std::filesystem::path const &src,
|
std::filesystem::path const &src,
|
||||||
@ -133,7 +138,8 @@ static void find_path_chunks (
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
///----------------------------------------------------------------------------
|
||||||
|
/// Find all regions in a path and return a vector of the regions.
|
||||||
static
|
static
|
||||||
std::vector<emory::chunk::region>
|
std::vector<emory::chunk::region>
|
||||||
find_chunks (std::filesystem::path const &src, emory::chunk::params const &p)
|
find_chunks (std::filesystem::path const &src, emory::chunk::params const &p)
|
||||||
@ -164,6 +170,7 @@ enum {
|
|||||||
//-----------------------------------------------------------------------------
|
//-----------------------------------------------------------------------------
|
||||||
int main (int argc, char const **argv)
|
int main (int argc, char const **argv)
|
||||||
{
|
{
|
||||||
|
// Extract commandline arguments
|
||||||
if (argc < NUM_ARGS_REQUIRED) {
|
if (argc < NUM_ARGS_REQUIRED) {
|
||||||
std::cerr << "usage: " << argv[ARG_SELF] << " <input> <output> [bits] [window] [minimum] [maximum]\n"
|
std::cerr << "usage: " << argv[ARG_SELF] << " <input> <output> [bits] [window] [minimum] [maximum]\n"
|
||||||
<< "default bits = " << emory::chunk::DEFAULT_PARAMS.bits << '\n'
|
<< "default bits = " << emory::chunk::DEFAULT_PARAMS.bits << '\n'
|
||||||
@ -188,6 +195,7 @@ int main (int argc, char const **argv)
|
|||||||
std::ofstream output (argv[ARGS_OUTPUT], std::ios::out | std::ios::trunc);
|
std::ofstream output (argv[ARGS_OUTPUT], std::ios::out | std::ios::trunc);
|
||||||
output.exceptions (std::ios::badbit | std::ios::eofbit | std::ios::failbit);
|
output.exceptions (std::ios::badbit | std::ios::eofbit | std::ios::failbit);
|
||||||
|
|
||||||
|
// Find all the chunks and prepare them for output
|
||||||
std::cout << "processing\n";
|
std::cout << "processing\n";
|
||||||
std::vector<emory::chunk::region> src = find_chunks (argv[ARGS_INPUT], p);
|
std::vector<emory::chunk::region> src = find_chunks (argv[ARGS_INPUT], p);
|
||||||
|
|
||||||
@ -199,6 +207,7 @@ int main (int argc, char const **argv)
|
|||||||
region_less
|
region_less
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Write all chunks to the output file
|
||||||
output << "params: " << p << '\n';
|
output << "params: " << p << '\n';
|
||||||
for (auto const &chunk: src) {
|
for (auto const &chunk: src) {
|
||||||
output << chunk.size() << ' ';
|
output << chunk.size() << ' ';
|
||||||
@ -217,6 +226,7 @@ int main (int argc, char const **argv)
|
|||||||
auto const myaccum = std::accumulate (std::begin (sizes), std::end (sizes), 0);
|
auto const myaccum = std::accumulate (std::begin (sizes), std::end (sizes), 0);
|
||||||
std::cout << myaccum << '\n';
|
std::cout << myaccum << '\n';
|
||||||
|
|
||||||
|
// Find the total and unique byte counts
|
||||||
auto const total_bytes = std::accumulate (
|
auto const total_bytes = std::accumulate (
|
||||||
src.begin (),
|
src.begin (),
|
||||||
src.end (),
|
src.end (),
|
||||||
|
Loading…
Reference in New Issue
Block a user