diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c1d43a..68daf97 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,8 +32,8 @@ find_package (fmt REQUIRED) list(APPEND libemory_sources emory/chunk/fwd.cpp emory/chunk/fwd.hpp - emory/chunk/map.cpp - emory/chunk/map.hpp + emory/chunk/find.hpp + emory/chunk/find.cpp emory/chunk/match.cpp emory/chunk/match.hpp emory/chunk/ostream.cpp diff --git a/emory/chunk/find.hpp b/emory/chunk/find.hpp new file mode 100644 index 0000000..7f5cdb1 --- /dev/null +++ b/emory/chunk/find.hpp @@ -0,0 +1,76 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright 2019, Danny Robson + */ + +#include "params.hpp" + +#include + + +/////////////////////////////////////////////////////////////////////////////// +namespace emory::chunk { + template + OutputT + find ( + OutputT &&dst, + cruft::view src, + emory::chunk::params const &p + ) { + using hash_type = cruft::hash::buzhash; + if (src.size () < p.window) + return dst; + + using digest_type = hash_type::digest_type ; + digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits); + + u64 hash_state = 0; + auto start = src.begin (); + + for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) { + hash_state = 0; + cursor += p.minimum - p.window; + for (std::size_t i = 0; i < p.window; ++i) + hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++; + + for ( ; cursor < src.end () - p.window; ++cursor) { + if (likely (hash_state & mask)) { + hash_state = cruft::rotatel (hash_state, 1) + ^ cruft::rotatel (u64 (*(cursor - p.window)), p.window) + ^ *cursor; + continue; + } + + cruft::view const region { start, cursor }; + + *dst = { + .offset = { + std::distance (src.begin (), start), + std::distance (src.begin (), cursor) + }, + .digest = HashT {} (region) + }; + + start = cursor; + break; + } + } + + if (start != src.end ()) { + cruft::view const region { start, src.end () }; + + *dst++ = { + .offset = { + std::distance (src.begin (), start), + std::distance (src.begin (), src.end ()) + }, + .digest = HashT {} (region) + }; + } + + return dst; + } +} \ No newline at end of file diff --git a/emory/chunk/fwd.hpp b/emory/chunk/fwd.hpp index 898ef15..bab5e24 100644 --- a/emory/chunk/fwd.hpp +++ b/emory/chunk/fwd.hpp @@ -11,7 +11,6 @@ namespace emory::chunk { struct params; struct region; - struct map; template struct match; }; \ No newline at end of file diff --git a/emory/chunk/map.cpp b/emory/chunk/map.cpp deleted file mode 100644 index f14601f..0000000 --- a/emory/chunk/map.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - * - * Copyright 2019, Danny Robson - */ - -#include "map.hpp" - -#include "params.hpp" - -#include - -using emory::chunk::map; - - -/////////////////////////////////////////////////////////////////////////////// -template -static -OutputT -find_chunks ( - OutputT &&dst, - cruft::view src, - emory::chunk::params const &p -) { - using hash_type = cruft::hash::buzhash; - if (src.size () < p.window) - return dst; - - using digest_type = hash_type::digest_type ; - digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits); - - u64 hash_state = 0; - auto start = src.begin (); - - for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) { - hash_state = 0; - cursor += p.minimum - p.window; - for (std::size_t i = 0; i < p.window; ++i) - hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++; - - for ( ; cursor < src.end () - p.window; ++cursor) { - if (likely (hash_state & mask)) { - hash_state = cruft::rotatel (hash_state, 1) - ^ cruft::rotatel (u64 (*(cursor - p.window)), p.window) - ^ *cursor; - continue; - } - - cruft::view const region { start, cursor }; - - *dst = { - .offset = { - std::distance (src.begin (), start), - std::distance (src.begin (), cursor) - }, - .digest = HashT {} (region) - }; - - start = cursor; - break; - } - } - - if (start != src.end ()) { - cruft::view const region { start, src.end () }; - - *dst++ = { - .offset = { - std::distance (src.begin (), start), - std::distance (src.begin (), src.end ()) - }, - .digest = HashT {} (region) - }; - } - - return dst; -} - -/////////////////////////////////////////////////////////////////////////////// -map::map ( - cruft::view src, - emory::chunk::params const &p -) { - ::find_chunks (std::back_inserter (elements), src, p); -} diff --git a/emory/chunk/map.hpp b/emory/chunk/map.hpp deleted file mode 100644 index afb35f0..0000000 --- a/emory/chunk/map.hpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - * - * Copyright 2019, Danny Robson - */ - -#pragma once - -#include "fwd.hpp" - -#include "region.hpp" - -#include -#include - -#include - -namespace emory::chunk { - struct map { - map (cruft::view data, params const&); - - std::size_t size (void) const noexcept { return elements.size (); } - - std::vector elements; - }; -} diff --git a/emory/chunk/match.cpp b/emory/chunk/match.cpp index 0f8a3ff..918bcbd 100644 --- a/emory/chunk/match.cpp +++ b/emory/chunk/match.cpp @@ -8,20 +8,18 @@ #include "match.hpp" -#include "map.hpp" - std::vector< - emory::chunk::match + emory::chunk::match const*> > -emory::chunk::common (map const &src, map const &dst) +emory::chunk::common (std::vector const &src, std::vector const &dst) { - CHECK (std::is_sorted (src.elements.begin (), src.elements.end (), region::digest_ordering {})); - CHECK (std::is_sorted (dst.elements.begin (), dst.elements.end (), region::digest_ordering {})); + CHECK (std::is_sorted (src.begin (), src.end (), region::digest_ordering {})); + CHECK (std::is_sorted (dst.begin (), dst.end (), region::digest_ordering {})); - std::vector> res; + std::vector const*>> res; - for (auto src_cursor = src.elements.begin (), dst_cursor = dst.elements.begin (); - src_cursor != src.elements.end () && dst_cursor != dst.elements.end (); + for (auto src_cursor = src.begin (), dst_cursor = dst.begin (); + src_cursor != src.end () && dst_cursor != dst.end (); /* nothing */) { if (src_cursor->digest < dst_cursor->digest) { diff --git a/emory/chunk/match.hpp b/emory/chunk/match.hpp index 1af88d9..9c6e997 100644 --- a/emory/chunk/match.hpp +++ b/emory/chunk/match.hpp @@ -51,6 +51,6 @@ namespace emory::chunk { }; - std::vector> - common (map const &a, map const &b); + std::vector const*>> + common (std::vector const &a, std::vector const &b); } diff --git a/tools/analyse.cpp b/tools/analyse.cpp index f4beb17..13d8d59 100644 --- a/tools/analyse.cpp +++ b/tools/analyse.cpp @@ -6,7 +6,8 @@ * Copyright 2019 Danny Robson */ -#include "emory/chunk/map.hpp" +#include "emory/chunk/find.hpp" +#include "emory/chunk/region.hpp" #include "emory/chunk/params.hpp" #include "emory/chunk/ostream.hpp" @@ -102,38 +103,39 @@ int main (int argc, char const **argv) std::cout << "size: " << data.size () << '\n'; std::cout << "processing\n"; - emory::chunk::map src (data, p); + std::vector src; + emory::chunk::find (std::back_inserter (src), data, p); std::cout << "validating\n"; std::cout << src.size () << " chunks\n"; std::sort ( - src.elements.begin (), - src.elements.end (), + src.begin (), + src.end (), [] (auto const &a, auto const &b) { return a.offset.first < b.offset.first; } ); - for (int i = 0, cursor = 0; i < std::ssize (src.elements); ++i) { - if (src.elements[i].offset.first != cursor) { + for (off_t i = 0, cursor = 0; i < std::ssize (src); ++i) { + if (src[i].offset.first != cursor) { std::cout << "non-overlapping chunks\n"; return -1; } - cursor = src.elements[i].offset.second; + cursor = src[i].offset.second; } - if (src.elements.back ().offset.second != std::ssize (data)) { + if (src.back ().offset.second != std::ssize (data)) { std::cout << "invalid total size\n"; return -1; } std::sort ( - src.elements.begin (), - src.elements.end (), + src.begin (), + src.end (), region_less ); std::vector sizes; std::transform ( - src.elements.begin (), - src.elements.end (), + src.begin (), + src.end (), std::back_inserter (sizes), [] (auto const &val) { return val.size (); } ); @@ -141,8 +143,8 @@ int main (int argc, char const **argv) std::cout << myaccum << '\n'; auto const total_bytes = std::accumulate ( - src.elements.begin (), - src.elements.end (), + src.begin (), + src.end (), 0, [] (auto const accum, auto const rhs) { @@ -152,8 +154,8 @@ int main (int argc, char const **argv) std::vector unique; std::unique_copy ( - src.elements.begin (), - src.elements.end (), + src.begin (), + src.end (), std::back_inserter (unique), region_equal ); @@ -173,5 +175,5 @@ int main (int argc, char const **argv) 100.f * duplicated_fraction ); - std::cout << (src.elements.size () - unique.size ()) << " duplicates\n"; + std::cout << (src.size () - unique.size ()) << " duplicates\n"; } diff --git a/tools/compare.cpp b/tools/compare.cpp index b8de046..80933d2 100644 --- a/tools/compare.cpp +++ b/tools/compare.cpp @@ -7,7 +7,8 @@ */ #include "emory/chunk/params.hpp" -#include "emory/chunk/map.hpp" +#include "emory/chunk/find.hpp" +#include "emory/chunk/region.hpp" #include "emory/chunk/match.hpp" #include "emory/chunk/ostream.hpp" @@ -44,17 +45,26 @@ int main (int argc, char const **argv) }; std::clog << "Hashing target\n"; - emory::chunk::map target (cruft::mapped_file (argv[ARGS_TARGET]), p); - std::sort (target.elements.begin (), target.elements.end (), emory::chunk::region::digest_ordering {}); - std::cout << "Found " << target.elements.size () << " chunks\n"; + std::vector target; + emory::chunk::find ( + std::back_inserter (target), + cruft::mapped_file (argv[ARGS_TARGET]), p + ); + + std::sort (target.begin (), target.end (), emory::chunk::region::digest_ordering {}); + std::cout << "Found " << target.size () << " chunks\n"; std::vector> found; for (int i = ARGS_SOURCE; i != argc; ++i) { auto const path = argv[i]; std::clog << "Hashing source: " << path << '\n'; - emory::chunk::map source (cruft::mapped_file (path), p); - std::sort (source.elements.begin (), source.elements.end (), emory::chunk::region::digest_ordering {}); + std::vector source; + emory::chunk::find ( + std::back_inserter (source), + cruft::mapped_file (path), p + ); + std::sort (source.begin (), source.end (), emory::chunk::region::digest_ordering {}); std::clog << "Finding common\n"; auto const &source_matches = common (target, source); @@ -99,8 +109,8 @@ int main (int argc, char const **argv) } std::size_t const total = std::accumulate ( - target.elements.begin (), - target.elements.end (), + target.begin (), + target.end (), 0u, [] (auto const &a, auto const &b) { return a + b.offset.second - b.offset.first; } );