chunk/find: remove map in favour of bare queries
This commit is contained in:
parent
db6c4f54a2
commit
830fb1f47e
@ -32,8 +32,8 @@ find_package (fmt REQUIRED)
|
||||
list(APPEND libemory_sources
|
||||
emory/chunk/fwd.cpp
|
||||
emory/chunk/fwd.hpp
|
||||
emory/chunk/map.cpp
|
||||
emory/chunk/map.hpp
|
||||
emory/chunk/find.hpp
|
||||
emory/chunk/find.cpp
|
||||
emory/chunk/match.cpp
|
||||
emory/chunk/match.hpp
|
||||
emory/chunk/ostream.cpp
|
||||
|
76
emory/chunk/find.hpp
Normal file
76
emory/chunk/find.hpp
Normal file
@ -0,0 +1,76 @@
|
||||
/*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
*
|
||||
* Copyright 2019, Danny Robson <danny@nerdcruft.net>
|
||||
*/
|
||||
|
||||
#include "params.hpp"
|
||||
|
||||
#include <cruft/util/hash/buzhash.hpp>
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
namespace emory::chunk {
|
||||
template <typename HashT, typename OutputT>
|
||||
OutputT
|
||||
find (
|
||||
OutputT &&dst,
|
||||
cruft::view<u08 const*> src,
|
||||
emory::chunk::params const &p
|
||||
) {
|
||||
using hash_type = cruft::hash::buzhash<u64>;
|
||||
if (src.size () < p.window)
|
||||
return dst;
|
||||
|
||||
using digest_type = hash_type::digest_type ;
|
||||
digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits);
|
||||
|
||||
u64 hash_state = 0;
|
||||
auto start = src.begin ();
|
||||
|
||||
for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) {
|
||||
hash_state = 0;
|
||||
cursor += p.minimum - p.window;
|
||||
for (std::size_t i = 0; i < p.window; ++i)
|
||||
hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++;
|
||||
|
||||
for ( ; cursor < src.end () - p.window; ++cursor) {
|
||||
if (likely (hash_state & mask)) {
|
||||
hash_state = cruft::rotatel (hash_state, 1)
|
||||
^ cruft::rotatel (u64 (*(cursor - p.window)), p.window)
|
||||
^ *cursor;
|
||||
continue;
|
||||
}
|
||||
|
||||
cruft::view<u08 const*> const region { start, cursor };
|
||||
|
||||
*dst = {
|
||||
.offset = {
|
||||
std::distance (src.begin (), start),
|
||||
std::distance (src.begin (), cursor)
|
||||
},
|
||||
.digest = HashT {} (region)
|
||||
};
|
||||
|
||||
start = cursor;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (start != src.end ()) {
|
||||
cruft::view<u08 const*> const region { start, src.end () };
|
||||
|
||||
*dst++ = {
|
||||
.offset = {
|
||||
std::distance (src.begin (), start),
|
||||
std::distance (src.begin (), src.end ())
|
||||
},
|
||||
.digest = HashT {} (region)
|
||||
};
|
||||
}
|
||||
|
||||
return dst;
|
||||
}
|
||||
}
|
@ -11,7 +11,6 @@
|
||||
// Forward declarations of the types that live in emory::chunk.
namespace emory::chunk {
|
||||
struct params;
|
||||
struct region;
|
||||
struct map;
|
||||
|
||||
// `match` is a template parameterised on an identifier type.
template <typename IdT> struct match;
|
||||
};
|
@ -1,87 +0,0 @@
|
||||
/*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
*
|
||||
* Copyright 2019, Danny Robson <danny@nerdcruft.net>
|
||||
*/
|
||||
|
||||
#include "map.hpp"
|
||||
|
||||
#include "params.hpp"
|
||||
|
||||
#include <cruft/util/hash/buzhash.hpp>
|
||||
|
||||
using emory::chunk::map;
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
template <typename HashT, typename OutputT>
|
||||
static
|
||||
OutputT
|
||||
find_chunks (
|
||||
OutputT &&dst,
|
||||
cruft::view<u08 const*> src,
|
||||
emory::chunk::params const &p
|
||||
) {
|
||||
using hash_type = cruft::hash::buzhash<u64>;
|
||||
if (src.size () < p.window)
|
||||
return dst;
|
||||
|
||||
using digest_type = hash_type::digest_type ;
|
||||
digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits);
|
||||
|
||||
u64 hash_state = 0;
|
||||
auto start = src.begin ();
|
||||
|
||||
for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) {
|
||||
hash_state = 0;
|
||||
cursor += p.minimum - p.window;
|
||||
for (std::size_t i = 0; i < p.window; ++i)
|
||||
hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++;
|
||||
|
||||
for ( ; cursor < src.end () - p.window; ++cursor) {
|
||||
if (likely (hash_state & mask)) {
|
||||
hash_state = cruft::rotatel (hash_state, 1)
|
||||
^ cruft::rotatel (u64 (*(cursor - p.window)), p.window)
|
||||
^ *cursor;
|
||||
continue;
|
||||
}
|
||||
|
||||
cruft::view<u08 const*> const region { start, cursor };
|
||||
|
||||
*dst = {
|
||||
.offset = {
|
||||
std::distance (src.begin (), start),
|
||||
std::distance (src.begin (), cursor)
|
||||
},
|
||||
.digest = HashT {} (region)
|
||||
};
|
||||
|
||||
start = cursor;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (start != src.end ()) {
|
||||
cruft::view<u08 const*> const region { start, src.end () };
|
||||
|
||||
*dst++ = {
|
||||
.offset = {
|
||||
std::distance (src.begin (), start),
|
||||
std::distance (src.begin (), src.end ())
|
||||
},
|
||||
.digest = HashT {} (region)
|
||||
};
|
||||
}
|
||||
|
||||
return dst;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
/// Construct a map by chunking `src` with the parameters `p`: every record
/// produced by find_chunks (using `static_hash` for the digests) is appended
/// to `elements`.
map::map (
|
||||
cruft::view<u08 const *> src,
|
||||
emory::chunk::params const &p
|
||||
) {
|
||||
::find_chunks<static_hash> (std::back_inserter (elements), src, p);
|
||||
}
|
@ -1,28 +0,0 @@
|
||||
/*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
*
|
||||
* Copyright 2019, Danny Robson <danny@nerdcruft.net>
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "fwd.hpp"
|
||||
|
||||
#include "region.hpp"
|
||||
|
||||
#include <cruft/util/view.hpp>
|
||||
#include <cruft/util/std.hpp>
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace emory::chunk {
|
||||
/// A thin wrapper around a vector of `region` records.
struct map {
|
||||
/// Construct from a view of bytes and a set of chunking parameters.
map (cruft::view<u08 const*> data, params const&);
|
||||
|
||||
/// The number of regions currently held in `elements`.
std::size_t size (void) const noexcept { return elements.size (); }
|
||||
|
||||
/// The stored regions; publicly accessible.
std::vector<region> elements;
|
||||
};
|
||||
}
|
@ -8,20 +8,18 @@
|
||||
|
||||
#include "match.hpp"
|
||||
|
||||
#include "map.hpp"
|
||||
|
||||
std::vector<
|
||||
emory::chunk::match<emory::chunk::map const*>
|
||||
emory::chunk::match<std::vector<emory::chunk::region> const*>
|
||||
>
|
||||
emory::chunk::common (map const &src, map const &dst)
|
||||
emory::chunk::common (std::vector<region> const &src, std::vector<region> const &dst)
|
||||
{
|
||||
CHECK (std::is_sorted (src.elements.begin (), src.elements.end (), region::digest_ordering {}));
|
||||
CHECK (std::is_sorted (dst.elements.begin (), dst.elements.end (), region::digest_ordering {}));
|
||||
CHECK (std::is_sorted (src.begin (), src.end (), region::digest_ordering {}));
|
||||
CHECK (std::is_sorted (dst.begin (), dst.end (), region::digest_ordering {}));
|
||||
|
||||
std::vector<match<map const*>> res;
|
||||
std::vector<match<std::vector<region> const*>> res;
|
||||
|
||||
for (auto src_cursor = src.elements.begin (), dst_cursor = dst.elements.begin ();
|
||||
src_cursor != src.elements.end () && dst_cursor != dst.elements.end ();
|
||||
for (auto src_cursor = src.begin (), dst_cursor = dst.begin ();
|
||||
src_cursor != src.end () && dst_cursor != dst.end ();
|
||||
/* nothing */)
|
||||
{
|
||||
if (src_cursor->digest < dst_cursor->digest) {
|
||||
|
@ -51,6 +51,6 @@ namespace emory::chunk {
|
||||
};
|
||||
|
||||
|
||||
std::vector<match<map const*>>
|
||||
common (map const &a, map const &b);
|
||||
std::vector<match<std::vector<region> const*>>
|
||||
common (std::vector<region> const &a, std::vector<region> const &b);
|
||||
}
|
||||
|
@ -6,7 +6,8 @@
|
||||
* Copyright 2019 Danny Robson <danny@nerdcruft.net>
|
||||
*/
|
||||
|
||||
#include "emory/chunk/map.hpp"
|
||||
#include "emory/chunk/find.hpp"
|
||||
#include "emory/chunk/region.hpp"
|
||||
#include "emory/chunk/params.hpp"
|
||||
#include "emory/chunk/ostream.hpp"
|
||||
|
||||
@ -102,38 +103,39 @@ int main (int argc, char const **argv)
|
||||
std::cout << "size: " << data.size () << '\n';
|
||||
|
||||
std::cout << "processing\n";
|
||||
emory::chunk::map src (data, p);
|
||||
std::vector<emory::chunk::region> src;
|
||||
emory::chunk::find<emory::chunk::static_hash> (std::back_inserter (src), data, p);
|
||||
|
||||
std::cout << "validating\n";
|
||||
std::cout << src.size () << " chunks\n";
|
||||
std::sort (
|
||||
src.elements.begin (),
|
||||
src.elements.end (),
|
||||
src.begin (),
|
||||
src.end (),
|
||||
[] (auto const &a, auto const &b) { return a.offset.first < b.offset.first; }
|
||||
);
|
||||
for (int i = 0, cursor = 0; i < std::ssize (src.elements); ++i) {
|
||||
if (src.elements[i].offset.first != cursor) {
|
||||
for (off_t i = 0, cursor = 0; i < std::ssize (src); ++i) {
|
||||
if (src[i].offset.first != cursor) {
|
||||
std::cout << "non-overlapping chunks\n";
|
||||
return -1;
|
||||
}
|
||||
cursor = src.elements[i].offset.second;
|
||||
cursor = src[i].offset.second;
|
||||
}
|
||||
|
||||
if (src.elements.back ().offset.second != std::ssize (data)) {
|
||||
if (src.back ().offset.second != std::ssize (data)) {
|
||||
std::cout << "invalid total size\n";
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::sort (
|
||||
src.elements.begin (),
|
||||
src.elements.end (),
|
||||
src.begin (),
|
||||
src.end (),
|
||||
region_less
|
||||
);
|
||||
|
||||
std::vector<off64_t> sizes;
|
||||
std::transform (
|
||||
src.elements.begin (),
|
||||
src.elements.end (),
|
||||
src.begin (),
|
||||
src.end (),
|
||||
std::back_inserter (sizes),
|
||||
[] (auto const &val) { return val.size (); }
|
||||
);
|
||||
@ -141,8 +143,8 @@ int main (int argc, char const **argv)
|
||||
std::cout << myaccum << '\n';
|
||||
|
||||
auto const total_bytes = std::accumulate (
|
||||
src.elements.begin (),
|
||||
src.elements.end (),
|
||||
src.begin (),
|
||||
src.end (),
|
||||
0,
|
||||
[] (auto const accum, auto const rhs)
|
||||
{
|
||||
@ -152,8 +154,8 @@ int main (int argc, char const **argv)
|
||||
|
||||
std::vector<emory::chunk::region> unique;
|
||||
std::unique_copy (
|
||||
src.elements.begin (),
|
||||
src.elements.end (),
|
||||
src.begin (),
|
||||
src.end (),
|
||||
std::back_inserter (unique),
|
||||
region_equal
|
||||
);
|
||||
@ -173,5 +175,5 @@ int main (int argc, char const **argv)
|
||||
100.f * duplicated_fraction
|
||||
);
|
||||
|
||||
std::cout << (src.elements.size () - unique.size ()) << " duplicates\n";
|
||||
std::cout << (src.size () - unique.size ()) << " duplicates\n";
|
||||
}
|
||||
|
@ -7,7 +7,8 @@
|
||||
*/
|
||||
|
||||
#include "emory/chunk/params.hpp"
|
||||
#include "emory/chunk/map.hpp"
|
||||
#include "emory/chunk/find.hpp"
|
||||
#include "emory/chunk/region.hpp"
|
||||
#include "emory/chunk/match.hpp"
|
||||
#include "emory/chunk/ostream.hpp"
|
||||
|
||||
@ -44,17 +45,26 @@ int main (int argc, char const **argv)
|
||||
};
|
||||
|
||||
std::clog << "Hashing target\n";
|
||||
emory::chunk::map target (cruft::mapped_file (argv[ARGS_TARGET]), p);
|
||||
std::sort (target.elements.begin (), target.elements.end (), emory::chunk::region::digest_ordering {});
|
||||
std::cout << "Found " << target.elements.size () << " chunks\n";
|
||||
std::vector<emory::chunk::region> target;
|
||||
emory::chunk::find<emory::chunk::static_hash> (
|
||||
std::back_inserter (target),
|
||||
cruft::mapped_file (argv[ARGS_TARGET]), p
|
||||
);
|
||||
|
||||
std::sort (target.begin (), target.end (), emory::chunk::region::digest_ordering {});
|
||||
std::cout << "Found " << target.size () << " chunks\n";
|
||||
|
||||
std::vector<emory::chunk::match<int>> found;
|
||||
|
||||
for (int i = ARGS_SOURCE; i != argc; ++i) {
|
||||
auto const path = argv[i];
|
||||
std::clog << "Hashing source: " << path << '\n';
|
||||
emory::chunk::map source (cruft::mapped_file (path), p);
|
||||
std::sort (source.elements.begin (), source.elements.end (), emory::chunk::region::digest_ordering {});
|
||||
std::vector<emory::chunk::region> source;
|
||||
emory::chunk::find<emory::chunk::static_hash> (
|
||||
std::back_inserter (source),
|
||||
cruft::mapped_file (path), p
|
||||
);
|
||||
std::sort (source.begin (), source.end (), emory::chunk::region::digest_ordering {});
|
||||
|
||||
std::clog << "Finding common\n";
|
||||
auto const &source_matches = common (target, source);
|
||||
@ -99,8 +109,8 @@ int main (int argc, char const **argv)
|
||||
}
|
||||
|
||||
std::size_t const total = std::accumulate (
|
||||
target.elements.begin (),
|
||||
target.elements.end (),
|
||||
target.begin (),
|
||||
target.end (),
|
||||
0u,
|
||||
[] (auto const &a, auto const &b) { return a + b.offset.second - b.offset.first; }
|
||||
);
|
||||
|
Loading…
Reference in New Issue
Block a user