chunk/find: remove map in favour of bare queries
This commit is contained in:
parent
db6c4f54a2
commit
830fb1f47e
@ -32,8 +32,8 @@ find_package (fmt REQUIRED)
|
|||||||
list(APPEND libemory_sources
|
list(APPEND libemory_sources
|
||||||
emory/chunk/fwd.cpp
|
emory/chunk/fwd.cpp
|
||||||
emory/chunk/fwd.hpp
|
emory/chunk/fwd.hpp
|
||||||
emory/chunk/map.cpp
|
emory/chunk/find.hpp
|
||||||
emory/chunk/map.hpp
|
emory/chunk/find.cpp
|
||||||
emory/chunk/match.cpp
|
emory/chunk/match.cpp
|
||||||
emory/chunk/match.hpp
|
emory/chunk/match.hpp
|
||||||
emory/chunk/ostream.cpp
|
emory/chunk/ostream.cpp
|
||||||
|
76
emory/chunk/find.hpp
Normal file
76
emory/chunk/find.hpp
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
/*
|
||||||
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||||
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||||
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
*
|
||||||
|
* Copyright 2019, Danny Robson <danny@nerdcruft.net>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "params.hpp"
|
||||||
|
|
||||||
|
#include <cruft/util/hash/buzhash.hpp>
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
namespace emory::chunk {
|
||||||
|
template <typename HashT, typename OutputT>
|
||||||
|
OutputT
|
||||||
|
find (
|
||||||
|
OutputT &&dst,
|
||||||
|
cruft::view<u08 const*> src,
|
||||||
|
emory::chunk::params const &p
|
||||||
|
) {
|
||||||
|
using hash_type = cruft::hash::buzhash<u64>;
|
||||||
|
if (src.size () < p.window)
|
||||||
|
return dst;
|
||||||
|
|
||||||
|
using digest_type = hash_type::digest_type ;
|
||||||
|
digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits);
|
||||||
|
|
||||||
|
u64 hash_state = 0;
|
||||||
|
auto start = src.begin ();
|
||||||
|
|
||||||
|
for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) {
|
||||||
|
hash_state = 0;
|
||||||
|
cursor += p.minimum - p.window;
|
||||||
|
for (std::size_t i = 0; i < p.window; ++i)
|
||||||
|
hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++;
|
||||||
|
|
||||||
|
for ( ; cursor < src.end () - p.window; ++cursor) {
|
||||||
|
if (likely (hash_state & mask)) {
|
||||||
|
hash_state = cruft::rotatel (hash_state, 1)
|
||||||
|
^ cruft::rotatel (u64 (*(cursor - p.window)), p.window)
|
||||||
|
^ *cursor;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
cruft::view<u08 const*> const region { start, cursor };
|
||||||
|
|
||||||
|
*dst = {
|
||||||
|
.offset = {
|
||||||
|
std::distance (src.begin (), start),
|
||||||
|
std::distance (src.begin (), cursor)
|
||||||
|
},
|
||||||
|
.digest = HashT {} (region)
|
||||||
|
};
|
||||||
|
|
||||||
|
start = cursor;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (start != src.end ()) {
|
||||||
|
cruft::view<u08 const*> const region { start, src.end () };
|
||||||
|
|
||||||
|
*dst++ = {
|
||||||
|
.offset = {
|
||||||
|
std::distance (src.begin (), start),
|
||||||
|
std::distance (src.begin (), src.end ())
|
||||||
|
},
|
||||||
|
.digest = HashT {} (region)
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
return dst;
|
||||||
|
}
|
||||||
|
}
|
@ -11,7 +11,6 @@
|
|||||||
namespace emory::chunk {
|
namespace emory::chunk {
|
||||||
struct params;
|
struct params;
|
||||||
struct region;
|
struct region;
|
||||||
struct map;
|
|
||||||
|
|
||||||
template <typename IdT> struct match;
|
template <typename IdT> struct match;
|
||||||
};
|
};
|
@ -1,87 +0,0 @@
|
|||||||
/*
|
|
||||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
||||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
||||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
||||||
*
|
|
||||||
* Copyright 2019, Danny Robson <danny@nerdcruft.net>
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "map.hpp"
|
|
||||||
|
|
||||||
#include "params.hpp"
|
|
||||||
|
|
||||||
#include <cruft/util/hash/buzhash.hpp>
|
|
||||||
|
|
||||||
using emory::chunk::map;
|
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
|
||||||
template <typename HashT, typename OutputT>
|
|
||||||
static
|
|
||||||
OutputT
|
|
||||||
find_chunks (
|
|
||||||
OutputT &&dst,
|
|
||||||
cruft::view<u08 const*> src,
|
|
||||||
emory::chunk::params const &p
|
|
||||||
) {
|
|
||||||
using hash_type = cruft::hash::buzhash<u64>;
|
|
||||||
if (src.size () < p.window)
|
|
||||||
return dst;
|
|
||||||
|
|
||||||
using digest_type = hash_type::digest_type ;
|
|
||||||
digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits);
|
|
||||||
|
|
||||||
u64 hash_state = 0;
|
|
||||||
auto start = src.begin ();
|
|
||||||
|
|
||||||
for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) {
|
|
||||||
hash_state = 0;
|
|
||||||
cursor += p.minimum - p.window;
|
|
||||||
for (std::size_t i = 0; i < p.window; ++i)
|
|
||||||
hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++;
|
|
||||||
|
|
||||||
for ( ; cursor < src.end () - p.window; ++cursor) {
|
|
||||||
if (likely (hash_state & mask)) {
|
|
||||||
hash_state = cruft::rotatel (hash_state, 1)
|
|
||||||
^ cruft::rotatel (u64 (*(cursor - p.window)), p.window)
|
|
||||||
^ *cursor;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
cruft::view<u08 const*> const region { start, cursor };
|
|
||||||
|
|
||||||
*dst = {
|
|
||||||
.offset = {
|
|
||||||
std::distance (src.begin (), start),
|
|
||||||
std::distance (src.begin (), cursor)
|
|
||||||
},
|
|
||||||
.digest = HashT {} (region)
|
|
||||||
};
|
|
||||||
|
|
||||||
start = cursor;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (start != src.end ()) {
|
|
||||||
cruft::view<u08 const*> const region { start, src.end () };
|
|
||||||
|
|
||||||
*dst++ = {
|
|
||||||
.offset = {
|
|
||||||
std::distance (src.begin (), start),
|
|
||||||
std::distance (src.begin (), src.end ())
|
|
||||||
},
|
|
||||||
.digest = HashT {} (region)
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
return dst;
|
|
||||||
}
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
|
||||||
map::map (
|
|
||||||
cruft::view<u08 const *> src,
|
|
||||||
emory::chunk::params const &p
|
|
||||||
) {
|
|
||||||
::find_chunks<static_hash> (std::back_inserter (elements), src, p);
|
|
||||||
}
|
|
@ -1,28 +0,0 @@
|
|||||||
/*
|
|
||||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
||||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
||||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
||||||
*
|
|
||||||
* Copyright 2019, Danny Robson <danny@nerdcruft.net>
|
|
||||||
*/
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include "fwd.hpp"
|
|
||||||
|
|
||||||
#include "region.hpp"
|
|
||||||
|
|
||||||
#include <cruft/util/view.hpp>
|
|
||||||
#include <cruft/util/std.hpp>
|
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
namespace emory::chunk {
|
|
||||||
struct map {
|
|
||||||
map (cruft::view<u08 const*> data, params const&);
|
|
||||||
|
|
||||||
std::size_t size (void) const noexcept { return elements.size (); }
|
|
||||||
|
|
||||||
std::vector<region> elements;
|
|
||||||
};
|
|
||||||
}
|
|
@ -8,20 +8,18 @@
|
|||||||
|
|
||||||
#include "match.hpp"
|
#include "match.hpp"
|
||||||
|
|
||||||
#include "map.hpp"
|
|
||||||
|
|
||||||
std::vector<
|
std::vector<
|
||||||
emory::chunk::match<emory::chunk::map const*>
|
emory::chunk::match<std::vector<emory::chunk::region> const*>
|
||||||
>
|
>
|
||||||
emory::chunk::common (map const &src, map const &dst)
|
emory::chunk::common (std::vector<region> const &src, std::vector<region> const &dst)
|
||||||
{
|
{
|
||||||
CHECK (std::is_sorted (src.elements.begin (), src.elements.end (), region::digest_ordering {}));
|
CHECK (std::is_sorted (src.begin (), src.end (), region::digest_ordering {}));
|
||||||
CHECK (std::is_sorted (dst.elements.begin (), dst.elements.end (), region::digest_ordering {}));
|
CHECK (std::is_sorted (dst.begin (), dst.end (), region::digest_ordering {}));
|
||||||
|
|
||||||
std::vector<match<map const*>> res;
|
std::vector<match<std::vector<region> const*>> res;
|
||||||
|
|
||||||
for (auto src_cursor = src.elements.begin (), dst_cursor = dst.elements.begin ();
|
for (auto src_cursor = src.begin (), dst_cursor = dst.begin ();
|
||||||
src_cursor != src.elements.end () && dst_cursor != dst.elements.end ();
|
src_cursor != src.end () && dst_cursor != dst.end ();
|
||||||
/* nothing */)
|
/* nothing */)
|
||||||
{
|
{
|
||||||
if (src_cursor->digest < dst_cursor->digest) {
|
if (src_cursor->digest < dst_cursor->digest) {
|
||||||
|
@ -51,6 +51,6 @@ namespace emory::chunk {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
std::vector<match<map const*>>
|
std::vector<match<std::vector<region> const*>>
|
||||||
common (map const &a, map const &b);
|
common (std::vector<region> const &a, std::vector<region> const &b);
|
||||||
}
|
}
|
||||||
|
@ -6,7 +6,8 @@
|
|||||||
* Copyright 2019 Danny Robson <danny@nerdcruft.net>
|
* Copyright 2019 Danny Robson <danny@nerdcruft.net>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "emory/chunk/map.hpp"
|
#include "emory/chunk/find.hpp"
|
||||||
|
#include "emory/chunk/region.hpp"
|
||||||
#include "emory/chunk/params.hpp"
|
#include "emory/chunk/params.hpp"
|
||||||
#include "emory/chunk/ostream.hpp"
|
#include "emory/chunk/ostream.hpp"
|
||||||
|
|
||||||
@ -102,38 +103,39 @@ int main (int argc, char const **argv)
|
|||||||
std::cout << "size: " << data.size () << '\n';
|
std::cout << "size: " << data.size () << '\n';
|
||||||
|
|
||||||
std::cout << "processing\n";
|
std::cout << "processing\n";
|
||||||
emory::chunk::map src (data, p);
|
std::vector<emory::chunk::region> src;
|
||||||
|
emory::chunk::find<emory::chunk::static_hash> (std::back_inserter (src), data, p);
|
||||||
|
|
||||||
std::cout << "validating\n";
|
std::cout << "validating\n";
|
||||||
std::cout << src.size () << " chunks\n";
|
std::cout << src.size () << " chunks\n";
|
||||||
std::sort (
|
std::sort (
|
||||||
src.elements.begin (),
|
src.begin (),
|
||||||
src.elements.end (),
|
src.end (),
|
||||||
[] (auto const &a, auto const &b) { return a.offset.first < b.offset.first; }
|
[] (auto const &a, auto const &b) { return a.offset.first < b.offset.first; }
|
||||||
);
|
);
|
||||||
for (int i = 0, cursor = 0; i < std::ssize (src.elements); ++i) {
|
for (off_t i = 0, cursor = 0; i < std::ssize (src); ++i) {
|
||||||
if (src.elements[i].offset.first != cursor) {
|
if (src[i].offset.first != cursor) {
|
||||||
std::cout << "non-overlapping chunks\n";
|
std::cout << "non-overlapping chunks\n";
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
cursor = src.elements[i].offset.second;
|
cursor = src[i].offset.second;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (src.elements.back ().offset.second != std::ssize (data)) {
|
if (src.back ().offset.second != std::ssize (data)) {
|
||||||
std::cout << "invalid total size\n";
|
std::cout << "invalid total size\n";
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::sort (
|
std::sort (
|
||||||
src.elements.begin (),
|
src.begin (),
|
||||||
src.elements.end (),
|
src.end (),
|
||||||
region_less
|
region_less
|
||||||
);
|
);
|
||||||
|
|
||||||
std::vector<off64_t> sizes;
|
std::vector<off64_t> sizes;
|
||||||
std::transform (
|
std::transform (
|
||||||
src.elements.begin (),
|
src.begin (),
|
||||||
src.elements.end (),
|
src.end (),
|
||||||
std::back_inserter (sizes),
|
std::back_inserter (sizes),
|
||||||
[] (auto const &val) { return val.size (); }
|
[] (auto const &val) { return val.size (); }
|
||||||
);
|
);
|
||||||
@ -141,8 +143,8 @@ int main (int argc, char const **argv)
|
|||||||
std::cout << myaccum << '\n';
|
std::cout << myaccum << '\n';
|
||||||
|
|
||||||
auto const total_bytes = std::accumulate (
|
auto const total_bytes = std::accumulate (
|
||||||
src.elements.begin (),
|
src.begin (),
|
||||||
src.elements.end (),
|
src.end (),
|
||||||
0,
|
0,
|
||||||
[] (auto const accum, auto const rhs)
|
[] (auto const accum, auto const rhs)
|
||||||
{
|
{
|
||||||
@ -152,8 +154,8 @@ int main (int argc, char const **argv)
|
|||||||
|
|
||||||
std::vector<emory::chunk::region> unique;
|
std::vector<emory::chunk::region> unique;
|
||||||
std::unique_copy (
|
std::unique_copy (
|
||||||
src.elements.begin (),
|
src.begin (),
|
||||||
src.elements.end (),
|
src.end (),
|
||||||
std::back_inserter (unique),
|
std::back_inserter (unique),
|
||||||
region_equal
|
region_equal
|
||||||
);
|
);
|
||||||
@ -173,5 +175,5 @@ int main (int argc, char const **argv)
|
|||||||
100.f * duplicated_fraction
|
100.f * duplicated_fraction
|
||||||
);
|
);
|
||||||
|
|
||||||
std::cout << (src.elements.size () - unique.size ()) << " duplicates\n";
|
std::cout << (src.size () - unique.size ()) << " duplicates\n";
|
||||||
}
|
}
|
||||||
|
@ -7,7 +7,8 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include "emory/chunk/params.hpp"
|
#include "emory/chunk/params.hpp"
|
||||||
#include "emory/chunk/map.hpp"
|
#include "emory/chunk/find.hpp"
|
||||||
|
#include "emory/chunk/region.hpp"
|
||||||
#include "emory/chunk/match.hpp"
|
#include "emory/chunk/match.hpp"
|
||||||
#include "emory/chunk/ostream.hpp"
|
#include "emory/chunk/ostream.hpp"
|
||||||
|
|
||||||
@ -44,17 +45,26 @@ int main (int argc, char const **argv)
|
|||||||
};
|
};
|
||||||
|
|
||||||
std::clog << "Hashing target\n";
|
std::clog << "Hashing target\n";
|
||||||
emory::chunk::map target (cruft::mapped_file (argv[ARGS_TARGET]), p);
|
std::vector<emory::chunk::region> target;
|
||||||
std::sort (target.elements.begin (), target.elements.end (), emory::chunk::region::digest_ordering {});
|
emory::chunk::find<emory::chunk::static_hash> (
|
||||||
std::cout << "Found " << target.elements.size () << " chunks\n";
|
std::back_inserter (target),
|
||||||
|
cruft::mapped_file (argv[ARGS_TARGET]), p
|
||||||
|
);
|
||||||
|
|
||||||
|
std::sort (target.begin (), target.end (), emory::chunk::region::digest_ordering {});
|
||||||
|
std::cout << "Found " << target.size () << " chunks\n";
|
||||||
|
|
||||||
std::vector<emory::chunk::match<int>> found;
|
std::vector<emory::chunk::match<int>> found;
|
||||||
|
|
||||||
for (int i = ARGS_SOURCE; i != argc; ++i) {
|
for (int i = ARGS_SOURCE; i != argc; ++i) {
|
||||||
auto const path = argv[i];
|
auto const path = argv[i];
|
||||||
std::clog << "Hashing source: " << path << '\n';
|
std::clog << "Hashing source: " << path << '\n';
|
||||||
emory::chunk::map source (cruft::mapped_file (path), p);
|
std::vector<emory::chunk::region> source;
|
||||||
std::sort (source.elements.begin (), source.elements.end (), emory::chunk::region::digest_ordering {});
|
emory::chunk::find<emory::chunk::static_hash> (
|
||||||
|
std::back_inserter (source),
|
||||||
|
cruft::mapped_file (path), p
|
||||||
|
);
|
||||||
|
std::sort (source.begin (), source.end (), emory::chunk::region::digest_ordering {});
|
||||||
|
|
||||||
std::clog << "Finding common\n";
|
std::clog << "Finding common\n";
|
||||||
auto const &source_matches = common (target, source);
|
auto const &source_matches = common (target, source);
|
||||||
@ -99,8 +109,8 @@ int main (int argc, char const **argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::size_t const total = std::accumulate (
|
std::size_t const total = std::accumulate (
|
||||||
target.elements.begin (),
|
target.begin (),
|
||||||
target.elements.end (),
|
target.end (),
|
||||||
0u,
|
0u,
|
||||||
[] (auto const &a, auto const &b) { return a + b.offset.second - b.offset.first; }
|
[] (auto const &a, auto const &b) { return a + b.offset.second - b.offset.first; }
|
||||||
);
|
);
|
||||||
|
Loading…
Reference in New Issue
Block a user