chunk/find: remove map in favour of bare queries

This commit is contained in:
Danny Robson 2020-12-06 09:34:33 +10:00
parent db6c4f54a2
commit 830fb1f47e
9 changed files with 124 additions and 154 deletions

View File

@ -32,8 +32,8 @@ find_package (fmt REQUIRED)
list(APPEND libemory_sources
emory/chunk/fwd.cpp
emory/chunk/fwd.hpp
emory/chunk/map.cpp
emory/chunk/map.hpp
emory/chunk/find.hpp
emory/chunk/find.cpp
emory/chunk/match.cpp
emory/chunk/match.hpp
emory/chunk/ostream.cpp

76
emory/chunk/find.hpp Normal file
View File

@ -0,0 +1,76 @@
/*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* Copyright 2019, Danny Robson <danny@nerdcruft.net>
*/
#pragma once

#include "params.hpp"

#include <cruft/util/hash/buzhash.hpp>
#include <cruft/util/std.hpp>
#include <cruft/util/view.hpp>

#include <cstddef>
#include <iterator>
///////////////////////////////////////////////////////////////////////////////
namespace emory::chunk {
    /// Splits `src` into content-defined chunks and writes one record per
    /// chunk to `dst`.
    ///
    /// A rolling rotate-and-xor hash over the trailing `p.window` bytes is
    /// maintained. A chunk boundary is declared at the first position, at
    /// least `p.minimum` bytes past the start of the current chunk, where the
    /// hash masked to its low `p.bits` bits is zero. Positions within the
    /// final `p.window` bytes of `src` are never tested. Whatever remains
    /// after the last boundary is emitted as one final chunk.
    ///
    /// Each record is assigned as `{ .offset = { first, last }, .digest }`;
    /// the offsets are distances from `src.begin ()` to the start and end of
    /// the chunk, and the digest is `HashT {} (view of the chunk)`.
    ///
    /// \tparam HashT    default-constructible callable applied to a
    ///                  `cruft::view<u08 const*>` covering each chunk.
    /// \tparam OutputT  output iterator accepting the record described above.
    ///
    /// \param dst  destination iterator; written and advanced once per chunk.
    /// \param src  the bytes to split.
    /// \param p    supplies `window`, `minimum` and `bits`.
    ///             NOTE(review): this presumably requires
    ///             `p.window <= p.minimum` and a non-zero `p.bits` no larger
    ///             than the digest width; neither is checked here.
    ///
    /// \return `dst` after the final write. Nothing is written when `src` is
    ///         shorter than `p.window`.
    template <typename HashT, typename OutputT>
    OutputT
    find (
        OutputT &&dst,
        cruft::view<u08 const*> src,
        emory::chunk::params const &p
    ) {
        // Only the digest type of buzhash is used; the rolling hash below is
        // computed directly on the raw bytes.
        using hash_type = cruft::hash::buzhash<u64>;

        // Too little data to fill even a single hash window.
        if (src.size () < p.window)
            return dst;

        using digest_type = hash_type::digest_type ;

        // Low `p.bits` bits set: a zero result under this mask is a boundary.
        digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits);

        auto start = src.begin ();

        // Compare the remaining length instead of forming
        // `src.end () - p.minimum`, which would point before the buffer when
        // the input is at least one window long but shorter than `p.minimum`.
        for (auto cursor = src.begin ();
             std::distance (cursor, src.end ()) > static_cast<std::ptrdiff_t> (p.minimum);
             /* advanced in the body */)
        {
            // Skip ahead so that the primed window ends exactly `p.minimum`
            // bytes into the current chunk.
            cursor += p.minimum - p.window;

            // Prime the hash with one full window of bytes.
            u64 hash_state = 0;
            for (std::size_t i = 0; i < p.window; ++i)
                hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++;

            for ( ; cursor < src.end () - p.window; ++cursor) {
                if (likely (hash_state & mask)) {
                    // Not a boundary: slide the window one byte. The byte
                    // leaving the window has been rotated `p.window` times in
                    // total, so xor it back out at that rotation.
                    hash_state = cruft::rotatel (hash_state, 1)
                               ^ cruft::rotatel (u64 (*(cursor - p.window)), p.window)
                               ^ *cursor;
                    continue;
                }

                // Boundary found: emit [start, cursor). The iterator must be
                // advanced after the write so that plain output iterators
                // (not only insert iterators) receive every chunk.
                cruft::view<u08 const*> const region { start, cursor };
                *dst++ = {
                    .offset = {
                        std::distance (src.begin (), start),
                        std::distance (src.begin (), cursor)
                    },
                    .digest = HashT {} (region)
                };

                start = cursor;
                break;
            }
        }

        // Emit whatever is left after the last boundary as the final chunk.
        if (start != src.end ()) {
            cruft::view<u08 const*> const region { start, src.end () };
            *dst++ = {
                .offset = {
                    std::distance (src.begin (), start),
                    std::distance (src.begin (), src.end ())
                },
                .digest = HashT {} (region)
            };
        }

        return dst;
    }
}

View File

@ -11,7 +11,6 @@
namespace emory::chunk {
struct params;
struct region;
struct map;
template <typename IdT> struct match;
};

View File

@ -1,87 +0,0 @@
/*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* Copyright 2019, Danny Robson <danny@nerdcruft.net>
*/
#include "map.hpp"
#include "params.hpp"
#include <cruft/util/hash/buzhash.hpp>
using emory::chunk::map;
///////////////////////////////////////////////////////////////////////////////
// Splits `src` into chunks and writes one record per chunk to `dst`.
//
// A rolling rotate-and-xor hash over the trailing `p.window` bytes decides
// where chunks end: a boundary is taken at the first position, at least
// `p.minimum` bytes past the start of the current chunk, where the hash
// masked to its low `p.bits` bits is zero. Any bytes left after the last
// boundary form a final chunk.
//
// Each record carries `.offset` (distances from `src.begin ()` to the start
// and end of the chunk) and `.digest` (`HashT {}` applied to the chunk).
//
// Returns `dst`. Nothing is written when `src` is shorter than `p.window`.
template <typename HashT, typename OutputT>
static
OutputT
find_chunks (
OutputT &&dst,
cruft::view<u08 const*> src,
emory::chunk::params const &p
) {
// Only the digest type of buzhash is used below.
using hash_type = cruft::hash::buzhash<u64>;
// Not enough data to fill one hash window.
if (src.size () < p.window)
return dst;
using digest_type = hash_type::digest_type ;
// Low `p.bits` bits set; a zero result under this mask marks a boundary.
digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits);
u64 hash_state = 0;
// First byte of the chunk currently being built.
auto start = src.begin ();
for (auto cursor = src.begin (); cursor < src.end () - p.minimum; ) {
hash_state = 0;
// Skip ahead so the primed window ends `p.minimum` bytes into the chunk.
cursor += p.minimum - p.window;
// Prime the hash with one full window of bytes.
for (std::size_t i = 0; i < p.window; ++i)
hash_state = cruft::rotatel (hash_state, 1) ^ *cursor++;
for ( ; cursor < src.end () - p.window; ++cursor) {
if (likely (hash_state & mask)) {
// Not a boundary: rotate the state, xor out the byte leaving the window
// (at its accumulated rotation) and xor in the incoming byte.
hash_state = cruft::rotatel (hash_state, 1)
^ cruft::rotatel (u64 (*(cursor - p.window)), p.window)
^ *cursor;
continue;
}
// Boundary found: emit [start, cursor).
cruft::view<u08 const*> const region { start, cursor };
// NOTE(review): no increment after this write, unlike the tail write below.
*dst = {
.offset = {
std::distance (src.begin (), start),
std::distance (src.begin (), cursor)
},
.digest = HashT {} (region)
};
start = cursor;
break;
}
}
// Emit whatever remains after the last boundary as the final chunk.
if (start != src.end ()) {
cruft::view<u08 const*> const region { start, src.end () };
*dst++ = {
.offset = {
std::distance (src.begin (), start),
std::distance (src.begin (), src.end ())
},
.digest = HashT {} (region)
};
}
return dst;
}
///////////////////////////////////////////////////////////////////////////////
// Fills `elements` by running find_chunks over `src` with parameters `p`,
// using `static_hash` as the digest functor and appending each result.
map::map (
cruft::view<u08 const *> src,
emory::chunk::params const &p
) {
::find_chunks<static_hash> (std::back_inserter (elements), src, p);
}

View File

@ -1,28 +0,0 @@
/*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* Copyright 2019, Danny Robson <danny@nerdcruft.net>
*/
#pragma once
#include "fwd.hpp"
#include "region.hpp"
#include <cruft/util/view.hpp>
#include <cruft/util/std.hpp>
#include <vector>
namespace emory::chunk {
// Holds a list of `region` records for a block of data.
struct map {
// Constructs from a view of raw bytes and a set of chunking parameters.
map (cruft::view<u08 const*> data, params const&);
// Number of regions currently stored in `elements`.
std::size_t size (void) const noexcept { return elements.size (); }
// The stored regions; publicly accessible.
std::vector<region> elements;
};
}

View File

@ -8,20 +8,18 @@
#include "match.hpp"
#include "map.hpp"
std::vector<
emory::chunk::match<emory::chunk::map const*>
emory::chunk::match<std::vector<emory::chunk::region> const*>
>
emory::chunk::common (map const &src, map const &dst)
emory::chunk::common (std::vector<region> const &src, std::vector<region> const &dst)
{
CHECK (std::is_sorted (src.elements.begin (), src.elements.end (), region::digest_ordering {}));
CHECK (std::is_sorted (dst.elements.begin (), dst.elements.end (), region::digest_ordering {}));
CHECK (std::is_sorted (src.begin (), src.end (), region::digest_ordering {}));
CHECK (std::is_sorted (dst.begin (), dst.end (), region::digest_ordering {}));
std::vector<match<map const*>> res;
std::vector<match<std::vector<region> const*>> res;
for (auto src_cursor = src.elements.begin (), dst_cursor = dst.elements.begin ();
src_cursor != src.elements.end () && dst_cursor != dst.elements.end ();
for (auto src_cursor = src.begin (), dst_cursor = dst.begin ();
src_cursor != src.end () && dst_cursor != dst.end ();
/* nothing */)
{
if (src_cursor->digest < dst_cursor->digest) {

View File

@ -51,6 +51,6 @@ namespace emory::chunk {
};
std::vector<match<map const*>>
common (map const &a, map const &b);
std::vector<match<std::vector<region> const*>>
common (std::vector<region> const &a, std::vector<region> const &b);
}

View File

@ -6,7 +6,8 @@
* Copyright 2019 Danny Robson <danny@nerdcruft.net>
*/
#include "emory/chunk/map.hpp"
#include "emory/chunk/find.hpp"
#include "emory/chunk/region.hpp"
#include "emory/chunk/params.hpp"
#include "emory/chunk/ostream.hpp"
@ -102,38 +103,39 @@ int main (int argc, char const **argv)
std::cout << "size: " << data.size () << '\n';
std::cout << "processing\n";
emory::chunk::map src (data, p);
std::vector<emory::chunk::region> src;
emory::chunk::find<emory::chunk::static_hash> (std::back_inserter (src), data, p);
std::cout << "validating\n";
std::cout << src.size () << " chunks\n";
std::sort (
src.elements.begin (),
src.elements.end (),
src.begin (),
src.end (),
[] (auto const &a, auto const &b) { return a.offset.first < b.offset.first; }
);
for (int i = 0, cursor = 0; i < std::ssize (src.elements); ++i) {
if (src.elements[i].offset.first != cursor) {
for (off_t i = 0, cursor = 0; i < std::ssize (src); ++i) {
if (src[i].offset.first != cursor) {
std::cout << "non-overlapping chunks\n";
return -1;
}
cursor = src.elements[i].offset.second;
cursor = src[i].offset.second;
}
if (src.elements.back ().offset.second != std::ssize (data)) {
if (src.back ().offset.second != std::ssize (data)) {
std::cout << "invalid total size\n";
return -1;
}
std::sort (
src.elements.begin (),
src.elements.end (),
src.begin (),
src.end (),
region_less
);
std::vector<off64_t> sizes;
std::transform (
src.elements.begin (),
src.elements.end (),
src.begin (),
src.end (),
std::back_inserter (sizes),
[] (auto const &val) { return val.size (); }
);
@ -141,8 +143,8 @@ int main (int argc, char const **argv)
std::cout << myaccum << '\n';
auto const total_bytes = std::accumulate (
src.elements.begin (),
src.elements.end (),
src.begin (),
src.end (),
0,
[] (auto const accum, auto const rhs)
{
@ -152,8 +154,8 @@ int main (int argc, char const **argv)
std::vector<emory::chunk::region> unique;
std::unique_copy (
src.elements.begin (),
src.elements.end (),
src.begin (),
src.end (),
std::back_inserter (unique),
region_equal
);
@ -173,5 +175,5 @@ int main (int argc, char const **argv)
100.f * duplicated_fraction
);
std::cout << (src.elements.size () - unique.size ()) << " duplicates\n";
std::cout << (src.size () - unique.size ()) << " duplicates\n";
}

View File

@ -7,7 +7,8 @@
*/
#include "emory/chunk/params.hpp"
#include "emory/chunk/map.hpp"
#include "emory/chunk/find.hpp"
#include "emory/chunk/region.hpp"
#include "emory/chunk/match.hpp"
#include "emory/chunk/ostream.hpp"
@ -44,17 +45,26 @@ int main (int argc, char const **argv)
};
std::clog << "Hashing target\n";
emory::chunk::map target (cruft::mapped_file (argv[ARGS_TARGET]), p);
std::sort (target.elements.begin (), target.elements.end (), emory::chunk::region::digest_ordering {});
std::cout << "Found " << target.elements.size () << " chunks\n";
std::vector<emory::chunk::region> target;
emory::chunk::find<emory::chunk::static_hash> (
std::back_inserter (target),
cruft::mapped_file (argv[ARGS_TARGET]), p
);
std::sort (target.begin (), target.end (), emory::chunk::region::digest_ordering {});
std::cout << "Found " << target.size () << " chunks\n";
std::vector<emory::chunk::match<int>> found;
for (int i = ARGS_SOURCE; i != argc; ++i) {
auto const path = argv[i];
std::clog << "Hashing source: " << path << '\n';
emory::chunk::map source (cruft::mapped_file (path), p);
std::sort (source.elements.begin (), source.elements.end (), emory::chunk::region::digest_ordering {});
std::vector<emory::chunk::region> source;
emory::chunk::find<emory::chunk::static_hash> (
std::back_inserter (source),
cruft::mapped_file (path), p
);
std::sort (source.begin (), source.end (), emory::chunk::region::digest_ordering {});
std::clog << "Finding common\n";
auto const &source_matches = common (target, source);
@ -99,8 +109,8 @@ int main (int argc, char const **argv)
}
std::size_t const total = std::accumulate (
target.elements.begin (),
target.elements.end (),
target.begin (),
target.end (),
0u,
[] (auto const &a, auto const &b) { return a + b.offset.second - b.offset.first; }
);