emory: compute possible size reduction from deduplication

This commit is contained in:
Danny Robson 2019-04-23 08:55:16 +10:00
parent eedc8d8e5c
commit f6840ba6c1
2 changed files with 98 additions and 23 deletions

@ -1 +1 @@
Subproject commit e1d56395f6b16cb70f8dbb942a117dc01fd7e74c Subproject commit cc9b9b19c1a9a7002248d7f62053d49002a8b297

107
emory.cpp
View File

@ -8,33 +8,108 @@
#include <cruft/util/hash/buzhash.hpp> #include <cruft/util/hash/buzhash.hpp>
#include <cruft/util/io.hpp> #include <cruft/util/io.hpp>
#include <cruft/util/parse/value.hpp>
#include <cruft/crypto/hash/sha1.hpp>
#include <algorithm>
#include <iostream> #include <iostream>
// One content-defined chunk of the input, paired with its SHA1 digest.
//
// main() fills a vector of these, sorts it by `digest`, and then counts runs
// of equal digests to estimate the possible deduplication saving.
struct chunk {
// The bytes that make up this chunk. main() builds it from pointers derived
// from the mapped input file, so it presumably does not own the bytes and
// must not outlive that mapping -- TODO confirm against cruft::view.
cruft::view<u08 const*> data;
// SHA1 of `data`; two chunks are treated as duplicates when these compare equal.
cruft::crypto::hash::SHA1::digest_t digest;
};
/// Binary predicate that treats two chunks as equal when their SHA1 digests
/// match. The chunk contents themselves are not compared.
///
/// The call operator is const-qualified because the predicate holds no state;
/// this lets it be invoked through a const object or reference as well as
/// through the by-value copies the standard algorithms take.
struct digest_equality {
    bool operator() (chunk const &a, chunk const &b) const
    {
        return a.digest == b.digest;
    }
};
// Positions of the command line arguments within argv.
//
// The enumerators are consecutive from zero so that each one can be used
// directly as an argv index, and NUM_ARGS is the argc value main() requires.
enum {
    ARG_SELF = 0,   // path the program was invoked as
    ARG_BITS,       // value parsed as the boundary mask width, in bits
    ARG_WINDOW,     // value parsed as the rolling hash window
    ARGS_INPUT,     // path of the file to map and analyse

    NUM_ARGS,       // count of the entries above; not an argv index
};
int main (int argc, char const **argv) int main (int argc, char const **argv)
{ {
(void)argc; if (argc != NUM_ARGS) {
std::cerr << "usage: " << argv[ARG_SELF] << " <bits> <window> <input>\n";
return EXIT_FAILURE;
}
cruft::mapped_file src (argv[1]); auto const window = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]);
auto const bits = cruft::parse::from_string<std::size_t> (argv[ARG_BITS ]);
cruft::mapped_file src (argv[ARGS_INPUT]);
cruft::view bytes (src); cruft::view bytes (src);
static constexpr std::size_t BITS = 16; using hash_type = cruft::hash::buzhash<u64>;
std::vector<std::size_t> counts (BITS, 0); hash_type h (window, bytes);
bytes = bytes.consume (window);
static constexpr std::size_t WINDOW = 48; using digest_type = hash_type::digest_type ;
cruft::hash::buzhash<u64> h (WINDOW, bytes); digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - bits);
for (auto const &val: bytes.consume (WINDOW)) { std::vector<chunk> chunks;
auto const res = h (&val);
std::size_t mask = ~u64(0) >> (64 - BITS); for (u08 const *cursor = bytes.begin (), *start = src.data (); cursor != bytes.end (); cursor++) {
for (std::size_t i = 0; i < BITS; ++i) { auto const res = h (cursor);
if ((res & mask) == 0)
counts[i]++; if (res & mask)
mask >>= 1; continue;
}
cruft::view<u08 const*> const region { start, cursor };
start = cursor + 1;
chunks.push_back ({
.data = region,
.digest = cruft::crypto::hash::SHA1 {} (region),
});
} }
for (auto const &i: counts) std::sort (
std::cout << i << '\n'; std::begin (chunks),
std::end (chunks),
[] (auto const &a, auto const &b)
{
return a.digest < b.digest;
});
std::size_t reduction = 0;
for (auto cursor = chunks.begin (); cursor != chunks.end (); ++cursor) {
auto first_match = std::adjacent_find (
cursor, chunks.end (),
digest_equality {}
);
if (first_match == chunks.end ())
break;
auto last_match = std::find_if_not (
first_match,
chunks.end (),
[&first_match] (auto const &i)
{
return i.digest == first_match->digest;
});
auto const count = std::distance (first_match, last_match);
auto const size = first_match->data.size ();
std::cout << count << " duplicates of " << size << " bytes\n";
reduction += (count - 1) * size;
cursor = last_match;
}
std::cout << chunks.size () << " chunks found\n";
std::cout << float (reduction) / src.size () << " reduction\n";
} }