2019-04-23 08:17:34 +10:00
|
|
|
/*
|
|
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
*
|
|
|
|
* Copyright 2013 Danny Robson <danny@nerdcruft.net>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <cruft/util/hash/buzhash.hpp>
|
|
|
|
#include <cruft/util/io.hpp>
|
2019-04-23 08:55:16 +10:00
|
|
|
#include <cruft/util/parse/value.hpp>
|
2019-04-23 08:17:34 +10:00
|
|
|
|
2019-04-23 08:55:16 +10:00
|
|
|
#include <cruft/crypto/hash/sha1.hpp>
|
|
|
|
|
|
|
|
#include <algorithm>
|
2019-04-23 08:17:34 +10:00
|
|
|
#include <iostream>
|
|
|
|
|
2019-04-23 08:55:16 +10:00
|
|
|
|
|
|
|
struct chunk {
|
|
|
|
cruft::view<u08 const*> data;
|
|
|
|
cruft::crypto::hash::SHA1::digest_t digest;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct digest_equality {
|
|
|
|
bool operator() (chunk const &a, chunk const &b)
|
|
|
|
{
|
|
|
|
return a.digest == b.digest;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/// Positional indices into argv, in the same order as the usage message
/// printed by main: <bits> <window> <minimum> <input>.
enum {
    ARG_SELF     = 0,   // the program name
    ARG_BITS     = 1,   // number of hash bits tested for a chunk boundary
    ARG_WINDOW   = 2,   // size of the hash window
    ARGS_MINIMUM = 3,   // minimum distance before a boundary is accepted
    ARGS_INPUT   = 4,   // path of the file to analyse

    NUM_ARGS     = 5,   // expected value of argc
};
|
|
|
|
|
|
|
|
|
2019-04-23 08:17:34 +10:00
|
|
|
int main (int argc, char const **argv)
|
|
|
|
{
|
2019-04-23 08:55:16 +10:00
|
|
|
if (argc != NUM_ARGS) {
|
2019-04-23 09:35:45 +10:00
|
|
|
std::cerr << "usage: " << argv[ARG_SELF] << " <bits> <window> <minimum> <input>\n";
|
2019-04-23 08:55:16 +10:00
|
|
|
return EXIT_FAILURE;
|
|
|
|
}
|
|
|
|
|
|
|
|
auto const window = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]);
|
|
|
|
auto const bits = cruft::parse::from_string<std::size_t> (argv[ARG_BITS ]);
|
2019-04-23 09:35:45 +10:00
|
|
|
auto const minimum_size = cruft::parse::from_string<std::ptrdiff_t> (argv[ARGS_MINIMUM]);
|
2019-04-23 08:17:34 +10:00
|
|
|
|
2019-04-23 08:55:16 +10:00
|
|
|
cruft::mapped_file src (argv[ARGS_INPUT]);
|
2019-04-23 08:17:34 +10:00
|
|
|
cruft::view bytes (src);
|
|
|
|
|
2019-04-23 08:55:16 +10:00
|
|
|
using hash_type = cruft::hash::buzhash<u64>;
|
|
|
|
hash_type h (window, bytes);
|
|
|
|
bytes = bytes.consume (window);
|
|
|
|
|
|
|
|
using digest_type = hash_type::digest_type ;
|
|
|
|
digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - bits);
|
|
|
|
|
|
|
|
std::vector<chunk> chunks;
|
|
|
|
|
|
|
|
for (u08 const *cursor = bytes.begin (), *start = src.data (); cursor != bytes.end (); cursor++) {
|
|
|
|
auto const res = h (cursor);
|
|
|
|
|
2019-04-23 09:35:45 +10:00
|
|
|
if (std::distance (start, cursor) < minimum_size)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (unlikely (res & mask))
|
2019-04-23 08:55:16 +10:00
|
|
|
continue;
|
|
|
|
|
|
|
|
cruft::view<u08 const*> const region { start, cursor };
|
|
|
|
start = cursor + 1;
|
|
|
|
|
|
|
|
chunks.push_back ({
|
|
|
|
.data = region,
|
|
|
|
.digest = cruft::crypto::hash::SHA1 {} (region),
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
std::sort (
|
|
|
|
std::begin (chunks),
|
|
|
|
std::end (chunks),
|
|
|
|
[] (auto const &a, auto const &b)
|
|
|
|
{
|
|
|
|
return a.digest < b.digest;
|
|
|
|
});
|
|
|
|
|
|
|
|
std::size_t reduction = 0;
|
|
|
|
for (auto cursor = chunks.begin (); cursor != chunks.end (); ++cursor) {
|
|
|
|
auto first_match = std::adjacent_find (
|
|
|
|
cursor, chunks.end (),
|
|
|
|
digest_equality {}
|
|
|
|
);
|
|
|
|
|
|
|
|
if (first_match == chunks.end ())
|
|
|
|
break;
|
2019-04-23 08:17:34 +10:00
|
|
|
|
2019-04-23 08:55:16 +10:00
|
|
|
auto last_match = std::find_if_not (
|
|
|
|
first_match,
|
|
|
|
chunks.end (),
|
|
|
|
[&first_match] (auto const &i)
|
|
|
|
{
|
|
|
|
return i.digest == first_match->digest;
|
|
|
|
});
|
2019-04-23 08:17:34 +10:00
|
|
|
|
2019-04-23 08:55:16 +10:00
|
|
|
auto const count = std::distance (first_match, last_match);
|
|
|
|
auto const size = first_match->data.size ();
|
|
|
|
std::cout << count << " duplicates of " << size << " bytes\n";
|
|
|
|
reduction += (count - 1) * size;
|
2019-04-23 08:17:34 +10:00
|
|
|
|
2019-04-23 08:55:16 +10:00
|
|
|
cursor = last_match;
|
2019-04-23 08:17:34 +10:00
|
|
|
}
|
|
|
|
|
2019-04-23 08:55:16 +10:00
|
|
|
std::cout << chunks.size () << " chunks found\n";
|
|
|
|
std::cout << float (reduction) / src.size () << " reduction\n";
|
2019-04-23 08:17:34 +10:00
|
|
|
}
|