tools/analyse: use the common API

This commit is contained in:
Danny Robson 2019-04-26 11:16:02 +10:00
parent 54c466f5e5
commit 6ca1fe1670
2 changed files with 18 additions and 86 deletions

View File

@ -20,6 +20,9 @@
namespace emory::chunk { namespace emory::chunk {
struct map { struct map {
map (cruft::view<u08 const*> data, params const&); map (cruft::view<u08 const*> data, params const&);
std::size_t size (void) const noexcept { return elements.size (); }
std::vector<region> elements; std::vector<region> elements;
}; };
} }

View File

@ -3,32 +3,20 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this * License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. * file, You can obtain one at http://mozilla.org/MPL/2.0/.
* *
* Copyright 2013 Danny Robson <danny@nerdcruft.net> * Copyright 2019 Danny Robson <danny@nerdcruft.net>
*/ */
#include <cruft/util/hash/buzhash.hpp> #include "emory/chunk/map.hpp"
#include "emory/chunk/params.hpp"
#include <cruft/util/io.hpp> #include <cruft/util/io.hpp>
#include <cruft/util/view.hpp>
#include <cruft/util/parse/value.hpp> #include <cruft/util/parse/value.hpp>
#include <cruft/crypto/hash/sha1.hpp>
#include <algorithm>
#include <iostream> #include <iostream>
struct chunk { ///////////////////////////////////////////////////////////////////////////////
cruft::view<u08 const*> data;
cruft::crypto::hash::SHA1::digest_t digest;
};
struct digest_equality {
bool operator() (chunk const &a, chunk const &b)
{
return a.digest == b.digest;
}
};
enum { enum {
ARG_SELF, ARG_SELF,
ARG_BITS, ARG_BITS,
@ -40,6 +28,7 @@ enum {
}; };
//-----------------------------------------------------------------------------
int main (int argc, char const **argv) int main (int argc, char const **argv)
{ {
if (argc != NUM_ARGS) { if (argc != NUM_ARGS) {
@ -47,74 +36,14 @@ int main (int argc, char const **argv)
return EXIT_FAILURE; return EXIT_FAILURE;
} }
auto const window = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]); emory::chunk::params const p {
auto const bits = cruft::parse::from_string<std::size_t> (argv[ARG_BITS ]); .bits = cruft::parse::from_string<std::size_t> (argv[ARG_BITS ]),
auto const minimum_size = cruft::parse::from_string<std::ptrdiff_t> (argv[ARGS_MINIMUM]); .window = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]),
.minimum = cruft::parse::from_string<std::ptrdiff_t> (argv[ARGS_MINIMUM]),
};
cruft::mapped_file src (argv[ARGS_INPUT]); cruft::mapped_file data (argv[ARGS_INPUT]);
cruft::view bytes (src); emory::chunk::map src (data, p);
using hash_type = cruft::hash::buzhash<u64>; std::cout << src.size () << " chunks\n";
hash_type h (window, bytes);
bytes = bytes.consume (window);
using digest_type = hash_type::digest_type ;
digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - bits);
std::vector<chunk> chunks;
for (u08 const *cursor = bytes.begin (), *start = src.data (); cursor != bytes.end (); cursor++) {
auto const res = h (cursor);
if (std::distance (start, cursor) < minimum_size)
continue;
if (unlikely (res & mask))
continue;
cruft::view<u08 const*> const region { start, cursor };
start = cursor + 1;
chunks.push_back ({
.data = region,
.digest = cruft::crypto::hash::SHA1 {} (region),
});
}
std::sort (
std::begin (chunks),
std::end (chunks),
[] (auto const &a, auto const &b)
{
return a.digest < b.digest;
});
std::size_t reduction = 0;
for (auto cursor = chunks.begin (); cursor != chunks.end (); ++cursor) {
auto first_match = std::adjacent_find (
cursor, chunks.end (),
digest_equality {}
);
if (first_match == chunks.end ())
break;
auto last_match = std::find_if_not (
first_match,
chunks.end (),
[&first_match] (auto const &i)
{
return i.digest == first_match->digest;
});
auto const count = std::distance (first_match, last_match);
auto const size = first_match->data.size ();
std::cout << count << " duplicates of " << size << " bytes\n";
reduction += (count - 1) * size;
cursor = last_match;
}
std::cout << chunks.size () << " chunks found\n";
std::cout << float (reduction) / src.size () << " reduction\n";
} }