170 lines
4.8 KiB
C++
170 lines
4.8 KiB
C++
/*
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
*
|
|
* Copyright 2019 Danny Robson <danny@nerdcruft.net>
|
|
*/
|
|
|
|
#include "emory/chunk/map.hpp"
#include "emory/chunk/params.hpp"

#include <cruft/util/io.hpp>
#include <cruft/util/view.hpp>
#include <cruft/util/parse/value.hpp>

#include <algorithm>
#include <compare>
#include <cstdlib>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
static
|
|
std::strong_ordering
|
|
region_ordering (
|
|
emory::chunk::region const &a,
|
|
emory::chunk::region const &b
|
|
) {
|
|
if (auto const cmp = a.size () <=> b.size (); cmp != 0)
|
|
return cmp;
|
|
for (int i = 0; i < std::ssize (a.digest); ++i)
|
|
if (auto const cmp = a.digest[i] <=> b.digest[i]; cmp != 0)
|
|
return cmp;
|
|
return std::strong_ordering::equal;
|
|
}
|
|
|
|
|
|
/// Returns true when `region_ordering` ranks `a` before `b`.
///
/// Used as the comparator when sorting the chunk list prior to deduplication.
static bool
region_less (emory::chunk::region const &a, emory::chunk::region const &b)
{
    return std::is_lt (region_ordering (a, b));
}
|
|
|
|
|
|
/// Returns true when `region_ordering` considers `a` and `b` equivalent.
///
/// Used as the equality predicate when collapsing adjacent duplicate chunks.
static bool
region_equal (emory::chunk::region const &a, emory::chunk::region const &b)
{
    return std::is_eq (region_ordering (a, b));
}
|
|
|
|
|
|
//static bool overlap (emory::chunk::region const &a, emory::chunk::region const &b)
|
|
//{
|
|
// return a.offset.first < b.offset.second &&
|
|
// b.offset.first < a.offset.second;
|
|
//}
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
/// Indices of the positional command line arguments within `argv`, in the
/// order shown by the usage message:
///     <self> <input> <output> [bits] [window] [minimum]
enum {
    // The program name; echoed in the usage message.
    ARG_SELF          = 0,

    // Required arguments.
    ARGS_INPUT        = 1,
    ARGS_OUTPUT       = 2,

    // Optional arguments; each has a matching DEFAULT_* fallback.
    ARG_BITS          = 3,
    ARG_WINDOW        = 4,
    ARG_MINIMUM       = 5,

    // One past the last argument index.
    NUM_ARGS          = 6,

    // Minimum `argc` accepted by main: self, input, and output.
    NUM_ARGS_REQUIRED = 3,
};
|
|
|
|
|
|
// Fallback values for the optional [bits], [window], and [minimum] command
// line arguments. They are printed by the usage message and used whenever the
// corresponding argument is not supplied.
static constexpr std::size_t    DEFAULT_BITS    { 12 };
static constexpr std::size_t    DEFAULT_WINDOW  { 4 };
static constexpr std::ptrdiff_t DEFAULT_MINIMUM { 512 };
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
int main (int argc, char const **argv)
|
|
{
|
|
if (argc < NUM_ARGS_REQUIRED) {
|
|
std::cerr << "usage: " << argv[ARG_SELF] << " <input> <output> [bits] [window] [minimum]\n"
|
|
<< "default bits = " << DEFAULT_BITS << '\n'
|
|
<< "default window = " << DEFAULT_WINDOW << '\n'
|
|
<< "default minimum = " << DEFAULT_MINIMUM << '\n';
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
#define PARSE_DEFAULT(NAME, TYPE) \
|
|
argc > ARG_##NAME \
|
|
? cruft::parse::from_string<TYPE> (argv[ARG_##NAME]) \
|
|
: DEFAULT_##NAME
|
|
|
|
emory::chunk::params const p {
|
|
.bits = PARSE_DEFAULT (BITS, std::size_t),
|
|
.window = PARSE_DEFAULT (WINDOW, std::size_t),
|
|
.minimum = PARSE_DEFAULT (MINIMUM, std::ptrdiff_t),
|
|
};
|
|
|
|
cruft::mapped_file data (argv[ARGS_INPUT]);
|
|
std::cout << "size: " << data.size () << '\n';
|
|
|
|
std::cout << "processing\n";
|
|
emory::chunk::map src (data, p);
|
|
|
|
std::cout << "validating\n";
|
|
std::cout << src.size () << " chunks\n";
|
|
std::sort (
|
|
src.elements.begin (),
|
|
src.elements.end (),
|
|
[] (auto const &a, auto const &b) { return a.offset.first < b.offset.first; }
|
|
);
|
|
for (int i = 0, cursor = 0; i < std::ssize (src.elements); ++i) {
|
|
if (src.elements[i].offset.first != cursor) {
|
|
std::cout << "non-overlapping chunks\n";
|
|
return -1;
|
|
}
|
|
cursor = src.elements[i].offset.second;
|
|
}
|
|
|
|
if (src.elements.back ().offset.second != std::ssize (data)) {
|
|
std::cout << "invalid total size\n";
|
|
return -1;
|
|
}
|
|
|
|
std::sort (
|
|
src.elements.begin (),
|
|
src.elements.end (),
|
|
region_less
|
|
);
|
|
|
|
std::vector<off64_t> sizes;
|
|
std::transform (
|
|
src.elements.begin (),
|
|
src.elements.end (),
|
|
std::back_inserter (sizes),
|
|
[] (auto const &val) { return val.size (); }
|
|
);
|
|
auto const myaccum = std::accumulate (std::begin (sizes), std::end (sizes), 0);
|
|
std::cout << myaccum << '\n';
|
|
|
|
auto const total_bytes = std::accumulate (
|
|
src.elements.begin (),
|
|
src.elements.end (),
|
|
0,
|
|
[] (auto const accum, auto const rhs)
|
|
{
|
|
return accum + rhs.size ();
|
|
});
|
|
|
|
|
|
std::vector<emory::chunk::region> unique;
|
|
std::unique_copy (
|
|
src.elements.begin (),
|
|
src.elements.end (),
|
|
std::back_inserter (unique),
|
|
region_equal
|
|
);
|
|
auto const unique_bytes = std::accumulate (
|
|
unique.begin (),
|
|
unique.end (),
|
|
0, [] (auto const accum, auto const rhs) { return accum + rhs.size (); }
|
|
);
|
|
|
|
auto const duplicated_bytes = total_bytes - unique_bytes;
|
|
float const duplicated_fraction = float (duplicated_bytes) / total_bytes;
|
|
|
|
std::cout << duplicated_bytes << " duplicated bytes of " << total_bytes << " (" << duplicated_fraction << "%)\n";
|
|
std::cout << (src.elements.size () - unique.size ()) << " duplicates\n";
|
|
}
|