2019-04-23 21:34:01 +10:00
|
|
|
/*
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Copyright 2013 Danny Robson <danny@nerdcruft.net>
 */
|
|
|
|
|
2019-04-26 11:10:07 +10:00
|
|
|
#include "emory/chunk/params.hpp"
#include "emory/chunk/find.hpp"
#include "emory/chunk/region.hpp"
#include "emory/chunk/match.hpp"
#include "emory/chunk/ostream.hpp"

#include <cruft/util/io.hpp>
#include <cruft/util/parse/value.hpp>

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
|
|
|
|
|
|
|
|
|
|
|
|
/// Positional indices into `argv`, in the order the arguments appear on the
/// command line. `ARGS_SOURCE` is the index of the first source path; any
/// further arguments are additional sources. `NUM_ARGS` is the minimum
/// acceptable `argc`.
enum {
    ARG_SELF     = 0,   // program name
    ARG_BITS     = 1,   // <bits>
    ARG_WINDOW   = 2,   // <window>
    ARGS_MINIMUM = 3,   // <minimum>
    ARGS_MAXIMUM = 4,   // <maximum>
    ARGS_TARGET  = 5,   // <target>
    ARGS_SOURCE  = 6,   // first <source>

    NUM_ARGS     = 7,   // one past the first source index
};
|
|
|
|
|
|
|
|
|
|
|
|
int main (int argc, char const **argv)
|
|
|
|
{
|
2019-04-26 09:08:03 +10:00
|
|
|
if (argc < NUM_ARGS) {
|
2020-12-29 09:01:40 +11:00
|
|
|
std::cerr << "usage: " << argv[ARG_SELF] << " <bits> <window> <minimum> <maximum> <target> <source> [...]\n";
|
2019-04-23 21:34:01 +10:00
|
|
|
return EXIT_FAILURE;
|
|
|
|
}
|
|
|
|
|
2019-04-26 11:10:07 +10:00
|
|
|
emory::chunk::params const p {
|
2020-12-29 09:01:40 +11:00
|
|
|
.bits = cruft::parse::from_string<std::size_t> (argv[ARG_BITS ]),
|
|
|
|
.window = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]),
|
2019-04-23 21:34:01 +10:00
|
|
|
.minimum = cruft::parse::from_string<std::ptrdiff_t> (argv[ARGS_MINIMUM]),
|
2020-12-29 09:01:40 +11:00
|
|
|
.maximum = cruft::parse::from_string<std::ptrdiff_t> (argv[ARGS_MAXIMUM]),
|
2019-04-23 21:34:01 +10:00
|
|
|
};
|
|
|
|
|
|
|
|
std::clog << "Hashing target\n";
|
2020-12-06 10:34:33 +11:00
|
|
|
std::vector<emory::chunk::region> target;
|
|
|
|
emory::chunk::find<emory::chunk::static_hash> (
|
|
|
|
std::back_inserter (target),
|
|
|
|
cruft::mapped_file (argv[ARGS_TARGET]), p
|
|
|
|
);
|
|
|
|
|
|
|
|
std::sort (target.begin (), target.end (), emory::chunk::region::digest_ordering {});
|
|
|
|
std::cout << "Found " << target.size () << " chunks\n";
|
2019-04-26 09:08:03 +10:00
|
|
|
|
2019-04-26 11:10:07 +10:00
|
|
|
std::vector<emory::chunk::match<int>> found;
|
2019-04-26 09:08:03 +10:00
|
|
|
|
|
|
|
for (int i = ARGS_SOURCE; i != argc; ++i) {
|
|
|
|
auto const path = argv[i];
|
|
|
|
std::clog << "Hashing source: " << path << '\n';
|
2020-12-06 10:34:33 +11:00
|
|
|
std::vector<emory::chunk::region> source;
|
|
|
|
emory::chunk::find<emory::chunk::static_hash> (
|
|
|
|
std::back_inserter (source),
|
|
|
|
cruft::mapped_file (path), p
|
|
|
|
);
|
|
|
|
std::sort (source.begin (), source.end (), emory::chunk::region::digest_ordering {});
|
2019-04-26 09:08:03 +10:00
|
|
|
|
|
|
|
std::clog << "Finding common\n";
|
|
|
|
auto const &source_matches = common (target, source);
|
|
|
|
std::clog << "Discovered " << source_matches.size () << " blocks\n";
|
|
|
|
|
|
|
|
std::transform (
|
|
|
|
std::begin (source_matches),
|
|
|
|
std::end (source_matches),
|
|
|
|
std::back_inserter (found),
|
2019-04-26 11:10:07 +10:00
|
|
|
[&] (auto const &j) -> emory::chunk::match<int>
|
2019-04-26 09:25:41 +10:00
|
|
|
{
|
|
|
|
return {
|
|
|
|
.src = { i, j.src.data },
|
|
|
|
.dst = { ARGS_TARGET, j.dst.data },
|
|
|
|
};
|
|
|
|
});
|
|
|
|
|
|
|
|
std::sort (
|
|
|
|
std::begin (found),
|
|
|
|
std::end (found),
|
2019-04-26 11:10:07 +10:00
|
|
|
emory::chunk::match<int>::src_ordering {}
|
2019-04-26 09:08:03 +10:00
|
|
|
);
|
|
|
|
|
|
|
|
found.erase (
|
2019-04-26 09:25:41 +10:00
|
|
|
std::unique (
|
|
|
|
std::begin (found),
|
|
|
|
std::end (found),
|
2019-04-26 11:10:07 +10:00
|
|
|
emory::chunk::match<int>::dst_equality {}
|
2019-04-26 09:25:41 +10:00
|
|
|
),
|
2019-04-26 09:08:03 +10:00
|
|
|
found.end ()
|
|
|
|
);
|
|
|
|
}
|
2019-04-23 21:34:01 +10:00
|
|
|
|
2019-04-26 09:08:03 +10:00
|
|
|
std::clog << "Finalising\n";
|
2019-04-24 07:37:10 +10:00
|
|
|
std::size_t matching = 0;
|
2019-04-26 09:25:41 +10:00
|
|
|
std::vector<std::size_t> source_bytes (argc - ARGS_SOURCE, 0);
|
2019-04-23 21:34:01 +10:00
|
|
|
for (auto const &i: found) {
|
2019-04-26 09:45:43 +10:00
|
|
|
std::cout << i << '\n';
|
2019-04-26 09:25:41 +10:00
|
|
|
auto const size = i.dst.data.offset.second - i.dst.data.offset.first;
|
|
|
|
matching += size;
|
|
|
|
source_bytes[i.src.id - ARGS_SOURCE] += size;
|
2019-04-23 21:34:01 +10:00
|
|
|
}
|
|
|
|
|
2019-04-24 07:37:10 +10:00
|
|
|
std::size_t const total = std::accumulate (
|
2020-12-06 10:34:33 +11:00
|
|
|
target.begin (),
|
|
|
|
target.end (),
|
2019-04-24 07:37:10 +10:00
|
|
|
0u,
|
|
|
|
[] (auto const &a, auto const &b) { return a + b.offset.second - b.offset.first; }
|
|
|
|
);
|
|
|
|
|
2019-04-24 07:43:43 +10:00
|
|
|
std::cout << "Found " << found.size () << " chunks. " << matching << "/" << total << " bytes for a factor of " << float (matching) / total << "\n";
|
2019-04-26 09:25:41 +10:00
|
|
|
|
|
|
|
for (int i = ARGS_SOURCE; i != argc; ++i)
|
|
|
|
std::cerr << argv[i] << " contributed: " << source_bytes[i - ARGS_SOURCE] << '\n';
|
2019-04-23 21:34:01 +10:00
|
|
|
}
|