2019-04-23 08:17:34 +10:00
|
|
|
/*
|
|
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
*
|
2019-04-26 11:16:02 +10:00
|
|
|
* Copyright 2019 Danny Robson <danny@nerdcruft.net>
|
2019-04-23 08:17:34 +10:00
|
|
|
*/
|
|
|
|
|
2020-12-06 10:34:33 +11:00
|
|
|
#include "emory/chunk/find.hpp"
#include "emory/chunk/region.hpp"
#include "emory/chunk/params.hpp"
#include "emory/chunk/ostream.hpp"

#include <cruft/util/io.hpp>
#include <cruft/util/view.hpp>
#include <cruft/util/parse/value.hpp>
#include <cruft/util/posix/except.hpp>

#include <fmt/format.h>
#include <fmt/compile.h>

#include <algorithm>
#include <compare>
#include <cstdint>
#include <cstdlib>
#include <deque>
#include <filesystem>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <system_error>
|
2019-04-23 08:17:34 +10:00
|
|
|
|
2019-04-23 08:55:16 +10:00
|
|
|
|
2019-04-26 11:16:02 +10:00
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
2020-12-29 09:36:21 +11:00
|
|
|
/// Provides _some_ consistent ordering for regions. The meaning isn't well
|
|
|
|
/// defined. The function is provided only so that we can identify duplicates.
|
2020-12-06 07:22:18 +11:00
|
|
|
static
|
|
|
|
std::strong_ordering
|
|
|
|
region_ordering (
|
|
|
|
emory::chunk::region const &a,
|
|
|
|
emory::chunk::region const &b
|
|
|
|
) {
|
|
|
|
if (auto const cmp = a.size () <=> b.size (); cmp != 0)
|
|
|
|
return cmp;
|
|
|
|
for (int i = 0; i < std::ssize (a.digest); ++i)
|
|
|
|
if (auto const cmp = a.digest[i] <=> b.digest[i]; cmp != 0)
|
|
|
|
return cmp;
|
|
|
|
return std::strong_ordering::equal;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2020-12-29 09:36:21 +11:00
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
/// Strict-weak-ordering predicate built on `region_ordering`: returns true
/// when `a` orders before `b`.
static bool region_less (
    emory::chunk::region const &a,
    emory::chunk::region const &b
) {
    return std::is_lt (region_ordering (a, b));
}
|
|
|
|
|
|
|
|
|
2020-12-29 09:36:21 +11:00
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
/// Equality predicate built on `region_ordering`: returns true when neither
/// region orders before the other.
static bool region_equal (
    emory::chunk::region const &a,
    emory::chunk::region const &b
) {
    return std::is_eq (region_ordering (a, b));
}
|
|
|
|
|
|
|
|
|
2020-12-06 12:01:41 +11:00
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
static void find_path_chunks (
|
2021-01-23 07:48:39 +11:00
|
|
|
std::deque<emory::chunk::region> &res,
|
2020-12-06 12:01:41 +11:00
|
|
|
std::filesystem::path const &src,
|
|
|
|
emory::chunk::params const &p
|
|
|
|
);
|
|
|
|
|
|
|
|
|
2020-12-29 09:36:21 +11:00
|
|
|
///----------------------------------------------------------------------------
|
|
|
|
/// Scan chunks in the path provided to a regular file.
|
2020-12-06 12:01:41 +11:00
|
|
|
static
|
|
|
|
void
|
|
|
|
find_regular_chunks (
|
2021-01-23 07:48:39 +11:00
|
|
|
std::deque<emory::chunk::region> &res,
|
2020-12-06 12:01:41 +11:00
|
|
|
std::filesystem::path const &src,
|
|
|
|
emory::chunk::params const &p
|
|
|
|
) {
|
|
|
|
try {
|
|
|
|
emory::chunk::find<emory::chunk::static_hash> (
|
|
|
|
std::back_inserter (res),
|
|
|
|
cruft::mapped_file (src),
|
|
|
|
p
|
|
|
|
);
|
|
|
|
} catch (cruft::posix::error &err) {
|
|
|
|
fmt::print (stderr, "skipping {}, error: {}\n", src.string (), err.what ());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2020-12-29 09:36:21 +11:00
|
|
|
///----------------------------------------------------------------------------
|
|
|
|
/// Scan chunks in the directory by recursing into all children.
|
2020-12-06 12:01:41 +11:00
|
|
|
static
|
|
|
|
void
|
|
|
|
find_directory_chunks (
|
2021-01-23 07:48:39 +11:00
|
|
|
std::deque<emory::chunk::region> &res,
|
2020-12-06 12:01:41 +11:00
|
|
|
std::filesystem::path const &src,
|
|
|
|
emory::chunk::params const &p
|
|
|
|
) {
|
|
|
|
fmt::print (stderr, "{}\n", src.string ());
|
|
|
|
for (auto const &child: std::filesystem::directory_iterator (src)) {
|
|
|
|
find_path_chunks (res, child, p);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2020-12-29 09:36:21 +11:00
|
|
|
///----------------------------------------------------------------------------
|
|
|
|
/// Scan chunks from a given path by dispatching to `find_foo_chunks` style
|
|
|
|
/// functions depending on the file type.
|
2020-12-06 12:01:41 +11:00
|
|
|
static void find_path_chunks (
|
2021-01-23 07:48:39 +11:00
|
|
|
std::deque<emory::chunk::region> &res,
|
2020-12-06 12:01:41 +11:00
|
|
|
std::filesystem::path const &src,
|
|
|
|
emory::chunk::params const &p
|
|
|
|
) {
|
|
|
|
switch (auto const type = status (src).type (); type) {
|
|
|
|
case std::filesystem::file_type::regular:
|
|
|
|
return find_regular_chunks (res, src, p);
|
|
|
|
|
|
|
|
case std::filesystem::file_type::directory:
|
|
|
|
return find_directory_chunks (res, src, p);
|
|
|
|
|
|
|
|
case std::filesystem::file_type::none:
|
|
|
|
case std::filesystem::file_type::not_found:
|
|
|
|
case std::filesystem::file_type::symlink:
|
|
|
|
case std::filesystem::file_type::block:
|
|
|
|
case std::filesystem::file_type::character:
|
|
|
|
case std::filesystem::file_type::fifo:
|
|
|
|
case std::filesystem::file_type::socket:
|
|
|
|
case std::filesystem::file_type::unknown:
|
|
|
|
fmt::print (stderr, "skipping path of unhandled type: '{}'\n", src.string ());
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
unreachable ();
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2020-12-29 09:36:21 +11:00
|
|
|
///----------------------------------------------------------------------------
|
2021-01-23 07:48:39 +11:00
|
|
|
/// Find all regions in a path and return a container of the regions.
|
2020-12-06 12:01:41 +11:00
|
|
|
static
|
2021-01-23 07:48:39 +11:00
|
|
|
std::deque<emory::chunk::region>
|
2020-12-06 12:01:41 +11:00
|
|
|
find_chunks (std::filesystem::path const &src, emory::chunk::params const &p)
|
|
|
|
{
|
2021-01-23 07:48:39 +11:00
|
|
|
std::deque<emory::chunk::region> res;
|
2020-12-06 12:01:41 +11:00
|
|
|
find_path_chunks (res, src, p);
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2020-12-06 08:14:05 +11:00
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
// Positions of the commandline arguments within `argv`.
enum {
    ARG_SELF,       // the program name

    ARGS_INPUT,     // required: path to scan
    ARGS_OUTPUT,    // required: path of the chunk listing to write

    ARG_BITS,       // optional chunking parameters, in positional order
    ARG_WINDOW,
    ARG_MINIMUM,
    ARG_MAXIMUM,

    NUM_ARGS,       // one past the last recognised argument

    // The program name plus both required paths must be present.
    NUM_ARGS_REQUIRED = ARGS_OUTPUT + 1,
};
|
|
|
|
|
|
|
|
|
2019-04-26 11:16:02 +10:00
|
|
|
//-----------------------------------------------------------------------------
|
2019-04-23 08:17:34 +10:00
|
|
|
int main (int argc, char const **argv)
|
|
|
|
{
|
2020-12-29 09:36:21 +11:00
|
|
|
// Extract commandline arguments
|
2020-12-06 09:42:01 +11:00
|
|
|
if (argc < NUM_ARGS_REQUIRED) {
|
2020-12-29 09:01:40 +11:00
|
|
|
std::cerr << "usage: " << argv[ARG_SELF] << " <input> <output> [bits] [window] [minimum] [maximum]\n"
|
2020-12-06 09:48:20 +11:00
|
|
|
<< "default bits = " << emory::chunk::DEFAULT_PARAMS.bits << '\n'
|
|
|
|
<< "default window = " << emory::chunk::DEFAULT_PARAMS.window << '\n'
|
2020-12-29 09:01:40 +11:00
|
|
|
<< "default minimum = " << emory::chunk::DEFAULT_PARAMS.minimum << '\n'
|
|
|
|
<< "default maximum = " << emory::chunk::DEFAULT_PARAMS.maximum << '\n';
|
2019-04-23 08:55:16 +10:00
|
|
|
return EXIT_FAILURE;
|
|
|
|
}
|
|
|
|
|
2020-12-06 09:48:20 +11:00
|
|
|
emory::chunk::params p = emory::chunk::DEFAULT_PARAMS;
|
2020-12-29 09:01:53 +11:00
|
|
|
if (argc > ARG_BITS)
|
2020-12-06 12:01:41 +11:00
|
|
|
p.bits = cruft::parse::from_string<std::size_t> (argv[ARG_BITS]);
|
2020-12-29 09:01:53 +11:00
|
|
|
if (argc > ARG_WINDOW)
|
2020-12-06 12:01:41 +11:00
|
|
|
p.window = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]);
|
2020-12-29 09:01:53 +11:00
|
|
|
if (argc > ARG_MINIMUM)
|
2020-12-06 09:48:20 +11:00
|
|
|
p.minimum = cruft::parse::from_string<std::size_t> (argv[ARG_MINIMUM]);
|
2020-12-29 09:01:53 +11:00
|
|
|
if (argc > ARG_MAXIMUM)
|
2020-12-29 09:01:40 +11:00
|
|
|
p.maximum = cruft::parse::from_string<std::size_t> (argv[ARG_MAXIMUM]);
|
2020-12-06 09:48:20 +11:00
|
|
|
|
|
|
|
std::cerr << p << '\n';
|
2020-12-06 09:42:01 +11:00
|
|
|
|
2020-12-29 09:36:21 +11:00
|
|
|
// Find all the chunks and prepare them for output
|
2020-12-06 07:22:52 +11:00
|
|
|
std::cout << "processing\n";
|
2021-01-23 07:48:39 +11:00
|
|
|
auto src = find_chunks (argv[ARGS_INPUT], p);
|
2020-12-06 07:22:52 +11:00
|
|
|
|
2020-12-06 12:01:41 +11:00
|
|
|
fmt::print ("analysing {} chunks\n", src.size ());
|
2020-12-29 09:36:38 +11:00
|
|
|
std::sort (src.begin (), src.end (), region_less);
|
2020-12-06 07:22:52 +11:00
|
|
|
|
2020-12-29 09:36:21 +11:00
|
|
|
// Write all chunks to the output file
|
2020-12-29 09:36:46 +11:00
|
|
|
std::ofstream output (argv[ARGS_OUTPUT], std::ios::out | std::ios::trunc);
|
|
|
|
output.exceptions (std::ios::badbit | std::ios::eofbit | std::ios::failbit);
|
2020-12-06 12:01:41 +11:00
|
|
|
output << "params: " << p << '\n';
|
|
|
|
for (auto const &chunk: src) {
|
|
|
|
output << chunk.size() << ' ';
|
|
|
|
for (auto const &c: chunk.digest)
|
|
|
|
output << std::hex << std::setw (2) << std::setfill ('0') << +c;
|
|
|
|
output << std::dec << '\n';
|
|
|
|
}
|
|
|
|
|
2020-12-29 09:36:21 +11:00
|
|
|
// Find the total and unique byte counts
|
2020-12-06 07:22:52 +11:00
|
|
|
auto const total_bytes = std::accumulate (
|
2020-12-06 10:34:33 +11:00
|
|
|
src.begin (),
|
|
|
|
src.end (),
|
2020-12-06 12:01:41 +11:00
|
|
|
std::uintmax_t (0),
|
2020-12-06 07:22:52 +11:00
|
|
|
[] (auto const accum, auto const rhs)
|
|
|
|
{
|
|
|
|
return accum + rhs.size ();
|
|
|
|
});
|
|
|
|
|
2020-12-29 09:37:32 +11:00
|
|
|
// WARNING: this is destructive, but suits our purposes for the moment as
|
|
|
|
// we intend to remove the reliance on in memory storage as much as we can.
|
|
|
|
auto const init_size = src.size ();
|
|
|
|
src.erase(
|
|
|
|
std::unique (
|
|
|
|
src.begin (),
|
|
|
|
src.end (),
|
|
|
|
region_equal
|
|
|
|
),
|
|
|
|
src.end ()
|
|
|
|
);
|
2020-12-06 07:22:52 +11:00
|
|
|
|
2020-12-29 09:37:32 +11:00
|
|
|
auto const unique_bytes = std::accumulate (
|
2020-12-06 10:34:33 +11:00
|
|
|
src.begin (),
|
|
|
|
src.end (),
|
2020-12-29 09:37:32 +11:00
|
|
|
0, [] (auto const &accum, auto const &rhs) { return accum + rhs.size (); }
|
2020-12-06 07:22:52 +11:00
|
|
|
);
|
|
|
|
|
|
|
|
auto const duplicated_bytes = total_bytes - unique_bytes;
|
2020-12-06 10:07:27 +11:00
|
|
|
float const duplicated_fraction = float (duplicated_bytes) / total_bytes;
|
|
|
|
|
|
|
|
fmt::print (
|
|
|
|
"{} duplicated bytes of {} ({:.2f}%)\n",
|
|
|
|
duplicated_bytes,
|
|
|
|
total_bytes,
|
|
|
|
100.f * duplicated_fraction
|
|
|
|
);
|
2020-12-06 07:22:52 +11:00
|
|
|
|
2020-12-29 09:37:32 +11:00
|
|
|
fmt::print ("{} duplicates\n", init_size - src.size ());
|
2019-04-23 08:17:34 +10:00
|
|
|
}
|