emory/tools/analyse.cpp

255 lines
7.3 KiB
C++
Raw Normal View History

2019-04-23 08:17:34 +10:00
/*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
2019-04-26 11:16:02 +10:00
* Copyright 2019 Danny Robson <danny@nerdcruft.net>
2019-04-23 08:17:34 +10:00
*/
#include "emory/chunk/find.hpp"
#include "emory/chunk/region.hpp"
#include "emory/chunk/params.hpp"
#include "emory/chunk/ostream.hpp"

#include <cruft/util/io.hpp>
#include <cruft/util/view.hpp>
#include <cruft/util/parse/value.hpp>
#include <cruft/util/posix/except.hpp>

#include <fmt/format.h>
#include <fmt/compile.h>

#include <algorithm>
#include <compare>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
///////////////////////////////////////////////////////////////////////////////
2020-12-06 07:22:18 +11:00
/// Three-way comparison of two regions.
///
/// Regions are ordered by size first; regions of equal size are then ordered
/// by their digests, compared element by element over the extent of
/// `a.digest`.
///
/// Returns the first comparison that is not `equal`, or
/// `std::strong_ordering::equal` if no difference was found.
static
std::strong_ordering
region_ordering (
    emory::chunk::region const &a,
    emory::chunk::region const &b
) {
    // Primary key: the region size.
    auto const by_size = a.size () <=> b.size ();
    if (by_size != 0)
        return by_size;

    // Secondary key: the digest, walked front to back.
    auto const count = std::ssize (a.digest);
    for (int idx = 0; idx < count; ++idx) {
        auto const by_element = a.digest[idx] <=> b.digest[idx];
        if (by_element != 0)
            return by_element;
    }

    return std::strong_ordering::equal;
}
/// Returns true when `a` orders strictly before `b` under region_ordering.
static bool region_less (emory::chunk::region const &a, emory::chunk::region const &b)
{
    return std::is_lt (region_ordering (a, b));
}
/// Returns true when `a` and `b` compare equal under region_ordering.
static bool region_equal (emory::chunk::region const &a, emory::chunk::region const &b)
{
    return std::is_eq (region_ordering (a, b));
}
//static bool overlap (emory::chunk::region const &a, emory::chunk::region const &b)
//{
// return a.offset.first < b.offset.second &&
// b.offset.first < a.offset.second;
//}
///////////////////////////////////////////////////////////////////////////////
// Forward declaration. find_directory_chunks calls find_path_chunks for each
// directory entry, and the definition of find_path_chunks in turn dispatches
// to find_directory_chunks, so the declaration has to precede both.
static void find_path_chunks (
std::vector<emory::chunk::region> &res,
std::filesystem::path const &src,
emory::chunk::params const &p
);
//-----------------------------------------------------------------------------
/// Chunk a single regular file and append the regions found to `res`.
///
/// \param res  destination vector; results are appended through a
///             std::back_inserter
/// \param src  path of the file to chunk; it is handed to cruft::mapped_file
/// \param p    chunking parameters forwarded to emory::chunk::find
///
/// A cruft::posix::error thrown along the way is reported on stderr and the
/// file is skipped. Any other exception propagates to the caller.
static
void
find_regular_chunks (
    std::vector<emory::chunk::region> &res,
    std::filesystem::path const &src,
    emory::chunk::params const &p
) {
    try {
        emory::chunk::find<emory::chunk::static_hash> (
            std::back_inserter (res),
            cruft::mapped_file (src),
            p
        );
    } catch (cruft::posix::error const &err) {
        // Best effort: one unreadable file should not abort a whole
        // directory walk.
        // NOTE(review): regions appended before the error are not rolled
        // back -- confirm whether find can fail part-way through a file.
        fmt::print (stderr, "skipping {}, error: {}\n", src.string (), err.what ());
    }
}
//-----------------------------------------------------------------------------
/// Chunk everything directly inside the directory `src`, appending the
/// regions found to `res`.
///
/// Each directory entry is handed to find_path_chunks, which recurses into
/// sub-directories. The directory path is echoed on stderr before its
/// entries are visited.
static
void
find_directory_chunks (
    std::vector<emory::chunk::region> &res,
    std::filesystem::path const &src,
    emory::chunk::params const &p
) {
    fmt::print (stderr, "{}\n", src.string ());

    std::filesystem::directory_iterator const last;
    for (std::filesystem::directory_iterator cursor (src); cursor != last; ++cursor)
        find_path_chunks (res, cursor->path (), p);
}
//-----------------------------------------------------------------------------
/// Dispatch on the filesystem type of `src`.
///
/// Regular files go to find_regular_chunks and directories go to
/// find_directory_chunks; both append to `res`. Every other file type is
/// reported on stderr and skipped.
static void find_path_chunks (
    std::vector<emory::chunk::region> &res,
    std::filesystem::path const &src,
    emory::chunk::params const &p
) {
    using file_type = std::filesystem::file_type;

    switch (status (src).type ()) {
    case file_type::regular:
        find_regular_chunks (res, src, p);
        return;

    case file_type::directory:
        find_directory_chunks (res, src, p);
        return;

    // Everything else is listed explicitly rather than via `default`, so
    // each enumerator is handled by name.
    case file_type::none:
    case file_type::not_found:
    case file_type::symlink:
    case file_type::block:
    case file_type::character:
    case file_type::fifo:
    case file_type::socket:
    case file_type::unknown:
        fmt::print (stderr, "skipping path of unhandled type: '{}'\n", src.string ());
        return;
    }

    // Only reached if the type holds a value outside the enumerators above.
    unreachable ();
}
//-----------------------------------------------------------------------------
/// Collect the chunk regions for `src` using the chunking parameters `p`.
///
/// Convenience wrapper around find_path_chunks that owns the result vector
/// and returns it by value.
static
std::vector<emory::chunk::region>
find_chunks (std::filesystem::path const &src, emory::chunk::params const &p)
{
    std::vector<emory::chunk::region> collected;
    find_path_chunks (collected, src, p);
    return collected;
}
///////////////////////////////////////////////////////////////////////////////
/// Positions of the command line arguments within argv, in the order given
/// by the usage message: <input> <output> [bits] [window] [minimum] [maximum].
enum {
    ARG_SELF    = 0,    // program name
    ARGS_INPUT  = 1,    // required: input path
    ARGS_OUTPUT = 2,    // required: output path
    ARG_BITS    = 3,    // optional
    ARG_WINDOW  = 4,    // optional
    ARG_MINIMUM = 5,    // optional
    ARG_MAXIMUM = 6,    // optional

    // Total number of argv slots, counting the program name.
    NUM_ARGS = 7,

    // Smallest acceptable argc: program name plus the two required paths.
    NUM_ARGS_REQUIRED = 3,
};
2019-04-26 11:16:02 +10:00
//-----------------------------------------------------------------------------
2019-04-23 08:17:34 +10:00
int main (int argc, char const **argv)
{
if (argc < NUM_ARGS_REQUIRED) {
std::cerr << "usage: " << argv[ARG_SELF] << " <input> <output> [bits] [window] [minimum] [maximum]\n"
<< "default bits = " << emory::chunk::DEFAULT_PARAMS.bits << '\n'
<< "default window = " << emory::chunk::DEFAULT_PARAMS.window << '\n'
<< "default minimum = " << emory::chunk::DEFAULT_PARAMS.minimum << '\n'
<< "default maximum = " << emory::chunk::DEFAULT_PARAMS.maximum << '\n';
return EXIT_FAILURE;
}
emory::chunk::params p = emory::chunk::DEFAULT_PARAMS;
if (argc > ARG_BITS + 1)
p.bits = cruft::parse::from_string<std::size_t> (argv[ARG_BITS]);
if (argc > ARG_WINDOW + 1)
p.window = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]);
if (argc > ARG_BITS + 1)
p.minimum = cruft::parse::from_string<std::size_t> (argv[ARG_MINIMUM]);
if (argc > ARG_MAXIMUM + 1)
p.maximum = cruft::parse::from_string<std::size_t> (argv[ARG_MAXIMUM]);
std::cerr << p << '\n';
2020-12-06 10:00:22 +11:00
std::ofstream output (argv[ARGS_OUTPUT], std::ios::out | std::ios::trunc);
output.exceptions (std::ios::badbit | std::ios::eofbit | std::ios::failbit);
2019-04-23 08:17:34 +10:00
std::cout << "processing\n";
std::vector<emory::chunk::region> src = find_chunks (argv[ARGS_INPUT], p);
fmt::print ("analysing {} chunks\n", src.size ());
std::sort (
src.begin (),
src.end (),
region_less
);
output << "params: " << p << '\n';
for (auto const &chunk: src) {
output << chunk.size() << ' ';
for (auto const &c: chunk.digest)
output << std::hex << std::setw (2) << std::setfill ('0') << +c;
output << std::dec << '\n';
}
std::vector<off64_t> sizes;
std::transform (
src.begin (),
src.end (),
std::back_inserter (sizes),
[] (auto const &val) { return val.size (); }
);
auto const myaccum = std::accumulate (std::begin (sizes), std::end (sizes), 0);
std::cout << myaccum << '\n';
auto const total_bytes = std::accumulate (
src.begin (),
src.end (),
std::uintmax_t (0),
[] (auto const accum, auto const rhs)
{
return accum + rhs.size ();
});
std::vector<emory::chunk::region> unique;
std::unique_copy (
src.begin (),
src.end (),
std::back_inserter (unique),
region_equal
);
auto const unique_bytes = std::accumulate (
unique.begin (),
unique.end (),
0, [] (auto const accum, auto const rhs) { return accum + rhs.size (); }
);
auto const duplicated_bytes = total_bytes - unique_bytes;
float const duplicated_fraction = float (duplicated_bytes) / total_bytes;
fmt::print (
"{} duplicated bytes of {} ({:.2f}%)\n",
duplicated_bytes,
total_bytes,
100.f * duplicated_fraction
);
fmt::print ("{} duplicates\n", src.size () - unique.size ());
2019-04-23 08:17:34 +10:00
}