analyse: add directory and recursion support
This commit is contained in:
parent
8b742647df
commit
3ad55453f7
@ -1 +1 @@
|
||||
Subproject commit 05880da6911c366d6f9546fec448ae1b3e6e8b7b
|
||||
Subproject commit 0d348a83458812d82ece205a67c638efdae64011
|
@ -14,14 +14,16 @@
|
||||
#include <cruft/util/io.hpp>
|
||||
#include <cruft/util/view.hpp>
|
||||
#include <cruft/util/parse/value.hpp>
|
||||
#include <cruft/util/posix/except.hpp>
|
||||
|
||||
#include <fmt/format.h>
|
||||
#include <fmt/compile.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <compare>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
@ -59,6 +61,89 @@ static bool region_equal (emory::chunk::region const &a, emory::chunk::region co
|
||||
//}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
static void find_path_chunks (
|
||||
std::vector<emory::chunk::region> &res,
|
||||
std::filesystem::path const &src,
|
||||
emory::chunk::params const &p
|
||||
);
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
static
|
||||
void
|
||||
find_regular_chunks (
|
||||
std::vector<emory::chunk::region> &res,
|
||||
std::filesystem::path const &src,
|
||||
emory::chunk::params const &p
|
||||
) {
|
||||
try {
|
||||
emory::chunk::find<emory::chunk::static_hash> (
|
||||
std::back_inserter (res),
|
||||
cruft::mapped_file (src),
|
||||
p
|
||||
);
|
||||
} catch (cruft::posix::error &err) {
|
||||
fmt::print (stderr, "skipping {}, error: {}\n", src.string (), err.what ());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
static
|
||||
void
|
||||
find_directory_chunks (
|
||||
std::vector<emory::chunk::region> &res,
|
||||
std::filesystem::path const &src,
|
||||
emory::chunk::params const &p
|
||||
) {
|
||||
fmt::print (stderr, "{}\n", src.string ());
|
||||
for (auto const &child: std::filesystem::directory_iterator (src)) {
|
||||
find_path_chunks (res, child, p);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
static void find_path_chunks (
|
||||
std::vector<emory::chunk::region> &res,
|
||||
std::filesystem::path const &src,
|
||||
emory::chunk::params const &p
|
||||
) {
|
||||
switch (auto const type = status (src).type (); type) {
|
||||
case std::filesystem::file_type::regular:
|
||||
return find_regular_chunks (res, src, p);
|
||||
|
||||
case std::filesystem::file_type::directory:
|
||||
return find_directory_chunks (res, src, p);
|
||||
|
||||
case std::filesystem::file_type::none:
|
||||
case std::filesystem::file_type::not_found:
|
||||
case std::filesystem::file_type::symlink:
|
||||
case std::filesystem::file_type::block:
|
||||
case std::filesystem::file_type::character:
|
||||
case std::filesystem::file_type::fifo:
|
||||
case std::filesystem::file_type::socket:
|
||||
case std::filesystem::file_type::unknown:
|
||||
fmt::print (stderr, "skipping path of unhandled type: '{}'\n", src.string ());
|
||||
return;
|
||||
}
|
||||
|
||||
unreachable ();
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
static
|
||||
std::vector<emory::chunk::region>
|
||||
find_chunks (std::filesystem::path const &src, emory::chunk::params const &p)
|
||||
{
|
||||
std::vector<emory::chunk::region> res;
|
||||
find_path_chunks (res, src, p);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
enum {
|
||||
ARG_SELF,
|
||||
@ -88,9 +173,9 @@ int main (int argc, char const **argv)
|
||||
|
||||
emory::chunk::params p = emory::chunk::DEFAULT_PARAMS;
|
||||
if (argc > ARG_BITS + 1)
|
||||
p.bits = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]);
|
||||
p.bits = cruft::parse::from_string<std::size_t> (argv[ARG_BITS]);
|
||||
if (argc > ARG_WINDOW + 1)
|
||||
p.window = cruft::parse::from_string<std::size_t> (argv[ARG_BITS]);
|
||||
p.window = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]);
|
||||
if (argc > ARG_BITS + 1)
|
||||
p.minimum = cruft::parse::from_string<std::size_t> (argv[ARG_MINIMUM]);
|
||||
|
||||
@ -99,32 +184,10 @@ int main (int argc, char const **argv)
|
||||
std::ofstream output (argv[ARGS_OUTPUT], std::ios::out | std::ios::trunc);
|
||||
output.exceptions (std::ios::badbit | std::ios::eofbit | std::ios::failbit);
|
||||
|
||||
cruft::mapped_file data (argv[ARGS_INPUT]);
|
||||
std::cout << "size: " << data.size () << '\n';
|
||||
|
||||
std::cout << "processing\n";
|
||||
std::vector<emory::chunk::region> src;
|
||||
emory::chunk::find<emory::chunk::static_hash> (std::back_inserter (src), data, p);
|
||||
std::vector<emory::chunk::region> src = find_chunks (argv[ARGS_INPUT], p);
|
||||
|
||||
std::cout << "validating\n";
|
||||
std::cout << src.size () << " chunks\n";
|
||||
std::sort (
|
||||
src.begin (),
|
||||
src.end (),
|
||||
[] (auto const &a, auto const &b) { return a.offset.first < b.offset.first; }
|
||||
);
|
||||
for (off_t i = 0, cursor = 0; i < std::ssize (src); ++i) {
|
||||
if (src[i].offset.first != cursor) {
|
||||
std::cout << "non-overlapping chunks\n";
|
||||
return -1;
|
||||
}
|
||||
cursor = src[i].offset.second;
|
||||
}
|
||||
|
||||
if (src.back ().offset.second != std::ssize (data)) {
|
||||
std::cout << "invalid total size\n";
|
||||
return -1;
|
||||
}
|
||||
fmt::print ("analysing {} chunks\n", src.size ());
|
||||
|
||||
std::sort (
|
||||
src.begin (),
|
||||
@ -132,6 +195,14 @@ int main (int argc, char const **argv)
|
||||
region_less
|
||||
);
|
||||
|
||||
output << "params: " << p << '\n';
|
||||
for (auto const &chunk: src) {
|
||||
output << chunk.size() << ' ';
|
||||
for (auto const &c: chunk.digest)
|
||||
output << std::hex << std::setw (2) << std::setfill ('0') << +c;
|
||||
output << std::dec << '\n';
|
||||
}
|
||||
|
||||
std::vector<off64_t> sizes;
|
||||
std::transform (
|
||||
src.begin (),
|
||||
@ -145,7 +216,7 @@ int main (int argc, char const **argv)
|
||||
auto const total_bytes = std::accumulate (
|
||||
src.begin (),
|
||||
src.end (),
|
||||
0,
|
||||
std::uintmax_t (0),
|
||||
[] (auto const accum, auto const rhs)
|
||||
{
|
||||
return accum + rhs.size ();
|
||||
@ -175,5 +246,5 @@ int main (int argc, char const **argv)
|
||||
100.f * duplicated_fraction
|
||||
);
|
||||
|
||||
std::cout << (src.size () - unique.size ()) << " duplicates\n";
|
||||
fmt::print ("{} duplicates\n", src.size () - unique.size ());
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user