diff --git a/cruft/util b/cruft/util index 05880da..0d348a8 160000 --- a/cruft/util +++ b/cruft/util @@ -1 +1 @@ -Subproject commit 05880da6911c366d6f9546fec448ae1b3e6e8b7b +Subproject commit 0d348a83458812d82ece205a67c638efdae64011 diff --git a/tools/analyse.cpp b/tools/analyse.cpp index 13d8d59..9e06f9b 100644 --- a/tools/analyse.cpp +++ b/tools/analyse.cpp @@ -14,14 +14,16 @@ #include #include #include +#include #include #include #include -#include #include +#include #include +#include /////////////////////////////////////////////////////////////////////////////// @@ -59,6 +61,89 @@ static bool region_equal (emory::chunk::region const &a, emory::chunk::region co //} +/////////////////////////////////////////////////////////////////////////////// +static void find_path_chunks ( + std::vector &res, + std::filesystem::path const &src, + emory::chunk::params const &p +); + + +//----------------------------------------------------------------------------- +static +void +find_regular_chunks ( + std::vector &res, + std::filesystem::path const &src, + emory::chunk::params const &p +) { + try { + emory::chunk::find ( + std::back_inserter (res), + cruft::mapped_file (src), + p + ); + } catch (cruft::posix::error &err) { + fmt::print (stderr, "skipping {}, error: {}\n", src.string (), err.what ()); + } +} + + +//----------------------------------------------------------------------------- +static +void +find_directory_chunks ( + std::vector &res, + std::filesystem::path const &src, + emory::chunk::params const &p +) { + fmt::print (stderr, "{}\n", src.string ()); + for (auto const &child: std::filesystem::directory_iterator (src)) { + find_path_chunks (res, child, p); + } +} + + +//----------------------------------------------------------------------------- +static void find_path_chunks ( + std::vector &res, + std::filesystem::path const &src, + emory::chunk::params const &p +) { + switch (auto const type = status (src).type (); type) { + case std::filesystem::file_type::regular: + return find_regular_chunks (res, src, p); + + case std::filesystem::file_type::directory: + return find_directory_chunks (res, src, p); + + case std::filesystem::file_type::none: + case std::filesystem::file_type::not_found: + case std::filesystem::file_type::symlink: + case std::filesystem::file_type::block: + case std::filesystem::file_type::character: + case std::filesystem::file_type::fifo: + case std::filesystem::file_type::socket: + case std::filesystem::file_type::unknown: + fmt::print (stderr, "skipping path of unhandled type: '{}'\n", src.string ()); + return; + } + + unreachable (); +} + + +//----------------------------------------------------------------------------- +static +std::vector +find_chunks (std::filesystem::path const &src, emory::chunk::params const &p) +{ + std::vector res; + find_path_chunks (res, src, p); + return res; +} + + /////////////////////////////////////////////////////////////////////////////// enum { ARG_SELF, @@ -88,9 +173,9 @@ int main (int argc, char const **argv) emory::chunk::params p = emory::chunk::DEFAULT_PARAMS; if (argc > ARG_BITS + 1) - p.bits = cruft::parse::from_string (argv[ARG_WINDOW]); + p.bits = cruft::parse::from_string (argv[ARG_BITS]); if (argc > ARG_WINDOW + 1) - p.window = cruft::parse::from_string (argv[ARG_BITS]); + p.window = cruft::parse::from_string (argv[ARG_WINDOW]); if (argc > ARG_BITS + 1) p.minimum = cruft::parse::from_string (argv[ARG_MINIMUM]); @@ -99,32 +184,10 @@ int main (int argc, char const **argv) std::ofstream output (argv[ARGS_OUTPUT], std::ios::out | std::ios::trunc); output.exceptions (std::ios::badbit | std::ios::eofbit | std::ios::failbit); - cruft::mapped_file data (argv[ARGS_INPUT]); - std::cout << "size: " << data.size () << '\n'; - std::cout << "processing\n"; - std::vector src; - emory::chunk::find (std::back_inserter (src), data, p); + std::vector src = find_chunks (argv[ARGS_INPUT], p); - std::cout << "validating\n"; - std::cout << src.size () << " chunks\n"; - std::sort ( - src.begin (), - src.end (), - [] (auto const &a, auto const &b) { return a.offset.first < b.offset.first; } - ); - for (off_t i = 0, cursor = 0; i < std::ssize (src); ++i) { - if (src[i].offset.first != cursor) { - std::cout << "non-overlapping chunks\n"; - return -1; - } - cursor = src[i].offset.second; - } - - if (src.back ().offset.second != std::ssize (data)) { - std::cout << "invalid total size\n"; - return -1; - } + fmt::print ("analysing {} chunks\n", src.size ()); std::sort ( src.begin (), @@ -132,6 +195,14 @@ int main (int argc, char const **argv) region_less ); + output << "params: " << p << '\n'; + for (auto const &chunk: src) { + output << chunk.size() << ' '; + for (auto const &c: chunk.digest) + output << std::hex << std::setw (2) << std::setfill ('0') << +c; + output << std::dec << '\n'; + } + std::vector sizes; std::transform ( src.begin (), @@ -145,7 +216,7 @@ int main (int argc, char const **argv) auto const total_bytes = std::accumulate ( src.begin (), src.end (), - 0, + std::uintmax_t (0), [] (auto const accum, auto const rhs) { return accum + rhs.size (); @@ -175,5 +246,5 @@ int main (int argc, char const **argv) 100.f * duplicated_fraction ); - std::cout << (src.size () - unique.size ()) << " duplicates\n"; + fmt::print ("{} duplicates\n", src.size () - unique.size ()); }