/*
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Copyright 2019 Danny Robson
 */

#include "emory/chunk/find.hpp"
#include "emory/chunk/region.hpp"
#include "emory/chunk/params.hpp"
#include "emory/chunk/ostream.hpp"

// NOTE(review): the header names of the eleven includes below are missing in
// this copy of the file. They are kept exactly as found; restore the names
// from the original source before building.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include


///////////////////////////////////////////////////////////////////////////////
/// Three-way comparison of two regions: first by `size ()`, then by the
/// elements of `digest` in index order.
///
/// The resulting order carries no particular meaning. It exists so regions
/// can be sorted and so that duplicates can be recognised.
static std::strong_ordering
region_ordering (
    emory::chunk::region const &a,
    emory::chunk::region const &b
) {
    auto const by_size = a.size () <=> b.size ();
    if (by_size != 0)
        return by_size;

    // Sizes agree, so the first differing digest element decides.
    auto const count = std::ssize (a.digest);
    for (int idx = 0; idx < count; ++idx) {
        auto const by_digest = a.digest[idx] <=> b.digest[idx];
        if (by_digest != 0)
            return by_digest;
    }

    return std::strong_ordering::equal;
}


//-----------------------------------------------------------------------------
/// Returns true if `a` orders strictly before `b` under `region_ordering`.
static bool
region_less (
    emory::chunk::region const &a,
    emory::chunk::region const &b
) {
    return std::is_lt (region_ordering (a, b));
}


//-----------------------------------------------------------------------------
/// Returns true if `a` and `b` compare equal under `region_ordering`.
static bool
region_equal (
    emory::chunk::region const &a,
    emory::chunk::region const &b
) {
    return std::is_eq (region_ordering (a, b));
}


///////////////////////////////////////////////////////////////////////////////
// Declared ahead of its definition because the directory scanner and the path
// dispatcher call each other.
static void
find_path_chunks (
    std::vector<emory::chunk::region> &res,
    std::filesystem::path const &src,
    emory::chunk::params const &p
);


///----------------------------------------------------------------------------
/// Scan chunks in the path provided to a regular file.
static void find_regular_chunks ( std::vector &res, std::filesystem::path const &src, emory::chunk::params const &p ) { try { emory::chunk::find ( std::back_inserter (res), cruft::mapped_file (src), p ); } catch (cruft::posix::error &err) { fmt::print (stderr, "skipping {}, error: {}\n", src.string (), err.what ()); } } ///---------------------------------------------------------------------------- /// Scan chunks in the directory by recursing into all children. static void find_directory_chunks ( std::vector &res, std::filesystem::path const &src, emory::chunk::params const &p ) { fmt::print (stderr, "{}\n", src.string ()); for (auto const &child: std::filesystem::directory_iterator (src)) { find_path_chunks (res, child, p); } } ///---------------------------------------------------------------------------- /// Scan chunks from a given path by dispatching to `find_foo_chunks` style /// functions depending on the file type. static void find_path_chunks ( std::vector &res, std::filesystem::path const &src, emory::chunk::params const &p ) { switch (auto const type = status (src).type (); type) { case std::filesystem::file_type::regular: return find_regular_chunks (res, src, p); case std::filesystem::file_type::directory: return find_directory_chunks (res, src, p); case std::filesystem::file_type::none: case std::filesystem::file_type::not_found: case std::filesystem::file_type::symlink: case std::filesystem::file_type::block: case std::filesystem::file_type::character: case std::filesystem::file_type::fifo: case std::filesystem::file_type::socket: case std::filesystem::file_type::unknown: fmt::print (stderr, "skipping path of unhandled type: '{}'\n", src.string ()); return; } unreachable (); } ///---------------------------------------------------------------------------- /// Find all regions in a path and return a vector of the regions. 
static std::vector find_chunks (std::filesystem::path const &src, emory::chunk::params const &p) { std::vector res; find_path_chunks (res, src, p); return res; } /////////////////////////////////////////////////////////////////////////////// enum { ARG_SELF, ARGS_INPUT, ARGS_OUTPUT, ARG_BITS, ARG_WINDOW, ARG_MINIMUM, ARG_MAXIMUM, NUM_ARGS, NUM_ARGS_REQUIRED = 3, }; //----------------------------------------------------------------------------- int main (int argc, char const **argv) { // Extract commandline arguments if (argc < NUM_ARGS_REQUIRED) { std::cerr << "usage: " << argv[ARG_SELF] << " [bits] [window] [minimum] [maximum]\n" << "default bits = " << emory::chunk::DEFAULT_PARAMS.bits << '\n' << "default window = " << emory::chunk::DEFAULT_PARAMS.window << '\n' << "default minimum = " << emory::chunk::DEFAULT_PARAMS.minimum << '\n' << "default maximum = " << emory::chunk::DEFAULT_PARAMS.maximum << '\n'; return EXIT_FAILURE; } emory::chunk::params p = emory::chunk::DEFAULT_PARAMS; if (argc > ARG_BITS) p.bits = cruft::parse::from_string (argv[ARG_BITS]); if (argc > ARG_WINDOW) p.window = cruft::parse::from_string (argv[ARG_WINDOW]); if (argc > ARG_MINIMUM) p.minimum = cruft::parse::from_string (argv[ARG_MINIMUM]); if (argc > ARG_MAXIMUM) p.maximum = cruft::parse::from_string (argv[ARG_MAXIMUM]); std::cerr << p << '\n'; // Find all the chunks and prepare them for output std::cout << "processing\n"; std::vector src = find_chunks (argv[ARGS_INPUT], p); fmt::print ("analysing {} chunks\n", src.size ()); std::sort (src.begin (), src.end (), region_less); // Write all chunks to the output file std::ofstream output (argv[ARGS_OUTPUT], std::ios::out | std::ios::trunc); output.exceptions (std::ios::badbit | std::ios::eofbit | std::ios::failbit); output << "params: " << p << '\n'; for (auto const &chunk: src) { output << chunk.size() << ' '; for (auto const &c: chunk.digest) output << std::hex << std::setw (2) << std::setfill ('0') << +c; output << std::dec << '\n'; } 
std::vector sizes; std::transform ( src.begin (), src.end (), std::back_inserter (sizes), [] (auto const &val) { return val.size (); } ); auto const myaccum = std::accumulate (std::begin (sizes), std::end (sizes), 0); std::cout << myaccum << '\n'; // Find the total and unique byte counts auto const total_bytes = std::accumulate ( src.begin (), src.end (), std::uintmax_t (0), [] (auto const accum, auto const rhs) { return accum + rhs.size (); }); std::vector unique; std::unique_copy ( src.begin (), src.end (), std::back_inserter (unique), region_equal ); auto const unique_bytes = std::accumulate ( unique.begin (), unique.end (), 0, [] (auto const accum, auto const rhs) { return accum + rhs.size (); } ); auto const duplicated_bytes = total_bytes - unique_bytes; float const duplicated_fraction = float (duplicated_bytes) / total_bytes; fmt::print ( "{} duplicated bytes of {} ({:.2f}%)\n", duplicated_bytes, total_bytes, 100.f * duplicated_fraction ); fmt::print ("{} duplicates\n", src.size () - unique.size ()); }