/*
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Copyright 2019 Danny Robson
 */

#include "emory/chunk/find.hpp"
#include "emory/chunk/region.hpp"
#include "emory/chunk/params.hpp"
#include "emory/chunk/ostream.hpp"

// NOTE(review): the targets of the remaining #include directives were not
// legible in the reviewed copy of this file. The list below is reconstructed
// from the names this translation unit uses; the cruft/fmt header paths in
// particular should be confirmed against the build tree.
#include <cruft/util/io.hpp>            // cruft::mapped_file (presumed location)
#include <cruft/util/parse/value.hpp>   // cruft::parse::from_string (presumed location)

#include <fmt/format.h>                 // fmt::print

#include <sys/types.h>                  // off_t

#include <algorithm>
#include <compare>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>


///////////////////////////////////////////////////////////////////////////////
/// Three-way comparison of two regions: first by `size ()`, then element by
/// element over `digest`. The offsets of the regions are deliberately not
/// consulted, so two regions at different positions compare equal when their
/// size and digest match.
static std::strong_ordering
region_ordering (
    emory::chunk::region const &a,
    emory::chunk::region const &b
) {
    if (auto const by_size = a.size () <=> b.size (); by_size != 0)
        return by_size;

    // Indexes `b.digest` over the extent of `a.digest`, as the original did.
    for (int i = 0; i < std::ssize (a.digest); ++i) {
        if (auto const by_byte = a.digest[i] <=> b.digest[i]; by_byte != 0)
            return by_byte;
    }

    return std::strong_ordering::equal;
}


//-----------------------------------------------------------------------------
/// Strict-weak-ordering predicate built on `region_ordering`.
static bool
region_less (emory::chunk::region const &a, emory::chunk::region const &b)
{
    return region_ordering (a, b) < 0;
}


//-----------------------------------------------------------------------------
/// Equality predicate built on `region_ordering`.
static bool
region_equal (emory::chunk::region const &a, emory::chunk::region const &b)
{
    return region_ordering (a, b) == 0;
}


//static bool overlap (emory::chunk::region const &a, emory::chunk::region const &b)
//{
//    return a.offset.first  < b.offset.second &&
//           b.offset.first < a.offset.second;
//}


//-----------------------------------------------------------------------------
/// Sum of `size ()` over every region in `regions`.
///
/// The accumulator is `std::uintmax_t` rather than `int` so that inputs
/// larger than `INT_MAX` bytes do not overflow.
static std::uintmax_t
total_size (std::vector<emory::chunk::region> const &regions)
{
    return std::accumulate (
        regions.begin (),
        regions.end (),
        std::uintmax_t {0},
        [] (std::uintmax_t const accum, emory::chunk::region const &rhs)
        {
            return accum + static_cast<std::uintmax_t> (rhs.size ());
        }
    );
}


///////////////////////////////////////////////////////////////////////////////
/// Indices into `argv`. The first three are required; the remainder are
/// optional and fall back to `emory::chunk::DEFAULT_PARAMS`.
enum {
    ARG_SELF,
    ARGS_INPUT,
    ARGS_OUTPUT,
    ARG_BITS,
    ARG_WINDOW,
    ARG_MINIMUM,

    NUM_ARGS,
    NUM_ARGS_REQUIRED = 3,
};


//-----------------------------------------------------------------------------
/// Chunk the input file, validate that the chunks tile it exactly, and report
/// how many bytes belong to chunks that duplicate another chunk (same size
/// and digest).
///
/// Returns EXIT_FAILURE on bad usage, -1 when chunk validation fails, and
/// falls off the end (0) on success.
int
main (int argc, char const **argv)
{
    if (argc < NUM_ARGS_REQUIRED) {
        std::cerr
            << "usage: " << argv[ARG_SELF] << " <input> <output> [bits] [window] [minimum]\n"
            << "default bits = "    << emory::chunk::DEFAULT_PARAMS.bits    << '\n'
            << "default window = "  << emory::chunk::DEFAULT_PARAMS.window  << '\n'
            << "default minimum = " << emory::chunk::DEFAULT_PARAMS.minimum << '\n';
        return EXIT_FAILURE;
    }

    // Each optional argument is present exactly when argc exceeds its index,
    // and each one populates the parameter it is named after.
    //
    // NOTE(review): the template argument of `from_string` was not legible in
    // the reviewed copy; the destination member's own type is used here.
    emory::chunk::params p = emory::chunk::DEFAULT_PARAMS;
    if (argc > ARG_BITS)
        p.bits = cruft::parse::from_string<decltype (p.bits)> (argv[ARG_BITS]);
    if (argc > ARG_WINDOW)
        p.window = cruft::parse::from_string<decltype (p.window)> (argv[ARG_WINDOW]);
    if (argc > ARG_MINIMUM)
        p.minimum = cruft::parse::from_string<decltype (p.minimum)> (argv[ARG_MINIMUM]);

    std::cerr << p << '\n';

    // The output file is created/truncated up front and set to throw on any
    // stream error. Nothing in this file writes to it.
    std::ofstream output (argv[ARGS_OUTPUT], std::ios::out | std::ios::trunc);
    output.exceptions (std::ios::badbit | std::ios::eofbit | std::ios::failbit);

    cruft::mapped_file data (argv[ARGS_INPUT]);
    std::cout << "size: " << data.size () << '\n';

    std::cout << "processing\n";
    std::vector<emory::chunk::region> src;
    emory::chunk::find (std::back_inserter (src), data, p);

    std::cout << "validating\n";
    std::cout << src.size () << " chunks\n";

    // Order by start offset so contiguity can be checked in a single pass.
    std::sort (
        src.begin (),
        src.end (),
        [] (auto const &a, auto const &b) { return a.offset.first < b.offset.first; }
    );

    // Every chunk must begin exactly where the previous one ended.
    off_t cursor = 0;
    for (auto const &chunk: src) {
        if (chunk.offset.first != cursor) {
            // Message kept verbatim; it fires on any gap or overlap.
            std::cout << "non-overlapping chunks\n";
            return -1;
        }

        cursor = chunk.offset.second;
    }

    // After the loop `cursor` is the end of the last chunk, or zero when no
    // chunks were produced, so this never dereferences an empty vector.
    if (cursor != std::ssize (data)) {
        std::cout << "invalid total size\n";
        return -1;
    }

    // Re-sort by (size, digest) so identical chunks become adjacent.
    std::sort (src.begin (), src.end (), region_less);

    auto const total_bytes = total_size (src);
    std::cout << total_bytes << '\n';

    std::vector<emory::chunk::region> unique;
    std::unique_copy (
        src.begin (),
        src.end (),
        std::back_inserter (unique),
        region_equal
    );

    auto const unique_bytes     = total_size (unique);
    auto const duplicated_bytes = total_bytes - unique_bytes;

    // Avoid dividing by zero when the input produced no bytes at all.
    float const duplicated_fraction = total_bytes != 0
        ? float (duplicated_bytes) / float (total_bytes)
        : 0.f;

    fmt::print (
        "{} duplicated bytes of {} ({:.2f}%)\n",
        duplicated_bytes,
        total_bytes,
        100.f * duplicated_fraction
    );

    std::cout << (src.size () - unique.size ()) << " duplicates\n";
}