/*
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Copyright 2019 Danny Robson
 */

#include "emory/chunk/map.hpp"
#include "emory/chunk/params.hpp"

// NOTE(review): the targets of the angle-bracket includes were lost from the
// copy of this file under review. The standard headers below are derived
// from what the code uses; the two cruft paths are a best guess for the
// headers providing `cruft::mapped_file` and `cruft::parse::from_string` --
// confirm them against the cruft checkout used by this project.
#include <cruft/util/io.hpp>
#include <cruft/util/parse/value.hpp>

#include <algorithm>
#include <compare>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <iterator>


///////////////////////////////////////////////////////////////////////////////
/// Three-way comparison of two regions by content identity.
///
/// Regions are ordered first by their size and then element-wise by their
/// digest. The offsets of the regions do not participate, so two regions
/// with the same size and digest compare `equal` wherever they are located.
static std::strong_ordering
region_ordering (
    emory::chunk::region const &a,
    emory::chunk::region const &b
) {
    if (auto const cmp = a.size () <=> b.size (); cmp != 0)
        return cmp;

    // Only the length of `a.digest` is consulted; this mirrors the original
    // loop and presumes both digests have the same length.
    auto const length = std::ssize (a.digest);
    for (std::ptrdiff_t i = 0; i < length; ++i)
        if (auto const cmp = a.digest[i] <=> b.digest[i]; cmp != 0)
            return cmp;

    return std::strong_ordering::equal;
}


//-----------------------------------------------------------------------------
/// Strict-weak-ordering predicate built on `region_ordering`.
static bool
region_less (emory::chunk::region const &a, emory::chunk::region const &b)
{
    return region_ordering (a, b) < 0;
}


//-----------------------------------------------------------------------------
/// Equivalence predicate built on `region_ordering`.
static bool
region_equal (emory::chunk::region const &a, emory::chunk::region const &b)
{
    return region_ordering (a, b) == 0;
}


//static bool overlap (emory::chunk::region const &a, emory::chunk::region const &b)
//{
//    return a.offset.first < b.offset.second &&
//           b.offset.first < a.offset.second;
//}


///////////////////////////////////////////////////////////////////////////////
/// Positions of the command line arguments within `argv`.
enum {
    ARG_SELF,
    ARG_BITS,
    ARG_WINDOW,
    ARG_MINIMUM,
    ARG_INPUT,

    NUM_ARGS,
};


//-----------------------------------------------------------------------------
/// Chunk the input file and report how many bytes belong to duplicate chunks.
///
/// The chunk list is first validated: sorted by start offset, every chunk
/// must begin where the previous one ended, and the final chunk must end at
/// the end of the mapped data. The chunks are then sorted by (size, digest)
/// so that identical chunks are adjacent and duplicates can be tallied in a
/// single pass.
///
/// Returns EXIT_FAILURE on a usage error or a failed validation.
int
main (int argc, char const **argv)
{
    if (argc != NUM_ARGS) {
        // The placeholder names were missing from the reviewed copy of this
        // message; they are reconstructed from the argument enum.
        std::cerr << "usage: " << argv[ARG_SELF] << " <bits> <window> <minimum> <input>\n";
        return EXIT_FAILURE;
    }

    // NOTE(review): the template arguments of `from_string` were lost from
    // the reviewed copy. Parsing directly into each field's declared type
    // avoids guessing them; confirm this matches the intended parse types.
    using params = emory::chunk::params;
    params const p {
        .bits    = cruft::parse::from_string<decltype (params::bits)>    (argv[ARG_BITS]),
        .window  = cruft::parse::from_string<decltype (params::window)>  (argv[ARG_WINDOW]),
        .minimum = cruft::parse::from_string<decltype (params::minimum)> (argv[ARG_MINIMUM]),
    };

    cruft::mapped_file data (argv[ARG_INPUT]);
    std::cout << "size: " << data.size () << '\n';

    std::cout << "processing\n";
    emory::chunk::map src (data, p);

    std::cout << "validating\n";
    std::cout << src.size () << " chunks\n";

    std::sort (
        src.elements.begin (),
        src.elements.end (),
        [] (auto const &a, auto const &b) { return a.offset.first < b.offset.first; }
    );

    // Track the cursor in the offsets' own type rather than `int`, so that
    // offsets beyond INT_MAX are not truncated.
    using offset_type = decltype (src.elements.front ().offset.first);
    offset_type cursor {0};

    for (auto const &chunk: src.elements) {
        if (chunk.offset.first != cursor) {
            std::cout << "non-contiguous chunks\n";
            return EXIT_FAILURE;
        }

        cursor = chunk.offset.second;
    }

    // After the walk the cursor is the end of the last chunk, or zero if
    // there were no chunks at all, so no `back ()` call on a possibly empty
    // container is needed.
    if (cursor != std::ssize (data)) {
        std::cout << "invalid total size\n";
        return EXIT_FAILURE;
    }

    // Group identical chunks together so duplicates become adjacent.
    std::sort (src.elements.begin (), src.elements.end (), region_less);

    // Byte counts are summed in a wide unsigned type; an `int` accumulator
    // overflows for inputs larger than INT_MAX bytes.
    std::uintmax_t total_bytes     = 0;
    std::uintmax_t unique_bytes    = 0;
    std::size_t    duplicate_count = 0;

    auto const first = src.elements.begin ();
    auto const last  = src.elements.end ();

    for (auto cur = first; cur != last; ++cur) {
        auto const bytes = static_cast<std::uintmax_t> (cur->size ());
        total_bytes += bytes;

        // The first chunk of each run of equal chunks counts as unique; the
        // rest of the run are duplicates.
        if (cur != first && region_equal (*(cur - 1), *cur))
            ++duplicate_count;
        else
            unique_bytes += bytes;
    }

    std::cout << total_bytes << '\n';

    auto const duplicated_bytes = total_bytes - unique_bytes;

    // The value is printed with a '%' suffix, so scale the ratio to a
    // percentage, and avoid dividing zero by zero for an empty input.
    double const duplicated_percent = total_bytes == 0
        ? 0.0
        : 100.0 * static_cast<double> (duplicated_bytes) / static_cast<double> (total_bytes);

    std::cout
        << duplicated_bytes << " duplicated bytes of "
        << total_bytes << " (" << duplicated_percent << "%)\n";
    std::cout << duplicate_count << " duplicates\n";

    return EXIT_SUCCESS;
}