From 9b6e1e770febc01855cb16f8000b2341a68ca31a Mon Sep 17 00:00:00 2001 From: Danny Robson Date: Sun, 6 Dec 2020 06:22:52 +1000 Subject: [PATCH] analyse: print duplicated count and total size --- tools/analyse.cpp | 66 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/tools/analyse.cpp b/tools/analyse.cpp index e38ae2d..67fbbf2 100644 --- a/tools/analyse.cpp +++ b/tools/analyse.cpp @@ -80,7 +80,73 @@ int main (int argc, char const **argv) }; cruft::mapped_file data (argv[ARGS_INPUT]); + std::cout << "size: " << data.size () << '\n'; + + std::cout << "processing\n"; emory::chunk::map src (data, p); + std::cout << "validating\n"; std::cout << src.size () << " chunks\n"; + std::sort ( + src.elements.begin (), + src.elements.end (), + [] (auto const &a, auto const &b) { return a.offset.first < b.offset.first; } + ); + for (int i = 0, cursor = 0; i < std::ssize (src.elements); ++i) { + if (src.elements[i].offset.first != cursor) { + std::cout << "non-overlapping chunks\n"; + return -1; + } + cursor = src.elements[i].offset.second; + } + + if (src.elements.back ().offset.second != std::ssize (data)) { + std::cout << "invalid total size\n"; + return -1; + } + + std::sort ( + src.elements.begin (), + src.elements.end (), + region_less + ); + + std::vector sizes; + std::transform ( + src.elements.begin (), + src.elements.end (), + std::back_inserter (sizes), + [] (auto const &val) { return val.size (); } + ); + auto const myaccum = std::accumulate (std::begin (sizes), std::end (sizes), 0); + std::cout << myaccum << '\n'; + + auto const total_bytes = std::accumulate ( + src.elements.begin (), + src.elements.end (), + 0, + [] (auto const accum, auto const rhs) + { + return accum + rhs.size (); + }); + + + std::vector unique; + std::unique_copy ( + src.elements.begin (), + src.elements.end (), + std::back_inserter (unique), + region_equal + ); + auto const unique_bytes = std::accumulate ( + unique.begin (), + unique.end (), + 0, [] (auto const accum, auto const rhs) { return accum + rhs.size (); } + ); + + auto const duplicated_bytes = total_bytes - unique_bytes; + float const duplicated_fraction = float (duplicated_bytes) / total_bytes; + + std::cout << duplicated_bytes << " duplicated bytes of " << total_bytes << " (" << duplicated_fraction << "%)\n"; + std::cout << (src.elements.size () - unique.size ()) << " duplicates\n"; }