emory/tools/compare.cpp

/*
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Copyright 2013 Danny Robson <danny@nerdcruft.net>
 */

#include "emory/chunk/params.hpp"
#include "emory/chunk/find.hpp"
#include "emory/chunk/region.hpp"
#include "emory/chunk/match.hpp"
#include "emory/chunk/ostream.hpp"

#include <cruft/util/io.hpp>
#include <cruft/util/parse/value.hpp>

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>


/// Positions of the command line arguments within `argv`, in the order
/// given by the usage message printed by `main`:
///
///     <self> <bits> <window> <minimum> <maximum> <target> <source> [...]
enum {
    ARG_SELF     = 0,  ///< the program name
    ARG_BITS     = 1,  ///< parsed into `params::bits`
    ARG_WINDOW   = 2,  ///< parsed into `params::window`
    ARGS_MINIMUM = 3,  ///< parsed into `params::minimum`
    ARGS_MAXIMUM = 4,  ///< parsed into `params::maximum`
    ARGS_TARGET  = 5,  ///< path of the target file
    ARGS_SOURCE  = 6,  ///< path of the first source file; any further
                       ///< arguments are treated as additional sources

    /// The smallest acceptable value of `argc`.
    NUM_ARGS     = 7,
};


/// Estimate how many bytes of a target file are covered by chunks that
/// also occur in one or more source files.
///
/// Usage:
///     <self> <bits> <window> <minimum> <maximum> <target> <source> [...]
///
/// The target and every source are split into regions by
/// `emory::chunk::find` using the same `emory::chunk::params`. The regions
/// common to the target and each source are collected, every match is
/// printed to stdout followed by a summary of matched vs total target
/// bytes, and the per-source byte contributions are printed to stderr.
/// Progress messages are written to `std::clog`.
///
/// \param argc  number of entries in `argv`; must be at least `NUM_ARGS`
/// \param argv  the arguments, indexed by the `ARG_*`/`ARGS_*` enumerators
/// \return `EXIT_FAILURE` if too few arguments were supplied. Otherwise
///         control reaches the end of `main`, which reports success.
int main (int argc, char const **argv)
{
    if (argc < NUM_ARGS) {
        std::cerr << "usage: " << argv[ARG_SELF] << " <bits> <window> <minimum> <maximum> <target> <source> [...]\n";
        return EXIT_FAILURE;
    }

    // The same chunking parameters are used for the target and all sources.
    emory::chunk::params const p {
        .bits    = cruft::parse::from_string<std::size_t> (argv[ARG_BITS  ]),
        .window  = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]),
        .minimum = cruft::parse::from_string<std::ptrdiff_t> (argv[ARGS_MINIMUM]),
        .maximum = cruft::parse::from_string<std::ptrdiff_t> (argv[ARGS_MAXIMUM]),
    };

    std::clog << "Hashing target\n";
    std::vector<emory::chunk::region> target;
    emory::chunk::find<emory::chunk::static_hash> (
        std::back_inserter (target),
        cruft::mapped_file (argv[ARGS_TARGET]), p
    );

    std::sort (target.begin (), target.end (), emory::chunk::region::digest_ordering {});
    std::cout << "Found " << target.size () << " chunks\n";

    // Matches accumulated across every source. The `int` id attached to
    // each side of a match is the argv index of the file it came from.
    std::vector<emory::chunk::match<int>> found;

    for (int i = ARGS_SOURCE; i != argc; ++i) {
        auto const path = argv[i];
        std::clog << "Hashing source: " << path << '\n';
        std::vector<emory::chunk::region> source;
        emory::chunk::find<emory::chunk::static_hash> (
            std::back_inserter (source),
            cruft::mapped_file (path), p
        );
        std::sort (source.begin (), source.end (), emory::chunk::region::digest_ordering {});

        std::clog << "Finding common\n";
        // `common` is located through argument dependent lookup. The
        // temporary it returns is kept alive by the const reference.
        auto const &source_matches = common (target, source);
        std::clog << "Discovered " << source_matches.size () << " blocks\n";

        // Re-tag each match with the argv indices of this source and of
        // the target so contributions can be attributed afterwards.
        std::transform (
            std::begin (source_matches),
            std::end   (source_matches),
            std::back_inserter (found),
            [&] (auto const &j) -> emory::chunk::match<int>
        {
            return {
                .src = {           i, j.src.data },
                .dst = { ARGS_TARGET, j.dst.data },
            };
        });

        // NOTE(review): std::unique only collapses *adjacent* equivalent
        // elements. The range is ordered by `src_ordering` but compared
        // with `dst_equality`; confirm that matches with equal
        // destinations are guaranteed to be adjacent under this ordering.
        std::sort (
            std::begin (found),
            std::end   (found),
            emory::chunk::match<int>::src_ordering {}
        );

        found.erase (
            std::unique (
                std::begin (found),
                std::end   (found),
                emory::chunk::match<int>::dst_equality {}
            ),
            found.end ()
        );
    }

    std::clog << "Finalising\n";
    std::size_t matching = 0;
    // One slot per source file, indexed by (argv index - ARGS_SOURCE).
    std::vector<std::size_t> source_bytes (argc - ARGS_SOURCE, 0);
    for (auto const &i: found) {
        std::cout << i << '\n';
        auto const size = i.dst.data.offset.second - i.dst.data.offset.first;
        matching += size;
        source_bytes[i.src.id - ARGS_SOURCE] += size;
    }

    // The accumulator type of std::accumulate is the type of the initial
    // value, so it must be a std::size_t: seeding with `0u` would sum in
    // `unsigned int` and wrap for targets of 4GiB or more.
    std::size_t const total = std::accumulate (
        target.begin (),
        target.end (),
        std::size_t {0},
        [] (std::size_t const sum, auto const &chunk) -> std::size_t
        {
            return sum + static_cast<std::size_t> (chunk.offset.second - chunk.offset.first);
        }
    );

    // Guard against a target that produced no chunks so that we never
    // divide by zero, and compute the ratio in double precision.
    double const factor = total == 0
        ? 0.0
        : static_cast<double> (matching) / static_cast<double> (total);

    std::cout << "Found " << found.size () << " chunks. " << matching << "/" << total << " bytes for a factor of " << factor << "\n";

    for (int i = ARGS_SOURCE; i != argc; ++i)
        std::cerr << argv[i] << " contributed: " << source_bytes[i - ARGS_SOURCE] << '\n';
}
emory-compare: add basic duplication estimator 2019-04-23 21:34:01 +10:00			`/*`
			`* This Source Code Form is subject to the terms of the Mozilla Public`
			`* License, v. 2.0. If a copy of the MPL was not distributed with this`
			`* file, You can obtain one at http://mozilla.org/MPL/2.0/.`
			`*`
			`* Copyright 2013 Danny Robson <danny@nerdcruft.net>`
			`*/`

emory: break comparison tool into distinct units 2019-04-26 11:10:07 +10:00			`#include "emory/chunk/params.hpp"`
chunk/find: remove map in favour of bare queries 2020-12-06 10:34:33 +11:00			`#include "emory/chunk/find.hpp"`
			`#include "emory/chunk/region.hpp"`
emory: break comparison tool into distinct units 2019-04-26 11:10:07 +10:00			`#include "emory/chunk/match.hpp"`
			`#include "emory/chunk/ostream.hpp"`

emory-compare: add basic duplication estimator 2019-04-23 21:34:01 +10:00			`#include <cruft/util/io.hpp>`
			`#include <cruft/util/parse/value.hpp>`

emory: break comparison tool into distinct units 2019-04-26 11:10:07 +10:00			`#include <cstddef>`
emory-compare: add basic duplication estimator 2019-04-23 21:34:01 +10:00			`#include <iostream>`


			`enum {`
			`ARG_SELF,`
			`ARG_BITS,`
			`ARG_WINDOW,`
			`ARGS_MINIMUM,`
chunk/param: add a maximum chunk size param 2020-12-29 09:01:40 +11:00			`ARGS_MAXIMUM,`
emory-compare: add basic duplication estimator 2019-04-23 21:34:01 +10:00			`ARGS_TARGET,`
			`ARGS_SOURCE,`

			`NUM_ARGS,`
			`};`


			`int main (int argc, char const **argv)`
			`{`
tools/compare: allow multiple sources 2019-04-26 09:08:03 +10:00			`if (argc < NUM_ARGS) {`
chunk/param: add a maximum chunk size param 2020-12-29 09:01:40 +11:00			`std::cerr << "usage: " << argv[ARG_SELF] << " <bits> <window> <minimum> <maximum> <target> <source> [...]\n";`
emory-compare: add basic duplication estimator 2019-04-23 21:34:01 +10:00			`return EXIT_FAILURE;`
			`}`

emory: break comparison tool into distinct units 2019-04-26 11:10:07 +10:00			`emory::chunk::params const p {`
chunk/param: add a maximum chunk size param 2020-12-29 09:01:40 +11:00			`.bits = cruft::parse::from_string<std::size_t> (argv[ARG_BITS ]),`
			`.window = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]),`
emory-compare: add basic duplication estimator 2019-04-23 21:34:01 +10:00			`.minimum = cruft::parse::from_string<std::ptrdiff_t> (argv[ARGS_MINIMUM]),`
chunk/param: add a maximum chunk size param 2020-12-29 09:01:40 +11:00			`.maximum = cruft::parse::from_string<std::ptrdiff_t> (argv[ARGS_MAXIMUM]),`
emory-compare: add basic duplication estimator 2019-04-23 21:34:01 +10:00			`};`

			`std::clog << "Hashing target\n";`
chunk/find: remove map in favour of bare queries 2020-12-06 10:34:33 +11:00			`std::vector<emory::chunk::region> target;`
			`emory::chunk::find<emory::chunk::static_hash> (`
			`std::back_inserter (target),`
			`cruft::mapped_file (argv[ARGS_TARGET]), p`
			`);`

			`std::sort (target.begin (), target.end (), emory::chunk::region::digest_ordering {});`
			`std::cout << "Found " << target.size () << " chunks\n";`
tools/compare: allow multiple sources 2019-04-26 09:08:03 +10:00
emory: break comparison tool into distinct units 2019-04-26 11:10:07 +10:00			`std::vector<emory::chunk::match<int>> found;`
tools/compare: allow multiple sources 2019-04-26 09:08:03 +10:00
			`for (int i = ARGS_SOURCE; i != argc; ++i) {`
			`auto const path = argv[i];`
			`std::clog << "Hashing source: " << path << '\n';`
chunk/find: remove map in favour of bare queries 2020-12-06 10:34:33 +11:00			`std::vector<emory::chunk::region> source;`
			`emory::chunk::find<emory::chunk::static_hash> (`
			`std::back_inserter (source),`
			`cruft::mapped_file (path), p`
			`);`
			`std::sort (source.begin (), source.end (), emory::chunk::region::digest_ordering {});`
tools/compare: allow multiple sources 2019-04-26 09:08:03 +10:00
			`std::clog << "Finding common\n";`
			`auto const &source_matches = common (target, source);`
			`std::clog << "Discovered " << source_matches.size () << " blocks\n";`

			`std::transform (`
			`std::begin (source_matches),`
			`std::end (source_matches),`
			`std::back_inserter (found),`
emory: break comparison tool into distinct units 2019-04-26 11:10:07 +10:00			`[&] (auto const &j) -> emory::chunk::match<int>`
tools/compare: record the source of the matched blocks 2019-04-26 09:25:41 +10:00			`{`
			`return {`
			`.src = { i, j.src.data },`
			`.dst = { ARGS_TARGET, j.dst.data },`
			`};`
			`});`

			`std::sort (`
			`std::begin (found),`
			`std::end (found),`
emory: break comparison tool into distinct units 2019-04-26 11:10:07 +10:00			`emory::chunk::match<int>::src_ordering {}`
tools/compare: allow multiple sources 2019-04-26 09:08:03 +10:00			`);`

			`found.erase (`
tools/compare: record the source of the matched blocks 2019-04-26 09:25:41 +10:00			`std::unique (`
			`std::begin (found),`
			`std::end (found),`
emory: break comparison tool into distinct units 2019-04-26 11:10:07 +10:00			`emory::chunk::match<int>::dst_equality {}`
tools/compare: record the source of the matched blocks 2019-04-26 09:25:41 +10:00			`),`
tools/compare: allow multiple sources 2019-04-26 09:08:03 +10:00			`found.end ()`
			`);`
			`}`
emory-compare: add basic duplication estimator 2019-04-23 21:34:01 +10:00
tools/compare: allow multiple sources 2019-04-26 09:08:03 +10:00			`std::clog << "Finalising\n";`
emory-compare: print percentage matched 2019-04-24 07:37:10 +10:00			`std::size_t matching = 0;`
tools/compare: record the source of the matched blocks 2019-04-26 09:25:41 +10:00			`std::vector<std::size_t> source_bytes (argc - ARGS_SOURCE, 0);`
emory-compare: add basic duplication estimator 2019-04-23 21:34:01 +10:00			`for (auto const &i: found) {`
tools/compare: print full match data 2019-04-26 09:45:43 +10:00			`std::cout << i << '\n';`
tools/compare: record the source of the matched blocks 2019-04-26 09:25:41 +10:00			`auto const size = i.dst.data.offset.second - i.dst.data.offset.first;`
			`matching += size;`
			`source_bytes[i.src.id - ARGS_SOURCE] += size;`
emory-compare: add basic duplication estimator 2019-04-23 21:34:01 +10:00			`}`

emory-compare: print percentage matched 2019-04-24 07:37:10 +10:00			`std::size_t const total = std::accumulate (`
chunk/find: remove map in favour of bare queries 2020-12-06 10:34:33 +11:00			`target.begin (),`
			`target.end (),`
emory-compare: print percentage matched 2019-04-24 07:37:10 +10:00			`0u,`
			`[] (auto const &a, auto const &b) { return a + b.offset.second - b.offset.first; }`
			`);`

emory-compare: explicitly state byte counts for total and matched 2019-04-24 07:43:43 +10:00			`std::cout << "Found " << found.size () << " chunks. " << matching << "/" << total << " bytes for a factor of " << float (matching) / total << "\n";`
tools/compare: record the source of the matched blocks 2019-04-26 09:25:41 +10:00
			`for (int i = ARGS_SOURCE; i != argc; ++i)`
			`std::cerr << argv[i] << " contributed: " << source_bytes[i - ARGS_SOURCE] << '\n';`
emory-compare: add basic duplication estimator 2019-04-23 21:34:01 +10:00			`}`