emory/tools/compare.cpp

/*
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Copyright 2013 Danny Robson <danny@nerdcruft.net>
 */

#include <cruft/util/debug.hpp>
#include <cruft/util/hash/buzhash.hpp>
#include <cruft/util/io.hpp>
#include <cruft/util/parse/value.hpp>

#include <cruft/crypto/hash/sha1.hpp>

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <utility>
#include <vector>


namespace emory {
    using static_hash = cruft::crypto::hash::SHA1;

    /// Tunables for the chunking performed by the `map` constructor.
    struct params {
        /// Number of low bits of the rolling hash that must all be zero for
        /// a position to be accepted as a chunk boundary.
        std::size_t bits;
        /// Window size handed to the rolling hash; the same number of
        /// leading bytes is consumed from the input before scanning starts.
        std::size_t window;
        /// Smallest allowed distance between a chunk's start and its
        /// boundary; candidate boundaries closer than this are skipped.
        std::ptrdiff_t minimum;
    };

    /// One slice of an input buffer together with the hash of its contents.
    struct chunk {
        /// Start and end of the slice, measured from the beginning of the
        /// buffer it was cut from.
        std::pair<off64_t, off64_t> offset;

        /// `static_hash` digest of the bytes in the slice.
        static_hash::digest_t digest;


        /// Binary predicate: true when both chunks carry the same digest.
        /// The offsets play no part in the comparison.
        struct digest_equality {
            bool operator() (chunk const &lhs, chunk const &rhs) const
            { return lhs.digest == rhs.digest; }
        };


        /// Comparator that orders chunks by digest alone, for use with
        /// std::sort / std::is_sorted. The offsets are ignored.
        struct digest_ordering {
            bool operator() (chunk const &lhs, chunk const &rhs) const
            { return lhs.digest < rhs.digest; }
        };
    };


    /// A pair of chunks, one per side of a comparison, that were found to
    /// correspond to each other.
    ///
    /// \tparam IdT  caller-chosen tag stored next to each chunk; this file
    ///              uses both a `map const*` and an argv index.
    template <typename IdT>
    struct match {
        /// A chunk plus the tag saying where it came from.
        struct side {
            IdT id;
            chunk data;
        } src, dst;


        /// True when the `src` chunks of both matches have the same digest.
        struct src_equality {
            bool operator() (match const &a, match const &b) const
            {
                // `side` has no operator==, so comparing the sides directly
                // cannot compile once instantiated. Compare by digest, the
                // same way dst_equality does.
                return chunk::digest_equality {} (
                    a.src.data,
                    b.src.data
                );
            }
        };


        /// True when the `dst` chunks of both matches have the same digest.
        struct dst_equality {
            bool operator() (match const &a, match const &b) const
            {
                return chunk::digest_equality {} (
                    a.dst.data,
                    b.dst.data
                );
            }
        };


        /// Orders matches by the digest of their `src` chunk.
        struct src_ordering {
            bool operator() (match const &a, match const &b) const
            {
                return chunk::digest_ordering {} (a.src.data, b.src.data);
            }
        };
    };


    /// Write the chunk's offsets as `{ first: 0x…, second: 0x… }`, each in
    /// hexadecimal and zero-padded to at least eight digits. The digest is
    /// not printed.
    ///
    /// The stream's format flags and fill character are restored before
    /// returning, so the call has no lasting effect on later output.
    std::ostream& operator<< (std::ostream &os, chunk const &val)
    {
        // hex and the fill character are sticky; remember what the caller
        // had so it can be put back afterwards.
        auto const saved_flags = os.flags ();
        auto const saved_fill  = os.fill ();

        os << "{ first: 0x"  << std::setw (8) << std::setfill ('0') << std::hex << val.offset.first
           << ", second: 0x" << std::setw (8) << val.offset.second
           << " }";

        os.fill  (saved_fill);
        os.flags (saved_flags);
        return os;
    }


    /// The chunks discovered in one input buffer.
    struct map {
        /// Scan `data` with the given parameters and fill `elements` with
        /// one entry per chunk found.
        map (cruft::view<u08 const*> data, params const&);
        /// Chunks in the order they were found. `common` expects them to
        /// have been re-sorted by digest first.
        std::vector<chunk> elements;
    };


    /// Pair up chunks of `a` and `b` that have equal digests. Chunks of `a`
    /// land in `match::src`, chunks of `b` in `match::dst`. Both element
    /// lists must already be sorted with chunk::digest_ordering.
    std::vector<match<map const*>> common (map const &a, map const &b);
}


/// Split `data` into chunks and append one `chunk` per slice to `elements`.
///
/// A rolling buzhash with a window of `p.window` is advanced one byte at a
/// time. A position is taken as a boundary when it lies at least `p.minimum`
/// bytes past the current chunk start and the low `p.bits` bits of the rolling
/// hash are all zero. Each chunk records its offsets relative to
/// `data.begin ()` and the `static_hash` digest of its bytes.
///
/// Bytes after the last boundary are not emitted as a chunk.
///
/// \param data  buffer to scan
/// \param p     chunking parameters; `p.bits` of 0 makes every eligible
///              position a boundary, and values at or above the hash width
///              require the whole hash to be zero.
emory::map::map (cruft::view<u08 const *> data, const emory::params &p)
{
    using hash_type   = cruft::hash::buzhash<u64>;
    using digest_type = hash_type::digest_type;

    hash_type h (p.window, data);
    auto remain = data.consume (p.window);

    // Shifting by the full width of the type, or by a wrapped-around count,
    // is undefined, so the two extreme settings are spelled out explicitly.
    constexpr std::size_t digest_bits = sizeof (digest_type) * 8;
    digest_type const mask =
        p.bits == 0           ?  digest_type (0) :
        p.bits >= digest_bits ? ~digest_type (0) :
        digest_type (~digest_type (0) >> (digest_bits - p.bits));

    for (u08 const *cursor = remain.begin (), *start = data.begin (); cursor != remain.end (); ++cursor) {
        // The hash must be advanced for every byte, even those that cannot
        // become a boundary, so this call stays ahead of the early-outs.
        auto const digest = h (cursor);

        if (std::distance (start, cursor) < p.minimum)
            continue;

        // NOTE(review): for typical `bits` most positions take this branch,
        // so the `unlikely` hint looks inverted — confirm.
        if (unlikely (digest & mask))
            continue;

        // NOTE(review): if the view is half-open, the byte at `cursor` ends
        // up in neither this chunk nor the next — confirm this is intended.
        cruft::view<u08 const*> const region { start, cursor };
        start = cursor + 1;

        elements.push_back ({
            .offset = {
                std::pair<std::size_t,std::size_t> {
                    std::distance (data.begin (), region.begin ()),
                    std::distance (data.begin (), region.end   ())
                },
            },
            .digest = static_hash {} (region),
        });
    }
}


/// Walk two digest-sorted chunk lists in lock-step and collect every pair of
/// chunks whose digests are equal.
///
/// Each chunk takes part in at most one match: after a hit both cursors move
/// on. Chunks from `src` are stored in `match::src` tagged with `&src`, and
/// chunks from `dst` in `match::dst` tagged with `&dst`.
///
/// Both `elements` lists must be sorted by chunk::digest_ordering; this is
/// verified with CHECK.
std::vector<
    emory::match<emory::map const*>
>
emory::common (emory::map const &src, emory::map const &dst)
{
    emory::chunk::digest_ordering const before {};

    CHECK (std::is_sorted (src.elements.begin (), src.elements.end (), before));
    CHECK (std::is_sorted (dst.elements.begin (), dst.elements.end (), before));

    std::vector<emory::match<emory::map const*>> pairs;

    auto lhs = src.elements.begin ();
    auto rhs = dst.elements.begin ();
    auto const lhs_end = src.elements.end ();
    auto const rhs_end = dst.elements.end ();

    while (lhs != lhs_end && rhs != rhs_end) {
        if (before (*lhs, *rhs)) {
            // src is behind; let it catch up
            ++lhs;
        } else if (before (*rhs, *lhs)) {
            // dst is behind; let it catch up
            ++rhs;
        } else {
            // neither digest precedes the other, so they are equal
            pairs.push_back ({
                .src = { .id = &src, .data = *lhs },
                .dst = { .id = &dst, .data = *rhs },
            });
            ++lhs;
            ++rhs;
        }
    }

    return pairs;
}


// Positions of the command-line arguments within argv.
enum {
    ARG_SELF     = 0,   // program name
    ARG_BITS     = 1,   // <bits>
    ARG_WINDOW   = 2,   // <window>
    ARGS_MINIMUM = 3,   // <minimum>
    ARGS_TARGET  = 4,   // <target>
    ARGS_SOURCE  = 5,   // first <source>; any further sources follow it

    // Smallest acceptable argc: everything above with one source.
    NUM_ARGS     = 6,
};


/// Report how many chunks of <target> also occur in the <source> files.
///
/// Usage: `<bits> <window> <minimum> <target> <source> [...]`
///
/// The target and each source are chunked with the same parameters and
/// compared by digest. Matches are merged across sources and de-duplicated by
/// digest. The matched chunk offsets and a summary go to stdout, progress
/// messages to std::clog, and the per-source byte contribution to stderr.
///
/// \return EXIT_FAILURE when too few arguments are given; otherwise falls off
///         the end of main (success).
int main (int argc, char const **argv)
{
    if (argc < NUM_ARGS) {
        std::cerr << "usage: " << argv[ARG_SELF] << " <bits> <window> <minimum> <target> <source> [...]\n";
        return EXIT_FAILURE;
    }

    emory::params const p {
        .bits   = cruft::parse::from_string<std::size_t> (argv[ARG_BITS  ]),
        .window = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]),
        .minimum = cruft::parse::from_string<std::ptrdiff_t> (argv[ARGS_MINIMUM]),
    };

    std::clog << "Hashing target\n";
    emory::map target (cruft::mapped_file (argv[ARGS_TARGET]), p);
    // `common` needs both sides ordered by digest.
    std::sort (target.elements.begin (), target.elements.end (), emory::chunk::digest_ordering {});

    std::vector<emory::match<int>> found;

    for (int i = ARGS_SOURCE; i != argc; ++i) {
        auto const path = argv[i];
        std::clog << "Hashing source: " << path << '\n';
        emory::map source (cruft::mapped_file (path), p);
        std::sort (source.elements.begin (), source.elements.end (), emory::chunk::digest_ordering {});

        std::clog << "Finding common\n";
        auto const &source_matches = common (target, source);
        std::clog << "Discovered " << source_matches.size () << " blocks\n";

        // Re-tag each match with argv indices instead of map pointers, since
        // `source` does not outlive this iteration.
        std::transform (
            std::begin (source_matches),
            std::end   (source_matches),
            std::back_inserter (found),
            [&] (auto const &j) -> emory::match<int>
        {
            return {
                .src = {           i, j.src.data },
                .dst = { ARGS_TARGET, j.dst.data },
            };
        });

        // Both sides of a match share a digest, so sorting on one side and
        // de-duplicating on the other removes repeats of the same chunk.
        std::sort (
            std::begin (found),
            std::end   (found),
            emory::match<int>::src_ordering {}
        );

        found.erase (
            std::unique (
                std::begin (found),
                std::end   (found),
                emory::match<int>::dst_equality {}
            ),
            found.end ()
        );
    }


    std::clog << "Finalising\n";
    std::size_t matching = 0;
    std::vector<std::size_t> source_bytes (argc - ARGS_SOURCE, 0);
    for (auto const &i: found) {
        std::cout << i.dst.data << '\n';
        auto const size = i.dst.data.offset.second - i.dst.data.offset.first;
        matching += size;
        source_bytes[i.src.id - ARGS_SOURCE] += size;
    }

    // The accumulator takes the type of the initial value; seeding it with
    // `0u` would truncate the running total to `unsigned int` on every step.
    std::size_t const total = std::accumulate (
        target.elements.begin (),
        target.elements.end (),
        std::size_t {0},
        [] (std::size_t const sum, auto const &c) {
            return sum + static_cast<std::size_t> (c.offset.second - c.offset.first);
        }
    );

    // A target that produced no chunks would otherwise divide by zero.
    double const factor = total != 0
        ? static_cast<double> (matching) / static_cast<double> (total)
        : 0.0;

    std::cout << "Found " << found.size () << " chunks. " << matching << "/" << total << " bytes for a factor of " << factor << "\n";

    for (int i = ARGS_SOURCE; i != argc; ++i)
        std::cerr << argv[i] << " contributed: " << source_bytes[i - ARGS_SOURCE] << '\n';
}