emory-compare: add basic duplication estimator
This commit is contained in:
parent
51371bf3bd
commit
af35a7ffe0
@ -23,10 +23,7 @@ add_subdirectory(cruft/crypto)
|
||||
|
||||
|
||||
###############################################################################
|
||||
##-----------------------------------------------------------------------------
# Each tool is built from a single source file named after the target and is
# linked against the same cruft libraries, so the targets are declared in one
# loop. Declaring `emory` separately as well would define the target twice,
# which CMake rejects.
foreach (t emory emory-compare)
    add_executable ("${t}" "${t}.cpp")
    target_link_libraries("${t}" cruft-crypto cruft-util)
endforeach()
|
||||
|
176
emory-compare.cpp
Normal file
176
emory-compare.cpp
Normal file
@ -0,0 +1,176 @@
|
||||
/*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
*
|
||||
* Copyright 2013 Danny Robson <danny@nerdcruft.net>
|
||||
*/
|
||||
|
||||
#include <cruft/util/debug.hpp>
|
||||
#include <cruft/util/hash/buzhash.hpp>
|
||||
#include <cruft/util/io.hpp>
|
||||
#include <cruft/util/parse/value.hpp>
|
||||
|
||||
#include <cruft/crypto/hash/sha1.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
|
||||
|
||||
namespace emory {
|
||||
using static_hash = cruft::crypto::hash::SHA1;
|
||||
|
||||
// Tunable parameters that control how a buffer is cut into chunks.
//
// Every member has a zero default so that a default-constructed object never
// carries indeterminate values; the type remains an aggregate.
struct params {
    // Number of low-order bits of the rolling digest that must all be zero
    // for a position to count as a chunk boundary.
    std::size_t bits = 0;

    // Window size passed to the rolling hash, and the number of bytes
    // consumed from the front of the data before scanning starts.
    std::size_t window = 0;

    // Minimum distance, in bytes, between the start of the current chunk and
    // a position before that position may be accepted as a boundary.
    std::ptrdiff_t minimum = 0;
};
|
||||
|
||||
struct chunk {
|
||||
std::pair<off64_t, off64_t> offset;
|
||||
static_hash::digest_t digest;
|
||||
|
||||
struct digest_equality {
|
||||
bool operator() (chunk const &a, chunk const &b)
|
||||
{
|
||||
return a.digest == b.digest;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct digest_comparator {
|
||||
bool operator() (chunk const &a, chunk const &b)
|
||||
{
|
||||
return a.digest < b.digest;
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
std::ostream& operator<< (std::ostream &os, chunk const &val)
|
||||
{
|
||||
return os << "{ first: 0x" << std::setw (8) << std::setfill ('0') << std::hex << val.offset.first
|
||||
<< ", second: 0x" << std::setw (8) << std::setfill ('0') << std::hex << val.offset.second
|
||||
<< " }" << std::dec;
|
||||
}
|
||||
|
||||
|
||||
struct map {
|
||||
map (cruft::view<u08 const*> data, params const&);
|
||||
std::vector<chunk> elements;
|
||||
};
|
||||
|
||||
|
||||
// Returns copies of the chunks in `a` whose digest also appears in `b`.
// The definition CHECKs that both element lists are already ordered by digest.
std::vector<chunk> common (map const &a, map const &b);
|
||||
}
|
||||
|
||||
|
||||
// Cuts `data` into chunks and appends one `chunk` per region to `elements`.
//
// A buzhash object is created from `p.window` and `data`, then queried once
// per byte of the remainder left after consuming `p.window` bytes. A position
// is accepted as a boundary when at least `p.minimum` bytes separate it from
// the start of the current chunk and the low `p.bits` bits of the rolling
// digest are all zero. Each accepted region is recorded with its
// { begin, end } offsets into `data` and the static_hash digest of its bytes.
//
// NOTE(review): the byte at the boundary position itself belongs to no chunk
// (the region ends before it and the next chunk starts after it), and data
// following the last boundary is never emitted. This looks intentional for an
// estimator, but confirm.
emory::map::map (cruft::view<u08 const *> data, const emory::params &p)
{
    // An input shorter than the window has no positions to scan, and
    // consuming `p.window` bytes from it would step past the end of the
    // buffer; such inputs yield no chunks.
    auto const length = static_cast<std::size_t> (std::distance (data.begin (), data.end ()));
    if (length < p.window)
        return;

    using hash_type = cruft::hash::buzhash<u64>;
    hash_type h (p.window, data);
    auto remain = data.consume (p.window);

    using digest_type = hash_type::digest_type;
    constexpr std::size_t digest_bits = sizeof (digest_type) * 8;

    // Build a mask of the low `p.bits` bits. Shifting by the full width of the
    // type is undefined, so the two extremes are handled explicitly: zero bits
    // means every position qualifies, and anything at or beyond the digest
    // width selects the whole digest.
    digest_type mask;
    if (p.bits == 0)
        mask = digest_type (0);
    else if (p.bits >= digest_bits)
        mask = ~digest_type (0);
    else
        mask = ~digest_type (0) >> (digest_bits - p.bits);

    u08 const *start = data.begin ();
    for (u08 const *cursor = remain.begin (); cursor != remain.end (); ++cursor) {
        // The hash must be advanced for every byte, even those that cannot
        // yet be a boundary, so this call stays ahead of the early-outs.
        auto const digest = h (cursor);

        if (std::distance (start, cursor) < p.minimum)
            continue;

        // Any set bit under the mask means this is not a boundary. This is
        // the common case (all but roughly one in 2^bits positions), so no
        // "unlikely" hint is attached to it.
        if (digest & mask)
            continue;

        cruft::view<u08 const*> const region { start, cursor };
        start = cursor + 1;

        elements.push_back ({
            .offset = {
                std::pair<std::size_t,std::size_t> {
                    std::distance (data.begin (), region.begin ()),
                    std::distance (data.begin (), region.end ())
                },
            },
            .digest = static_hash {} (region),
        });
    }
}
|
||||
|
||||
|
||||
std::vector<emory::chunk>
|
||||
emory::common (emory::map const &a, emory::map const &b)
|
||||
{
|
||||
CHECK (std::is_sorted (a.elements.begin (), a.elements.end (), emory::chunk::digest_comparator {}));
|
||||
CHECK (std::is_sorted (b.elements.begin (), b.elements.end (), emory::chunk::digest_comparator {}));
|
||||
|
||||
std::vector<emory::chunk> res;
|
||||
|
||||
for (auto a_cursor = a.elements.begin (), b_cursor = b.elements.begin ();
|
||||
a_cursor != a.elements.end () && b_cursor != b.elements.end ();
|
||||
/* nothing */)
|
||||
{
|
||||
if (a_cursor->digest < b_cursor->digest) {
|
||||
++a_cursor;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (b_cursor->digest < a_cursor->digest) {
|
||||
++b_cursor;
|
||||
continue;
|
||||
}
|
||||
|
||||
res.push_back (*a_cursor);
|
||||
++a_cursor;
|
||||
++b_cursor;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
// Positions of the command line arguments within argv.
enum {
    ARG_SELF     = 0,   // program name
    ARG_BITS     = 1,   // <bits>
    ARG_WINDOW   = 2,   // <window>
    ARGS_MINIMUM = 3,   // <minimum>
    ARGS_TARGET  = 4,   // <target>
    ARGS_SOURCE  = 5,   // <source>

    // One past the last position: the exact argc the program expects.
    NUM_ARGS,
};
|
||||
|
||||
|
||||
int main (int argc, char const **argv)
|
||||
{
|
||||
if (argc != NUM_ARGS) {
|
||||
std::cerr << "usage: " << argv[ARG_SELF] << " <bits> <window> <minimum> <target> <source>\n";
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
emory::params const p {
|
||||
.window = cruft::parse::from_string<std::size_t> (argv[ARG_WINDOW]),
|
||||
.bits = cruft::parse::from_string<std::size_t> (argv[ARG_BITS ]),
|
||||
.minimum = cruft::parse::from_string<std::ptrdiff_t> (argv[ARGS_MINIMUM]),
|
||||
};
|
||||
|
||||
emory::map target (cruft::mapped_file (argv[ARGS_TARGET]), p);
|
||||
emory::map source (cruft::mapped_file (argv[ARGS_SOURCE]), p);
|
||||
|
||||
std::clog << "Hashing target\n";
|
||||
std::sort (target.elements.begin (), target.elements.end (), emory::chunk::digest_comparator {});
|
||||
std::clog << "Hashing source\n";
|
||||
std::sort (source.elements.begin (), source.elements.end (), emory::chunk::digest_comparator {});
|
||||
|
||||
std::clog << "Finding common\n";
|
||||
auto const &found = common (target, source);
|
||||
std::size_t total = 0;
|
||||
for (auto const &i: found) {
|
||||
std::cout << i << '\n';
|
||||
total += i.offset.second - i.offset.first;
|
||||
}
|
||||
|
||||
std::cout << "Found " << found.size () << " chunks of " << total << " bytes\n";
|
||||
}
|
Loading…
Reference in New Issue
Block a user