tools/compare: allow multiple sources

This commit is contained in:
Danny Robson 2019-04-26 09:08:03 +10:00
parent fd3f1bfc34
commit f063bdfd8e

View File

@ -39,7 +39,7 @@ namespace emory {
};
struct digest_comparator {
struct digest_ordering {
bool operator() (chunk const &a, chunk const &b)
{
return a.digest < b.digest;
@ -48,6 +48,30 @@ namespace emory {
};
/// Describes a pairing of two chunks, `src` and `dst`.
///
/// Each side carries the chunk itself (`data`) together with an identifier
/// (`id`) of type `IdT`.
template <typename IdT>
struct match {
    struct {
        IdT id;      ///< Identifier attached to this side of the pairing.
        chunk data;  ///< The chunk on this side of the pairing.
    } src, dst;

    /// Equality predicate that inspects only the `src` side of two matches.
    ///
    /// The call operator is `const` so the predicate can be invoked through
    /// a const object or const reference.
    ///
    /// NOTE(review): this relies on `operator==` being usable on the unnamed
    /// `src` aggregate. No such operator is visible in this view; confirm one
    /// exists before instantiating this predicate.
    struct src_equality {
        bool operator() (match const &a, match const &b) const
        {
            return a.src == b.src;
        }
    };

    /// Ordering predicate that inspects only the `src` side of two matches.
    ///
    /// The call operator is `const` so the predicate can be used as the
    /// comparator of an ordered container, which calls it through a const
    /// object.
    ///
    /// NOTE(review): this relies on `operator<` being usable on the unnamed
    /// `src` aggregate. No such operator is visible in this view; confirm one
    /// exists before instantiating this predicate.
    struct src_ordering {
        bool operator() (match const &a, match const &b) const
        {
            return a.src < b.src;
        }
    };
};
std::ostream& operator<< (std::ostream &os, chunk const &val)
{
return os << "{ first: 0x" << std::setw (8) << std::setfill ('0') << std::hex << val.offset.first
@ -62,7 +86,7 @@ namespace emory {
};
std::vector<chunk> common (map const &a, map const &b);
std::vector<match<map const*>> common (map const &a, map const &b);
}
@ -100,31 +124,36 @@ emory::map::map (cruft::view<u08 const *> data, const emory::params &p)
}
std::vector<emory::chunk>
emory::common (emory::map const &a, emory::map const &b)
std::vector<
emory::match<emory::map const*>
>
emory::common (emory::map const &src, emory::map const &dst)
{
CHECK (std::is_sorted (a.elements.begin (), a.elements.end (), emory::chunk::digest_comparator {}));
CHECK (std::is_sorted (b.elements.begin (), b.elements.end (), emory::chunk::digest_comparator {}));
CHECK (std::is_sorted (src.elements.begin (), src.elements.end (), emory::chunk::digest_ordering {}));
CHECK (std::is_sorted (dst.elements.begin (), dst.elements.end (), emory::chunk::digest_ordering {}));
std::vector<emory::chunk> res;
std::vector<emory::match<emory::map const*>> res;
for (auto a_cursor = a.elements.begin (), b_cursor = b.elements.begin ();
a_cursor != a.elements.end () && b_cursor != b.elements.end ();
for (auto src_cursor = src.elements.begin (), dst_cursor = dst.elements.begin ();
src_cursor != src.elements.end () && dst_cursor != dst.elements.end ();
/* nothing */)
{
if (a_cursor->digest < b_cursor->digest) {
++a_cursor;
if (src_cursor->digest < dst_cursor->digest) {
++src_cursor;
continue;
}
if (b_cursor->digest < a_cursor->digest) {
++b_cursor;
if (dst_cursor->digest < src_cursor->digest) {
++dst_cursor;
continue;
}
res.push_back (*a_cursor);
++a_cursor;
++b_cursor;
res.push_back ({
.src = { .id = &src, .data = *src_cursor },
.dst = { .id = &dst, .data = *dst_cursor },
});
++src_cursor;
++dst_cursor;
}
return res;
@ -145,8 +174,8 @@ enum {
int main (int argc, char const **argv)
{
if (argc != NUM_ARGS) {
std::cerr << "usage: " << argv[ARG_SELF] << " <bits> <window> <minimum> <target> <source>\n";
if (argc < NUM_ARGS) {
std::cerr << "usage: " << argv[ARG_SELF] << " <bits> <window> <minimum> <target> <source> [...]\n";
return EXIT_FAILURE;
}
@ -156,16 +185,37 @@ int main (int argc, char const **argv)
.minimum = cruft::parse::from_string<std::ptrdiff_t> (argv[ARGS_MINIMUM]),
};
emory::map target (cruft::mapped_file (argv[ARGS_TARGET]), p);
emory::map source (cruft::mapped_file (argv[ARGS_SOURCE]), p);
std::clog << "Hashing target\n";
std::sort (target.elements.begin (), target.elements.end (), emory::chunk::digest_comparator {});
std::clog << "Hashing source\n";
std::sort (source.elements.begin (), source.elements.end (), emory::chunk::digest_comparator {});
emory::map target (cruft::mapped_file (argv[ARGS_TARGET]), p);
std::sort (target.elements.begin (), target.elements.end (), emory::chunk::digest_ordering {});
std::clog << "Finding common\n";
auto const &found = common (target, source);
std::vector<emory::chunk> found;
for (int i = ARGS_SOURCE; i != argc; ++i) {
auto const path = argv[i];
std::clog << "Hashing source: " << path << '\n';
emory::map source (cruft::mapped_file (path), p);
std::sort (source.elements.begin (), source.elements.end (), emory::chunk::digest_ordering {});
std::clog << "Finding common\n";
auto const &source_matches = common (target, source);
std::clog << "Discovered " << source_matches.size () << " blocks\n";
std::transform (
std::begin (source_matches),
std::end (source_matches),
std::back_inserter (found),
[] (auto const &i) { return i.src.data; }
);
std::sort (std::begin (found), std::end (found), emory::chunk::digest_ordering {});
found.erase (
std::unique (std::begin (found), std::end (found), emory::chunk::digest_equality {}),
found.end ()
);
}
std::clog << "Finalising\n";
std::size_t matching = 0;
for (auto const &i: found) {
std::cout << i << '\n';