diff --git a/CMakeLists.txt b/CMakeLists.txt
index 969d358..fe378c9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,6 +14,18 @@ include_directories(.)
 add_library(emory
     acl.cpp
     acl.hpp
+    emory/chunk/fwd.cpp
+    emory/chunk/fwd.hpp
+    emory/chunk/map.cpp
+    emory/chunk/map.hpp
+    emory/chunk/match.cpp
+    emory/chunk/match.hpp
+    emory/chunk/ostream.cpp
+    emory/chunk/ostream.hpp
+    emory/chunk/params.cpp
+    emory/chunk/params.hpp
+    emory/chunk/region.cpp
+    emory/chunk/region.hpp
 )
 
 
diff --git a/emory/chunk/fwd.cpp b/emory/chunk/fwd.cpp
new file mode 100644
index 0000000..81ecd09
--- /dev/null
+++ b/emory/chunk/fwd.cpp
@@ -0,0 +1,9 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2019, Danny Robson
+ */
+
+#include "fwd.hpp"
diff --git a/emory/chunk/fwd.hpp b/emory/chunk/fwd.hpp
new file mode 100644
index 0000000..898ef15
--- /dev/null
+++ b/emory/chunk/fwd.hpp
@@ -0,0 +1,20 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2019, Danny Robson
+ */
+
+#pragma once
+
+// Forward declarations of the chunking types, for headers that only need to
+// name them (e.g. in reference parameters) without pulling in definitions.
+namespace emory::chunk {
+    struct params;
+    struct region;
+    struct map;
+
+    // IdT is the type of the identifier stored on each side of a match.
+    template <typename IdT> struct match;
+}
\ No newline at end of file
diff --git a/emory/chunk/map.cpp b/emory/chunk/map.cpp
new file mode 100644
index 0000000..261bddc
--- /dev/null
+++ b/emory/chunk/map.cpp
@@ -0,0 +1,50 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2019, Danny Robson
+ */
+
+#include "map.hpp"
+
+#include "params.hpp"
+
+// NOTE(review): the target of this #include is missing from this copy of the
+// patch; restore it from upstream.
+#include
+
+using emory::chunk::map;
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Split `data` into chunks and append one region per chunk to `elements`.
+//
+// Every position of `data.consume (p.window)` is fed to a buzhash that was
+// constructed from `p.window` and `data`. A position ends the current chunk
+// when it lies at least `p.minimum` elements past the chunk's start and the
+// low `p.bits` bits of the digest are all zero. Each region stores the
+// [begin, end) offsets of its chunk within `data` and the `static_hash` digest
+// of those bytes.
+//
+// Bytes after the last boundary are not recorded as a region.
+//
+// NOTE(review): the template arguments of `cruft::view` are missing from this
+// copy of the patch; restore them from upstream.
+map::map (cruft::view data, const emory::chunk::params &p)
+{
+    using hash_type = cruft::hash::buzhash;
+    hash_type h (p.window, data);
+    auto remain = data.consume (p.window);
+
+    using digest_type = hash_type::digest_type;
+
+    // Select the low `p.bits` bits of the digest. Shifting by the full width
+    // of the type (p.bits == 0), or by a count that wrapped around because
+    // p.bits is wider than the digest, is undefined behaviour. Handle both
+    // explicitly: zero bits selects nothing, too many bits selects everything.
+    constexpr std::size_t digest_width = sizeof (digest_type) * 8;
+    digest_type const all_ones = ~digest_type (0);
+    digest_type const mask =
+        p.bits == 0            ? digest_type (0) :
+        p.bits >= digest_width ? all_ones :
+        digest_type (all_ones >> (digest_width - p.bits));
+
+    for (u08 const *cursor = remain.begin (), *start = data.begin (); cursor != remain.end (); ++cursor) {
+        // Advance the hash at every position, including those rejected below.
+        auto const digest = h (cursor);
+
+        // Never cut a chunk shorter than the configured minimum.
+        if (std::distance (start, cursor) < p.minimum)
+            continue;
+
+        // Most positions are not boundaries, so the `unlikely` hint
+        // (presumably a branch-prediction annotation) belongs on the
+        // boundary test rather than on its negation.
+        bool const boundary = (digest & mask) == 0;
+        if (!unlikely (boundary))
+            continue;
+
+        // NOTE(review): the chunk ends before `cursor` and the next one
+        // starts after it, so the byte at `cursor` belongs to no region.
+        // Confirm that this is intended.
+        cruft::view const chunk { start, cursor };
+        start = cursor + 1;
+
+        elements.push_back ({
+            .offset = {
+                std::pair {
+                    std::distance (data.begin (), chunk.begin ()),
+                    std::distance (data.begin (), chunk.end ())
+                },
+            },
+            .digest = static_hash {} (chunk),
+        });
+    }
+}
diff --git a/emory/chunk/map.hpp b/emory/chunk/map.hpp
new file mode 100644
index 0000000..1c136ae
--- /dev/null
+++ b/emory/chunk/map.hpp
@@ -0,0 +1,34 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2019, Danny Robson
+ */
+
+#pragma once
+
+#include "fwd.hpp"
+
+#include "region.hpp"
+
+// NOTE(review): the targets of the three #include lines below, and the
+// template arguments of `cruft::view`, are missing from this copy of the
+// patch; restore them from upstream.
+#include
+#include
+
+#include
+
+namespace emory::chunk {
+    // The regions found in one buffer.
+    //
+    // The constructor splits `data` as directed by `params` and appends one
+    // region per chunk to `elements`, in order of increasing offset.
+    // `common` requires `elements` to be sorted by region::digest_ordering,
+    // which callers must do themselves after construction.
+    struct map {
+        map (cruft::view data, params const&);
+        std::vector<region> elements;
+    };
+}
diff --git a/emory/chunk/match.cpp b/emory/chunk/match.cpp
new file mode 100644
index 0000000..0f8a3ff
--- /dev/null
+++ b/emory/chunk/match.cpp
@@ -0,0 +1,46 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. 
If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2019, Danny Robson
+ */
+
+#include "match.hpp"
+
+#include "map.hpp"
+
+#include <algorithm>
+#include <vector>
+
+///////////////////////////////////////////////////////////////////////////////
+// Return one match for every pair of regions, one taken from `src` and one
+// from `dst`, that carry the same digest.
+//
+// Both element lists must already be sorted by region::digest_ordering; this
+// is verified with CHECK (presumably a debug assertion).
+//
+// Each side of a match stores the address of the map it came from as its id,
+// so both maps must outlive the returned vector.
+//
+// When a digest occurs several times in one map the occurrences are paired
+// off one-to-one in order; surplus occurrences are not reported.
+std::vector<
+    emory::chunk::match<emory::chunk::map const*>
+>
+emory::chunk::common (map const &src, map const &dst)
+{
+    CHECK (std::is_sorted (src.elements.begin (), src.elements.end (), region::digest_ordering {}));
+    CHECK (std::is_sorted (dst.elements.begin (), dst.elements.end (), region::digest_ordering {}));
+
+    std::vector<match<map const*>> res;
+
+    auto src_cursor = src.elements.begin ();
+    auto dst_cursor = dst.elements.begin ();
+
+    // Walk both sorted lists in step, always advancing the side that holds
+    // the smaller digest.
+    while (src_cursor != src.elements.end () && dst_cursor != dst.elements.end ()) {
+        if (src_cursor->digest < dst_cursor->digest) {
+            ++src_cursor;
+            continue;
+        }
+
+        if (dst_cursor->digest < src_cursor->digest) {
+            ++dst_cursor;
+            continue;
+        }
+
+        // Neither digest is smaller, so the two regions match.
+        res.push_back ({
+            .src = { .id = &src, .data = *src_cursor },
+            .dst = { .id = &dst, .data = *dst_cursor },
+        });
+        ++src_cursor;
+        ++dst_cursor;
+    }
+
+    return res;
+}
diff --git a/emory/chunk/match.hpp b/emory/chunk/match.hpp
new file mode 100644
index 0000000..1af88d9
--- /dev/null
+++ b/emory/chunk/match.hpp
@@ -0,0 +1,56 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2019, Danny Robson
+ */
+
+#pragma once
+
+#include "fwd.hpp"
+#include "region.hpp"
+
+#include <vector>
+
+
+namespace emory::chunk {
+    // A pair of regions, `src` and `dst`, each tagged with an identifier of
+    // type IdT.
+    //
+    // The nested function objects compare matches by the digest of one side
+    // only; the ids and the offsets take no part in any comparison.
+    template <typename IdT>
+    struct match {
+        struct side {
+            IdT id;
+            region data;
+        };
+
+        side src, dst;
+
+
+        // True when the `src` regions of both matches carry the same digest.
+        //
+        // `side` has no operator==, so the sides cannot be compared
+        // directly; compare digests, as dst_equality does.
+        struct src_equality {
+            bool operator() (match const &a, match const &b) const
+            {
+                return region::digest_equality {} (
+                    a.src.data,
+                    b.src.data
+                );
+            }
+        };
+
+
+        // True when the `dst` regions of both matches carry the same digest.
+        struct dst_equality {
+            bool operator() (match const &a, match const &b) const
+            {
+                return region::digest_equality {} (
+                    a.dst.data,
+                    b.dst.data
+                );
+            }
+        };
+
+
+        // Orders matches by the digest of their `src` region.
+        struct src_ordering {
+            bool operator() (match const &a, match const &b) const
+            {
+                return region::digest_ordering {} (a.src.data, b.src.data);
+            }
+        };
+    };
+
+
+    // Find the regions whose digests occur in both maps. The id of each side
+    // of a returned match is the address of the map that side came from.
+    std::vector<match<map const*>>
+    common (map const &a, map const &b);
+}
diff --git a/emory/chunk/ostream.cpp b/emory/chunk/ostream.cpp
new file mode 100644
index 0000000..1856fb4
--- /dev/null
+++ b/emory/chunk/ostream.cpp
@@ -0,0 +1,39 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2019, Danny Robson
+ */
+
+#include "ostream.hpp"
+
+#include "region.hpp"
+#include "match.hpp"
+
+// NOTE(review): the targets of the two #include lines below are missing from
+// this copy of the patch; restore them from upstream.
+#include
+#include
+
+///////////////////////////////////////////////////////////////////////////////
+// Write both offsets of `val` in hexadecimal, zero-padded to a width of at
+// least eight characters.
+//
+// The format flags and fill character of `os` are left as the caller had
+// them.
+std::ostream&
+emory::chunk::operator<< (std::ostream &os, region const &val)
+{
+    // std::hex and std::setfill stay in effect on the stream after use, so
+    // remember the caller's settings and put them back when done.
+    auto const saved_flags = os.flags ();
+    auto const saved_fill  = os.fill ();
+
+    os << std::hex << std::setfill ('0')
+       << "{ first: 0x"  << std::setw (8) << val.offset.first
+       << ", second: 0x" << std::setw (8) << val.offset.second
+       << " }";
+
+    os.fill (saved_fill);
+    os.flags (saved_flags);
+    return os;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Write the id and the region of both sides of `val`.
+template <typename IdT>
+std::ostream&
+emory::chunk::operator<< (std::ostream &os, match<IdT> const &val)
+{
+    return os << "{ src: { id: " << val.src.id << ", data: " << val.src.data << " }"
+              << ", dst: { id: " << val.dst.id << ", data: " << val.dst.data << " }"
+              << " }";
+}
+
+
+//-----------------------------------------------------------------------------
+// NOTE(review): the template argument of `match` is missing from this copy of
+// the patch. tools/compare.cpp builds matches whose id is the int argv index,
+// so this is presumably `match<int>` -- confirm against upstream.
+template std::ostream& emory::chunk::operator<< (std::ostream&, match const&);
diff --git a/emory/chunk/ostream.hpp b/emory/chunk/ostream.hpp
new file mode 100644
index 0000000..93f5316
--- /dev/null
+++ b/emory/chunk/ostream.hpp
@@ -0,0 +1,28 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2019, Danny Robson
+ */
+
+#pragma once
+
+#include "fwd.hpp"
+
+// NOTE(review): the target of this #include is missing from this copy of the
+// patch; restore it from upstream.
+#include
+
+
+namespace emory::chunk {
+    // Write the offsets of a region as zero-padded hexadecimal numbers.
+    std::ostream& operator<< (std::ostream &, region const &);
+
+
+    // Write the id and region of both sides of a match. Defined in
+    // ostream.cpp, so only the instantiations listed there can be used.
+    template <typename IdT>
+    std::ostream&
+    operator<< (std::ostream &, match<IdT> const &);
+}
diff --git a/emory/chunk/params.cpp b/emory/chunk/params.cpp
new file mode 100644
index 0000000..bcace08
--- /dev/null
+++ b/emory/chunk/params.cpp
@@ -0,0 +1,9 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. 
If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2019, Danny Robson
+ */
+
+#include "params.hpp"
\ No newline at end of file
diff --git a/emory/chunk/params.hpp b/emory/chunk/params.hpp
new file mode 100644
index 0000000..2a970ca
--- /dev/null
+++ b/emory/chunk/params.hpp
@@ -0,0 +1,30 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2019, Danny Robson
+ */
+
+#pragma once
+
+// NOTE(review): the target of this #include is missing from this copy of the
+// patch; restore it from upstream.
+#include
+
+namespace emory::chunk {
+    // Settings that control how `map` splits a buffer into regions.
+    struct params {
+        // Number of low bits of the buzhash digest that must be zero at a
+        // chunk boundary.
+        std::size_t bits;
+
+        // Passed to the buzhash constructor and to `data.consume` before
+        // scanning starts.
+        std::size_t window;
+
+        // Smallest distance, in `u08` elements, from the start of a chunk to
+        // the position where it may be cut.
+        std::ptrdiff_t minimum;
+    };
+}
diff --git a/emory/chunk/region.cpp b/emory/chunk/region.cpp
new file mode 100644
index 0000000..e701cff
--- /dev/null
+++ b/emory/chunk/region.cpp
@@ -0,0 +1,9 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2019, Danny Robson
+ */
+
+#include "region.hpp"
diff --git a/emory/chunk/region.hpp b/emory/chunk/region.hpp
new file mode 100644
index 0000000..956c0df
--- /dev/null
+++ b/emory/chunk/region.hpp
@@ -0,0 +1,38 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2019, Danny Robson
+ */
+
+#pragma once
+
+// NOTE(review): the targets of the two #include lines below, and the template
+// arguments of `std::pair`, are missing from this copy of the patch; restore
+// them from upstream.
+#include
+
+#include
+
+namespace emory::chunk {
+    // Hash used to fingerprint the contents of a region.
+    using static_hash = cruft::crypto::hash::SHA1;
+
+
+    // A chunk of a buffer, described by where it lies and what it contains.
+    struct region {
+        // Offsets, from the start of the buffer, of the first element of the
+        // chunk and of the element one past its end.
+        std::pair offset;
+
+        // `static_hash` digest of the contents of the chunk.
+        static_hash::digest_t digest;
+
+        // True when both regions carry the same digest. Offsets are ignored.
+        struct digest_equality {
+            bool operator() (region const &lhs, region const &rhs) const
+            {
+                return lhs.digest == rhs.digest;
+            }
+        };
+
+
+        // Orders regions by `operator<` on their digests. Offsets are
+        // ignored.
+        struct digest_ordering {
+            bool operator() (region const &lhs, region const &rhs) const
+            {
+                return lhs.digest < rhs.digest;
+            }
+        };
+    };
+}
diff --git a/tools/compare.cpp b/tools/compare.cpp
index 8001f41..b8de046 100644
--- a/tools/compare.cpp
+++ b/tools/compare.cpp
@@ -6,182 +6,16 @@
  * Copyright 2013 Danny Robson
  */
 
-#include
-#include
+#include "emory/chunk/params.hpp"
+#include "emory/chunk/map.hpp"
+#include "emory/chunk/match.hpp"
+#include "emory/chunk/ostream.hpp"
+
 #include
 #include
-#include
-
-#include
+#include
 #include
-#include
-
-
-namespace emory {
-    using static_hash = cruft::crypto::hash::SHA1;
-
-    struct params {
-        std::size_t bits;
-        std::size_t window;
-        std::ptrdiff_t minimum;
-    };
-
-    struct chunk {
-        std::pair offset;
-        static_hash::digest_t digest;
-
-        struct digest_equality {
-            bool operator() (chunk const &a, chunk const &b) const
-            {
-                return a.digest == b.digest;
-            }
-        };
-
-
-        struct digest_ordering {
-            bool operator() (chunk const &a, chunk const &b) const
-            {
-                return a.digest < b.digest;
-            }
-        };
-    };
-
-
-    template
-    struct match {
-        struct side {
-            IdT id;
-            chunk data;
-        };
-
-        side src, dst;
-
-
-        struct src_equality {
-            bool operator() (match const &a, match const &b) const
-            {
-                return a.src == b.src;
-            }
-        };
-
-
-        struct dst_equality {
-            bool operator() (match const &a, match const &b) const
-            {
-                return chunk::digest_equality {} (
-                    a.dst.data,
-                    b.dst.data
-                );
-            }
-        };
-
-
-        struct src_ordering {
-            bool operator() (match const &a, match const &b) const
-            {
-                return chunk::digest_ordering {} (a.src.data, b.src.data);
-            }
-        };
-    };
-
-
-    std::ostream& operator<< 
(std::ostream &os, chunk const &val) - { - return os << "{ first: 0x" << std::setw (8) << std::setfill ('0') << std::hex << val.offset.first - << ", second: 0x" << std::setw (8) << std::setfill ('0') << std::hex << val.offset.second - << " }" << std::dec; - } - - - struct map { - map (cruft::view data, params const&); - std::vector elements; - }; - - - template - std::ostream& - operator<< (std::ostream &os, match const &val) - { - return os << "{ src: { id: " << val.src.id << ", data: " << val.src.data << " }" - << ", dst: { id: " << val.dst.id << ", data: " << val.dst.data << " }" - << " }"; - } - - - std::vector> common (map const &a, map const &b); -} - - -emory::map::map (cruft::view data, const emory::params &p) -{ - using hash_type = cruft::hash::buzhash; - hash_type h (p.window, data); - auto remain = data.consume (p.window); - - using digest_type = hash_type::digest_type ; - digest_type const mask = ~digest_type (0) >> (sizeof (digest_type) * 8 - p.bits); - - for (u08 const *cursor = remain.begin (), *start = data.begin (); cursor != remain.end (); cursor++) { - auto const digest = h (cursor); - - if (std::distance (start, cursor) < p.minimum) - continue; - - if (unlikely (digest & mask)) - continue; - - cruft::view const region { start, cursor }; - start = cursor + 1; - - elements.push_back ({ - .offset = { - std::pair { - std::distance (data.begin (), region.begin ()), - std::distance (data.begin (), region.end ()) - }, - }, - .digest = static_hash {} (region), - }); - } -} - - -std::vector< - emory::match -> -emory::common (emory::map const &src, emory::map const &dst) -{ - CHECK (std::is_sorted (src.elements.begin (), src.elements.end (), emory::chunk::digest_ordering {})); - CHECK (std::is_sorted (dst.elements.begin (), dst.elements.end (), emory::chunk::digest_ordering {})); - - std::vector> res; - - for (auto src_cursor = src.elements.begin (), dst_cursor = dst.elements.begin (); - src_cursor != src.elements.end () && dst_cursor != dst.elements.end 
(); - /* nothing */) - { - if (src_cursor->digest < dst_cursor->digest) { - ++src_cursor; - continue; - } - - if (dst_cursor->digest < src_cursor->digest) { - ++dst_cursor; - continue; - } - - res.push_back ({ - .src = { .id = &src, .data = *src_cursor }, - .dst = { .id = &dst, .data = *dst_cursor }, - }); - ++src_cursor; - ++dst_cursor; - } - - return res; -} enum { @@ -203,24 +37,24 @@ int main (int argc, char const **argv) return EXIT_FAILURE; } - emory::params const p { + emory::chunk::params const p { .bits = cruft::parse::from_string (argv[ARG_BITS ]), .window = cruft::parse::from_string (argv[ARG_WINDOW]), .minimum = cruft::parse::from_string (argv[ARGS_MINIMUM]), }; std::clog << "Hashing target\n"; - emory::map target (cruft::mapped_file (argv[ARGS_TARGET]), p); - std::sort (target.elements.begin (), target.elements.end (), emory::chunk::digest_ordering {}); + emory::chunk::map target (cruft::mapped_file (argv[ARGS_TARGET]), p); + std::sort (target.elements.begin (), target.elements.end (), emory::chunk::region::digest_ordering {}); std::cout << "Found " << target.elements.size () << " chunks\n"; - std::vector> found; + std::vector> found; for (int i = ARGS_SOURCE; i != argc; ++i) { auto const path = argv[i]; std::clog << "Hashing source: " << path << '\n'; - emory::map source (cruft::mapped_file (path), p); - std::sort (source.elements.begin (), source.elements.end (), emory::chunk::digest_ordering {}); + emory::chunk::map source (cruft::mapped_file (path), p); + std::sort (source.elements.begin (), source.elements.end (), emory::chunk::region::digest_ordering {}); std::clog << "Finding common\n"; auto const &source_matches = common (target, source); @@ -230,7 +64,7 @@ int main (int argc, char const **argv) std::begin (source_matches), std::end (source_matches), std::back_inserter (found), - [&] (auto const &j) -> emory::match + [&] (auto const &j) -> emory::chunk::match { return { .src = { i, j.src.data }, @@ -241,14 +75,14 @@ int main (int argc, char 
const **argv) std::sort ( std::begin (found), std::end (found), - emory::match::src_ordering {} + emory::chunk::match::src_ordering {} ); found.erase ( std::unique ( std::begin (found), std::end (found), - emory::match::dst_equality {} + emory::chunk::match::dst_equality {} ), found.end () );