From 209a6800c45f84bba36e0215b85045c383ddc16b Mon Sep 17 00:00:00 2001
From: Danny Robson
Date: Mon, 22 Apr 2019 13:59:48 +1000
Subject: [PATCH] hash/buzhash: add a trivial buzhash implementation

---
Reviewer notes (below the `---`, so not part of the commit message):

* The copy of this patch under review was collapsed onto three lines and
  had every `<...>` span stripped. Line structure has been restored.
* `template <typename ...Args>` and `std::forward<Args>` are forced by the
  surrounding code. `table<StateT>`, `cruft::view<WordT const*>`,
  `buzhash<u32>` (in the test) and the system include in buzhash.hpp are
  best-guess reconstructions; confirm them against the repository.
* The author's e-mail address in `From:` was stripped too and has not been
  guessed; restore it before using `git am`.
* Leading whitespace of the CMakeLists.txt context lines was lost and is
  assumed here; `git apply --ignore-whitespace` may be needed.
* The `index` blob ids are those of the original patch and no longer match
  the new file contents.
* Changes relative to the original submission:
  - drop the `_width < bits (WordT)` check: every rotation acts on a
    StateT, so the word size does not bound the window width (with the
    default u08 it capped windows at 7 words);
  - hash words into a StateT before rotating them;
  - mark move assignment noexcept, matching the move constructor;
  - include the standard headers that are used directly;
  - test a window wider than eight words.

 CMakeLists.txt        |   2 +
 hash/buzhash.hpp      | 134 ++++++++++++++++++++++++++++++++++++++++++
 test/hash/buzhash.cpp |  64 ++++++++++++++++++++
 3 files changed, 200 insertions(+)
 create mode 100644 hash/buzhash.hpp
 create mode 100644 test/hash/buzhash.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 617e98a0..2f3903fa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -328,6 +328,7 @@ list (
     hash/fwd.hpp
     hash/adler.cpp
     hash/adler.hpp
+    hash/buzhash.hpp
     hash/bsdsum.cpp
     hash/bsdsum.hpp
     hash/crc.cpp
@@ -595,6 +596,7 @@ if (TESTS)
         geom/ray
         geom/segment
         geom/sphere
+        hash/buzhash
         hash/checksum
         hash/crc
         hash/fasthash
diff --git a/hash/buzhash.hpp b/hash/buzhash.hpp
new file mode 100644
index 00000000..4093f4ee
--- /dev/null
+++ b/hash/buzhash.hpp
@@ -0,0 +1,134 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2019 Danny Robson
+ */
+
+#pragma once
+
+#include "table.hpp"
+#include "../bitwise.hpp"
+#include "../std.hpp"
+#include "../view.hpp"
+
+#include <cstddef>
+#include <stdexcept>
+#include <utility>
+
+
+///////////////////////////////////////////////////////////////////////////////
+namespace cruft::hash {
+    /// Implements a rolling hash by using a cyclic polynomial, aka buzhash.
+    /// see: https://en.wikipedia.org/wiki/Rolling_hash#Cyclic_polynomial
+    ///
+    /// \tparam StateT  The hash state size, and the result type.
+    /// \tparam WordT   The data type for each term of the polynomial
+    /// \tparam HashT   The object used to hash the words prior to mixing.
+    ///     This needs to be very fast for most clients of the algorithm;
+    ///     although functionally it could be any functor that maps WordT
+    ///     onto StateT.
+    template <
+        typename StateT,
+        typename WordT = u08,
+        typename HashT = table<StateT>
+    >
+    class buzhash {
+    public:
+        static_assert (sizeof (StateT) >= sizeof (WordT));
+
+        /// Build the hash of the first `_width` words of `_init`.
+        ///
+        /// \param _width  The number of words covered by the rolling window.
+        ///     Must be non-zero and less than the bit width of StateT.
+        /// \param _init   The data used to prime the window. Only the first
+        ///     `_width` words are consumed.
+        /// \param args    Forwarded to the constructor of HashT.
+        ///
+        /// \throws std::out_of_range if `_init` holds fewer than `_width`
+        ///     words.
+        template <typename ...Args>
+        buzhash (
+            std::size_t _width,
+            cruft::view<WordT const*> _init,
+            Args &&...args
+        )
+            : m_width (_width)
+            , m_hash (std::forward<Args> (args)...)
+        {
+            // Zero width would make for a constant zero hash.
+            CHECK_NEZ (_width);
+            // Rotating by the bit width of the rotated type (or more) is
+            // often undefined. Every rotation below acts on a StateT, so
+            // that is the only type which bounds the width; the size of
+            // WordT has no bearing on how many words the window can span.
+            CHECK_LT (_width, sizeof (StateT) * 8);
+
+            if (_init.size () < m_width)
+                throw std::out_of_range ("buzhash input too small");
+
+            // Prime the initial window. The oldest word receives the
+            // largest rotation, exactly as if it had been rolled in first.
+            auto cursor = _init.begin ();
+            for (std::size_t i = 1; i <= m_width; ++i, ++cursor) {
+                StateT const term = m_hash (*cursor);
+                m_state ^= rotatel (term, m_width - i);
+            }
+        }
+
+        buzhash (buzhash const&) = default;
+        buzhash (buzhash &&) noexcept = default;
+        buzhash& operator= (buzhash const&) = default;
+        buzhash& operator= (buzhash &&) noexcept = default;
+
+        /// Rotate the hash over a pointer to the buffer we've been operating
+        /// on.
+        ///
+        /// The previous `width` words _must_ be dereferencable and identical
+        /// to the previously observed values.
+        ///
+        /// \return The updated hash state.
+        StateT operator() (WordT const *cursor)
+        {
+            return (*this) (cursor[0], *(cursor - m_width));
+        }
+
+
+        /// Rotate a word into the hash, and the corresponding word out of the
+        /// hash.
+        ///
+        /// The `removal` value _must_ be the same as the value seen `width`
+        /// values previously.
+        ///
+        /// \return The updated hash state.
+        StateT operator() (WordT const addition, WordT const removal)
+        {
+            // Hash each word into a StateT before rotating, so that the
+            // rotations always wrap at the same bit width as the state.
+            StateT const incoming = m_hash (addition);
+            StateT const outgoing = m_hash (removal);
+
+            // Shift the polynomial
+            m_state = rotatel (m_state, 1)
+                // Remove the data that's about to leave our window.
+                ^ rotatel (outgoing, m_width)
+                // Mix in the new data
+                ^ incoming;
+
+            return m_state;
+        }
+
+
+        /// An observer for the hash state/value.
+        ///
+        /// Provided for symmetry with other hash objects.
+        constexpr StateT digest (void) const noexcept { return m_state; }
+        operator StateT () const { return digest (); }
+
+    private:
+        std::size_t m_width;
+        HashT m_hash;
+        StateT m_state = 0;
+    };
+}
diff --git a/test/hash/buzhash.cpp b/test/hash/buzhash.cpp
new file mode 100644
index 00000000..2bd85c84
--- /dev/null
+++ b/test/hash/buzhash.cpp
@@ -0,0 +1,64 @@
+#include "tap.hpp"
+#include "hash/buzhash.hpp"
+#include "std.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+#include <iterator>
+
+
+int main ()
+{
+    cruft::TAP::logger tap;
+
+    // Use buzhash to find the string 'needle' inside a larger string.
+    // The arrays include a terminating NUL which is not part of the word.
+    static constexpr u08 needle[] = "needle";
+    static constexpr std::size_t WIDTH = std::size (needle) - 1;
+
+    // Compute the hash of the needle
+    auto const key = cruft::hash::buzhash<u32> (WIDTH, needle).digest ();
+
+    // Find the point at which the hash object equals the key's digest.
+    static constexpr u08 haystack[] = "there is a needle here somewhere";
+    cruft::hash::buzhash<u32> h (WIDTH, haystack);
+    auto const pos = std::find_if (
+        std::begin (haystack) + WIDTH,
+        std::end (haystack),
+        [&h, key] (auto const &i) { return h (&i) == key; }
+    );
+
+    tap.expect_eq (
+        key,
+        h.digest (),
+        "needle/haystack digests match"
+    );
+
+    // The match is reported at the last word of the needle.
+    tap.expect (
+        pos == haystack + std::strlen ("there is a needle") - 1,
+        "buzhash finds the haystack"
+    );
+
+    // The window may span more words than WordT has bits; only the bit
+    // width of the state limits it.
+    static constexpr u08 wide[] = "somewhere";
+    static constexpr std::size_t WIDE = std::size (wide) - 1;
+    static_assert (WIDE > sizeof (u08) * 8);
+
+    auto const wide_key = cruft::hash::buzhash<u32> (WIDE, wide).digest ();
+    cruft::hash::buzhash<u32> wide_h (WIDE, haystack);
+    auto const wide_pos = std::find_if (
+        std::begin (haystack) + WIDE,
+        std::end (haystack),
+        [&wide_h, wide_key] (auto const &i) { return wide_h (&i) == wide_key; }
+    );
+
+    tap.expect (
+        wide_pos == haystack + std::strlen ("there is a needle here somewhere") - 1,
+        "buzhash supports windows wider than the word size"
+    );
+
+    return tap.status ();
+}