From bc4a0c3179993017adee430ffbade3540e4ccb62 Mon Sep 17 00:00:00 2001 From: Danny Robson Date: Thu, 18 Jan 2018 16:29:06 +1100 Subject: [PATCH] xxhash: rewrite for safety and accurate tests there were a few potential buffer overflows, inaccurate test data, and the 64 bit path wasn't correct. fixes buffer overflow from clang-analyze --- hash/xxhash.cpp | 264 ++++++++++++++++++++----------------------- hash/xxhash.hpp | 14 ++- test/hash/xxhash.cpp | 18 +-- 3 files changed, 138 insertions(+), 158 deletions(-) diff --git a/hash/xxhash.cpp b/hash/xxhash.cpp index 6a45a264..0ffb8642 100644 --- a/hash/xxhash.cpp +++ b/hash/xxhash.cpp @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. * - * Copyright 2016 Danny Robson + * Copyright 2016-2018 Danny Robson */ #include "xxhash.hpp" @@ -39,7 +39,8 @@ read_le (const void *ptr) template struct constants { static const T prime[5]; - static const T round_rotl; + static const T round_rotate; + static const T final_rotate[3]; }; @@ -55,182 +56,159 @@ constants::prime[5] = { }; +template <> +const uint32_t +constants::final_rotate[3] = { + 15, 13, 16 +}; + + //----------------------------------------------------------------------------- template <> const uint32_t -constants::round_rotl = 13; +constants::round_rotate = 13; //----------------------------------------------------------------------------- template <> const uint64_t constants::prime[5] = { - 11400714785074694791u, - 14029467366897019727u, - 1609587929392839161u, - 9650029242287828579u, - 2870177450012600261u, + 11400714785074694791ull, + 14029467366897019727ull, + 1609587929392839161ull, + 9650029242287828579ull, + 2870177450012600261ull, +}; + + +template <> +const uint64_t +constants::final_rotate[3] = { + 33, 29, 32 }; //----------------------------------------------------------------------------- template <> const uint64_t -constants::round_rotl = 31; +constants::round_rotate = 31; /////////////////////////////////////////////////////////////////////////////// template static T -round (T seed, T input) +round (T state, T input) { - seed += input * constants::prime[1]; - seed = util::rotatel (seed, constants::round_rotl); - seed *= constants::prime[0]; + state += input * constants::prime[1]; + state = util::rotatel (state, constants::round_rotate); + state *= constants::prime[0]; - return seed; + return state; } /////////////////////////////////////////////////////////////////////////////// -template -xxhash::xxhash (uint32_t _seed): +template +xxhash::xxhash (WordT _seed): m_seed (_seed) { ; } /////////////////////////////////////////////////////////////////////////////// -template -typename xxhash::digest_t -xxhash::operator() (const util::view data) +template +typename xxhash::digest_t +xxhash::operator() (const util::view data) { - struct { - uint32_t total_len_32; - uint32_t large_len; + word_t state[4] { + m_seed + constants::prime[0] + constants::prime[1], + m_seed + constants::prime[1], + m_seed, + m_seed - constants::prime[0], + }; - T v1, v2, v3, v4; - uint32_t mem32[4]; - uint32_t memsize; - uint32_t reserved; - - //uint64_t length; - //T v[4]; - //T mem[4]; - //unsigned memsize; - } m_state; - - /* RESET */ - memset (&m_state, 0, sizeof (m_state)); - - m_state.v1 = m_seed + constants::prime[0] + constants::prime[1]; - m_state.v2 = m_seed + constants::prime[1]; - m_state.v3 = m_seed; - m_state.v4 = m_seed - constants::prime[0]; - - /* UPDATE */ - do { - auto first = data.begin (); - auto last = data.end (); - if (first == last) - break; - - CHECK (first); - CHECK (last); - CHECK_LE (first, last); - - //auto endian = XXH_littleEndian; - size_t len = last - first; - auto input = (const void*)first; - - auto p = reinterpret_cast (input); - auto const bEnd = p + len; - - constexpr auto CHUNK = 4 * sizeof (T); - - m_state.total_len_32 += (unsigned)len; - m_state.large_len |= (len >= CHUNK) | (m_state.total_len_32 >= CHUNK); - - if (m_state.memsize + len < CHUNK) { /* fill in tmp buffer */ - memcpy ((uint8_t*)(m_state.mem32) + m_state.memsize, input, len); - m_state.memsize += (unsigned)len; - break; + // consume block sized chunks while they're available. + // process each state word independently per block. + auto cursor = std::cbegin (data); + const auto last = std::cend (data); + while (last - cursor > block_bytes) { + for (int i = 0; i < 4; ++i) { + state[i] = round (state[i], read_le (cursor)); + cursor += sizeof (word_t); } - - if (m_state.memsize) { /* some data left from previous update */ - memcpy ((uint8_t*)(m_state.mem32) + m_state.memsize, input, CHUNK - m_state.memsize); - { const uint32_t* p32 = m_state.mem32; - m_state.v1 = round (m_state.v1, ltoh (*p32)); p32++; - m_state.v2 = round (m_state.v2, ltoh (*p32)); p32++; - m_state.v3 = round (m_state.v3, ltoh (*p32)); p32++; - m_state.v4 = round (m_state.v4, ltoh (*p32)); p32++; - } - p += CHUNK - m_state.memsize; - m_state.memsize = 0; - } - - if (p <= bEnd - CHUNK * sizeof (T)) { - const uint8_t* const limit = bEnd - 4 * sizeof (T); - T v1 = m_state.v1; - T v2 = m_state.v2; - T v3 = m_state.v3; - T v4 = m_state.v4; - - do { - v1 = round (v1, read_le (p)); p += sizeof (T); - v2 = round (v2, read_le (p)); p += sizeof (T); - v3 = round (v3, read_le (p)); p += sizeof (T); - v4 = round (v4, read_le (p)); p += sizeof (T); - } while (p <= limit); - - m_state.v1 = v1; - m_state.v2 = v2; - m_state.v3 = v3; - m_state.v4 = v4; - } - - if (p < bEnd) { - memcpy (m_state.mem32, p, (size_t)(bEnd-p)); - m_state.memsize = (unsigned)(bEnd-p); - } - } while (0); - - /* DIGEST */ - { - auto p = reinterpret_cast (m_state.mem32); - auto last = p + m_state.memsize; - - T h; - - if (m_state.large_len) { - h = rotatel (m_state.v1, T{ 1}) + - rotatel (m_state.v2, T{ 7}) + - rotatel (m_state.v3, T{12}) + - rotatel (m_state.v4, T{18}); - } else { - h = m_state.v3 /* == seed */ + constants::prime[4]; - } - - h += m_state.total_len_32; - - while (p + sizeof (T) <= last) { - h += read_le (p) * constants::prime[2]; - h = rotatel (h, 17) * constants::prime[3]; - p += 4; - } - - while (p < last) { - h += (*p) * constants::prime[4]; - h = rotatel (h, 11) * constants::prime[0]; - p++; - } - - h ^= h >> 15; h *= constants::prime[1]; - h ^= h >> 13; h *= constants::prime[2]; - h ^= h >> 16; - - return h; } -} + + // leave the remainder. it's used midway through finalisation. note that we + // don't update the cursor as it's used to detect the remaining bytes + // during finalisation. + ; + + // compress the state and mix in the data size + word_t h; + if (data.size () < block_bytes) { + h = state[2] + constants::prime[4]; + } else { + h = rotatel (state[0], 1) + + rotatel (state[1], 7) + + rotatel (state[2], 12) + + rotatel (state[3], 18); + + if constexpr (std::is_same_v) { + h = (h ^ round (0, state[0])) * constants::prime[0] + constants::prime[3]; + h = (h ^ round (0, state[1])) * constants::prime[0] + constants::prime[3]; + h = (h ^ round (0, state[2])) * constants::prime[0] + constants::prime[3]; + h = (h ^ round (0, state[3])) * constants::prime[0] + constants::prime[3]; + } + } + + h += static_cast (data.size ()); + + // drain the remainder of the data, first by words... + while (cursor + sizeof (WordT) <= last) { + if constexpr (std::is_same_v) { + h += read_le (cursor) * constants::prime[2]; + h = rotatel (h, 17) * constants::prime[3]; + } else { + h = rotatel ( + h ^ round (0, read_le (cursor)), 27 + ) * constants::prime[0] + constants::prime[3]; + } + + cursor += sizeof (WordT); + } + + // ...then maybe by half words... + if constexpr (std::is_same_v) { + while (cursor + sizeof (uint32_t) <= last) { + h = rotatel ( + h ^ read_le (cursor) * constants::prime[0], 23 + ) * constants::prime[1] + constants::prime[2]; + + cursor += sizeof (uint32_t); + } + } + + // ...then by bytes + while (cursor != last) { + if constexpr (std::is_same_v) { + h += *cursor * constants::prime[4]; + h = rotatel (h, 11) * constants::prime[0]; + } else { + h = rotatel (h ^ *cursor * constants::prime[4], 11) * constants::prime[0]; + } + + ++cursor; + } + + // everything should have been consumed by now + CHECK_EQ (cursor, std::cend (data)); + + // mix the result one last time before returning + h ^= h >> constants::final_rotate[0]; h *= constants::prime[1]; + h ^= h >> constants::final_rotate[1]; h *= constants::prime[2]; + h ^= h >> constants::final_rotate[2]; + + return h; +}; /////////////////////////////////////////////////////////////////////////////// diff --git a/hash/xxhash.hpp b/hash/xxhash.hpp index ad717683..dbad6e2c 100644 --- a/hash/xxhash.hpp +++ b/hash/xxhash.hpp @@ -23,20 +23,22 @@ #include namespace util::hash { - template + template class xxhash { public: - static_assert (std::is_same::value || std::is_same::value); - using digest_t = T; + static_assert (std::is_same::value || std::is_same::value); + using digest_t = WordT; + using word_t = WordT; + static constexpr int block_bytes = 4 * sizeof (word_t); - static constexpr uint32_t DEFAULT_SEED = 0; + static constexpr word_t DEFAULT_SEED = 0; - xxhash (uint32_t seed = DEFAULT_SEED); + xxhash (word_t seed = DEFAULT_SEED); digest_t operator() (const util::view data); private: - uint32_t m_seed; + word_t m_seed; }; using xxhash32 = xxhash; diff --git a/test/hash/xxhash.cpp b/test/hash/xxhash.cpp index ad9b69a1..5edd6a51 100644 --- a/test/hash/xxhash.cpp +++ b/test/hash/xxhash.cpp @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. * - * Copyright 2016 Danny Robson + * Copyright 2016-2018 Danny Robson */ @@ -44,12 +44,12 @@ main (int, char **) std::vector data; const char *msg; } TESTS[] = { - { 0x02CC5D05, 0xef46db3751d8e999, 0, ""_u8s, "empty string, 0 seed" }, - { 0x0b2cb792, 0xd5afba1336a3be4b, 1, ""_u8s, "empty string, 1 seed" }, - { 0x550d7456, 0xd24ec4f1a98c6e5b, 0, "a"_u8s, "single a, 0 seed" }, - { 0xf514706f, 0xdec2bc81c3cd46c6, 1, "a"_u8s, "single a, 1 seed" }, - { 0x32d153ff, 0x44bc2cf5ad770999, 0, "abc"_u8s, "abc, 0 seed" }, - { 0xaa3da8ff, 0xbea9ca8199328908, 1, "abc"_u8s, "abc, 1 seed" }, + { 0x02cc5d05, 0xef46db3751d8e999, 0, ""_u8s, "empty string" }, + { 0x0b2cb792, 0xd5afba1336a3be4b, 1, ""_u8s, "empty string" }, + { 0x550d7456, 0xd24ec4f1a98c6e5b, 0, "a"_u8s, "single a" }, + { 0xf514706f, 0xdec2bc81c3cd46c6, 1, "a"_u8s, "single a" }, + { 0x32d153ff, 0x44bc2cf5ad770999, 0, "abc"_u8s, "abc" }, + { 0xaa3da8ff, 0xbea9ca8199328908, 1, "abc"_u8s, "abc" }, { 0x54ca7e46, 0x892a0760a6343391, 0x1234, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_+"_u8s, "long alphabet" } @@ -58,10 +58,10 @@ main (int, char **) for (const auto &t: TESTS) { util::hash::xxhash32 h32 (t.seed); - //util::hash::xxhash32 h64 (t.seed); + util::hash::xxhash64 h64 (t.seed); tap.expect_eq (h32 (t.data), t.hash32, "xxhash32 %s", t.msg); - //tap.expect_eq (h64 (t.data), t.hash64, "xxhash64 %s", t.msg); + tap.expect_eq (h64 (t.data), t.hash64, "xxhash64 %s, seed %!", t.msg, t.seed); } return tap.status ();