xxhash: rewrite for safety and accurate tests

There were a few potential buffer overflows and inaccurate test data, and
the 64-bit path wasn't correct.

Fixes a buffer overflow reported by clang-analyze.
This commit is contained in:
Danny Robson 2018-01-18 16:29:06 +11:00
parent b5d8b6bca3
commit bc4a0c3179
3 changed files with 138 additions and 158 deletions

View File

@ -11,7 +11,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
* *
* Copyright 2016 Danny Robson <danny@nerdcruft.net> * Copyright 2016-2018 Danny Robson <danny@nerdcruft.net>
*/ */
#include "xxhash.hpp" #include "xxhash.hpp"
@ -39,7 +39,8 @@ read_le (const void *ptr)
template <typename T> template <typename T>
struct constants { struct constants {
static const T prime[5]; static const T prime[5];
static const T round_rotl; static const T round_rotate;
static const T final_rotate[3];
}; };
@ -55,182 +56,159 @@ constants<uint32_t>::prime[5] = {
}; };
// Shift amounts for the closing mix of the 32-bit digest. Despite the
// `rotate` in the name, the finalisation code applies each entry as a plain
// right shift (`h ^= h >> final_rotate[i]`), consuming them in array order.
template <>
const uint32_t
constants<uint32_t>::final_rotate[3] = {
15, 13, 16
};
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
template <> template <>
const uint32_t const uint32_t
constants<uint32_t>::round_rotl = 13; constants<uint32_t>::round_rotate = 13;
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
template <> template <>
const uint64_t constants<uint64_t>::prime[5] = { const uint64_t constants<uint64_t>::prime[5] = {
11400714785074694791u, 11400714785074694791ull,
14029467366897019727u, 14029467366897019727ull,
1609587929392839161u, 1609587929392839161ull,
9650029242287828579u, 9650029242287828579ull,
2870177450012600261u, 2870177450012600261ull,
};
template <>
const uint64_t
constants<uint64_t>::final_rotate[3] = {
33, 29, 32
}; };
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
template <> template <>
const uint64_t const uint64_t
constants<uint64_t>::round_rotl = 31; constants<uint64_t>::round_rotate = 31;
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
template <typename T> template <typename T>
static static
T T
round (T seed, T input) round (T state, T input)
{ {
seed += input * constants<T>::prime[1]; state += input * constants<T>::prime[1];
seed = util::rotatel (seed, constants<T>::round_rotl); state = util::rotatel (state, constants<T>::round_rotate);
seed *= constants<T>::prime[0]; state *= constants<T>::prime[0];
return seed; return state;
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
template <typename T> template <typename WordT>
xxhash<T>::xxhash (uint32_t _seed): xxhash<WordT>::xxhash (WordT _seed):
m_seed (_seed) m_seed (_seed)
{ ; } { ; }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
template <typename T> template <typename WordT>
typename xxhash<T>::digest_t typename xxhash<WordT>::digest_t
xxhash<T>::operator() (const util::view<const uint8_t*> data) xxhash<WordT>::operator() (const util::view<const uint8_t*> data)
{ {
struct { word_t state[4] {
uint32_t total_len_32; m_seed + constants<WordT>::prime[0] + constants<WordT>::prime[1],
uint32_t large_len; m_seed + constants<WordT>::prime[1],
m_seed,
m_seed - constants<WordT>::prime[0],
};
T v1, v2, v3, v4; // consume block sized chunks while they're available.
uint32_t mem32[4]; // process each state word independently per block.
uint32_t memsize; auto cursor = std::cbegin (data);
uint32_t reserved; const auto last = std::cend (data);
while (last - cursor > block_bytes) {
//uint64_t length; for (int i = 0; i < 4; ++i) {
//T v[4]; state[i] = round<word_t> (state[i], read_le<word_t> (cursor));
//T mem[4]; cursor += sizeof (word_t);
//unsigned memsize; }
} m_state;
/* RESET */
memset (&m_state, 0, sizeof (m_state));
m_state.v1 = m_seed + constants<T>::prime[0] + constants<T>::prime[1];
m_state.v2 = m_seed + constants<T>::prime[1];
m_state.v3 = m_seed;
m_state.v4 = m_seed - constants<T>::prime[0];
/* UPDATE */
do {
auto first = data.begin ();
auto last = data.end ();
if (first == last)
break;
CHECK (first);
CHECK (last);
CHECK_LE (first, last);
//auto endian = XXH_littleEndian;
size_t len = last - first;
auto input = (const void*)first;
auto p = reinterpret_cast<const uint8_t*> (input);
auto const bEnd = p + len;
constexpr auto CHUNK = 4 * sizeof (T);
m_state.total_len_32 += (unsigned)len;
m_state.large_len |= (len >= CHUNK) | (m_state.total_len_32 >= CHUNK);
if (m_state.memsize + len < CHUNK) { /* fill in tmp buffer */
memcpy ((uint8_t*)(m_state.mem32) + m_state.memsize, input, len);
m_state.memsize += (unsigned)len;
break;
} }
if (m_state.memsize) { /* some data left from previous update */ // leave the remainder. it's used midway through finalisation. note that we
memcpy ((uint8_t*)(m_state.mem32) + m_state.memsize, input, CHUNK - m_state.memsize); // don't update the cursor as it's used to detect the remaining bytes
{ const uint32_t* p32 = m_state.mem32; // during finalisation.
m_state.v1 = round<T> (m_state.v1, ltoh (*p32)); p32++; ;
m_state.v2 = round<T> (m_state.v2, ltoh (*p32)); p32++;
m_state.v3 = round<T> (m_state.v3, ltoh (*p32)); p32++;
m_state.v4 = round<T> (m_state.v4, ltoh (*p32)); p32++;
}
p += CHUNK - m_state.memsize;
m_state.memsize = 0;
}
if (p <= bEnd - CHUNK * sizeof (T)) { // compress the state and mix in the data size
const uint8_t* const limit = bEnd - 4 * sizeof (T); word_t h;
T v1 = m_state.v1; if (data.size () < block_bytes) {
T v2 = m_state.v2; h = state[2] + constants<WordT>::prime[4];
T v3 = m_state.v3;
T v4 = m_state.v4;
do {
v1 = round<T> (v1, read_le<T> (p)); p += sizeof (T);
v2 = round<T> (v2, read_le<T> (p)); p += sizeof (T);
v3 = round<T> (v3, read_le<T> (p)); p += sizeof (T);
v4 = round<T> (v4, read_le<T> (p)); p += sizeof (T);
} while (p <= limit);
m_state.v1 = v1;
m_state.v2 = v2;
m_state.v3 = v3;
m_state.v4 = v4;
}
if (p < bEnd) {
memcpy (m_state.mem32, p, (size_t)(bEnd-p));
m_state.memsize = (unsigned)(bEnd-p);
}
} while (0);
/* DIGEST */
{
auto p = reinterpret_cast<const uint8_t*> (m_state.mem32);
auto last = p + m_state.memsize;
T h;
if (m_state.large_len) {
h = rotatel (m_state.v1, T{ 1}) +
rotatel (m_state.v2, T{ 7}) +
rotatel (m_state.v3, T{12}) +
rotatel (m_state.v4, T{18});
} else { } else {
h = m_state.v3 /* == seed */ + constants<T>::prime[4]; h = rotatel (state[0], 1) +
rotatel (state[1], 7) +
rotatel (state[2], 12) +
rotatel (state[3], 18);
if constexpr (std::is_same_v<WordT,uint64_t>) {
h = (h ^ round<WordT> (0, state[0])) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
h = (h ^ round<WordT> (0, state[1])) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
h = (h ^ round<WordT> (0, state[2])) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
h = (h ^ round<WordT> (0, state[3])) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
}
} }
h += m_state.total_len_32; h += static_cast<WordT> (data.size ());
while (p + sizeof (T) <= last) { // drain the remainder of the data, first by words...
h += read_le<T> (p) * constants<T>::prime[2]; while (cursor + sizeof (WordT) <= last) {
h = rotatel (h, 17) * constants<T>::prime[3]; if constexpr (std::is_same_v<WordT,uint32_t>) {
p += 4; h += read_le<WordT> (cursor) * constants<WordT>::prime[2];
h = rotatel (h, 17) * constants<WordT>::prime[3];
} else {
h = rotatel (
h ^ round<WordT> (0, read_le<WordT> (cursor)), 27
) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
} }
while (p < last) { cursor += sizeof (WordT);
h += (*p) * constants<T>::prime[4];
h = rotatel (h, 11) * constants<T>::prime[0];
p++;
} }
h ^= h >> 15; h *= constants<T>::prime[1]; // ...then maybe by half words...
h ^= h >> 13; h *= constants<T>::prime[2]; if constexpr (std::is_same_v<WordT,uint64_t>) {
h ^= h >> 16; while (cursor + sizeof (uint32_t) <= last) {
h = rotatel (
h ^ read_le<uint32_t> (cursor) * constants<WordT>::prime[0], 23
) * constants<WordT>::prime[1] + constants<WordT>::prime[2];
cursor += sizeof (uint32_t);
}
}
// ...then by bytes
while (cursor != last) {
if constexpr (std::is_same_v<WordT,uint32_t>) {
h += *cursor * constants<WordT>::prime[4];
h = rotatel (h, 11) * constants<WordT>::prime[0];
} else {
h = rotatel (h ^ *cursor * constants<WordT>::prime[4], 11) * constants<WordT>::prime[0];
}
++cursor;
}
// everything should have been consumed by now
CHECK_EQ (cursor, std::cend (data));
// mix the result one last time before returning
h ^= h >> constants<WordT>::final_rotate[0]; h *= constants<WordT>::prime[1];
h ^= h >> constants<WordT>::final_rotate[1]; h *= constants<WordT>::prime[2];
h ^= h >> constants<WordT>::final_rotate[2];
return h; return h;
} };
}
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////

View File

@ -23,20 +23,22 @@
#include <type_traits> #include <type_traits>
namespace util::hash { namespace util::hash {
template <typename T> template <typename WordT>
class xxhash { class xxhash {
public: public:
static_assert (std::is_same<T,uint32_t>::value || std::is_same<T,uint64_t>::value); static_assert (std::is_same<WordT,std::uint32_t>::value || std::is_same<WordT,std::uint64_t>::value);
using digest_t = T; using digest_t = WordT;
using word_t = WordT;
static constexpr int block_bytes = 4 * sizeof (word_t);
static constexpr uint32_t DEFAULT_SEED = 0; static constexpr word_t DEFAULT_SEED = 0;
xxhash (uint32_t seed = DEFAULT_SEED); xxhash (word_t seed = DEFAULT_SEED);
digest_t operator() (const util::view<const uint8_t*> data); digest_t operator() (const util::view<const uint8_t*> data);
private: private:
uint32_t m_seed; word_t m_seed;
}; };
using xxhash32 = xxhash<uint32_t>; using xxhash32 = xxhash<uint32_t>;

View File

@ -11,7 +11,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
* *
* Copyright 2016 Danny Robson <danny@nerdcruft.net> * Copyright 2016-2018 Danny Robson <danny@nerdcruft.net>
*/ */
@ -44,12 +44,12 @@ main (int, char **)
std::vector<uint8_t> data; std::vector<uint8_t> data;
const char *msg; const char *msg;
} TESTS[] = { } TESTS[] = {
{ 0x02CC5D05, 0xef46db3751d8e999, 0, ""_u8s, "empty string, 0 seed" }, { 0x02cc5d05, 0xef46db3751d8e999, 0, ""_u8s, "empty string" },
{ 0x0b2cb792, 0xd5afba1336a3be4b, 1, ""_u8s, "empty string, 1 seed" }, { 0x0b2cb792, 0xd5afba1336a3be4b, 1, ""_u8s, "empty string" },
{ 0x550d7456, 0xd24ec4f1a98c6e5b, 0, "a"_u8s, "single a, 0 seed" }, { 0x550d7456, 0xd24ec4f1a98c6e5b, 0, "a"_u8s, "single a" },
{ 0xf514706f, 0xdec2bc81c3cd46c6, 1, "a"_u8s, "single a, 1 seed" }, { 0xf514706f, 0xdec2bc81c3cd46c6, 1, "a"_u8s, "single a" },
{ 0x32d153ff, 0x44bc2cf5ad770999, 0, "abc"_u8s, "abc, 0 seed" }, { 0x32d153ff, 0x44bc2cf5ad770999, 0, "abc"_u8s, "abc" },
{ 0xaa3da8ff, 0xbea9ca8199328908, 1, "abc"_u8s, "abc, 1 seed" }, { 0xaa3da8ff, 0xbea9ca8199328908, 1, "abc"_u8s, "abc" },
{ 0x54ca7e46, 0x892a0760a6343391, 0x1234, { 0x54ca7e46, 0x892a0760a6343391, 0x1234,
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_+"_u8s, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_+"_u8s,
"long alphabet" } "long alphabet" }
@ -58,10 +58,10 @@ main (int, char **)
for (const auto &t: TESTS) { for (const auto &t: TESTS) {
util::hash::xxhash32 h32 (t.seed); util::hash::xxhash32 h32 (t.seed);
//util::hash::xxhash32 h64 (t.seed); util::hash::xxhash64 h64 (t.seed);
tap.expect_eq (h32 (t.data), t.hash32, "xxhash32 %s", t.msg); tap.expect_eq (h32 (t.data), t.hash32, "xxhash32 %s", t.msg);
//tap.expect_eq (h64 (t.data), t.hash64, "xxhash64 %s", t.msg); tap.expect_eq (h64 (t.data), t.hash64, "xxhash64 %s, seed %!", t.msg, t.seed);
} }
return tap.status (); return tap.status ();