xxhash: rewrite for safety and accurate tests
there were a few potential buffer overflows, inaccurate test data, and the 64 bit path wasn't correct. fixes buffer overflow from clang-analyze
This commit is contained in:
parent
b5d8b6bca3
commit
bc4a0c3179
242
hash/xxhash.cpp
242
hash/xxhash.cpp
@ -11,7 +11,7 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* Copyright 2016 Danny Robson <danny@nerdcruft.net>
|
||||
* Copyright 2016-2018 Danny Robson <danny@nerdcruft.net>
|
||||
*/
|
||||
|
||||
#include "xxhash.hpp"
|
||||
@ -39,7 +39,8 @@ read_le (const void *ptr)
|
||||
template <typename T>
|
||||
struct constants {
|
||||
static const T prime[5];
|
||||
static const T round_rotl;
|
||||
static const T round_rotate;
|
||||
static const T final_rotate[3];
|
||||
};
|
||||
|
||||
|
||||
@ -55,182 +56,159 @@ constants<uint32_t>::prime[5] = {
|
||||
};
|
||||
|
||||
|
||||
template <>
|
||||
const uint32_t
|
||||
constants<uint32_t>::final_rotate[3] = {
|
||||
15, 13, 16
|
||||
};
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
template <>
|
||||
const uint32_t
|
||||
constants<uint32_t>::round_rotl = 13;
|
||||
constants<uint32_t>::round_rotate = 13;
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
template <>
|
||||
const uint64_t constants<uint64_t>::prime[5] = {
|
||||
11400714785074694791u,
|
||||
14029467366897019727u,
|
||||
1609587929392839161u,
|
||||
9650029242287828579u,
|
||||
2870177450012600261u,
|
||||
11400714785074694791ull,
|
||||
14029467366897019727ull,
|
||||
1609587929392839161ull,
|
||||
9650029242287828579ull,
|
||||
2870177450012600261ull,
|
||||
};
|
||||
|
||||
|
||||
template <>
|
||||
const uint64_t
|
||||
constants<uint64_t>::final_rotate[3] = {
|
||||
33, 29, 32
|
||||
};
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
template <>
|
||||
const uint64_t
|
||||
constants<uint64_t>::round_rotl = 31;
|
||||
constants<uint64_t>::round_rotate = 31;
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
template <typename T>
|
||||
static
|
||||
T
|
||||
round (T seed, T input)
|
||||
round (T state, T input)
|
||||
{
|
||||
seed += input * constants<T>::prime[1];
|
||||
seed = util::rotatel (seed, constants<T>::round_rotl);
|
||||
seed *= constants<T>::prime[0];
|
||||
state += input * constants<T>::prime[1];
|
||||
state = util::rotatel (state, constants<T>::round_rotate);
|
||||
state *= constants<T>::prime[0];
|
||||
|
||||
return seed;
|
||||
return state;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
template <typename T>
|
||||
xxhash<T>::xxhash (uint32_t _seed):
|
||||
template <typename WordT>
|
||||
xxhash<WordT>::xxhash (WordT _seed):
|
||||
m_seed (_seed)
|
||||
{ ; }
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
template <typename T>
|
||||
typename xxhash<T>::digest_t
|
||||
xxhash<T>::operator() (const util::view<const uint8_t*> data)
|
||||
template <typename WordT>
|
||||
typename xxhash<WordT>::digest_t
|
||||
xxhash<WordT>::operator() (const util::view<const uint8_t*> data)
|
||||
{
|
||||
struct {
|
||||
uint32_t total_len_32;
|
||||
uint32_t large_len;
|
||||
word_t state[4] {
|
||||
m_seed + constants<WordT>::prime[0] + constants<WordT>::prime[1],
|
||||
m_seed + constants<WordT>::prime[1],
|
||||
m_seed,
|
||||
m_seed - constants<WordT>::prime[0],
|
||||
};
|
||||
|
||||
T v1, v2, v3, v4;
|
||||
uint32_t mem32[4];
|
||||
uint32_t memsize;
|
||||
uint32_t reserved;
|
||||
|
||||
//uint64_t length;
|
||||
//T v[4];
|
||||
//T mem[4];
|
||||
//unsigned memsize;
|
||||
} m_state;
|
||||
|
||||
/* RESET */
|
||||
memset (&m_state, 0, sizeof (m_state));
|
||||
|
||||
m_state.v1 = m_seed + constants<T>::prime[0] + constants<T>::prime[1];
|
||||
m_state.v2 = m_seed + constants<T>::prime[1];
|
||||
m_state.v3 = m_seed;
|
||||
m_state.v4 = m_seed - constants<T>::prime[0];
|
||||
|
||||
/* UPDATE */
|
||||
do {
|
||||
auto first = data.begin ();
|
||||
auto last = data.end ();
|
||||
if (first == last)
|
||||
break;
|
||||
|
||||
CHECK (first);
|
||||
CHECK (last);
|
||||
CHECK_LE (first, last);
|
||||
|
||||
//auto endian = XXH_littleEndian;
|
||||
size_t len = last - first;
|
||||
auto input = (const void*)first;
|
||||
|
||||
auto p = reinterpret_cast<const uint8_t*> (input);
|
||||
auto const bEnd = p + len;
|
||||
|
||||
constexpr auto CHUNK = 4 * sizeof (T);
|
||||
|
||||
m_state.total_len_32 += (unsigned)len;
|
||||
m_state.large_len |= (len >= CHUNK) | (m_state.total_len_32 >= CHUNK);
|
||||
|
||||
if (m_state.memsize + len < CHUNK) { /* fill in tmp buffer */
|
||||
memcpy ((uint8_t*)(m_state.mem32) + m_state.memsize, input, len);
|
||||
m_state.memsize += (unsigned)len;
|
||||
break;
|
||||
// consume block sized chunks while they're available.
|
||||
// process each state word independently per block.
|
||||
auto cursor = std::cbegin (data);
|
||||
const auto last = std::cend (data);
|
||||
while (last - cursor > block_bytes) {
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
state[i] = round<word_t> (state[i], read_le<word_t> (cursor));
|
||||
cursor += sizeof (word_t);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_state.memsize) { /* some data left from previous update */
|
||||
memcpy ((uint8_t*)(m_state.mem32) + m_state.memsize, input, CHUNK - m_state.memsize);
|
||||
{ const uint32_t* p32 = m_state.mem32;
|
||||
m_state.v1 = round<T> (m_state.v1, ltoh (*p32)); p32++;
|
||||
m_state.v2 = round<T> (m_state.v2, ltoh (*p32)); p32++;
|
||||
m_state.v3 = round<T> (m_state.v3, ltoh (*p32)); p32++;
|
||||
m_state.v4 = round<T> (m_state.v4, ltoh (*p32)); p32++;
|
||||
}
|
||||
p += CHUNK - m_state.memsize;
|
||||
m_state.memsize = 0;
|
||||
}
|
||||
// leave the remainder. it's used midway through finalisation. note that we
|
||||
// don't update the cursor as it's used to detect the remaining bytes
|
||||
// during finalisation.
|
||||
;
|
||||
|
||||
if (p <= bEnd - CHUNK * sizeof (T)) {
|
||||
const uint8_t* const limit = bEnd - 4 * sizeof (T);
|
||||
T v1 = m_state.v1;
|
||||
T v2 = m_state.v2;
|
||||
T v3 = m_state.v3;
|
||||
T v4 = m_state.v4;
|
||||
|
||||
do {
|
||||
v1 = round<T> (v1, read_le<T> (p)); p += sizeof (T);
|
||||
v2 = round<T> (v2, read_le<T> (p)); p += sizeof (T);
|
||||
v3 = round<T> (v3, read_le<T> (p)); p += sizeof (T);
|
||||
v4 = round<T> (v4, read_le<T> (p)); p += sizeof (T);
|
||||
} while (p <= limit);
|
||||
|
||||
m_state.v1 = v1;
|
||||
m_state.v2 = v2;
|
||||
m_state.v3 = v3;
|
||||
m_state.v4 = v4;
|
||||
}
|
||||
|
||||
if (p < bEnd) {
|
||||
memcpy (m_state.mem32, p, (size_t)(bEnd-p));
|
||||
m_state.memsize = (unsigned)(bEnd-p);
|
||||
}
|
||||
} while (0);
|
||||
|
||||
/* DIGEST */
|
||||
{
|
||||
auto p = reinterpret_cast<const uint8_t*> (m_state.mem32);
|
||||
auto last = p + m_state.memsize;
|
||||
|
||||
T h;
|
||||
|
||||
if (m_state.large_len) {
|
||||
h = rotatel (m_state.v1, T{ 1}) +
|
||||
rotatel (m_state.v2, T{ 7}) +
|
||||
rotatel (m_state.v3, T{12}) +
|
||||
rotatel (m_state.v4, T{18});
|
||||
// compress the state and mix in the data size
|
||||
word_t h;
|
||||
if (data.size () < block_bytes) {
|
||||
h = state[2] + constants<WordT>::prime[4];
|
||||
} else {
|
||||
h = m_state.v3 /* == seed */ + constants<T>::prime[4];
|
||||
h = rotatel (state[0], 1) +
|
||||
rotatel (state[1], 7) +
|
||||
rotatel (state[2], 12) +
|
||||
rotatel (state[3], 18);
|
||||
|
||||
if constexpr (std::is_same_v<WordT,uint64_t>) {
|
||||
h = (h ^ round<WordT> (0, state[0])) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
|
||||
h = (h ^ round<WordT> (0, state[1])) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
|
||||
h = (h ^ round<WordT> (0, state[2])) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
|
||||
h = (h ^ round<WordT> (0, state[3])) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
|
||||
}
|
||||
}
|
||||
|
||||
h += m_state.total_len_32;
|
||||
h += static_cast<WordT> (data.size ());
|
||||
|
||||
while (p + sizeof (T) <= last) {
|
||||
h += read_le<T> (p) * constants<T>::prime[2];
|
||||
h = rotatel (h, 17) * constants<T>::prime[3];
|
||||
p += 4;
|
||||
// drain the remainder of the data, first by words...
|
||||
while (cursor + sizeof (WordT) <= last) {
|
||||
if constexpr (std::is_same_v<WordT,uint32_t>) {
|
||||
h += read_le<WordT> (cursor) * constants<WordT>::prime[2];
|
||||
h = rotatel (h, 17) * constants<WordT>::prime[3];
|
||||
} else {
|
||||
h = rotatel (
|
||||
h ^ round<WordT> (0, read_le<WordT> (cursor)), 27
|
||||
) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
|
||||
}
|
||||
|
||||
while (p < last) {
|
||||
h += (*p) * constants<T>::prime[4];
|
||||
h = rotatel (h, 11) * constants<T>::prime[0];
|
||||
p++;
|
||||
cursor += sizeof (WordT);
|
||||
}
|
||||
|
||||
h ^= h >> 15; h *= constants<T>::prime[1];
|
||||
h ^= h >> 13; h *= constants<T>::prime[2];
|
||||
h ^= h >> 16;
|
||||
// ...then maybe by half words...
|
||||
if constexpr (std::is_same_v<WordT,uint64_t>) {
|
||||
while (cursor + sizeof (uint32_t) <= last) {
|
||||
h = rotatel (
|
||||
h ^ read_le<uint32_t> (cursor) * constants<WordT>::prime[0], 23
|
||||
) * constants<WordT>::prime[1] + constants<WordT>::prime[2];
|
||||
|
||||
cursor += sizeof (uint32_t);
|
||||
}
|
||||
}
|
||||
|
||||
// ...then by bytes
|
||||
while (cursor != last) {
|
||||
if constexpr (std::is_same_v<WordT,uint32_t>) {
|
||||
h += *cursor * constants<WordT>::prime[4];
|
||||
h = rotatel (h, 11) * constants<WordT>::prime[0];
|
||||
} else {
|
||||
h = rotatel (h ^ *cursor * constants<WordT>::prime[4], 11) * constants<WordT>::prime[0];
|
||||
}
|
||||
|
||||
++cursor;
|
||||
}
|
||||
|
||||
// everything should have been consumed by now
|
||||
CHECK_EQ (cursor, std::cend (data));
|
||||
|
||||
// mix the result one last time before returning
|
||||
h ^= h >> constants<WordT>::final_rotate[0]; h *= constants<WordT>::prime[1];
|
||||
h ^= h >> constants<WordT>::final_rotate[1]; h *= constants<WordT>::prime[2];
|
||||
h ^= h >> constants<WordT>::final_rotate[2];
|
||||
|
||||
return h;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -23,20 +23,22 @@
|
||||
#include <type_traits>
|
||||
|
||||
namespace util::hash {
|
||||
template <typename T>
|
||||
template <typename WordT>
|
||||
class xxhash {
|
||||
public:
|
||||
static_assert (std::is_same<T,uint32_t>::value || std::is_same<T,uint64_t>::value);
|
||||
using digest_t = T;
|
||||
static_assert (std::is_same<WordT,std::uint32_t>::value || std::is_same<WordT,std::uint64_t>::value);
|
||||
using digest_t = WordT;
|
||||
using word_t = WordT;
|
||||
static constexpr int block_bytes = 4 * sizeof (word_t);
|
||||
|
||||
static constexpr uint32_t DEFAULT_SEED = 0;
|
||||
static constexpr word_t DEFAULT_SEED = 0;
|
||||
|
||||
xxhash (uint32_t seed = DEFAULT_SEED);
|
||||
xxhash (word_t seed = DEFAULT_SEED);
|
||||
|
||||
digest_t operator() (const util::view<const uint8_t*> data);
|
||||
|
||||
private:
|
||||
uint32_t m_seed;
|
||||
word_t m_seed;
|
||||
};
|
||||
|
||||
using xxhash32 = xxhash<uint32_t>;
|
||||
|
@ -11,7 +11,7 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* Copyright 2016 Danny Robson <danny@nerdcruft.net>
|
||||
* Copyright 2016-2018 Danny Robson <danny@nerdcruft.net>
|
||||
*/
|
||||
|
||||
|
||||
@ -44,12 +44,12 @@ main (int, char **)
|
||||
std::vector<uint8_t> data;
|
||||
const char *msg;
|
||||
} TESTS[] = {
|
||||
{ 0x02CC5D05, 0xef46db3751d8e999, 0, ""_u8s, "empty string, 0 seed" },
|
||||
{ 0x0b2cb792, 0xd5afba1336a3be4b, 1, ""_u8s, "empty string, 1 seed" },
|
||||
{ 0x550d7456, 0xd24ec4f1a98c6e5b, 0, "a"_u8s, "single a, 0 seed" },
|
||||
{ 0xf514706f, 0xdec2bc81c3cd46c6, 1, "a"_u8s, "single a, 1 seed" },
|
||||
{ 0x32d153ff, 0x44bc2cf5ad770999, 0, "abc"_u8s, "abc, 0 seed" },
|
||||
{ 0xaa3da8ff, 0xbea9ca8199328908, 1, "abc"_u8s, "abc, 1 seed" },
|
||||
{ 0x02cc5d05, 0xef46db3751d8e999, 0, ""_u8s, "empty string" },
|
||||
{ 0x0b2cb792, 0xd5afba1336a3be4b, 1, ""_u8s, "empty string" },
|
||||
{ 0x550d7456, 0xd24ec4f1a98c6e5b, 0, "a"_u8s, "single a" },
|
||||
{ 0xf514706f, 0xdec2bc81c3cd46c6, 1, "a"_u8s, "single a" },
|
||||
{ 0x32d153ff, 0x44bc2cf5ad770999, 0, "abc"_u8s, "abc" },
|
||||
{ 0xaa3da8ff, 0xbea9ca8199328908, 1, "abc"_u8s, "abc" },
|
||||
{ 0x54ca7e46, 0x892a0760a6343391, 0x1234,
|
||||
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_+"_u8s,
|
||||
"long alphabet" }
|
||||
@ -58,10 +58,10 @@ main (int, char **)
|
||||
|
||||
for (const auto &t: TESTS) {
|
||||
util::hash::xxhash32 h32 (t.seed);
|
||||
//util::hash::xxhash32 h64 (t.seed);
|
||||
util::hash::xxhash64 h64 (t.seed);
|
||||
|
||||
tap.expect_eq (h32 (t.data), t.hash32, "xxhash32 %s", t.msg);
|
||||
//tap.expect_eq (h64 (t.data), t.hash64, "xxhash64 %s", t.msg);
|
||||
tap.expect_eq (h64 (t.data), t.hash64, "xxhash64 %s, seed %!", t.msg, t.seed);
|
||||
}
|
||||
|
||||
return tap.status ();
|
||||
|
Loading…
Reference in New Issue
Block a user