xxhash: rewrite for safety and accurate tests
There were a few potential buffer overflows, the test data was inaccurate, and the 64-bit path was incorrect. Fixes a buffer overflow reported by clang-analyze.
This commit is contained in:
parent
b5d8b6bca3
commit
bc4a0c3179
242
hash/xxhash.cpp
242
hash/xxhash.cpp
@ -11,7 +11,7 @@
|
|||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*
|
*
|
||||||
* Copyright 2016 Danny Robson <danny@nerdcruft.net>
|
* Copyright 2016-2018 Danny Robson <danny@nerdcruft.net>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "xxhash.hpp"
|
#include "xxhash.hpp"
|
||||||
@ -39,7 +39,8 @@ read_le (const void *ptr)
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
struct constants {
|
struct constants {
|
||||||
static const T prime[5];
|
static const T prime[5];
|
||||||
static const T round_rotl;
|
static const T round_rotate;
|
||||||
|
static const T final_rotate[3];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -55,182 +56,159 @@ constants<uint32_t>::prime[5] = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
template <>
|
||||||
|
const uint32_t
|
||||||
|
constants<uint32_t>::final_rotate[3] = {
|
||||||
|
15, 13, 16
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
//-----------------------------------------------------------------------------
|
||||||
template <>
|
template <>
|
||||||
const uint32_t
|
const uint32_t
|
||||||
constants<uint32_t>::round_rotl = 13;
|
constants<uint32_t>::round_rotate = 13;
|
||||||
|
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
//-----------------------------------------------------------------------------
|
||||||
template <>
|
template <>
|
||||||
const uint64_t constants<uint64_t>::prime[5] = {
|
const uint64_t constants<uint64_t>::prime[5] = {
|
||||||
11400714785074694791u,
|
11400714785074694791ull,
|
||||||
14029467366897019727u,
|
14029467366897019727ull,
|
||||||
1609587929392839161u,
|
1609587929392839161ull,
|
||||||
9650029242287828579u,
|
9650029242287828579ull,
|
||||||
2870177450012600261u,
|
2870177450012600261ull,
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
template <>
|
||||||
|
const uint64_t
|
||||||
|
constants<uint64_t>::final_rotate[3] = {
|
||||||
|
33, 29, 32
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
//-----------------------------------------------------------------------------
|
//-----------------------------------------------------------------------------
|
||||||
template <>
|
template <>
|
||||||
const uint64_t
|
const uint64_t
|
||||||
constants<uint64_t>::round_rotl = 31;
|
constants<uint64_t>::round_rotate = 31;
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
template <typename T>
|
template <typename T>
|
||||||
static
|
static
|
||||||
T
|
T
|
||||||
round (T seed, T input)
|
round (T state, T input)
|
||||||
{
|
{
|
||||||
seed += input * constants<T>::prime[1];
|
state += input * constants<T>::prime[1];
|
||||||
seed = util::rotatel (seed, constants<T>::round_rotl);
|
state = util::rotatel (state, constants<T>::round_rotate);
|
||||||
seed *= constants<T>::prime[0];
|
state *= constants<T>::prime[0];
|
||||||
|
|
||||||
return seed;
|
return state;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
template <typename T>
|
template <typename WordT>
|
||||||
xxhash<T>::xxhash (uint32_t _seed):
|
xxhash<WordT>::xxhash (WordT _seed):
|
||||||
m_seed (_seed)
|
m_seed (_seed)
|
||||||
{ ; }
|
{ ; }
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
template <typename T>
|
template <typename WordT>
|
||||||
typename xxhash<T>::digest_t
|
typename xxhash<WordT>::digest_t
|
||||||
xxhash<T>::operator() (const util::view<const uint8_t*> data)
|
xxhash<WordT>::operator() (const util::view<const uint8_t*> data)
|
||||||
{
|
{
|
||||||
struct {
|
word_t state[4] {
|
||||||
uint32_t total_len_32;
|
m_seed + constants<WordT>::prime[0] + constants<WordT>::prime[1],
|
||||||
uint32_t large_len;
|
m_seed + constants<WordT>::prime[1],
|
||||||
|
m_seed,
|
||||||
|
m_seed - constants<WordT>::prime[0],
|
||||||
|
};
|
||||||
|
|
||||||
T v1, v2, v3, v4;
|
// consume block sized chunks while they're available.
|
||||||
uint32_t mem32[4];
|
// process each state word independently per block.
|
||||||
uint32_t memsize;
|
auto cursor = std::cbegin (data);
|
||||||
uint32_t reserved;
|
const auto last = std::cend (data);
|
||||||
|
while (last - cursor > block_bytes) {
|
||||||
//uint64_t length;
|
for (int i = 0; i < 4; ++i) {
|
||||||
//T v[4];
|
state[i] = round<word_t> (state[i], read_le<word_t> (cursor));
|
||||||
//T mem[4];
|
cursor += sizeof (word_t);
|
||||||
//unsigned memsize;
|
}
|
||||||
} m_state;
|
|
||||||
|
|
||||||
/* RESET */
|
|
||||||
memset (&m_state, 0, sizeof (m_state));
|
|
||||||
|
|
||||||
m_state.v1 = m_seed + constants<T>::prime[0] + constants<T>::prime[1];
|
|
||||||
m_state.v2 = m_seed + constants<T>::prime[1];
|
|
||||||
m_state.v3 = m_seed;
|
|
||||||
m_state.v4 = m_seed - constants<T>::prime[0];
|
|
||||||
|
|
||||||
/* UPDATE */
|
|
||||||
do {
|
|
||||||
auto first = data.begin ();
|
|
||||||
auto last = data.end ();
|
|
||||||
if (first == last)
|
|
||||||
break;
|
|
||||||
|
|
||||||
CHECK (first);
|
|
||||||
CHECK (last);
|
|
||||||
CHECK_LE (first, last);
|
|
||||||
|
|
||||||
//auto endian = XXH_littleEndian;
|
|
||||||
size_t len = last - first;
|
|
||||||
auto input = (const void*)first;
|
|
||||||
|
|
||||||
auto p = reinterpret_cast<const uint8_t*> (input);
|
|
||||||
auto const bEnd = p + len;
|
|
||||||
|
|
||||||
constexpr auto CHUNK = 4 * sizeof (T);
|
|
||||||
|
|
||||||
m_state.total_len_32 += (unsigned)len;
|
|
||||||
m_state.large_len |= (len >= CHUNK) | (m_state.total_len_32 >= CHUNK);
|
|
||||||
|
|
||||||
if (m_state.memsize + len < CHUNK) { /* fill in tmp buffer */
|
|
||||||
memcpy ((uint8_t*)(m_state.mem32) + m_state.memsize, input, len);
|
|
||||||
m_state.memsize += (unsigned)len;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (m_state.memsize) { /* some data left from previous update */
|
// leave the remainder. it's used midway through finalisation. note that we
|
||||||
memcpy ((uint8_t*)(m_state.mem32) + m_state.memsize, input, CHUNK - m_state.memsize);
|
// don't update the cursor as it's used to detect the remaining bytes
|
||||||
{ const uint32_t* p32 = m_state.mem32;
|
// during finalisation.
|
||||||
m_state.v1 = round<T> (m_state.v1, ltoh (*p32)); p32++;
|
;
|
||||||
m_state.v2 = round<T> (m_state.v2, ltoh (*p32)); p32++;
|
|
||||||
m_state.v3 = round<T> (m_state.v3, ltoh (*p32)); p32++;
|
|
||||||
m_state.v4 = round<T> (m_state.v4, ltoh (*p32)); p32++;
|
|
||||||
}
|
|
||||||
p += CHUNK - m_state.memsize;
|
|
||||||
m_state.memsize = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (p <= bEnd - CHUNK * sizeof (T)) {
|
// compress the state and mix in the data size
|
||||||
const uint8_t* const limit = bEnd - 4 * sizeof (T);
|
word_t h;
|
||||||
T v1 = m_state.v1;
|
if (data.size () < block_bytes) {
|
||||||
T v2 = m_state.v2;
|
h = state[2] + constants<WordT>::prime[4];
|
||||||
T v3 = m_state.v3;
|
|
||||||
T v4 = m_state.v4;
|
|
||||||
|
|
||||||
do {
|
|
||||||
v1 = round<T> (v1, read_le<T> (p)); p += sizeof (T);
|
|
||||||
v2 = round<T> (v2, read_le<T> (p)); p += sizeof (T);
|
|
||||||
v3 = round<T> (v3, read_le<T> (p)); p += sizeof (T);
|
|
||||||
v4 = round<T> (v4, read_le<T> (p)); p += sizeof (T);
|
|
||||||
} while (p <= limit);
|
|
||||||
|
|
||||||
m_state.v1 = v1;
|
|
||||||
m_state.v2 = v2;
|
|
||||||
m_state.v3 = v3;
|
|
||||||
m_state.v4 = v4;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (p < bEnd) {
|
|
||||||
memcpy (m_state.mem32, p, (size_t)(bEnd-p));
|
|
||||||
m_state.memsize = (unsigned)(bEnd-p);
|
|
||||||
}
|
|
||||||
} while (0);
|
|
||||||
|
|
||||||
/* DIGEST */
|
|
||||||
{
|
|
||||||
auto p = reinterpret_cast<const uint8_t*> (m_state.mem32);
|
|
||||||
auto last = p + m_state.memsize;
|
|
||||||
|
|
||||||
T h;
|
|
||||||
|
|
||||||
if (m_state.large_len) {
|
|
||||||
h = rotatel (m_state.v1, T{ 1}) +
|
|
||||||
rotatel (m_state.v2, T{ 7}) +
|
|
||||||
rotatel (m_state.v3, T{12}) +
|
|
||||||
rotatel (m_state.v4, T{18});
|
|
||||||
} else {
|
} else {
|
||||||
h = m_state.v3 /* == seed */ + constants<T>::prime[4];
|
h = rotatel (state[0], 1) +
|
||||||
|
rotatel (state[1], 7) +
|
||||||
|
rotatel (state[2], 12) +
|
||||||
|
rotatel (state[3], 18);
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<WordT,uint64_t>) {
|
||||||
|
h = (h ^ round<WordT> (0, state[0])) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
|
||||||
|
h = (h ^ round<WordT> (0, state[1])) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
|
||||||
|
h = (h ^ round<WordT> (0, state[2])) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
|
||||||
|
h = (h ^ round<WordT> (0, state[3])) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
h += m_state.total_len_32;
|
h += static_cast<WordT> (data.size ());
|
||||||
|
|
||||||
while (p + sizeof (T) <= last) {
|
// drain the remainder of the data, first by words...
|
||||||
h += read_le<T> (p) * constants<T>::prime[2];
|
while (cursor + sizeof (WordT) <= last) {
|
||||||
h = rotatel (h, 17) * constants<T>::prime[3];
|
if constexpr (std::is_same_v<WordT,uint32_t>) {
|
||||||
p += 4;
|
h += read_le<WordT> (cursor) * constants<WordT>::prime[2];
|
||||||
|
h = rotatel (h, 17) * constants<WordT>::prime[3];
|
||||||
|
} else {
|
||||||
|
h = rotatel (
|
||||||
|
h ^ round<WordT> (0, read_le<WordT> (cursor)), 27
|
||||||
|
) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
|
||||||
}
|
}
|
||||||
|
|
||||||
while (p < last) {
|
cursor += sizeof (WordT);
|
||||||
h += (*p) * constants<T>::prime[4];
|
|
||||||
h = rotatel (h, 11) * constants<T>::prime[0];
|
|
||||||
p++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
h ^= h >> 15; h *= constants<T>::prime[1];
|
// ...then maybe by half words...
|
||||||
h ^= h >> 13; h *= constants<T>::prime[2];
|
if constexpr (std::is_same_v<WordT,uint64_t>) {
|
||||||
h ^= h >> 16;
|
while (cursor + sizeof (uint32_t) <= last) {
|
||||||
|
h = rotatel (
|
||||||
|
h ^ read_le<uint32_t> (cursor) * constants<WordT>::prime[0], 23
|
||||||
|
) * constants<WordT>::prime[1] + constants<WordT>::prime[2];
|
||||||
|
|
||||||
|
cursor += sizeof (uint32_t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ...then by bytes
|
||||||
|
while (cursor != last) {
|
||||||
|
if constexpr (std::is_same_v<WordT,uint32_t>) {
|
||||||
|
h += *cursor * constants<WordT>::prime[4];
|
||||||
|
h = rotatel (h, 11) * constants<WordT>::prime[0];
|
||||||
|
} else {
|
||||||
|
h = rotatel (h ^ *cursor * constants<WordT>::prime[4], 11) * constants<WordT>::prime[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
++cursor;
|
||||||
|
}
|
||||||
|
|
||||||
|
// everything should have been consumed by now
|
||||||
|
CHECK_EQ (cursor, std::cend (data));
|
||||||
|
|
||||||
|
// mix the result one last time before returning
|
||||||
|
h ^= h >> constants<WordT>::final_rotate[0]; h *= constants<WordT>::prime[1];
|
||||||
|
h ^= h >> constants<WordT>::final_rotate[1]; h *= constants<WordT>::prime[2];
|
||||||
|
h ^= h >> constants<WordT>::final_rotate[2];
|
||||||
|
|
||||||
return h;
|
return h;
|
||||||
}
|
};
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -23,20 +23,22 @@
|
|||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
|
|
||||||
namespace util::hash {
|
namespace util::hash {
|
||||||
template <typename T>
|
template <typename WordT>
|
||||||
class xxhash {
|
class xxhash {
|
||||||
public:
|
public:
|
||||||
static_assert (std::is_same<T,uint32_t>::value || std::is_same<T,uint64_t>::value);
|
static_assert (std::is_same<WordT,std::uint32_t>::value || std::is_same<WordT,std::uint64_t>::value);
|
||||||
using digest_t = T;
|
using digest_t = WordT;
|
||||||
|
using word_t = WordT;
|
||||||
|
static constexpr int block_bytes = 4 * sizeof (word_t);
|
||||||
|
|
||||||
static constexpr uint32_t DEFAULT_SEED = 0;
|
static constexpr word_t DEFAULT_SEED = 0;
|
||||||
|
|
||||||
xxhash (uint32_t seed = DEFAULT_SEED);
|
xxhash (word_t seed = DEFAULT_SEED);
|
||||||
|
|
||||||
digest_t operator() (const util::view<const uint8_t*> data);
|
digest_t operator() (const util::view<const uint8_t*> data);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
uint32_t m_seed;
|
word_t m_seed;
|
||||||
};
|
};
|
||||||
|
|
||||||
using xxhash32 = xxhash<uint32_t>;
|
using xxhash32 = xxhash<uint32_t>;
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*
|
*
|
||||||
* Copyright 2016 Danny Robson <danny@nerdcruft.net>
|
* Copyright 2016-2018 Danny Robson <danny@nerdcruft.net>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
@ -44,12 +44,12 @@ main (int, char **)
|
|||||||
std::vector<uint8_t> data;
|
std::vector<uint8_t> data;
|
||||||
const char *msg;
|
const char *msg;
|
||||||
} TESTS[] = {
|
} TESTS[] = {
|
||||||
{ 0x02CC5D05, 0xef46db3751d8e999, 0, ""_u8s, "empty string, 0 seed" },
|
{ 0x02cc5d05, 0xef46db3751d8e999, 0, ""_u8s, "empty string" },
|
||||||
{ 0x0b2cb792, 0xd5afba1336a3be4b, 1, ""_u8s, "empty string, 1 seed" },
|
{ 0x0b2cb792, 0xd5afba1336a3be4b, 1, ""_u8s, "empty string" },
|
||||||
{ 0x550d7456, 0xd24ec4f1a98c6e5b, 0, "a"_u8s, "single a, 0 seed" },
|
{ 0x550d7456, 0xd24ec4f1a98c6e5b, 0, "a"_u8s, "single a" },
|
||||||
{ 0xf514706f, 0xdec2bc81c3cd46c6, 1, "a"_u8s, "single a, 1 seed" },
|
{ 0xf514706f, 0xdec2bc81c3cd46c6, 1, "a"_u8s, "single a" },
|
||||||
{ 0x32d153ff, 0x44bc2cf5ad770999, 0, "abc"_u8s, "abc, 0 seed" },
|
{ 0x32d153ff, 0x44bc2cf5ad770999, 0, "abc"_u8s, "abc" },
|
||||||
{ 0xaa3da8ff, 0xbea9ca8199328908, 1, "abc"_u8s, "abc, 1 seed" },
|
{ 0xaa3da8ff, 0xbea9ca8199328908, 1, "abc"_u8s, "abc" },
|
||||||
{ 0x54ca7e46, 0x892a0760a6343391, 0x1234,
|
{ 0x54ca7e46, 0x892a0760a6343391, 0x1234,
|
||||||
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_+"_u8s,
|
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_+"_u8s,
|
||||||
"long alphabet" }
|
"long alphabet" }
|
||||||
@ -58,10 +58,10 @@ main (int, char **)
|
|||||||
|
|
||||||
for (const auto &t: TESTS) {
|
for (const auto &t: TESTS) {
|
||||||
util::hash::xxhash32 h32 (t.seed);
|
util::hash::xxhash32 h32 (t.seed);
|
||||||
//util::hash::xxhash32 h64 (t.seed);
|
util::hash::xxhash64 h64 (t.seed);
|
||||||
|
|
||||||
tap.expect_eq (h32 (t.data), t.hash32, "xxhash32 %s", t.msg);
|
tap.expect_eq (h32 (t.data), t.hash32, "xxhash32 %s", t.msg);
|
||||||
//tap.expect_eq (h64 (t.data), t.hash64, "xxhash64 %s", t.msg);
|
tap.expect_eq (h64 (t.data), t.hash64, "xxhash64 %s, seed %!", t.msg, t.seed);
|
||||||
}
|
}
|
||||||
|
|
||||||
return tap.status ();
|
return tap.status ();
|
||||||
|
Loading…
Reference in New Issue
Block a user