xxhash: rewrite for safety and accurate tests

there were a few potential buffer overflows, inaccurate test data, and
the 64 bit path wasn't correct.

fixes buffer overflow from clang-analyze
This commit is contained in:
Danny Robson 2018-01-18 16:29:06 +11:00
parent b5d8b6bca3
commit bc4a0c3179
3 changed files with 138 additions and 158 deletions

View File

@ -11,7 +11,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Copyright 2016 Danny Robson <danny@nerdcruft.net>
* Copyright 2016-2018 Danny Robson <danny@nerdcruft.net>
*/
#include "xxhash.hpp"
@ -39,7 +39,8 @@ read_le (const void *ptr)
template <typename T>
struct constants {
static const T prime[5];
static const T round_rotl;
static const T round_rotate;
static const T final_rotate[3];
};
@ -55,182 +56,159 @@ constants<uint32_t>::prime[5] = {
};
template <>
const uint32_t
constants<uint32_t>::final_rotate[3] = {
15, 13, 16
};
//-----------------------------------------------------------------------------
template <>
const uint32_t
constants<uint32_t>::round_rotl = 13;
constants<uint32_t>::round_rotate = 13;
//-----------------------------------------------------------------------------
template <>
const uint64_t constants<uint64_t>::prime[5] = {
11400714785074694791u,
14029467366897019727u,
1609587929392839161u,
9650029242287828579u,
2870177450012600261u,
11400714785074694791ull,
14029467366897019727ull,
1609587929392839161ull,
9650029242287828579ull,
2870177450012600261ull,
};
template <>
const uint64_t
constants<uint64_t>::final_rotate[3] = {
33, 29, 32
};
//-----------------------------------------------------------------------------
template <>
const uint64_t
constants<uint64_t>::round_rotl = 31;
constants<uint64_t>::round_rotate = 31;
///////////////////////////////////////////////////////////////////////////////
template <typename T>
static
T
round (T seed, T input)
round (T state, T input)
{
seed += input * constants<T>::prime[1];
seed = util::rotatel (seed, constants<T>::round_rotl);
seed *= constants<T>::prime[0];
state += input * constants<T>::prime[1];
state = util::rotatel (state, constants<T>::round_rotate);
state *= constants<T>::prime[0];
return seed;
return state;
}
///////////////////////////////////////////////////////////////////////////////
template <typename T>
xxhash<T>::xxhash (uint32_t _seed):
template <typename WordT>
xxhash<WordT>::xxhash (WordT _seed):
m_seed (_seed)
{ ; }
///////////////////////////////////////////////////////////////////////////////
template <typename T>
typename xxhash<T>::digest_t
xxhash<T>::operator() (const util::view<const uint8_t*> data)
template <typename WordT>
typename xxhash<WordT>::digest_t
xxhash<WordT>::operator() (const util::view<const uint8_t*> data)
{
struct {
uint32_t total_len_32;
uint32_t large_len;
word_t state[4] {
m_seed + constants<WordT>::prime[0] + constants<WordT>::prime[1],
m_seed + constants<WordT>::prime[1],
m_seed,
m_seed - constants<WordT>::prime[0],
};
T v1, v2, v3, v4;
uint32_t mem32[4];
uint32_t memsize;
uint32_t reserved;
//uint64_t length;
//T v[4];
//T mem[4];
//unsigned memsize;
} m_state;
/* RESET */
memset (&m_state, 0, sizeof (m_state));
m_state.v1 = m_seed + constants<T>::prime[0] + constants<T>::prime[1];
m_state.v2 = m_seed + constants<T>::prime[1];
m_state.v3 = m_seed;
m_state.v4 = m_seed - constants<T>::prime[0];
/* UPDATE */
do {
auto first = data.begin ();
auto last = data.end ();
if (first == last)
break;
CHECK (first);
CHECK (last);
CHECK_LE (first, last);
//auto endian = XXH_littleEndian;
size_t len = last - first;
auto input = (const void*)first;
auto p = reinterpret_cast<const uint8_t*> (input);
auto const bEnd = p + len;
constexpr auto CHUNK = 4 * sizeof (T);
m_state.total_len_32 += (unsigned)len;
m_state.large_len |= (len >= CHUNK) | (m_state.total_len_32 >= CHUNK);
if (m_state.memsize + len < CHUNK) { /* fill in tmp buffer */
memcpy ((uint8_t*)(m_state.mem32) + m_state.memsize, input, len);
m_state.memsize += (unsigned)len;
break;
// consume block sized chunks while they're available.
// process each state word independently per block.
auto cursor = std::cbegin (data);
const auto last = std::cend (data);
while (last - cursor > block_bytes) {
for (int i = 0; i < 4; ++i) {
state[i] = round<word_t> (state[i], read_le<word_t> (cursor));
cursor += sizeof (word_t);
}
if (m_state.memsize) { /* some data left from previous update */
memcpy ((uint8_t*)(m_state.mem32) + m_state.memsize, input, CHUNK - m_state.memsize);
{ const uint32_t* p32 = m_state.mem32;
m_state.v1 = round<T> (m_state.v1, ltoh (*p32)); p32++;
m_state.v2 = round<T> (m_state.v2, ltoh (*p32)); p32++;
m_state.v3 = round<T> (m_state.v3, ltoh (*p32)); p32++;
m_state.v4 = round<T> (m_state.v4, ltoh (*p32)); p32++;
}
p += CHUNK - m_state.memsize;
m_state.memsize = 0;
}
if (p <= bEnd - CHUNK * sizeof (T)) {
const uint8_t* const limit = bEnd - 4 * sizeof (T);
T v1 = m_state.v1;
T v2 = m_state.v2;
T v3 = m_state.v3;
T v4 = m_state.v4;
do {
v1 = round<T> (v1, read_le<T> (p)); p += sizeof (T);
v2 = round<T> (v2, read_le<T> (p)); p += sizeof (T);
v3 = round<T> (v3, read_le<T> (p)); p += sizeof (T);
v4 = round<T> (v4, read_le<T> (p)); p += sizeof (T);
} while (p <= limit);
m_state.v1 = v1;
m_state.v2 = v2;
m_state.v3 = v3;
m_state.v4 = v4;
}
if (p < bEnd) {
memcpy (m_state.mem32, p, (size_t)(bEnd-p));
m_state.memsize = (unsigned)(bEnd-p);
}
} while (0);
/* DIGEST */
{
auto p = reinterpret_cast<const uint8_t*> (m_state.mem32);
auto last = p + m_state.memsize;
T h;
if (m_state.large_len) {
h = rotatel (m_state.v1, T{ 1}) +
rotatel (m_state.v2, T{ 7}) +
rotatel (m_state.v3, T{12}) +
rotatel (m_state.v4, T{18});
} else {
h = m_state.v3 /* == seed */ + constants<T>::prime[4];
}
h += m_state.total_len_32;
while (p + sizeof (T) <= last) {
h += read_le<T> (p) * constants<T>::prime[2];
h = rotatel (h, 17) * constants<T>::prime[3];
p += 4;
}
while (p < last) {
h += (*p) * constants<T>::prime[4];
h = rotatel (h, 11) * constants<T>::prime[0];
p++;
}
h ^= h >> 15; h *= constants<T>::prime[1];
h ^= h >> 13; h *= constants<T>::prime[2];
h ^= h >> 16;
return h;
}
}
// leave the remainder. it's used midway through finalisation. note that we
// don't update the cursor as it's used to detect the remaining bytes
// during finalisation.
;
// compress the state and mix in the data size
word_t h;
if (data.size () < block_bytes) {
h = state[2] + constants<WordT>::prime[4];
} else {
h = rotatel (state[0], 1) +
rotatel (state[1], 7) +
rotatel (state[2], 12) +
rotatel (state[3], 18);
if constexpr (std::is_same_v<WordT,uint64_t>) {
h = (h ^ round<WordT> (0, state[0])) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
h = (h ^ round<WordT> (0, state[1])) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
h = (h ^ round<WordT> (0, state[2])) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
h = (h ^ round<WordT> (0, state[3])) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
}
}
h += static_cast<WordT> (data.size ());
// drain the remainder of the data, first by words...
while (cursor + sizeof (WordT) <= last) {
if constexpr (std::is_same_v<WordT,uint32_t>) {
h += read_le<WordT> (cursor) * constants<WordT>::prime[2];
h = rotatel (h, 17) * constants<WordT>::prime[3];
} else {
h = rotatel (
h ^ round<WordT> (0, read_le<WordT> (cursor)), 27
) * constants<WordT>::prime[0] + constants<WordT>::prime[3];
}
cursor += sizeof (WordT);
}
// ...then maybe by half words...
if constexpr (std::is_same_v<WordT,uint64_t>) {
while (cursor + sizeof (uint32_t) <= last) {
h = rotatel (
h ^ read_le<uint32_t> (cursor) * constants<WordT>::prime[0], 23
) * constants<WordT>::prime[1] + constants<WordT>::prime[2];
cursor += sizeof (uint32_t);
}
}
// ...then by bytes
while (cursor != last) {
if constexpr (std::is_same_v<WordT,uint32_t>) {
h += *cursor * constants<WordT>::prime[4];
h = rotatel (h, 11) * constants<WordT>::prime[0];
} else {
h = rotatel (h ^ *cursor * constants<WordT>::prime[4], 11) * constants<WordT>::prime[0];
}
++cursor;
}
// everything should have been consumed by now
CHECK_EQ (cursor, std::cend (data));
// mix the result one last time before returning
h ^= h >> constants<WordT>::final_rotate[0]; h *= constants<WordT>::prime[1];
h ^= h >> constants<WordT>::final_rotate[1]; h *= constants<WordT>::prime[2];
h ^= h >> constants<WordT>::final_rotate[2];
return h;
};
///////////////////////////////////////////////////////////////////////////////

View File

@ -23,20 +23,22 @@
#include <type_traits>
namespace util::hash {
template <typename T>
template <typename WordT>
class xxhash {
public:
static_assert (std::is_same<T,uint32_t>::value || std::is_same<T,uint64_t>::value);
using digest_t = T;
static_assert (std::is_same<WordT,std::uint32_t>::value || std::is_same<WordT,std::uint64_t>::value);
using digest_t = WordT;
using word_t = WordT;
static constexpr int block_bytes = 4 * sizeof (word_t);
static constexpr uint32_t DEFAULT_SEED = 0;
static constexpr word_t DEFAULT_SEED = 0;
xxhash (uint32_t seed = DEFAULT_SEED);
xxhash (word_t seed = DEFAULT_SEED);
digest_t operator() (const util::view<const uint8_t*> data);
private:
uint32_t m_seed;
word_t m_seed;
};
using xxhash32 = xxhash<uint32_t>;

View File

@ -11,7 +11,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Copyright 2016 Danny Robson <danny@nerdcruft.net>
* Copyright 2016-2018 Danny Robson <danny@nerdcruft.net>
*/
@ -44,12 +44,12 @@ main (int, char **)
std::vector<uint8_t> data;
const char *msg;
} TESTS[] = {
{ 0x02CC5D05, 0xef46db3751d8e999, 0, ""_u8s, "empty string, 0 seed" },
{ 0x0b2cb792, 0xd5afba1336a3be4b, 1, ""_u8s, "empty string, 1 seed" },
{ 0x550d7456, 0xd24ec4f1a98c6e5b, 0, "a"_u8s, "single a, 0 seed" },
{ 0xf514706f, 0xdec2bc81c3cd46c6, 1, "a"_u8s, "single a, 1 seed" },
{ 0x32d153ff, 0x44bc2cf5ad770999, 0, "abc"_u8s, "abc, 0 seed" },
{ 0xaa3da8ff, 0xbea9ca8199328908, 1, "abc"_u8s, "abc, 1 seed" },
{ 0x02cc5d05, 0xef46db3751d8e999, 0, ""_u8s, "empty string" },
{ 0x0b2cb792, 0xd5afba1336a3be4b, 1, ""_u8s, "empty string" },
{ 0x550d7456, 0xd24ec4f1a98c6e5b, 0, "a"_u8s, "single a" },
{ 0xf514706f, 0xdec2bc81c3cd46c6, 1, "a"_u8s, "single a" },
{ 0x32d153ff, 0x44bc2cf5ad770999, 0, "abc"_u8s, "abc" },
{ 0xaa3da8ff, 0xbea9ca8199328908, 1, "abc"_u8s, "abc" },
{ 0x54ca7e46, 0x892a0760a6343391, 0x1234,
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_+"_u8s,
"long alphabet" }
@ -58,10 +58,10 @@ main (int, char **)
for (const auto &t: TESTS) {
util::hash::xxhash32 h32 (t.seed);
//util::hash::xxhash32 h64 (t.seed);
util::hash::xxhash64 h64 (t.seed);
tap.expect_eq (h32 (t.data), t.hash32, "xxhash32 %s", t.msg);
//tap.expect_eq (h64 (t.data), t.hash64, "xxhash64 %s", t.msg);
tap.expect_eq (h64 (t.data), t.hash64, "xxhash64 %s, seed %!", t.msg, t.seed);
}
return tap.status ();