/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Copyright 2018 Danny Robson <danny@nerdcruft.net>
 */

#include "blake.hpp"

#include <cruft/util/bitwise.hpp>
#include <cruft/util/endian.hpp>
#include <cruft/util/view.hpp>

#include <algorithm>
#include <array>
#include <iterator>

#include <cstddef>
#include <cstdint>
#include <cstring>

using cruft::crypto::hash::blake;
using cruft::crypto::hash::traits;


///////////////////////////////////////////////////////////////////////////////
// initial chaining value for the 256 bit variant. blake::operator() copies
// this into its running state before the first block is compressed.
//
// the per-entry comments record the derivation of each word: the leading
// fractional bits of the square root of the first eight primes.
const std::array<traits<256>::word_t,8>
traits<256>::iv
{
    0x6A09E667, // frac(sqrt( 2))
    0xBB67AE85, // frac(sqrt( 3))
    0x3C6EF372, // frac(sqrt( 5))
    0xA54FF53A, // frac(sqrt( 7))
    0x510E527F, // frac(sqrt(11))
    0x9B05688C, // frac(sqrt(13))
    0x1F83D9AB, // frac(sqrt(17))
    0x5BE0CD19, // frac(sqrt(19))
};


//-----------------------------------------------------------------------------
// constant words for the 256 bit variant.
//
// G XORs these with the permuted message words, and compress XORs the first
// eight with the salt and counter when building the lower half of its
// working state.
//
// NOTE(review): the name suggests these are the leading hex digits of the
// fractional part of pi; confirm against the BLAKE specification before
// editing any entry.
const std::array<traits<256>::word_t,16>
traits<256>::pi
{
    0x243F6A88,
    0x85A308D3,
    0x13198A2E,
    0x03707344,
    0xA4093822,
    0x299F31D0,
    0x082EFA98,
    0xEC4E6C89,
    0x452821E6,
    0x38D01377,
    0xBE5466CF,
    0x34E90C6C,
    0xC0AC29B7,
    0xC97C50DD,
    0x3F84D5B5,
    0xB5470917,
};


//-----------------------------------------------------------------------------
// rotation amounts for the 256 bit variant. G passes them to util::rotater
// in index order: [0] and [2] are applied to `d`, [1] and [3] to `b`.
const std::array<int,4>
traits<256>::rotations {
    16, 12, 8, 7
};


///////////////////////////////////////////////////////////////////////////////
// initial chaining value for the 512 bit variant. blake::operator() copies
// this into its running state before the first block is compressed.
//
// the upper 32 bits of each entry equal the corresponding entry of
// traits<256>::iv.
const std::array<traits<512>::word_t,8>
traits<512>::iv {
    0x6A09E667F3BCC908,
    0xBB67AE8584CAA73B,
    0x3C6EF372FE94F82B,
    0xA54FF53A5F1D36F1,
    0x510E527FADE682D1,
    0x9B05688C2B3E6C1F,
    0x1F83D9ABFB41BD6B,
    0x5BE0CD19137E2179,
};


//-----------------------------------------------------------------------------
// constant words for the 512 bit variant.
//
// G XORs these with the permuted message words, and compress XORs the first
// eight with the salt and counter when building the lower half of its
// working state.
//
// entries 0..7 are the sixteen traits<256>::pi words concatenated pairwise
// (entry i is pi256[2i] followed by pi256[2i+1]); entries 8..15 continue the
// same sequence.
const std::array<traits<512>::word_t,16>
traits<512>::pi {
    0x243F6A8885A308D3,
    0x13198A2E03707344,
    0xA4093822299F31D0,
    0x082EFA98EC4E6C89,
    0x452821E638D01377,
    0xBE5466CF34E90C6C,
    0xC0AC29B7C97C50DD,
    0x3F84D5B5B5470917,
    0x9216D5D98979FB1B,
    0xD1310BA698DFB5AC,
    0x2FFD72DBD01ADFB7,
    0xB8E1AFED6A267E96,
    0xBA7C9045F12C7F99,
    0x24A19947B3916CF7,
    0x0801F2E2858EFC16,
    0x636920D871574E69,
};


//-----------------------------------------------------------------------------
// rotation amounts for the 512 bit variant. G passes them to util::rotater
// in index order: [0] and [2] are applied to `d`, [1] and [3] to `b`.
const std::array<int,4> traits<512>::rotations  { 32, 25, 16, 11 };


///////////////////////////////////////////////////////////////////////////////
// message/constant word selection table: row `r` is used for round `r`, and
// G call `i` reads columns 2i and 2i+1 of that row.
//
// only the first ten rows are distinct. the last six rows are repeats of the
// first six rows (rows 10..15 equal rows 0..5), so a round index of up to 15
// can index the table directly. this allows us to cut out a pretty frequent
// modulus operation.
static constexpr
int
permute[16][16] = {
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, },
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3, },
    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4, },
    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8, },
    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13, },
    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9, },
    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11, },
    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10, },
    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5, },
    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0, },

    // repeats of rows 0..5, used by rounds 10..15
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, },
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3, },
    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4, },
    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8, },
    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13, },
    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9, },
};


///////////////////////////////////////////////////////////////////////////////
template <int width>
void
G (int i,
   int r,
   typename traits<width>::word_t &a,
   typename traits<width>::word_t &b,
   typename traits<width>::word_t &c,
   typename traits<width>::word_t &d,
   const typename traits<width>::word_t m[]
) {
    const auto j = permute[r][2 * i    ];
    const auto k = permute[r][2 * i + 1];

    a = a + b + (m[j] ^ traits<width>::pi[k]);
    d = util::rotater (d ^ a, traits<width>::rotations[0]);

    c = c + d;
    b = util::rotater (b ^ c, traits<width>::rotations[1]);

    a = a + b + (m[k] ^ traits<width>::pi[j]);
    d = util::rotater (d ^ a, traits<width>::rotations[2]);

    c = c + d;
    b = util::rotater (b ^ c, traits<width>::rotations[3]);
}


///////////////////////////////////////////////////////////////////////////////
// compresses one message block into the chaining value.
//
// h: the current chaining value (taken by value; the updated copy is
//    returned).
// m: the sixteen message words of the block, already in host byte order.
// s: the four salt words.
// t: the counter supplied by the caller for this block.
//
// returns the new chaining value.
template <int width>
std::array<typename traits<width>::word_t,8>
compress (
    std::array<typename traits<width>::word_t,8> h,
    const typename traits<width>::word_t m[16],
    const std::array<typename traits<width>::word_t,4> s,
    uint64_t t
) {
    using word_t = typename traits<width>::word_t;
    const auto &pi = traits<width>::pi;

    // split the counter into a low and a high *word*. for 32 bit words this
    // yields the two halves of `t`; for 64 bit words the low word holds all
    // of `t` and the high word is zero.
    //
    // the high word is produced by two half-width shifts so that we never
    // shift a 64 bit value by 64 bits, which would be undefined.
    constexpr unsigned half = sizeof (word_t) * 8 / 2;
    const word_t t0 = static_cast<word_t> (t);
    const word_t t1 = static_cast<word_t> ((t >> half) >> half);

    // working state: the chaining value on top, with the salt and counter
    // (each masked by a constant) underneath.
    word_t v[16] = {
        h[0], h[1], h[2], h[3],
        h[4], h[5], h[6], h[7],

        s[0] ^ pi[0],
        s[1] ^ pi[1],
        s[2] ^ pi[2],
        s[3] ^ pi[3],

        t0 ^ pi[4],
        t0 ^ pi[5],
        t1 ^ pi[6],
        t1 ^ pi[7],
    };

    for (int r = 0; r < traits<width>::rounds; ++r) {
        // treat v as a 4x4 matrix: mix each column...
        G<width> (0, r, v[ 0], v[ 4], v[ 8], v[12], m);
        G<width> (1, r, v[ 1], v[ 5], v[ 9], v[13], m);
        G<width> (2, r, v[ 2], v[ 6], v[10], v[14], m);
        G<width> (3, r, v[ 3], v[ 7], v[11], v[15], m);

        // ...then each (wrapping) diagonal
        G<width> (4, r, v[ 0], v[ 5], v[10], v[15], m);
        G<width> (5, r, v[ 1], v[ 6], v[11], v[12], m);
        G<width> (6, r, v[ 2], v[ 7], v[ 8], v[13], m);
        G<width> (7, r, v[ 3], v[ 4], v[ 9], v[14], m);
    }

    // fold both halves of the working state, and the salt, back into the
    // chaining value.
    for (int i = 0; i < 8; ++i)
        h[i] = h[i] ^ s[i % 4] ^ v[i] ^ v[8 + i];

    return h;
}


///////////////////////////////////////////////////////////////////////////////
// computes the digest of `data`, mixing `salt` into every compression.
//
// data: the message bytes.
// salt: four salt words passed unchanged to each call of compress.
//
// returns the final chaining value with every word passed through
// util::hton and copied bytewise into a digest_t.
template <int width>
typename blake<width>::digest_t
blake<width>::operator() (
    util::view<const uint8_t *> data,
    const std::array<typename traits<width>::word_t, 4> salt
) const noexcept {
    auto h = traits<width>::iv;

    // bounce the message data through d08/dw so we can perform endian
    // conversion.
    //
    // however: this should probably be done in the compression function
    // instead, because it may be possible to optimise that implementation
    // more than simple calls to hton would allow.
    union {
        word_t dw[16];
        uint8_t d08[16*sizeof(word_t)];
    };

    // running count of message bits handed to the compression function
    uint64_t t = 0;
    auto cursor = data.cbegin ();

    // perform the simple case where we're consuming whole blocks.
    //
    // the remaining length is compared at full size_t width: narrowing it
    // to `unsigned` could wrap for very large inputs, end this loop early,
    // and then overflow the bounce buffer in the tail copy below.
    for (auto last = data.cend ();
         static_cast<std::size_t> (last - cursor) >= sizeof (dw);
         cursor += sizeof (dw))
    {
        // NOTE(review): BLOCK_SIZE is declared outside this file; the tail
        // handling below counts bits, so this presumably is the block
        // length in bits -- confirm against the header.
        t += BLOCK_SIZE;
        std::memcpy (d08, cursor, sizeof (d08));
        std::transform (
            std::cbegin (dw),
            std::cend   (dw),
            std::begin  (dw),
            util::ntoh<word_t>
        );
        h = compress<width> (h, dw, salt, t);
    }

    // perform the message padding.
    //
    // * drain the buffer. this is guaranteed to fit into the bounce buffer.
    // * always append a 1 bit
    // * append enough 0 bits to reach the byte before the length field
    // * set the last bit of that byte to 1
    // * append the bit counter as the final two words of the block
    // * hash again
    //
    // if we need more space for padding then rehash
    // if at any point no message bits contributed then pass a zero counter
    {
        auto tail = std::copy (cursor, data.cend (), d08);
        t += static_cast<uint64_t> (data.cend () - cursor) * 8;
        *tail = 0x80;
        bool empty = cursor == data.cend ();

        // the length field is the last two words of the block, so its size
        // follows the word size (8 bytes for 32 bit words, 16 bytes for 64
        // bit words). the closing 1 bit lives in the byte just before it.
        constexpr std::size_t length_bytes = 2 * sizeof (word_t);
        const auto last = std::end (d08) - length_bytes - 1;

        // we're _just_ within the space limits. set the closing bit in place.
        if (tail == last) {
            *tail++ |= 0x01;
        // we're going to overflow
        } else if (tail > last) {
            // finish this block with zeros and hash it; it still carries
            // message bits so it gets the real counter.
            std::fill (tail + 1, std::end (d08), 0);
            std::transform (
                std::cbegin (dw),
                std::cend   (dw),
                std::begin  (dw),
                util::ntoh<word_t>
            );
            h = compress<width> (h, dw, salt, t);

            // the final block is now pure padding
            empty = true;
            tail = last;
            std::fill (std::begin (d08), tail, 0);
            *tail++ = 0x01;
        // the simple case of appending zeros and a one
        } else {
            std::fill (tail+1, last, 0);
            tail = last;
            *tail++ = 0x01;
        }

        // store the counter, high word first, directly in host order. the
        // high word is produced with two half-width shifts so that 64 bit
        // words (where it is always zero) never shift by the full width.
        constexpr unsigned half = sizeof (word_t) * 8 / 2;
        dw[14] = static_cast<word_t> ((t >> half) >> half);
        dw[15] = static_cast<word_t> (t);

        // only the first fourteen words came from bytes and need conversion
        std::transform (
            std::cbegin (dw),
            std::cend   (dw) - 2,
            std::begin  (dw),
            util::ntoh<word_t>
        );
        h = compress<width> (h, dw, salt, empty ? 0 : t);
    }

    std::transform (std::cbegin (h), std::cend (h), std::begin (h), util::hton<word_t>);
    digest_t d;
    std::memcpy (d.data (), h.data (), sizeof (d));
    return d;
}


///////////////////////////////////////////////////////////////////////////////
// blake<width>::operator() is defined in this translation unit, so the two
// supported widths are explicitly instantiated here.
template class cruft::crypto::hash::blake<256>;
template class cruft::crypto::hash::blake<512>;