libcruft-util/coord/simd_sse.hpp

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Copyright 2018 Danny Robson <danny@nerdcruft.net>
 */


#ifndef CRUFT_UTIL_COORD_SIMD_SSE_HPP
#define CRUFT_UTIL_COORD_SIMD_SSE_HPP

#ifndef __SSE2__
#error "SSE2 is required"
#endif

#include <xmmintrin.h>
#include <pmmintrin.h>
#include <immintrin.h>

#include <array>

#include <iostream>

namespace util::coord {
    ///////////////////////////////////////////////////////////////////////////
    namespace detail {
        // currently empty: no implementation helpers are defined here.
    }

    // Byte alignment used for the `simd` wrapper below. `inline` so that the
    // header contributes a single entity across all translation units.
    inline constexpr int alignment = 16;

    // Thin value wrapper around one `__m128` register (four packed floats).
    //
    // It converts implicitly to and from `__m128` so that it can be handed
    // straight to the `_mm_*` intrinsics, and from a single `float` (which is
    // replicated into every lane).
    struct alignas (alignment) simd {
        ///////////////////////////////////////////////////////////////////////
        // build from four individual components. `a` is the component that
        // `operator[]` reports at index 0, `d` the one at index 3.
        simd (float a, float b, float c, float d):
            data (_mm_setr_ps (a, b, c, d))
        { ; }


        //---------------------------------------------------------------------
        // replicate one scalar into all four lanes. deliberately implicit so
        // that scalars can be mixed into simd expressions.
        simd (float v):
            data (_mm_set_ps1 (v))
        { ; }


        //---------------------------------------------------------------------
        // adopt a raw register value. deliberately implicit so that the
        // results of intrinsics can be returned as `simd` directly.
        simd (__m128 _data):
            data (_data)
        { ; }


        //---------------------------------------------------------------------
        // expose the underlying register so the object can be passed to
        // intrinsics without explicit unwrapping.
        operator       __m128& ()       { return data; }
        operator const __m128& () const { return data; }

        // declared here, defined at the end of this header in terms of `all`.
        explicit operator bool () const;

        // read a single lane. no bounds checking is performed on `idx`.
        //
        // NOTE: relies on the compiler allowing subscripts on `__m128`
        // (a GCC/Clang vector extension).
        float operator[] (int idx) const { return data[idx]; }


        ///////////////////////////////////////////////////////////////////////
        __m128 data;
    };


    ///////////////////////////////////////////////////////////////////////////
    // Lane-wise arithmetic over the four packed floats of each operand.
    //
    // These are `inline` because they are defined in a header; without it
    // every translation unit that includes this file would emit its own
    // external definition and the program would fail to link.
    inline simd operator+ (simd a, simd b) { return _mm_add_ps (a, b); }
    inline simd operator- (simd a, simd b) { return _mm_sub_ps (a, b); }
    inline simd operator/ (simd a, simd b) { return _mm_div_ps (a, b); }
    inline simd operator* (simd a, simd b) { return _mm_mul_ps (a, b); }


    //-------------------------------------------------------------------------
    // Lane-wise comparisons. The result is a mask rather than a bool: it is
    // intended to be consumed by `select`, `all`, or the bitwise operators.
    //
    // `inline` because these are header definitions (see the one-definition
    // rule); otherwise multiple including translation units cannot link.
    inline simd operator<  (simd a, simd b) { return _mm_cmplt_ps (a, b); }
    inline simd operator<= (simd a, simd b) { return _mm_cmple_ps (a, b); }
    inline simd operator>  (simd a, simd b) { return _mm_cmpgt_ps (a, b); }
    inline simd operator>= (simd a, simd b) { return _mm_cmpge_ps (a, b); }
    inline simd operator== (simd a, simd b) { return _mm_cmpeq_ps (a, b); }


    //-------------------------------------------------------------------------
    // Bitwise OR / AND of the raw register contents; primarily useful for
    // combining the masks produced by the comparison operators.
    //
    // `inline` so the header may be included from multiple translation units.
    inline simd operator| (simd a, simd b) { return _mm_or_ps (a, b); }
    inline simd operator& (simd a, simd b) { return _mm_and_ps (a, b); }


    ///////////////////////////////////////////////////////////////////////////
    // Lane-wise choice between two values: lanes where `mask` is set take
    // their value from `a`, the remaining lanes take it from `b`.
    //
    // `mask` is expected to be a comparison result (each lane entirely set
    // or entirely clear). The two implementations only agree for such masks:
    // the SSE4.1 path inspects just the top bit of each lane, the fallback
    // operates on every bit.
    //
    // `inline` so the header may be included from multiple translation units.
    inline simd
    select (simd mask, simd a, simd b)
    {
#if defined(__SSE4_1__)
        // blendv yields its _second_ operand where the mask is set, so `a`
        // must be passed second to match the fallback path below.
        return _mm_blendv_ps (b, a, mask);
#else
        return _mm_or_ps (
            _mm_and_ps    (mask, a),
            _mm_andnot_ps (mask, b)
        );
#endif
    }


    //-------------------------------------------------------------------------
    // Returns true when the top (sign) bit of every one of the four lanes of
    // `val` is set; ie, when every lane of a comparison mask is true.
    //
    // `inline` so the header may be included from multiple translation units.
    inline bool
    all (simd val)
    {
        // movemask packs the four per-lane top bits into the low four bits
        return _mm_movemask_ps (val) == 0b1111;
    }


    //-------------------------------------------------------------------------
    auto
    clamp (simd val, simd lo, simd hi)
    {
        auto lo_mask = val > lo;
        auto hi_mask = val < hi;

        auto res = (lo_mask & val)
    }


    ///////////////////////////////////////////////////////////////////////////
    // Lane-wise minimum and maximum.
    //
    // Each maps to a single hardware instruction. The operand order is
    // chosen so that the per-lane results are exactly those of
    //     min: (a < b) ? a : b
    //     max: (a < b) ? b : a
    // including the lanes where the comparison is false because an operand
    // is NaN (`min` yields `b`, `max` yields `a`).
    //
    // `inline` so the header may be included from multiple translation units.
    inline simd min (simd a, simd b) { return _mm_min_ps (a, b); }
    inline simd max (simd a, simd b) { return _mm_max_ps (b, a); }


    ///////////////////////////////////////////////////////////////////////////
#if defined (__SSE3__)
    // Horizontal sum of the four lanes of `a`.
    //
    // Every lane of the returned value holds the total.
    //
    // `inline` so the header may be included from multiple translation units.
    inline simd
    sum (simd a)
    {
        // first pass adds adjacent pairs, second pass adds the pair sums
        auto part = _mm_hadd_ps (a, a);
        return _mm_hadd_ps (part, part);
    }
#else
    // Horizontal sum of the four lanes of `vals`.
    //
    // NOTE: this branch returns a scalar `float`, whereas the SSE3 branch
    // returns a `simd`. Code that must build under both configurations
    // should store the result in a `simd` (which broadcasts the scalar).
    //
    // `inline` so the header may be included from multiple translation units.
    inline auto
    sum (simd vals)
    {
        // swap pairs of components
        // vals: 3 2 1 0
        // shuf: 2 3 0 1
        auto shuf = _mm_shuffle_ps (vals, vals, _MM_SHUFFLE(2, 3, 0, 1));

        // combine the pairs
        auto sums = _mm_add_ps (vals, shuf);

        // move the upper pair of sums down into the lower lanes, then add
        // the lowest lane to the original sums
        // sums: 2+3 2+3 1+0 1+0
        // shuf: xxx xxx 2+3 2+3
        shuf = _mm_movehl_ps (shuf, sums);
        sums = _mm_add_ss (sums, shuf);

        // only the lowest lane now holds the complete total
        // sums: xxx xxx xxx 0+1+2+3
        return _mm_cvtss_f32 (sums);
    }
#endif


    ///////////////////////////////////////////////////////////////////////////
#if defined(__SSE4_1__)
    // Four component dot product of `a` and `b`.
    //
    // The 0xff control byte includes all four lane products in the sum and
    // writes the total to all four lanes of the result.
    //
    // `inline` so the header may be included from multiple translation units.
    inline simd
    dot (simd a, simd b)
    {
        return _mm_dp_ps (a, b, 0xff);
    }
#elif defined(__SSE3__)
    // Four component dot product of `a` and `b`: the horizontal sum of the
    // lane-wise products.
    //
    // `inline` so the header may be included from multiple translation units.
    inline simd
    dot (simd a, simd b)
    {
        return sum (a * b);
    }
#else
    // Four component dot product of `a` and `b`: the horizontal sum of the
    // lane-wise products.
    //
    // The return type follows whatever `sum` returns in this configuration.
    //
    // `inline` so the header may be included from multiple translation units.
    inline auto
    dot (simd a, simd b)
    {
        auto mul = a * b;
        return sum (mul);
    }
#endif


    ///////////////////////////////////////////////////////////////////////////
    // Lane-wise square root, and lane-wise reciprocal square root.
    //
    // `rsqrt` uses the hardware estimate instruction, so its result is an
    // approximation of 1/sqrt(x) rather than a correctly rounded value.
    //
    // `inline` so the header may be included from multiple translation units.
    inline simd  sqrt (simd a) { return _mm_sqrt_ps (a);  }
    inline simd rsqrt (simd a) { return _mm_rsqrt_ps (a); }


    ///////////////////////////////////////////////////////////////////////////
    // Squared length of `a`, computed as the dot product of `a` with itself.
    //
    // `inline` so the header may be included from multiple translation units.
    inline simd
    norm2 (simd a)
    {
        return dot (a, a);
    }


    //-------------------------------------------------------------------------
    // Length of `a`: the square root of its squared length.
    //
    // `inline` so the header may be included from multiple translation units.
    inline simd
    norm (simd a)
    {
        return sqrt (norm2 (a));
    }


    //-------------------------------------------------------------------------
    // Returns `a` scaled to (approximately) unit length.
    //
    // The scale factor is 1/sqrt(|a|^2), so the reciprocal square root must
    // be taken of the _squared_ length (`norm2`), not of the length itself.
    //
    // `rsqrt` is a hardware estimate, so the result is only approximately
    // unit length. No special handling is performed for a zero length input.
    //
    // `inline` so the header may be included from multiple translation units.
    inline simd
    normalised (simd a)
    {
        return a * rsqrt (norm2 (a));
    }


    ///////////////////////////////////////////////////////////////////////////
    // Lane-wise absolute value, computed by clearing the top (sign) bit of
    // each lane.
    //
    // The return type is deduced as the raw `__m128`, which converts
    // implicitly to `simd`.
    //
    // `inline` so the header may be included from multiple translation units.
    inline auto
    abs (simd a)
    {
        // build 0x7fffffff in every lane: all bits set, then shift a zero
        // into the top bit.
        auto bffff = _mm_set1_epi32 (-1);
        auto b7fff = _mm_srli_epi32 (bffff, 1);
        auto mask = _mm_castsi128_ps (b7fff);

        return _mm_and_ps (mask, a);
    }


    ///////////////////////////////////////////////////////////////////////////
    // Square root of the sum of the squares of the four lanes of `a`.
    //
    // `inline` so the header may be included from multiple translation units.
    inline simd
    hypot (simd a)
    {
        return sqrt (sum (a * a));
    }


    ///////////////////////////////////////////////////////////////////////////
    // A simd value is 'true' only when `all` reports true for it, ie. when
    // every lane of the mask is set.
    //
    // Defined out of class because it needs `all`, and therefore must be
    // explicitly `inline` to be safely included from multiple translation
    // units.
    inline simd::operator bool() const
    {
        return all (data);
    }
}

#endif