libcruft-util/coord/simd_sse.hpp

248 lines
6.4 KiB
C++

/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Copyright 2018 Danny Robson <danny@nerdcruft.net>
*/
#ifndef CRUFT_UTIL_COORD_SIMD_SSE_HPP
#define CRUFT_UTIL_COORD_SIMD_SSE_HPP
#ifndef __SSE2__
#error "SSE2 is required"
#endif
#include <xmmintrin.h>
#include <pmmintrin.h>
#include <immintrin.h>
#include <array>
#include <iostream>
namespace util::coord {
///////////////////////////////////////////////////////////////////////////
// Placeholder namespace for implementation helpers; it is empty in this
// backend.
namespace detail {
}
// Byte alignment constant for this backend. Presumably mirrors the
// `alignas (16)` on `simd` (the size of one SSE register) -- TODO confirm
// against the users of this constant.
constexpr int alignment = 16;
/// Thin value wrapper around a single `__m128`: four packed single-precision
/// floats. Converts implicitly to and from the raw register type so it can
/// be handed straight to SSE intrinsics.
struct alignas (16) simd {
    /// Construct from four individual lanes; `a` is stored in lane 0 and
    /// `d` in lane 3.
    simd (float a, float b, float c, float d):
        data (_mm_setr_ps (a, b, c, d))
    {}

    /// Broadcast one scalar into all four lanes.
    simd (float v):
        data (_mm_set1_ps (v))
    {}

    /// Wrap an existing register value.
    simd (__m128 _data):
        data (_data)
    {}

    /// Access to the underlying register, mutable and read-only.
    operator __m128& ()
    {
        return data;
    }

    operator const __m128& () const
    {
        return data;
    }

    /// Truth test; declared here, defined out of line.
    explicit operator bool () const;

    /// Read lane `idx`. No bounds checking is performed. Relies on the
    /// compiler supporting subscripting of `__m128`.
    float operator[] (int idx) const
    {
        return data[idx];
    }

    /// The wrapped register.
    __m128 data;
};
///////////////////////////////////////////////////////////////////////////
// Lane-wise arithmetic on the four packed floats.
//
// These are `inline` because they are defined in a header: without it,
// including this file from more than one translation unit produces
// multiple-definition errors at link time.
inline simd operator+ (simd a, simd b) { return _mm_add_ps (a, b); }
inline simd operator- (simd a, simd b) { return _mm_sub_ps (a, b); }
inline simd operator/ (simd a, simd b) { return _mm_div_ps (a, b); }
inline simd operator* (simd a, simd b) { return _mm_mul_ps (a, b); }
//-------------------------------------------------------------------------
// Lane-wise comparisons. The result is a mask, not a bool: each lane is
// all-ones where the predicate holds and all-zeros where it does not, which
// is the form `select`, `all`, and the bitwise operators consume.
//
// `inline` is required because these definitions live in a header.
inline simd operator<  (simd a, simd b) { return _mm_cmplt_ps (a, b); }
inline simd operator<= (simd a, simd b) { return _mm_cmple_ps (a, b); }
inline simd operator>  (simd a, simd b) { return _mm_cmpgt_ps (a, b); }
inline simd operator>= (simd a, simd b) { return _mm_cmpge_ps (a, b); }
inline simd operator== (simd a, simd b) { return _mm_cmpeq_ps (a, b); }
//-------------------------------------------------------------------------
// Bitwise OR / AND over the raw 128 bits of both operands.
//
// `inline` is required because these definitions live in a header.
inline simd operator| (simd a, simd b) { return _mm_or_ps  (a, b); }
inline simd operator& (simd a, simd b) { return _mm_and_ps (a, b); }
///////////////////////////////////////////////////////////////////////////
/// Per-lane choice between two vectors.
///
/// For each lane, yields the lane of `a` where `mask` is set and the lane of
/// `b` where it is clear. `mask` is expected to be a comparison result (each
/// lane all-ones or all-zeros): the SSE4.1 path inspects only the sign bit
/// of each lane while the fallback uses every bit, so the two paths agree
/// only for such masks.
inline simd
select (simd mask, simd a, simd b)
{
#if defined(__SSE4_1__)
    // blendv returns its *second* operand where the mask's sign bit is set,
    // so `a` must be passed second to match the bitwise fallback below.
    return _mm_blendv_ps (b, a, mask);
#else
    return _mm_or_ps (
        _mm_and_ps    (mask, a),
        _mm_andnot_ps (mask, b)
    );
#endif
}
//-------------------------------------------------------------------------
/// True when the sign bit of every one of the four lanes is set. For a
/// comparison mask this means the comparison held in all lanes.
///
/// `inline` is required because this definition lives in a header.
inline bool
all (simd val)
{
    // movemask packs the four lane sign bits into the low four bits.
    return _mm_movemask_ps (val) == 0b1111;
}
//-------------------------------------------------------------------------
auto
clamp (simd val, simd lo, simd hi)
{
auto lo_mask = val > lo;
auto hi_mask = val < hi;
auto res = (lo_mask & val)
}
///////////////////////////////////////////////////////////////////////////
// Lane-wise minimum and maximum, using the native SSE instructions.
//
// The operand order is chosen so each function behaves as if written with a
// single `a < b` comparison: `min` yields `a` where `a < b` and `b`
// otherwise; `max` yields `b` where `a < b` and `a` otherwise. That fixes
// the result for NaN lanes and for +0/-0 ties (the comparison is false, so
// `min` yields `b` and `max` yields `a`).
//
// `inline` is required because these definitions live in a header.
inline simd min (simd a, simd b) { return _mm_min_ps (a, b); }
inline simd max (simd a, simd b) { return _mm_max_ps (b, a); }
///////////////////////////////////////////////////////////////////////////
#if defined (__SSE3__)
/// Horizontal sum of the four lanes. The total is replicated into every
/// lane of the result.
inline simd
sum (simd a)
{
    // first pass:  (a0+a1, a2+a3, a0+a1, a2+a3)
    auto part = _mm_hadd_ps (a, a);
    // second pass: a0+a1+a2+a3 in all four lanes
    return _mm_hadd_ps (part, part);
}
#else
/// Horizontal sum of the four lanes, returned as a scalar `float`.
///
/// NOTE(review): the SSE3 variant returns a `simd` with the total in every
/// lane while this variant returns `float`. Code that must build under both
/// configurations can only rely on the result being convertible to `simd`.
inline auto
sum (simd vals)
{
    // swap neighbouring lanes (listed lane 3 .. lane 0)
    // vals: 3 2 1 0
    // shuf: 2 3 0 1
    auto shuf = _mm_shuffle_ps (vals, vals, _MM_SHUFFLE(2, 3, 0, 1));

    // add each lane to its neighbour
    // sums: 3+2 2+3 1+0 0+1
    auto sums = _mm_add_ps (vals, shuf);

    // bring the upper pair of sums down into the lower two lanes
    // shuf: xxx xxx 3+2 2+3
    shuf = _mm_movehl_ps (shuf, sums);

    // lane 0 now holds (0+1) + (2+3); the other lanes are not meaningful
    sums = _mm_add_ss (sums, shuf);

    return _mm_cvtss_f32 (sums);
}
#endif
///////////////////////////////////////////////////////////////////////////
#if defined(__SSE4_1__)
/// Four-component dot product of `a` and `b`, replicated into every lane of
/// the result.
inline simd
dot (simd a, simd b)
{
    // 0xff: the high nibble multiplies all four lane pairs, the low nibble
    // writes the sum to all four output lanes.
    return _mm_dp_ps (a, b, 0xff);
}
#elif defined(__SSE3__)
/// Four-component dot product of `a` and `b`: the lane-wise product reduced
/// with `sum`.
inline simd
dot (simd a, simd b)
{
    return sum (a * b);
}
#else
/// Four-component dot product of `a` and `b`: the lane-wise product reduced
/// with `sum`. The return type is whatever `sum` returns in this
/// configuration.
inline auto
dot (simd a, simd b)
{
    auto mul = a * b;
    return sum (mul);
}
#endif
///////////////////////////////////////////////////////////////////////////
// Lane-wise square root, and lane-wise *approximate* reciprocal square root
// (`_mm_rsqrt_ps` trades accuracy for speed; see the intrinsic's
// documentation for its error bound).
//
// `inline` is required because these definitions live in a header.
inline simd sqrt  (simd a) { return _mm_sqrt_ps  (a); }
inline simd rsqrt (simd a) { return _mm_rsqrt_ps (a); }
///////////////////////////////////////////////////////////////////////////
/// Squared Euclidean length of `a` over all four lanes, i.e. `dot (a, a)`,
/// converted to `simd`.
///
/// `inline` is required because this definition lives in a header.
inline simd
norm2 (simd a)
{
    return dot (a, a);
}
//-------------------------------------------------------------------------
/// Euclidean length of `a` over all four lanes: the square root of
/// `norm2 (a)`.
///
/// `inline` is required because this definition lives in a header.
inline simd
norm (simd a)
{
    return sqrt (norm2 (a));
}
//-------------------------------------------------------------------------
/// Scale `a` to unit length.
///
/// Multiplies by the reciprocal square root of the *squared* length, so the
/// result is `a / |a|`. Taking `rsqrt` of the length itself would divide by
/// `sqrt (|a|)` instead and would not produce a unit vector.
///
/// `rsqrt` is an approximation, so the result is only approximately unit
/// length. A zero-length input is not handled specially.
inline simd
normalised (simd a)
{
    return a * rsqrt (norm2 (a));
}
///////////////////////////////////////////////////////////////////////////
/// Lane-wise absolute value, computed by clearing the sign bit of each lane.
///
/// The return type is deduced and is the raw `__m128`, which converts
/// implicitly to `simd`.
///
/// `inline` is required because this definition lives in a header.
inline auto
abs (simd a)
{
    // -0.0f has only the sign bit set; and-not with it keeps every other bit.
    const __m128 sign_bit = _mm_set1_ps (-0.0f);
    return _mm_andnot_ps (sign_bit, a);
}
///////////////////////////////////////////////////////////////////////////
/// Square root of the sum of the squared lanes of `a`.
///
/// The same quantity as `norm`, but reduced through `sum` rather than `dot`.
///
/// `inline` is required because this definition lives in a header.
inline simd
hypot (simd a)
{
    return sqrt (sum (a * a));
}
///////////////////////////////////////////////////////////////////////////
/// A `simd` is truthy when `all` holds for it, i.e. when the sign bit of
/// every lane is set.
///
/// `inline` is required because this out-of-class member definition lives in
/// a header.
inline simd::operator bool() const
{
    return all (data);
}
}
#endif