/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Copyright 2018 Danny Robson
 */

#ifndef CRUFT_UTIL_COORD_SIMD_SSE_HPP
#define CRUFT_UTIL_COORD_SIMD_SSE_HPP

#ifndef __SSE2__
#error "SSE2 is required"
#endif

// SSE and SSE2 are mandatory (see the check above); the later extensions are
// only pulled in when the compiler has them enabled so that the header keeps
// building on toolchains that reject those headers otherwise.
#include <xmmintrin.h>
#include <emmintrin.h>

#if defined(__SSE3__)
#include <pmmintrin.h>
#endif

#if defined(__SSE4_1__)
#include <smmintrin.h>
#endif


namespace util::coord {
    ///////////////////////////////////////////////////////////////////////////
    namespace detail { }


    /// Byte alignment required by the underlying 128 bit register type.
    inline constexpr int alignment = 16;


    ///////////////////////////////////////////////////////////////////////////
    /// A thin value wrapper around a 4-lane single precision SSE register.
    ///
    /// Converts implicitly to and from `__m128` so that it can be handed
    /// directly to the `_mm_*` intrinsics.
    struct alignas (alignment) simd {
        ///////////////////////////////////////////////////////////////////////
        /// Construct from four lanes; `a` is stored in lane 0, `d` in lane 3.
        simd (float a, float b, float c, float d):
            data (_mm_setr_ps (a, b, c, d))
        { ; }


        //---------------------------------------------------------------------
        /// Broadcast a single value into all four lanes.
        simd (float v):
            data (_mm_set_ps1 (v))
        { ; }


        //---------------------------------------------------------------------
        /// Wrap an existing register value.
        simd (__m128 _data):
            data (_data)
        { ; }


        //---------------------------------------------------------------------
        operator       __m128& ()       { return data; }
        operator const __m128& () const { return data; }

        /// True when every lane has its sign bit set (ie, `all (*this)`).
        /// Intended for use on the masks produced by the comparison operators.
        explicit operator bool () const;

        /// Read a single lane. `idx` is not range checked.
        ///
        /// Relies on the GCC/Clang extension that allows subscripting of
        /// vector types.
        float operator[] (int idx) const { return data[idx]; }


        ///////////////////////////////////////////////////////////////////////
        __m128 data;
    };


    ///////////////////////////////////////////////////////////////////////////
    // Lane-wise arithmetic.
    //
    // Every free function in this header is `inline`: they are defined in a
    // header and would otherwise violate the one-definition rule as soon as
    // two translation units include it.
    inline simd operator+ (simd a, simd b) { return _mm_add_ps (a, b); }
    inline simd operator- (simd a, simd b) { return _mm_sub_ps (a, b); }
    inline simd operator/ (simd a, simd b) { return _mm_div_ps (a, b); }
    inline simd operator* (simd a, simd b) { return _mm_mul_ps (a, b); }


    //-------------------------------------------------------------------------
    // Lane-wise comparisons. Each lane of the result is all-ones when the
    // comparison holds and all-zeroes otherwise.
    inline simd operator<  (simd a, simd b) { return _mm_cmplt_ps (a, b); }
    inline simd operator<= (simd a, simd b) { return _mm_cmple_ps (a, b); }
    inline simd operator>  (simd a, simd b) { return _mm_cmpgt_ps (a, b); }
    inline simd operator>= (simd a, simd b) { return _mm_cmpge_ps (a, b); }
    inline simd operator== (simd a, simd b) { return _mm_cmpeq_ps (a, b); }


    //-------------------------------------------------------------------------
    // Bitwise combination, primarily useful for masks.
    inline simd operator| (simd a, simd b) { return _mm_or_ps  (a, b); }
    inline simd operator& (simd a, simd b) { return _mm_and_ps (a, b); }


    ///////////////////////////////////////////////////////////////////////////
    /// Lane-wise choice: yields the lane from `a` where `mask` is set and the
    /// lane from `b` where it is clear.
    ///
    /// `mask` is expected to be a comparison result (all-ones or all-zeroes
    /// per lane). The SSE4.1 path only inspects the sign bit of each lane
    /// while the fallback uses every bit, so other masks may differ between
    /// the two implementations.
    inline simd
    select (simd mask, simd a, simd b)
    {
#if defined(__SSE4_1__)
        // blendv takes its _second_ operand where the mask is set, so `a`
        // must be passed second to agree with the fallback below.
        return _mm_blendv_ps (b, a, mask);
#else
        return _mm_or_ps (
            _mm_and_ps    (mask, a),
            _mm_andnot_ps (mask, b)
        );
#endif
    }


    //-------------------------------------------------------------------------
    /// True when the sign bit of every lane is set.
    inline bool
    all (simd val)
    {
        return _mm_movemask_ps (val) == 0b1111;
    }


    //-------------------------------------------------------------------------
    /// Lane-wise clamp of `val` to the range [lo, hi].
    ///
    /// Lanes that compare false against both bounds (including NaN) are
    /// passed through untouched. The result is unspecified when `lo > hi`.
    inline simd
    clamp (simd val, simd lo, simd hi)
    {
        return select (val < lo, lo, select (val > hi, hi, val));
    }


    ///////////////////////////////////////////////////////////////////////////
    // use the same comparator in both because we're likely to use min
    // and max near each other and the mask might be sharable this way.

    /// Lane-wise minimum. Yields the lane of `b` when the comparison is false.
    inline simd min (simd a, simd b) { return select (a < b, a, b); }

    /// Lane-wise maximum. Yields the lane of `a` when the comparison is false.
    inline simd max (simd a, simd b) { return select (a < b, b, a); }


    ///////////////////////////////////////////////////////////////////////////
    // Horizontal sum of all four lanes.
    //
    // NOTE: the return type depends on the instruction set. With SSE3 the sum
    // is broadcast into every lane of a `simd`; without it a scalar `float`
    // is returned (which converts implicitly to a broadcast `simd`).
#if defined (__SSE3__)
    inline simd
    sum (simd a)
    {
        auto part = _mm_hadd_ps (a, a);
        return _mm_hadd_ps (part, part);
    }
#else
    inline auto
    sum (simd vals)
    {
        // swap pairs of components
        // vals: 3 2 1 0
        // shuf: 2 3 0 1
        auto shuf = _mm_shuffle_ps (vals, vals, _MM_SHUFFLE(2, 3, 0, 1));

        // combine the pairs
        // sums: 3+2 2+3 1+0 0+1
        auto sums = _mm_add_ps (vals, shuf);

        // copy the upper components of sums down, then add the lowest lane
        // to the lowest lane of the original sums
        // shuf: xxx xxx 3+2 2+3
        shuf = _mm_movehl_ps (shuf, sums);

        // sums: xxx xxx xxx 0+1+2+3
        sums = _mm_add_ss (sums, shuf);

        return _mm_cvtss_f32 (sums);
    }
#endif


    ///////////////////////////////////////////////////////////////////////////
    // Four component dot product. The return type follows `sum` when SSE4.1
    // is unavailable.
#if defined(__SSE4_1__)
    inline simd
    dot (simd a, simd b)
    {
        // 0xff: multiply all four lanes, broadcast the result to all lanes
        return _mm_dp_ps (a, b, 0xff);
    }
#else
    inline auto
    dot (simd a, simd b)
    {
        return sum (a * b);
    }
#endif


    ///////////////////////////////////////////////////////////////////////////
    /// Lane-wise square root.
    inline simd sqrt  (simd a) { return _mm_sqrt_ps  (a); }

    /// Lane-wise _approximate_ reciprocal square root.
    inline simd rsqrt (simd a) { return _mm_rsqrt_ps (a); }


    ///////////////////////////////////////////////////////////////////////////
    /// Squared euclidean length, broadcast to every lane.
    inline simd
    norm2 (simd a)
    {
        return dot (a, a);
    }


    //-------------------------------------------------------------------------
    /// Euclidean length, broadcast to every lane.
    inline simd
    norm (simd a)
    {
        return sqrt (norm2 (a));
    }


    //-------------------------------------------------------------------------
    /// Scale `a` to (approximately) unit length.
    ///
    /// The reciprocal square root must be taken of the _squared_ length;
    /// applying it to the length itself would scale by 1/sqrt(|a|).
    /// Precision is limited by the approximation used in `rsqrt`.
    inline simd
    normalised (simd a)
    {
        return a * rsqrt (norm2 (a));
    }


    ///////////////////////////////////////////////////////////////////////////
    /// Lane-wise absolute value, computed by clearing the sign bit.
    inline auto
    abs (simd a)
    {
        // build 0x7fffffff in every lane without needing a memory constant
        auto bffff = _mm_set1_epi32 (-1);
        auto b7fff = _mm_srli_epi32 (bffff, 1);
        auto mask  = _mm_castsi128_ps (b7fff);

        return _mm_and_ps (mask, a);
    }


    ///////////////////////////////////////////////////////////////////////////
    /// Square root of the sum of squares of the lanes.
    inline simd
    hypot (simd a)
    {
        return sqrt (sum (a * a));
    }


    ///////////////////////////////////////////////////////////////////////////
    inline simd::operator bool() const
    {
        return all (data);
    }
}

#endif