From b307ab935d8bfae78f0152602a5a1dbe18d80a49 Mon Sep 17 00:00:00 2001
From: Danny Robson <danny@nerdcruft.net>
Date: Fri, 16 Mar 2018 11:10:44 +1100
Subject: [PATCH] coord/simd: add initial simd tests

---
 CMakeLists.txt      |   3 +
 coord/simd.hpp      |  26 +++++
 coord/simd_sse.hpp  | 247 ++++++++++++++++++++++++++++++++++++++++++++
 test/coord/simd.cpp |  34 ++++++
 4 files changed, 310 insertions(+)
 create mode 100644 coord/simd.hpp
 create mode 100644 coord/simd_sse.hpp
 create mode 100644 test/coord/simd.cpp
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5624bc43..c95d7cc7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -193,6 +193,8 @@ list (
     coord/init.hpp
     coord/iostream.hpp
     coord/ops.hpp
+    coord/simd.hpp
+    coord/simd_sse.hpp
     coord/store.hpp
     coord/traits.hpp
     debug.cpp
@@ -466,6 +468,7 @@ if (TESTS)
         colour
         comparator
         coord
+        coord/simd
         encode/base
         endian
         exe
diff --git a/coord/simd.hpp b/coord/simd.hpp
new file mode 100644
index 00000000..e3ae38dd
--- /dev/null
+++ b/coord/simd.hpp
@@ -0,0 +1,26 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Copyright 2018 Danny Robson <danny@nerdcruft.net>
+ */
+
+#ifndef CRUFT_UTIL_COORD_SIMD_HPP
+#define CRUFT_UTIL_COORD_SIMD_HPP
+
+#ifdef __SSE__
+#include "simd_sse.hpp"
+#else
+#error "Unsupported SIMD architecture"
+#endif
+
+#endif
diff --git a/coord/simd_sse.hpp b/coord/simd_sse.hpp
new file mode 100644
index 00000000..a544ce60
--- /dev/null
+++ b/coord/simd_sse.hpp
@@ -0,0 +1,247 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Copyright 2018 Danny Robson <danny@nerdcruft.net>
+ */
+
+
+#ifndef CRUFT_UTIL_COORD_SIMD_SSE_HPP
+#define CRUFT_UTIL_COORD_SIMD_SSE_HPP
+
+#ifndef __SSE2__
+#error "SSE2 is required"
+#endif
+
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+#include <immintrin.h>
+
+#include <array>
+
+#include <iostream>
+
+namespace util::coord {
+    ///////////////////////////////////////////////////////////////////////////
+    namespace detail {
+
+    }
+
+    constexpr int alignment = 16;
+
+    struct alignas (16) simd {
+        ///////////////////////////////////////////////////////////////////////
+        // build from four lanes; `a` lands in lane 0 (`_mm_setr_ps` order).
+        simd (float a, float b, float c, float d):
+            data (_mm_setr_ps (a, b, c, d))
+        { ; }
+
+        //---------------------------------------------------------------------
+        // broadcast `v` to every lane. implicit, so a float converts silently.
+        simd (float v):
+            data (_mm_set_ps1 (v))
+        { ; }
+
+        //---------------------------------------------------------------------
+        // wrap a raw register. implicit, so intrinsic results convert directly.
+        simd (__m128 _data):
+            data (_data)
+        { ; }
+
+        //---------------------------------------------------------------------
+        // expose the register so a simd can be handed straight to intrinsics.
+        operator       __m128& ()       { return data; }
+        operator const __m128& () const { return data; }
+        // true only if `all` holds for `data`; defined out of line below.
+        explicit operator bool () const;
+        // read one lane. `idx` is not range checked.
+        float operator[] (int idx) const { return data[idx]; }
+
+        ///////////////////////////////////////////////////////////////////////
+        __m128 data;
+    };
+
+    ///////////////////////////////////////////////////////////////////////////
+    // lane-wise arithmetic. `inline` as these are defined in a header.
+    inline simd operator+ (simd a, simd b) { return _mm_add_ps (a, b); }
+    inline simd operator- (simd a, simd b) { return _mm_sub_ps (a, b); }
+    inline simd operator/ (simd a, simd b) { return _mm_div_ps (a, b); }
+    inline simd operator* (simd a, simd b) { return _mm_mul_ps (a, b); }
+
+    //-------------------------------------------------------------------------
+    // lane-wise compare: result lanes are all-ones (true) or zero (false).
+    inline simd operator<  (simd a, simd b) { return _mm_cmplt_ps (a, b); }
+    inline simd operator<= (simd a, simd b) { return _mm_cmple_ps (a, b); }
+    inline simd operator>  (simd a, simd b) { return _mm_cmpgt_ps (a, b); }
+    inline simd operator>= (simd a, simd b) { return _mm_cmpge_ps (a, b); }
+    inline simd operator== (simd a, simd b) { return _mm_cmpeq_ps (a, b); }
+
+    //-------------------------------------------------------------------------
+    // bitwise or/and over the raw lane bits; intended for combining masks.
+    inline simd operator| (simd a, simd b) { return _mm_or_ps (a, b); }
+    inline simd operator& (simd a, simd b) { return _mm_and_ps (a, b); }
+
+
+    ///////////////////////////////////////////////////////////////////////////
+    // lanes where `mask` is set take `a`, every other lane takes `b`. `mask`
+    // should be a comparison result (each lane all-ones or all-zeros).
+    inline simd
+    select (simd mask, simd a, simd b)
+    {
+#if defined(__SSE4_1__)
+        // blendv yields its _second_ operand where the mask sign bit is set
+        return _mm_blendv_ps (b, a, mask);
+#else
+        return _mm_or_ps (_mm_and_ps (mask, a), _mm_andnot_ps (mask, b));
+#endif
+    }
+
+    //-------------------------------------------------------------------------
+    // true when the sign bit of every lane is set, ie. a fully-true mask.
+    inline bool
+    all (simd val)
+    {
+        return _mm_movemask_ps (val) == 0b1111;
+    }
+
+
+    //-------------------------------------------------------------------------
+    // constrain each lane of `val` to the range [lo, hi].
+    //
+    // expects lo <= hi in every lane.
+    inline simd
+    clamp (simd val, simd lo, simd hi)
+    {
+        return _mm_min_ps (_mm_max_ps (val, lo), hi);
+    }
+
+
+    ///////////////////////////////////////////////////////////////////////////
+    // lane-wise minimum/maximum. both use the same `a < b` comparison since
+    // they tend to be called together and the mask may then be shared.
+    inline simd min (simd a, simd b) { return select (a < b, a, b); }
+    inline simd max (simd a, simd b) { return select (a < b, b, a); }
+
+
+    ///////////////////////////////////////////////////////////////////////////
+#if defined (__SSE3__)
+    // horizontal sum of the four lanes; every lane of the result holds it.
+    inline simd
+    sum (simd a)
+    {
+        auto part = _mm_hadd_ps (a, a);
+        return _mm_hadd_ps (part, part);
+    }
+#else
+    // horizontal sum of the four lanes, returned as a scalar float.
+    // NOTE(review): the SSE3 branch returns a simd; the types should agree.
+    inline auto
+    sum (simd vals)
+    {
+        // swap pairs of components
+        // vals: 3 2 1 0
+        // shuf: 2 3 0 1
+        auto shuf = _mm_shuffle_ps (vals, vals, _MM_SHUFFLE(2, 3, 0, 1));
+        // combine the pairs
+        auto sums = _mm_add_ps (vals, shuf);
+        // move the upper pair of sums down, then add it into the lowest lane
+        // sums: 2+3 2+3 1+0 1+0
+        // shuf: xxx xxx 2+3 2+3
+        shuf = _mm_movehl_ps (shuf, sums);
+        sums = _mm_add_ss (sums, shuf);
+        // sums: xxx xxx xxx 0+1+2+3
+        return _mm_cvtss_f32 (sums);
+    }
+#endif
+
+    ///////////////////////////////////////////////////////////////////////////
+    // dot product over all four lanes. the result type follows the branch.
+#if defined(__SSE4_1__)
+    // 0xff: multiply every lane pair and write the sum to every result lane.
+    inline simd
+    dot (simd a, simd b)
+    {
+        return _mm_dp_ps (a, b, 0xff);
+    }
+#elif defined(__SSE3__)
+    inline simd
+    dot (simd a, simd b)
+    {
+        return sum (a * b);
+    }
+#else
+    inline auto
+    dot (simd a, simd b)
+    {
+        return sum (a * b);
+    }
+#endif
+
+    ///////////////////////////////////////////////////////////////////////////
+    // lane-wise square root, and _approximate_ reciprocal square root.
+    inline simd  sqrt (simd a) { return _mm_sqrt_ps (a);  }
+    inline simd rsqrt (simd a) { return _mm_rsqrt_ps (a); }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // squared euclidean length of `a`, computed as dot (a, a).
+    inline simd
+    norm2 (simd a)
+    {
+        return dot (a, a);
+    }
+
+    //-------------------------------------------------------------------------
+    // euclidean length of `a`: the square root of norm2.
+    inline simd
+    norm (simd a)
+    {
+        return sqrt (norm2 (a));
+    }
+
+    //-------------------------------------------------------------------------
+    // scale `a` to unit length: a * 1/sqrt(|a|^2). rsqrt is approximate.
+    inline simd
+    normalised (simd a)
+    {
+        return a * rsqrt (norm2 (a));
+    }
+
+
+    ///////////////////////////////////////////////////////////////////////////
+    inline auto
+    abs (simd a)
+    {
+        // all-ones >> 1 == 0x7fffffff per lane; and-ing clears the sign bits
+        auto bffff = _mm_set1_epi32 (-1);
+        auto b7fff = _mm_srli_epi32 (bffff, 1);
+        auto mask = _mm_castsi128_ps (b7fff);
+        return _mm_and_ps (mask, a);
+    }
+
+
+    ///////////////////////////////////////////////////////////////////////////
+    // square root of the sum of the squared lanes of `a`.
+    inline simd
+    hypot (simd a)
+    {
+        return sqrt (sum (a * a));
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // true only when every lane is set. `inline` as this lives in a header.
+    inline simd::operator bool() const
+    {
+        return all (data);
+    }
+}
+
+#endif
diff --git a/test/coord/simd.cpp b/test/coord/simd.cpp
new file mode 100644
index 00000000..99a3b120
--- /dev/null
+++ b/test/coord/simd.cpp
@@ -0,0 +1,34 @@
+#include "coord/simd.hpp"
+#include "tap.hpp"
+
+
+int
+main ()
+{
+    util::TAP::logger tap;
+    // NOTE(review): needs `dot` to yield a float, ie. the non-SSE3 branch.
+    {
+        const util::coord::simd a (1,2,3,4);
+        const util::coord::simd b (4,1,3,2);
+        const float res = dot (a, b);
+        tap.expect_eq (res, 4+2+9+8, "trivial dot product");
+    }
+    // min/max choose per lane, so neither result equals `a` or `b` outright.
+    {
+        const util::coord::simd a (1, 2, 3, 4);
+        const util::coord::simd b (0, 3, 3, 9);
+
+        const auto lo = min (a, b);
+        const auto hi = max (a, b);
+
+        tap.expect_eq (lo, util::coord::simd {0,2,3,4}, "vector minimum");
+        tap.expect_eq (hi, util::coord::simd {1,3,3,9}, "vector maximum");
+    }
+    // NOTE(review): INFINITY is from <cmath>, which isn't included directly.
+    {
+        const util::coord::simd val { -INFINITY, INFINITY, 0, -9 };
+        tap.expect_eq (abs (val), util::coord::simd {INFINITY,INFINITY,0,9}, "absolute value");
+    }
+
+    return tap.status ();
+}
\ No newline at end of file