// libcruft-util/coord/simd_neon.hpp

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Copyright 2018 Danny Robson <danny@nerdcruft.net>
 */

#ifndef __ARM_NEON__
#error
#endif

#include "arm_neon.h"

namespace util::coord {
    /// A thin wrapper around the NEON `float32x4_t` type: four packed
    /// single-precision lanes.
    ///
    /// Construction from, and conversion to, `value_type` are implicit, so
    /// instances can be handed straight to NEON intrinsics and intrinsic
    /// results can be returned as `simd`.
    struct alignas (16) simd {
        using value_type = float32x4_t;

        ///////////////////////////////////////////////////////////////////////
        /// Construct from four scalars; `a` occupies lane 0 and `d` lane 3.
        simd (float a, float b, float c, float d)
        {
            // NEON has no direct equivalent of SSE's `_mm_setr_ps`, so stage
            // the values in memory order and load all four lanes at once.
            const float lanes[4] = { a, b, c, d };
            data = vld1q_f32 (lanes);
        }


        //---------------------------------------------------------------------
        /// Broadcast a single scalar to every lane.
        simd (float v):
            data (vdupq_n_f32 (v))
        { ; }


        //---------------------------------------------------------------------
        /// Wrap an existing native vector.
        simd (value_type _data):
            data (_data)
        { ; }


        //---------------------------------------------------------------------
        /// Access the underlying native vector.
        operator       value_type& ()       { return data; }
        operator const value_type& () const { return data; }

        /// Boolean test of the vector. Only declared here; no definition is
        /// visible in this header.
        /// NOTE(review): confirm a definition exists (and what it tests)
        /// before relying on this.
        explicit operator bool () const;

        /// Read lane `idx`. This relies on the compiler's vector-extension
        /// subscript for `float32x4_t`; `idx` is not range checked.
        float operator[] (int idx) const { return data[idx]; }


        ///////////////////////////////////////////////////////////////////////
        value_type data;
    };


    ///////////////////////////////////////////////////////////////////////////
    /// Lane-wise arithmetic on four packed floats.
    ///
    /// These are defined in a header, so they are marked `inline` to avoid
    /// multiple-definition errors when the header is included from more than
    /// one translation unit.
    inline simd operator* (simd a, simd b) { return vmulq_f32 (a.data, b.data); }

    /// Lane-wise division, `a[i] / b[i]`.
    inline simd operator/ (simd a, simd b)
    {
#if defined (__aarch64__) || defined (_M_ARM64)
        return vdivq_f32 (a.data, b.data);
#else
        // `vdivq_f32` only exists for A64. AArch32 NEON has no vector
        // divide, so divide lane by lane to keep exact IEEE results rather
        // than using a reciprocal estimate.
        float num[4];
        float den[4];
        vst1q_f32 (num, a.data);
        vst1q_f32 (den, b.data);

        for (int i = 0; i < 4; ++i)
            num[i] /= den[i];

        return vld1q_f32 (num);
#endif
    }

    inline simd operator+ (simd a, simd b) { return vaddq_f32 (a.data, b.data); }
    inline simd operator- (simd a, simd b) { return vsubq_f32 (a.data, b.data); }

    // Comparison operators. Only declarations appear here; no definitions
    // are visible in this header.
    // NOTE(review): the `simd` return type suggests a per-lane mask rather
    // than a single bool -- confirm against the definitions.
    simd operator<  (simd a, simd b);
    simd operator<= (simd a, simd b);
    simd operator>  (simd a, simd b);
    simd operator>= (simd a, simd b);
    simd operator== (simd a, simd b);

    // Declaration only.
    // NOTE(review): presumably chooses lanes from `a` or `b` according to
    // `mask` -- confirm the semantics against the definition.
    simd select (simd mask, simd a, simd b);


    /// Horizontal sum of the four lanes of `val`.
    ///
    /// Returns a native vector in which every lane holds
    /// `(val[0] + val[1]) + (val[2] + val[3])`.
    ///
    /// Marked `inline` because it is defined in a header; without it,
    /// including this file from several translation units produces
    /// multiple-definition errors.
    inline auto sum (simd val)
    {
        const float32x4_t lanes = val.data;

        // vrev64q swaps neighbours within each 64-bit half (it is not a full
        // reverse), so adding it to the input gives: 0123 + 1032
        const auto swapped = vrev64q_f32 (lanes);
        const auto pair    = vaddq_f32 (lanes, swapped);

        // exchange the upper and lower halves of the pairwise sums,
        // giving: 2301 + 3210
        const auto shuf = vcombine_f32 (
            vget_high_f32 (pair),
            vget_low_f32  (pair)
        );

        // add both partial sums: (2301 + 3210) + (0123 + 1032)
        return vaddq_f32 (shuf, pair);
    }

    /// Four-lane dot product of `a` and `b`.
    ///
    /// Multiplies the operands lane-wise and passes the products to `sum`,
    /// whose result is returned wrapped as a `simd`.
    ///
    /// Marked `inline` because it is defined in a header; a non-inline
    /// definition would be emitted by every translation unit that includes
    /// this file.
    inline simd
    dot (simd a, simd b)
    {
        return sum (a * b);
    }
}
#endif