97 lines
2.8 KiB
C++
97 lines
2.8 KiB
C++
|
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Copyright 2018 Danny Robson <danny@nerdcruft.net>
 */
|
||
|
|
||
|
#ifndef __ARM_NEON__
|
||
|
#error
|
||
|
#endif
|
||
|
|
||
|
#include "arm_neon.h"
|
||
|
|
||
|
namespace util::coord {
|
||
|
/// Wrapper around a NEON `float32x4_t` (four 32-bit float lanes).
///
/// Converts implicitly to and from the raw vector type so that instances can
/// be passed straight to NEON intrinsics and built from their results.
struct alignas (16) simd {
    using value_type = float32x4_t;

    ///////////////////////////////////////////////////////////////////////
    /// Builds the vector lane by lane: `a` goes to lane 0, `b` to lane 1,
    /// `c` to lane 2 and `d` to lane 3.
    simd (float a, float b, float c, float d)
    {
        // The SSE-style `_mm_setr_ps` does not exist for NEON; stage the
        // lanes in memory order and load all four with a single vld1q.
        const float lanes[4] = { a, b, c, d };
        data = vld1q_f32 (lanes);
    }


    //---------------------------------------------------------------------
    /// Broadcasts `v` into all four lanes.
    simd (float v):
        data (vdupq_n_f32 (v))
    { ; }


    //---------------------------------------------------------------------
    /// Wraps an existing raw NEON vector without modifying it.
    simd (value_type _data):
        data (_data)
    { ; }


    //---------------------------------------------------------------------
    /// Implicit access to the underlying vector, so a `simd` can be used
    /// wherever an intrinsic expects a `float32x4_t`.
    operator       value_type& ()       { return data; }
    operator const value_type& () const { return data; }

    /// Declared only; the definition is not part of this struct body.
    explicit operator bool () const;

    /// Returns the value of lane `idx`. The index is not range-checked.
    /// Subscripting the raw vector relies on the compiler's vector-type
    /// indexing support.
    float operator[] (int idx) const { return data[idx]; }


    ///////////////////////////////////////////////////////////////////////
    value_type data;
};
|
||
|
|
||
|
|
||
|
///////////////////////////////////////////////////////////////////////////
|
||
|
// Lane-wise arithmetic: each operator forwards both operands to the matching
// NEON intrinsic and wraps the resulting vector back into a `simd`.
//
// The definitions are `inline` so that including this file from more than one
// translation unit does not produce duplicate symbols at link time.
//
// NOTE(review): `vdivq_f32` is, as far as the intrinsics reference goes, only
// provided for AArch64 -- confirm the intended target set.
inline simd operator* (simd a, simd b) { return vmulq_f32 (a, b); }
inline simd operator/ (simd a, simd b) { return vdivq_f32 (a, b); }
inline simd operator+ (simd a, simd b) { return vaddq_f32 (a, b); }
inline simd operator- (simd a, simd b) { return vsubq_f32 (a, b); }
|
||
|
|
||
|
// Comparison operators. These are declarations only; their definitions are
// not visible in this file. The `simd` return type suggests a per-lane mask
// rather than a single boolean -- TODO confirm against the definitions.
auto operator<  (simd a, simd b) -> simd;
auto operator<= (simd a, simd b) -> simd;
auto operator>  (simd a, simd b) -> simd;
auto operator>= (simd a, simd b) -> simd;
auto operator== (simd a, simd b) -> simd;

// Declaration only. Presumably chooses each lane from `a` or `b` according to
// `mask` -- verify which operand a set mask lane selects.
auto select (simd mask, simd a, simd b) -> simd;
|
||
|
|
||
|
|
||
|
/// Horizontal add of the four lanes of `val`.
///
/// Returns a raw `float32x4_t` (the deduced return type) in which every lane
/// holds lane0 + lane1 + lane2 + lane3 of the input.
///
/// Defined `inline` so that including this file from several translation
/// units does not produce duplicate symbols.
inline auto sum (simd val)
{
    // vrev64q swaps the two floats inside each 64-bit half (0123 -> 1032),
    // so the add yields the pairwise sums: (0+1, 1+0, 2+3, 3+2)
    auto revq = vrev64q_f32 (val);
    auto pair = vaddq_f32 (val, revq);

    // exchange the upper and lower halves: (2+3, 3+2, 0+1, 1+0)
    auto shuf = vcombine_f32 (
        vget_high_f32 (pair),
        vget_low_f32  (pair)
    );

    // add both partial sums; every lane now holds 0+1+2+3
    return vaddq_f32 (shuf, pair);
}
|
||
|
|
||
|
/// Four-lane dot product of `a` and `b`.
///
/// Multiplies the operands lane by lane and reduces the products with
/// `sum`, so the result is whatever `sum` yields for `a * b`, wrapped in a
/// `simd`.
///
/// Defined `inline` so that including this file from several translation
/// units does not produce duplicate symbols.
inline simd
dot (simd a, simd b)
{
    return sum (a * b);
}
|
||
|
}
|
||
|
// (no trailing #endif: no preprocessor conditional is still open at this
// point, so there is nothing here for one to close)
|