// Simd NEON specific implementations -*- C++ -*-

// Copyright (C) 2020-2024 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_

#if __cplusplus >= 201703L

#if !_GLIBCXX_SIMD_HAVE_NEON
#error "simd_neon.h may only be included when NEON on ARM is available"
#endif

_GLIBCXX_SIMD_BEGIN_NAMESPACE

// _CommonImplNeon {{{
struct _CommonImplNeon : _CommonImplBuiltin
{
  // _S_store {{{
  using _CommonImplBuiltin::_S_store;

  // }}}
};

// }}}
// _SimdImplNeon {{{
template <typename _Abi, typename>
  struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_max_store_size = 16;

    // _S_masked_load {{{
    template <typename _Tp, size_t _Np, typename _Up>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                     const _Up* __mem) noexcept
      {
        __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
          if (__k[__i] != 0)
            __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
        });
        return __merge;
      }

    // }}}
    // _S_masked_store_nocvt {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _MaskMember<_Tp> __k)
      {
        __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
          if (__k[__i] != 0)
            __mem[__i] = __v[__i];
        });
      }

    // }}}
    // _S_reduce {{{
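    // NEON has no horizontal reduction for an arbitrary __binary_op, so a
    // 16-byte input is split into two 8-byte halves that are combined with
    // __binary_op, and an 8-byte input is folded in log2(_Np) steps, each
    // combining the vector with a lane permutation of itself, until lane 0
    // holds the result.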
    template <typename _Tp, typename _BinaryOperation>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
      _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
      {
        if (not __builtin_is_constant_evaluated())
          {
            constexpr size_t _Np = __x.size();
            if constexpr (sizeof(__x) == 16 && _Np >= 4
                            && !_Abi::template _S_is_partial<_Tp>)
              {
                const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
                const auto __y = __binary_op(__halves[0], __halves[1]);
                return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
                         __y, static_cast<_BinaryOperation&&>(__binary_op));
              }
            else if constexpr (_Np == 8)
              {
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                         __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(
                                           __x._M_data)));
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                         __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(
                                           __x._M_data)));
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                         __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(
                                           __x._M_data)));
                return __x[0];
              }
            else if constexpr (_Np == 4)
              {
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                         __vector_permute<1, 0, 3, 2>(__x._M_data)));
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                         __vector_permute<3, 2, 1, 0>(__x._M_data)));
                return __x[0];
              }
            else if constexpr (_Np == 2)
              {
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                         __vector_permute<1, 0>(__x._M_data)));
                return __x[0];
              }
          }
        return _Base::_S_reduce(__x, static_cast<_BinaryOperation&&>(__binary_op));
      }

    // }}}
    // math {{{
    // _S_sqrt {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_sqrt(_Tp __x)
      {
        if constexpr (__have_neon_a64)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vsqrt_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vsqrtq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vsqrt_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vsqrtq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_sqrt(__x);
      }

    // }}}
    // _S_trunc {{{
    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
      _GLIBCXX_SIMD_INTRINSIC static _TW
      _S_trunc(_TW __x)
      {
        using _Tp = typename _TVT::value_type;
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrnd_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrnd_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (is_same_v<_Tp, float>)
          {
            // float -> int -> float drops the fractional part; floats with
            // |x| >= 2^23 are already integral (and could overflow the int
            // conversion), so those lanes keep the unmodified input
            auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(__x) == 16)
              __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
            else
              __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
            return _Base::_S_abs(__x)._M_data < 0x1p23f
                     ? __vector_bitcast<float>(__intrin) : __x._M_data;
          }
        else
          return _Base::_S_trunc(__x);
      }

    // }}}
    // _S_round {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_round(_SimdWrapper<_Tp, _Np> __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
              return vrnda_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
              return vrndaq_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
              return vrnda_f64(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
              return vrndaq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_round(__x);
      }

    // }}}
    // _S_floor {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_floor(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndm_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndmq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndm_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndmq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_floor(__x);
      }

    // }}}
    // _S_ceil {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_ceil(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndp_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndpq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndp_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndpq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_ceil(__x);
      }

    //}}} }}}
  };

// }}}
// _MaskImplNeonMixin {{{
struct _MaskImplNeonMixin
{
  using _Base = _MaskImplBuiltinMixin;
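  // Convert a vector mask (lanes of all-zeros/all-ones) into a bitmask: AND
  // every lane with its bit value (1 << lane index, wrapping to the low byte
  // for 16 8-bit lanes), then sum all lanes horizontally. AArch64 can use
  // vaddv/vpaddq across the full 128-bit register; 32-bit ARM builds the sum
  // from vpadd over the low and high 64-bit halves.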
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
    _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
    {
      if (__builtin_is_constant_evaluated())
        return _Base::_S_to_bits(__x);

      using _I = __int_for_sizeof_t<_Tp>;
      if constexpr (sizeof(__x) == 16)
        {
          auto __asint = __vector_bitcast<_I>(__x);
#ifdef __aarch64__
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
#else
          [[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
#endif
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(
                               __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return __vector_bitcast<_UShort>(
                       vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
                                 __zero))[0];
#else
              return __vector_bitcast<_UShort>(
                       vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
                                         __zero),
                                __zero))[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddvq_s16(__asint);
#else
              return vpadd_s16(
                       vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
                       __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddvq_s32(__asint);
#else
              return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
                               __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 8)
            return (__asint[0] & 1) | (__asint[1] & 2);
          else
            __assert_unreachable<_Tp>();
        }
      else if constexpr (sizeof(__x) == 8)
        {
          auto __asint = __vector_bitcast<_I>(__x);
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddv_s8(__asint);
#else
              return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
                              __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddv_s16(__asint);
#else
              return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              __asint &= __make_vector<_I>(0x1, 0x2);
#ifdef __aarch64__
              return vaddv_s32(__asint);
#else
              return vpadd_s32(__asint, __zero)[0];
#endif
            }
          else
            __assert_unreachable<_Tp>();
        }
      else
        return _Base::_S_to_bits(__x);
    }
};

// }}}
// _MaskImplNeon {{{
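// The reductions below avoid a full bitmask conversion: the mask is viewed as
// chars, lanes outside a partial mask are forced to a known value (all-ones
// via | ~_S_implicit_mask, or zero via _S_masked), and the result is then
// tested as one or two 64-bit integers.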
template <typename _Abi, typename>
  struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
  {
    using _MaskImplBuiltinMixin::_S_to_maskvector;
    using _MaskImplNeonMixin::_S_to_bits;
    using _Base = _MaskImplBuiltin<_Abi>;
    using _Base::_S_convert;

    // _S_all_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_all_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
              | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return __x[0] + __x[1] == -2;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_any_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_any_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
              | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) != 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_none_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_none_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk = _Abi::_S_masked(__k._M_data);
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) == 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_some_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(__k) <= 8)
          {
            const auto __kk = __vector_bitcast<char>(__k._M_data)
                                | ~__vector_bitcast<char>(
                                    _Abi::template _S_implicit_mask<_Tp>());
            using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
            // all-zeros (+1 == 1) and all-ones (+1 wraps to 0) are the only
            // values that fail the test, i.e. "some but not all" lanes set
            return __bit_cast<_Up>(__kk) + 1 > 1;
          }
        else
          return _Base::_S_some_of(__k);
      }

    // }}}
    // _S_popcount {{{
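    // A true lane reads as -1 in the signed integer view, so summing all
    // lanes (after folding the high half into the low half) and negating the
    // total yields the popcount without a bitmask round-trip.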
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_popcount(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(_Tp) == 1)
          {
            const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
            int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
            return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
                             int8x8_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 2)
          {
            const auto __s16 = __vector_bitcast<short>(__k._M_data);
            int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
            return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 4)
          {
            const auto __s32 = __vector_bitcast<int>(__k._M_data);
            int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
            return -vpadd_s32(__tmp, int32x2_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 8)
          {
            static_assert(sizeof(__k) == 16);
            const auto __s64 = __vector_bitcast<long>(__k._M_data);
            return -(__s64[0] + __s64[1]);
          }
      }

    // }}}
    // _S_find_first_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_first_set(__k);
      }

    // }}}
    // _S_find_last_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_last_set(__k);
      }

    // }}}
  };

// }}}

_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_

// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80