From aed3bbb525469b02632a91919430c1ee3a13f63c Mon Sep 17 00:00:00 2001 From: mx989 Date: Tue, 1 Nov 2022 01:25:35 +0100 Subject: [PATCH] Add rounding before int cast --- src/FastSIMD/Internal/NEON.h | 42 ++++++++++++++++++++++++++++-------- tests/SIMDUnitTest.cpp | 4 +++- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/src/FastSIMD/Internal/NEON.h b/src/FastSIMD/Internal/NEON.h index e1e7b085..ad614128 100644 --- a/src/FastSIMD/Internal/NEON.h +++ b/src/FastSIMD/Internal/NEON.h @@ -321,10 +321,7 @@ namespace FastSIMD return vcvtq_f32_s32( a ); } - FS_INLINE static int32v Convertf32_i32( float32v a ) - { - return vcvtq_s32_f32( a ); - } + // Comparisons @@ -479,28 +476,36 @@ namespace FastSIMD return res1; } - + FS_INLINE static float32v Floor_f32(float32v a) { static const float32x4_t zerox = vdupq_n_f32( 0 ); float32x4_t ifl = IntFloor_f32(a); - uint32x4_t cmpmask = vcltq_f32(a, zerox); + uint32x4_t cond1 = vmvnq_u32(vceqq_f32(a, ifl)); + uint32x4_t cond2 = vcltq_f32(a, zerox); + + uint32x4_t cmpmask = vandq_u32(cond1, cond2); float32x4_t addx = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) ); float32x4_t ret0 = vaddq_f32(ifl, addx); return ret0; } + FS_INLINE static float32v Ceil_f32(float32v a) { static const float32x4_t zerox = vdupq_n_f32( 0 ); float32x4_t ifl = IntFloor_f32(a); + + uint32x4_t cond1 = vmvnq_u32(vceqq_f32(a, ifl)); + uint32x4_t cond2 = vcgeq_f32(a, zerox); - uint32x4_t cmpmask = vcgeq_f32(a, zerox); + uint32x4_t cmpmask = vandq_u32(cond1, cond2); float32x4_t addx = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) ); + float32x4_t ret0 = vsubq_f32(ifl, addx); @@ -508,16 +513,31 @@ namespace FastSIMD } FS_INLINE static float32v Round_f32(float32v a) { + static const float32x4_t zerox = vdupq_n_f32( 0 ); static const float32x4_t halfx = vdupq_n_f32( 0.5f ); + static const float32x4_t onex = vdupq_n_f32( 1.0f ); + + float32x4_t a2 = vaddq_f32(vabsq_f32(a), halfx); + float32x4_t ifl = IntFloor_f32(a2); + + + + uint32x4_t cmpmask = vcltq_f32(a, zerox); + float32x4_t rhs = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) ); + float32x4_t rhs2 = vaddq_f32(vmulq_n_f32(rhs, 2.0f), onex); + - return Floor_f32( vaddq_f32( a, halfx ) ); + return vmulq_f32(ifl, rhs2); } FS_INLINE static float32v Sqrt_f32( float32v a ) { return Reciprocal_f32(InvSqrt_f32(a)); } - + FS_INLINE static int32v Convertf32_i32( float32v a ) + { + return vcvtq_s32_f32( Round_f32(a) ); + } #else FS_INLINE static float32v Floor_f32( float32v a ) { @@ -535,6 +555,10 @@ namespace FastSIMD { return vsqrtq_f32( a ); } + FS_INLINE static int32v Convertf32_i32( float32v a ) + { + return vcvtq_s32_f32( vrndnq_f32(a) ); + } #endif diff --git a/tests/SIMDUnitTest.cpp b/tests/SIMDUnitTest.cpp index e39e807c..b6f897b6 100644 --- a/tests/SIMDUnitTest.cpp +++ b/tests/SIMDUnitTest.cpp @@ -40,14 +40,16 @@ struct SIMDClassContainer }; typedef SIMDClassContainer< - FastSIMD::Scalar, + FastSIMD::Scalar #if FASTSIMD_x86 + , FastSIMD::SSE2, FastSIMD::SSE41, FastSIMD::AVX2, FastSIMD::AVX512 #endif #if FASTSIMD_ARM + , FastSIMD::NEON #endif >