From 1fe20ed87890f47b74d1a3a58452ca6d1a424892 Mon Sep 17 00:00:00 2001 From: Jordan Peck Date: Thu, 3 Nov 2022 00:28:00 +0000 Subject: [PATCH] Neon tweaks and formatting --- CMakeLists.txt | 21 ++-- src/FastSIMD/Internal/NEON.h | 199 ++++++++++++++++------------------- 2 files changed, 102 insertions(+), 118 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c1e0fb6..f107329d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,28 +4,27 @@ cmake_minimum_required(VERSION 3.7.1) project(FastNoise2 VERSION 0.9.6) set(CMAKE_CXX_STANDARD 17) +message("FastNoise2 Arch: ${CMAKE_SYSTEM_PROCESSOR}") +if(CMAKE_SYSTEM_PROCESSOR MATCHES armv7) -if(CMAKE_SYSTEM_PROCESSOR STREQUAL armv7-a) - -set(FASTSIMD_COMPILE_ARMV7 true) -set(FASTSIMD_COMPILE_ARM true) -set(FASTSIMD_COMPILE_HAVE_NEON true) + set(FASTSIMD_COMPILE_ARMV7 true) + set(FASTSIMD_COMPILE_ARM true) + set(FASTSIMD_COMPILE_HAVE_NEON true) elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64) -set(FASTSIMD_COMPILE_AARCH64 true) -set(FASTSIMD_COMPILE_ARM true) -set(FASTSIMD_COMPILE_HAVE_NEON true) + set(FASTSIMD_COMPILE_AARCH64 true) + set(FASTSIMD_COMPILE_ARM true) + set(FASTSIMD_COMPILE_HAVE_NEON true) -elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL arm) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES arm) -set(FASTSIMD_COMPILE_ARM true) + set(FASTSIMD_COMPILE_ARM true) endif() - # determine whether this is a standalone project or included by other projects if (NOT DEFINED FASTNOISE2_STANDALONE_PROJECT) if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) diff --git a/src/FastSIMD/Internal/NEON.h b/src/FastSIMD/Internal/NEON.h index ad614128..532f0614 100644 --- a/src/FastSIMD/Internal/NEON.h +++ b/src/FastSIMD/Internal/NEON.h @@ -320,8 +320,11 @@ namespace FastSIMD { return vcvtq_f32_s32( a ); } - + FS_INLINE static int32v Convertf32_i32( float32v a ) + { + return vcvtq_s32_f32( Round_f32(a) ); + } // Comparisons @@ -440,135 +443,120 @@ namespace FastSIMD FS_INLINE static int32v Abs_i32( int32v a ) { return vabsq_s32( a ); - } - - - - + } FS_INLINE static float32v InvSqrt_f32( float32v a ) { return vrsqrteq_f32( a ); - } - + } - // Floor, Ceil, Round: http://dss.stephanierct.com/DevBlog/?p=8 + // Floor, Ceil, Round: - - #ifdef FASTSIMD_USE_ARMV7 - - FS_INLINE static float32v IntFloor_f32(float32v a) - { - static const float32x4_t cmpval = vcvtq_f32_s32( vdupq_n_s32( 0x7FFFFFFF ) ); - +#ifdef FASTSIMD_USE_ARMV7 + FS_INLINE static float32v IntFloor_f32(float32v a) + { + static const float32x4_t cmpval = vcvtq_f32_s32( vdupq_n_s32( 0x7FFFFFFF ) ); - uint32x4_t cmp1 = vcagtq_f32( a, cmpval ); - uint32x4_t cmp2 = vcaleq_f32( a, cmpval ); + uint32x4_t cmp1 = vcagtq_f32( a, cmpval ); + uint32x4_t cmp2 = vcaleq_f32( a, cmpval ); - float32x4_t tr = vcvtq_f32_s32( vcvtq_s32_f32( a ) ); + float32x4_t tr = vcvtq_f32_s32( vcvtq_s32_f32( a ) ); - uint32x4_t xcmp1 = vandq_u32(cmp1, vreinterpretq_u32_f32( a ) ); - uint32x4_t xcmp2 = vandq_u32(cmp2, vreinterpretq_u32_f32( tr ) ); + uint32x4_t xcmp1 = vandq_u32(cmp1, vreinterpretq_u32_f32( a ) ); + uint32x4_t xcmp2 = vandq_u32(cmp2, vreinterpretq_u32_f32( tr ) ); - uint32x4_t res0 = vorrq_u32( xcmp1, xcmp2 ); + uint32x4_t res0 = vorrq_u32( xcmp1, xcmp2 ); - float32x4_t res1 = vreinterpretq_f32_u32( res0 ); - - return res1; - } - - FS_INLINE static float32v Floor_f32(float32v a) - { - static const float32x4_t zerox = vdupq_n_f32( 0 ); + float32x4_t res1 = vreinterpretq_f32_u32( res0 ); + + return res1; + } + + FS_INLINE static float32v Floor_f32(float32v a) + { + static const float32x4_t zerox = vdupq_n_f32( 0 ); - float32x4_t ifl = IntFloor_f32(a); + float32x4_t ifl = IntFloor_f32(a); - uint32x4_t cond1 = vmvnq_u32(vceqq_f32(a, ifl)); - uint32x4_t cond2 = vcltq_f32(a, zerox); + uint32x4_t cond1 = vmvnq_u32(vceqq_f32(a, ifl)); + uint32x4_t cond2 = vcltq_f32(a, zerox); - uint32x4_t cmpmask = vandq_u32(cond1, cond2); - float32x4_t addx = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) ); + uint32x4_t cmpmask = vandq_u32(cond1, cond2); + float32x4_t addx = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) ); - float32x4_t ret0 = vaddq_f32(ifl, addx); + float32x4_t ret0 = vaddq_f32(ifl, addx); - return ret0; - } + return ret0; + } - FS_INLINE static float32v Ceil_f32(float32v a) - { - static const float32x4_t zerox = vdupq_n_f32( 0 ); + FS_INLINE static float32v Ceil_f32(float32v a) + { + static const float32x4_t zerox = vdupq_n_f32( 0 ); - float32x4_t ifl = IntFloor_f32(a); - - uint32x4_t cond1 = vmvnq_u32(vceqq_f32(a, ifl)); - uint32x4_t cond2 = vcgeq_f32(a, zerox); + float32x4_t ifl = IntFloor_f32(a); + + uint32x4_t cond1 = vmvnq_u32(vceqq_f32(a, ifl)); + uint32x4_t cond2 = vcgeq_f32(a, zerox); - uint32x4_t cmpmask = vandq_u32(cond1, cond2); - float32x4_t addx = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) ); - + uint32x4_t cmpmask = vandq_u32(cond1, cond2); + float32x4_t addx = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) ); + - float32x4_t ret0 = vsubq_f32(ifl, addx); + float32x4_t ret0 = vsubq_f32(ifl, addx); - return ret0; - } - FS_INLINE static float32v Round_f32(float32v a) - { - static const float32x4_t zerox = vdupq_n_f32( 0 ); - static const float32x4_t halfx = vdupq_n_f32( 0.5f ); - static const float32x4_t onex = vdupq_n_f32( 1.0f ); - - float32x4_t a2 = vaddq_f32(vabsq_f32(a), halfx); - float32x4_t ifl = IntFloor_f32(a2); - - - - uint32x4_t cmpmask = vcltq_f32(a, zerox); - float32x4_t rhs = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) ); - float32x4_t rhs2 = vaddq_f32(vmulq_n_f32(rhs, 2.0f), onex); - + return ret0; + } - return vmulq_f32(ifl, rhs2); - } - - FS_INLINE static float32v Sqrt_f32( float32v a ) - { - return Reciprocal_f32(InvSqrt_f32(a)); - } - FS_INLINE static int32v Convertf32_i32( float32v a ) - { - return vcvtq_s32_f32( Round_f32(a) ); - } - #else - FS_INLINE static float32v Floor_f32( float32v a ) - { - return vrndmq_f32( a ); - } - FS_INLINE static float32v Ceil_f32( float32v a ) - { - return vrndpq_f32( a ); - } - FS_INLINE static float32v Round_f32( float32v a ) - { - return vrndnq_f32( a ); - } - FS_INLINE static float32v Sqrt_f32( float32v a ) - { - return vsqrtq_f32( a ); - } - FS_INLINE static int32v Convertf32_i32( float32v a ) - { - return vcvtq_s32_f32( vrndnq_f32(a) ); - } - #endif + FS_INLINE static float32v Round_f32(float32v a) + { + static const float32x4_t zerox = vdupq_n_f32( 0 ); + static const float32x4_t halfx = vdupq_n_f32( 0.5f ); + static const float32x4_t onex = vdupq_n_f32( 1.0f ); + float32x4_t a2 = vaddq_f32(vabsq_f32(a), halfx); + float32x4_t ifl = IntFloor_f32(a2); + + uint32x4_t cmpmask = vcltq_f32(a, zerox); + float32x4_t rhs = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) ); + float32x4_t rhs2 = vaddq_f32(vmulq_n_f32(rhs, 2.0f), onex); + + return vmulq_f32(ifl, rhs2); + } + FS_INLINE static float32v Sqrt_f32( float32v a ) + { + return Reciprocal_f32(InvSqrt_f32(a)); + } + #else + FS_INLINE static float32v Floor_f32( float32v a ) + { + return vrndmq_f32( a ); + } + + FS_INLINE static float32v Ceil_f32( float32v a ) + { + return vrndpq_f32( a ); + } + + FS_INLINE static float32v Round_f32( float32v a ) + { + return vrndnq_f32( a ); + } + + FS_INLINE static float32v Sqrt_f32( float32v a ) + { + return vsqrtq_f32( a ); + } + #endif + // Mask FS_INLINE static int32v Mask_i32( int32v a, mask32v m ) { return a & m; } + FS_INLINE static int32v NMask_i32( int32v a, mask32v m ) { return BitwiseAndNot_i32(a, m); @@ -578,6 +566,7 @@ namespace FastSIMD { return BitwiseAnd_f32( a, vreinterpretq_f32_s32( m ) ); } + FS_INLINE static float32v NMask_f32( float32v a, mask32v m ) { return BitwiseAndNot_f32( a, vreinterpretq_f32_s32( m ) ); @@ -587,21 +576,17 @@ namespace FastSIMD { return vgetq_lane_f32(a, 0); } + FS_INLINE static int32_t Extract0_i32( int32v a ) { return vgetq_lane_s32(a, 0); } + FS_INLINE static float32v Reciprocal_f32( float32v a ) - { - -// float32x4_t reciprocal = vrecpeq_f32( a ); -// reciprocal = vmulq_f32( vrecpsq_f32( a, reciprocal ), reciprocal ); -// reciprocal = vmulq_f32( vrecpsq_f32( a, reciprocal ), reciprocal ); -// return reciprocal; - - + { return vrecpeq_f32( a ); } + FS_INLINE static float32v BitwiseShiftRightZX_f32( float32v a, int32_t b ) { int32x4_t rhs2 = vdupq_n_s32( -b );