Skip to content

Commit

Permalink
Neon tweaks and formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
Auburn committed Nov 3, 2022
1 parent aed3bbb commit 1fe20ed
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 118 deletions.
21 changes: 10 additions & 11 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,27 @@ cmake_minimum_required(VERSION 3.7.1)
project(FastNoise2 VERSION 0.9.6)
set(CMAKE_CXX_STANDARD 17)

message("FastNoise2 Arch: ${CMAKE_SYSTEM_PROCESSOR}")

if(CMAKE_SYSTEM_PROCESSOR MATCHES armv7)

if(CMAKE_SYSTEM_PROCESSOR STREQUAL armv7-a)

set(FASTSIMD_COMPILE_ARMV7 true)
set(FASTSIMD_COMPILE_ARM true)
set(FASTSIMD_COMPILE_HAVE_NEON true)
set(FASTSIMD_COMPILE_ARMV7 true)
set(FASTSIMD_COMPILE_ARM true)
set(FASTSIMD_COMPILE_HAVE_NEON true)

elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64)

set(FASTSIMD_COMPILE_AARCH64 true)
set(FASTSIMD_COMPILE_ARM true)
set(FASTSIMD_COMPILE_HAVE_NEON true)
set(FASTSIMD_COMPILE_AARCH64 true)
set(FASTSIMD_COMPILE_ARM true)
set(FASTSIMD_COMPILE_HAVE_NEON true)

elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL arm)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES arm)

set(FASTSIMD_COMPILE_ARM true)
set(FASTSIMD_COMPILE_ARM true)

endif()



# determine whether this is a standalone project or included by other projects
if (NOT DEFINED FASTNOISE2_STANDALONE_PROJECT)
if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
Expand Down
199 changes: 92 additions & 107 deletions src/FastSIMD/Internal/NEON.h
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,11 @@ namespace FastSIMD
{
return vcvtq_f32_s32( a );
}


// Converts each float lane to a signed 32-bit integer lane.
// The input is first passed through Round_f32, so the rounding behaviour is
// whatever Round_f32 implements for the active target; vcvtq_s32_f32 then
// performs the float -> int conversion on the already-rounded value.
FS_INLINE static int32v Convertf32_i32( float32v a )
{
    float32v rounded = Round_f32( a );
    return vcvtq_s32_f32( rounded );
}

// Comparisons

Expand Down Expand Up @@ -440,135 +443,120 @@ namespace FastSIMD
// Lane-wise absolute value of a signed 32-bit integer vector (vabsq_s32).
FS_INLINE static int32v Abs_i32( int32v a )
{
    int32v magnitude = vabsq_s32( a );
    return magnitude;
}




}

// Approximate 1 / sqrt(a) per lane using the NEON reciprocal square-root
// estimate intrinsic (vrsqrteq_f32). No refinement step is applied here, so
// the result carries only the precision of the hardware estimate.
FS_INLINE static float32v InvSqrt_f32( float32v a )
{
    float32v estimate = vrsqrteq_f32( a );
    return estimate;
}

}

// Floor, Ceil, Round: http://dss.stephanierct.com/DevBlog/?p=8
// Floor, Ceil, Round:


#ifdef FASTSIMD_USE_ARMV7

FS_INLINE static float32v IntFloor_f32(float32v a)
{
static const float32x4_t cmpval = vcvtq_f32_s32( vdupq_n_s32( 0x7FFFFFFF ) );

#ifdef FASTSIMD_USE_ARMV7
FS_INLINE static float32v IntFloor_f32(float32v a)
{
static const float32x4_t cmpval = vcvtq_f32_s32( vdupq_n_s32( 0x7FFFFFFF ) );

uint32x4_t cmp1 = vcagtq_f32( a, cmpval );
uint32x4_t cmp2 = vcaleq_f32( a, cmpval );
uint32x4_t cmp1 = vcagtq_f32( a, cmpval );
uint32x4_t cmp2 = vcaleq_f32( a, cmpval );

float32x4_t tr = vcvtq_f32_s32( vcvtq_s32_f32( a ) );
float32x4_t tr = vcvtq_f32_s32( vcvtq_s32_f32( a ) );

uint32x4_t xcmp1 = vandq_u32(cmp1, vreinterpretq_u32_f32( a ) );
uint32x4_t xcmp2 = vandq_u32(cmp2, vreinterpretq_u32_f32( tr ) );
uint32x4_t xcmp1 = vandq_u32(cmp1, vreinterpretq_u32_f32( a ) );
uint32x4_t xcmp2 = vandq_u32(cmp2, vreinterpretq_u32_f32( tr ) );

uint32x4_t res0 = vorrq_u32( xcmp1, xcmp2 );
uint32x4_t res0 = vorrq_u32( xcmp1, xcmp2 );

float32x4_t res1 = vreinterpretq_f32_u32( res0 );
return res1;
}
FS_INLINE static float32v Floor_f32(float32v a)
{
static const float32x4_t zerox = vdupq_n_f32( 0 );
float32x4_t res1 = vreinterpretq_f32_u32( res0 );

return res1;
}

FS_INLINE static float32v Floor_f32(float32v a)
{
static const float32x4_t zerox = vdupq_n_f32( 0 );

float32x4_t ifl = IntFloor_f32(a);
float32x4_t ifl = IntFloor_f32(a);

uint32x4_t cond1 = vmvnq_u32(vceqq_f32(a, ifl));
uint32x4_t cond2 = vcltq_f32(a, zerox);
uint32x4_t cond1 = vmvnq_u32(vceqq_f32(a, ifl));
uint32x4_t cond2 = vcltq_f32(a, zerox);

uint32x4_t cmpmask = vandq_u32(cond1, cond2);
float32x4_t addx = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) );
uint32x4_t cmpmask = vandq_u32(cond1, cond2);
float32x4_t addx = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) );

float32x4_t ret0 = vaddq_f32(ifl, addx);
float32x4_t ret0 = vaddq_f32(ifl, addx);

return ret0;
}
return ret0;
}

FS_INLINE static float32v Ceil_f32(float32v a)
{
static const float32x4_t zerox = vdupq_n_f32( 0 );
FS_INLINE static float32v Ceil_f32(float32v a)
{
static const float32x4_t zerox = vdupq_n_f32( 0 );

float32x4_t ifl = IntFloor_f32(a);
uint32x4_t cond1 = vmvnq_u32(vceqq_f32(a, ifl));
uint32x4_t cond2 = vcgeq_f32(a, zerox);
float32x4_t ifl = IntFloor_f32(a);

uint32x4_t cond1 = vmvnq_u32(vceqq_f32(a, ifl));
uint32x4_t cond2 = vcgeq_f32(a, zerox);

uint32x4_t cmpmask = vandq_u32(cond1, cond2);
float32x4_t addx = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) );
uint32x4_t cmpmask = vandq_u32(cond1, cond2);
float32x4_t addx = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) );


float32x4_t ret0 = vsubq_f32(ifl, addx);
float32x4_t ret0 = vsubq_f32(ifl, addx);

return ret0;
}
FS_INLINE static float32v Round_f32(float32v a)
{
static const float32x4_t zerox = vdupq_n_f32( 0 );
static const float32x4_t halfx = vdupq_n_f32( 0.5f );
static const float32x4_t onex = vdupq_n_f32( 1.0f );

float32x4_t a2 = vaddq_f32(vabsq_f32(a), halfx);
float32x4_t ifl = IntFloor_f32(a2);



uint32x4_t cmpmask = vcltq_f32(a, zerox);
float32x4_t rhs = vcvtq_f32_s32( vreinterpretq_s32_u32(cmpmask) );
float32x4_t rhs2 = vaddq_f32(vmulq_n_f32(rhs, 2.0f), onex);

return ret0;
}

return vmulq_f32(ifl, rhs2);
}

// Square root for the FASTSIMD_USE_ARMV7 branch: computed as the reciprocal
// of the inverse square root, i.e. Reciprocal_f32( InvSqrt_f32( a ) ).
FS_INLINE static float32v Sqrt_f32( float32v a )
{
    float32v invRoot = InvSqrt_f32( a );
    return Reciprocal_f32( invRoot );
}
// float -> int32 conversion for the FASTSIMD_USE_ARMV7 branch.
// Lanes are rounded with Round_f32 before vcvtq_s32_f32 converts them.
FS_INLINE static int32v Convertf32_i32( float32v a )
{
    float32v rounded = Round_f32( a );
    return vcvtq_s32_f32( rounded );
}
#else
// Lane-wise floor via vrndmq_f32 (round toward minus infinity).
FS_INLINE static float32v Floor_f32( float32v a )
{
    float32v lowered = vrndmq_f32( a );
    return lowered;
}
// Lane-wise ceiling via vrndpq_f32 (round toward plus infinity).
FS_INLINE static float32v Ceil_f32( float32v a )
{
    float32v raised = vrndpq_f32( a );
    return raised;
}
// Lane-wise round-to-nearest via vrndnq_f32 (ties go to the even value).
FS_INLINE static float32v Round_f32( float32v a )
{
    float32v nearest = vrndnq_f32( a );
    return nearest;
}
// Lane-wise square root using the vsqrtq_f32 intrinsic.
FS_INLINE static float32v Sqrt_f32( float32v a )
{
    float32v root = vsqrtq_f32( a );
    return root;
}
// float -> int32 conversion for the non-ARMv7 branch: each lane is rounded
// with vrndnq_f32 and the rounded value is then converted by vcvtq_s32_f32.
FS_INLINE static int32v Convertf32_i32( float32v a )
{
    float32x4_t nearest = vrndnq_f32( a );
    return vcvtq_s32_f32( nearest );
}
#endif
// Rounds each lane to a whole number on the FASTSIMD_USE_ARMV7 branch.
//
// Method: add 0.5 to the magnitude |a|, truncate it with IntFloor_f32, then
// multiply the input's sign back in. Exact halves therefore move away from
// zero (2.5 -> 3, -2.5 -> -3).
// NOTE(review): the non-ARMv7 branch uses vrndnq_f32 instead, so tie handling
// may differ between the two targets - confirm whether that matters to callers.
//
// The lane constants are plain locals rather than function-local statics: a
// static local initialised by a call may need a thread-safe "already
// initialised?" guard check on every invocation, which is wasted work inside
// an inlined SIMD helper. Broadcasting an immediate is as cheap or cheaper.
FS_INLINE static float32v Round_f32(float32v a)
{
    const float32x4_t zero = vdupq_n_f32( 0 );
    const float32x4_t half = vdupq_n_f32( 0.5f );
    const float32x4_t one  = vdupq_n_f32( 1.0f );

    // Work on the magnitude so the truncation below always acts on a non-negative value.
    float32x4_t biased    = vaddq_f32( vabsq_f32( a ), half );
    float32x4_t truncated = IntFloor_f32( biased );

    // Lanes with a < 0 compare to an all-ones mask; read as a signed integer
    // that is -1, which converts to -1.0f (other lanes become 0.0f).
    uint32x4_t  isNegative   = vcltq_f32( a, zero );
    float32x4_t negOneOrZero = vcvtq_f32_s32( vreinterpretq_s32_u32( isNegative ) );

    // Map { -1, 0 } to a sign factor { -1, +1 }:  x * 2 + 1.
    float32x4_t sign = vaddq_f32( vmulq_n_f32( negOneOrZero, 2.0f ), one );

    return vmulq_f32( truncated, sign );
}

// ARMv7 square root: there is no direct sqrt call in this branch, so the
// value is obtained by inverting the inverse square root.
FS_INLINE static float32v Sqrt_f32( float32v a )
{
    float32v reciprocalRoot = InvSqrt_f32( a );
    return Reciprocal_f32( reciprocalRoot );
}

#else
// Rounds every lane down toward minus infinity using vrndmq_f32.
FS_INLINE static float32v Floor_f32( float32v a )
{
    float32v floored = vrndmq_f32( a );
    return floored;
}

// Rounds every lane up toward plus infinity using vrndpq_f32.
FS_INLINE static float32v Ceil_f32( float32v a )
{
    float32v ceiled = vrndpq_f32( a );
    return ceiled;
}

// Rounds every lane to the nearest whole number using vrndnq_f32
// (exact halves go to the even neighbour).
FS_INLINE static float32v Round_f32( float32v a )
{
    float32v rounded = vrndnq_f32( a );
    return rounded;
}

// Square root of every lane, delegated directly to vsqrtq_f32.
FS_INLINE static float32v Sqrt_f32( float32v a )
{
    float32v root = vsqrtq_f32( a );
    return root;
}
#endif

// Mask

// Bitwise-ANDs the integer vector a with the mask m and returns the result.
// Presumably each mask lane is all-ones or all-zeros, so lanes are either
// kept or cleared - verify against how mask32v values are produced.
FS_INLINE static int32v Mask_i32( int32v a, mask32v m )
{
    int32v kept = a & m;
    return kept;
}

FS_INLINE static int32v NMask_i32( int32v a, mask32v m )
{
return BitwiseAndNot_i32(a, m);
Expand All @@ -578,6 +566,7 @@ namespace FastSIMD
{
return BitwiseAnd_f32( a, vreinterpretq_f32_s32( m ) );
}

FS_INLINE static float32v NMask_f32( float32v a, mask32v m )
{
return BitwiseAndNot_f32( a, vreinterpretq_f32_s32( m ) );
Expand All @@ -587,21 +576,17 @@ namespace FastSIMD
{
return vgetq_lane_f32(a, 0);
}

// Returns lane 0 of the integer vector as a scalar int32_t (vgetq_lane_s32).
FS_INLINE static int32_t Extract0_i32( int32v a )
{
    const int32_t firstLane = vgetq_lane_s32( a, 0 );
    return firstLane;
}

FS_INLINE static float32v Reciprocal_f32( float32v a )
{

// float32x4_t reciprocal = vrecpeq_f32( a );
// reciprocal = vmulq_f32( vrecpsq_f32( a, reciprocal ), reciprocal );
// reciprocal = vmulq_f32( vrecpsq_f32( a, reciprocal ), reciprocal );
// return reciprocal;


{
return vrecpeq_f32( a );
}

FS_INLINE static float32v BitwiseShiftRightZX_f32( float32v a, int32_t b )
{
int32x4_t rhs2 = vdupq_n_s32( -b );
Expand Down

0 comments on commit 1fe20ed

Please sign in to comment.