Merge pull request #727 from argilo/remove-vcvtrq-asm

Remove inline assembler from volk_32fc_convert_16ic_neon
gnuradio · Jan 7, 2024 · 67cf98a
2 parents 863aff5 + 0b9dc5f
commit 67cf98a
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 12 deletions.
diff --git a/kernels/volk/volk_32fc_convert_16ic.h b/kernels/volk/volk_32fc_convert_16ic.h
@@ -150,12 +150,6 @@ static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector,
 #if LV_HAVE_NEONV7
 #include <arm_neon.h>
 
-#define VCVTRQ_S32_F32(result, value)                                       \
-    __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[0]) : "t"(value[0]) :); \
-    __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[1]) : "t"(value[1]) :); \
-    __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[2]) : "t"(value[2]) :); \
-    __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[3]) : "t"(value[3]) :);
-
 static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
                                                const lv_32fc_t* inputVector,
                                                unsigned int num_points)
@@ -173,7 +167,8 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
 
     const float32x4_t min_val = vmovq_n_f32(min_val_f);
     const float32x4_t max_val = vmovq_n_f32(max_val_f);
-    float32x4_t ret1, ret2, a, b;
+    float32x4_t half = vdupq_n_f32(0.5f);
+    float32x4_t ret1, ret2, a, b, sign, PlusHalf, Round;
 
     int32x4_t toint_a = { 0, 0, 0, 0 };
     int32x4_t toint_b = { 0, 0, 0, 0 };
@@ -190,9 +185,15 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
         ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
         ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
 
-        // vcvtr takes into account the current rounding mode (as does rintf)
-        VCVTRQ_S32_F32(toint_a, ret1);
-        VCVTRQ_S32_F32(toint_b, ret2);
+        sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
+        PlusHalf = vaddq_f32(ret1, half);
+        Round = vsubq_f32(PlusHalf, sign);
+        toint_a = vcvtq_s32_f32(Round);
+
+        sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret2), 31)));
+        PlusHalf = vaddq_f32(ret2, half);
+        Round = vsubq_f32(PlusHalf, sign);
+        toint_b = vcvtq_s32_f32(Round);
 
         intInputVal1 = vqmovn_s32(toint_a);
         intInputVal2 = vqmovn_s32(toint_b);
@@ -212,7 +213,6 @@ static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
     }
 }
 
-#undef VCVTRQ_S32_F32
 #endif /* LV_HAVE_NEONV7 */
 
 #if LV_HAVE_NEONV8

diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h
@@ -126,7 +126,7 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
     QA(VOLK_INIT_TEST(volk_32f_s32f_convert_32i, test_params.make_tol(1)))
     QA(VOLK_INIT_TEST(volk_32f_convert_64f, test_params))
     QA(VOLK_INIT_TEST(volk_32f_s32f_convert_8i, test_params.make_tol(1)))
-    QA(VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params))
+    QA(VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params.make_tol(1)))
     QA(VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params.make_tol(2e-6)))
     QA(VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params))
     QA(VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params))