diff --git a/.github/workflows/run-tests-rvv.yml b/.github/workflows/run-tests-rvv.yml
new file mode 100644
index 00000000..e97825f3
--- /dev/null
+++ b/.github/workflows/run-tests-rvv.yml
@@ -0,0 +1,55 @@
+#
+# Copyright 2020 - 2022 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+name: Run VOLK tests on different RVV configurations
+
+on: [push, pull_request]
+
+jobs:
+ Tests:
+ runs-on: ubuntu-24.04
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: "recursive"
+ - name: Install packages
+ run: |
+ sudo apt-get update -q -y
+ sudo apt-get install -y python3-mako cmake qemu-user-static g++-14-riscv64-linux-gnu clang-18
+ mkdir build
+ cd build
+ - name: Test gcc-14 VLEN=128
+ run: |
+ cd build; rm -rf *
+ CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=128 \
+ cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake ..
+ make -j$(nproc)
+ ARGS=-V make test
+ - name: Test gcc-14 VLEN=256
+ run: |
+ cd build; rm -rf *
+ CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=256 \
+ cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release
+ make -j$(nproc)
+ ARGS=-V make test
+ - name: Test clang-18 VLEN=512
+ run: |
+ cd build; rm -rf *
+ CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=512 \
+ cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake ..
+ make -j$(nproc)
+ ARGS=-V make test
+ - name: Test clang-18 VLEN=1024
+ run: |
+ cd build; rm -rf *
+ CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=1024 \
+ cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release
+ make -j$(nproc)
+ ARGS=-V make test
+
+
diff --git a/cmake/Checks/check-rvv-intrinsics.c b/cmake/Checks/check-rvv-intrinsics.c
new file mode 100644
index 00000000..48d874de
--- /dev/null
+++ b/cmake/Checks/check-rvv-intrinsics.c
@@ -0,0 +1,5 @@
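+/* Succeeds only when the compiler provides RVV intrinsics: it reports at least
+ * v1.0 of the intrinsics spec (__riscv_v_intrinsic encodes
+ * major * 1000000 + minor * 1000), or it is clang >= 18 / gcc >= 14. */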
+#if (__riscv_v_intrinsic >= 1000000 || __clang_major__ >= 18 || __GNUC__ >= 14)
+int main() { return 0; }
+#else
+#error "rvv intrinsics aren't supported"
+#endif
diff --git a/cmake/Toolchains/rv64gcv-linux-gnu.cmake b/cmake/Toolchains/rv64gcv-linux-gnu.cmake
new file mode 100644
index 00000000..f6edd741
--- /dev/null
+++ b/cmake/Toolchains/rv64gcv-linux-gnu.cmake
@@ -0,0 +1,34 @@
+#
+# Copyright 2024 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR riscv64)
+
+set(CMAKE_C_COMPILER $ENV{CC})
+set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER})
+set(CMAKE_CXX_COMPILER $ENV{CXX})
+
+set(CMAKE_C_FLAGS "$ENV{CFLAGS} -march=rv64gcv" CACHE STRING "" FORCE)
+set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" FORCE)
+set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -g" CACHE STRING "" FORCE)
+
+set(CMAKE_OBJCOPY
+ ${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}objcopy
+ CACHE INTERNAL "objcopy tool")
+set(CMAKE_SIZE_UTIL
+ ${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}size
+ CACHE INTERNAL "size tool")
+
+set(CMAKE_FIND_ROOT_PATH ${BINUTILS_PATH})
+
+set(QEMU_VLEN $ENV{VLEN})
+if(NOT QEMU_VLEN)
+ set(QEMU_VLEN "128")
+endif()
+
+set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-riscv64-static -L /usr/riscv64-linux-gnu/ -cpu rv64,zba=true,zbb=true,v=on,vlen=${QEMU_VLEN},rvv_ta_all_1s=on,rvv_ma_all_1s=on")
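+
+# Usage sketch (mirrors .github/workflows/run-tests-rvv.yml): the compilers and
+# the emulated VLEN are taken from the environment, e.g. from a build directory:
+#   CC=riscv64-linux-gnu-gcc-14 CXX=riscv64-linux-gnu-g++-14 VLEN=256 \
+#     cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake ..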
diff --git a/gen/archs.xml b/gen/archs.xml
index 164c7bb4..7f971369 100644
--- a/gen/archs.xml
+++ b/gen/archs.xml
@@ -181,4 +181,48 @@ at the top, as a last resort.
+
+ tmpl/ currently assumes that every arch.name starting with "rv" requires
+ RVV intrinsics
+-->
+
+ There is currently no mechanism in RISC-V to append extensions,
+ so each arch needs to specify all of them, and the order in the
+ machine definition needs to be from the fewest to the most extensions.
+ Fortunately, this maps quite well to the profiles concept.
+-->
+
+
+ -march=rv64gcv
+ -march=rv64gcv
+
+
+
+
+ -march=rv64gcv
+ -march=rv64gcv
+
+ It's unclear how performance-portable segmented loads/stores are, so the
+ default rvv implementations avoid using them.
+ This is a pseudo-arch for separate segmented load/store implementations;
+ it is never expected to be used standalone without "rvv".
+ -->
+
+
+
+ google/cpu_features currently doesn't support these extensions and profiles.
+-->
+
+
+
+
diff --git a/gen/machines.xml b/gen/machines.xml
index 887f9794..64e1bbd8 100644
--- a/gen/machines.xml
+++ b/gen/machines.xml
@@ -33,6 +33,18 @@
generic riscv64 orc|
+
+generic riscv64 rvv rvvseg orc|
+
+
+
+
+
+
generic 32|64| mmx| sse sse2 sse3 sse4_a popcount orc|
diff --git a/include/volk/volk_rvv_intrinsics.h b/include/volk/volk_rvv_intrinsics.h
new file mode 100644
index 00000000..85e21d43
--- /dev/null
+++ b/include/volk/volk_rvv_intrinsics.h
@@ -0,0 +1,77 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2024 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * This file is intended to hold RVV intrinsics of intrinsics.
+ * They should be used in VOLK kernels to avoid copy-paste.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_
+#include <riscv_vector.h>
+
+#define RISCV_SHRINK2(op, T, S, v) \
+ __riscv_##op(__riscv_vget_##T##S##m1(v, 0), \
+ __riscv_vget_##T##S##m1(v, 1), \
+ __riscv_vsetvlmax_e##S##m1())
+
+#define RISCV_SHRINK4(op, T, S, v) \
+ __riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \
+ __riscv_vget_##T##S##m1(v, 1), \
+ __riscv_vsetvlmax_e##S##m1()), \
+ __riscv_##op(__riscv_vget_##T##S##m1(v, 2), \
+ __riscv_vget_##T##S##m1(v, 3), \
+ __riscv_vsetvlmax_e##S##m1()), \
+ __riscv_vsetvlmax_e##S##m1())
+
+#define RISCV_SHRINK8(op, T, S, v) \
+ __riscv_##op(__riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \
+ __riscv_vget_##T##S##m1(v, 1), \
+ __riscv_vsetvlmax_e##S##m1()), \
+ __riscv_##op(__riscv_vget_##T##S##m1(v, 2), \
+ __riscv_vget_##T##S##m1(v, 3), \
+ __riscv_vsetvlmax_e##S##m1()), \
+ __riscv_vsetvlmax_e##S##m1()), \
+ __riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 4), \
+ __riscv_vget_##T##S##m1(v, 5), \
+ __riscv_vsetvlmax_e##S##m1()), \
+ __riscv_##op(__riscv_vget_##T##S##m1(v, 6), \
+ __riscv_vget_##T##S##m1(v, 7), \
+ __riscv_vsetvlmax_e##S##m1()), \
+ __riscv_vsetvlmax_e##S##m1()), \
+ __riscv_vsetvlmax_e##S##m1())
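+
+/*
+ * Typical usage, as in the kernels below: fold an LMUL=4 float accumulator
+ * (vsum, a vfloat32m4_t) into a single m1 register and finish with an
+ * ordinary reduction, with vl = __riscv_vsetvlmax_e32m1():
+ *
+ *   vfloat32m1_t v = RISCV_SHRINK4(vfadd, f, 32, vsum);
+ *   vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
+ *   float sum = __riscv_vfmv_f(__riscv_vfredusum(v, z, vl));
+ */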
+
+#define RISCV_PERM4(f, v, vidx) \
+ __riscv_vcreate_v_u8m1_u8m4( \
+ f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \
+ f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \
+ f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \
+ f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1()))
+
+#define RISCV_LUT4(f, vtbl, v) \
+ __riscv_vcreate_v_u8m1_u8m4( \
+ f(vtbl, __riscv_vget_u8m1(v, 0), __riscv_vsetvlmax_e8m1()), \
+ f(vtbl, __riscv_vget_u8m1(v, 1), __riscv_vsetvlmax_e8m1()), \
+ f(vtbl, __riscv_vget_u8m1(v, 2), __riscv_vsetvlmax_e8m1()), \
+ f(vtbl, __riscv_vget_u8m1(v, 3), __riscv_vsetvlmax_e8m1()))
+
+#define RISCV_PERM8(f, v, vidx) \
+ __riscv_vcreate_v_u8m1_u8m8( \
+ f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \
+ f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \
+ f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \
+ f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1()), \
+ f(__riscv_vget_u8m1(v, 4), vidx, __riscv_vsetvlmax_e8m1()), \
+ f(__riscv_vget_u8m1(v, 5), vidx, __riscv_vsetvlmax_e8m1()), \
+ f(__riscv_vget_u8m1(v, 6), vidx, __riscv_vsetvlmax_e8m1()), \
+ f(__riscv_vget_u8m1(v, 7), vidx, __riscv_vsetvlmax_e8m1()))
+
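+/* Mask of negative elements: compares the integer reinterpretation of v
+ * against zero, i.e. tests the sign bit. */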
+#define RISCV_VMFLTZ(T, v, vl) __riscv_vmslt(__riscv_vreinterpret_i##T(v), 0, vl)
+
+#endif /* INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ */
diff --git a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
index 8949785f..8d772ba8 100644
--- a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
+++ b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
@@ -668,5 +668,66 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_16i_32fc_dot_prod_32fc_rvv(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+ vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
+ vfloat32m4_t vsumi = vsumr;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
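+ // Load each complex tap as one 64-bit word, then split real/imag floats
+ // with narrowing shifts (taps are interleaved re,im pairs).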
+ vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)taps, vl);
+ vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
+ vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
+ vfloat32m4_t v =
+ __riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
+ vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
+ vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
+ }
+ size_t vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
+ vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
+ vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
+ *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
+ __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_16i_32fc_dot_prod_32fc_rvvseg(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+ vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
+ vfloat32m4_t vsumi = vsumr;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl);
+ vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
+ vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
+ vfloat32m4_t v =
+ __riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
+ vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
+ vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
+ }
+ size_t vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
+ vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
+ vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
+ *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
+ __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
+}
+#endif /*LV_HAVE_RVVSEG*/
#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
diff --git a/kernels/volk/volk_16i_branch_4_state_8.h b/kernels/volk/volk_16i_branch_4_state_8.h
index b0f4d3b6..775b1523 100644
--- a/kernels/volk/volk_16i_branch_4_state_8.h
+++ b/kernels/volk/volk_16i_branch_4_state_8.h
@@ -10,6 +10,10 @@
/*!
* \page volk_16i_branch_4_state_8
*
+ * \b Deprecation
+ *
+ * This kernel is deprecated.
+ *
* \b Overview
*
*
diff --git a/kernels/volk/volk_16i_convert_8i.h b/kernels/volk/volk_16i_convert_8i.h
index cb7168ef..648712af 100644
--- a/kernels/volk/volk_16i_convert_8i.h
+++ b/kernels/volk/volk_16i_convert_8i.h
@@ -275,5 +275,20 @@ static inline void volk_16i_convert_8i_neon(int8_t* outputVector,
}
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_16i_convert_8i_rvv(int8_t* outputVector,
+ const int16_t* inputVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
+ vl = __riscv_vsetvl_e16m8(n);
+ vint16m8_t v = __riscv_vle16_v_i16m8(inputVector, vl);
+ __riscv_vse8(outputVector, __riscv_vnsra(v, 8, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_16i_convert_8i_a_H */
diff --git a/kernels/volk/volk_16i_max_star_16i.h b/kernels/volk/volk_16i_max_star_16i.h
index fba73da1..ab0a4bcf 100644
--- a/kernels/volk/volk_16i_max_star_16i.h
+++ b/kernels/volk/volk_16i_max_star_16i.h
@@ -10,6 +10,10 @@
/*!
* \page volk_16i_max_star_16i
*
+ * \b Deprecation
+ *
+ * This kernel is deprecated.
+ *
* \b Overview
*
*
diff --git a/kernels/volk/volk_16i_max_star_horizontal_16i.h b/kernels/volk/volk_16i_max_star_horizontal_16i.h
index 2b0b65c3..ee08ba43 100644
--- a/kernels/volk/volk_16i_max_star_horizontal_16i.h
+++ b/kernels/volk/volk_16i_max_star_horizontal_16i.h
@@ -11,6 +11,10 @@
/*!
* \page volk_16i_max_star_horizontal_16i
*
+ * \b Deprecation
+ *
+ * This kernel is deprecated.
+ *
* \b Overview
*
*
diff --git a/kernels/volk/volk_16i_permute_and_scalar_add.h b/kernels/volk/volk_16i_permute_and_scalar_add.h
index 077c37b0..f57603db 100644
--- a/kernels/volk/volk_16i_permute_and_scalar_add.h
+++ b/kernels/volk/volk_16i_permute_and_scalar_add.h
@@ -10,6 +10,10 @@
/*!
* \page volk_16i_permute_and_scalar_add
*
+ * \b Deprecation
+ *
+ * This kernel is deprecated.
+ *
* \b Overview
*
*
diff --git a/kernels/volk/volk_16i_s32f_convert_32f.h b/kernels/volk/volk_16i_s32f_convert_32f.h
index 817ecd22..1f9660ce 100644
--- a/kernels/volk/volk_16i_s32f_convert_32f.h
+++ b/kernels/volk/volk_16i_s32f_convert_32f.h
@@ -483,4 +483,21 @@ static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector,
}
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_16i_s32f_convert_32f_rvv(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
+ vl = __riscv_vsetvl_e16m4(n);
+ vfloat32m8_t v = __riscv_vfwcvt_f(__riscv_vle16_v_i16m4(inputVector, vl), vl);
+ __riscv_vse32(outputVector, __riscv_vfmul(v, 1.0f / scalar, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
diff --git a/kernels/volk/volk_16i_x4_quad_max_star_16i.h b/kernels/volk/volk_16i_x4_quad_max_star_16i.h
index a8337cc3..94e264fe 100644
--- a/kernels/volk/volk_16i_x4_quad_max_star_16i.h
+++ b/kernels/volk/volk_16i_x4_quad_max_star_16i.h
@@ -10,6 +10,10 @@
/*!
* \page volk_16i_x4_quad_max_star_16i
*
+ * \b Deprecation
+ *
+ * This kernel is deprecated.
+ *
* \b Overview
*
*
diff --git a/kernels/volk/volk_16i_x5_add_quad_16i_x4.h b/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
index 53fa8de5..ba14c59d 100644
--- a/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
+++ b/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
@@ -10,6 +10,10 @@
/*!
* \page volk_16i_x5_add_quad_16i_x4
*
+ * \b Deprecation
+ *
+ * This kernel is deprecated.
+ *
* \b Overview
*
*
diff --git a/kernels/volk/volk_16ic_convert_32fc.h b/kernels/volk/volk_16ic_convert_32fc.h
index 7a779bf8..99fe7cb2 100644
--- a/kernels/volk/volk_16ic_convert_32fc.h
+++ b/kernels/volk/volk_16ic_convert_32fc.h
@@ -315,4 +315,23 @@ static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector,
}
#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_16ic_convert_32fc_rvv(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
+{
+ const int16_t* in = (const int16_t*)inputVector;
+ float* out = (float*)outputVector;
+ size_t n = num_points * 2;
+ for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
+ vl = __riscv_vsetvl_e16m4(n);
+ vint16m4_t v = __riscv_vle16_v_i16m4(in, vl);
+ __riscv_vse32(out, __riscv_vfwcvt_f(v, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
diff --git a/kernels/volk/volk_16ic_deinterleave_16i_x2.h b/kernels/volk/volk_16ic_deinterleave_16i_x2.h
index 37fb41e1..9f4ad7f7 100644
--- a/kernels/volk/volk_16ic_deinterleave_16i_x2.h
+++ b/kernels/volk/volk_16ic_deinterleave_16i_x2.h
@@ -375,4 +375,45 @@ static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_16ic_deinterleave_16i_x2_rvv(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
+ vl = __riscv_vsetvl_e16m4(n);
+ vuint32m8_t vc = __riscv_vle32_v_u32m8((const uint32_t*)complexVector, vl);
+ vuint16m4_t vr = __riscv_vnsrl(vc, 0, vl);
+ vuint16m4_t vi = __riscv_vnsrl(vc, 16, vl);
+ __riscv_vse16((uint16_t*)iBuffer, vr, vl);
+ __riscv_vse16((uint16_t*)qBuffer, vi, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_16ic_deinterleave_16i_x2_rvvseg(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
+ vl = __riscv_vsetvl_e16m4(n);
+ vuint16m4x2_t vc =
+ __riscv_vlseg2e16_v_u16m4x2((const uint16_t*)complexVector, vl);
+ vuint16m4_t vr = __riscv_vget_u16m4(vc, 0);
+ vuint16m4_t vi = __riscv_vget_u16m4(vc, 1);
+ __riscv_vse16((uint16_t*)iBuffer, vr, vl);
+ __riscv_vse16((uint16_t*)qBuffer, vi, vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_u_H */
diff --git a/kernels/volk/volk_16ic_deinterleave_real_16i.h b/kernels/volk/volk_16ic_deinterleave_real_16i.h
index 92110a3a..f5a9696f 100644
--- a/kernels/volk/volk_16ic_deinterleave_real_16i.h
+++ b/kernels/volk/volk_16ic_deinterleave_real_16i.h
@@ -377,4 +377,21 @@ static inline void volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_16ic_deinterleave_real_16i_rvv(int16_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ const uint32_t* in = (const uint32_t*)complexVector;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vuint32m8_t vc = __riscv_vle32_v_u32m8(in, vl);
+ __riscv_vse16((uint16_t*)iBuffer, __riscv_vnsrl(vc, 0, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_u_H */
diff --git a/kernels/volk/volk_16ic_deinterleave_real_8i.h b/kernels/volk/volk_16ic_deinterleave_real_8i.h
index 231be417..257ea519 100644
--- a/kernels/volk/volk_16ic_deinterleave_real_8i.h
+++ b/kernels/volk/volk_16ic_deinterleave_real_8i.h
@@ -415,4 +415,24 @@ static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
}
}
#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_16ic_deinterleave_real_8i_rvv(int8_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ const uint32_t* in = (const uint32_t*)complexVector;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vuint32m8_t vc = __riscv_vle32_v_u32m8(in, vl);
+ __riscv_vse8(
+ (uint8_t*)iBuffer, __riscv_vnsrl(__riscv_vnsrl(vc, 0, vl), 8, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */
diff --git a/kernels/volk/volk_16ic_magnitude_16i.h b/kernels/volk/volk_16ic_magnitude_16i.h
index 76472540..79553d65 100644
--- a/kernels/volk/volk_16ic_magnitude_16i.h
+++ b/kernels/volk/volk_16ic_magnitude_16i.h
@@ -411,4 +411,50 @@ static inline void volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector,
}
#endif /* LV_HAVE_NEONV7 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_16ic_magnitude_16i_rvv(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ const float scale = SHRT_MAX, iscale = 1.0f / scale;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
+ vl = __riscv_vsetvl_e16m4(n);
+ vint32m8_t vc = __riscv_vle32_v_i32m8((const int32_t*)complexVector, vl);
+ vint16m4_t vr = __riscv_vnsra(vc, 0, vl);
+ vint16m4_t vi = __riscv_vnsra(vc, 16, vl);
+ vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), iscale, vl);
+ vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), iscale, vl);
+ vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl);
+ vf = __riscv_vfmul(__riscv_vfsqrt(vf, vl), scale, vl);
+ __riscv_vse16(magnitudeVector, __riscv_vfncvt_x(vf, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_16ic_magnitude_16i_rvvseg(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ const float scale = SHRT_MAX, iscale = 1.0f / scale;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
+ vl = __riscv_vsetvl_e16m4(n);
+ vint16m4x2_t vc = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)complexVector, vl);
+ vint16m4_t vr = __riscv_vget_i16m4(vc, 0);
+ vint16m4_t vi = __riscv_vget_i16m4(vc, 1);
+ vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), iscale, vl);
+ vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), iscale, vl);
+ vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl);
+ vf = __riscv_vfmul(__riscv_vfsqrt(vf, vl), scale, vl);
+ __riscv_vse16(magnitudeVector, __riscv_vfncvt_x(vf, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /* INCLUDED_volk_16ic_magnitude_16i_u_H */
diff --git a/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h b/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h
index 219e977c..7f9b8ad6 100644
--- a/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h
+++ b/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h
@@ -327,4 +327,51 @@ volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_16ic_s32f_deinterleave_32f_x2_rvv(float* iBuffer,
+ float* qBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
+ vl = __riscv_vsetvl_e16m4(n);
+ vint32m8_t vc = __riscv_vle32_v_i32m8((const int32_t*)complexVector, vl);
+ vint16m4_t vr = __riscv_vnsra(vc, 0, vl);
+ vint16m4_t vi = __riscv_vnsra(vc, 16, vl);
+ vfloat32m8_t vrf = __riscv_vfwcvt_f(vr, vl);
+ vfloat32m8_t vif = __riscv_vfwcvt_f(vi, vl);
+ __riscv_vse32(iBuffer, __riscv_vfmul(vrf, 1.0f / scalar, vl), vl);
+ __riscv_vse32(qBuffer, __riscv_vfmul(vif, 1.0f / scalar, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void
+volk_16ic_s32f_deinterleave_32f_x2_rvvseg(float* iBuffer,
+ float* qBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
+ vl = __riscv_vsetvl_e16m4(n);
+ vint16m4x2_t vc = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)complexVector, vl);
+ vint16m4_t vr = __riscv_vget_i16m4(vc, 0);
+ vint16m4_t vi = __riscv_vget_i16m4(vc, 1);
+ vfloat32m8_t vrf = __riscv_vfwcvt_f(vr, vl);
+ vfloat32m8_t vif = __riscv_vfwcvt_f(vi, vl);
+ __riscv_vse32(iBuffer, __riscv_vfmul(vrf, 1.0f / scalar, vl), vl);
+ __riscv_vse32(qBuffer, __riscv_vfmul(vif, 1.0f / scalar, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H */
diff --git a/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h b/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h
index 55688329..e8a0d1a0 100644
--- a/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h
+++ b/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h
@@ -334,4 +334,24 @@ volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_16ic_s32f_deinterleave_real_32f_rvv(float* iBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ const int32_t* in = (const int32_t*)complexVector;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vint32m8_t vc = __riscv_vle32_v_i32m8(in, vl);
+ vfloat32m8_t vr = __riscv_vfwcvt_f(__riscv_vncvt_x(vc, vl), vl);
+ __riscv_vse32(iBuffer, __riscv_vfmul(vr, 1.0f / scalar, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H */
diff --git a/kernels/volk/volk_16ic_s32f_magnitude_32f.h b/kernels/volk/volk_16ic_s32f_magnitude_32f.h
index 89600632..8b193ee2 100644
--- a/kernels/volk/volk_16ic_s32f_magnitude_32f.h
+++ b/kernels/volk/volk_16ic_s32f_magnitude_32f.h
@@ -329,4 +329,48 @@ static inline void volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_16ic_s32f_magnitude_32f_rvv(float* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
+ vl = __riscv_vsetvl_e16m4(n);
+ vint32m8_t vc = __riscv_vle32_v_i32m8((const int32_t*)complexVector, vl);
+ vint16m4_t vr = __riscv_vnsra(vc, 0, vl);
+ vint16m4_t vi = __riscv_vnsra(vc, 16, vl);
+ vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0f / scalar, vl);
+ vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0f / scalar, vl);
+ vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl);
+ __riscv_vse32(magnitudeVector, __riscv_vfsqrt(vf, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_16ic_s32f_magnitude_32f_rvvseg(float* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
+ vl = __riscv_vsetvl_e16m4(n);
+ vint16m4x2_t vc = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)complexVector, vl);
+ vint16m4_t vr = __riscv_vget_i16m4(vc, 0);
+ vint16m4_t vi = __riscv_vget_i16m4(vc, 1);
+ vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0f / scalar, vl);
+ vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0f / scalar, vl);
+ vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl);
+ __riscv_vse32(magnitudeVector, __riscv_vfsqrt(vf, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_u_H */
diff --git a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h
index 48e33abf..a12350a0 100644
--- a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h
+++ b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h
@@ -690,4 +690,68 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out,
#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_RVV
+#include "volk_32fc_x2_dot_prod_32fc.h"
+
+static inline void volk_16ic_x2_dot_prod_16ic_rvv(lv_16sc_t* result,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ vint16m4_t vsumr = __riscv_vmv_v_x_i16m4(0, __riscv_vsetvlmax_e16m4());
+ vint16m4_t vsumi = vsumr;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in_a += vl, in_b += vl) {
+ vl = __riscv_vsetvl_e16m4(n);
+ vint32m8_t va = __riscv_vle32_v_i32m8((const int32_t*)in_a, vl);
+ vint32m8_t vb = __riscv_vle32_v_i32m8((const int32_t*)in_b, vl);
+ vint16m4_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 16, vl);
+ vint16m4_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 16, vl);
+ vint16m4_t vr = __riscv_vnmsac(__riscv_vmul(var, vbr, vl), vai, vbi, vl);
+ vint16m4_t vi = __riscv_vmacc(__riscv_vmul(var, vbi, vl), vai, vbr, vl);
+ vsumr = __riscv_vadd_tu(vsumr, vsumr, vr, vl);
+ vsumi = __riscv_vadd_tu(vsumi, vsumi, vi, vl);
+ }
+ size_t vl = __riscv_vsetvlmax_e16m1();
+ vint16m1_t vr = RISCV_SHRINK4(vadd, i, 16, vsumr);
+ vint16m1_t vi = RISCV_SHRINK4(vadd, i, 16, vsumi);
+ vint16m1_t z = __riscv_vmv_s_x_i16m1(0, vl);
+ *result = lv_cmake(__riscv_vmv_x(__riscv_vredsum(vr, z, vl)),
+ __riscv_vmv_x(__riscv_vredsum(vi, z, vl)));
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include "volk_32fc_x2_dot_prod_32fc.h"
+
+
+static inline void volk_16ic_x2_dot_prod_16ic_rvvseg(lv_16sc_t* result,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ vint16m4_t vsumr = __riscv_vmv_v_x_i16m4(0, __riscv_vsetvlmax_e16m4());
+ vint16m4_t vsumi = vsumr;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in_a += vl, in_b += vl) {
+ vl = __riscv_vsetvl_e16m4(n);
+ vint16m4x2_t va = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)in_a, vl);
+ vint16m4x2_t vb = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)in_b, vl);
+ vint16m4_t var = __riscv_vget_i16m4(va, 0), vai = __riscv_vget_i16m4(va, 1);
+ vint16m4_t vbr = __riscv_vget_i16m4(vb, 0), vbi = __riscv_vget_i16m4(vb, 1);
+ vint16m4_t vr = __riscv_vnmsac(__riscv_vmul(var, vbr, vl), vai, vbi, vl);
+ vint16m4_t vi = __riscv_vmacc(__riscv_vmul(var, vbi, vl), vai, vbr, vl);
+ vsumr = __riscv_vadd_tu(vsumr, vsumr, vr, vl);
+ vsumi = __riscv_vadd_tu(vsumi, vsumi, vi, vl);
+ }
+ size_t vl = __riscv_vsetvlmax_e16m1();
+ vint16m1_t vr = RISCV_SHRINK4(vadd, i, 16, vsumr);
+ vint16m1_t vi = RISCV_SHRINK4(vadd, i, 16, vsumi);
+ vint16m1_t z = __riscv_vmv_s_x_i16m1(0, vl);
+ *result = lv_cmake(__riscv_vmv_x(__riscv_vredsum(vr, z, vl)),
+ __riscv_vmv_x(__riscv_vredsum(vi, z, vl)));
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /*INCLUDED_volk_16ic_x2_dot_prod_16ic_H*/
diff --git a/kernels/volk/volk_16ic_x2_multiply_16ic.h b/kernels/volk/volk_16ic_x2_multiply_16ic.h
index 03ee145c..37f0fb66 100644
--- a/kernels/volk/volk_16ic_x2_multiply_16ic.h
+++ b/kernels/volk/volk_16ic_x2_multiply_16ic.h
@@ -462,4 +462,52 @@ static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out,
}
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_16ic_x2_multiply_16ic_rvv(lv_16sc_t* result,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in_a += vl, in_b += vl, result += vl) {
+ vl = __riscv_vsetvl_e16m4(n);
+ vint32m8_t va = __riscv_vle32_v_i32m8((const int32_t*)in_a, vl);
+ vint32m8_t vb = __riscv_vle32_v_i32m8((const int32_t*)in_b, vl);
+ vint16m4_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 16, vl);
+ vint16m4_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 16, vl);
+ vint16m4_t vr = __riscv_vnmsac(__riscv_vmul(var, vbr, vl), vai, vbi, vl);
+ vint16m4_t vi = __riscv_vmacc(__riscv_vmul(var, vbi, vl), vai, vbr, vl);
+ vuint16m4_t vru = __riscv_vreinterpret_u16m4(vr);
+ vuint16m4_t viu = __riscv_vreinterpret_u16m4(vi);
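+ // Re-interleave: re + im + 0xFFFF*im == re | (im << 16), packing the real
+ // part into the low and the imaginary part into the high half of each
+ // 32-bit element.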
+ vuint32m8_t v = __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFF, viu, vl);
+ __riscv_vse32((uint32_t*)result, v, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_16ic_x2_multiply_16ic_rvvseg(lv_16sc_t* result,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in_a += vl, in_b += vl, result += vl) {
+ vl = __riscv_vsetvl_e16m4(n);
+ vint16m4x2_t va = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)in_a, vl);
+ vint16m4x2_t vb = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)in_b, vl);
+ vint16m4_t var = __riscv_vget_i16m4(va, 0), vai = __riscv_vget_i16m4(va, 1);
+ vint16m4_t vbr = __riscv_vget_i16m4(vb, 0), vbi = __riscv_vget_i16m4(vb, 1);
+ vint16m4_t vr = __riscv_vnmsac(__riscv_vmul(var, vbr, vl), vai, vbi, vl);
+ vint16m4_t vi = __riscv_vmacc(__riscv_vmul(var, vbi, vl), vai, vbr, vl);
+ __riscv_vsseg2e16_v_i16m4x2(
+ (int16_t*)result, __riscv_vcreate_v_i16m4x2(vr, vi), vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /*INCLUDED_volk_16ic_x2_multiply_16ic_H*/
diff --git a/kernels/volk/volk_16u_byteswap.h b/kernels/volk/volk_16u_byteswap.h
index 8b1b8c03..50e59906 100644
--- a/kernels/volk/volk_16u_byteswap.h
+++ b/kernels/volk/volk_16u_byteswap.h
@@ -280,5 +280,54 @@ static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int nu
}
#endif /* LV_HAVE_ORC */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_16u_byteswap_rvv(uint16_t* intsToSwap, unsigned int num_points)
+{
+ size_t n = num_points;
+ size_t vlmax = __riscv_vsetvlmax_e8m1();
+ if (vlmax <= 256) {
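+ // Build the byte-swap permutation [1,0,3,2,...] for vrgather: vid yields
+ // 0,1,2,..., and subtracting 0xFF from each 16-bit pair swaps its two
+ // bytes. 8-bit indices can only address 256 elements, hence the
+ // vrgatherei16 variant in the else branch for larger vector lengths.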
+ vuint8m1_t vidx = __riscv_vreinterpret_u8m1(
+ __riscv_vsub(__riscv_vreinterpret_u16m1(__riscv_vid_v_u8m1(vlmax)),
+ 0x100 - 0x1,
+ vlmax / 2));
+ for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
+ vl = __riscv_vsetvl_e16m8(n);
+ vuint8m8_t v =
+ __riscv_vreinterpret_u8m8(__riscv_vle16_v_u16m8(intsToSwap, vl));
+ v = RISCV_PERM8(__riscv_vrgather, v, vidx);
+ __riscv_vse16(intsToSwap, __riscv_vreinterpret_u16m8(v), vl);
+ }
+ } else {
+ vuint16m2_t vidx = __riscv_vreinterpret_u16m2(
+ __riscv_vsub(__riscv_vreinterpret_u32m2(__riscv_vid_v_u16m2(vlmax)),
+ 0x10000 - 0x1,
+ vlmax / 2));
+ for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
+ vl = __riscv_vsetvl_e16m8(n);
+ vuint8m8_t v =
+ __riscv_vreinterpret_u8m8(__riscv_vle16_v_u16m8(intsToSwap, vl));
+ v = RISCV_PERM8(__riscv_vrgatherei16, v, vidx);
+ __riscv_vse16(intsToSwap, __riscv_vreinterpret_u16m8(v), vl);
+ }
+ }
+}
+#endif /* LV_HAVE_RVV */
+
+#ifdef LV_HAVE_RVA23
+#include <riscv_vector.h>
+
+static inline void volk_16u_byteswap_rva23(uint16_t* intsToSwap, unsigned int num_points)
+{
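+ // vrev8 (byte-reverse within each element) comes from the Zvbb extension,
+ // which the RVA23 profile mandates.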
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
+ vl = __riscv_vsetvl_e16m8(n);
+ vuint16m8_t v = __riscv_vle16_v_u16m8(intsToSwap, vl);
+ __riscv_vse16(intsToSwap, __riscv_vrev8(v, vl), vl);
+ }
+}
+#endif /* LV_HAVE_RVA23 */
#endif /* INCLUDED_volk_16u_byteswap_a_H */
diff --git a/kernels/volk/volk_16u_byteswappuppet_16u.h b/kernels/volk/volk_16u_byteswappuppet_16u.h
index 16e75d91..f01129eb 100644
--- a/kernels/volk/volk_16u_byteswappuppet_16u.h
+++ b/kernels/volk/volk_16u_byteswappuppet_16u.h
@@ -102,4 +102,26 @@ static inline void volk_16u_byteswappuppet_16u_u_orc(uint16_t* output,
}
#endif /* LV_HAVE_ORC */
+#ifdef LV_HAVE_RVV
+static inline void volk_16u_byteswappuppet_16u_rvv(uint16_t* output,
+ uint16_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_16u_byteswap_rvv((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+}
+#endif
+
+#ifdef LV_HAVE_RVA23
+static inline void volk_16u_byteswappuppet_16u_rva23(uint16_t* output,
+ uint16_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_16u_byteswap_rva23((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+}
+#endif
+
#endif
diff --git a/kernels/volk/volk_32f_64f_add_64f.h b/kernels/volk/volk_32f_64f_add_64f.h
index 06b56819..54d890e3 100644
--- a/kernels/volk/volk_32f_64f_add_64f.h
+++ b/kernels/volk/volk_32f_64f_add_64f.h
@@ -230,4 +230,22 @@ static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_64f_add_64f_rvv(double* cVector,
+ const float* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e64m8(n);
+ vfloat64m8_t va = __riscv_vfwcvt_f(__riscv_vle32_v_f32m4(aVector, vl), vl);
+ vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
+ __riscv_vse64(cVector, __riscv_vfadd(va, vb, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_64f_add_64f_u_H */
diff --git a/kernels/volk/volk_32f_64f_multiply_64f.h b/kernels/volk/volk_32f_64f_multiply_64f.h
index 069cd73e..5ff81578 100644
--- a/kernels/volk/volk_32f_64f_multiply_64f.h
+++ b/kernels/volk/volk_32f_64f_multiply_64f.h
@@ -188,5 +188,22 @@ static inline void volk_32f_64f_multiply_64f_a_avx(double* cVector,
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_64f_multiply_64f_rvv(double* cVector,
+ const float* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e64m8(n);
+ vfloat64m8_t va = __riscv_vfwcvt_f(__riscv_vle32_v_f32m4(aVector, vl), vl);
+ vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
+ __riscv_vse64(cVector, __riscv_vfmul(va, vb, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32f_64f_multiply_64f_u_H */
diff --git a/kernels/volk/volk_32f_8u_polarbutterfly_32f.h b/kernels/volk/volk_32f_8u_polarbutterfly_32f.h
index b3683a96..41e98a80 100644
--- a/kernels/volk/volk_32f_8u_polarbutterfly_32f.h
+++ b/kernels/volk/volk_32f_8u_polarbutterfly_32f.h
@@ -383,4 +383,174 @@ static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs,
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_8u_polarbutterfly_32f_rvv(float* llrs,
+ unsigned char* u,
+ const int frame_exp,
+ const int stage,
+ const int u_num,
+ const int row)
+{
+ const int frame_size = 0x01 << frame_exp;
+ if (row % 2) { // for odd rows just do the only necessary calculation and return.
+ const float* next_llrs = llrs + frame_size + row;
+ *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
+ return;
+ }
+
+ const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
+ if (max_stage_depth < 3) { // vectorized version needs larger vectors.
+ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
+ return;
+ }
+
+ int loop_stage = max_stage_depth;
+ int stage_size = 0x01 << loop_stage;
+
+ float* src_llr_ptr;
+ float* dst_llr_ptr;
+
+ if (row) { // not necessary for ZERO row. == first bit to be decoded.
+ // first do bit combination for all stages
+ // effectively encode some decoded bits again.
+ unsigned char* u_target = u + frame_size;
+ unsigned char* u_temp = u + 2 * frame_size;
+ memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
+
+ volk_8u_x2_encodeframepolar_8u_rvv(u_target, u_temp, stage_size);
+
+ src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
+ dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
+
+ size_t n = stage_size;
+ for (size_t vl; n > 0;
+ n -= vl, u_target += vl, src_llr_ptr += vl * 2, dst_llr_ptr += vl) {
+ vl = __riscv_vsetvl_e32m1(n);
+ vint8mf4_t v = __riscv_vle8_v_i8mf4((int8_t*)u_target, vl);
+ vuint64m2_t llr = __riscv_vle64_v_u64m2((const uint64_t*)src_llr_ptr, vl);
+ vfloat32m1_t llr0 = __riscv_vreinterpret_f32m1(__riscv_vnsrl(llr, 0, vl));
+ vfloat32m1_t llr1 = __riscv_vreinterpret_f32m1(__riscv_vnsrl(llr, 32, vl));
+ llr0 = __riscv_vfneg_mu(__riscv_vmslt(v, 0, vl), llr0, llr0, vl);
+ llr0 = __riscv_vfadd(llr0, llr1, vl);
+ __riscv_vse32(dst_llr_ptr, llr0, vl);
+ }
+
+ --loop_stage;
+ stage_size >>= 1;
+ }
+
+ const int min_stage = stage > 2 ? stage : 2;
+
+ while (min_stage < loop_stage) {
+ dst_llr_ptr = llrs + loop_stage * frame_size + row;
+ src_llr_ptr = dst_llr_ptr + frame_size;
+
+ size_t n = stage_size;
+ for (size_t vl; n > 0; n -= vl, src_llr_ptr += vl * 2, dst_llr_ptr += vl) {
+ vl = __riscv_vsetvl_e32m1(n);
+ vuint64m2_t llr = __riscv_vle64_v_u64m2((const uint64_t*)src_llr_ptr, vl);
+ vfloat32m1_t llr0 = __riscv_vreinterpret_f32m1(__riscv_vnsrl(llr, 0, vl));
+ vfloat32m1_t llr1 = __riscv_vreinterpret_f32m1(__riscv_vnsrl(llr, 32, vl));
+ vfloat32m1_t v =
+ __riscv_vfmin(__riscv_vfabs(llr0, vl), __riscv_vfabs(llr1, vl), vl);
+ v = __riscv_vfsgnjx(__riscv_vfsgnj(v, llr0, vl), llr1, vl);
+ __riscv_vse32(dst_llr_ptr, v, vl);
+ }
+
+ --loop_stage;
+ stage_size >>= 1;
+ }
+
+ // for stages < 3 vectors are too small!.
+ llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
+}
+#endif /* LV_HAVE_RVV */
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_32f_8u_polarbutterfly_32f_rvvseg(float* llrs,
+ unsigned char* u,
+ const int frame_exp,
+ const int stage,
+ const int u_num,
+ const int row)
+{
+ const int frame_size = 0x01 << frame_exp;
+ if (row % 2) { // for odd rows just do the only necessary calculation and return.
+ const float* next_llrs = llrs + frame_size + row;
+ *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
+ return;
+ }
+
+ const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
+ if (max_stage_depth < 3) { // vectorized version needs larger vectors.
+ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
+ return;
+ }
+
+ int loop_stage = max_stage_depth;
+ int stage_size = 0x01 << loop_stage;
+
+ float* src_llr_ptr;
+ float* dst_llr_ptr;
+
+ if (row) { // not necessary for ZERO row. == first bit to be decoded.
+ // first do bit combination for all stages
+ // effectively encode some decoded bits again.
+ unsigned char* u_target = u + frame_size;
+ unsigned char* u_temp = u + 2 * frame_size;
+ memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
+
+ volk_8u_x2_encodeframepolar_8u_rvv(u_target, u_temp, stage_size);
+
+ src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
+ dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
+
+ size_t n = stage_size;
+ for (size_t vl; n > 0;
+ n -= vl, u_target += vl, src_llr_ptr += vl * 2, dst_llr_ptr += vl) {
+ vl = __riscv_vsetvl_e32m1(n);
+ vint8mf4_t v = __riscv_vle8_v_i8mf4((int8_t*)u_target, vl);
+ vfloat32m1x2_t llr = __riscv_vlseg2e32_v_f32m1x2(src_llr_ptr, vl);
+ vfloat32m1_t llr0 = __riscv_vget_f32m1(llr, 0);
+ vfloat32m1_t llr1 = __riscv_vget_f32m1(llr, 1);
+ llr0 = __riscv_vfneg_mu(__riscv_vmslt(v, 0, vl), llr0, llr0, vl);
+ llr0 = __riscv_vfadd(llr0, llr1, vl);
+ __riscv_vse32(dst_llr_ptr, llr0, vl);
+ }
+
+ --loop_stage;
+ stage_size >>= 1;
+ }
+
+ const int min_stage = stage > 2 ? stage : 2;
+
+ while (min_stage < loop_stage) {
+ dst_llr_ptr = llrs + loop_stage * frame_size + row;
+ src_llr_ptr = dst_llr_ptr + frame_size;
+
+ size_t n = stage_size;
+ for (size_t vl; n > 0; n -= vl, src_llr_ptr += vl * 2, dst_llr_ptr += vl) {
+ vl = __riscv_vsetvl_e32m1(n);
+ vfloat32m1x2_t llr = __riscv_vlseg2e32_v_f32m1x2(src_llr_ptr, vl);
+ vfloat32m1_t llr0 = __riscv_vget_f32m1(llr, 0);
+ vfloat32m1_t llr1 = __riscv_vget_f32m1(llr, 1);
+ vfloat32m1_t v =
+ __riscv_vfmin(__riscv_vfabs(llr0, vl), __riscv_vfabs(llr1, vl), vl);
+ v = __riscv_vfsgnjx(__riscv_vfsgnj(v, llr0, vl), llr1, vl);
+ __riscv_vse32(dst_llr_ptr, v, vl);
+ }
+
+ --loop_stage;
+ stage_size >>= 1;
+ }
+
+ // for stages < 3 vectors are too small!.
+ llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
+}
+#endif /* LV_HAVE_RVVSEG */
+
#endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ */
diff --git a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h
index c97da33d..6ebcd22e 100644
--- a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h
+++ b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h
@@ -162,5 +162,62 @@ static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx2(float* llrs,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+static inline void volk_32f_8u_polarbutterflypuppet_32f_rvv(float* llrs,
+ const float* input,
+ unsigned char* u,
+ const int elements)
+{
+ (void)input; // suppress unused parameter warning
+
+ if (elements < 2) {
+ return;
+ }
+
+ unsigned int frame_size = maximum_frame_size(elements);
+ unsigned int frame_exp = log2_of_power_of_2(frame_size);
+
+ sanitize_bytes(u, elements);
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
+ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
+
+ unsigned int u_num = 0;
+ for (; u_num < frame_size; u_num++) {
+ volk_32f_8u_polarbutterfly_32f_rvv(llrs, u, frame_exp, 0, u_num, u_num);
+ u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+ }
+
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
+}
+#endif /* LV_HAVE_RVV */
+
+#ifdef LV_HAVE_RVVSEG
+static inline void volk_32f_8u_polarbutterflypuppet_32f_rvvseg(float* llrs,
+ const float* input,
+ unsigned char* u,
+ const int elements)
+{
+ (void)input; // suppress unused parameter warning
+
+ if (elements < 2) {
+ return;
+ }
+
+ unsigned int frame_size = maximum_frame_size(elements);
+ unsigned int frame_exp = log2_of_power_of_2(frame_size);
+
+ sanitize_bytes(u, elements);
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
+ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
+
+ unsigned int u_num = 0;
+ for (; u_num < frame_size; u_num++) {
+ volk_32f_8u_polarbutterfly_32f_rvvseg(llrs, u, frame_exp, 0, u_num, u_num);
+ u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+ }
+
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
+}
+#endif /* LV_HAVE_RVVSEG */
#endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLYPUPPET_32F_H_ */
diff --git a/kernels/volk/volk_32f_accumulator_s32f.h b/kernels/volk/volk_32f_accumulator_s32f.h
index 1cd8568e..7e9a81f7 100644
--- a/kernels/volk/volk_32f_accumulator_s32f.h
+++ b/kernels/volk/volk_32f_accumulator_s32f.h
@@ -232,4 +232,26 @@ static inline void volk_32f_accumulator_s32f_generic(float* result,
}
#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32f_accumulator_s32f_rvv(float* result,
+ const float* inputBuffer,
+ unsigned int num_points)
+{
+ vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, inputBuffer += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(inputBuffer, vl);
+ vsum = __riscv_vfadd_tu(vsum, vsum, v, vl);
+ }
+ size_t vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t v = RISCV_SHRINK8(vfadd, f, 32, vsum);
+ vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
+ *result = __riscv_vfmv_f(__riscv_vfredusum(v, z, vl));
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */
diff --git a/kernels/volk/volk_32f_acos_32f.h b/kernels/volk/volk_32f_acos_32f.h
index 5cf0d693..4331987c 100644
--- a/kernels/volk/volk_32f_acos_32f.h
+++ b/kernels/volk/volk_32f_acos_32f.h
@@ -102,13 +102,15 @@ static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
x = _mm256_add_ps(
z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
- for (i = 0; i < 2; i++)
+ for (i = 0; i < 2; i++) {
x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+ }
x = _mm256_div_ps(fones, x);
y = fzeroes;
- for (j = ACOS_TERMS - 1; j >= 0; j--)
+ for (j = ACOS_TERMS - 1; j >= 0; j--) {
y = _mm256_fmadd_ps(
y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
@@ -171,14 +173,16 @@ volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p
x = _mm256_add_ps(
z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
- for (i = 0; i < 2; i++)
+ for (i = 0; i < 2; i++) {
x = _mm256_add_ps(x,
_mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+ }
x = _mm256_div_ps(fones, x);
y = fzeroes;
- for (j = ACOS_TERMS - 1; j >= 0; j--)
+ for (j = ACOS_TERMS - 1; j >= 0; j--) {
y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
_mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
@@ -240,13 +244,15 @@ volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu
condition = _mm_cmplt_ps(z, fones);
x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
- for (i = 0; i < 2; i++)
+ for (i = 0; i < 2; i++) {
x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ }
x = _mm_div_ps(fones, x);
y = fzeroes;
- for (j = ACOS_TERMS - 1; j >= 0; j--)
+ for (j = ACOS_TERMS - 1; j >= 0; j--) {
y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
_mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
condition = _mm_cmpgt_ps(z, fones);
@@ -315,13 +321,15 @@ static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
x = _mm256_add_ps(
z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
- for (i = 0; i < 2; i++)
+ for (i = 0; i < 2; i++) {
x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+ }
x = _mm256_div_ps(fones, x);
y = fzeroes;
- for (j = ACOS_TERMS - 1; j >= 0; j--)
+ for (j = ACOS_TERMS - 1; j >= 0; j--) {
y = _mm256_fmadd_ps(
y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
@@ -384,14 +392,16 @@ volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p
x = _mm256_add_ps(
z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
- for (i = 0; i < 2; i++)
+ for (i = 0; i < 2; i++) {
x = _mm256_add_ps(x,
_mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+ }
x = _mm256_div_ps(fones, x);
y = fzeroes;
- for (j = ACOS_TERMS - 1; j >= 0; j--)
+ for (j = ACOS_TERMS - 1; j >= 0; j--) {
y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
_mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
@@ -453,14 +463,16 @@ volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu
condition = _mm_cmplt_ps(z, fones);
x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
- for (i = 0; i < 2; i++)
+ for (i = 0; i < 2; i++) {
x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ }
x = _mm_div_ps(fones, x);
y = fzeroes;
- for (j = ACOS_TERMS - 1; j >= 0; j--)
+ for (j = ACOS_TERMS - 1; j >= 0; j--) {
y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
_mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
condition = _mm_cmpgt_ps(z, fones);
@@ -501,4 +513,72 @@ volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num
}
#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void
+volk_32f_acos_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
+{
+ size_t vlmax = __riscv_vsetvlmax_e32m2();
+
+ const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax);
+ const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
+ const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
+ const vfloat32m2_t cf2 = __riscv_vfmv_v_f_f32m2(2.0f, vlmax);
+ const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
+
+#if ACOS_TERMS >= 2
+ const vfloat32m2_t cfm1o3 = __riscv_vfmv_v_f_f32m2(-1 / 3.0f, vlmax);
+#endif
+#if ACOS_TERMS >= 3
+ const vfloat32m2_t cf1o5 = __riscv_vfmv_v_f_f32m2(1 / 5.0f, vlmax);
+#endif
+#if ACOS_TERMS >= 4
+ const vfloat32m2_t cfm1o7 = __riscv_vfmv_v_f_f32m2(-1 / 7.0f, vlmax);
+#endif
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
+ vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
+ vfloat32m2_t a =
+ __riscv_vfdiv(__riscv_vfsqrt(__riscv_vfnmsac(cf1, v, v, vl), vl), v, vl);
+ vfloat32m2_t z = __riscv_vfabs(a, vl);
+ vfloat32m2_t x = __riscv_vfdiv_mu(__riscv_vmflt(z, cf1, vl), z, cf1, z, vl);
+ x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl);
+ x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl);
+ x = __riscv_vfdiv(cf1, x, vl);
+ vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
+
+#if ACOS_TERMS < 1
+ vfloat32m2_t y = __riscv_vfmv_v_f_f32m2(0, vl);
+#elif ACOS_TERMS == 1
+ vfloat32m2_t y = cf1;
+#elif ACOS_TERMS == 2
+ vfloat32m2_t y = cfm1o3;
+ y = __riscv_vfmadd(y, xx, cf1, vl);
+#elif ACOS_TERMS == 3
+ vfloat32m2_t y = cf1o5;
+ y = __riscv_vfmadd(y, xx, cfm1o3, vl);
+ y = __riscv_vfmadd(y, xx, cf1, vl);
+#elif ACOS_TERMS == 4
+ vfloat32m2_t y = cfm1o7;
+ y = __riscv_vfmadd(y, xx, cf1o5, vl);
+ y = __riscv_vfmadd(y, xx, cfm1o3, vl);
+ y = __riscv_vfmadd(y, xx, cf1, vl);
+#else
+#error "ACOS_TERMS > 4 not supported by volk_32f_acos_32f_rvv"
+#endif
+ y = __riscv_vfmul(y, __riscv_vfmul(x, cf4, vl), vl);
+ y = __riscv_vfadd_mu(
+ __riscv_vmfgt(z, cf1, vl), y, y, __riscv_vfnmsub(y, cf2, cpio2, vl), vl);
+
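+ // y ~= atan(|a|) with a = sqrt(1-v^2)/v; restore the sign of a, then add
+ // pi for negative inputs to obtain acos(v).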
+ vfloat32m2_t acosine;
+ acosine = __riscv_vfneg_mu(RISCV_VMFLTZ(32m2, a, vl), y, y, vl);
+ acosine = __riscv_vfadd_mu(RISCV_VMFLTZ(32m2, v, vl), acosine, acosine, cpi, vl);
+
+ __riscv_vse32(bVector, acosine, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_acos_32f_u_H */
diff --git a/kernels/volk/volk_32f_asin_32f.h b/kernels/volk/volk_32f_asin_32f.h
index 09377163..1914c39e 100644
--- a/kernels/volk/volk_32f_asin_32f.h
+++ b/kernels/volk/volk_32f_asin_32f.h
@@ -486,4 +486,70 @@ volk_32f_asin_32f_generic(float* bVector, const float* aVector, unsigned int num
}
#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void
+volk_32f_asin_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
+{
+ size_t vlmax = __riscv_vsetvlmax_e32m2();
+
+ const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
+ const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
+ const vfloat32m2_t cf2 = __riscv_vfmv_v_f_f32m2(2.0f, vlmax);
+ const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
+
+#if ASIN_TERMS >= 2
+ const vfloat32m2_t cfm1o3 = __riscv_vfmv_v_f_f32m2(-1 / 3.0f, vlmax);
+#endif
+#if ASIN_TERMS >= 3
+ const vfloat32m2_t cf1o5 = __riscv_vfmv_v_f_f32m2(1 / 5.0f, vlmax);
+#endif
+#if ASIN_TERMS >= 4
+ const vfloat32m2_t cfm1o7 = __riscv_vfmv_v_f_f32m2(-1 / 7.0f, vlmax);
+#endif
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
+ vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
+ vfloat32m2_t a =
+ __riscv_vfdiv(v, __riscv_vfsqrt(__riscv_vfnmsac(cf1, v, v, vl), vl), vl);
+ vfloat32m2_t z = __riscv_vfabs(a, vl);
+ vfloat32m2_t x = __riscv_vfdiv_mu(__riscv_vmflt(z, cf1, vl), z, cf1, z, vl);
+ x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl);
+ x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl);
+ x = __riscv_vfdiv(cf1, x, vl);
+ vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
+
+#if ASIN_TERMS < 1
+ vfloat32m2_t y = __riscv_vfmv_v_f_f32m2(0, vl);
+#elif ASIN_TERMS == 1
+ vfloat32m2_t y = cf1;
+#elif ASIN_TERMS == 2
+ vfloat32m2_t y = cfm1o3;
+ y = __riscv_vfmadd(y, xx, cf1, vl);
+#elif ASIN_TERMS == 3
+ vfloat32m2_t y = cf1o5;
+ y = __riscv_vfmadd(y, xx, cfm1o3, vl);
+ y = __riscv_vfmadd(y, xx, cf1, vl);
+#elif ASIN_TERMS == 4
+ vfloat32m2_t y = cfm1o7;
+ y = __riscv_vfmadd(y, xx, cf1o5, vl);
+ y = __riscv_vfmadd(y, xx, cfm1o3, vl);
+ y = __riscv_vfmadd(y, xx, cf1, vl);
+#else
+#error "ASIN_TERMS > 4 not supported by volk_32f_asin_32f_rvv"
+#endif
+ y = __riscv_vfmul(y, __riscv_vfmul(x, cf4, vl), vl);
+ y = __riscv_vfadd_mu(
+ __riscv_vmfgt(z, cf1, vl), y, y, __riscv_vfnmsub(y, cf2, cpio2, vl), vl);
+
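+ // y ~= atan(|a|) with a = v/sqrt(1-v^2); restoring the sign of a yields
+ // asin(v).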
+ vfloat32m2_t asine;
+ asine = __riscv_vfneg_mu(RISCV_VMFLTZ(32m2, a, vl), y, y, vl);
+
+ __riscv_vse32(bVector, asine, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_asin_32f_u_H */
diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h
index dc5987cb..300f46ca 100644
--- a/kernels/volk/volk_32f_atan_32f.h
+++ b/kernels/volk/volk_32f_atan_32f.h
@@ -293,4 +293,46 @@ volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
}
#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_32f_atan_32f_rvv(float* out, const float* in, unsigned int num_points)
+{
+ size_t vlmax = __riscv_vsetvlmax_e32m2();
+
+ const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
+ const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax);
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax);
+ const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax);
+ const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax);
+ const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax);
+ const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax);
+ const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
+ vfloat32m2_t v = __riscv_vle32_v_f32m2(in, vl);
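+ // Range reduction: for |x| > 1 evaluate the polynomial on 1/x and use
+ // atan(x) = sign(x)*pi/2 - atan(1/x).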
+ vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(v, vl), cf1, vl);
+ vfloat32m2_t x = __riscv_vfdiv_mu(mswap, v, cf1, v, vl);
+ vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
+ vfloat32m2_t p = c13;
+ p = __riscv_vfmadd(p, xx, c11, vl);
+ p = __riscv_vfmadd(p, xx, c9, vl);
+ p = __riscv_vfmadd(p, xx, c7, vl);
+ p = __riscv_vfmadd(p, xx, c5, vl);
+ p = __riscv_vfmadd(p, xx, c3, vl);
+ p = __riscv_vfmadd(p, xx, c1, vl);
+ p = __riscv_vfmul(p, x, vl);
+
+ vfloat32m2_t t = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl);
+ p = __riscv_vmerge(p, t, mswap, vl);
+
+ __riscv_vse32(out, p, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_atan_32f_u_H */
diff --git a/kernels/volk/volk_32f_binary_slicer_32i.h b/kernels/volk/volk_32f_binary_slicer_32i.h
index 7606145b..861ef478 100644
--- a/kernels/volk/volk_32f_binary_slicer_32i.h
+++ b/kernels/volk/volk_32f_binary_slicer_32i.h
@@ -261,5 +261,21 @@ static inline void volk_32f_binary_slicer_32i_u_avx(int* cVector,
}
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_binary_slicer_32i_rvv(int* cVector,
+ const float* aVector,
+ unsigned int num_points)
+{
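+    // Invert the IEEE-754 sign bit and shift it down: the output is 1 when the
+    // sign bit is clear and 0 when it is set.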
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t*)aVector, vl);
+ v = __riscv_vsrl(__riscv_vnot(v, vl), 31, vl);
+ __riscv_vse32((uint32_t*)cVector, v, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32f_binary_slicer_32i_H */
diff --git a/kernels/volk/volk_32f_binary_slicer_8i.h b/kernels/volk/volk_32f_binary_slicer_8i.h
index c6929db4..9623ae90 100644
--- a/kernels/volk/volk_32f_binary_slicer_8i.h
+++ b/kernels/volk/volk_32f_binary_slicer_8i.h
@@ -500,5 +500,22 @@ static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector,
}
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_binary_slicer_8i_rvv(int8_t* cVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ vint8m2_t v0 = __riscv_vmv_v_x_i8m2(1, __riscv_vsetvlmax_e8m2());
+ for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl);
+ vint8m2_t vn = __riscv_vmerge(v0, 0, __riscv_vmflt(v, 0, vl), vl);
+ __riscv_vse8(cVector, vn, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32f_binary_slicer_8i_H */
diff --git a/kernels/volk/volk_32f_convert_64f.h b/kernels/volk/volk_32f_convert_64f.h
index 93d1c611..5e907d39 100644
--- a/kernels/volk/volk_32f_convert_64f.h
+++ b/kernels/volk/volk_32f_convert_64f.h
@@ -230,5 +230,20 @@ static inline void volk_32f_convert_64f_a_sse2(double* outputVector,
}
#endif /* LV_HAVE_SSE2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_convert_64f_rvv(double* outputVector,
+ const float* inputVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4_t v = __riscv_vle32_v_f32m4(inputVector, vl);
+ __riscv_vse64(outputVector, __riscv_vfwcvt_f(v, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32f_convert_64f_a_H */
diff --git a/kernels/volk/volk_32f_cos_32f.h b/kernels/volk/volk_32f_cos_32f.h
index 37785df0..854dd00e 100644
--- a/kernels/volk/volk_32f_cos_32f.h
+++ b/kernels/volk/volk_32f_cos_32f.h
@@ -127,8 +127,9 @@ static inline void volk_32f_cos_32f_a_avx512f(float* cosVector,
cp1),
s);
- for (i = 0; i < 3; i++)
+ for (i = 0; i < 3; i++) {
s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
+ }
s = _mm512_div_ps(s, ftwos);
sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
@@ -224,8 +225,9 @@ volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int n
cp1),
s);
- for (i = 0; i < 3; i++)
+ for (i = 0; i < 3; i++) {
s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
s = _mm256_div_ps(s, ftwos);
sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
@@ -335,8 +337,9 @@ volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_p
cp1),
s);
- for (i = 0; i < 3; i++)
+ for (i = 0; i < 3; i++) {
s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
s = _mm256_div_ps(s, ftwos);
sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
@@ -442,8 +445,9 @@ volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num
cp1),
s);
- for (i = 0; i < 3; i++)
+ for (i = 0; i < 3; i++) {
s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
s = _mm_div_ps(s, ftwos);
sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
@@ -546,8 +550,9 @@ static inline void volk_32f_cos_32f_u_avx512f(float* cosVector,
cp1),
s);
- for (i = 0; i < 3; i++)
+ for (i = 0; i < 3; i++) {
s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
+ }
s = _mm512_div_ps(s, ftwos);
sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
@@ -644,8 +649,9 @@ volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int n
cp1),
s);
- for (i = 0; i < 3; i++)
+ for (i = 0; i < 3; i++) {
s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
s = _mm256_div_ps(s, ftwos);
sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
@@ -755,8 +761,9 @@ volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_p
cp1),
s);
- for (i = 0; i < 3; i++)
+ for (i = 0; i < 3; i++) {
s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
s = _mm256_div_ps(s, ftwos);
sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
@@ -995,5 +1002,65 @@ volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_poi
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_32f_cos_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
+{
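+    // Range-reduce by multiples of pi/4 (the constant is split in three for
+    // precision), evaluate 2*(1 - cos) of the 1/8-scaled remainder with a short
+    // polynomial, undo the scaling with three s <- s*(4 - s) doubling steps,
+    // then pick quadrant and sign from q.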
+ size_t vlmax = __riscv_vsetvlmax_e32m2();
+
+ const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
+ const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
+ const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
+ const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);
+
+ const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
+ const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
+
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
+ const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05f, vlmax);
+ const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07f, vlmax);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
+ vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
+ vfloat32m2_t s = __riscv_vfabs(v, vl);
+ vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
+ vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);
+
+ s = __riscv_vfnmsac(s, cPio4a, r, vl);
+ s = __riscv_vfnmsac(s, cPio4b, r, vl);
+ s = __riscv_vfnmsac(s, cPio4c, r, vl);
+
+ s = __riscv_vfmul(s, 1 / 8.0f, vl);
+ s = __riscv_vfmul(s, s, vl);
+ vfloat32m2_t t = s;
+ s = __riscv_vfmsub(s, c5, c4, vl);
+ s = __riscv_vfmadd(s, t, c3, vl);
+ s = __riscv_vfmsub(s, t, c2, vl);
+ s = __riscv_vfmadd(s, t, cf1, vl);
+ s = __riscv_vfmul(s, t, vl);
+ s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
+ s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
+ s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
+ s = __riscv_vfmul(s, 1 / 2.0f, vl);
+
+ vfloat32m2_t sine =
+ __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
+ vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);
+
+ vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
+ vbool16_t m2 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 2, vl), 4, vl), 0, vl);
+
+ cosine = __riscv_vmerge(cosine, sine, m1, vl);
+ cosine = __riscv_vfneg_mu(m2, cosine, cosine, vl);
+
+ __riscv_vse32(bVector, cosine, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32f_cos_32f_u_H */
diff --git a/kernels/volk/volk_32f_exp_32f.h b/kernels/volk/volk_32f_exp_32f.h
index 13d21201..85571dbc 100644
--- a/kernels/volk/volk_32f_exp_32f.h
+++ b/kernels/volk/volk_32f_exp_32f.h
@@ -266,4 +266,58 @@ volk_32f_exp_32f_generic(float* bVector, const float* aVector, unsigned int num_
#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_32f_exp_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
+{
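+    // Cephes-style expf: n = round(x*log2(e)), x -= n*ln(2) (split into exp_C1
+    // and exp_C2), a degree-5 polynomial approximates e^r, and the result is
+    // scaled by 2^n assembled directly in the exponent bits.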
+ size_t vlmax = __riscv_vsetvlmax_e32m2();
+
+ const vfloat32m2_t exp_hi = __riscv_vfmv_v_f_f32m2(88.376259f, vlmax);
+ const vfloat32m2_t exp_lo = __riscv_vfmv_v_f_f32m2(-88.376259f, vlmax);
+ const vfloat32m2_t log2EF = __riscv_vfmv_v_f_f32m2(1.442695f, vlmax);
+ const vfloat32m2_t exp_C1 = __riscv_vfmv_v_f_f32m2(-0.6933594f, vlmax);
+ const vfloat32m2_t exp_C2 = __riscv_vfmv_v_f_f32m2(0.000212194f, vlmax);
+ const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
+ const vfloat32m2_t cf1o2 = __riscv_vfmv_v_f_f32m2(0.5f, vlmax);
+
+ const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(1.9875691500e-4, vlmax);
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(1.3981999507e-3, vlmax);
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(8.3334519073e-3, vlmax);
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(4.1665795894e-2, vlmax);
+ const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(1.6666665459e-1, vlmax);
+ const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.0000001201e-1, vlmax);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
+ vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
+ v = __riscv_vfmin(v, exp_hi, vl);
+ v = __riscv_vfmax(v, exp_lo, vl);
+ vfloat32m2_t fx = __riscv_vfmadd(v, log2EF, cf1o2, vl);
+
+ vfloat32m2_t rtz = __riscv_vfcvt_f(__riscv_vfcvt_rtz_x(fx, vl), vl);
+ fx = __riscv_vfsub_mu(__riscv_vmfgt(rtz, fx, vl), rtz, rtz, cf1, vl);
+ v = __riscv_vfmacc(v, fx, exp_C1, vl);
+ v = __riscv_vfmacc(v, fx, exp_C2, vl);
+ vfloat32m2_t vv = __riscv_vfmul(v, v, vl);
+
+ vfloat32m2_t y = c0;
+ y = __riscv_vfmadd(y, v, c1, vl);
+ y = __riscv_vfmadd(y, v, c2, vl);
+ y = __riscv_vfmadd(y, v, c3, vl);
+ y = __riscv_vfmadd(y, v, c4, vl);
+ y = __riscv_vfmadd(y, v, c5, vl);
+ y = __riscv_vfmadd(y, vv, v, vl);
+ y = __riscv_vfadd(y, cf1, vl);
+
+ vfloat32m2_t pow2n = __riscv_vreinterpret_f32m2(
+ __riscv_vsll(__riscv_vadd(__riscv_vfcvt_rtz_x(fx, vl), 0x7f, vl), 23, vl));
+
+ __riscv_vse32(bVector, __riscv_vfmul(y, pow2n, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_exp_32f_u_H */
diff --git a/kernels/volk/volk_32f_expfast_32f.h b/kernels/volk/volk_32f_expfast_32f.h
index 7dfbaacb..3b65968a 100644
--- a/kernels/volk/volk_32f_expfast_32f.h
+++ b/kernels/volk/volk_32f_expfast_32f.h
@@ -301,4 +301,25 @@ static inline void volk_32f_expfast_32f_generic(float* bVector,
}
#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_32f_expfast_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
+{
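+    // Schraudolph-style fast exp: compute A*x/ln(2) + (B - C), round to an
+    // integer and reinterpret it as the float's bit pattern.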
+ size_t vlmax = __riscv_vsetvlmax_e32m8();
+ const vfloat32m8_t ca = __riscv_vfmv_v_f_f32m8(A / Mln2, vlmax);
+ const vfloat32m8_t cb = __riscv_vfmv_v_f_f32m8(B - C, vlmax);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl);
+ v = __riscv_vfmadd(v, ca, cb, vl);
+ v = __riscv_vreinterpret_f32m8(__riscv_vfcvt_x(v, vl));
+ __riscv_vse32(bVector, v, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_expfast_32f_u_H */
diff --git a/kernels/volk/volk_32f_index_max_16u.h b/kernels/volk/volk_32f_index_max_16u.h
index 2aad087e..3e7c0fb9 100644
--- a/kernels/volk/volk_32f_index_max_16u.h
+++ b/kernels/volk/volk_32f_index_max_16u.h
@@ -359,4 +359,32 @@ volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_p
#endif /*LV_HAVE_AVX*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void
+volk_32f_index_max_16u_rvv(uint16_t* target, const float* src0, uint32_t num_points)
+{
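+    // Track each lane's running maximum and the index where it occurred, then
+    // reduce; the 16-bit result limits the search to the first USHRT_MAX points.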
+ vfloat32m8_t vmax = __riscv_vfmv_v_f_f32m8(-FLT_MAX, __riscv_vsetvlmax_e32m8());
+ vuint16m4_t vmaxi = __riscv_vmv_v_x_u16m4(0, __riscv_vsetvlmax_e16m4());
+ vuint16m4_t vidx = __riscv_vid_v_u16m4(__riscv_vsetvlmax_e16m4());
+ size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+ for (size_t vl; n > 0; n -= vl, src0 += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(src0, vl);
+ vbool4_t m = __riscv_vmfgt(v, vmax, vl);
+ vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
+ vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
+ vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4());
+ }
+ size_t vl = __riscv_vsetvlmax_e32m8();
+ float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK8(vfmax, f, 32, vmax),
+ __riscv_vfmv_v_f_f32m1(-FLT_MAX, 1),
+ __riscv_vsetvlmax_e32m1()));
+ vbool4_t m = __riscv_vmfeq(vmax, max, vl);
+ *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl));
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /*INCLUDED_volk_32f_index_max_16u_u_H*/
diff --git a/kernels/volk/volk_32f_index_max_32u.h b/kernels/volk/volk_32f_index_max_32u.h
index 86dad0d1..0bf071fc 100644
--- a/kernels/volk/volk_32f_index_max_32u.h
+++ b/kernels/volk/volk_32f_index_max_32u.h
@@ -542,4 +542,32 @@ volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_p
#endif /*LV_HAVE_SSE*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void
+volk_32f_index_max_32u_rvv(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(-FLT_MAX, __riscv_vsetvlmax_e32m4());
+ vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
+ vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, src0 += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4_t v = __riscv_vle32_v_f32m4(src0, vl);
+ vbool8_t m = __riscv_vmfgt(v, vmax, vl);
+ vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
+ vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
+ vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
+ }
+ size_t vl = __riscv_vsetvlmax_e32m4();
+ float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
+ __riscv_vfmv_v_f_f32m1(-FLT_MAX, 1),
+ __riscv_vsetvlmax_e32m1()));
+ vbool8_t m = __riscv_vmfeq(vmax, max, vl);
+ *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl));
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /*INCLUDED_volk_32f_index_max_32u_u_H*/
diff --git a/kernels/volk/volk_32f_index_min_16u.h b/kernels/volk/volk_32f_index_min_16u.h
index 000ecafc..5e1f0aa1 100644
--- a/kernels/volk/volk_32f_index_min_16u.h
+++ b/kernels/volk/volk_32f_index_min_16u.h
@@ -346,4 +346,32 @@ volk_32f_index_min_16u_u_avx(uint16_t* target, const float* source, uint32_t num
#endif /*LV_HAVE_AVX*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void
+volk_32f_index_min_16u_rvv(uint16_t* target, const float* src0, uint32_t num_points)
+{
+ vfloat32m8_t vmin = __riscv_vfmv_v_f_f32m8(FLT_MAX, __riscv_vsetvlmax_e32m8());
+ vuint16m4_t vmini = __riscv_vmv_v_x_u16m4(0, __riscv_vsetvlmax_e16m4());
+ vuint16m4_t vidx = __riscv_vid_v_u16m4(__riscv_vsetvlmax_e16m4());
+ size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+ for (size_t vl; n > 0; n -= vl, src0 += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(src0, vl);
+ vbool4_t m = __riscv_vmflt(v, vmin, vl);
+ vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
+ vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
+ vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4());
+ }
+ size_t vl = __riscv_vsetvlmax_e32m8();
+ float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK8(vfmin, f, 32, vmin),
+ __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
+ __riscv_vsetvlmax_e32m1()));
+ vbool4_t m = __riscv_vmfeq(vmin, min, vl);
+ *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl));
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /*INCLUDED_volk_32f_index_min_16u_u_H*/
diff --git a/kernels/volk/volk_32f_index_min_32u.h b/kernels/volk/volk_32f_index_min_32u.h
index 0c8bf8c0..7d01fbb4 100644
--- a/kernels/volk/volk_32f_index_min_32u.h
+++ b/kernels/volk/volk_32f_index_min_32u.h
@@ -508,4 +508,32 @@ volk_32f_index_min_32u_u_sse(uint32_t* target, const float* source, uint32_t num
#endif /*LV_HAVE_SSE*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void
+volk_32f_index_min_32u_rvv(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
+ vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
+ vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, src0 += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4_t v = __riscv_vle32_v_f32m4(src0, vl);
+ vbool8_t m = __riscv_vmflt(v, vmin, vl);
+ vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
+ vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
+ vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
+ }
+ size_t vl = __riscv_vsetvlmax_e32m4();
+ float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
+ __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
+ __riscv_vsetvlmax_e32m1()));
+ vbool8_t m = __riscv_vmfeq(vmin, min, vl);
+ *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl));
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /*INCLUDED_volk_32f_index_min_32u_u_H*/
diff --git a/kernels/volk/volk_32f_invsqrt_32f.h b/kernels/volk/volk_32f_invsqrt_32f.h
index e91b6c7c..838c9927 100644
--- a/kernels/volk/volk_32f_invsqrt_32f.h
+++ b/kernels/volk/volk_32f_invsqrt_32f.h
@@ -97,8 +97,9 @@ volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int nu
}
number = eighthPoints * 8;
- for (; number < num_points; number++)
+ for (; number < num_points; number++) {
*cPtr++ = Q_rsqrt(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
@@ -156,8 +157,9 @@ volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num
cPtr += 4;
}
- for (number = quarter_points * 4; number < num_points; number++)
+ for (number = quarter_points * 4; number < num_points; number++) {
*cPtr++ = Q_rsqrt(*aPtr++);
+ }
}
#endif /* LV_HAVE_NEON */
@@ -198,9 +200,25 @@ volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int nu
}
number = eighthPoints * 8;
- for (; number < num_points; number++)
+ for (; number < num_points; number++) {
*cPtr++ = Q_rsqrt(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_32f_invsqrt_32f_rvv(float* cVector, const float* aVector, unsigned int num_points)
+{
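+    // vfrsqrt7 returns a reciprocal square-root estimate accurate to roughly
+    // 7 bits, in keeping with the approximate Q_rsqrt used elsewhere in this file.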
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl);
+ __riscv_vse32(cVector, __riscv_vfrsqrt7(v, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_invsqrt_32f_a_H */
diff --git a/kernels/volk/volk_32f_log2_32f.h b/kernels/volk/volk_32f_log2_32f.h
index 0443e56e..47a7cbe3 100644
--- a/kernels/volk/volk_32f_log2_32f.h
+++ b/kernels/volk/volk_32f_log2_32f.h
@@ -95,8 +95,9 @@ volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num
const float* aPtr = aVector;
unsigned int number = 0;
- for (number = 0; number < num_points; number++)
+ for (number = 0; number < num_points; number++) {
*bPtr++ = log2f_non_ieee(*aPtr++);
+ }
}
#endif /* LV_HAVE_GENERIC */
@@ -718,5 +719,73 @@ volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_
#endif /* LV_HAVE_AVX2 for unaligned */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_32f_log2_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
+{
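+    // log2(x) = exponent + log2(mantissa): the exponent is read from the
+    // IEEE-754 bits, the mantissa (normalized to [1, 2)) goes through a polynomial.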
+ size_t vlmax = __riscv_vsetvlmax_e32m2();
+
+#if LOG_POLY_DEGREE == 6
+ const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3.1157899f, vlmax);
+ const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(-3.3241990f, vlmax);
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.5988452f, vlmax);
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.2315303f, vlmax);
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(3.1821337e-1f, vlmax);
+ const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-3.4436006e-2f, vlmax);
+#elif LOG_POLY_DEGREE == 5
+ const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(2.8882704548164776201f, vlmax);
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-2.52074962577807006663f, vlmax);
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(1.48116647521213171641f, vlmax);
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-0.465725644288844778798f, vlmax);
+ const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.0596515482674574969533f, vlmax);
+#elif LOG_POLY_DEGREE == 4
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.61761038894603480148f, vlmax);
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.75647175389045657003f, vlmax);
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(0.688243882994381274313f, vlmax);
+ const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-0.107254423828329604454f, vlmax);
+#elif LOG_POLY_DEGREE == 3
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(2.28330284476918490682f, vlmax);
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-1.04913055217340124191f, vlmax);
+ const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.204446009836232697516f, vlmax);
+#else
+#error "LOG_POLY_DEGREE must be between 3 and 6"
+#endif
+
+ const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
+ const vint32m2_t m1 = __riscv_vreinterpret_i32m2(cf1);
+ const vint32m2_t m2 = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax);
+ const vint32m2_t c127 = __riscv_vmv_v_x_i32m2(127, vlmax);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
+ vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
+ vfloat32m2_t a = __riscv_vfabs(v, vl);
+ vfloat32m2_t exp = __riscv_vfcvt_f(
+ __riscv_vsub(__riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), c127, vl),
+ vl);
+ vfloat32m2_t frac = __riscv_vreinterpret_f32m2(
+ __riscv_vor(__riscv_vand(__riscv_vreinterpret_i32m2(v), m2, vl), m1, vl));
+
+ vfloat32m2_t mant = c0;
+ mant = __riscv_vfmadd(mant, frac, c1, vl);
+ mant = __riscv_vfmadd(mant, frac, c2, vl);
+#if LOG_POLY_DEGREE >= 4
+ mant = __riscv_vfmadd(mant, frac, c3, vl);
+#if LOG_POLY_DEGREE >= 5
+ mant = __riscv_vfmadd(mant, frac, c4, vl);
+#if LOG_POLY_DEGREE >= 6
+ mant = __riscv_vfmadd(mant, frac, c5, vl);
+#endif
+#endif
+#endif
+ exp = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl);
+
+ __riscv_vse32(bVector, exp, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32f_log2_32f_u_H */
diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h
index 37bd16a8..f44a9885 100644
--- a/kernels/volk/volk_32f_reciprocal_32f.h
+++ b/kernels/volk/volk_32f_reciprocal_32f.h
@@ -198,4 +198,19 @@ volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_p
}
#endif /* LV_HAVE_AVX512F */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_32f_reciprocal_32f_rvv(float* out, const float* in, unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
+ __riscv_vse32(out, __riscv_vfrdiv(v, 1.0f, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_reciprocal_32f_u_H */
diff --git a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
index a6eb37c2..607bd6d8 100644
--- a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
+++ b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
@@ -335,4 +335,41 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector,
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_s32f_32f_fm_detect_32f_rvv(float* outputVector,
+ const float* inputVector,
+ const float bound,
+ float* saveValue,
+ unsigned int num_points)
+{
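+    // The first output uses the carried-over saveValue; the vector loop then
+    // differentiates in[i] - in[i-1] and folds the result back into [-bound, bound].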
+ if (num_points < 1)
+ return;
+
+ *outputVector = *inputVector - *saveValue;
+ if (*outputVector > bound)
+ *outputVector -= 2 * bound;
+ if (*outputVector < -bound)
+ *outputVector += 2 * bound;
+ ++inputVector;
+ ++outputVector;
+
+ vfloat32m8_t v2bound = __riscv_vfmv_v_f_f32m8(bound * 2, __riscv_vsetvlmax_e32m8());
+
+ size_t n = num_points - 1;
+ for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t va = __riscv_vle32_v_f32m8(inputVector, vl);
+ vfloat32m8_t vb = __riscv_vle32_v_f32m8(inputVector - 1, vl);
+ vfloat32m8_t v = __riscv_vfsub(va, vb, vl);
+ v = __riscv_vfsub_mu(__riscv_vmfgt(v, bound, vl), v, v, v2bound, vl);
+ v = __riscv_vfadd_mu(__riscv_vmflt(v, -bound, vl), v, v, v2bound, vl);
+ __riscv_vse32(outputVector, v, vl);
+ }
+
+ *saveValue = inputVector[-1];
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H */
diff --git a/kernels/volk/volk_32f_s32f_add_32f.h b/kernels/volk/volk_32f_s32f_add_32f.h
index d7ae2aa1..e3301a7a 100644
--- a/kernels/volk/volk_32f_s32f_add_32f.h
+++ b/kernels/volk/volk_32f_s32f_add_32f.h
@@ -258,4 +258,21 @@ static inline void volk_32f_s32f_add_32f_u_orc(float* cVector,
}
#endif /* LV_HAVE_ORC */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_s32f_add_32f_rvv(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl);
+ __riscv_vse32(cVector, __riscv_vfadd(v, scalar, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_s32f_add_32f_a_H */
diff --git a/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h b/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h
index 816f6092..368a987a 100644
--- a/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h
+++ b/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h
@@ -52,6 +52,8 @@
#include
#include
+#include <volk/volk_32f_accumulator_s32f.h>
+
#ifdef LV_HAVE_AVX
#include <immintrin.h>
@@ -458,4 +460,37 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_u_avx(float* noiseFloorAmplitude,
*noiseFloorAmplitude = localNoiseFloorAmplitude;
}
#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void
+volk_32f_s32f_calc_spectral_noise_floor_32f_rvv(float* noiseFloorAmplitude,
+ const float* realDataPoints,
+ const float spectralExclusionValue,
+ const unsigned int num_points)
+{
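+    // The mean of all bins (from the accumulator kernel) plus the exclusion value
+    // sets the threshold; only bins at or below it are averaged into the noise floor.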
+ float sum;
+ volk_32f_accumulator_s32f_rvv(&sum, realDataPoints, num_points);
+ float meanAmplitude = sum / num_points + spectralExclusionValue;
+
+ vfloat32m8_t vbin = __riscv_vfmv_v_f_f32m8(meanAmplitude, __riscv_vsetvlmax_e32m8());
+ vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
+ size_t n = num_points, binCount = 0;
+ for (size_t vl; n > 0; n -= vl, realDataPoints += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(realDataPoints, vl);
+ vbool4_t m = __riscv_vmfle(v, vbin, vl);
+ binCount += __riscv_vcpop(m, vl);
+ vsum = __riscv_vfadd_tumu(m, vsum, vsum, v, vl);
+ }
+ size_t vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t v = RISCV_SHRINK8(vfadd, f, 32, vsum);
+ vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
+ sum = __riscv_vfmv_f(__riscv_vfredusum(v, z, vl));
+
+ *noiseFloorAmplitude = binCount == 0 ? meanAmplitude : sum / binCount;
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H */
diff --git a/kernels/volk/volk_32f_s32f_clamppuppet_32f.h b/kernels/volk/volk_32f_s32f_clamppuppet_32f.h
index 254bfdd5..b4a0e3af 100644
--- a/kernels/volk/volk_32f_s32f_clamppuppet_32f.h
+++ b/kernels/volk/volk_32f_s32f_clamppuppet_32f.h
@@ -62,4 +62,14 @@ static inline void volk_32f_s32f_clamppuppet_32f_u_sse4_1(float* out,
}
#endif
+#ifdef LV_HAVE_RVV
+static inline void volk_32f_s32f_clamppuppet_32f_rvv(float* out,
+ const float* in,
+ const float min,
+ unsigned int num_points)
+{
+ volk_32f_s32f_x2_clamp_32f_rvv(out, in, min, -min, num_points);
+}
+#endif
+
#endif /* INCLUDED_volk_32f_s32f_clamppuppet_32f_H */
diff --git a/kernels/volk/volk_32f_s32f_convert_16i.h b/kernels/volk/volk_32f_s32f_convert_16i.h
index fe5a31b3..667e97f6 100644
--- a/kernels/volk/volk_32f_s32f_convert_16i.h
+++ b/kernels/volk/volk_32f_s32f_convert_16i.h
@@ -552,5 +552,22 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector,
}
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_s32f_convert_16i_rvv(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(inputVector, vl);
+ v = __riscv_vfmul(v, scalar, vl);
+ __riscv_vse16(outputVector, __riscv_vfncvt_x(v, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */
diff --git a/kernels/volk/volk_32f_s32f_convert_32i.h b/kernels/volk/volk_32f_s32f_convert_32i.h
index 0cd9dee8..b7b6fb1a 100644
--- a/kernels/volk/volk_32f_s32f_convert_32i.h
+++ b/kernels/volk/volk_32f_s32f_convert_32i.h
@@ -405,5 +405,22 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector,
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_s32f_convert_32i_rvv(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(inputVector, vl);
+ v = __riscv_vfmul(v, scalar, vl);
+ __riscv_vse32(outputVector, __riscv_vfcvt_x(v, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32f_s32f_convert_32i_a_H */
diff --git a/kernels/volk/volk_32f_s32f_convert_8i.h b/kernels/volk/volk_32f_s32f_convert_8i.h
index d47f95a0..a21ae7aa 100644
--- a/kernels/volk/volk_32f_s32f_convert_8i.h
+++ b/kernels/volk/volk_32f_s32f_convert_8i.h
@@ -437,5 +437,22 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector,
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_s32f_convert_8i_rvv(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
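+    // Round-convert the scaled floats to int16 first; vnclip then saturates the
+    // narrowing to int8.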
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(inputVector, vl);
+ vint16m4_t vi = __riscv_vfncvt_x(__riscv_vfmul(v, scalar, vl), vl);
+ __riscv_vse8(outputVector, __riscv_vnclip(vi, 0, 0, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
diff --git a/kernels/volk/volk_32f_s32f_convertpuppet_8u.h b/kernels/volk/volk_32f_s32f_convertpuppet_8u.h
index 7f530c44..aa1258ba 100644
--- a/kernels/volk/volk_32f_s32f_convertpuppet_8u.h
+++ b/kernels/volk/volk_32f_s32f_convertpuppet_8u.h
@@ -102,4 +102,15 @@ static inline void volk_32f_s32f_convertpuppet_8u_a_sse(uint8_t* output,
volk_32f_s32f_x2_convert_8u_a_sse(output, input, scale, 128.0, num_points);
}
#endif
+
+#ifdef LV_HAVE_RVV
+static inline void volk_32f_s32f_convertpuppet_8u_rvv(uint8_t* output,
+ const float* input,
+ float scale,
+ unsigned int num_points)
+{
+ volk_32f_s32f_x2_convert_8u_rvv(output, input, scale, 128.0, num_points);
+}
+#endif
+
#endif
diff --git a/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h b/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h
index 3a178596..f4a7a2b0 100644
--- a/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h
+++ b/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h
@@ -86,4 +86,14 @@ static inline void volk_32f_s32f_mod_rangepuppet_32f_a_avx(float* output,
output, input, bound - 3.131f, bound, num_points);
}
#endif
+#ifdef LV_HAVE_RVV
+static inline void volk_32f_s32f_mod_rangepuppet_32f_rvv(float* output,
+ const float* input,
+ float bound,
+ unsigned int num_points)
+{
+ volk_32f_s32f_s32f_mod_range_32f_rvv(
+ output, input, bound - 3.131f, bound, num_points);
+}
+#endif
#endif
diff --git a/kernels/volk/volk_32f_s32f_multiply_32f.h b/kernels/volk/volk_32f_s32f_multiply_32f.h
index 26fc148c..27d86149 100644
--- a/kernels/volk/volk_32f_s32f_multiply_32f.h
+++ b/kernels/volk/volk_32f_s32f_multiply_32f.h
@@ -257,4 +257,21 @@ static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector,
#endif /* LV_HAVE_ORC */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_s32f_multiply_32f_rvv(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl);
+ __riscv_vse32(cVector, __riscv_vfmul(v, scalar, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */
diff --git a/kernels/volk/volk_32f_s32f_normalize.h b/kernels/volk/volk_32f_s32f_normalize.h
index 46f5799b..e572f24c 100644
--- a/kernels/volk/volk_32f_s32f_normalize.h
+++ b/kernels/volk/volk_32f_s32f_normalize.h
@@ -203,5 +203,19 @@ static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer,
}
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_32f_s32f_normalize_rvv(float* vecBuffer, const float scalar, unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, vecBuffer += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(vecBuffer, vl);
+ __riscv_vse32(vecBuffer, __riscv_vfmul(v, 1.0f / scalar, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32f_s32f_normalize_u_H */
diff --git a/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h b/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h
index d185f102..f5176150 100644
--- a/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h
+++ b/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h
@@ -359,5 +359,37 @@ static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector,
}
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_s32f_s32f_mod_range_32f_rvv(float* outputVector,
+ const float* inputVector,
+ const float lower_bound,
+ const float upper_bound,
+ unsigned int num_points)
+{
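+    // Values outside [lower_bound, upper_bound] are shifted by the appropriate
+    // integer multiple of the interval width back into range.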
+ const float dist = upper_bound - lower_bound;
+ size_t vlmax = __riscv_vsetvlmax_e32m4();
+ vfloat32m4_t vdist = __riscv_vfmv_v_f_f32m4(dist, vlmax);
+ vfloat32m4_t vmdist = __riscv_vfmv_v_f_f32m4(-dist, vlmax);
+ vfloat32m4_t vupper = __riscv_vfmv_v_f_f32m4(upper_bound, vlmax);
+ vfloat32m4_t vlower = __riscv_vfmv_v_f_f32m4(lower_bound, vlmax);
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, outputVector += vl, inputVector += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4_t v = __riscv_vle32_v_f32m4(inputVector, vl);
+ vfloat32m4_t vlt = __riscv_vfsub(vlower, v, vl);
+ vfloat32m4_t vgt = __riscv_vfsub(v, vupper, vl);
+ vbool8_t mlt = __riscv_vmflt(v, vlower, vl);
+ vfloat32m4_t vmul = __riscv_vmerge(vmdist, vdist, mlt, vl);
+ vfloat32m4_t vcnt = __riscv_vfdiv(__riscv_vmerge(vgt, vlt, mlt, vl), vdist, vl);
+ vcnt = __riscv_vfcvt_f(__riscv_vadd(__riscv_vfcvt_rtz_x(vcnt, vl), 1, vl), vl);
+ vbool8_t mgt = __riscv_vmfgt(v, vupper, vl);
+ v = __riscv_vfmacc_mu(__riscv_vmor(mlt, mgt, vl), v, vcnt, vmul, vl);
+
+ __riscv_vse32(outputVector, v, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */
diff --git a/kernels/volk/volk_32f_s32f_stddev_32f.h b/kernels/volk/volk_32f_s32f_stddev_32f.h
index 3b5bb6e1..8774277b 100644
--- a/kernels/volk/volk_32f_s32f_stddev_32f.h
+++ b/kernels/volk/volk_32f_s32f_stddev_32f.h
@@ -344,4 +344,32 @@ static inline void volk_32f_s32f_stddev_32f_u_avx(float* stddev,
}
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32f_s32f_stddev_32f_rvv(float* stddev,
+ const float* inputBuffer,
+ const float mean,
+ unsigned int num_points)
+{
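+    // Accumulate per-lane sums of squares, reduce, and use
+    // stddev = sqrt(E[x^2] - mean^2).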
+ if (num_points == 0) {
+ *stddev = 0;
+ return;
+ }
+ vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, inputBuffer += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(inputBuffer, vl);
+ vsum = __riscv_vfmacc_tu(vsum, v, v, vl);
+ }
+ size_t vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t v = RISCV_SHRINK8(vfadd, f, 32, vsum);
+ v = __riscv_vfredusum(v, __riscv_vfmv_s_f_f32m1(0, vl), vl);
+ float sum = __riscv_vfmv_f(v);
+ *stddev = sqrtf((sum / num_points) - (mean * mean));
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_s32f_stddev_32f_u_H */
diff --git a/kernels/volk/volk_32f_s32f_x2_clamp_32f.h b/kernels/volk/volk_32f_s32f_x2_clamp_32f.h
index 19d51795..2b194eaa 100644
--- a/kernels/volk/volk_32f_s32f_x2_clamp_32f.h
+++ b/kernels/volk/volk_32f_s32f_x2_clamp_32f.h
@@ -187,4 +187,25 @@ static inline void volk_32f_s32f_x2_clamp_32f_u_sse4_1(float* out,
}
#endif /* LV_HAVE_SSE4_1 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_s32f_x2_clamp_32f_rvv(float* out,
+ const float* in,
+ const float min,
+ const float max,
+ unsigned int num_points)
+{
+ vfloat32m8_t vmin = __riscv_vfmv_v_f_f32m8(min, __riscv_vsetvlmax_e32m8());
+ vfloat32m8_t vmax = __riscv_vfmv_v_f_f32m8(max, __riscv_vsetvlmax_e32m8());
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
+ v = __riscv_vfmin(__riscv_vfmax(v, vmin, vl), vmax, vl);
+ __riscv_vse32(out, v, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H */
diff --git a/kernels/volk/volk_32f_s32f_x2_convert_8u.h b/kernels/volk/volk_32f_s32f_x2_convert_8u.h
index a52cdf28..1ad2b1ac 100644
--- a/kernels/volk/volk_32f_s32f_x2_convert_8u.h
+++ b/kernels/volk/volk_32f_s32f_x2_convert_8u.h
@@ -612,5 +612,24 @@ static inline void volk_32f_s32f_x2_convert_8u_a_sse(uint8_t* outputVector,
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_s32f_x2_convert_8u_rvv(uint8_t* outputVector,
+ const float* inputVector,
+ const float scale,
+ const float bias,
+ unsigned int num_points)
+{
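+    // Scale and bias, round-convert to uint16, then saturate-narrow to uint8.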
+ vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(bias, __riscv_vsetvlmax_e32m8());
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(inputVector, vl);
+ vuint16m4_t vi = __riscv_vfncvt_xu(__riscv_vfmadd_vf_f32m8(v, scale, vb, vl), vl);
+ __riscv_vse8(outputVector, __riscv_vnclipu(vi, 0, 0, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32f_s32f_x2_convert_8u_a_H */
diff --git a/kernels/volk/volk_32f_sin_32f.h b/kernels/volk/volk_32f_sin_32f.h
index 371e424f..a02f2260 100644
--- a/kernels/volk/volk_32f_sin_32f.h
+++ b/kernels/volk/volk_32f_sin_32f.h
@@ -127,8 +127,9 @@ static inline void volk_32f_sin_32f_a_avx512f(float* sinVector,
cp1),
s);
- for (i = 0; i < 3; i++)
+ for (i = 0; i < 3; i++) {
s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
+ }
s = _mm512_div_ps(s, ftwos);
sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
@@ -520,8 +521,9 @@ static inline void volk_32f_sin_32f_u_avx512f(float* sinVector,
cp1),
s);
- for (i = 0; i < 3; i++)
+ for (i = 0; i < 3; i++) {
s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
+ }
s = _mm512_div_ps(s, ftwos);
sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
@@ -893,5 +895,67 @@ volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_poi
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_32f_sin_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
+{
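+    // Same pi/4 range reduction and reduced-argument evaluation as
+    // volk_32f_cos_32f_rvv; the final sign also folds in the input's sign bit.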
+ size_t vlmax = __riscv_vsetvlmax_e32m2();
+
+ const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
+ const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
+ const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
+ const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);
+
+ const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
+ const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
+
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
+ const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05, vlmax);
+ const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07, vlmax);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
+ vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
+ vfloat32m2_t s = __riscv_vfabs(v, vl);
+ vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
+ vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);
+
+ s = __riscv_vfnmsac(s, cPio4a, r, vl);
+ s = __riscv_vfnmsac(s, cPio4b, r, vl);
+ s = __riscv_vfnmsac(s, cPio4c, r, vl);
+
+ s = __riscv_vfmul(s, 1 / 8.0f, vl);
+ s = __riscv_vfmul(s, s, vl);
+ vfloat32m2_t t = s;
+ s = __riscv_vfmsub(s, c5, c4, vl);
+ s = __riscv_vfmadd(s, t, c3, vl);
+ s = __riscv_vfmsub(s, t, c2, vl);
+ s = __riscv_vfmadd(s, t, cf1, vl);
+ s = __riscv_vfmul(s, t, vl);
+ s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
+ s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
+ s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
+ s = __riscv_vfmul(s, 1 / 2.0f, vl);
+
+ vfloat32m2_t sine =
+ __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
+ vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);
+
+ vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
+ vbool16_t m2 = __riscv_vmxor(__riscv_vmslt(__riscv_vreinterpret_i32m2(v), 0, vl),
+ __riscv_vmsne(__riscv_vand(q, 4, vl), 0, vl),
+ vl);
+
+ sine = __riscv_vmerge(sine, cosine, m1, vl);
+ sine = __riscv_vfneg_mu(m2, sine, sine, vl);
+
+ __riscv_vse32(bVector, sine, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32f_sin_32f_u_H */
diff --git a/kernels/volk/volk_32f_sqrt_32f.h b/kernels/volk/volk_32f_sqrt_32f.h
index 9d269413..c5672534 100644
--- a/kernels/volk/volk_32f_sqrt_32f.h
+++ b/kernels/volk/volk_32f_sqrt_32f.h
@@ -205,4 +205,20 @@ volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_p
}
#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_32f_sqrt_32f_rvv(float* cVector, const float* aVector, unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl);
+ __riscv_vse32(cVector, __riscv_vfsqrt(v, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_sqrt_32f_u_H */
diff --git a/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h b/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h
index c71514bb..96535ed6 100644
--- a/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h
+++ b/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h
@@ -569,4 +569,75 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev,
}
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_stddev_and_mean_32f_x2_rvv(float* stddev,
+ float* mean,
+ const float* inputBuffer,
+ unsigned int num_points)
+{
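+    // Per-lane running sums with Youngs-Cramer style updates of the squared
+    // deviations; lane partials are then merged pairwise, each merge adding a
+    // (sumA - sumB)^2 / (2n) correction term.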
+ size_t vlmax = __riscv_vsetvlmax_e32m4();
+ if (num_points < vlmax) {
+ volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
+ return;
+ }
+
+ vfloat32m4_t vsum = __riscv_vle32_v_f32m4(inputBuffer, vlmax);
+ inputBuffer += vlmax;
+ vfloat32m4_t vsumsq = __riscv_vfmv_v_f_f32m4(0, vlmax);
+ size_t partLen = num_points / vlmax;
+
+ for (size_t i = 1; i < partLen; ++i, inputBuffer += vlmax) {
+ vfloat32m4_t v = __riscv_vle32_v_f32m4(inputBuffer, vlmax);
+ vsum = __riscv_vfadd(vsum, v, vlmax);
+ vfloat32m4_t vaux = __riscv_vfmsub(v, i + 1.0f, vsum, vlmax);
+ vaux = __riscv_vfmul(vaux, vaux, vlmax);
+ vaux = __riscv_vfmul(vaux, 1.0f / (i * (i + 1.0f)), vlmax);
+ vsumsq = __riscv_vfadd(vsumsq, vaux, vlmax);
+ }
+
+ size_t vl = __riscv_vsetvlmax_e32m2();
+ vfloat32m2_t vsum2 =
+ __riscv_vfadd(__riscv_vget_f32m2(vsum, 0), __riscv_vget_f32m2(vsum, 1), vl);
+ vfloat32m2_t vfix2 =
+ __riscv_vfsub(__riscv_vget_f32m2(vsum, 0), __riscv_vget_f32m2(vsum, 1), vl);
+ vfix2 = __riscv_vfmul(vfix2, vfix2, vl);
+ vfloat32m2_t vsumsq2 =
+ __riscv_vfadd(__riscv_vget_f32m2(vsumsq, 0), __riscv_vget_f32m2(vsumsq, 1), vl);
+ vsumsq2 = __riscv_vfmacc(vsumsq2, 0.5f / (num_points / vlmax), vfix2, vl);
+
+ vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t vsum1 =
+ __riscv_vfadd(__riscv_vget_f32m1(vsum2, 0), __riscv_vget_f32m1(vsum2, 1), vl);
+ vfloat32m1_t vfix1 =
+ __riscv_vfsub(__riscv_vget_f32m1(vsum2, 0), __riscv_vget_f32m1(vsum2, 1), vl);
+ vfix1 = __riscv_vfmul(vfix1, vfix1, vl);
+ vfloat32m1_t vsumsq1 =
+ __riscv_vfadd(__riscv_vget_f32m1(vsumsq2, 0), __riscv_vget_f32m1(vsumsq2, 1), vl);
+ vsumsq1 = __riscv_vfmacc(vsumsq1, 0.5f / (num_points / vlmax * 2), vfix1, vl);
+
+ for (size_t n = num_points / vlmax * 4, vl = vlmax >> 2; vl >>= 1; n *= 2) {
+ vfloat32m1_t vsumdown = __riscv_vslidedown(vsum1, vl, vl);
+ vfix1 = __riscv_vfsub(vsum1, vsumdown, vl);
+ vfix1 = __riscv_vfmul(vfix1, vfix1, vl);
+ vsum1 = __riscv_vfadd(vsum1, vsumdown, vl);
+ vsumsq1 = __riscv_vfadd(vsumsq1, __riscv_vslidedown(vsumsq1, vl, vl), vl);
+ vsumsq1 = __riscv_vfmacc(vsumsq1, 0.5f / n, vfix1, vl);
+ }
+
+ float sum = __riscv_vfmv_f(vsum1);
+ float sumsq = __riscv_vfmv_f(vsumsq1);
+
+ for (size_t i = partLen * vlmax; i < num_points; ++i) {
+ float in = *inputBuffer++;
+ sum += in;
+ sumsq = update_square_sum_1_val(sumsq, sum, i, in);
+ }
+
+ *stddev = sqrtf(sumsq / num_points);
+ *mean = sum / num_points;
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */
diff --git a/kernels/volk/volk_32f_tan_32f.h b/kernels/volk/volk_32f_tan_32f.h
index 1ec0202f..28810c94 100644
--- a/kernels/volk/volk_32f_tan_32f.h
+++ b/kernels/volk/volk_32f_tan_32f.h
@@ -750,5 +750,72 @@ volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_poi
}
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_32f_tan_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
+{
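+    // Sine and cosine are evaluated exactly as in volk_32f_sin_32f_rvv and
+    // volk_32f_cos_32f_rvv, then tan = sine / cosine.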
+ size_t vlmax = __riscv_vsetvlmax_e32m2();
+
+ const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
+ const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
+ const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
+ const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);
+
+ const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
+ const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
+
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
+ const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05f, vlmax);
+ const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07f, vlmax);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
+ vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
+ vfloat32m2_t s = __riscv_vfabs(v, vl);
+ vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
+ vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);
+
+ s = __riscv_vfnmsac(s, cPio4a, r, vl);
+ s = __riscv_vfnmsac(s, cPio4b, r, vl);
+ s = __riscv_vfnmsac(s, cPio4c, r, vl);
+
+ s = __riscv_vfmul(s, 1 / 8.0f, vl);
+ s = __riscv_vfmul(s, s, vl);
+ vfloat32m2_t t = s;
+ s = __riscv_vfmsub(s, c5, c4, vl);
+ s = __riscv_vfmadd(s, t, c3, vl);
+ s = __riscv_vfmsub(s, t, c2, vl);
+ s = __riscv_vfmadd(s, t, cf1, vl);
+ s = __riscv_vfmul(s, t, vl);
+ s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
+ s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
+ s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
+ s = __riscv_vfmul(s, 1 / 2.0f, vl);
+
+ vfloat32m2_t sine =
+ __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
+ vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);
+
+ vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
+ vbool16_t m2 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 2, vl), 4, vl), 0, vl);
+ vbool16_t m3 = __riscv_vmxor(__riscv_vmslt(__riscv_vreinterpret_i32m2(v), 0, vl),
+ __riscv_vmsne(__riscv_vand(q, 4, vl), 0, vl),
+ vl);
+
+ vfloat32m2_t sine0 = sine;
+ sine = __riscv_vmerge(sine, cosine, m1, vl);
+ sine = __riscv_vfneg_mu(m3, sine, sine, vl);
+
+ cosine = __riscv_vmerge(cosine, sine0, m1, vl);
+ cosine = __riscv_vfneg_mu(m2, cosine, cosine, vl);
+
+ __riscv_vse32(bVector, __riscv_vfdiv(sine, cosine, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32f_tan_32f_u_H */
diff --git a/kernels/volk/volk_32f_tanh_32f.h b/kernels/volk/volk_32f_tanh_32f.h
index 3e36adb7..e90e4025 100644
--- a/kernels/volk/volk_32f_tanh_32f.h
+++ b/kernels/volk/volk_32f_tanh_32f.h
@@ -412,4 +412,38 @@ volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int n
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_32f_tanh_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
+{
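+    // Rational (Pade-style) approximation tanh(x) ~= a(x) / b(x) with the same
+    // 135135/17325/378... coefficients used by the SSE/AVX kernels in this file.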
+ size_t vlmax = __riscv_vsetvlmax_e32m2();
+
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(135135.0f, vlmax);
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(17325.0f, vlmax);
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(378.0f, vlmax);
+ const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(62370.0f, vlmax);
+ const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3150.0f, vlmax);
+ const vfloat32m2_t c6 = __riscv_vfmv_v_f_f32m2(28.0f, vlmax);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
+ vfloat32m2_t x = __riscv_vle32_v_f32m2(aVector, vl);
+ vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
+ vfloat32m2_t a, b;
+ a = __riscv_vfadd(xx, c3, vl);
+ a = __riscv_vfmadd(a, xx, c2, vl);
+ a = __riscv_vfmadd(a, xx, c1, vl);
+ a = __riscv_vfmul(a, x, vl);
+ b = c6;
+ b = __riscv_vfmadd(b, xx, c5, vl);
+ b = __riscv_vfmadd(b, xx, c4, vl);
+ b = __riscv_vfmadd(b, xx, c1, vl);
+ __riscv_vse32(bVector, __riscv_vfdiv(a, b, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_tanh_32f_u_H */
diff --git a/kernels/volk/volk_32f_x2_add_32f.h b/kernels/volk/volk_32f_x2_add_32f.h
index f99e6b55..be9f6aa7 100644
--- a/kernels/volk/volk_32f_x2_add_32f.h
+++ b/kernels/volk/volk_32f_x2_add_32f.h
@@ -391,5 +391,22 @@ static inline void volk_32f_x2_add_32f_u_orc(float* cVector,
#endif /* LV_HAVE_ORC */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_x2_add_32f_rvv(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl);
+ vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl);
+ __riscv_vse32(cVector, __riscv_vfadd(va, vb, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32f_x2_add_32f_a_H */
diff --git a/kernels/volk/volk_32f_x2_divide_32f.h b/kernels/volk/volk_32f_x2_divide_32f.h
index bcb9da7c..fbece7d5 100644
--- a/kernels/volk/volk_32f_x2_divide_32f.h
+++ b/kernels/volk/volk_32f_x2_divide_32f.h
@@ -347,4 +347,22 @@ static inline void volk_32f_x2_divide_32f_u_avx(float* cVector,
}
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_x2_divide_32f_rvv(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl);
+ vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl);
+ __riscv_vse32(cVector, __riscv_vfdiv(va, vb, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_x2_divide_32f_u_H */
diff --git a/kernels/volk/volk_32f_x2_dot_prod_16i.h b/kernels/volk/volk_32f_x2_dot_prod_16i.h
index 3a4b7177..3502b3a5 100644
--- a/kernels/volk/volk_32f_x2_dot_prod_16i.h
+++ b/kernels/volk/volk_32f_x2_dot_prod_16i.h
@@ -678,5 +678,20 @@ static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result,
#endif /*LV_HAVE_AVX512F*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+#include "volk_32f_x2_dot_prod_32f.h"
+
+static inline void volk_32f_x2_dot_prod_16i_rvv(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ float fresult = 0;
+ volk_32f_x2_dot_prod_32f_rvv(&fresult, input, taps, num_points);
+ *result = (int16_t)rintf(fresult);
+}
+#endif /*LV_HAVE_RVV*/
#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/
diff --git a/kernels/volk/volk_32f_x2_dot_prod_32f.h b/kernels/volk/volk_32f_x2_dot_prod_32f.h
index 5bdb72ce..2d86411e 100644
--- a/kernels/volk/volk_32f_x2_dot_prod_32f.h
+++ b/kernels/volk/volk_32f_x2_dot_prod_32f.h
@@ -949,4 +949,28 @@ extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector,
unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32f_x2_dot_prod_32f_rvv(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
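+    // Tail-undisturbed multiply-accumulate (vfmacc_tu) keeps lanes past vl valid
+    // in the shorter final iteration; RISCV_SHRINK8 folds the LMUL=8 accumulator
+    // down to LMUL=1 before the scalar reduction.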
+ vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v0 = __riscv_vle32_v_f32m8(input, vl);
+ vfloat32m8_t v1 = __riscv_vle32_v_f32m8(taps, vl);
+ vsum = __riscv_vfmacc_tu(vsum, v0, v1, vl);
+ }
+ size_t vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t v = RISCV_SHRINK8(vfadd, f, 32, vsum);
+ v = __riscv_vfredusum(v, __riscv_vfmv_s_f_f32m1(0, vl), vl);
+ *result = __riscv_vfmv_f(v);
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
diff --git a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h
index b4901543..62e30ad8 100644
--- a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h
+++ b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h
@@ -79,4 +79,17 @@ static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector,
outputVector, inputVector, bound, saveValue, num_points);
}
#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_RVV
+static inline void volk_32f_x2_fm_detectpuppet_32f_rvv(float* outputVector,
+ const float* inputVector,
+ float* saveValue,
+ unsigned int num_points)
+{
+ const float bound = 2.0f;
+ volk_32f_s32f_32f_fm_detect_32f_rvv(
+ outputVector, inputVector, bound, saveValue, num_points);
+}
+#endif /* LV_HAVE_RVV */
+
#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H */
diff --git a/kernels/volk/volk_32f_x2_interleave_32fc.h b/kernels/volk/volk_32f_x2_interleave_32fc.h
index 140fa9ff..2190f1a4 100644
--- a/kernels/volk/volk_32f_x2_interleave_32fc.h
+++ b/kernels/volk/volk_32f_x2_interleave_32fc.h
@@ -255,4 +255,43 @@ static inline void volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector,
}
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_x2_interleave_32fc_rvv(lv_32fc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ unsigned int num_points)
+{
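+    // Pack each complex value into one 64-bit lane: widen(i + q) + q*(2^32 - 1)
+    // equals (q << 32) | i, so a plain 64-bit store interleaves I and Q
+    // (little-endian) without segment stores.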
+ uint64_t* out = (uint64_t*)complexVector;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, out += vl, iBuffer += vl, qBuffer += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint32m4_t vr = __riscv_vle32_v_u32m4((const uint32_t*)iBuffer, vl);
+ vuint32m4_t vi = __riscv_vle32_v_u32m4((const uint32_t*)qBuffer, vl);
+ vuint64m8_t vc =
+ __riscv_vwmaccu(__riscv_vwaddu_vv(vr, vi, vl), 0xFFFFFFFF, vi, vl);
+ __riscv_vse64(out, vc, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_32f_x2_interleave_32fc_rvvseg(lv_32fc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4_t vr = __riscv_vle32_v_f32m4(iBuffer, vl);
+ vfloat32m4_t vi = __riscv_vle32_v_f32m4(qBuffer, vl);
+ __riscv_vsseg2e32((float*)complexVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /* INCLUDED_volk_32f_x2_interleave_32fc_u_H */
diff --git a/kernels/volk/volk_32f_x2_max_32f.h b/kernels/volk/volk_32f_x2_max_32f.h
index 0f88ffe6..a0d48f75 100644
--- a/kernels/volk/volk_32f_x2_max_32f.h
+++ b/kernels/volk/volk_32f_x2_max_32f.h
@@ -330,4 +330,22 @@ static inline void volk_32f_x2_max_32f_u_avx(float* cVector,
}
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_x2_max_32f_rvv(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl);
+ vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl);
+ __riscv_vse32(cVector, __riscv_vfmax(va, vb, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_x2_max_32f_u_H */
diff --git a/kernels/volk/volk_32f_x2_min_32f.h b/kernels/volk/volk_32f_x2_min_32f.h
index 128c7483..2910b1f9 100644
--- a/kernels/volk/volk_32f_x2_min_32f.h
+++ b/kernels/volk/volk_32f_x2_min_32f.h
@@ -334,4 +334,22 @@ static inline void volk_32f_x2_min_32f_u_avx(float* cVector,
}
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_x2_min_32f_rvv(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl);
+ vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl);
+ __riscv_vse32(cVector, __riscv_vfmin(va, vb, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_x2_min_32f_u_H */
diff --git a/kernels/volk/volk_32f_x2_multiply_32f.h b/kernels/volk/volk_32f_x2_multiply_32f.h
index c36adfc2..af266041 100644
--- a/kernels/volk/volk_32f_x2_multiply_32f.h
+++ b/kernels/volk/volk_32f_x2_multiply_32f.h
@@ -356,5 +356,22 @@ static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector,
}
#endif /* LV_HAVE_ORC */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_x2_multiply_32f_rvv(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl);
+ vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl);
+ __riscv_vse32(cVector, __riscv_vfmul(va, vb, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32f_x2_multiply_32f_a_H */
diff --git a/kernels/volk/volk_32f_x2_pow_32f.h b/kernels/volk/volk_32f_x2_pow_32f.h
index 637fd4b7..c2b77233 100644
--- a/kernels/volk/volk_32f_x2_pow_32f.h
+++ b/kernels/volk/volk_32f_x2_pow_32f.h
@@ -976,4 +976,127 @@ static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
#endif /* LV_HAVE_AVX2 for unaligned */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_x2_pow_32f_rvv(float* cVector,
+ const float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ size_t vlmax = __riscv_vsetvlmax_e32m1();
+
+#if POW_POLY_DEGREE == 6
+ const vfloat32m1_t cl5 = __riscv_vfmv_v_f_f32m1(3.1157899f, vlmax);
+ const vfloat32m1_t cl4 = __riscv_vfmv_v_f_f32m1(-3.3241990f, vlmax);
+ const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(2.5988452f, vlmax);
+ const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(-1.2315303f, vlmax);
+ const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(3.1821337e-1f, vlmax);
+ const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(-3.4436006e-2f, vlmax);
+#elif POW_POLY_DEGREE == 5
+ const vfloat32m1_t cl4 = __riscv_vfmv_v_f_f32m1(2.8882704548164776201f, vlmax);
+ const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(-2.52074962577807006663f, vlmax);
+ const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(1.48116647521213171641f, vlmax);
+ const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(-0.465725644288844778798f, vlmax);
+ const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(0.0596515482674574969533f, vlmax);
+#elif POW_POLY_DEGREE == 4
+ const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(2.61761038894603480148f, vlmax);
+ const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(-1.75647175389045657003f, vlmax);
+ const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(0.688243882994381274313f, vlmax);
+ const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(-0.107254423828329604454f, vlmax);
+#elif POW_POLY_DEGREE == 3
+ const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(2.28330284476918490682f, vlmax);
+ const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(-1.04913055217340124191f, vlmax);
+ const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(0.204446009836232697516f, vlmax);
+#else
+#error
+#endif
+
+ const vfloat32m1_t exp_hi = __riscv_vfmv_v_f_f32m1(88.376259f, vlmax);
+ const vfloat32m1_t exp_lo = __riscv_vfmv_v_f_f32m1(-88.376259f, vlmax);
+ const vfloat32m1_t log2EF = __riscv_vfmv_v_f_f32m1(1.442695f, vlmax);
+ const vfloat32m1_t exp_C1 = __riscv_vfmv_v_f_f32m1(-0.6933594f, vlmax);
+ const vfloat32m1_t exp_C2 = __riscv_vfmv_v_f_f32m1(0.000212194f, vlmax);
+ const vfloat32m1_t cf1 = __riscv_vfmv_v_f_f32m1(1.0f, vlmax);
+ const vfloat32m1_t cf1o2 = __riscv_vfmv_v_f_f32m1(0.5f, vlmax);
+ const vfloat32m1_t ln2 = __riscv_vfmv_v_f_f32m1(0.6931471805f, vlmax);
+
+ const vfloat32m1_t ce0 = __riscv_vfmv_v_f_f32m1(1.9875691500e-4, vlmax);
+ const vfloat32m1_t ce1 = __riscv_vfmv_v_f_f32m1(1.3981999507e-3, vlmax);
+ const vfloat32m1_t ce2 = __riscv_vfmv_v_f_f32m1(8.3334519073e-3, vlmax);
+ const vfloat32m1_t ce3 = __riscv_vfmv_v_f_f32m1(4.1665795894e-2, vlmax);
+ const vfloat32m1_t ce4 = __riscv_vfmv_v_f_f32m1(1.6666665459e-1, vlmax);
+ const vfloat32m1_t ce5 = __riscv_vfmv_v_f_f32m1(5.0000001201e-1, vlmax);
+
+ const vint32m1_t m1 = __riscv_vreinterpret_i32m1(cf1);
+ const vint32m1_t m2 = __riscv_vmv_v_x_i32m1(0x7FFFFF, vlmax);
+ const vint32m1_t c127 = __riscv_vmv_v_x_i32m1(127, vlmax);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m1(n);
+ vfloat32m1_t va = __riscv_vle32_v_f32m1(aVector, vl);
+ vfloat32m1_t log;
+
+ { /* log(a) */
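+ // floor(log2(a)) comes from the exponent bits; the mantissa is forced into [1,2)
+ // and fed to the POW_POLY_DEGREE polynomial below, then scaled by ln(2)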
+ vfloat32m1_t a = __riscv_vfabs(va, vl);
+ vfloat32m1_t exp = __riscv_vfcvt_f(
+ __riscv_vsub(
+ __riscv_vsra(__riscv_vreinterpret_i32m1(a), 23, vl), c127, vl),
+ vl);
+ vfloat32m1_t frac = __riscv_vreinterpret_f32m1(__riscv_vor(
+ __riscv_vand(__riscv_vreinterpret_i32m1(va), m2, vl), m1, vl));
+
+ vfloat32m1_t mant = cl0;
+ mant = __riscv_vfmadd(mant, frac, cl1, vl);
+ mant = __riscv_vfmadd(mant, frac, cl2, vl);
+#if POW_POLY_DEGREE >= 4
+ mant = __riscv_vfmadd(mant, frac, cl3, vl);
+#if POW_POLY_DEGREE >= 5
+ mant = __riscv_vfmadd(mant, frac, cl4, vl);
+#if POW_POLY_DEGREE >= 6
+ mant = __riscv_vfmadd(mant, frac, cl5, vl);
+#endif
+#endif
+#endif
+ log = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl);
+ log = __riscv_vfmul(log, ln2, vl);
+ }
+
+ vfloat32m1_t vb = __riscv_vle32_v_f32m1(bVector, vl);
+ vb = __riscv_vfmul(vb, log, vl); /* b*log(a) */
+ vfloat32m1_t exp;
+
+ { /* exp(b*log(a)) */
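+ // range-reduce with n = round(x * log2(e)) (split-constant subtraction via exp_C1/exp_C2),
+ // run a small polynomial on the remainder, then scale by 2^n built in the exponent field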
+ vb = __riscv_vfmin(vb, exp_hi, vl);
+ vb = __riscv_vfmax(vb, exp_lo, vl);
+ vfloat32m1_t fx = __riscv_vfmadd(vb, log2EF, cf1o2, vl);
+
+ vfloat32m1_t rtz = __riscv_vfcvt_f(__riscv_vfcvt_rtz_x(fx, vl), vl);
+ fx = __riscv_vfsub_mu(__riscv_vmfgt(rtz, fx, vl), rtz, rtz, cf1, vl);
+ vb = __riscv_vfmacc(vb, exp_C1, fx, vl);
+ vb = __riscv_vfmacc(vb, exp_C2, fx, vl);
+ vfloat32m1_t vv = __riscv_vfmul(vb, vb, vl);
+
+ vfloat32m1_t y = ce0;
+ y = __riscv_vfmadd(y, vb, ce1, vl);
+ y = __riscv_vfmadd(y, vb, ce2, vl);
+ y = __riscv_vfmadd(y, vb, ce3, vl);
+ y = __riscv_vfmadd(y, vb, ce4, vl);
+ y = __riscv_vfmadd(y, vb, ce5, vl);
+ y = __riscv_vfmadd(y, vv, vb, vl);
+ y = __riscv_vfadd(y, cf1, vl);
+
+ vfloat32m1_t pow2n = __riscv_vreinterpret_f32m1(__riscv_vsll(
+ __riscv_vadd(__riscv_vfcvt_rtz_x(fx, vl), c127, vl), 23, vl));
+
+ exp = __riscv_vfmul(y, pow2n, vl);
+ }
+
+ __riscv_vse32(cVector, exp, vl);
+ }
+}
+
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_x2_log2_32f_u_H */
diff --git a/kernels/volk/volk_32f_x2_powpuppet_32f.h b/kernels/volk/volk_32f_x2_powpuppet_32f.h
index 419ee18e..d4df0b3d 100644
--- a/kernels/volk/volk_32f_x2_powpuppet_32f.h
+++ b/kernels/volk/volk_32f_x2_powpuppet_32f.h
@@ -111,4 +111,16 @@ static inline void volk_32f_x2_powpuppet_32f_u_avx2(float* cVector,
}
#endif /* LV_HAVE_AVX2 for unaligned */
+#ifdef LV_HAVE_RVV
+static inline void volk_32f_x2_powpuppet_32f_rvv(float* cVector,
+ const float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* aVectorPos = make_positive(aVector, num_points);
+ volk_32f_x2_pow_32f_rvv(cVector, bVector, aVectorPos, num_points);
+ volk_free(aVectorPos);
+}
+#endif /* LV_HAVE_RVV */
+
#endif /* INCLUDED_volk_32f_x2_powpuppet_32f_H */
diff --git a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h
index 2ddfb0fd..9a78a01a 100644
--- a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h
+++ b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h
@@ -326,5 +326,51 @@ static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVec
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_x2_s32f_interleave_16ic_rvv(lv_16sc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
+ uint32_t* out = (uint32_t*)complexVector;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, out += vl, iBuffer += vl, qBuffer += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t vrf = __riscv_vle32_v_f32m8(iBuffer, vl);
+ vfloat32m8_t vif = __riscv_vle32_v_f32m8(qBuffer, vl);
+ vint16m4_t vri = __riscv_vfncvt_x(__riscv_vfmul(vrf, scalar, vl), vl);
+ vint16m4_t vii = __riscv_vfncvt_x(__riscv_vfmul(vif, scalar, vl), vl);
+ vuint16m4_t vr = __riscv_vreinterpret_u16m4(vri);
+ vuint16m4_t vi = __riscv_vreinterpret_u16m4(vii);
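+ // same packing trick as the float interleave: (r + i) + (2^16 - 1) * i == r | (i << 16)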
+ vuint32m8_t vc = __riscv_vwmaccu(__riscv_vwaddu_vv(vr, vi, vl), 0xFFFF, vi, vl);
+ __riscv_vse32(out, vc, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_32f_x2_s32f_interleave_16ic_rvvseg(lv_16sc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t vrf = __riscv_vle32_v_f32m8(iBuffer, vl);
+ vfloat32m8_t vif = __riscv_vle32_v_f32m8(qBuffer, vl);
+ vint16m4_t vri = __riscv_vfncvt_x(__riscv_vfmul(vrf, scalar, vl), vl);
+ vint16m4_t vii = __riscv_vfncvt_x(__riscv_vfmul(vif, scalar, vl), vl);
+ __riscv_vsseg2e16(
+ (int16_t*)complexVector, __riscv_vcreate_v_i16m4x2(vri, vii), vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H */
diff --git a/kernels/volk/volk_32f_x2_subtract_32f.h b/kernels/volk/volk_32f_x2_subtract_32f.h
index 631b72f8..e3d563fc 100644
--- a/kernels/volk/volk_32f_x2_subtract_32f.h
+++ b/kernels/volk/volk_32f_x2_subtract_32f.h
@@ -272,4 +272,22 @@ static inline void volk_32f_x2_subtract_32f_u_avx(float* cVector,
}
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32f_x2_subtract_32f_rvv(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl);
+ vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl);
+ __riscv_vse32(cVector, __riscv_vfsub(va, vb, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32f_x2_subtract_32f_u_H */
diff --git a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
index 6afd262a..b9a83714 100644
--- a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
+++ b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
@@ -341,8 +341,9 @@ static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
}
}
- for (k = 0; k < 8; k += 2)
+ for (k = 0; k < 8; k += 2) {
result[k] = result[k] + result[k + 1];
+ }
*target = result[0] + result[2] + result[4] + result[6];
@@ -654,4 +655,45 @@ static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
}
#endif // LV_HAVE_AVX
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32f_x3_sum_of_poly_32f_rvv(float* target,
+ float* src0,
+ float* center_point_array,
+ float* cutoff,
+ unsigned int num_points)
+{
+ size_t vlmax = __riscv_vsetvlmax_e32m4();
+ vfloat32m4_t vsum = __riscv_vfmv_v_f_f32m4(0, vlmax);
+ float mul1 = center_point_array[0]; // scalar to avoid register spills
+ float mul2 = center_point_array[1];
+ vfloat32m4_t vmul3 = __riscv_vfmv_v_f_f32m4(center_point_array[2], vlmax);
+ vfloat32m4_t vmul4 = __riscv_vfmv_v_f_f32m4(center_point_array[3], vlmax);
+ vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(*cutoff, vlmax);
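+ // per element: c0*x + c1*x^2 + c2*x^3 + c3*x^4 with x clamped below by *cutoff;
+ // the constant term c4 is added once per point after the reduction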
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, src0 += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4_t v = __riscv_vle32_v_f32m4(src0, vl);
+ vfloat32m4_t v1 = __riscv_vfmax(v, vmax, vl);
+ vfloat32m4_t v2 = __riscv_vfmul(v1, v1, vl);
+ vfloat32m4_t v3 = __riscv_vfmul(v1, v2, vl);
+ vfloat32m4_t v4 = __riscv_vfmul(v2, v2, vl);
+ v2 = __riscv_vfmul(v2, mul2, vl);
+ v4 = __riscv_vfmul(v4, vmul4, vl);
+ v1 = __riscv_vfmadd(v1, mul1, v2, vl);
+ v3 = __riscv_vfmadd(v3, vmul3, v4, vl);
+ v1 = __riscv_vfadd(v1, v3, vl);
+ vsum = __riscv_vfadd_tu(vsum, vsum, v1, vl);
+ }
+ size_t vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t v = RISCV_SHRINK4(vfadd, f, 32, vsum);
+ vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
+ float sum = __riscv_vfmv_f(__riscv_vfredusum(v, z, vl));
+ *target = sum + num_points * center_point_array[4];
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/
diff --git a/kernels/volk/volk_32fc_32f_add_32fc.h b/kernels/volk/volk_32fc_32f_add_32fc.h
index b820ed5d..24eff2b4 100644
--- a/kernels/volk/volk_32fc_32f_add_32fc.h
+++ b/kernels/volk/volk_32fc_32f_add_32fc.h
@@ -230,5 +230,24 @@ static inline void volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector,
}
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_32f_add_32fc_rvv(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, cVector += vl, aVector += vl, bVector += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m8_t vc = __riscv_vle32_v_f32m8((const float*)aVector, vl * 2);
+ vuint32m4_t v = __riscv_vle32_v_u32m4((const uint32_t*)bVector, vl);
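+ // zero-extending each tap gives 64-bit lanes {b, 0.0f}, so one vfadd adds b to the
+ // real parts and leaves the imaginary parts untouched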
+ vfloat32m8_t vf = __riscv_vreinterpret_f32m8(
+ __riscv_vreinterpret_u32m8(__riscv_vzext_vf2_u64m8(v, vl)));
+ __riscv_vse32((float*)cVector, __riscv_vfadd(vc, vf, vl * 2), vl * 2);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32fc_32f_add_32fc_a_H */
diff --git a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
index 363bf657..472d405a 100644
--- a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
+++ b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
@@ -743,5 +743,63 @@ static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result,
#endif /*LV_HAVE_SSE*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32fc_32f_dot_prod_32fc_rvv(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
+ vfloat32m4_t vsumi = vsumr;
+ size_t n = num_points;
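+ // complex samples are loaded as 64-bit lanes and split into real/imag with narrowing
+ // shifts; the tail-undisturbed vfmacc keeps partial sums valid on the short last iteration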
+ for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)input, vl);
+ vfloat32m4_t vbr = __riscv_vle32_v_f32m4(taps, vl), vbi = vbr;
+ vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl));
+ vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl));
+ vsumr = __riscv_vfmacc_tu(vsumr, var, vbr, vl);
+ vsumi = __riscv_vfmacc_tu(vsumi, vai, vbi, vl);
+ }
+ size_t vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
+ vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
+ vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
+ *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
+ __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32fc_32f_dot_prod_32fc_rvvseg(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
+ vfloat32m4_t vsumi = vsumr;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)input, vl);
+ vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
+ vfloat32m4_t vbr = __riscv_vle32_v_f32m4(taps, vl), vbi = vbr;
+ vsumr = __riscv_vfmacc_tu(vsumr, var, vbr, vl);
+ vsumi = __riscv_vfmacc_tu(vsumi, vai, vbi, vl);
+ }
+ size_t vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
+ vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
+ vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
+ *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
+ __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
+}
+#endif /*LV_HAVE_RVVSEG*/
#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_H*/
diff --git a/kernels/volk/volk_32fc_32f_multiply_32fc.h b/kernels/volk/volk_32fc_32f_multiply_32fc.h
index 76ed1af7..b731414c 100644
--- a/kernels/volk/volk_32fc_32f_multiply_32fc.h
+++ b/kernels/volk/volk_32fc_32f_multiply_32fc.h
@@ -224,5 +224,24 @@ static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector,
#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_32f_multiply_32fc_rvv(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, cVector += vl, aVector += vl, bVector += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m8_t vc = __riscv_vle32_v_f32m8((const float*)aVector, vl * 2);
+ vuint32m4_t v = __riscv_vle32_v_u32m4((const uint32_t*)bVector, vl);
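+ // duplicate each tap into both halves of a 64-bit lane (b + 2^32*b == b | (b << 32)),
+ // so a single vfmul scales real and imaginary parts alike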
+ vfloat32m8_t vf = __riscv_vreinterpret_f32m8(__riscv_vreinterpret_u32m8(
+ __riscv_vwmaccu(__riscv_vwaddu_vv(v, v, vl), 0xFFFFFFFF, v, vl)));
+ __riscv_vse32((float*)cVector, __riscv_vfmul(vc, vf, vl * 2), vl * 2);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */
diff --git a/kernels/volk/volk_32fc_accumulator_s32fc.h b/kernels/volk/volk_32fc_accumulator_s32fc.h
index d7267ea6..72266bd5 100644
--- a/kernels/volk/volk_32fc_accumulator_s32fc.h
+++ b/kernels/volk/volk_32fc_accumulator_s32fc.h
@@ -276,4 +276,33 @@ static inline void volk_32fc_accumulator_s32fc_neon(lv_32fc_t* result,
}
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32fc_accumulator_s32fc_rvv(lv_32fc_t* result,
+ const lv_32fc_t* inputBuffer,
+ unsigned int num_points)
+{
+ size_t vlmax = __riscv_vsetvlmax_e32m8();
+ vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, vlmax);
+ const float* in = (const float*)inputBuffer;
+ size_t n = num_points * 2;
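+ // summed as a flat float stream; vl stays even (see below), so even lanes hold real
+ // and odd lanes imaginary partial sums, separated by the narrowing shifts after the loop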
+ for (size_t vl; n > 0; n -= vl, in += vl) {
+ vl = __riscv_vsetvl_e32m8(n < vlmax ? n : vlmax); /* force exact vl */
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
+ vsum = __riscv_vfadd_tu(vsum, vsum, v, vl);
+ }
+ vuint64m8_t vsumu = __riscv_vreinterpret_u64m8(__riscv_vreinterpret_u32m8(vsum));
+ vfloat32m4_t vsum1 = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vsumu, 0, vlmax));
+ vfloat32m4_t vsum2 = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vsumu, 32, vlmax));
+ vlmax = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsum1);
+ vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsum2);
+ vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vlmax);
+ *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vlmax)),
+ __riscv_vfmv_f(__riscv_vfredusum(vi, z, vlmax)));
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32fc_accumulator_s32fc_a_H */
diff --git a/kernels/volk/volk_32fc_conjugate_32fc.h b/kernels/volk/volk_32fc_conjugate_32fc.h
index aa1134ab..2edff119 100644
--- a/kernels/volk/volk_32fc_conjugate_32fc.h
+++ b/kernels/volk/volk_32fc_conjugate_32fc.h
@@ -260,4 +260,21 @@ static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector,
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_conjugate_32fc_rvv(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ vuint64m8_t m = __riscv_vmv_v_x_u64m8(1ull << 63, __riscv_vsetvlmax_e64m8());
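+ // one lv_32fc_t per 64-bit lane; XOR with 1<<63 flips the sign of the imaginary part,
+ // which sits in the upper 32 bits on little-endian RISC-V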
+ for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e64m8(n);
+ vuint64m8_t v = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl);
+ __riscv_vse64((uint64_t*)cVector, __riscv_vxor(v, m, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32fc_conjugate_32fc_a_H */
diff --git a/kernels/volk/volk_32fc_convert_16ic.h b/kernels/volk/volk_32fc_convert_16ic.h
index a38cce64..55768ab0 100644
--- a/kernels/volk/volk_32fc_convert_16ic.h
+++ b/kernels/volk/volk_32fc_convert_16ic.h
@@ -416,4 +416,23 @@ static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector,
}
}
#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_convert_16ic_rvv(lv_16sc_t* outputVector,
+ const lv_32fc_t* inputVector,
+ unsigned int num_points)
+{
+ int16_t* out = (int16_t*)outputVector;
+ float* in = (float*)inputVector;
+ size_t n = num_points * 2;
+ for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
+ __riscv_vse16(out, __riscv_vfncvt_x(v, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
diff --git a/kernels/volk/volk_32fc_deinterleave_32f_x2.h b/kernels/volk/volk_32fc_deinterleave_32f_x2.h
index f269d661..569942fe 100644
--- a/kernels/volk/volk_32fc_deinterleave_32f_x2.h
+++ b/kernels/volk/volk_32fc_deinterleave_32f_x2.h
@@ -254,4 +254,46 @@ static inline void volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer,
}
}
#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_deinterleave_32f_x2_rvv(float* iBuffer,
+ float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl);
+ vuint32m4_t vr = __riscv_vnsrl(vc, 0, vl);
+ vuint32m4_t vi = __riscv_vnsrl(vc, 32, vl);
+ __riscv_vse32((uint32_t*)iBuffer, vr, vl);
+ __riscv_vse32((uint32_t*)qBuffer, vi, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_32fc_deinterleave_32f_x2_rvvseg(float* iBuffer,
+ float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint32m4x2_t vc =
+ __riscv_vlseg2e32_v_u32m4x2((const uint32_t*)complexVector, vl);
+ vuint32m4_t vr = __riscv_vget_u32m4(vc, 0);
+ vuint32m4_t vi = __riscv_vget_u32m4(vc, 1);
+ __riscv_vse32((uint32_t*)iBuffer, vr, vl);
+ __riscv_vse32((uint32_t*)qBuffer, vi, vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_u_H */
diff --git a/kernels/volk/volk_32fc_deinterleave_64f_x2.h b/kernels/volk/volk_32fc_deinterleave_64f_x2.h
index 1af5098f..6599780b 100644
--- a/kernels/volk/volk_32fc_deinterleave_64f_x2.h
+++ b/kernels/volk/volk_32fc_deinterleave_64f_x2.h
@@ -314,4 +314,44 @@ static inline void volk_32fc_deinterleave_64f_x2_neon(double* iBuffer,
}
#endif /* LV_HAVE_NEONV8 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_deinterleave_64f_x2_rvv(double* iBuffer,
+ double* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl);
+ vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
+ vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
+ __riscv_vse64(iBuffer, __riscv_vfwcvt_f(vr, vl), vl);
+ __riscv_vse64(qBuffer, __riscv_vfwcvt_f(vi, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_32fc_deinterleave_64f_x2_rvvseg(double* iBuffer,
+ double* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)complexVector, vl);
+ vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
+ vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
+ __riscv_vse64(iBuffer, __riscv_vfwcvt_f(vr, vl), vl);
+ __riscv_vse64(qBuffer, __riscv_vfwcvt_f(vi, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */
diff --git a/kernels/volk/volk_32fc_deinterleave_imag_32f.h b/kernels/volk/volk_32fc_deinterleave_imag_32f.h
index 9e330d33..bb54411b 100644
--- a/kernels/volk/volk_32fc_deinterleave_imag_32f.h
+++ b/kernels/volk/volk_32fc_deinterleave_imag_32f.h
@@ -229,4 +229,22 @@ static inline void volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer,
}
}
#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_deinterleave_imag_32f_rvv(float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ const uint64_t* in = (const uint64_t*)complexVector;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in += vl, qBuffer += vl) {
+ vl = __riscv_vsetvl_e64m8(n);
+ vuint64m8_t vc = __riscv_vle64_v_u64m8(in, vl);
+ __riscv_vse32((uint32_t*)qBuffer, __riscv_vnsrl(vc, 32, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_u_H */
diff --git a/kernels/volk/volk_32fc_deinterleave_real_32f.h b/kernels/volk/volk_32fc_deinterleave_real_32f.h
index 6fc0679d..f75cdd03 100644
--- a/kernels/volk/volk_32fc_deinterleave_real_32f.h
+++ b/kernels/volk/volk_32fc_deinterleave_real_32f.h
@@ -234,4 +234,21 @@ static inline void volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_deinterleave_real_32f_rvv(float* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ const uint64_t* in = (const uint64_t*)complexVector;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
+ vl = __riscv_vsetvl_e64m8(n);
+ vuint64m8_t vc = __riscv_vle64_v_u64m8(in, vl);
+ __riscv_vse32((uint32_t*)iBuffer, __riscv_vnsrl(vc, 0, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_u_H */
diff --git a/kernels/volk/volk_32fc_deinterleave_real_64f.h b/kernels/volk/volk_32fc_deinterleave_real_64f.h
index 31d8f3ec..5c6b0c95 100644
--- a/kernels/volk/volk_32fc_deinterleave_real_64f.h
+++ b/kernels/volk/volk_32fc_deinterleave_real_64f.h
@@ -240,4 +240,21 @@ static inline void volk_32fc_deinterleave_real_64f_u_avx2(double* iBuffer,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_deinterleave_real_64f_rvv(double* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ const uint64_t* in = (const uint64_t*)complexVector;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
+ vl = __riscv_vsetvl_e64m8(n);
+ vuint32m4_t vi = __riscv_vnsrl(__riscv_vle64_v_u64m8(in, vl), 0, vl);
+ __riscv_vse64(iBuffer, __riscv_vfwcvt_f(__riscv_vreinterpret_f32m4(vi), vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_u_H */
diff --git a/kernels/volk/volk_32fc_index_max_16u.h b/kernels/volk/volk_32fc_index_max_16u.h
index 28b51766..781876d1 100644
--- a/kernels/volk/volk_32fc_index_max_16u.h
+++ b/kernels/volk/volk_32fc_index_max_16u.h
@@ -321,7 +321,7 @@ volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_
uint32_t i = 0;
- for (; i<num_bytes >> 3; ++i) {
+ for (; i < (num_bytes >> 3); ++i) {
sq_dist =
lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
@@ -466,4 +466,65 @@ static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target,
#endif /*LV_HAVE_AVX2*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void
+volk_32fc_index_max_16u_rvv(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+{
+ vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
+ vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
+ vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
+ size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
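+ // track the per-lane maximum of |z|^2 and the index where it occurred; indices are
+ // 16-bit, hence the USHRT_MAX cap above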
+ for (size_t vl; n > 0; n -= vl, src0 += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)src0, vl);
+ vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
+ vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
+ vbool8_t m = __riscv_vmflt(vmax, v, vl);
+ vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
+ vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
+ vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4());
+ }
+ size_t vl = __riscv_vsetvlmax_e32m4();
+ float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
+ __riscv_vfmv_v_f_f32m1(0, 1),
+ __riscv_vsetvlmax_e32m1()));
+ vbool8_t m = __riscv_vmfeq(vmax, max, vl);
+ *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl));
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void
+volk_32fc_index_max_16u_rvvseg(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+{
+ vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
+ vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
+ vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
+ size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+ for (size_t vl; n > 0; n -= vl, src0 += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)src0, vl);
+ vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
+ vbool8_t m = __riscv_vmflt(vmax, v, vl);
+ vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
+ vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
+ vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4());
+ }
+ size_t vl = __riscv_vsetvlmax_e32m4();
+ float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
+ __riscv_vfmv_v_f_f32m1(0, 1),
+ __riscv_vsetvlmax_e32m1()));
+ vbool8_t m = __riscv_vmfeq(vmax, max, vl);
+ *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl));
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/
diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h
index fafff48c..993187ca 100644
--- a/kernels/volk/volk_32fc_index_max_32u.h
+++ b/kernels/volk/volk_32fc_index_max_32u.h
@@ -307,7 +307,7 @@ volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_
uint32_t i = 0;
- for (; i<num_bytes >> 3; ++i) {
+ for (; i < (num_bytes >> 3); ++i) {
sq_dist =
lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
@@ -509,4 +509,65 @@ volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_poi
#endif /*LV_HAVE_NEON*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void
+volk_32fc_index_max_32u_rvv(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+{
+ vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
+ vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
+ vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, src0 += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)src0, vl);
+ vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
+ vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
+ vbool8_t m = __riscv_vmflt(vmax, v, vl);
+ vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
+ vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
+ vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
+ }
+ size_t vl = __riscv_vsetvlmax_e32m4();
+ float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
+ __riscv_vfmv_v_f_f32m1(0, 1),
+ __riscv_vsetvlmax_e32m1()));
+ vbool8_t m = __riscv_vmfeq(vmax, max, vl);
+ *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl));
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void
+volk_32fc_index_max_32u_rvvseg(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+{
+ vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
+ vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
+ vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, src0 += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)src0, vl);
+ vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
+ vbool8_t m = __riscv_vmflt(vmax, v, vl);
+ vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
+ vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
+ vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
+ }
+ size_t vl = __riscv_vsetvlmax_e32m4();
+ float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
+ __riscv_vfmv_v_f_f32m1(0, 1),
+ __riscv_vsetvlmax_e32m1()));
+ vbool8_t m = __riscv_vmfeq(vmax, max, vl);
+ *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl));
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /*INCLUDED_volk_32fc_index_max_32u_u_H*/
diff --git a/kernels/volk/volk_32fc_index_min_16u.h b/kernels/volk/volk_32fc_index_min_16u.h
index 6cf6d844..706db915 100644
--- a/kernels/volk/volk_32fc_index_min_16u.h
+++ b/kernels/volk/volk_32fc_index_min_16u.h
@@ -462,4 +462,67 @@ static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
#endif /*LV_HAVE_AVX2*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32fc_index_min_16u_rvv(uint16_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
+ vuint16m2_t vmini = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
+ vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
+ size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
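+ // same index-tracking scheme as the max kernels, seeded with FLT_MAX and using
+ // min/greater-than comparisons instead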
+ for (size_t vl; n > 0; n -= vl, source += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)source, vl);
+ vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
+ vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
+ vbool8_t m = __riscv_vmfgt(vmin, v, vl);
+ vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
+ vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
+ vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4());
+ }
+ size_t vl = __riscv_vsetvlmax_e32m4();
+ float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
+ __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
+ __riscv_vsetvlmax_e32m1()));
+ vbool8_t m = __riscv_vmfeq(vmin, min, vl);
+ *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl));
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32fc_index_min_16u_rvvseg(uint16_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
+ vuint16m2_t vmini = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
+ vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
+ size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+ for (size_t vl; n > 0; n -= vl, source += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)source, vl);
+ vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
+ vbool8_t m = __riscv_vmfgt(vmin, v, vl);
+ vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
+ vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
+ vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4());
+ }
+ size_t vl = __riscv_vsetvlmax_e32m4();
+ float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
+ __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
+ __riscv_vsetvlmax_e32m1()));
+ vbool8_t m = __riscv_vmfeq(vmin, min, vl);
+ *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl));
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/
diff --git a/kernels/volk/volk_32fc_index_min_32u.h b/kernels/volk/volk_32fc_index_min_32u.h
index 5e409b99..807a3bb5 100644
--- a/kernels/volk/volk_32fc_index_min_32u.h
+++ b/kernels/volk/volk_32fc_index_min_32u.h
@@ -504,4 +504,67 @@ static inline void volk_32fc_index_min_32u_neon(uint32_t* target,
#endif /*LV_HAVE_NEON*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32fc_index_min_32u_rvv(uint32_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
+ vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
+ vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, source += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)source, vl);
+ vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
+ vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
+ vbool8_t m = __riscv_vmfgt(vmin, v, vl);
+ vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
+ vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
+ vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
+ }
+ size_t vl = __riscv_vsetvlmax_e32m4();
+ float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
+ __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
+ __riscv_vsetvlmax_e32m1()));
+ vbool8_t m = __riscv_vmfeq(vmin, min, vl);
+ *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl));
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32fc_index_min_32u_rvvseg(uint32_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
+ vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
+ vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, source += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)source, vl);
+ vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
+ vbool8_t m = __riscv_vmfgt(vmin, v, vl);
+ vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
+ vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
+ vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
+ }
+ size_t vl = __riscv_vsetvlmax_e32m4();
+ float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
+ __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
+ __riscv_vsetvlmax_e32m1()));
+ vbool8_t m = __riscv_vmfeq(vmin, min, vl);
+ *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl));
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /*INCLUDED_volk_32fc_index_min_32u_u_H*/
diff --git a/kernels/volk/volk_32fc_magnitude_32f.h b/kernels/volk/volk_32fc_magnitude_32f.h
index eca00e24..7b4e44a5 100644
--- a/kernels/volk/volk_32fc_magnitude_32f.h
+++ b/kernels/volk/volk_32fc_magnitude_32f.h
@@ -420,5 +420,42 @@ static inline void volk_32fc_magnitude_32f_neon_fancy_sweet(
}
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_magnitude_32f_rvv(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl);
+ vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
+ vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
+ __riscv_vse32(magnitudeVector, __riscv_vfsqrt(v, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_32fc_magnitude_32f_rvvseg(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)complexVector, vl);
+ vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
+ vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
+ __riscv_vse32(magnitudeVector, __riscv_vfsqrt(v, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */
diff --git a/kernels/volk/volk_32fc_magnitude_squared_32f.h b/kernels/volk/volk_32fc_magnitude_squared_32f.h
index e7b11ae9..24fa3a9a 100644
--- a/kernels/volk/volk_32fc_magnitude_squared_32f.h
+++ b/kernels/volk/volk_32fc_magnitude_squared_32f.h
@@ -350,5 +350,42 @@ static inline void volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector,
}
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_magnitude_squared_32f_rvv(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl);
+ vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
+ vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
+ __riscv_vse32(magnitudeVector, v, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_32fc_magnitude_squared_32f_rvvseg(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)complexVector, vl);
+ vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
+ vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
+ __riscv_vse32(magnitudeVector, v, vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */
diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h
index 759db24c..7d98b7c2 100644
--- a/kernels/volk/volk_32fc_s32f_atan2_32f.h
+++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h
@@ -344,4 +344,113 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector,
}
#endif /* LV_HAVE_AVX2 for unaligned */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32fc_s32f_atan2_32f_rvv(float* outputVector,
+ const lv_32fc_t* inputVector,
+ const float normalizeFactor,
+ unsigned int num_points)
+{
+ size_t vlmax = __riscv_vsetvlmax_e32m2();
+
+ const vfloat32m2_t norm = __riscv_vfmv_v_f_f32m2(1 / normalizeFactor, vlmax);
+ const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax);
+ const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax);
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax);
+ const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax);
+ const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax);
+ const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax);
+ const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax);
+ const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
+ vuint64m4_t v = __riscv_vle64_v_u64m4((const uint64_t*)inputVector, vl);
+ vfloat32m2_t vr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 0, vl));
+ vfloat32m2_t vi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 32, vl));
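+ // divide the smaller-magnitude component by the larger so the ratio stays in [-1,1]
+ // for the odd polynomial; 0/0 NaNs are zeroed, swap and quadrant are fixed up afterwards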
+ vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(vi, vl), __riscv_vfabs(vr, vl), vl);
+ vfloat32m2_t x = __riscv_vfdiv(
+ __riscv_vmerge(vi, vr, mswap, vl), __riscv_vmerge(vr, vi, mswap, vl), vl);
+ vbool16_t mnan = __riscv_vmsgtu(__riscv_vfclass(x, vl), 0xFF, vl);
+ x = __riscv_vreinterpret_f32m2(
+ __riscv_vmerge(__riscv_vreinterpret_u32m2(x), 0, mnan, vl));
+
+ vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
+ vfloat32m2_t p = c13;
+ p = __riscv_vfmadd(p, xx, c11, vl);
+ p = __riscv_vfmadd(p, xx, c9, vl);
+ p = __riscv_vfmadd(p, xx, c7, vl);
+ p = __riscv_vfmadd(p, xx, c5, vl);
+ p = __riscv_vfmadd(p, xx, c3, vl);
+ p = __riscv_vfmadd(p, xx, c1, vl);
+ p = __riscv_vfmul(p, x, vl);
+
+ x = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl);
+ p = __riscv_vmerge(p, x, mswap, vl);
+ p = __riscv_vfadd_mu(
+ RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl);
+
+ __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32fc_s32f_atan2_32f_rvvseg(float* outputVector,
+ const lv_32fc_t* inputVector,
+ const float normalizeFactor,
+ unsigned int num_points)
+{
+ size_t vlmax = __riscv_vsetvlmax_e32m2();
+
+ const vfloat32m2_t norm = __riscv_vfmv_v_f_f32m2(1 / normalizeFactor, vlmax);
+ const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax);
+ const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax);
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax);
+ const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax);
+ const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax);
+ const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax);
+ const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax);
+ const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
+ vfloat32m2x2_t v = __riscv_vlseg2e32_v_f32m2x2((const float*)inputVector, vl);
+ vfloat32m2_t vr = __riscv_vget_f32m2(v, 0), vi = __riscv_vget_f32m2(v, 1);
+ vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(vi, vl), __riscv_vfabs(vr, vl), vl);
+ vfloat32m2_t x = __riscv_vfdiv(
+ __riscv_vmerge(vi, vr, mswap, vl), __riscv_vmerge(vr, vi, mswap, vl), vl);
+ vbool16_t mnan = __riscv_vmsgtu(__riscv_vfclass(x, vl), 0xFF, vl);
+ x = __riscv_vreinterpret_f32m2(
+ __riscv_vmerge(__riscv_vreinterpret_u32m2(x), 0, mnan, vl));
+
+ vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
+ vfloat32m2_t p = c13;
+ p = __riscv_vfmadd(p, xx, c11, vl);
+ p = __riscv_vfmadd(p, xx, c9, vl);
+ p = __riscv_vfmadd(p, xx, c7, vl);
+ p = __riscv_vfmadd(p, xx, c5, vl);
+ p = __riscv_vfmadd(p, xx, c3, vl);
+ p = __riscv_vfmadd(p, xx, c1, vl);
+ p = __riscv_vfmul(p, x, vl);
+
+ x = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl);
+ p = __riscv_vmerge(p, x, mswap, vl);
+ p = __riscv_vfadd_mu(
+ RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl);
+
+ __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_u_H */
diff --git a/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h b/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h
index c4bfc28e..51840e3b 100644
--- a/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h
+++ b/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h
@@ -253,4 +253,24 @@ volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_32fc_s32f_deinterleave_real_16i_rvv(int16_t* iBuffer,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ const uint64_t* in = (const uint64_t*)complexVector;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
+ vl = __riscv_vsetvl_e64m8(n);
+ vuint32m4_t vi = __riscv_vnsrl(__riscv_vle64_v_u64m8(in, vl), 0, vl);
+ vfloat32m4_t vif = __riscv_vfmul(__riscv_vreinterpret_f32m4(vi), scalar, vl);
+ __riscv_vse16(iBuffer, __riscv_vncvt_x(__riscv_vfcvt_x(vif, vl), vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H */
diff --git a/kernels/volk/volk_32fc_s32f_magnitude_16i.h b/kernels/volk/volk_32fc_s32f_magnitude_16i.h
index 21e12e2d..f699ed72 100644
--- a/kernels/volk/volk_32fc_s32f_magnitude_16i.h
+++ b/kernels/volk/volk_32fc_s32f_magnitude_16i.h
@@ -302,4 +302,46 @@ static inline void volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_s32f_magnitude_16i_rvv(int16_t* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl);
+ vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
+ vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
+ v = __riscv_vfmul(__riscv_vfsqrt(v, vl), scalar, vl);
+ __riscv_vse16(magnitudeVector, __riscv_vfncvt_x(v, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_32fc_s32f_magnitude_16i_rvvseg(int16_t* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)complexVector, vl);
+ vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
+ vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
+ v = __riscv_vfmul(__riscv_vfsqrt(v, vl), scalar, vl);
+ __riscv_vse16(magnitudeVector, __riscv_vfncvt_x(v, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_u_H */
diff --git a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h
index be9aa88a..f676758e 100644
--- a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h
+++ b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h
@@ -142,4 +142,167 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput,
#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_s32f_power_spectrum_32f_rvv(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ unsigned int num_points)
+{
+ size_t vlmax = __riscv_vsetvlmax_e32m2();
+
+#if LOG_POLY_DEGREE == 6
+ const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3.1157899f, vlmax);
+ const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(-3.3241990f, vlmax);
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.5988452f, vlmax);
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.2315303f, vlmax);
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(3.1821337e-1f, vlmax);
+ const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-3.4436006e-2f, vlmax);
+#elif LOG_POLY_DEGREE == 5
+ const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(2.8882704548164776201f, vlmax);
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-2.52074962577807006663f, vlmax);
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(1.48116647521213171641f, vlmax);
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-0.465725644288844778798f, vlmax);
+ const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.0596515482674574969533f, vlmax);
+#elif LOG_POLY_DEGREE == 4
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.61761038894603480148f, vlmax);
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.75647175389045657003f, vlmax);
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(0.688243882994381274313f, vlmax);
+ const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-0.107254423828329604454f, vlmax);
+#elif LOG_POLY_DEGREE == 3
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(2.28330284476918490682f, vlmax);
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-1.04913055217340124191f, vlmax);
+ const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.204446009836232697516f, vlmax);
+#else
+#error
+#endif
+
+ const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
+ const vint32m2_t m1 = __riscv_vreinterpret_i32m2(cf1);
+ const vint32m2_t m2 = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax);
+ const vint32m2_t c127 = __riscv_vmv_v_x_i32m2(127, vlmax);
+
+ const float normFactSq = 1.0 / (normalizationFactor * normalizationFactor);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexFFTInput += vl, logPowerOutput += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
+ vuint64m4_t vc = __riscv_vle64_v_u64m4((const uint64_t*)complexFFTInput, vl);
+ vfloat32m2_t vr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 0, vl));
+ vfloat32m2_t vi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 32, vl));
+ vfloat32m2_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
+ v = __riscv_vfmul(v, normFactSq, vl);
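+ // approximate log2(v): exponent bits give the integer part, a LOG_POLY_DEGREE polynomial
+ // in the mantissa gives the rest; volk_log2to10factor converts the result to dB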
+
+ vfloat32m2_t a = __riscv_vfabs(v, vl);
+ vfloat32m2_t exp = __riscv_vfcvt_f(
+ __riscv_vsub(__riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), c127, vl),
+ vl);
+ vfloat32m2_t frac = __riscv_vreinterpret_f32m2(
+ __riscv_vor(__riscv_vand(__riscv_vreinterpret_i32m2(v), m2, vl), m1, vl));
+
+ vfloat32m2_t mant = c0;
+ mant = __riscv_vfmadd(mant, frac, c1, vl);
+ mant = __riscv_vfmadd(mant, frac, c2, vl);
+#if LOG_POLY_DEGREE >= 4
+ mant = __riscv_vfmadd(mant, frac, c3, vl);
+#if LOG_POLY_DEGREE >= 5
+ mant = __riscv_vfmadd(mant, frac, c4, vl);
+#if LOG_POLY_DEGREE >= 6
+ mant = __riscv_vfmadd(mant, frac, c5, vl);
+#endif
+#endif
+#endif
+ v = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl);
+ v = __riscv_vfmul(v, volk_log2to10factor, vl);
+
+ __riscv_vse32(logPowerOutput, v, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void
+volk_32fc_s32f_power_spectrum_32f_rvvseg(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ unsigned int num_points)
+{
+ size_t vlmax = __riscv_vsetvlmax_e32m2();
+
+#if LOG_POLY_DEGREE == 6
+ const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3.1157899f, vlmax);
+ const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(-3.3241990f, vlmax);
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.5988452f, vlmax);
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.2315303f, vlmax);
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(3.1821337e-1f, vlmax);
+ const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-3.4436006e-2f, vlmax);
+#elif LOG_POLY_DEGREE == 5
+ const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(2.8882704548164776201f, vlmax);
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-2.52074962577807006663f, vlmax);
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(1.48116647521213171641f, vlmax);
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-0.465725644288844778798f, vlmax);
+ const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.0596515482674574969533f, vlmax);
+#elif LOG_POLY_DEGREE == 4
+ const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.61761038894603480148f, vlmax);
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.75647175389045657003f, vlmax);
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(0.688243882994381274313f, vlmax);
+ const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-0.107254423828329604454f, vlmax);
+#elif LOG_POLY_DEGREE == 3
+ const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(2.28330284476918490682f, vlmax);
+ const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-1.04913055217340124191f, vlmax);
+ const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.204446009836232697516f, vlmax);
+#else
+#error
+#endif
+
+ const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
+ const vint32m2_t m1 = __riscv_vreinterpret_i32m2(cf1);
+ const vint32m2_t m2 = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax);
+ const vint32m2_t c127 = __riscv_vmv_v_x_i32m2(127, vlmax);
+
+ const float normFactSq = 1.0 / (normalizationFactor * normalizationFactor);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, complexFFTInput += vl, logPowerOutput += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
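+ // the segment load deinterleaves real/imag directly, replacing the 64-bit
+ // load plus narrowing shifts of the plain RVV variant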
+ vfloat32m2x2_t vc =
+ __riscv_vlseg2e32_v_f32m2x2((const float*)complexFFTInput, vl);
+ vfloat32m2_t vr = __riscv_vget_f32m2(vc, 0);
+ vfloat32m2_t vi = __riscv_vget_f32m2(vc, 1);
+ vfloat32m2_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
+ v = __riscv_vfmul(v, normFactSq, vl);
+
+ vfloat32m2_t a = __riscv_vfabs(v, vl);
+ vfloat32m2_t exp = __riscv_vfcvt_f(
+ __riscv_vsub(__riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), c127, vl),
+ vl);
+ vfloat32m2_t frac = __riscv_vreinterpret_f32m2(
+ __riscv_vor(__riscv_vand(__riscv_vreinterpret_i32m2(v), m2, vl), m1, vl));
+
+ vfloat32m2_t mant = c0;
+ mant = __riscv_vfmadd(mant, frac, c1, vl);
+ mant = __riscv_vfmadd(mant, frac, c2, vl);
+#if LOG_POLY_DEGREE >= 4
+ mant = __riscv_vfmadd(mant, frac, c3, vl);
+#if LOG_POLY_DEGREE >= 5
+ mant = __riscv_vfmadd(mant, frac, c4, vl);
+#if LOG_POLY_DEGREE >= 6
+ mant = __riscv_vfmadd(mant, frac, c5, vl);
+#endif
+#endif
+#endif
+ v = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl);
+ v = __riscv_vfmul(v, volk_log2to10factor, vl);
+
+ __riscv_vse32(logPowerOutput, v, vl);
+ }
+}
+
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H */
diff --git a/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h b/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h
index 3ce071ca..1ae8ad92 100644
--- a/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h
+++ b/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h
@@ -170,4 +170,34 @@ volk_32fc_s32fc_rotator2puppet_32fc_u_avx_fma(lv_32fc_t* outVector,
#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
+#ifdef LV_HAVE_RVV
+static inline void volk_32fc_s32fc_rotator2puppet_32fc_rvv(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
+ volk_32fc_s32fc_x2_rotator2_32fc_rvv(
+ outVector, inVector, &phase_inc_n, phase, num_points);
+}
+#endif /*LV_HAVE_RVV*/
+
+
+#ifdef LV_HAVE_RVVSEG
+static inline void volk_32fc_s32fc_rotator2puppet_32fc_rvvseg(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
+ volk_32fc_s32fc_x2_rotator2_32fc_rvv(
+ outVector, inVector, &phase_inc_n, phase, num_points);
+}
+#endif /*LV_HAVE_RVVSEG*/
#endif /* INCLUDED_volk_32fc_s32fc_rotator2puppet_32fc_a_H */
diff --git a/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h b/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h
index bee1f068..e668e3c5 100644
--- a/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h
+++ b/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h
@@ -779,4 +779,158 @@ static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma(lv_32fc_t* outVect
#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
+/* Note on the RVV implementation:
+ * The complex multiply is expanded manually, because we don't care about the
+ * corner cases (Inf/NaN handling). Otherwise, without -ffast-math, the
+ * compiler would insert library calls, which invalidate all vector registers
+ * and spill them on each loop iteration. */
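+/* For reference, the expanded complex multiply used in these kernels, with
+ * a = ar + j*ai and b = br + j*bi:
+ *   re(a*b) = ar*br - ai*bi   (vfmul + vfnmsac)
+ *   im(a*b) = ar*bi + ai*br   (vfmul + vfmacc)
+ */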
+
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvv(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
+ size_t vlmax = __riscv_vsetvlmax_e32m2();
+ vlmax = vlmax < ROTATOR_RELOAD ? vlmax : ROTATOR_RELOAD;
+
+ lv_32fc_t inc = 1.0f;
+ vfloat32m2_t phr = __riscv_vfmv_v_f_f32m2(0, vlmax), phi = phr;
+ for (size_t i = 0; i < vlmax; ++i) {
+ lv_32fc_t ph =
+ lv_cmake(lv_creal(*phase) * lv_creal(inc) - lv_cimag(*phase) * lv_cimag(inc),
+ lv_creal(*phase) * lv_cimag(inc) + lv_cimag(*phase) * lv_creal(inc));
+ phr = __riscv_vfslide1down(phr, lv_creal(ph), vlmax);
+ phi = __riscv_vfslide1down(phi, lv_cimag(ph), vlmax);
+ inc = lv_cmake(
+ lv_creal(*phase_inc) * lv_creal(inc) - lv_cimag(*phase_inc) * lv_cimag(inc),
+ lv_creal(*phase_inc) * lv_cimag(inc) + lv_cimag(*phase_inc) * lv_creal(inc));
+ }
+ vfloat32m2_t incr = __riscv_vfmv_v_f_f32m2(lv_creal(inc), vlmax);
+ vfloat32m2_t inci = __riscv_vfmv_v_f_f32m2(lv_cimag(inc), vlmax);
+
+ size_t vl = 0;
+ if (num_points > 0)
+ while (1) {
+ size_t n = num_points < ROTATOR_RELOAD ? num_points : ROTATOR_RELOAD;
+ num_points -= n;
+
+ for (; n > 0; n -= vl, inVector += vl, outVector += vl) {
+ // vl
+
+static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvvseg(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
+ size_t vlmax = __riscv_vsetvlmax_e32m2();
+ vlmax = vlmax < ROTATOR_RELOAD ? vlmax : ROTATOR_RELOAD;
+
+ lv_32fc_t inc = 1.0f;
+ vfloat32m2_t phr = __riscv_vfmv_v_f_f32m2(0, vlmax), phi = phr;
+ for (size_t i = 0; i < vlmax; ++i) {
+ lv_32fc_t ph =
+ lv_cmake(lv_creal(*phase) * lv_creal(inc) - lv_cimag(*phase) * lv_cimag(inc),
+ lv_creal(*phase) * lv_cimag(inc) + lv_cimag(*phase) * lv_creal(inc));
+ phr = __riscv_vfslide1down(phr, lv_creal(ph), vlmax);
+ phi = __riscv_vfslide1down(phi, lv_cimag(ph), vlmax);
+ inc = lv_cmake(
+ lv_creal(*phase_inc) * lv_creal(inc) - lv_cimag(*phase_inc) * lv_cimag(inc),
+ lv_creal(*phase_inc) * lv_cimag(inc) + lv_cimag(*phase_inc) * lv_creal(inc));
+ }
+ vfloat32m2_t incr = __riscv_vfmv_v_f_f32m2(lv_creal(inc), vlmax);
+ vfloat32m2_t inci = __riscv_vfmv_v_f_f32m2(lv_cimag(inc), vlmax);
+
+ size_t vl = 0;
+ if (num_points > 0)
+ while (1) {
+ size_t n = num_points < ROTATOR_RELOAD ? num_points : ROTATOR_RELOAD;
+ num_points -= n;
+
+ for (; n > 0; n -= vl, inVector += vl, outVector += vl) {
+ // vl
+
+static inline void volk_32fc_x2_add_32fc_rvv(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ const float* ina = (const float*)aVector;
+ const float* inb = (const float*)bVector;
+ float* out = (float*)cVector;
+ size_t n = num_points * 2;
+ for (size_t vl; n > 0; n -= vl, ina += vl, inb += vl, out += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t va = __riscv_vle32_v_f32m8(ina, vl);
+ vfloat32m8_t vb = __riscv_vle32_v_f32m8(inb, vl);
+ __riscv_vse32(out, __riscv_vfadd(va, vb, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32fc_x2_add_32fc_a_H */
diff --git a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
index 7b9aae3a..a5a4a9df 100644
--- a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
+++ b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
@@ -421,5 +421,72 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result
#endif /*LV_HAVE_SSE3*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_rvv(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+ vfloat32m2_t vsumr = __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2());
+ vfloat32m2_t vsumi = vsumr;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
+ vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)input, vl);
+ vuint64m4_t vb = __riscv_vle64_v_u64m4((const uint64_t*)taps, vl);
+ vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl));
+ vfloat32m2_t vbr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 0, vl));
+ vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl));
+ vfloat32m2_t vbi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 32, vl));
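+ // conjugate the taps by negating their imaginary part; the rest is a plain
+ // complex multiply-accumulate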
+ vbi = __riscv_vfneg(vbi, vl);
+ vfloat32m2_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
+ vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
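+ // tail-undisturbed (_tu) adds keep lanes past vl intact, so the shorter
+ // final iteration does not clobber the running sums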
+ vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl);
+ vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl);
+ }
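+ // fold the LMUL=2 accumulators down to LMUL=1 (RISCV_SHRINK2 is a VOLK
+ // helper for this), then vfredusum reduces each component to a scalar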
+ size_t vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t vr = RISCV_SHRINK2(vfadd, f, 32, vsumr);
+ vfloat32m1_t vi = RISCV_SHRINK2(vfadd, f, 32, vsumi);
+ vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
+ *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
+ __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_rvvseg(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+ vfloat32m2_t vsumr = __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2());
+ vfloat32m2_t vsumi = vsumr;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
+ vfloat32m2x2_t va = __riscv_vlseg2e32_v_f32m2x2((const float*)input, vl);
+ vfloat32m2x2_t vb = __riscv_vlseg2e32_v_f32m2x2((const float*)taps, vl);
+ vfloat32m2_t var = __riscv_vget_f32m2(va, 0), vai = __riscv_vget_f32m2(va, 1);
+ vfloat32m2_t vbr = __riscv_vget_f32m2(vb, 0), vbi = __riscv_vget_f32m2(vb, 1);
+ vbi = __riscv_vfneg(vbi, vl);
+ vfloat32m2_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
+ vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
+ vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl);
+ vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl);
+ }
+ size_t vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t vr = RISCV_SHRINK2(vfadd, f, 32, vsumr);
+ vfloat32m1_t vi = RISCV_SHRINK2(vfadd, f, 32, vsumi);
+ vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
+ *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
+ __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
+}
+#endif /*LV_HAVE_RVVSEG*/
#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/
diff --git a/kernels/volk/volk_32fc_x2_divide_32fc.h b/kernels/volk/volk_32fc_x2_divide_32fc.h
index 3a013cb0..ceee6559 100644
--- a/kernels/volk/volk_32fc_x2_divide_32fc.h
+++ b/kernels/volk/volk_32fc_x2_divide_32fc.h
@@ -414,5 +414,66 @@ static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
}
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+
+static inline void volk_32fc_x2_divide_32fc_rvv(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ uint64_t* out = (uint64_t*)cVector;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, out += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl);
+ vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)bVector, vl);
+ vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl));
+ vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
+ vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl));
+ vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
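+ // complex division: a/b = a*conj(b) / |b|^2; vfrdiv computes 1/(br^2 + bi^2)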
+ vfloat32m4_t mul = __riscv_vfrdiv(
+ __riscv_vfmacc(__riscv_vfmul(vbi, vbi, vl), vbr, vbr, vl), 1.0f, vl);
+ vfloat32m4_t vr = __riscv_vfmul(
+ __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl), mul, vl);
+ vfloat32m4_t vi = __riscv_vfmul(
+ __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl), mul, vl);
+ vuint32m4_t vru = __riscv_vreinterpret_u32m4(vr);
+ vuint32m4_t viu = __riscv_vreinterpret_u32m4(vi);
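+ // re-interleave: r + i + 0xFFFFFFFF*i == r + (i << 32), so the widening
+ // add/multiply-add packs real into the low and imag into the high 32 bits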
+ vuint64m8_t v =
+ __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
+ __riscv_vse64(out, v, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_32fc_x2_divide_32fc_rvvseg(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl);
+ vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl);
+ vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
+ vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1);
+ vfloat32m4_t mul = __riscv_vfrdiv(
+ __riscv_vfmacc(__riscv_vfmul(vbi, vbi, vl), vbr, vbr, vl), 1.0f, vl);
+ vfloat32m4_t vr = __riscv_vfmul(
+ __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl), mul, vl);
+ vfloat32m4_t vi = __riscv_vfmul(
+ __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl), mul, vl);
+ __riscv_vsseg2e32_v_f32m4x2(
+ (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl);
+ }
+}
+
+#endif /*LV_HAVE_RVVSEG*/
#endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
diff --git a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
index 47d6f697..d4acab3a 100644
--- a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
+++ b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -730,5 +730,70 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result,
#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_rvv(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+ vfloat32m2_t vsumr = __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2());
+ vfloat32m2_t vsumi = vsumr;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
+ vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)input, vl);
+ vuint64m4_t vb = __riscv_vle64_v_u64m4((const uint64_t*)taps, vl);
+ vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl));
+ vfloat32m2_t vbr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 0, vl));
+ vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl));
+ vfloat32m2_t vbi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 32, vl));
+ vfloat32m2_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
+ vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
+ vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl);
+ vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl);
+ }
+ size_t vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t vr = RISCV_SHRINK2(vfadd, f, 32, vsumr);
+ vfloat32m1_t vi = RISCV_SHRINK2(vfadd, f, 32, vsumi);
+ vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
+ *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
+ __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_rvvseg(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+ vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
+ vfloat32m4_t vsumi = vsumr;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)input, vl);
+ vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl);
+ vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
+ vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1);
+ vfloat32m4_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
+ vfloat32m4_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
+ vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl);
+ vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl);
+ }
+ size_t vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
+ vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
+ vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
+ *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
+ __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
+}
+#endif /*LV_HAVE_RVVSEG*/
#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/
diff --git a/kernels/volk/volk_32fc_x2_multiply_32fc.h b/kernels/volk/volk_32fc_x2_multiply_32fc.h
index 96cefed5..2db2929b 100644
--- a/kernels/volk/volk_32fc_x2_multiply_32fc.h
+++ b/kernels/volk/volk_32fc_x2_multiply_32fc.h
@@ -460,4 +460,55 @@ static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector,
#endif /* LV_HAVE_ORC */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_x2_multiply_32fc_rvv(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl);
+ vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)bVector, vl);
+ vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl));
+ vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
+ vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl));
+ vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
+ vfloat32m4_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
+ vfloat32m4_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
+ vuint32m4_t vru = __riscv_vreinterpret_u32m4(vr);
+ vuint32m4_t viu = __riscv_vreinterpret_u32m4(vi);
+ vuint64m8_t v =
+ __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
+ __riscv_vse64((uint64_t*)cVector, v, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_32fc_x2_multiply_32fc_rvvseg(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl);
+ vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl);
+ vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
+ vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1);
+ vfloat32m4_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
+ vfloat32m4_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
+ __riscv_vsseg2e32_v_f32m4x2(
+ (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */
diff --git a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
index 12e4948a..ce01d6d6 100644
--- a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
+++ b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
@@ -287,5 +287,56 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector,
}
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_x2_multiply_conjugate_32fc_rvv(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl);
+ vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)bVector, vl);
+ vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl));
+ vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
+ vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl));
+ vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
+ vfloat32m4_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
+ vfloat32m4_t vi = __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl);
+ vuint32m4_t vru = __riscv_vreinterpret_u32m4(vr);
+ vuint32m4_t viu = __riscv_vreinterpret_u32m4(vi);
+ vuint64m8_t v =
+ __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
+ __riscv_vse64((uint64_t*)cVector, v, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_32fc_x2_multiply_conjugate_32fc_rvvseg(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl);
+ vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl);
+ vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
+ vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1);
+ vfloat32m4_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
+ vfloat32m4_t vi = __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl);
+ __riscv_vsseg2e32_v_f32m4x2(
+ (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl);
+ }
+}
+
+#endif /*LV_HAVE_RVVSEG*/
#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */
diff --git a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
index 54ffbf0f..0b956c20 100644
--- a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
+++ b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
@@ -535,4 +535,62 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target,
}
#endif // LV_HAVE_SSE
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvv(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
+ unsigned int num_points)
+{
+ size_t vlmax = __riscv_vsetvlmax_e32m4();
+ vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
+ vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);
+ vfloat32m4_t vscale = __riscv_vfmv_v_f_f32m4(scalar, vlmax);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)points, vl);
+ vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
+ vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
+ vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
+ vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
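+ // squared Euclidean distance |src0 - point|^2, scaled by 'scalar'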
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
+ __riscv_vse32(target, __riscv_vfmul(v, vscale, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvvseg(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
+ unsigned int num_points)
+{
+ size_t vlmax = __riscv_vsetvlmax_e32m4();
+ vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
+ vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);
+ vfloat32m4_t vscale = __riscv_vfmv_v_f_f32m4(scalar, vlmax);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)points, vl);
+ vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0);
+ vfloat32m4_t vbi = __riscv_vget_f32m4(vb, 1);
+ vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
+ vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
+ __riscv_vse32(target, __riscv_vfmul(v, vscale, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H*/
diff --git a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h
index b35bed5e..b27f7b7b 100644
--- a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h
+++ b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h
@@ -342,4 +342,69 @@ volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_neon(lv_32fc_t* cVector,
}
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_rvv(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t* scalar,
+ unsigned int num_points)
+{
+ vfloat32m2_t vbr =
+ __riscv_vfmv_v_f_f32m2(lv_creal(*scalar), __riscv_vsetvlmax_e32m2());
+ vfloat32m2_t vbi =
+ __riscv_vfmv_v_f_f32m2(lv_cimag(*scalar), __riscv_vsetvlmax_e32m2());
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, bVector += vl, aVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m2(n);
+ vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)bVector, vl);
+ vuint64m4_t vc = __riscv_vle64_v_u64m4((const uint64_t*)aVector, vl);
+ vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl));
+ vfloat32m2_t vcr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 0, vl));
+ vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl));
+ vfloat32m2_t vci = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 32, vl));
+ vfloat32m2_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
+ vfloat32m2_t vi = __riscv_vfnmsac(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
+ vuint32m2_t vru = __riscv_vreinterpret_u32m2(__riscv_vfadd(vr, vcr, vl));
+ vuint32m2_t viu = __riscv_vreinterpret_u32m2(__riscv_vfadd(vi, vci, vl));
+ vuint64m4_t v =
+ __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
+ __riscv_vse64((uint64_t*)cVector, v, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_rvvseg(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t* scalar,
+ unsigned int num_points)
+{
+ vfloat32m4_t vbr =
+ __riscv_vfmv_v_f_f32m4(lv_creal(*scalar), __riscv_vsetvlmax_e32m4());
+ vfloat32m4_t vbi =
+ __riscv_vfmv_v_f_f32m4(lv_cimag(*scalar), __riscv_vsetvlmax_e32m4());
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl);
+ vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl);
+ vfloat32m4_t vcr = __riscv_vget_f32m4(vc, 0), vci = __riscv_vget_f32m4(vc, 1);
+ vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
+ vfloat32m4_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
+ vfloat32m4_t vi = __riscv_vfnmsac(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
+ vr = __riscv_vfadd(vr, vcr, vl);
+ vi = __riscv_vfadd(vi, vci, vl);
+ __riscv_vsseg2e32_v_f32m4x2(
+ (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H */
diff --git a/kernels/volk/volk_32fc_x2_square_dist_32f.h b/kernels/volk/volk_32fc_x2_square_dist_32f.h
index 4a93d5bf..b711bcf1 100644
--- a/kernels/volk/volk_32fc_x2_square_dist_32f.h
+++ b/kernels/volk/volk_32fc_x2_square_dist_32f.h
@@ -277,7 +277,7 @@ static inline void volk_32fc_x2_square_dist_32f_generic(float* target,
float sq_dist;
unsigned int i = 0;
- for (; i> 3; ++i) {
+ for (; i < (num_bytes >> 3); ++i) {
diff = src0[0] - points[i];
sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
@@ -374,4 +374,56 @@ static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target,
#endif /*LV_HAVE_AVX2*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32fc_x2_square_dist_32f_rvv(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ unsigned int num_points)
+{
+ size_t vlmax = __riscv_vsetvlmax_e32m4();
+ vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
+ vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)points, vl);
+ vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
+ vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
+ vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
+ vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
+ __riscv_vse32(target, v, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_32fc_x2_square_dist_32f_rvvseg(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ unsigned int num_points)
+{
+ size_t vlmax = __riscv_vsetvlmax_e32m4();
+ vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
+ vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);
+
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)points, vl);
+ vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0);
+ vfloat32m4_t vbi = __riscv_vget_f32m4(vb, 1);
+ vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
+ vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
+ vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
+ __riscv_vse32(target, v, vl);
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/
diff --git a/kernels/volk/volk_32i_s32f_convert_32f.h b/kernels/volk/volk_32i_s32f_convert_32f.h
index 678290fc..749cb1af 100644
--- a/kernels/volk/volk_32i_s32f_convert_32f.h
+++ b/kernels/volk/volk_32i_s32f_convert_32f.h
@@ -313,5 +313,21 @@ static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector,
}
#endif /* LV_HAVE_SSE2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32i_s32f_convert_32f_rvv(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vfloat32m8_t v = __riscv_vfcvt_f(__riscv_vle32_v_i32m8(inputVector, vl), vl);
+ __riscv_vse32(outputVector, __riscv_vfmul(v, 1.0f / scalar, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */
diff --git a/kernels/volk/volk_32i_x2_and_32i.h b/kernels/volk/volk_32i_x2_and_32i.h
index d2bcf6b8..79e4f221 100644
--- a/kernels/volk/volk_32i_x2_and_32i.h
+++ b/kernels/volk/volk_32i_x2_and_32i.h
@@ -337,5 +337,22 @@ static inline void volk_32i_x2_and_32i_u_avx2(int32_t* cVector,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32i_x2_and_32i_rvv(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vint32m8_t va = __riscv_vle32_v_i32m8(aVector, vl);
+ vint32m8_t vb = __riscv_vle32_v_i32m8(bVector, vl);
+ __riscv_vse32(cVector, __riscv_vand(va, vb, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32i_x2_and_32i_u_H */
diff --git a/kernels/volk/volk_32i_x2_or_32i.h b/kernels/volk/volk_32i_x2_or_32i.h
index f3e4b769..3642f13d 100644
--- a/kernels/volk/volk_32i_x2_or_32i.h
+++ b/kernels/volk/volk_32i_x2_or_32i.h
@@ -336,5 +336,22 @@ static inline void volk_32i_x2_or_32i_u_avx2(int32_t* cVector,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32i_x2_or_32i_rvv(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vint32m8_t va = __riscv_vle32_v_i32m8(aVector, vl);
+ vint32m8_t vb = __riscv_vle32_v_i32m8(bVector, vl);
+ __riscv_vse32(cVector, __riscv_vor(va, vb, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_32i_x2_or_32i_u_H */
diff --git a/kernels/volk/volk_32u_byteswap.h b/kernels/volk/volk_32u_byteswap.h
index a6ec86f8..d5d0613e 100644
--- a/kernels/volk/volk_32u_byteswap.h
+++ b/kernels/volk/volk_32u_byteswap.h
@@ -343,5 +343,53 @@ static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int n
}
#endif /* LV_HAVE_SSE2 */
+#ifdef LV_HAVE_RVV
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_32u_byteswap_rvv(uint32_t* intsToSwap, unsigned int num_points)
+{
+ size_t n = num_points;
+ size_t vlmax = __riscv_vsetvlmax_e8m1();
+ if (vlmax <= 256) {
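+ // 8-bit gather indices can only address 256 positions, hence the fallback
+ // to vrgatherei16 below for larger vector registers. vid yields 0,1,2,3,...;
+ // subtracting (0x3020100 - 0x10203) within each 32-bit lane turns that into
+ // 3,2,1,0, 7,6,5,4, ... i.e. a byte-reversing gather index for every word.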
+ vuint8m1_t vidx = __riscv_vreinterpret_u8m1(
+ __riscv_vsub(__riscv_vreinterpret_u32m1(__riscv_vid_v_u8m1(vlmax)),
+ 0x3020100 - 0x10203,
+ vlmax / 4));
+ for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vuint8m8_t v =
+ __riscv_vreinterpret_u8m8(__riscv_vle32_v_u32m8(intsToSwap, vl));
+ v = RISCV_PERM8(__riscv_vrgather, v, vidx);
+ __riscv_vse32(intsToSwap, __riscv_vreinterpret_u32m8(v), vl);
+ }
+ } else {
+ vuint16m2_t vidx = __riscv_vreinterpret_u16m2(
+ __riscv_vsub(__riscv_vreinterpret_u64m2(__riscv_vid_v_u16m2(vlmax)),
+ 0x3000200010000 - 0x100020003,
+ vlmax / 4));
+ for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vuint8m8_t v =
+ __riscv_vreinterpret_u8m8(__riscv_vle32_v_u32m8(intsToSwap, vl));
+ v = RISCV_PERM8(__riscv_vrgatherei16, v, vidx);
+ __riscv_vse32(intsToSwap, __riscv_vreinterpret_u32m8(v), vl);
+ }
+ }
+}
+#endif /* LV_HAVE_RVV */
+
+#ifdef LV_HAVE_RVA23
+#include <riscv_vector.h>
+
+static inline void volk_32u_byteswap_rva23(uint32_t* intsToSwap, unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vuint32m8_t v = __riscv_vle32_v_u32m8(intsToSwap, vl);
+ __riscv_vse32(intsToSwap, __riscv_vrev8(v, vl), vl);
+ }
+}
+#endif /* LV_HAVE_RVA23 */
#endif /* INCLUDED_volk_32u_byteswap_a_H */
diff --git a/kernels/volk/volk_32u_byteswappuppet_32u.h b/kernels/volk/volk_32u_byteswappuppet_32u.h
index a6ef921f..4ad3deac 100644
--- a/kernels/volk/volk_32u_byteswappuppet_32u.h
+++ b/kernels/volk/volk_32u_byteswappuppet_32u.h
@@ -91,4 +91,26 @@ static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output,
}
#endif
+#ifdef LV_HAVE_RVV
+static inline void volk_32u_byteswappuppet_32u_rvv(uint32_t* output,
+ uint32_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_32u_byteswap_rvv((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+}
+#endif
+
+#ifdef LV_HAVE_RVA23
+static inline void volk_32u_byteswappuppet_32u_rva23(uint32_t* output,
+ uint32_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_32u_byteswap_rva23((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+}
+#endif
+
#endif
diff --git a/kernels/volk/volk_32u_popcnt.h b/kernels/volk/volk_32u_popcnt.h
index b8c371fb..3ad2f0aa 100644
--- a/kernels/volk/volk_32u_popcnt.h
+++ b/kernels/volk/volk_32u_popcnt.h
@@ -76,4 +76,22 @@ static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value)
#endif /*LV_HAVE_SSE4_2*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_32u_popcnt_rvv(uint32_t* ret, const uint32_t value)
+{
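+ // view the value as a 32-element mask register and count its set bits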
+ *ret = __riscv_vcpop(__riscv_vreinterpret_b4(__riscv_vmv_s_x_u64m1(value, 1)), 32);
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVA22V
+#include <riscv_bitmanip.h>
+
+static inline void volk_32u_popcnt_rva22(uint32_t* ret, const uint32_t value)
+{
+ *ret = __riscv_cpop_32(value);
+}
+#endif /*LV_HAVE_RVA22V*/
+
#endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/
diff --git a/kernels/volk/volk_32u_popcntpuppet_32u.h b/kernels/volk/volk_32u_popcntpuppet_32u.h
index 19a17f56..b808eb00 100644
--- a/kernels/volk/volk_32u_popcntpuppet_32u.h
+++ b/kernels/volk/volk_32u_popcntpuppet_32u.h
@@ -18,9 +18,8 @@ static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector,
const uint32_t* inVector,
unsigned int num_points)
{
- unsigned int ii;
- for (ii = 0; ii < num_points; ++ii) {
- volk_32u_popcnt_generic(outVector + ii, *(inVector + ii));
+ for (size_t i = 0; i < num_points; ++i) {
+ volk_32u_popcnt_generic(outVector + i, inVector[i]);
}
}
#endif /* LV_HAVE_GENERIC */
@@ -30,11 +29,32 @@ static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector,
const uint32_t* inVector,
unsigned int num_points)
{
- unsigned int ii;
- for (ii = 0; ii < num_points; ++ii) {
- volk_32u_popcnt_a_sse4_2(outVector + ii, *(inVector + ii));
+ for (size_t i = 0; i < num_points; ++i) {
+ volk_32u_popcnt_a_sse4_2(outVector + i, inVector[i]);
}
}
#endif /* LV_HAVE_SSE4_2 */
+#ifdef LV_HAVE_RVV
+static inline void volk_32u_popcntpuppet_32u_rvv(uint32_t* outVector,
+ const uint32_t* inVector,
+ unsigned int num_points)
+{
+ for (size_t i = 0; i < num_points; ++i) {
+ volk_32u_popcnt_rvv(outVector + i, inVector[i]);
+ }
+}
+#endif /* LV_HAVE_RVV */
+
+#ifdef LV_HAVE_RVA22V
+static inline void volk_32u_popcntpuppet_32u_rva22(uint32_t* outVector,
+ const uint32_t* inVector,
+ unsigned int num_points)
+{
+ for (size_t i = 0; i < num_points; ++i) {
+ volk_32u_popcnt_rva22(outVector + i, inVector[i]);
+ }
+}
+#endif /* LV_HAVE_RVA22V */
+
#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */
diff --git a/kernels/volk/volk_32u_reverse_32u.h b/kernels/volk/volk_32u_reverse_32u.h
index 62150ac6..ece8f48b 100644
--- a/kernels/volk/volk_32u_reverse_32u.h
+++ b/kernels/volk/volk_32u_reverse_32u.h
@@ -337,4 +337,57 @@ volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_poi
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_RVV
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void
+volk_32u_reverse_32u_rvv(uint32_t* out, const uint32_t* in, unsigned int num_points)
+{
+ size_t n = num_points;
+
+ static const uint64_t tblLo[] = {
+ 0xE060A020C0408000,
+ 0xF070B030D0509010,
+ };
+ static const uint64_t tblHi[] = {
+ 0x0E060A020C040800,
+ 0x0F070B030D050901,
+ };
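+ // 16-entry nibble LUTs: vtblLo maps a low nibble to its bit-reversal shifted
+ // into the high nibble, vtblHi maps a high nibble to its bit-reversal in the
+ // low nibble; OR-ing both bit-reverses every byte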
+ vuint8m1_t vtblLo = __riscv_vreinterpret_u8m1(__riscv_vle64_v_u64m1(tblLo, 2));
+ vuint8m1_t vtblHi = __riscv_vreinterpret_u8m1(__riscv_vle64_v_u64m1(tblHi, 2));
+
+ size_t vlmax = __riscv_vsetvlmax_e8m1();
+ vuint16m2_t vidx = __riscv_vreinterpret_u16m2(
+ __riscv_vsub(__riscv_vreinterpret_u64m2(__riscv_vid_v_u16m2(vlmax)),
+ 0x3000200010000 - 0x100020003,
+ vlmax / 4));
+ for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
+ vl = __riscv_vsetvl_e32m4(n);
+ vuint8m4_t v = __riscv_vreinterpret_u8m4(__riscv_vle32_v_u32m4(in, vl));
+ v = RISCV_PERM4(__riscv_vrgatherei16, v, vidx);
+ vuint8m4_t lo = __riscv_vand(v, 0xF, vl * 4);
+ lo = RISCV_LUT4(__riscv_vrgather, vtblLo, lo);
+ vuint8m4_t hi = __riscv_vsrl(v, 4, vl * 4);
+ hi = RISCV_LUT4(__riscv_vrgather, vtblHi, hi);
+ v = __riscv_vor(hi, lo, vl * 4);
+ __riscv_vse32(out, __riscv_vreinterpret_u32m4(v), vl);
+ }
+}
+#endif /* LV_HAVE_RVV */
+
+#ifdef LV_HAVE_RVA23
+#include <riscv_vector.h>
+
+static inline void
+volk_32u_reverse_32u_rva23(uint32_t* out, const uint32_t* in, unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
+ vl = __riscv_vsetvl_e32m8(n);
+ vuint32m8_t v = __riscv_vle32_v_u32m8(in, vl);
+ __riscv_vse32(out, __riscv_vbrev(v, vl), vl);
+ }
+}
+#endif /* LV_HAVE_RVA23 */
+
#endif /* INCLUDED_volk_32u_reverse_32u_u_H */
diff --git a/kernels/volk/volk_64f_convert_32f.h b/kernels/volk/volk_64f_convert_32f.h
index b5f9b507..67f6ae48 100644
--- a/kernels/volk/volk_64f_convert_32f.h
+++ b/kernels/volk/volk_64f_convert_32f.h
@@ -315,5 +315,20 @@ static inline void volk_64f_convert_32f_a_sse2(float* outputVector,
}
#endif /* LV_HAVE_SSE2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_64f_convert_32f_rvv(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
+ vl = __riscv_vsetvl_e64m8(n);
+ vfloat64m8_t v = __riscv_vle64_v_f64m8(inputVector, vl);
+ __riscv_vse32(outputVector, __riscv_vfncvt_f(v, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_64f_convert_32f_a_H */
diff --git a/kernels/volk/volk_64f_x2_add_64f.h b/kernels/volk/volk_64f_x2_add_64f.h
index 867a5d3b..bf9024e8 100644
--- a/kernels/volk/volk_64f_x2_add_64f.h
+++ b/kernels/volk/volk_64f_x2_add_64f.h
@@ -244,4 +244,22 @@ static inline void volk_64f_x2_add_64f_a_avx(double* cVector,
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_64f_x2_add_64f_rvv(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e64m8(n);
+ vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl);
+ vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
+ __riscv_vse64(cVector, __riscv_vfadd(va, vb, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_64f_x2_add_64f_u_H */
diff --git a/kernels/volk/volk_64f_x2_max_64f.h b/kernels/volk/volk_64f_x2_max_64f.h
index 973605c7..e9ca3ef6 100644
--- a/kernels/volk/volk_64f_x2_max_64f.h
+++ b/kernels/volk/volk_64f_x2_max_64f.h
@@ -290,5 +290,22 @@ static inline void volk_64f_x2_max_64f_u_avx(double* cVector,
}
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_64f_x2_max_64f_rvv(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e64m8(n);
+ vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl);
+ vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
+ __riscv_vse64(cVector, __riscv_vfmax(va, vb, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_64f_x2_max_64f_u_H */
diff --git a/kernels/volk/volk_64f_x2_min_64f.h b/kernels/volk/volk_64f_x2_min_64f.h
index 970b843f..7652ef72 100644
--- a/kernels/volk/volk_64f_x2_min_64f.h
+++ b/kernels/volk/volk_64f_x2_min_64f.h
@@ -290,5 +290,22 @@ static inline void volk_64f_x2_min_64f_u_avx(double* cVector,
}
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_64f_x2_min_64f_rvv(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e64m8(n);
+ vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl);
+ vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
+ __riscv_vse64(cVector, __riscv_vfmin(va, vb, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_64f_x2_min_64f_u_H */
diff --git a/kernels/volk/volk_64f_x2_multiply_64f.h b/kernels/volk/volk_64f_x2_multiply_64f.h
index caab3aaa..57eb468a 100644
--- a/kernels/volk/volk_64f_x2_multiply_64f.h
+++ b/kernels/volk/volk_64f_x2_multiply_64f.h
@@ -244,4 +244,22 @@ static inline void volk_64f_x2_multiply_64f_a_avx(double* cVector,
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_64f_x2_multiply_64f_rvv(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e64m8(n);
+ vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl);
+ vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
+ __riscv_vse64(cVector, __riscv_vfmul(va, vb, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_64f_x2_multiply_64f_u_H */
diff --git a/kernels/volk/volk_64u_byteswap.h b/kernels/volk/volk_64u_byteswap.h
index 2fbf3cce..a8da031c 100644
--- a/kernels/volk/volk_64u_byteswap.h
+++ b/kernels/volk/volk_64u_byteswap.h
@@ -383,4 +383,53 @@ static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap,
#endif /* LV_HAVE_SSSE3 */
+#ifdef LV_HAVE_RVV
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void volk_64u_byteswap_rvv(uint64_t* intsToSwap, unsigned int num_points)
+{
+ size_t n = num_points;
+ size_t vlmax = __riscv_vsetvlmax_e8m1();
+ if (vlmax <= 256) {
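+ // same scheme as the 32-bit byteswap, but reversing 8 bytes per element; the
+ // 16-bit-index fallback computes the reversal offsets arithmetically,
+ // idx = (i - i%8) + (7 - i%8), instead of using a per-lane subtract constant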
+ vuint8m1_t vidx = __riscv_vreinterpret_u8m1(
+ __riscv_vsub(__riscv_vreinterpret_u64m1(__riscv_vid_v_u8m1(vlmax)),
+ 0x0706050403020100 - 0x1020304050607,
+ vlmax / 8));
+ for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
+ vl = __riscv_vsetvl_e64m8(n);
+ vuint8m8_t v =
+ __riscv_vreinterpret_u8m8(__riscv_vle64_v_u64m8(intsToSwap, vl));
+ v = RISCV_PERM8(__riscv_vrgather, v, vidx);
+ __riscv_vse64(intsToSwap, __riscv_vreinterpret_u64m8(v), vl);
+ }
+ } else {
+ vuint16m2_t vid = __riscv_vid_v_u16m2(vlmax);
+ vuint16m2_t voff1 = __riscv_vand(vid, 0x7, vlmax);
+ vuint16m2_t voff2 = __riscv_vrsub(voff1, 0x7, vlmax);
+ vuint16m2_t vidx = __riscv_vadd(__riscv_vsub(vid, voff1, vlmax), voff2, vlmax);
+ for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
+ vl = __riscv_vsetvl_e64m8(n);
+ vuint8m8_t v =
+ __riscv_vreinterpret_u8m8(__riscv_vle64_v_u64m8(intsToSwap, vl));
+ v = RISCV_PERM8(__riscv_vrgatherei16, v, vidx);
+ __riscv_vse64(intsToSwap, __riscv_vreinterpret_u64m8(v), vl);
+ }
+ }
+}
+#endif /* LV_HAVE_RVV */
+
+#ifdef LV_HAVE_RVA23
+#include <riscv_vector.h>
+
+static inline void volk_64u_byteswap_rva23(uint64_t* intsToSwap, unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
+ vl = __riscv_vsetvl_e64m8(n);
+ vuint64m8_t v = __riscv_vle64_v_u64m8(intsToSwap, vl);
+ __riscv_vse64(intsToSwap, __riscv_vrev8(v, vl), vl);
+ }
+}
+#endif /* LV_HAVE_RVA23 */
+
#endif /* INCLUDED_volk_64u_byteswap_a_H */
diff --git a/kernels/volk/volk_64u_byteswappuppet_64u.h b/kernels/volk/volk_64u_byteswappuppet_64u.h
index c2b55bf4..2be3b0b7 100644
--- a/kernels/volk/volk_64u_byteswappuppet_64u.h
+++ b/kernels/volk/volk_64u_byteswappuppet_64u.h
@@ -92,4 +92,26 @@ static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output,
}
#endif
+#ifdef LV_HAVE_RVV
+static inline void volk_64u_byteswappuppet_64u_rvv(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_64u_byteswap_rvv((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+}
+#endif
+
+#ifdef LV_HAVE_RVA23
+static inline void volk_64u_byteswappuppet_64u_rva23(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_64u_byteswap_rva23((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+}
+#endif
+
#endif
diff --git a/kernels/volk/volk_64u_popcnt.h b/kernels/volk/volk_64u_popcnt.h
index 5c9b2a3a..fb12bbe1 100644
--- a/kernels/volk/volk_64u_popcnt.h
+++ b/kernels/volk/volk_64u_popcnt.h
@@ -116,5 +116,22 @@ static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value)
}
#endif /*LV_HAVE_NEON*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_64u_popcnt_rvv(uint64_t* ret, const uint64_t value)
+{
+ *ret = __riscv_vcpop(__riscv_vreinterpret_b2(__riscv_vmv_s_x_u64m1(value, 1)), 64);
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVA22V
+#include <riscv_bitmanip.h>
+
+static inline void volk_64u_popcnt_rva22(uint64_t* ret, const uint64_t value)
+{
+ *ret = __riscv_cpop_64(value);
+}
+#endif /*LV_HAVE_RVA22V*/
#endif /*INCLUDED_volk_64u_popcnt_a_H*/
diff --git a/kernels/volk/volk_64u_popcntpuppet_64u.h b/kernels/volk/volk_64u_popcntpuppet_64u.h
index 300d4fd1..245aeba1 100644
--- a/kernels/volk/volk_64u_popcntpuppet_64u.h
+++ b/kernels/volk/volk_64u_popcntpuppet_64u.h
@@ -19,11 +19,9 @@ static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector,
const uint64_t* inVector,
unsigned int num_points)
{
- unsigned int ii;
- for (ii = 0; ii < num_points; ++ii) {
- volk_64u_popcnt_generic(outVector + ii, num_points);
+ for (size_t i = 0; i < num_points; ++i) {
+ volk_64u_popcnt_generic(outVector + i, inVector[i]);
}
- memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t));
}
#endif /* LV_HAVE_GENERIC */
@@ -32,11 +30,9 @@ static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector,
const uint64_t* inVector,
unsigned int num_points)
{
- unsigned int ii;
- for (ii = 0; ii < num_points; ++ii) {
- volk_64u_popcnt_a_sse4_2(outVector + ii, num_points);
+ for (size_t i = 0; i < num_points; ++i) {
+ volk_64u_popcnt_a_sse4_2(outVector + i, inVector[i]);
}
- memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t));
}
#endif /* LV_HAVE_SSE4_2 */
@@ -45,12 +41,32 @@ static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector,
const uint64_t* inVector,
unsigned int num_points)
{
- unsigned int ii;
- for (ii = 0; ii < num_points; ++ii) {
- volk_64u_popcnt_neon(outVector + ii, num_points);
+ for (size_t i = 0; i < num_points; ++i) {
+ volk_64u_popcnt_neon(outVector + i, inVector[i]);
}
- memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t));
}
#endif /* LV_HAVE_NEON */
+#ifdef LV_HAVE_RVV
+static inline void volk_64u_popcntpuppet_64u_rvv(uint64_t* outVector,
+ const uint64_t* inVector,
+ unsigned int num_points)
+{
+ for (size_t i = 0; i < num_points; ++i) {
+ volk_64u_popcnt_rvv(outVector + i, inVector[i]);
+ }
+}
+#endif /* LV_HAVE_RVV */
+
+#ifdef LV_HAVE_RVA22V
+static inline void volk_64u_popcntpuppet_64u_rva22(uint64_t* outVector,
+ const uint64_t* inVector,
+ unsigned int num_points)
+{
+ for (size_t i = 0; i < num_points; ++i) {
+ volk_64u_popcnt_rva22(outVector + i, inVector[i]);
+ }
+}
+#endif /* LV_HAVE_RVA22V */
+
#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */
diff --git a/kernels/volk/volk_8i_convert_16i.h b/kernels/volk/volk_8i_convert_16i.h
index 36e929bb..0800f7c5 100644
--- a/kernels/volk/volk_8i_convert_16i.h
+++ b/kernels/volk/volk_8i_convert_16i.h
@@ -266,5 +266,20 @@ static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector,
}
#endif /* LV_HAVE_ORC */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_8i_convert_16i_rvv(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
+ vl = __riscv_vsetvl_e8m4(n);
+ vint16m8_t v = __riscv_vsext_vf2(__riscv_vle8_v_i8m4(inputVector, vl), vl);
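+ // sign-extend to 16 bits, then shift left by 8 to scale each sample by 256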
+ __riscv_vse16(outputVector, __riscv_vsll(v, 8, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */
diff --git a/kernels/volk/volk_8i_s32f_convert_32f.h b/kernels/volk/volk_8i_s32f_convert_32f.h
index d904d25d..cd2c325e 100644
--- a/kernels/volk/volk_8i_s32f_convert_32f.h
+++ b/kernels/volk/volk_8i_s32f_convert_32f.h
@@ -350,5 +350,22 @@ static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
}
#endif /* LV_HAVE_ORC */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_8i_s32f_convert_32f_rvv(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
+ vl = __riscv_vsetvl_e8m2(n);
+ vint16m4_t v = __riscv_vsext_vf2(__riscv_vle8_v_i8m2(inputVector, vl), vl);
+ __riscv_vse32(
+ outputVector, __riscv_vfmul(__riscv_vfwcvt_f(v, vl), 1.0f / scalar, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */
diff --git a/kernels/volk/volk_8ic_deinterleave_16i_x2.h b/kernels/volk/volk_8ic_deinterleave_16i_x2.h
index 46b2e2e4..87d745b8 100644
--- a/kernels/volk/volk_8ic_deinterleave_16i_x2.h
+++ b/kernels/volk/volk_8ic_deinterleave_16i_x2.h
@@ -392,4 +392,26 @@ static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
}
}
#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_8ic_deinterleave_16i_x2_rvv(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ const uint16_t* in = (const uint16_t*)complexVector;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl, qBuffer += vl) {
+ vl = __riscv_vsetvl_e16m8(n);
+ vuint16m8_t vc = __riscv_vle16_v_u16m8(in, vl);
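+ // shifting left by 8 moves the real byte into the high byte (real * 256);
+ // masking with 0xFF00 keeps the imag byte in place (imag * 256)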
+ vuint16m8_t vr = __riscv_vsll(vc, 8, vl);
+ vuint16m8_t vi = __riscv_vand(vc, 0xFF00, vl);
+ __riscv_vse16((uint16_t*)iBuffer, vr, vl);
+ __riscv_vse16((uint16_t*)qBuffer, vi, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */
diff --git a/kernels/volk/volk_8ic_deinterleave_real_16i.h b/kernels/volk/volk_8ic_deinterleave_real_16i.h
index bef47592..8814e5e1 100644
--- a/kernels/volk/volk_8ic_deinterleave_real_16i.h
+++ b/kernels/volk/volk_8ic_deinterleave_real_16i.h
@@ -300,4 +300,22 @@ static inline void volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
}
}
#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_8ic_deinterleave_real_16i_rvv(int16_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ const int16_t* in = (const int16_t*)complexVector;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
+ vl = __riscv_vsetvl_e16m8(n);
+ vint16m8_t v = __riscv_vle16_v_i16m8(in, vl);
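+ // shift left 8 then arithmetic shift right 1: drops the imag byte and scales
+ // the real part by 128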
+ __riscv_vse16(iBuffer, __riscv_vsra(__riscv_vsll(v, 8, vl), 1, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_u_H */
diff --git a/kernels/volk/volk_8ic_deinterleave_real_8i.h b/kernels/volk/volk_8ic_deinterleave_real_8i.h
index 116b1afb..2c409c69 100644
--- a/kernels/volk/volk_8ic_deinterleave_real_8i.h
+++ b/kernels/volk/volk_8ic_deinterleave_real_8i.h
@@ -402,4 +402,21 @@ static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_8ic_deinterleave_real_8i_rvv(int8_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ const uint16_t* in = (const uint16_t*)complexVector;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
+ vl = __riscv_vsetvl_e16m8(n);
+ vuint16m8_t vc = __riscv_vle16_v_u16m8(in, vl);
+ __riscv_vse8((uint8_t*)iBuffer, __riscv_vnsrl(vc, 0, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H */
diff --git a/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h b/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h
index 8936a169..e0234b16 100644
--- a/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h
+++ b/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h
@@ -441,4 +441,28 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_8ic_s32f_deinterleave_32f_x2_rvv(float* iBuffer,
+ float* qBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ const uint16_t* in = (const uint16_t*)complexVector;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl, qBuffer += vl) {
+ vl = __riscv_vsetvl_e16m4(n);
+ vuint16m4_t vc = __riscv_vle16_v_u16m4(in, vl);
+ vint8m2_t vr = __riscv_vreinterpret_i8m2(__riscv_vnsrl(vc, 0, vl));
+ vint8m2_t vi = __riscv_vreinterpret_i8m2(__riscv_vnsrl(vc, 8, vl));
+ vfloat32m8_t vrf = __riscv_vfwcvt_f(__riscv_vsext_vf2(vr, vl), vl);
+ vfloat32m8_t vif = __riscv_vfwcvt_f(__riscv_vsext_vf2(vi, vl), vl);
+ __riscv_vse32(iBuffer, __riscv_vfmul(vrf, 1.0f / scalar, vl), vl);
+ __riscv_vse32(qBuffer, __riscv_vfmul(vif, 1.0f / scalar, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H */
diff --git a/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h b/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h
index 37cb2555..7ec8958d 100644
--- a/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h
+++ b/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h
@@ -349,5 +349,24 @@ volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_8ic_s32f_deinterleave_real_32f_rvv(float* iBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ const uint16_t* in = (const uint16_t*)complexVector;
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
+ vl = __riscv_vsetvl_e16m4(n);
+ vuint16m4_t vc = __riscv_vle16_v_u16m4(in, vl);
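+        // Keep only the real (low) byte of each sample, sign-extend it,
+        // convert to float and apply the 1/scalar scaling.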
+ vint8m2_t vr = __riscv_vreinterpret_i8m2(__riscv_vnsrl(vc, 0, vl));
+ vfloat32m8_t vrf = __riscv_vfwcvt_f(__riscv_vsext_vf2(vr, vl), vl);
+ __riscv_vse32(iBuffer, __riscv_vfmul(vrf, 1.0f / scalar, vl), vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H */
diff --git a/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h b/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h
index 5462ea67..5de0e312 100644
--- a/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h
+++ b/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h
@@ -274,4 +274,55 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_8ic_x2_multiply_conjugate_16ic_rvv(lv_16sc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e8m2(n);
+ vint16m4_t va = __riscv_vle16_v_i16m4((const int16_t*)aVector, vl);
+ vint16m4_t vb = __riscv_vle16_v_i16m4((const int16_t*)bVector, vl);
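+        // a * conj(b): re = ar*br + ai*bi, im = ai*br - ar*bi, computed on
+        // widened 16-bit products after splitting each input lane into its
+        // real (low byte) and imaginary (high byte) parts.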
+ vint8m2_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 8, vl);
+ vint8m2_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 8, vl);
+ vint16m4_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
+ vint16m4_t vi =
+ __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
+ vuint16m4_t vru = __riscv_vreinterpret_u16m4(vr);
+ vuint16m4_t viu = __riscv_vreinterpret_u16m4(vi);
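+        // Re-interleave: vwaddu + vwmaccu with 0xFFFF computes re + im * 0x10000,
+        // putting re in the low half and im in the high half of each 32-bit lane,
+        // which matches the lv_16sc_t memory layout.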
+ vuint32m8_t v = __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFF, viu, vl);
+ __riscv_vse32((uint32_t*)cVector, v, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_8ic_x2_multiply_conjugate_16ic_rvvseg(lv_16sc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e8m2(n);
+ vint8m2x2_t va = __riscv_vlseg2e8_v_i8m2x2((const int8_t*)aVector, vl);
+ vint8m2x2_t vb = __riscv_vlseg2e8_v_i8m2x2((const int8_t*)bVector, vl);
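+        // The segmented loads deinterleave the 8-bit real/imaginary parts
+        // directly, so no narrowing shifts are needed here.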
+ vint8m2_t var = __riscv_vget_i8m2(va, 0), vai = __riscv_vget_i8m2(va, 1);
+ vint8m2_t vbr = __riscv_vget_i8m2(vb, 0), vbi = __riscv_vget_i8m2(vb, 1);
+ vint16m4_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
+ vint16m4_t vi =
+ __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
+ __riscv_vsseg2e16_v_i16m4x2(
+ (int16_t*)cVector, __riscv_vcreate_v_i16m4x2(vr, vi), vl);
+ }
+}
+
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H */
diff --git a/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h b/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h
index 318a7819..5316ada0 100644
--- a/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h
+++ b/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h
@@ -341,4 +341,63 @@ volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector,
#endif /* LV_HAVE_AVX2*/
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_rvv(lv_32fc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e8m1(n);
+ vint16m2_t va = __riscv_vle16_v_i16m2((const int16_t*)aVector, vl);
+ vint16m2_t vb = __riscv_vle16_v_i16m2((const int16_t*)bVector, vl);
+ vint8m1_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 8, vl);
+ vint8m1_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 8, vl);
+ vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
+ vint16m2_t vi =
+ __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
+ vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl);
+ vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl);
+ vuint32m4_t vru = __riscv_vreinterpret_u32m4(vrf);
+ vuint32m4_t viu = __riscv_vreinterpret_u32m4(vif);
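+        // Pack re/im into 64-bit lanes (re + im * 2^32 on the raw float bits)
+        // so a single 64-bit store writes each lv_32fc_t in interleaved order.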
+ vuint64m8_t v =
+ __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
+ __riscv_vse64((uint64_t*)cVector, v, vl);
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void
+volk_8ic_x2_s32f_multiply_conjugate_32fc_rvvseg(lv_32fc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ size_t n = num_points;
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
+ vl = __riscv_vsetvl_e8m1(n);
+ vint8m1x2_t va = __riscv_vlseg2e8_v_i8m1x2((const int8_t*)aVector, vl);
+ vint8m1x2_t vb = __riscv_vlseg2e8_v_i8m1x2((const int8_t*)bVector, vl);
+ vint8m1_t var = __riscv_vget_i8m1(va, 0), vai = __riscv_vget_i8m1(va, 1);
+ vint8m1_t vbr = __riscv_vget_i8m1(vb, 0), vbi = __riscv_vget_i8m1(vb, 1);
+ vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
+ vint16m2_t vi =
+ __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
+ vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl);
+ vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl);
+ __riscv_vsseg2e32_v_f32m4x2(
+ (float*)cVector, __riscv_vcreate_v_f32m4x2(vrf, vif), vl);
+ }
+}
+
+#endif /*LV_HAVE_RVVSEG*/
+
#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */
diff --git a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
index 51963efd..5314622b 100644
--- a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
+++ b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
@@ -20,11 +20,14 @@ typedef union {
unsigned int* w;
} p_decision_t;
-static inline int parity(int x, unsigned char* Partab)
+static inline int parity(int x)
{
- x ^= (x >> 16);
- x ^= (x >> 8);
- return Partab[x];
+ x ^= x >> 16;
+ x ^= x >> 8;
+ x ^= x >> 4;
+ x ^= x >> 2;
+ x ^= x >> 1;
+ return x & 1;
}
static inline int chainback_viterbi(unsigned char* data,
@@ -113,7 +116,6 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* dec,
static unsigned char* X;
static unsigned int excess = 6;
static unsigned char* Branchtab;
- static unsigned char Partab[256];
int d_polys[2] = { 79, 109 };
@@ -127,24 +129,12 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* dec,
D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
volk_get_alignment());
int state, i;
- int cnt, ti;
-
- /* Initialize parity lookup table */
- for (i = 0; i < 256; i++) {
- cnt = 0;
- ti = i;
- while (ti) {
- if (ti & 1)
- cnt++;
- ti >>= 1;
- }
- Partab[i] = cnt & 1;
- }
+
/* Initialize the branch table */
for (state = 0; state < d_numstates / 2; state++) {
for (i = 0; i < rate; i++) {
Branchtab[i * d_numstates / 2 + state] =
- parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
+ parity((2 * state) & d_polys[i]) ? 255 : 0;
}
}
@@ -195,7 +185,6 @@ static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* dec,
static unsigned char* X;
static unsigned int excess = 6;
static unsigned char* Branchtab;
- static unsigned char Partab[256];
int d_polys[2] = { 79, 109 };
@@ -209,24 +198,12 @@ static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* dec,
D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
volk_get_alignment());
int state, i;
- int cnt, ti;
-
- /* Initialize parity lookup table */
- for (i = 0; i < 256; i++) {
- cnt = 0;
- ti = i;
- while (ti) {
- if (ti & 1)
- cnt++;
- ti >>= 1;
- }
- Partab[i] = cnt & 1;
- }
+
/* Initialize the branch table */
for (state = 0; state < d_numstates / 2; state++) {
for (i = 0; i < rate; i++) {
Branchtab[i * d_numstates / 2 + state] =
- parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
+ parity((2 * state) & d_polys[i]) ? 255 : 0;
}
}
@@ -280,7 +257,6 @@ static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* dec,
static unsigned char* X;
static unsigned int excess = 6;
static unsigned char* Branchtab;
- static unsigned char Partab[256];
int d_polys[2] = { 79, 109 };
@@ -294,24 +270,12 @@ static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* dec,
D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
volk_get_alignment());
int state, i;
- int cnt, ti;
-
- /* Initialize parity lookup table */
- for (i = 0; i < 256; i++) {
- cnt = 0;
- ti = i;
- while (ti) {
- if (ti & 1)
- cnt++;
- ti >>= 1;
- }
- Partab[i] = cnt & 1;
- }
+
/* Initialize the branch table */
for (state = 0; state < d_numstates / 2; state++) {
for (i = 0; i < rate; i++) {
Branchtab[i * d_numstates / 2 + state] =
- parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
+ parity((2 * state) & d_polys[i]) ? 255 : 0;
}
}
@@ -363,7 +327,6 @@ static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* dec,
static unsigned char* D;
static unsigned int excess = 6;
static unsigned char* Branchtab;
- static unsigned char Partab[256];
int d_polys[2] = { 79, 109 };
@@ -378,24 +341,12 @@ static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* dec,
volk_get_alignment());
int state, i;
- int cnt, ti;
-
- /* Initialize parity lookup table */
- for (i = 0; i < 256; i++) {
- cnt = 0;
- ti = i;
- while (ti) {
- if (ti & 1)
- cnt++;
- ti >>= 1;
- }
- Partab[i] = cnt & 1;
- }
+
/* Initialize the branch table */
for (state = 0; state < d_numstates / 2; state++) {
for (i = 0; i < rate; i++) {
Branchtab[i * d_numstates / 2 + state] =
- parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
+ parity((2 * state) & d_polys[i]) ? 255 : 0;
}
}
@@ -427,4 +378,59 @@ static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* dec,
#endif /* LV_HAVE_GENERIC */
+#if LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_8u_conv_k7_r2puppet_8u_rvv(unsigned char* dec,
+ unsigned char* syms,
+ unsigned int framebits)
+{
+ if (framebits < 12)
+ return;
+
+ int d_numstates = (1 << 6);
+ static unsigned char* D;
+ static unsigned char* Y;
+ static unsigned char* X;
+ static unsigned int excess = 6;
+ static unsigned char* Branchtab;
+
+ static int once = 1;
+ if (once) {
+ once = 0;
+
+ X = (unsigned char*)volk_malloc(3 * d_numstates, volk_get_alignment());
+ Y = X + d_numstates;
+ Branchtab = Y + d_numstates;
+ D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
+ volk_get_alignment());
+
+ /* Initialize the branch table */
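+        // Equivalent to the other variants' parity((2 * state) & d_polys[i]) ? 255 : 0:
+        // the low bit of 2 * state is always 0, so the masks 79 and 109 reduce to
+        // 78 and 108, and parity(2 * x) == parity(x) gives the halved masks 39 and 54.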
+ for (size_t state = 0; state < d_numstates / 2; state++) {
+ Branchtab[state] = parity(state & 39) * 255;
+ Branchtab[state + d_numstates / 2] = parity(state & 54) * 255;
+ }
+ }
+
+ memset(X, 31, d_numstates); // unbias the old_metrics
+ memset(D, 0, (d_numstates / 8) * (framebits + 6)); // initialize decisions
+
+ volk_8u_x4_conv_k7_r2_8u_rvv(
+ Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
+
+ unsigned int min = X[0];
+ int i = 0, state = 0;
+ for (i = 0; i < d_numstates; ++i) {
+ if (X[i] < min) {
+ min = X[i];
+ state = i;
+ }
+ }
+
+ chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
+
+ return;
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /*INCLUDED_volk_8u_conv_k7_r2puppet_8u_H*/
diff --git a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h
index 1464218a..5d03f03d 100644
--- a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h
+++ b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h
@@ -1153,5 +1153,84 @@ static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_8u_x2_encodeframepolar_8u_rvv(unsigned char* frame,
+ unsigned char* temp,
+ unsigned int frame_size)
+{
+ unsigned int stage = log2_of_power_of_2(frame_size);
+ unsigned int frame_half = frame_size >> 1;
+ unsigned int num_branches = 1;
+
+ while (stage) {
+ // encode stage
+ if (frame_half < 8) {
+ encodepolar_single_stage(frame, temp, num_branches, frame_half);
+ } else {
+ unsigned char *in = temp, *out = frame;
+ for (size_t branch = 0; branch < num_branches; ++branch) {
+ size_t n = frame_half;
+ for (size_t vl; n > 0; n -= vl, in += vl * 2, out += vl) {
+ vl = __riscv_vsetvl_e8m1(n);
+ vuint16m2_t vc = __riscv_vle16_v_u16m2((uint16_t*)in, vl);
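+                    // Each u16 load packs two adjacent u8 inputs; vnsrl splits
+                    // them into the even (v1) and odd (v2) samples, giving the
+                    // stage outputs v1 ^ v2 and v2.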
+ vuint8m1_t v1 = __riscv_vnsrl(vc, 0, vl);
+ vuint8m1_t v2 = __riscv_vnsrl(vc, 8, vl);
+ __riscv_vse8(out, __riscv_vxor(v1, v2, vl), vl);
+ __riscv_vse8(out + frame_half, v2, vl);
+ }
+ out += frame_half;
+ }
+ }
+ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+ // update all the parameters.
+ num_branches = num_branches << 1;
+ frame_half = frame_half >> 1;
+ --stage;
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+
+static inline void volk_8u_x2_encodeframepolar_8u_rvvseg(unsigned char* frame,
+ unsigned char* temp,
+ unsigned int frame_size)
+{
+ unsigned int stage = log2_of_power_of_2(frame_size);
+ unsigned int frame_half = frame_size >> 1;
+ unsigned int num_branches = 1;
+
+ while (stage) {
+ // encode stage
+ if (frame_half < 8) {
+ encodepolar_single_stage(frame, temp, num_branches, frame_half);
+ } else {
+ unsigned char *in = temp, *out = frame;
+ for (size_t branch = 0; branch < num_branches; ++branch) {
+ size_t n = frame_half;
+ for (size_t vl; n > 0; n -= vl, in += vl * 2, out += vl) {
+ vl = __riscv_vsetvl_e8m1(n);
+ vuint8m1x2_t vc = __riscv_vlseg2e8_v_u8m1x2(in, vl);
+ vuint8m1_t v1 = __riscv_vget_u8m1(vc, 0);
+ vuint8m1_t v2 = __riscv_vget_u8m1(vc, 1);
+ __riscv_vse8(out, __riscv_vxor(v1, v2, vl), vl);
+ __riscv_vse8(out + frame_half, v2, vl);
+ }
+ out += frame_half;
+ }
+ }
+ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+ // update all the parameters.
+ num_branches = num_branches << 1;
+ frame_half = frame_half >> 1;
+ --stage;
+ }
+}
+#endif /*LV_HAVE_RVVSEG*/
#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */
diff --git a/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h b/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h
index 4c45f757..e54befa4 100644
--- a/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h
+++ b/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h
@@ -169,4 +169,33 @@ volk_8u_x3_encodepolar_8u_x2_a_avx2(unsigned char* frame,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+static inline void volk_8u_x3_encodepolar_8u_x2_rvv(unsigned char* frame,
+ unsigned char* temp,
+ const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ interleave_frozen_and_info_bits(
+ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_8u_x2_encodeframepolar_8u_rvv(frame, temp, frame_size);
+}
+#endif /* LV_HAVE_RVV */
+
+#ifdef LV_HAVE_RVVSEG
+static inline void
+volk_8u_x3_encodepolar_8u_x2_rvvseg(unsigned char* frame,
+ unsigned char* temp,
+ const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ interleave_frozen_and_info_bits(
+ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_8u_x2_encodeframepolar_8u_rvvseg(frame, temp, frame_size);
+}
+#endif /* LV_HAVE_RVVSEG */
+
#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLAR_8U_X2_A_H_ */
diff --git a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h
index 496ca2e5..792168e0 100644
--- a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h
+++ b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h
@@ -156,5 +156,47 @@ volk_8u_x3_encodepolarpuppet_8u_a_avx2(unsigned char* frame,
}
#endif /* LV_HAVE_AVX2 */
+#ifdef LV_HAVE_RVV
+static inline void volk_8u_x3_encodepolarpuppet_8u_rvv(unsigned char* frame,
+ unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ if (frame_size < 1) {
+ return;
+ }
+
+ frame_size = next_lower_power_of_two(frame_size);
+ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+ volk_get_alignment());
+ adjust_frozen_mask(frozen_bit_mask, frame_size);
+ volk_8u_x3_encodepolar_8u_x2_rvv(
+ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_free(temp);
+}
+#endif /* LV_HAVE_RVV */
+
+#ifdef LV_HAVE_RVVSEG
+static inline void
+volk_8u_x3_encodepolarpuppet_8u_rvvseg(unsigned char* frame,
+ unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ if (frame_size < 1) {
+ return;
+ }
+
+ frame_size = next_lower_power_of_two(frame_size);
+ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+ volk_get_alignment());
+ adjust_frozen_mask(frozen_bit_mask, frame_size);
+ volk_8u_x3_encodepolar_8u_x2_rvvseg(
+ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_free(temp);
+}
+#endif /* LV_HAVE_RVVSEG */
#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLARPUPPET_8U_A_H_ */
diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
index 9750b665..cb2db11a 100644
--- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
+++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
@@ -63,11 +63,14 @@ static inline void renormalize(unsigned char* X)
int i;
unsigned char min = X[0];
- for (i = 0; i < NUMSTATES; i++)
- if (min > X[i])
+ for (i = 0; i < NUMSTATES; i++) {
+ if (min > X[i]) {
min = X[i];
- for (i = 0; i < NUMSTATES; i++)
+ }
+ }
+ for (i = 0; i < NUMSTATES; i++) {
X[i] -= min;
+ }
}
@@ -91,8 +94,9 @@ static inline void BFLY(int i,
int PRECISIONSHIFT = 2;
metricsum = 1;
- for (j = 0; j < RATE; j++)
+ for (j = 0; j < RATE; j++) {
metricsum += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]);
+ }
metric = (metricsum >> METRICSHIFT) >> PRECISIONSHIFT;
unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);
@@ -465,4 +469,210 @@ static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
#endif /* LV_HAVE_GENERIC */
+#if LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void volk_8u_x4_conv_k7_r2_8u_rvv(unsigned char* Y,
+ unsigned char* X,
+ unsigned char* syms,
+ unsigned char* dec,
+ unsigned int framebits,
+ unsigned int excess,
+ unsigned char* Branchtab)
+{
+ size_t vl = 256 / 8;
+
+ size_t n = framebits + excess;
+
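+    // The 64 path metrics are held as two 32-byte halves kept entirely in
+    // registers; the LMUL below is chosen per VLEN (m2 for VLEN=128, m1 for
+    // VLEN=256, mf2 for VLEN>=512) so the fixed vl = 32 fits one register group.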
+ if (__riscv_vlenb() == 128 / 8) {
+ vuint8m2_t vX0 = __riscv_vle8_v_u8m2(X, vl),
+ vX1 = __riscv_vle8_v_u8m2(X + vl, vl);
+ vuint8m2_t vY0 = __riscv_vle8_v_u8m2(Y, vl),
+ vY1 = __riscv_vle8_v_u8m2(Y + vl, vl);
+ vuint8m2_t vB0 = __riscv_vle8_v_u8m2(Branchtab, vl);
+ vuint8m2_t vB1 = __riscv_vle8_v_u8m2(Branchtab + vl, vl);
+ vuint8m2_t v63 = __riscv_vmv_v_x_u8m2(63, vl);
+
+ for (size_t i = 0; i < n; ++i) {
+ // Butterfly
+ vuint8m2_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl);
+ vuint8m2_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl);
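+            // Branch metric: average the two XORed symbol distances, then shift
+            // each byte right by 2 more (done as a 16-bit shift; the & 63 mask
+            // drops the bits that crossed in from the neighbouring byte).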
+ vuint8m2_t va = __riscv_vaaddu(va0, va1, 0, vl);
+ va = __riscv_vreinterpret_u8m2(
+ __riscv_vsrl(__riscv_vreinterpret_u16m2(va), 2, vl / 2));
+ va = __riscv_vand(va, v63, vl);
+ vuint8m2_t vb = __riscv_vssubu(v63, va, vl);
+ vuint8m2_t vX0a = __riscv_vsaddu(vX0, va, vl);
+ vuint8m2_t vX1b = __riscv_vsaddu(vX1, vb, vl);
+ vuint8m2_t vX0b = __riscv_vsaddu(vX0, vb, vl);
+ vuint8m2_t vX1a = __riscv_vsaddu(vX1, va, vl);
+ vY0 = __riscv_vminu(vX1b, vX0a, vl);
+ vY1 = __riscv_vminu(vX1a, vX0b, vl);
+
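+            // Interleave the upper-path candidates and the chosen minima so the
+            // two vmseq masks below emit the survivor decisions in the
+            // interleaved bit order used by the chainback.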
+ vuint16m4_t vX1ba =
+ __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl);
+ vX1b = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vX1ba), 0);
+ vX1a = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vX1ba), 1);
+
+ vuint16m4_t vm =
+ __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl);
+ vY0 = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vm), 0);
+ vY1 = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vm), 1);
+
+ __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY0, vX1b, vl), vl);
+ __riscv_vsm(&dec[8 * i + 4], __riscv_vmseq(vY1, vX1a, vl), vl);
+
+ // Renormalize
+ vuint8m2_t vmin = __riscv_vminu(vY0, vY1, vl);
+ vmin = __riscv_vlmul_ext_u8m2(
+ __riscv_vredminu(vmin, __riscv_vlmul_trunc_u8m1(vmin), vl));
+ vmin = __riscv_vrgather(vmin, 0, vl);
+ vY0 = __riscv_vsub(vY0, vmin, vl);
+ vY1 = __riscv_vsub(vY1, vmin, vl);
+
+            vuint8m2_t tmp; // Swap the old and new metric registers
+ tmp = vX0;
+ vX0 = vY0;
+ vY0 = tmp;
+ tmp = vX1;
+ vX1 = vY1;
+ vY1 = tmp;
+ }
+ if (n & 1) {
+ __riscv_vse8(X, vY0, vl);
+ __riscv_vse8(X + vl, vY1, vl);
+ __riscv_vse8(Y, vX0, vl);
+ __riscv_vse8(Y + vl, vX1, vl);
+ } else {
+ __riscv_vse8(X, vX0, vl);
+ __riscv_vse8(X + vl, vX1, vl);
+ __riscv_vse8(Y, vY0, vl);
+ __riscv_vse8(Y + vl, vY1, vl);
+ }
+ } else if (__riscv_vlenb() == 256 / 8) {
+ vuint8m1_t vX0 = __riscv_vle8_v_u8m1(X, vl),
+ vX1 = __riscv_vle8_v_u8m1(X + vl, vl);
+ vuint8m1_t vY0 = __riscv_vle8_v_u8m1(Y, vl),
+ vY1 = __riscv_vle8_v_u8m1(Y + vl, vl);
+ vuint8m1_t vB0 = __riscv_vle8_v_u8m1(Branchtab, vl);
+ vuint8m1_t vB1 = __riscv_vle8_v_u8m1(Branchtab + vl, vl);
+ vuint8m1_t v63 = __riscv_vmv_v_x_u8m1(63, vl);
+
+ for (size_t i = 0; i < n; ++i) {
+ // Butterfly
+ vuint8m1_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl);
+ vuint8m1_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl);
+ vuint8m1_t va = __riscv_vaaddu(va0, va1, 0, vl);
+ va = __riscv_vreinterpret_u8m1(
+ __riscv_vsrl(__riscv_vreinterpret_u16m1(va), 2, vl / 2));
+ va = __riscv_vand(va, v63, vl);
+ vuint8m1_t vb = __riscv_vssubu(v63, va, vl);
+ vuint8m1_t vX0a = __riscv_vsaddu(vX0, va, vl);
+ vuint8m1_t vX1b = __riscv_vsaddu(vX1, vb, vl);
+ vuint8m1_t vX0b = __riscv_vsaddu(vX0, vb, vl);
+ vuint8m1_t vX1a = __riscv_vsaddu(vX1, va, vl);
+ vY0 = __riscv_vminu(vX1b, vX0a, vl);
+ vY1 = __riscv_vminu(vX1a, vX0b, vl);
+
+ vuint16m2_t vX1ba =
+ __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl);
+ vX1b = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vX1ba), 0);
+ vX1a = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vX1ba), 1);
+
+ vuint16m2_t vm =
+ __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl);
+ vY0 = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vm), 0);
+ vY1 = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vm), 1);
+
+ __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY0, vX1b, vl), vl);
+ __riscv_vsm(&dec[8 * i + 4], __riscv_vmseq(vY1, vX1a, vl), vl);
+
+ // Renormalize
+ vuint8m1_t vmin = __riscv_vminu(vY0, vY1, vl);
+ vmin = __riscv_vrgather(__riscv_vredminu(vmin, vmin, vl), 0, vl);
+ vY0 = __riscv_vsub(vY0, vmin, vl);
+ vY1 = __riscv_vsub(vY1, vmin, vl);
+
+            vuint8m1_t tmp; // Swap the old and new metric registers
+ tmp = vX0;
+ vX0 = vY0;
+ vY0 = tmp;
+ tmp = vX1;
+ vX1 = vY1;
+ vY1 = tmp;
+ }
+ if (n & 1) {
+ __riscv_vse8(X, vY0, vl);
+ __riscv_vse8(X + vl, vY1, vl);
+ __riscv_vse8(Y, vX0, vl);
+ __riscv_vse8(Y + vl, vX1, vl);
+ } else {
+ __riscv_vse8(X, vX0, vl);
+ __riscv_vse8(X + vl, vX1, vl);
+ __riscv_vse8(Y, vY0, vl);
+ __riscv_vse8(Y + vl, vY1, vl);
+ }
+ } else {
+ vuint8mf2_t vX0 = __riscv_vle8_v_u8mf2(X, vl),
+ vX1 = __riscv_vle8_v_u8mf2(X + vl, vl);
+ vuint8mf2_t vY0 = __riscv_vle8_v_u8mf2(Y, vl),
+ vY1 = __riscv_vle8_v_u8mf2(Y + vl, vl);
+ vuint8mf2_t vB0 = __riscv_vle8_v_u8mf2(Branchtab, vl);
+ vuint8mf2_t vB1 = __riscv_vle8_v_u8mf2(Branchtab + vl, vl);
+ vuint8mf2_t v63 = __riscv_vmv_v_x_u8mf2(63, vl);
+
+ for (size_t i = 0; i < n; ++i) {
+ // Butterfly
+ vuint8mf2_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl);
+ vuint8mf2_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl);
+ vuint8mf2_t va = __riscv_vaaddu(va0, va1, 0, vl);
+ va = __riscv_vreinterpret_u8mf2(
+ __riscv_vsrl(__riscv_vreinterpret_u16mf2(va), 2, vl / 2));
+ va = __riscv_vand(va, v63, vl);
+ vuint8mf2_t vb = __riscv_vssubu(v63, va, vl);
+ vuint8mf2_t vX0a = __riscv_vsaddu(vX0, va, vl);
+ vuint8mf2_t vX1b = __riscv_vsaddu(vX1, vb, vl);
+ vuint8mf2_t vX0b = __riscv_vsaddu(vX0, vb, vl);
+ vuint8mf2_t vX1a = __riscv_vsaddu(vX1, va, vl);
+ vY0 = __riscv_vminu(vX1b, vX0a, vl);
+ vY1 = __riscv_vminu(vX1a, vX0b, vl);
+
+ vuint8m1_t vX1ba = __riscv_vreinterpret_u8m1(
+ __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl));
+ vuint8m1_t vY01 = __riscv_vreinterpret_u8m1(
+ __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl));
+
+ __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY01, vX1ba, vl * 2), vl * 2);
+
+ // Renormalize
+ vuint8m1_t vmin =
+ __riscv_vrgather(__riscv_vredminu(vY01, vY01, vl * 2), 0, vl * 2);
+ vY01 = __riscv_vsub(vY01, vmin, vl * 2);
+
+ vY0 = __riscv_vlmul_trunc_u8mf2(vY01);
+ vY1 = __riscv_vlmul_trunc_u8mf2(__riscv_vslidedown(vY01, vl, vl));
+
+            vuint8mf2_t tmp; // Swap the old and new metric registers
+ tmp = vX0;
+ vX0 = vY0;
+ vY0 = tmp;
+ tmp = vX1;
+ vX1 = vY1;
+ vY1 = tmp;
+ }
+ if (n & 1) {
+ __riscv_vse8(X, vY0, vl);
+ __riscv_vse8(X + vl, vY1, vl);
+ __riscv_vse8(Y, vX0, vl);
+ __riscv_vse8(Y + vl, vX1, vl);
+ } else {
+ __riscv_vse8(X, vX0, vl);
+ __riscv_vse8(X + vl, vX1, vl);
+ __riscv_vse8(Y, vY0, vl);
+ __riscv_vse8(Y + vl, vY1, vl);
+ }
+ }
+}
+#endif /*LV_HAVE_RVV*/
+
#endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 2c160b2f..588db44f 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -93,12 +93,28 @@ execute_process(
OUTPUT_VARIABLE arch_flag_lines
OUTPUT_STRIP_TRAILING_WHITESPACE)
+try_compile(
+ HAVE_RVV_INTRINSICS
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}/cmake/Checks/check-rvv-intrinsics.c
+)
+if(HAVE_RVV_INTRINSICS)
+ message(STATUS "Checking RVV intrinsics - found")
+else()
+ message(STATUS "Checking RVV intrinsics - not found")
+endif()
+
macro(check_arch arch_name)
set(flags ${ARGN})
set(have_${arch_name} TRUE)
+
+ string(SUBSTRING "${arch_name}" 0 2 arch_prefix)
foreach(flag ${flags})
if(MSVC AND (${flag} STREQUAL "/arch:SSE2" OR ${flag} STREQUAL "/arch:SSE"))
# SSE/SSE2 is supported in MSVC since VS 2005 but flag not available when compiling 64-bit so do not check
+ elseif("${arch_prefix}" STREQUAL "rv" AND NOT HAVE_RVV_INTRINSICS)
+ message(STATUS "Skipping ${arch_name} due to missing RVV intrinsics support")
+ set(have_${arch_name} FALSE)
else()
include(CheckCXXCompilerFlag)
set(have_flag have${flag})
diff --git a/tmpl/volk_cpu.tmpl.c b/tmpl/volk_cpu.tmpl.c
index a4a06b0f..2cf2fa34 100644
--- a/tmpl/volk_cpu.tmpl.c
+++ b/tmpl/volk_cpu.tmpl.c
@@ -49,7 +49,7 @@ static int i_can_has_${arch.name} (void) {
#if defined(CPU_FEATURES_ARCH_MIPS)
if (GetMipsInfo().features.${check} == 0){ return 0; }
#endif
- %elif "riscv" in arch.name:
+ %elif "riscv" in arch.name or arch.name[:2] == "rv":
#if defined(CPU_FEATURES_ARCH_RISCV)
if (GetRiscvInfo().features.${check} == 0){ return 0; }
#endif