diff --git a/.github/workflows/run-tests-rvv.yml b/.github/workflows/run-tests-rvv.yml new file mode 100644 index 00000000..e97825f3 --- /dev/null +++ b/.github/workflows/run-tests-rvv.yml @@ -0,0 +1,55 @@ +# +# Copyright 2020 - 2022 Free Software Foundation, Inc. +# +# This file is part of VOLK +# +# SPDX-License-Identifier: LGPL-3.0-or-later +# + +name: Run VOLK tests on different RVV configurations + +on: [push, pull_request] + +jobs: + Tests: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + with: + submodules: "recursive" + - name: Install packages + run: | + sudo apt-get update -q -y + sudo apt-get install -y python3-mako cmake qemu-user-static g++-14-riscv64-linux-gnu clang-18 + mkdir build + cd build + - name: Test gcc-14 VLEN=128 + run: | + cd build; rm -rf * + CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=128 \ + cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. + make -j$(nproc) + ARGS=-V make test + - name: Test gcc-14 VLEN=256 + run: | + cd build; rm -rf * + CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=256 \ + cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release + make -j$(nproc) + ARGS=-V make test + - name: Test clang-18 VLEN=512 + run: | + cd build; rm -rf * + CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=512 \ + cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. + make -j$(nproc) + ARGS=-V make test + - name: Test clang-18 VLEN=1024 + run: | + cd build; rm -rf * + CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=1024 \ + cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release + make -j$(nproc) + ARGS=-V make test + + diff --git a/cmake/Checks/check-rvv-intrinsics.c b/cmake/Checks/check-rvv-intrinsics.c new file mode 100644 index 00000000..48d874de --- /dev/null +++ b/cmake/Checks/check-rvv-intrinsics.c @@ -0,0 +1,5 @@ +#if (__riscv_v_intrinsic >= 1000000 || __clang_major__ >= 18 || __GNUC__ >= 14) +int main() { return 0; } +#else +#error "rvv intrinsics aren't supported" +#endif diff --git a/cmake/Toolchains/rv64gcv-linux-gnu.cmake b/cmake/Toolchains/rv64gcv-linux-gnu.cmake new file mode 100644 index 00000000..f6edd741 --- /dev/null +++ b/cmake/Toolchains/rv64gcv-linux-gnu.cmake @@ -0,0 +1,34 @@ +# +# Copyright 2024 Free Software Foundation, Inc. 
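+#
+# Cross-compilation toolchain for rv64gcv-linux-gnu: the compilers are taken
+# from the CC/CXX environment variables, and cross-compiled test binaries run
+# through qemu-riscv64-static with the vector length chosen by the VLEN
+# environment variable (QEMU_VLEN below, default 128).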
+# +# This file is part of VOLK +# +# SPDX-License-Identifier: LGPL-3.0-or-later +# + +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR riscv64) + +set(CMAKE_C_COMPILER $ENV{CC}) +set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) +set(CMAKE_CXX_COMPILER $ENV{CXX}) + +set(CMAKE_C_FLAGS "$ENV{CFLAGS} -march=rv64gcv" CACHE STRING "" FORCE) +set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS} CACHE STRING "" FORCE) +set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -g" CACHE STRING "" FORCE) + +set(CMAKE_OBJCOPY + ${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}objcopy + CACHE INTERNAL "objcopy tool") +set(CMAKE_SIZE_UTIL + ${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}size + CACHE INTERNAL "size tool") + +set(CMAKE_FIND_ROOT_PATH ${BINUTILS_PATH}) + +set(QEMU_VLEN $ENV{VLEN}) +if(NOT QEMU_VLEN) + set(QEMU_VLEN "128") +endif() + +set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-riscv64-static -L /usr/riscv64-linux-gnu/ -cpu rv64,zba=true,zbb=true,v=on,vlen=${QEMU_VLEN},rvv_ta_all_1s=on,rvv_ma_all_1s=on") diff --git a/gen/archs.xml b/gen/archs.xml index 164c7bb4..7f971369 100644 --- a/gen/archs.xml +++ b/gen/archs.xml @@ -181,4 +181,48 @@ at the top, as a last resort. + + tmpl/ currently assumes that every arch.name starting with "rv" requires + RVV intrinsics + + + There is currently no mechanism in RISC-V to append extensions, + so each arch needs to specify all of them, and the order needs in the + machine definition needs to be from the fewest to the most extensions. + Fortunately, this maps quite well to the profiles concept. + + + + -march=rv64gcv + -march=rv64gcv + + + + + -march=rv64gcv + -march=rv64gcv + + It's unclear how performance portable segmented load/stores are, so the + default rvv implementations avoid using them. + This is a pseudo arch for separate segmented load/store implementations, + and is expected to never be used standalone without "rvv". + + + + + google/cpu_features currently doesn't support these extensions and profiles. + + + + + diff --git a/gen/machines.xml b/gen/machines.xml index 887f9794..64e1bbd8 100644 --- a/gen/machines.xml +++ b/gen/machines.xml @@ -33,6 +33,18 @@ generic riscv64 orc| + +generic riscv64 rvv rvvseg orc| + + + + + + generic 32|64| mmx| sse sse2 sse3 sse4_a popcount orc| diff --git a/include/volk/volk_rvv_intrinsics.h b/include/volk/volk_rvv_intrinsics.h new file mode 100644 index 00000000..85e21d43 --- /dev/null +++ b/include/volk/volk_rvv_intrinsics.h @@ -0,0 +1,77 @@ +/* -*- c++ -*- */ +/* + * Copyright 2024 Free Software Foundation, Inc. + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +/* + * This file is intended to hold RVV intrinsics of intrinsics. + * They should be used in VOLK kernels to avoid copy-paste. 
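+ *
+ * The RISCV_SHRINK{2,4,8} macros fold an LMUL=2/4/8 register group down to a
+ * single LMUL=1 register by applying a binary op (e.g. vfadd) pairwise, so a
+ * kernel can finish a horizontal reduction with one __riscv_vfredusum or
+ * __riscv_vredsum on the m1 result. Illustrative use, as in the dot-product
+ * kernels: vfloat32m1_t r = RISCV_SHRINK4(vfadd, f, 32, acc_m4);
+ * RISCV_PERM{4,8} apply a per-m1-part vrgather permutation across an m4/m8
+ * byte vector, RISCV_LUT4 does the same with a fixed m1 lookup table, and
+ * RISCV_VMFLTZ builds a "negative" mask by reinterpreting elements as
+ * signed integers and comparing against zero.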
+ */ + +#ifndef INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ +#define INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ +#include + +#define RISCV_SHRINK2(op, T, S, v) \ + __riscv_##op(__riscv_vget_##T##S##m1(v, 0), \ + __riscv_vget_##T##S##m1(v, 1), \ + __riscv_vsetvlmax_e##S##m1()) + +#define RISCV_SHRINK4(op, T, S, v) \ + __riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \ + __riscv_vget_##T##S##m1(v, 1), \ + __riscv_vsetvlmax_e##S##m1()), \ + __riscv_##op(__riscv_vget_##T##S##m1(v, 2), \ + __riscv_vget_##T##S##m1(v, 3), \ + __riscv_vsetvlmax_e##S##m1()), \ + __riscv_vsetvlmax_e##S##m1()) + +#define RISCV_SHRINK8(op, T, S, v) \ + __riscv_##op(__riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \ + __riscv_vget_##T##S##m1(v, 1), \ + __riscv_vsetvlmax_e##S##m1()), \ + __riscv_##op(__riscv_vget_##T##S##m1(v, 2), \ + __riscv_vget_##T##S##m1(v, 3), \ + __riscv_vsetvlmax_e##S##m1()), \ + __riscv_vsetvlmax_e##S##m1()), \ + __riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 4), \ + __riscv_vget_##T##S##m1(v, 5), \ + __riscv_vsetvlmax_e##S##m1()), \ + __riscv_##op(__riscv_vget_##T##S##m1(v, 6), \ + __riscv_vget_##T##S##m1(v, 7), \ + __riscv_vsetvlmax_e##S##m1()), \ + __riscv_vsetvlmax_e##S##m1()), \ + __riscv_vsetvlmax_e##S##m1()) + +#define RISCV_PERM4(f, v, vidx) \ + __riscv_vcreate_v_u8m1_u8m4( \ + f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1())) + +#define RISCV_LUT4(f, vtbl, v) \ + __riscv_vcreate_v_u8m1_u8m4( \ + f(vtbl, __riscv_vget_u8m1(v, 0), __riscv_vsetvlmax_e8m1()), \ + f(vtbl, __riscv_vget_u8m1(v, 1), __riscv_vsetvlmax_e8m1()), \ + f(vtbl, __riscv_vget_u8m1(v, 2), __riscv_vsetvlmax_e8m1()), \ + f(vtbl, __riscv_vget_u8m1(v, 3), __riscv_vsetvlmax_e8m1())) + +#define RISCV_PERM8(f, v, vidx) \ + __riscv_vcreate_v_u8m1_u8m8( \ + f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 4), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 5), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 6), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 7), vidx, __riscv_vsetvlmax_e8m1())) + +#define RISCV_VMFLTZ(T, v, vl) __riscv_vmslt(__riscv_vreinterpret_i##T(v), 0, vl) + +#endif /* INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ */ diff --git a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h index 8949785f..8d772ba8 100644 --- a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h +++ b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h @@ -668,5 +668,66 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result, #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_16i_32fc_dot_prod_32fc_rvv(lv_32fc_t* result, + const short* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); + vfloat32m4_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)taps, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = 
__riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + vfloat32m4_t v = + __riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl); + vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl); + vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr); + vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl))); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void volk_16i_32fc_dot_prod_32fc_rvvseg(lv_32fc_t* result, + const short* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); + vfloat32m4_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0); + vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t v = + __riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl); + vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl); + vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr); + vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl))); +} +#endif /*LV_HAVE_RVVSEG*/ #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/ diff --git a/kernels/volk/volk_16i_branch_4_state_8.h b/kernels/volk/volk_16i_branch_4_state_8.h index b0f4d3b6..775b1523 100644 --- a/kernels/volk/volk_16i_branch_4_state_8.h +++ b/kernels/volk/volk_16i_branch_4_state_8.h @@ -10,6 +10,10 @@ /*! * \page volk_16i_branch_4_state_8 * + * \b Deprecation + * + * This kernel is deprecated. + * * \b Overview * * diff --git a/kernels/volk/volk_16i_convert_8i.h b/kernels/volk/volk_16i_convert_8i.h index cb7168ef..648712af 100644 --- a/kernels/volk/volk_16i_convert_8i.h +++ b/kernels/volk/volk_16i_convert_8i.h @@ -275,5 +275,20 @@ static inline void volk_16i_convert_8i_neon(int8_t* outputVector, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16i_convert_8i_rvv(int8_t* outputVector, + const int16_t* inputVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e16m8(n); + vint16m8_t v = __riscv_vle16_v_i16m8(inputVector, vl); + __riscv_vse8(outputVector, __riscv_vnsra(v, 8, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_16i_convert_8i_a_H */ diff --git a/kernels/volk/volk_16i_max_star_16i.h b/kernels/volk/volk_16i_max_star_16i.h index fba73da1..ab0a4bcf 100644 --- a/kernels/volk/volk_16i_max_star_16i.h +++ b/kernels/volk/volk_16i_max_star_16i.h @@ -10,6 +10,10 @@ /*! * \page volk_16i_max_star_16i * + * \b Deprecation + * + * This kernel is deprecated. + * * \b Overview * * diff --git a/kernels/volk/volk_16i_max_star_horizontal_16i.h b/kernels/volk/volk_16i_max_star_horizontal_16i.h index 2b0b65c3..ee08ba43 100644 --- a/kernels/volk/volk_16i_max_star_horizontal_16i.h +++ b/kernels/volk/volk_16i_max_star_horizontal_16i.h @@ -11,6 +11,10 @@ /*! 
* \page volk_16i_max_star_horizontal_16i * + * \b Deprecation + * + * This kernel is deprecated. + * * \b Overview * * diff --git a/kernels/volk/volk_16i_permute_and_scalar_add.h b/kernels/volk/volk_16i_permute_and_scalar_add.h index 077c37b0..f57603db 100644 --- a/kernels/volk/volk_16i_permute_and_scalar_add.h +++ b/kernels/volk/volk_16i_permute_and_scalar_add.h @@ -10,6 +10,10 @@ /*! * \page volk_16i_permute_and_scalar_add * + * \b Deprecation + * + * This kernel is deprecated. + * * \b Overview * * diff --git a/kernels/volk/volk_16i_s32f_convert_32f.h b/kernels/volk/volk_16i_s32f_convert_32f.h index 817ecd22..1f9660ce 100644 --- a/kernels/volk/volk_16i_s32f_convert_32f.h +++ b/kernels/volk/volk_16i_s32f_convert_32f.h @@ -483,4 +483,21 @@ static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16i_s32f_convert_32f_rvv(float* outputVector, + const int16_t* inputVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e16m4(n); + vfloat32m8_t v = __riscv_vfwcvt_f(__riscv_vle16_v_i16m4(inputVector, vl), vl); + __riscv_vse32(outputVector, __riscv_vfmul(v, 1.0f / scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */ diff --git a/kernels/volk/volk_16i_x4_quad_max_star_16i.h b/kernels/volk/volk_16i_x4_quad_max_star_16i.h index a8337cc3..94e264fe 100644 --- a/kernels/volk/volk_16i_x4_quad_max_star_16i.h +++ b/kernels/volk/volk_16i_x4_quad_max_star_16i.h @@ -10,6 +10,10 @@ /*! * \page volk_16i_x4_quad_max_star_16i * + * \b Deprecation + * + * This kernel is deprecated. + * * \b Overview * * diff --git a/kernels/volk/volk_16i_x5_add_quad_16i_x4.h b/kernels/volk/volk_16i_x5_add_quad_16i_x4.h index 53fa8de5..ba14c59d 100644 --- a/kernels/volk/volk_16i_x5_add_quad_16i_x4.h +++ b/kernels/volk/volk_16i_x5_add_quad_16i_x4.h @@ -10,6 +10,10 @@ /*! * \page volk_16i_x5_add_quad_16i_x4 * + * \b Deprecation + * + * This kernel is deprecated. 
+ * * \b Overview * * diff --git a/kernels/volk/volk_16ic_convert_32fc.h b/kernels/volk/volk_16ic_convert_32fc.h index 7a779bf8..99fe7cb2 100644 --- a/kernels/volk/volk_16ic_convert_32fc.h +++ b/kernels/volk/volk_16ic_convert_32fc.h @@ -315,4 +315,23 @@ static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, } #endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16ic_convert_32fc_rvv(lv_32fc_t* outputVector, + const lv_16sc_t* inputVector, + unsigned int num_points) +{ + const int16_t* in = (const int16_t*)inputVector; + float* out = (float*)outputVector; + size_t n = num_points * 2; + for (size_t vl; n > 0; n -= vl, in += vl, out += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint16m4_t v = __riscv_vle16_v_i16m4(in, vl); + __riscv_vse32(out, __riscv_vfwcvt_f(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */ diff --git a/kernels/volk/volk_16ic_deinterleave_16i_x2.h b/kernels/volk/volk_16ic_deinterleave_16i_x2.h index 37fb41e1..9f4ad7f7 100644 --- a/kernels/volk/volk_16ic_deinterleave_16i_x2.h +++ b/kernels/volk/volk_16ic_deinterleave_16i_x2.h @@ -375,4 +375,45 @@ static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16ic_deinterleave_16i_x2_rvv(int16_t* iBuffer, + int16_t* qBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e16m4(n); + vuint32m8_t vc = __riscv_vle32_v_u32m8((const uint32_t*)complexVector, vl); + vuint16m4_t vr = __riscv_vnsrl(vc, 0, vl); + vuint16m4_t vi = __riscv_vnsrl(vc, 16, vl); + __riscv_vse16((uint16_t*)iBuffer, vr, vl); + __riscv_vse16((uint16_t*)qBuffer, vi, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_16ic_deinterleave_16i_x2_rvvseg(int16_t* iBuffer, + int16_t* qBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e16m4(n); + vuint16m4x2_t vc = + __riscv_vlseg2e16_v_u16m4x2((const uint16_t*)complexVector, vl); + vuint16m4_t vr = __riscv_vget_u16m4(vc, 0); + vuint16m4_t vi = __riscv_vget_u16m4(vc, 1); + __riscv_vse16((uint16_t*)iBuffer, vr, vl); + __riscv_vse16((uint16_t*)qBuffer, vi, vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_u_H */ diff --git a/kernels/volk/volk_16ic_deinterleave_real_16i.h b/kernels/volk/volk_16ic_deinterleave_real_16i.h index 92110a3a..f5a9696f 100644 --- a/kernels/volk/volk_16ic_deinterleave_real_16i.h +++ b/kernels/volk/volk_16ic_deinterleave_real_16i.h @@ -377,4 +377,21 @@ static inline void volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16ic_deinterleave_real_16i_rvv(int16_t* iBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) +{ + const uint32_t* in = (const uint32_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e32m8(n); + vuint32m8_t vc = __riscv_vle32_v_u32m8(in, vl); + __riscv_vse16((uint16_t*)iBuffer, __riscv_vnsrl(vc, 0, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_16ic_deinterleave_real_16i_u_H */ diff --git 
a/kernels/volk/volk_16ic_deinterleave_real_8i.h b/kernels/volk/volk_16ic_deinterleave_real_8i.h index 231be417..257ea519 100644 --- a/kernels/volk/volk_16ic_deinterleave_real_8i.h +++ b/kernels/volk/volk_16ic_deinterleave_real_8i.h @@ -415,4 +415,24 @@ static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, } } #endif /* LV_HAVE_AVX2 */ + + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16ic_deinterleave_real_8i_rvv(int8_t* iBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) +{ + const uint32_t* in = (const uint32_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e32m8(n); + vuint32m8_t vc = __riscv_vle32_v_u32m8(in, vl); + __riscv_vse8( + (uint8_t*)iBuffer, __riscv_vnsrl(__riscv_vnsrl(vc, 0, vl), 8, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */ diff --git a/kernels/volk/volk_16ic_magnitude_16i.h b/kernels/volk/volk_16ic_magnitude_16i.h index 76472540..79553d65 100644 --- a/kernels/volk/volk_16ic_magnitude_16i.h +++ b/kernels/volk/volk_16ic_magnitude_16i.h @@ -411,4 +411,50 @@ static inline void volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, } #endif /* LV_HAVE_NEONV7 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16ic_magnitude_16i_rvv(int16_t* magnitudeVector, + const lv_16sc_t* complexVector, + unsigned int num_points) +{ + const float scale = SHRT_MAX, iscale = 1.0f / scale; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint32m8_t vc = __riscv_vle32_v_i32m8((const int32_t*)complexVector, vl); + vint16m4_t vr = __riscv_vnsra(vc, 0, vl); + vint16m4_t vi = __riscv_vnsra(vc, 16, vl); + vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), iscale, vl); + vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), iscale, vl); + vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl); + vf = __riscv_vfmul(__riscv_vfsqrt(vf, vl), scale, vl); + __riscv_vse16(magnitudeVector, __riscv_vfncvt_x(vf, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_16ic_magnitude_16i_rvvseg(int16_t* magnitudeVector, + const lv_16sc_t* complexVector, + unsigned int num_points) +{ + const float scale = SHRT_MAX, iscale = 1.0f / scale; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint16m4x2_t vc = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)complexVector, vl); + vint16m4_t vr = __riscv_vget_i16m4(vc, 0); + vint16m4_t vi = __riscv_vget_i16m4(vc, 1); + vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), iscale, vl); + vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), iscale, vl); + vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl); + vf = __riscv_vfmul(__riscv_vfsqrt(vf, vl), scale, vl); + __riscv_vse16(magnitudeVector, __riscv_vfncvt_x(vf, vl), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_16ic_magnitude_16i_u_H */ diff --git a/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h b/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h index 219e977c..7f9b8ad6 100644 --- a/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h +++ b/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h @@ -327,4 +327,51 @@ volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include 
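+// The plain RVV variant below loads each complex sample as one 32-bit element
+// and splits I/Q with narrowing shifts (vnsra by 0 and by 16); the rvvseg
+// variant further down uses a segmented load (vlseg2e16) instead. Both widen
+// to float and scale the results by 1/scalar.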
+ +static inline void volk_16ic_s32f_deinterleave_32f_x2_rvv(float* iBuffer, + float* qBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint32m8_t vc = __riscv_vle32_v_i32m8((const int32_t*)complexVector, vl); + vint16m4_t vr = __riscv_vnsra(vc, 0, vl); + vint16m4_t vi = __riscv_vnsra(vc, 16, vl); + vfloat32m8_t vrf = __riscv_vfwcvt_f(vr, vl); + vfloat32m8_t vif = __riscv_vfwcvt_f(vi, vl); + __riscv_vse32(iBuffer, __riscv_vfmul(vrf, 1.0f / scalar, vl), vl); + __riscv_vse32(qBuffer, __riscv_vfmul(vif, 1.0f / scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void +volk_16ic_s32f_deinterleave_32f_x2_rvvseg(float* iBuffer, + float* qBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint16m4x2_t vc = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)complexVector, vl); + vint16m4_t vr = __riscv_vget_i16m4(vc, 0); + vint16m4_t vi = __riscv_vget_i16m4(vc, 1); + vfloat32m8_t vrf = __riscv_vfwcvt_f(vr, vl); + vfloat32m8_t vif = __riscv_vfwcvt_f(vi, vl); + __riscv_vse32(iBuffer, __riscv_vfmul(vrf, 1.0f / scalar, vl), vl); + __riscv_vse32(qBuffer, __riscv_vfmul(vif, 1.0f / scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H */ diff --git a/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h b/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h index 55688329..e8a0d1a0 100644 --- a/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h +++ b/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h @@ -334,4 +334,24 @@ volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_16ic_s32f_deinterleave_real_32f_rvv(float* iBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + const int32_t* in = (const int32_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e32m8(n); + vint32m8_t vc = __riscv_vle32_v_i32m8(in, vl); + vfloat32m8_t vr = __riscv_vfwcvt_f(__riscv_vncvt_x(vc, vl), vl); + __riscv_vse32(iBuffer, __riscv_vfmul(vr, 1.0f / scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H */ diff --git a/kernels/volk/volk_16ic_s32f_magnitude_32f.h b/kernels/volk/volk_16ic_s32f_magnitude_32f.h index 89600632..8b193ee2 100644 --- a/kernels/volk/volk_16ic_s32f_magnitude_32f.h +++ b/kernels/volk/volk_16ic_s32f_magnitude_32f.h @@ -329,4 +329,48 @@ static inline void volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16ic_s32f_magnitude_32f_rvv(float* magnitudeVector, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint32m8_t vc = __riscv_vle32_v_i32m8((const int32_t*)complexVector, vl); + vint16m4_t vr = __riscv_vnsra(vc, 0, vl); + vint16m4_t vi = __riscv_vnsra(vc, 16, vl); + vfloat32m8_t vrf = 
__riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0f / scalar, vl); + vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0f / scalar, vl); + vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl); + __riscv_vse32(magnitudeVector, __riscv_vfsqrt(vf, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_16ic_s32f_magnitude_32f_rvvseg(float* magnitudeVector, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint16m4x2_t vc = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)complexVector, vl); + vint16m4_t vr = __riscv_vget_i16m4(vc, 0); + vint16m4_t vi = __riscv_vget_i16m4(vc, 1); + vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0f / scalar, vl); + vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0f / scalar, vl); + vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl); + __riscv_vse32(magnitudeVector, __riscv_vfsqrt(vf, vl), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_u_H */ diff --git a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h index 48e33abf..a12350a0 100644 --- a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h +++ b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h @@ -690,4 +690,68 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, #endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_RVV +#include "volk_32fc_x2_dot_prod_32fc.h" + +static inline void volk_16ic_x2_dot_prod_16ic_rvv(lv_16sc_t* result, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) +{ + vint16m4_t vsumr = __riscv_vmv_v_x_i16m4(0, __riscv_vsetvlmax_e16m4()); + vint16m4_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in_a += vl, in_b += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint32m8_t va = __riscv_vle32_v_i32m8((const int32_t*)in_a, vl); + vint32m8_t vb = __riscv_vle32_v_i32m8((const int32_t*)in_b, vl); + vint16m4_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 16, vl); + vint16m4_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 16, vl); + vint16m4_t vr = __riscv_vnmsac(__riscv_vmul(var, vbr, vl), vai, vbi, vl); + vint16m4_t vi = __riscv_vmacc(__riscv_vmul(var, vbi, vl), vai, vbr, vl); + vsumr = __riscv_vadd_tu(vsumr, vsumr, vr, vl); + vsumi = __riscv_vadd_tu(vsumi, vsumi, vi, vl); + } + size_t vl = __riscv_vsetvlmax_e16m1(); + vint16m1_t vr = RISCV_SHRINK4(vadd, i, 16, vsumr); + vint16m1_t vi = RISCV_SHRINK4(vadd, i, 16, vsumi); + vint16m1_t z = __riscv_vmv_s_x_i16m1(0, vl); + *result = lv_cmake(__riscv_vmv_x(__riscv_vredsum(vr, z, vl)), + __riscv_vmv_x(__riscv_vredsum(vi, z, vl))); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include "volk_32fc_x2_dot_prod_32fc.h" + + +static inline void volk_16ic_x2_dot_prod_16ic_rvvseg(lv_16sc_t* result, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) +{ + vint16m4_t vsumr = __riscv_vmv_v_x_i16m4(0, __riscv_vsetvlmax_e16m4()); + vint16m4_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in_a += vl, in_b += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint16m4x2_t va = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)in_a, vl); + vint16m4x2_t vb = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)in_b, vl); + vint16m4_t var = __riscv_vget_i16m4(va, 0), vai = __riscv_vget_i16m4(va, 1); + 
vint16m4_t vbr = __riscv_vget_i16m4(vb, 0), vbi = __riscv_vget_i16m4(vb, 1); + vint16m4_t vr = __riscv_vnmsac(__riscv_vmul(var, vbr, vl), vai, vbi, vl); + vint16m4_t vi = __riscv_vmacc(__riscv_vmul(var, vbi, vl), vai, vbr, vl); + vsumr = __riscv_vadd_tu(vsumr, vsumr, vr, vl); + vsumi = __riscv_vadd_tu(vsumi, vsumi, vi, vl); + } + size_t vl = __riscv_vsetvlmax_e16m1(); + vint16m1_t vr = RISCV_SHRINK4(vadd, i, 16, vsumr); + vint16m1_t vi = RISCV_SHRINK4(vadd, i, 16, vsumi); + vint16m1_t z = __riscv_vmv_s_x_i16m1(0, vl); + *result = lv_cmake(__riscv_vmv_x(__riscv_vredsum(vr, z, vl)), + __riscv_vmv_x(__riscv_vredsum(vi, z, vl))); +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /*INCLUDED_volk_16ic_x2_dot_prod_16ic_H*/ diff --git a/kernels/volk/volk_16ic_x2_multiply_16ic.h b/kernels/volk/volk_16ic_x2_multiply_16ic.h index 03ee145c..37f0fb66 100644 --- a/kernels/volk/volk_16ic_x2_multiply_16ic.h +++ b/kernels/volk/volk_16ic_x2_multiply_16ic.h @@ -462,4 +462,52 @@ static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16ic_x2_multiply_16ic_rvv(lv_16sc_t* result, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in_a += vl, in_b += vl, result += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint32m8_t va = __riscv_vle32_v_i32m8((const int32_t*)in_a, vl); + vint32m8_t vb = __riscv_vle32_v_i32m8((const int32_t*)in_b, vl); + vint16m4_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 16, vl); + vint16m4_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 16, vl); + vint16m4_t vr = __riscv_vnmsac(__riscv_vmul(var, vbr, vl), vai, vbi, vl); + vint16m4_t vi = __riscv_vmacc(__riscv_vmul(var, vbi, vl), vai, vbr, vl); + vuint16m4_t vru = __riscv_vreinterpret_u16m4(vr); + vuint16m4_t viu = __riscv_vreinterpret_u16m4(vi); + vuint32m8_t v = __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFF, viu, vl); + __riscv_vse32((uint32_t*)result, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_16ic_x2_multiply_16ic_rvvseg(lv_16sc_t* result, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in_a += vl, in_b += vl, result += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint16m4x2_t va = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)in_a, vl); + vint16m4x2_t vb = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)in_b, vl); + vint16m4_t var = __riscv_vget_i16m4(va, 0), vai = __riscv_vget_i16m4(va, 1); + vint16m4_t vbr = __riscv_vget_i16m4(vb, 0), vbi = __riscv_vget_i16m4(vb, 1); + vint16m4_t vr = __riscv_vnmsac(__riscv_vmul(var, vbr, vl), vai, vbi, vl); + vint16m4_t vi = __riscv_vmacc(__riscv_vmul(var, vbi, vl), vai, vbr, vl); + __riscv_vsseg2e16_v_i16m4x2( + (int16_t*)result, __riscv_vcreate_v_i16m4x2(vr, vi), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /*INCLUDED_volk_16ic_x2_multiply_16ic_H*/ diff --git a/kernels/volk/volk_16u_byteswap.h b/kernels/volk/volk_16u_byteswap.h index 8b1b8c03..50e59906 100644 --- a/kernels/volk/volk_16u_byteswap.h +++ b/kernels/volk/volk_16u_byteswap.h @@ -280,5 +280,54 @@ static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int nu } #endif /* LV_HAVE_ORC */ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_16u_byteswap_rvv(uint16_t* intsToSwap, unsigned int num_points) +{ + size_t n = num_points; + size_t vlmax = __riscv_vsetvlmax_e8m1(); + 
if (vlmax <= 256) { + vuint8m1_t vidx = __riscv_vreinterpret_u8m1( + __riscv_vsub(__riscv_vreinterpret_u16m1(__riscv_vid_v_u8m1(vlmax)), + 0x100 - 0x1, + vlmax / 2)); + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e16m8(n); + vuint8m8_t v = + __riscv_vreinterpret_u8m8(__riscv_vle16_v_u16m8(intsToSwap, vl)); + v = RISCV_PERM8(__riscv_vrgather, v, vidx); + __riscv_vse16(intsToSwap, __riscv_vreinterpret_u16m8(v), vl); + } + } else { + vuint16m2_t vidx = __riscv_vreinterpret_u16m2( + __riscv_vsub(__riscv_vreinterpret_u32m2(__riscv_vid_v_u16m2(vlmax)), + 0x10000 - 0x1, + vlmax / 2)); + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e16m8(n); + vuint8m8_t v = + __riscv_vreinterpret_u8m8(__riscv_vle16_v_u16m8(intsToSwap, vl)); + v = RISCV_PERM8(__riscv_vrgatherei16, v, vidx); + __riscv_vse16(intsToSwap, __riscv_vreinterpret_u16m8(v), vl); + } + } +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVA23 +#include + +static inline void volk_16u_byteswap_rva23(uint16_t* intsToSwap, unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e16m8(n); + vuint16m8_t v = __riscv_vle16_v_u16m8(intsToSwap, vl); + __riscv_vse16(intsToSwap, __riscv_vrev8(v, vl), vl); + } +} +#endif /* LV_HAVE_RVA23 */ #endif /* INCLUDED_volk_16u_byteswap_a_H */ diff --git a/kernels/volk/volk_16u_byteswappuppet_16u.h b/kernels/volk/volk_16u_byteswappuppet_16u.h index 16e75d91..f01129eb 100644 --- a/kernels/volk/volk_16u_byteswappuppet_16u.h +++ b/kernels/volk/volk_16u_byteswappuppet_16u.h @@ -102,4 +102,26 @@ static inline void volk_16u_byteswappuppet_16u_u_orc(uint16_t* output, } #endif /* LV_HAVE_ORC */ +#ifdef LV_HAVE_RVV +static inline void volk_16u_byteswappuppet_16u_rvv(uint16_t* output, + uint16_t* intsToSwap, + unsigned int num_points) +{ + + volk_16u_byteswap_rvv((uint16_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); +} +#endif + +#ifdef LV_HAVE_RVA23 +static inline void volk_16u_byteswappuppet_16u_rva23(uint16_t* output, + uint16_t* intsToSwap, + unsigned int num_points) +{ + + volk_16u_byteswap_rva23((uint16_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); +} +#endif + #endif diff --git a/kernels/volk/volk_32f_64f_add_64f.h b/kernels/volk/volk_32f_64f_add_64f.h index 06b56819..54d890e3 100644 --- a/kernels/volk/volk_32f_64f_add_64f.h +++ b/kernels/volk/volk_32f_64f_add_64f.h @@ -230,4 +230,22 @@ static inline void volk_32f_64f_add_64f_a_avx(double* cVector, #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_64f_add_64f_rvv(double* cVector, + const float* aVector, + const double* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e64m8(n); + vfloat64m8_t va = __riscv_vfwcvt_f(__riscv_vle32_v_f32m4(aVector, vl), vl); + vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl); + __riscv_vse64(cVector, __riscv_vfadd(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_64f_add_64f_u_H */ diff --git a/kernels/volk/volk_32f_64f_multiply_64f.h b/kernels/volk/volk_32f_64f_multiply_64f.h index 069cd73e..5ff81578 100644 --- a/kernels/volk/volk_32f_64f_multiply_64f.h +++ b/kernels/volk/volk_32f_64f_multiply_64f.h @@ -188,5 +188,22 @@ static inline void volk_32f_64f_multiply_64f_a_avx(double* cVector, #endif /* LV_HAVE_AVX */ 
+#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_64f_multiply_64f_rvv(double* cVector, + const float* aVector, + const double* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e64m8(n); + vfloat64m8_t va = __riscv_vfwcvt_f(__riscv_vle32_v_f32m4(aVector, vl), vl); + vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl); + __riscv_vse64(cVector, __riscv_vfmul(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_64f_multiply_64f_u_H */ diff --git a/kernels/volk/volk_32f_8u_polarbutterfly_32f.h b/kernels/volk/volk_32f_8u_polarbutterfly_32f.h index b3683a96..41e98a80 100644 --- a/kernels/volk/volk_32f_8u_polarbutterfly_32f.h +++ b/kernels/volk/volk_32f_8u_polarbutterfly_32f.h @@ -383,4 +383,174 @@ static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs, #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_8u_polarbutterfly_32f_rvv(float* llrs, + unsigned char* u, + const int frame_exp, + const int stage, + const int u_num, + const int row) +{ + const int frame_size = 0x01 << frame_exp; + if (row % 2) { // for odd rows just do the only necessary calculation and return. + const float* next_llrs = llrs + frame_size + row; + *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); + return; + } + + const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); + if (max_stage_depth < 3) { // vectorized version needs larger vectors. + volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); + return; + } + + int loop_stage = max_stage_depth; + int stage_size = 0x01 << loop_stage; + + float* src_llr_ptr; + float* dst_llr_ptr; + + if (row) { // not necessary for ZERO row. == first bit to be decoded. + // first do bit combination for all stages + // effectively encode some decoded bits again. + unsigned char* u_target = u + frame_size; + unsigned char* u_temp = u + 2 * frame_size; + memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); + + volk_8u_x2_encodeframepolar_8u_rvv(u_target, u_temp, stage_size); + + src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; + dst_llr_ptr = llrs + max_stage_depth * frame_size + row; + + size_t n = stage_size; + for (size_t vl; n > 0; + n -= vl, u_target += vl, src_llr_ptr += vl * 2, dst_llr_ptr += vl) { + vl = __riscv_vsetvl_e32m1(n); + vint8mf4_t v = __riscv_vle8_v_i8mf4((int8_t*)u_target, vl); + vuint64m2_t llr = __riscv_vle64_v_u64m2((const uint64_t*)src_llr_ptr, vl); + vfloat32m1_t llr0 = __riscv_vreinterpret_f32m1(__riscv_vnsrl(llr, 0, vl)); + vfloat32m1_t llr1 = __riscv_vreinterpret_f32m1(__riscv_vnsrl(llr, 32, vl)); + llr0 = __riscv_vfneg_mu(__riscv_vmslt(v, 0, vl), llr0, llr0, vl); + llr0 = __riscv_vfadd(llr0, llr1, vl); + __riscv_vse32(dst_llr_ptr, llr0, vl); + } + + --loop_stage; + stage_size >>= 1; + } + + const int min_stage = stage > 2 ? 
stage : 2; + + while (min_stage < loop_stage) { + dst_llr_ptr = llrs + loop_stage * frame_size + row; + src_llr_ptr = dst_llr_ptr + frame_size; + + size_t n = stage_size; + for (size_t vl; n > 0; n -= vl, src_llr_ptr += vl * 2, dst_llr_ptr += vl) { + vl = __riscv_vsetvl_e32m1(n); + vuint64m2_t llr = __riscv_vle64_v_u64m2((const uint64_t*)src_llr_ptr, vl); + vfloat32m1_t llr0 = __riscv_vreinterpret_f32m1(__riscv_vnsrl(llr, 0, vl)); + vfloat32m1_t llr1 = __riscv_vreinterpret_f32m1(__riscv_vnsrl(llr, 32, vl)); + vfloat32m1_t v = + __riscv_vfmin(__riscv_vfabs(llr0, vl), __riscv_vfabs(llr1, vl), vl); + v = __riscv_vfsgnjx(__riscv_vfsgnj(v, llr0, vl), llr1, vl); + __riscv_vse32(dst_llr_ptr, v, vl); + } + + --loop_stage; + stage_size >>= 1; + } + + // for stages < 3 vectors are too small!. + llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row); +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32f_8u_polarbutterfly_32f_rvvseg(float* llrs, + unsigned char* u, + const int frame_exp, + const int stage, + const int u_num, + const int row) +{ + const int frame_size = 0x01 << frame_exp; + if (row % 2) { // for odd rows just do the only necessary calculation and return. + const float* next_llrs = llrs + frame_size + row; + *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); + return; + } + + const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); + if (max_stage_depth < 3) { // vectorized version needs larger vectors. + volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); + return; + } + + int loop_stage = max_stage_depth; + int stage_size = 0x01 << loop_stage; + + float* src_llr_ptr; + float* dst_llr_ptr; + + if (row) { // not necessary for ZERO row. == first bit to be decoded. + // first do bit combination for all stages + // effectively encode some decoded bits again. + unsigned char* u_target = u + frame_size; + unsigned char* u_temp = u + 2 * frame_size; + memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); + + volk_8u_x2_encodeframepolar_8u_rvv(u_target, u_temp, stage_size); + + src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; + dst_llr_ptr = llrs + max_stage_depth * frame_size + row; + + size_t n = stage_size; + for (size_t vl; n > 0; + n -= vl, u_target += vl, src_llr_ptr += vl * 2, dst_llr_ptr += vl) { + vl = __riscv_vsetvl_e32m1(n); + vint8mf4_t v = __riscv_vle8_v_i8mf4((int8_t*)u_target, vl); + vfloat32m1x2_t llr = __riscv_vlseg2e32_v_f32m1x2(src_llr_ptr, vl); + vfloat32m1_t llr0 = __riscv_vget_f32m1(llr, 0); + vfloat32m1_t llr1 = __riscv_vget_f32m1(llr, 1); + llr0 = __riscv_vfneg_mu(__riscv_vmslt(v, 0, vl), llr0, llr0, vl); + llr0 = __riscv_vfadd(llr0, llr1, vl); + __riscv_vse32(dst_llr_ptr, llr0, vl); + } + + --loop_stage; + stage_size >>= 1; + } + + const int min_stage = stage > 2 ? 
stage : 2; + + while (min_stage < loop_stage) { + dst_llr_ptr = llrs + loop_stage * frame_size + row; + src_llr_ptr = dst_llr_ptr + frame_size; + + size_t n = stage_size; + for (size_t vl; n > 0; n -= vl, src_llr_ptr += vl * 2, dst_llr_ptr += vl) { + vl = __riscv_vsetvl_e32m1(n); + vfloat32m1x2_t llr = __riscv_vlseg2e32_v_f32m1x2(src_llr_ptr, vl); + vfloat32m1_t llr0 = __riscv_vget_f32m1(llr, 0); + vfloat32m1_t llr1 = __riscv_vget_f32m1(llr, 1); + vfloat32m1_t v = + __riscv_vfmin(__riscv_vfabs(llr0, vl), __riscv_vfabs(llr1, vl), vl); + v = __riscv_vfsgnjx(__riscv_vfsgnj(v, llr0, vl), llr1, vl); + __riscv_vse32(dst_llr_ptr, v, vl); + } + + --loop_stage; + stage_size >>= 1; + } + + // for stages < 3 vectors are too small!. + llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row); +} +#endif /* LV_HAVE_RVVSEG */ + #endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ */ diff --git a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h index c97da33d..6ebcd22e 100644 --- a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h +++ b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h @@ -162,5 +162,62 @@ static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx2(float* llrs, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +static inline void volk_32f_8u_polarbutterflypuppet_32f_rvv(float* llrs, + const float* input, + unsigned char* u, + const int elements) +{ + (void)input; // suppress unused parameter warning + + if (elements < 2) { + return; + } + + unsigned int frame_size = maximum_frame_size(elements); + unsigned int frame_exp = log2_of_power_of_2(frame_size); + + sanitize_bytes(u, elements); + clean_up_intermediate_values(llrs, u, frame_size, elements); + generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); + + unsigned int u_num = 0; + for (; u_num < frame_size; u_num++) { + volk_32f_8u_polarbutterfly_32f_rvv(llrs, u, frame_exp, 0, u_num, u_num); + u[u_num] = llrs[u_num] > 0 ? 0 : 1; + } + + clean_up_intermediate_values(llrs, u, frame_size, elements); +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVVSEG +static inline void volk_32f_8u_polarbutterflypuppet_32f_rvvseg(float* llrs, + const float* input, + unsigned char* u, + const int elements) +{ + (void)input; // suppress unused parameter warning + + if (elements < 2) { + return; + } + + unsigned int frame_size = maximum_frame_size(elements); + unsigned int frame_exp = log2_of_power_of_2(frame_size); + + sanitize_bytes(u, elements); + clean_up_intermediate_values(llrs, u, frame_size, elements); + generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); + + unsigned int u_num = 0; + for (; u_num < frame_size; u_num++) { + volk_32f_8u_polarbutterfly_32f_rvvseg(llrs, u, frame_exp, 0, u_num, u_num); + u[u_num] = llrs[u_num] > 0 ? 
0 : 1; + } + + clean_up_intermediate_values(llrs, u, frame_size, elements); +} +#endif /* LV_HAVE_RVVSEG */ #endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLYPUPPET_32F_H_ */ diff --git a/kernels/volk/volk_32f_accumulator_s32f.h b/kernels/volk/volk_32f_accumulator_s32f.h index 1cd8568e..7e9a81f7 100644 --- a/kernels/volk/volk_32f_accumulator_s32f.h +++ b/kernels/volk/volk_32f_accumulator_s32f.h @@ -232,4 +232,26 @@ static inline void volk_32f_accumulator_s32f_generic(float* result, } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32f_accumulator_s32f_rvv(float* result, + const float* inputBuffer, + unsigned int num_points) +{ + vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputBuffer += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(inputBuffer, vl); + vsum = __riscv_vfadd_tu(vsum, vsum, v, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t v = RISCV_SHRINK8(vfadd, f, 32, vsum); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = __riscv_vfmv_f(__riscv_vfredusum(v, z, vl)); +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */ diff --git a/kernels/volk/volk_32f_acos_32f.h b/kernels/volk/volk_32f_acos_32f.h index 5cf0d693..4331987c 100644 --- a/kernels/volk/volk_32f_acos_32f.h +++ b/kernels/volk/volk_32f_acos_32f.h @@ -102,13 +102,15 @@ static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector, x = _mm256_add_ps( z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - for (i = 0; i < 2; i++) + for (i = 0; i < 2; i++) { x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); + } x = _mm256_div_ps(fones, x); y = fzeroes; - for (j = ACOS_TERMS - 1; j >= 0; j--) + for (j = ACOS_TERMS - 1; j >= 0; j--) { y = _mm256_fmadd_ps( y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + } y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); @@ -171,14 +173,16 @@ volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p x = _mm256_add_ps( z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - for (i = 0; i < 2; i++) + for (i = 0; i < 2; i++) { x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); + } x = _mm256_div_ps(fones, x); y = fzeroes; - for (j = ACOS_TERMS - 1; j >= 0; j--) + for (j = ACOS_TERMS - 1; j >= 0; j--) { y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + } y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); @@ -240,13 +244,15 @@ volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu condition = _mm_cmplt_ps(z, fones); x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); - for (i = 0; i < 2; i++) + for (i = 0; i < 2; i++) { x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); + } x = _mm_div_ps(fones, x); y = fzeroes; - for (j = ACOS_TERMS - 1; j >= 0; j--) + for (j = ACOS_TERMS - 1; j >= 0; j--) { y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1, j) / (2 * j + 1))); + } y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); condition = _mm_cmpgt_ps(z, fones); @@ -315,13 +321,15 @@ static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector, x = _mm256_add_ps( z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - for 
(i = 0; i < 2; i++) + for (i = 0; i < 2; i++) { x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); + } x = _mm256_div_ps(fones, x); y = fzeroes; - for (j = ACOS_TERMS - 1; j >= 0; j--) + for (j = ACOS_TERMS - 1; j >= 0; j--) { y = _mm256_fmadd_ps( y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + } y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); @@ -384,14 +392,16 @@ volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p x = _mm256_add_ps( z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - for (i = 0; i < 2; i++) + for (i = 0; i < 2; i++) { x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); + } x = _mm256_div_ps(fones, x); y = fzeroes; - for (j = ACOS_TERMS - 1; j >= 0; j--) + for (j = ACOS_TERMS - 1; j >= 0; j--) { y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + } y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); @@ -453,14 +463,16 @@ volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu condition = _mm_cmplt_ps(z, fones); x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); - for (i = 0; i < 2; i++) + for (i = 0; i < 2; i++) { x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); + } x = _mm_div_ps(fones, x); y = fzeroes; - for (j = ACOS_TERMS - 1; j >= 0; j--) + for (j = ACOS_TERMS - 1; j >= 0; j--) { y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1, j) / (2 * j + 1))); + } y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); condition = _mm_cmpgt_ps(z, fones); @@ -501,4 +513,72 @@ volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void +volk_32f_acos_32f_rvv(float* bVector, const float* aVector, unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + + const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax); + const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax); + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vfloat32m2_t cf2 = __riscv_vfmv_v_f_f32m2(2.0f, vlmax); + const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax); + +#if ACOS_TERMS == 2 + const vfloat32m2_t cfm1o3 = __riscv_vfmv_v_f_f32m2(-1 / 3.0f, vlmax); +#elif ACOS_TERMS == 3 + const vfloat32m2_t cf1o5 = __riscv_vfmv_v_f_f32m2(1 / 5.0f, vlmax); +#elif ACOS_TERMS == 4 + const vfloat32m2_t cfm1o7 = __riscv_vfmv_v_f_f32m2(-1 / 7.0f, vlmax); +#endif + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl); + vfloat32m2_t a = + __riscv_vfdiv(__riscv_vfsqrt(__riscv_vfmsac(cf1, v, v, vl), vl), v, vl); + vfloat32m2_t z = __riscv_vfabs(a, vl); + vfloat32m2_t x = __riscv_vfdiv_mu(__riscv_vmflt(z, cf1, vl), z, cf1, z, vl); + x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl); + x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl); + x = __riscv_vfdiv(cf1, x, vl); + vfloat32m2_t xx = __riscv_vfmul(x, x, vl); + +#if ACOS_TERMS < 1 + vfloat32m2_t y = __riscv_vfmv_v_f_f32m2(0, vl); +#elif ACOS_TERMS == 1 + y = __riscv_vfmadd(y, xx, cf1, vl); +#elif ACOS_TERMS == 2 + vfloat32m2_t y = cfm1o3; + y = __riscv_vfmadd(y, xx, cf1, vl); +#elif ACOS_TERMS == 
3 + vfloat32m2_t y = cf1o5; + y = __riscv_vfmadd(y, xx, cfm1o3, vl); + y = __riscv_vfmadd(y, xx, cf1, vl); +#elif ACOS_TERMS == 4 + vfloat32m2_t y = cfm1o7; + y = __riscv_vfmadd(y, xx, cf1o5, vl); + y = __riscv_vfmadd(y, xx, cfm1o3, vl); + y = __riscv_vfmadd(y, xx, cf1, vl); +#else +#error "ACOS_TERMS > 4 not supported by volk_32f_acos_32f_rvv" +#endif + y = __riscv_vfmul(y, __riscv_vfmul(x, cf4, vl), vl); + y = __riscv_vfadd_mu( + __riscv_vmfgt(z, cf1, vl), y, y, __riscv_vfnmsub(y, cf2, cpio2, vl), vl); + + vfloat32m2_t acosine; + acosine = __riscv_vfneg_mu(RISCV_VMFLTZ(32m2, a, vl), y, y, vl); + acosine = __riscv_vfadd_mu(RISCV_VMFLTZ(32m2, v, vl), acosine, acosine, cpi, vl); + + __riscv_vse32(bVector, acosine, vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_acos_32f_u_H */ diff --git a/kernels/volk/volk_32f_asin_32f.h b/kernels/volk/volk_32f_asin_32f.h index 09377163..1914c39e 100644 --- a/kernels/volk/volk_32f_asin_32f.h +++ b/kernels/volk/volk_32f_asin_32f.h @@ -486,4 +486,70 @@ volk_32f_asin_32f_generic(float* bVector, const float* aVector, unsigned int num } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void +volk_32f_asin_32f_rvv(float* bVector, const float* aVector, unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + + const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax); + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vfloat32m2_t cf2 = __riscv_vfmv_v_f_f32m2(2.0f, vlmax); + const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax); + +#if ASIN_TERMS == 2 + const vfloat32m2_t cfm1o3 = __riscv_vfmv_v_f_f32m2(-1 / 3.0f, vlmax); +#elif ASIN_TERMS == 3 + const vfloat32m2_t cf1o5 = __riscv_vfmv_v_f_f32m2(1 / 5.0f, vlmax); +#elif ASIN_TERMS == 4 + const vfloat32m2_t cfm1o7 = __riscv_vfmv_v_f_f32m2(-1 / 7.0f, vlmax); +#endif + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl); + vfloat32m2_t a = + __riscv_vfdiv(__riscv_vfsqrt(__riscv_vfmsac(cf1, v, v, vl), vl), v, vl); + vfloat32m2_t z = __riscv_vfabs(a, vl); + vfloat32m2_t x = __riscv_vfdiv_mu(__riscv_vmflt(z, cf1, vl), z, cf1, z, vl); + x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl); + x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl); + x = __riscv_vfdiv(cf1, x, vl); + vfloat32m2_t xx = __riscv_vfmul(x, x, vl); + +#if ASIN_TERMS < 1 + vfloat32m2_t y = __riscv_vfmv_v_f_f32m2(0, vl); +#elif ASIN_TERMS == 1 + y = __riscv_vfmadd(y, xx, cf1, vl); +#elif ASIN_TERMS == 2 + vfloat32m2_t y = cfm1o3; + y = __riscv_vfmadd(y, xx, cf1, vl); +#elif ASIN_TERMS == 3 + vfloat32m2_t y = cf1o5; + y = __riscv_vfmadd(y, xx, cfm1o3, vl); + y = __riscv_vfmadd(y, xx, cf1, vl); +#elif ASIN_TERMS == 4 + vfloat32m2_t y = cfm1o7; + y = __riscv_vfmadd(y, xx, cf1o5, vl); + y = __riscv_vfmadd(y, xx, cfm1o3, vl); + y = __riscv_vfmadd(y, xx, cf1, vl); +#else +#error "ASIN_TERMS > 4 not supported by volk_32f_asin_32f_rvv" +#endif + y = __riscv_vfmul(y, __riscv_vfmul(x, cf4, vl), vl); + y = __riscv_vfadd_mu( + __riscv_vmfgt(z, cf1, vl), y, y, __riscv_vfnmsub(y, cf2, cpio2, vl), vl); + + vfloat32m2_t asine; + asine = __riscv_vfneg_mu(RISCV_VMFLTZ(32m2, a, vl), y, y, vl); + + __riscv_vse32(bVector, asine, vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_asin_32f_u_H */ diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h index 
dc5987cb..300f46ca 100644 --- a/kernels/volk/volk_32f_atan_32f.h +++ b/kernels/volk/volk_32f_atan_32f.h @@ -293,4 +293,46 @@ volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points) } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_atan_32f_rvv(float* out, const float* in, unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + + const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax); + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax); + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax); + const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax); + const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax); + const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax); + const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, out += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2_t v = __riscv_vle32_v_f32m2(in, vl); + vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(v, vl), cf1, vl); + vfloat32m2_t x = __riscv_vfdiv_mu(mswap, v, cf1, v, vl); + vfloat32m2_t xx = __riscv_vfmul(x, x, vl); + vfloat32m2_t p = c13; + p = __riscv_vfmadd(p, xx, c11, vl); + p = __riscv_vfmadd(p, xx, c9, vl); + p = __riscv_vfmadd(p, xx, c7, vl); + p = __riscv_vfmadd(p, xx, c5, vl); + p = __riscv_vfmadd(p, xx, c3, vl); + p = __riscv_vfmadd(p, xx, c1, vl); + p = __riscv_vfmul(p, x, vl); + + vfloat32m2_t t = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl); + p = __riscv_vmerge(p, t, mswap, vl); + + __riscv_vse32(out, p, vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_atan_32f_u_H */ diff --git a/kernels/volk/volk_32f_binary_slicer_32i.h b/kernels/volk/volk_32f_binary_slicer_32i.h index 7606145b..861ef478 100644 --- a/kernels/volk/volk_32f_binary_slicer_32i.h +++ b/kernels/volk/volk_32f_binary_slicer_32i.h @@ -261,5 +261,21 @@ static inline void volk_32f_binary_slicer_32i_u_avx(int* cVector, } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_binary_slicer_32i_rvv(int* cVector, + const float* aVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t*)aVector, vl); + v = __riscv_vsrl(__riscv_vnot(v, vl), 31, vl); + __riscv_vse32((uint32_t*)cVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_binary_slicer_32i_H */ diff --git a/kernels/volk/volk_32f_binary_slicer_8i.h b/kernels/volk/volk_32f_binary_slicer_8i.h index c6929db4..9623ae90 100644 --- a/kernels/volk/volk_32f_binary_slicer_8i.h +++ b/kernels/volk/volk_32f_binary_slicer_8i.h @@ -500,5 +500,22 @@ static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_binary_slicer_8i_rvv(int8_t* cVector, + const float* aVector, + unsigned int num_points) +{ + size_t n = num_points; + vint8m2_t v0 = __riscv_vmv_v_x_i8m2(1, __riscv_vsetvlmax_e8m2()); + for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl); + vint8m2_t vn = __riscv_vmerge(v0, 0, __riscv_vmflt(v, 0, 
vl), vl); + __riscv_vse8(cVector, vn, vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_binary_slicer_8i_H */ diff --git a/kernels/volk/volk_32f_convert_64f.h b/kernels/volk/volk_32f_convert_64f.h index 93d1c611..5e907d39 100644 --- a/kernels/volk/volk_32f_convert_64f.h +++ b/kernels/volk/volk_32f_convert_64f.h @@ -230,5 +230,20 @@ static inline void volk_32f_convert_64f_a_sse2(double* outputVector, } #endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_convert_64f_rvv(double* outputVector, + const float* inputVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4_t v = __riscv_vle32_v_f32m4(inputVector, vl); + __riscv_vse64(outputVector, __riscv_vfwcvt_f(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_convert_64f_a_H */ diff --git a/kernels/volk/volk_32f_cos_32f.h b/kernels/volk/volk_32f_cos_32f.h index 37785df0..854dd00e 100644 --- a/kernels/volk/volk_32f_cos_32f.h +++ b/kernels/volk/volk_32f_cos_32f.h @@ -127,8 +127,9 @@ static inline void volk_32f_cos_32f_a_avx512f(float* cosVector, cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s)); + } s = _mm512_div_ps(s, ftwos); sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s)); @@ -224,8 +225,9 @@ volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int n cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + } s = _mm256_div_ps(s, ftwos); sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); @@ -335,8 +337,9 @@ volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_p cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + } s = _mm256_div_ps(s, ftwos); sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); @@ -442,8 +445,9 @@ volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); + } s = _mm_div_ps(s, ftwos); sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); @@ -546,8 +550,9 @@ static inline void volk_32f_cos_32f_u_avx512f(float* cosVector, cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s)); + } s = _mm512_div_ps(s, ftwos); sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s)); @@ -644,8 +649,9 @@ volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int n cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + } s = _mm256_div_ps(s, ftwos); sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); @@ -755,8 +761,9 @@ volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_p cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + } s = _mm256_div_ps(s, ftwos); sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); @@ -995,5 +1002,65 @@ volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_poi #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_cos_32f_rvv(float* bVector, const float* aVector, unsigned int num_points) +{ + size_t vlmax = 
__riscv_vsetvlmax_e32m2(); + + const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax); + const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax); + const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax); + const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax); + + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax); + + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax); + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05f, vlmax); + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07f, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl); + vfloat32m2_t s = __riscv_vfabs(v, vl); + vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl); + vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl); + + s = __riscv_vfnmsac(s, cPio4a, r, vl); + s = __riscv_vfnmsac(s, cPio4b, r, vl); + s = __riscv_vfnmsac(s, cPio4c, r, vl); + + s = __riscv_vfmul(s, 1 / 8.0f, vl); + s = __riscv_vfmul(s, s, vl); + vfloat32m2_t t = s; + s = __riscv_vfmsub(s, c5, c4, vl); + s = __riscv_vfmadd(s, t, c3, vl); + s = __riscv_vfmsub(s, t, c2, vl); + s = __riscv_vfmadd(s, t, cf1, vl); + s = __riscv_vfmul(s, t, vl); + s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl); + s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl); + s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl); + s = __riscv_vfmul(s, 1 / 2.0f, vl); + + vfloat32m2_t sine = + __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl); + vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl); + + vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl); + vbool16_t m2 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 2, vl), 4, vl), 0, vl); + + cosine = __riscv_vmerge(cosine, sine, m1, vl); + cosine = __riscv_vfneg_mu(m2, cosine, cosine, vl); + + __riscv_vse32(bVector, cosine, vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_cos_32f_u_H */ diff --git a/kernels/volk/volk_32f_exp_32f.h b/kernels/volk/volk_32f_exp_32f.h index 13d21201..85571dbc 100644 --- a/kernels/volk/volk_32f_exp_32f.h +++ b/kernels/volk/volk_32f_exp_32f.h @@ -266,4 +266,58 @@ volk_32f_exp_32f_generic(float* bVector, const float* aVector, unsigned int num_ #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_exp_32f_rvv(float* bVector, const float* aVector, unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + + const vfloat32m2_t exp_hi = __riscv_vfmv_v_f_f32m2(88.376259f, vlmax); + const vfloat32m2_t exp_lo = __riscv_vfmv_v_f_f32m2(-88.376259f, vlmax); + const vfloat32m2_t log2EF = __riscv_vfmv_v_f_f32m2(1.442695f, vlmax); + const vfloat32m2_t exp_C1 = __riscv_vfmv_v_f_f32m2(-0.6933594f, vlmax); + const vfloat32m2_t exp_C2 = __riscv_vfmv_v_f_f32m2(0.000212194f, vlmax); + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vfloat32m2_t cf1o2 = __riscv_vfmv_v_f_f32m2(0.5f, vlmax); + + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(1.9875691500e-4, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(1.3981999507e-3, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(8.3334519073e-3, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(4.1665795894e-2, vlmax); 
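    /* The fixed constants here appear to follow the usual expf() range
     * reduction: exp(x) = 2^n * exp(r) with n = round(x * log2(e)) and
     * r = x - n*ln(2), where ln(2) is removed in two pieces (exp_C1 + exp_C2)
     * to limit rounding error; exp(r) is then evaluated as 1 + r + r^2*P(r)
     * using the degree-5 polynomial c0..c5, and the final 2^n scaling is
     * applied by adding n to the IEEE-754 exponent bits (pow2n below). */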
+ const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(1.6666665459e-1, vlmax); + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.0000001201e-1, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl); + v = __riscv_vfmin(v, exp_hi, vl); + v = __riscv_vfmax(v, exp_lo, vl); + vfloat32m2_t fx = __riscv_vfmadd(v, log2EF, cf1o2, vl); + + vfloat32m2_t rtz = __riscv_vfcvt_f(__riscv_vfcvt_rtz_x(fx, vl), vl); + fx = __riscv_vfsub_mu(__riscv_vmfgt(rtz, fx, vl), rtz, rtz, cf1, vl); + v = __riscv_vfmacc(v, fx, exp_C1, vl); + v = __riscv_vfmacc(v, fx, exp_C2, vl); + vfloat32m2_t vv = __riscv_vfmul(v, v, vl); + + vfloat32m2_t y = c0; + y = __riscv_vfmadd(y, v, c1, vl); + y = __riscv_vfmadd(y, v, c2, vl); + y = __riscv_vfmadd(y, v, c3, vl); + y = __riscv_vfmadd(y, v, c4, vl); + y = __riscv_vfmadd(y, v, c5, vl); + y = __riscv_vfmadd(y, vv, v, vl); + y = __riscv_vfadd(y, cf1, vl); + + vfloat32m2_t pow2n = __riscv_vreinterpret_f32m2( + __riscv_vsll(__riscv_vadd(__riscv_vfcvt_rtz_x(fx, vl), 0x7f, vl), 23, vl)); + + __riscv_vse32(bVector, __riscv_vfmul(y, pow2n, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_exp_32f_u_H */ diff --git a/kernels/volk/volk_32f_expfast_32f.h b/kernels/volk/volk_32f_expfast_32f.h index 7dfbaacb..3b65968a 100644 --- a/kernels/volk/volk_32f_expfast_32f.h +++ b/kernels/volk/volk_32f_expfast_32f.h @@ -301,4 +301,25 @@ static inline void volk_32f_expfast_32f_generic(float* bVector, } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_expfast_32f_rvv(float* bVector, const float* aVector, unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m8(); + const vfloat32m8_t ca = __riscv_vfmv_v_f_f32m8(A / Mln2, vlmax); + const vfloat32m8_t cb = __riscv_vfmv_v_f_f32m8(B - C, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl); + v = __riscv_vfmadd(v, ca, cb, vl); + v = __riscv_vreinterpret_f32m8(__riscv_vfcvt_x(v, vl)); + __riscv_vse32(bVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_expfast_32f_u_H */ diff --git a/kernels/volk/volk_32f_index_max_16u.h b/kernels/volk/volk_32f_index_max_16u.h index 2aad087e..3e7c0fb9 100644 --- a/kernels/volk/volk_32f_index_max_16u.h +++ b/kernels/volk/volk_32f_index_max_16u.h @@ -359,4 +359,32 @@ volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_p #endif /*LV_HAVE_AVX*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void +volk_32f_index_max_16u_rvv(uint16_t* target, const float* src0, uint32_t num_points) +{ + vfloat32m8_t vmax = __riscv_vfmv_v_f_f32m8(-FLT_MAX, __riscv_vsetvlmax_e32m8()); + vuint16m4_t vmaxi = __riscv_vmv_v_x_u16m4(0, __riscv_vsetvlmax_e16m4()); + vuint16m4_t vidx = __riscv_vid_v_u16m4(__riscv_vsetvlmax_e16m4()); + size_t n = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(src0, vl); + vbool4_t m = __riscv_vmfgt(v, vmax, vl); + vmax = __riscv_vfmax_tu(vmax, vmax, v, vl); + vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4()); + } + size_t vl = __riscv_vsetvlmax_e32m8(); + float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK8(vfmax, f, 32, vmax), + __riscv_vfmv_v_f_f32m1(-FLT_MAX, 1), + __riscv_vsetvlmax_e32m1())); + vbool4_t m = __riscv_vmfeq(vmax, max, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVV*/ + #endif /*INCLUDED_volk_32f_index_max_16u_u_H*/ diff --git a/kernels/volk/volk_32f_index_max_32u.h b/kernels/volk/volk_32f_index_max_32u.h index 86dad0d1..0bf071fc 100644 --- a/kernels/volk/volk_32f_index_max_32u.h +++ b/kernels/volk/volk_32f_index_max_32u.h @@ -542,4 +542,32 @@ volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_p #endif /*LV_HAVE_SSE*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void +volk_32f_index_max_32u_rvv(uint32_t* target, const float* src0, uint32_t num_points) +{ + vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(-FLT_MAX, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4_t v = __riscv_vle32_v_f32m4(src0, vl); + vbool8_t m = __riscv_vmfgt(v, vmax, vl); + vmax = __riscv_vfmax_tu(vmax, vmax, v, vl); + vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax), + __riscv_vfmv_v_f_f32m1(-FLT_MAX, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmax, max, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVV*/ + #endif /*INCLUDED_volk_32f_index_max_32u_u_H*/ diff --git a/kernels/volk/volk_32f_index_min_16u.h b/kernels/volk/volk_32f_index_min_16u.h index 000ecafc..5e1f0aa1 100644 --- a/kernels/volk/volk_32f_index_min_16u.h +++ b/kernels/volk/volk_32f_index_min_16u.h @@ -346,4 +346,32 @@ volk_32f_index_min_16u_u_avx(uint16_t* target, const float* source, uint32_t num #endif /*LV_HAVE_AVX*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void +volk_32f_index_min_16u_rvv(uint16_t* target, const float* src0, uint32_t num_points) +{ + vfloat32m8_t vmin = __riscv_vfmv_v_f_f32m8(FLT_MAX, __riscv_vsetvlmax_e32m8()); + vuint16m4_t vmini = __riscv_vmv_v_x_u16m4(0, __riscv_vsetvlmax_e16m4()); + vuint16m4_t vidx = __riscv_vid_v_u16m4(__riscv_vsetvlmax_e16m4()); + size_t n = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(src0, vl); + vbool4_t m = __riscv_vmflt(v, vmin, vl); + vmin = __riscv_vfmin_tu(vmin, vmin, v, vl); + vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4()); + } + size_t vl = __riscv_vsetvlmax_e32m8(); + float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK8(vfmin, f, 32, vmin), + __riscv_vfmv_v_f_f32m1(FLT_MAX, 1), + __riscv_vsetvlmax_e32m1())); + vbool4_t m = __riscv_vmfeq(vmin, min, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVV*/ + #endif /*INCLUDED_volk_32f_index_min_16u_u_H*/ diff --git a/kernels/volk/volk_32f_index_min_32u.h b/kernels/volk/volk_32f_index_min_32u.h index 0c8bf8c0..7d01fbb4 100644 --- a/kernels/volk/volk_32f_index_min_32u.h +++ b/kernels/volk/volk_32f_index_min_32u.h @@ -508,4 +508,32 @@ volk_32f_index_min_32u_u_sse(uint32_t* target, const float* source, uint32_t num #endif /*LV_HAVE_SSE*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void +volk_32f_index_min_32u_rvv(uint32_t* target, const float* src0, uint32_t num_points) +{ + vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4_t v = __riscv_vle32_v_f32m4(src0, vl); + vbool8_t m = __riscv_vmflt(v, vmin, vl); + vmin = __riscv_vfmin_tu(vmin, vmin, v, vl); + vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin), + __riscv_vfmv_v_f_f32m1(FLT_MAX, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmin, min, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVV*/ + #endif /*INCLUDED_volk_32f_index_min_32u_u_H*/ diff --git a/kernels/volk/volk_32f_invsqrt_32f.h b/kernels/volk/volk_32f_invsqrt_32f.h index e91b6c7c..838c9927 100644 --- a/kernels/volk/volk_32f_invsqrt_32f.h +++ b/kernels/volk/volk_32f_invsqrt_32f.h @@ -97,8 +97,9 @@ volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int nu } number = eighthPoints * 8; - for (; number < num_points; number++) + for (; number < num_points; number++) { *cPtr++ = Q_rsqrt(*aPtr++); + } } #endif /* LV_HAVE_AVX */ @@ -156,8 +157,9 @@ volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num cPtr += 4; } - for (number = quarter_points * 4; number < num_points; number++) + for (number = quarter_points * 4; number < num_points; number++) { *cPtr++ = Q_rsqrt(*aPtr++); + } } #endif /* LV_HAVE_NEON */ @@ -198,9 +200,25 @@ volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int nu } number = eighthPoints * 8; - for (; number < num_points; number++) + for (; number < num_points; number++) { *cPtr++ = Q_rsqrt(*aPtr++); + } } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_invsqrt_32f_rvv(float* cVector, const float* aVector, unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + 
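    /* __riscv_vfrsqrt7 is the RVV reciprocal-square-root *estimate*
     * instruction (roughly 7 bits of precision per the V spec), so this
     * kernel trades accuracy for speed in the same spirit as the Q_rsqrt
     * tail loops of the other implementations above. */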
vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl); + __riscv_vse32(cVector, __riscv_vfrsqrt7(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_invsqrt_32f_a_H */ diff --git a/kernels/volk/volk_32f_log2_32f.h b/kernels/volk/volk_32f_log2_32f.h index 0443e56e..47a7cbe3 100644 --- a/kernels/volk/volk_32f_log2_32f.h +++ b/kernels/volk/volk_32f_log2_32f.h @@ -95,8 +95,9 @@ volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num const float* aPtr = aVector; unsigned int number = 0; - for (number = 0; number < num_points; number++) + for (number = 0; number < num_points; number++) { *bPtr++ = log2f_non_ieee(*aPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -718,5 +719,73 @@ volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_ #endif /* LV_HAVE_AVX2 for unaligned */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_log2_32f_rvv(float* bVector, const float* aVector, unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + +#if LOG_POLY_DEGREE == 6 + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3.1157899f, vlmax); + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(-3.3241990f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.5988452f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.2315303f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(3.1821337e-1f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-3.4436006e-2f, vlmax); +#elif LOG_POLY_DEGREE == 5 + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(2.8882704548164776201f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-2.52074962577807006663f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(1.48116647521213171641f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-0.465725644288844778798f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.0596515482674574969533f, vlmax); +#elif LOG_POLY_DEGREE == 4 + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.61761038894603480148f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.75647175389045657003f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(0.688243882994381274313f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-0.107254423828329604454f, vlmax); +#elif LOG_POLY_DEGREE == 3 + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(2.28330284476918490682f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-1.04913055217340124191f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.204446009836232697516f, vlmax); +#else +#error +#endif + + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vint32m2_t m1 = __riscv_vreinterpret_i32m2(cf1); + const vint32m2_t m2 = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax); + const vint32m2_t c127 = __riscv_vmv_v_x_i32m2(127, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl); + vfloat32m2_t a = __riscv_vfabs(v, vl); + vfloat32m2_t exp = __riscv_vfcvt_f( + __riscv_vsub(__riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), c127, vl), + vl); + vfloat32m2_t frac = __riscv_vreinterpret_f32m2( + __riscv_vor(__riscv_vand(__riscv_vreinterpret_i32m2(v), m2, vl), m1, vl)); + + vfloat32m2_t mant = c0; + mant = __riscv_vfmadd(mant, frac, c1, vl); + mant = __riscv_vfmadd(mant, frac, c2, vl); +#if LOG_POLY_DEGREE >= 4 + mant = __riscv_vfmadd(mant, frac, c3, vl); +#if LOG_POLY_DEGREE >= 5 + mant = __riscv_vfmadd(mant, frac, 
c4, vl); +#if LOG_POLY_DEGREE >= 6 + mant = __riscv_vfmadd(mant, frac, c5, vl); +#endif +#endif +#endif + exp = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl); + + __riscv_vse32(bVector, exp, vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_log2_32f_u_H */ diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h index 37bd16a8..f44a9885 100644 --- a/kernels/volk/volk_32f_reciprocal_32f.h +++ b/kernels/volk/volk_32f_reciprocal_32f.h @@ -198,4 +198,19 @@ volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_p } #endif /* LV_HAVE_AVX512F */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_reciprocal_32f_rvv(float* out, const float* in, unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, out += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl); + __riscv_vse32(out, __riscv_vfrdiv(v, 1.0f, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_reciprocal_32f_u_H */ diff --git a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h index a6eb37c2..607bd6d8 100644 --- a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h +++ b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h @@ -335,4 +335,41 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector, #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_s32f_32f_fm_detect_32f_rvv(float* outputVector, + const float* inputVector, + const float bound, + float* saveValue, + unsigned int num_points) +{ + if (num_points < 1) + return; + + *outputVector = *inputVector - *saveValue; + if (*outputVector > bound) + *outputVector -= 2 * bound; + if (*outputVector < -bound) + *outputVector += 2 * bound; + ++inputVector; + ++outputVector; + + vfloat32m8_t v2bound = __riscv_vfmv_v_f_f32m8(bound * 2, __riscv_vsetvlmax_e32m8()); + + size_t n = num_points - 1; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t va = __riscv_vle32_v_f32m8(inputVector, vl); + vfloat32m8_t vb = __riscv_vle32_v_f32m8(inputVector - 1, vl); + vfloat32m8_t v = __riscv_vfsub(va, vb, vl); + v = __riscv_vfsub_mu(__riscv_vmfgt(v, bound, vl), v, v, v2bound, vl); + v = __riscv_vfadd_mu(__riscv_vmflt(v, -bound, vl), v, v, v2bound, vl); + __riscv_vse32(outputVector, v, vl); + } + + *saveValue = inputVector[-1]; +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H */ diff --git a/kernels/volk/volk_32f_s32f_add_32f.h b/kernels/volk/volk_32f_s32f_add_32f.h index d7ae2aa1..e3301a7a 100644 --- a/kernels/volk/volk_32f_s32f_add_32f.h +++ b/kernels/volk/volk_32f_s32f_add_32f.h @@ -258,4 +258,21 @@ static inline void volk_32f_s32f_add_32f_u_orc(float* cVector, } #endif /* LV_HAVE_ORC */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_s32f_add_32f_rvv(float* cVector, + const float* aVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl); + __riscv_vse32(cVector, __riscv_vfadd(v, scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_s32f_add_32f_a_H */ diff --git a/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h b/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h index 816f6092..368a987a 
100644 --- a/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h +++ b/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h @@ -52,6 +52,8 @@ #include #include +#include + #ifdef LV_HAVE_AVX #include @@ -458,4 +460,37 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_u_avx(float* noiseFloorAmplitude, *noiseFloorAmplitude = localNoiseFloorAmplitude; } #endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_s32f_calc_spectral_noise_floor_32f_rvv(float* noiseFloorAmplitude, + const float* realDataPoints, + const float spectralExclusionValue, + const unsigned int num_points) +{ + float sum; + volk_32f_accumulator_s32f_rvv(&sum, realDataPoints, num_points); + float meanAmplitude = sum / num_points + spectralExclusionValue; + + vfloat32m8_t vbin = __riscv_vfmv_v_f_f32m8(meanAmplitude, __riscv_vsetvlmax_e32m8()); + vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8()); + size_t n = num_points, binCount = 0; + for (size_t vl; n > 0; n -= vl, realDataPoints += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(realDataPoints, vl); + vbool4_t m = __riscv_vmfle(v, vbin, vl); + binCount += __riscv_vcpop(m, vl); + vsum = __riscv_vfadd_tumu(m, vsum, vsum, v, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t v = RISCV_SHRINK8(vfadd, f, 32, vsum); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + sum = __riscv_vfmv_f(__riscv_vfredusum(v, z, vl)); + + *noiseFloorAmplitude = binCount == 0 ? meanAmplitude : sum / binCount; +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H */ diff --git a/kernels/volk/volk_32f_s32f_clamppuppet_32f.h b/kernels/volk/volk_32f_s32f_clamppuppet_32f.h index 254bfdd5..b4a0e3af 100644 --- a/kernels/volk/volk_32f_s32f_clamppuppet_32f.h +++ b/kernels/volk/volk_32f_s32f_clamppuppet_32f.h @@ -62,4 +62,14 @@ static inline void volk_32f_s32f_clamppuppet_32f_u_sse4_1(float* out, } #endif +#ifdef LV_HAVE_RVV +static inline void volk_32f_s32f_clamppuppet_32f_rvv(float* out, + const float* in, + const float min, + unsigned int num_points) +{ + volk_32f_s32f_x2_clamp_32f_rvv(out, in, min, -min, num_points); +} +#endif + #endif /* INCLUDED_volk_32f_s32f_clamppuppet_32f_H */ diff --git a/kernels/volk/volk_32f_s32f_convert_16i.h b/kernels/volk/volk_32f_s32f_convert_16i.h index fe5a31b3..667e97f6 100644 --- a/kernels/volk/volk_32f_s32f_convert_16i.h +++ b/kernels/volk/volk_32f_s32f_convert_16i.h @@ -552,5 +552,22 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_s32f_convert_16i_rvv(int16_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(inputVector, vl); + v = __riscv_vfmul(v, scalar, vl); + __riscv_vse16(outputVector, __riscv_vfncvt_x(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */ diff --git a/kernels/volk/volk_32f_s32f_convert_32i.h b/kernels/volk/volk_32f_s32f_convert_32i.h index 0cd9dee8..b7b6fb1a 100644 --- a/kernels/volk/volk_32f_s32f_convert_32i.h +++ b/kernels/volk/volk_32f_s32f_convert_32i.h @@ -405,5 +405,22 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_RVV +#include + +static inline void 
volk_32f_s32f_convert_32i_rvv(int32_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(inputVector, vl); + v = __riscv_vfmul(v, scalar, vl); + __riscv_vse32(outputVector, __riscv_vfcvt_x(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_s32f_convert_32i_a_H */ diff --git a/kernels/volk/volk_32f_s32f_convert_8i.h b/kernels/volk/volk_32f_s32f_convert_8i.h index d47f95a0..a21ae7aa 100644 --- a/kernels/volk/volk_32f_s32f_convert_8i.h +++ b/kernels/volk/volk_32f_s32f_convert_8i.h @@ -437,5 +437,22 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_s32f_convert_8i_rvv(int8_t* outputVector, + const float* inputVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(inputVector, vl); + vint16m4_t vi = __riscv_vfncvt_x(__riscv_vfmul(v, scalar, vl), vl); + __riscv_vse8(outputVector, __riscv_vnclip(vi, 0, 0, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */ diff --git a/kernels/volk/volk_32f_s32f_convertpuppet_8u.h b/kernels/volk/volk_32f_s32f_convertpuppet_8u.h index 7f530c44..aa1258ba 100644 --- a/kernels/volk/volk_32f_s32f_convertpuppet_8u.h +++ b/kernels/volk/volk_32f_s32f_convertpuppet_8u.h @@ -102,4 +102,15 @@ static inline void volk_32f_s32f_convertpuppet_8u_a_sse(uint8_t* output, volk_32f_s32f_x2_convert_8u_a_sse(output, input, scale, 128.0, num_points); } #endif + +#ifdef LV_HAVE_RVV +static inline void volk_32f_s32f_convertpuppet_8u_rvv(uint8_t* output, + const float* input, + float scale, + unsigned int num_points) +{ + volk_32f_s32f_x2_convert_8u_rvv(output, input, scale, 128.0, num_points); +} +#endif + #endif diff --git a/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h b/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h index 3a178596..f4a7a2b0 100644 --- a/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h +++ b/kernels/volk/volk_32f_s32f_mod_rangepuppet_32f.h @@ -86,4 +86,14 @@ static inline void volk_32f_s32f_mod_rangepuppet_32f_a_avx(float* output, output, input, bound - 3.131f, bound, num_points); } #endif +#ifdef LV_HAVE_RVV +static inline void volk_32f_s32f_mod_rangepuppet_32f_rvv(float* output, + const float* input, + float bound, + unsigned int num_points) +{ + volk_32f_s32f_s32f_mod_range_32f_rvv( + output, input, bound - 3.131f, bound, num_points); +} +#endif #endif diff --git a/kernels/volk/volk_32f_s32f_multiply_32f.h b/kernels/volk/volk_32f_s32f_multiply_32f.h index 26fc148c..27d86149 100644 --- a/kernels/volk/volk_32f_s32f_multiply_32f.h +++ b/kernels/volk/volk_32f_s32f_multiply_32f.h @@ -257,4 +257,21 @@ static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector, #endif /* LV_HAVE_ORC */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_s32f_multiply_32f_rvv(float* cVector, + const float* aVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl); + __riscv_vse32(cVector, __riscv_vfmul(v, scalar, vl), vl); + } +} 
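For comparison, a scalar reference of what this conversion kernel computes; the helper name is illustrative, and lrintf only stands in for the vector convert's round-to-nearest behaviour under the default rounding mode:

#include <math.h>
#include <stdint.h>

/* out[i] = (int32_t)round(in[i] * scalar), ties rounded to even by default */
static inline void
convert_32i_scalar_ref(int32_t* out, const float* in, float scalar, unsigned int n)
{
    for (unsigned int i = 0; i < n; i++)
        out[i] = (int32_t)lrintf(in[i] * scalar);
}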
+#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */ diff --git a/kernels/volk/volk_32f_s32f_normalize.h b/kernels/volk/volk_32f_s32f_normalize.h index 46f5799b..e572f24c 100644 --- a/kernels/volk/volk_32f_s32f_normalize.h +++ b/kernels/volk/volk_32f_s32f_normalize.h @@ -203,5 +203,19 @@ static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer, } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_s32f_normalize_rvv(float* vecBuffer, const float scalar, unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, vecBuffer += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(vecBuffer, vl); + __riscv_vse32(vecBuffer, __riscv_vfmul(v, 1.0f / scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_s32f_normalize_u_H */ diff --git a/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h b/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h index d185f102..f5176150 100644 --- a/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h +++ b/kernels/volk/volk_32f_s32f_s32f_mod_range_32f.h @@ -359,5 +359,37 @@ static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector, } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_s32f_s32f_mod_range_32f_rvv(float* outputVector, + const float* inputVector, + const float lower_bound, + const float upper_bound, + unsigned int num_points) +{ + const float dist = upper_bound - lower_bound; + size_t vlmax = __riscv_vsetvlmax_e32m4(); + vfloat32m4_t vdist = __riscv_vfmv_v_f_f32m4(dist, vlmax); + vfloat32m4_t vmdist = __riscv_vfmv_v_f_f32m4(-dist, vlmax); + vfloat32m4_t vupper = __riscv_vfmv_v_f_f32m4(upper_bound, vlmax); + vfloat32m4_t vlower = __riscv_vfmv_v_f_f32m4(lower_bound, vlmax); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, outputVector += vl, inputVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4_t v = __riscv_vle32_v_f32m4(inputVector, vl); + vfloat32m4_t vlt = __riscv_vfsub(vlower, v, vl); + vfloat32m4_t vgt = __riscv_vfsub(v, vupper, vl); + vbool8_t mlt = __riscv_vmflt(v, vlower, vl); + vfloat32m4_t vmul = __riscv_vmerge(vmdist, vdist, mlt, vl); + vfloat32m4_t vcnt = __riscv_vfdiv(__riscv_vmerge(vgt, vlt, mlt, vl), vdist, vl); + vcnt = __riscv_vfcvt_f(__riscv_vadd(__riscv_vfcvt_rtz_x(vcnt, vl), 1, vl), vl); + vbool8_t mgt = __riscv_vmfgt(v, vupper, vl); + v = __riscv_vfmacc_mu(__riscv_vmor(mlt, mgt, vl), v, vcnt, vmul, vl); + + __riscv_vse32(outputVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */ diff --git a/kernels/volk/volk_32f_s32f_stddev_32f.h b/kernels/volk/volk_32f_s32f_stddev_32f.h index 3b5bb6e1..8774277b 100644 --- a/kernels/volk/volk_32f_s32f_stddev_32f.h +++ b/kernels/volk/volk_32f_s32f_stddev_32f.h @@ -344,4 +344,32 @@ static inline void volk_32f_s32f_stddev_32f_u_avx(float* stddev, } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32f_s32f_stddev_32f_rvv(float* stddev, + const float* inputBuffer, + const float mean, + unsigned int num_points) +{ + if (num_points == 0) { + *stddev = 0; + return; + } + vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputBuffer += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(inputBuffer, vl); + vsum = __riscv_vfmacc_tu(vsum, v, v, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + 
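    /* Horizontal reduction: RISCV_SHRINK8 (from volk_rvv_intrinsics.h) is
     * expected to fold the LMUL=8 accumulator down to a single LMUL=1
     * register via pairwise vfadd; vfredusum then sums its lanes into
     * element 0, which vfmv_f reads back as a scalar. */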
vfloat32m1_t v = RISCV_SHRINK8(vfadd, f, 32, vsum); + v = __riscv_vfredusum(v, __riscv_vfmv_s_f_f32m1(0, vl), vl); + float sum = __riscv_vfmv_f(v); + *stddev = sqrtf((sum / num_points) - (mean * mean)); +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_s32f_stddev_32f_u_H */ diff --git a/kernels/volk/volk_32f_s32f_x2_clamp_32f.h b/kernels/volk/volk_32f_s32f_x2_clamp_32f.h index 19d51795..2b194eaa 100644 --- a/kernels/volk/volk_32f_s32f_x2_clamp_32f.h +++ b/kernels/volk/volk_32f_s32f_x2_clamp_32f.h @@ -187,4 +187,25 @@ static inline void volk_32f_s32f_x2_clamp_32f_u_sse4_1(float* out, } #endif /* LV_HAVE_SSE4_1 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_s32f_x2_clamp_32f_rvv(float* out, + const float* in, + const float min, + const float max, + unsigned int num_points) +{ + vfloat32m8_t vmin = __riscv_vfmv_v_f_f32m8(min, __riscv_vsetvlmax_e32m8()); + vfloat32m8_t vmax = __riscv_vfmv_v_f_f32m8(max, __riscv_vsetvlmax_e32m8()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, out += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl); + v = __riscv_vfmin(__riscv_vfmax(v, vmin, vl), vmax, vl); + __riscv_vse32(out, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H */ diff --git a/kernels/volk/volk_32f_s32f_x2_convert_8u.h b/kernels/volk/volk_32f_s32f_x2_convert_8u.h index a52cdf28..1ad2b1ac 100644 --- a/kernels/volk/volk_32f_s32f_x2_convert_8u.h +++ b/kernels/volk/volk_32f_s32f_x2_convert_8u.h @@ -612,5 +612,24 @@ static inline void volk_32f_s32f_x2_convert_8u_a_sse(uint8_t* outputVector, #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_s32f_x2_convert_8u_rvv(uint8_t* outputVector, + const float* inputVector, + const float scale, + const float bias, + unsigned int num_points) +{ + vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(bias, __riscv_vsetvlmax_e32m8()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(inputVector, vl); + vuint16m4_t vi = __riscv_vfncvt_xu(__riscv_vfmadd_vf_f32m8(v, scale, vb, vl), vl); + __riscv_vse8(outputVector, __riscv_vnclipu(vi, 0, 0, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_s32f_x2_convert_8u_a_H */ diff --git a/kernels/volk/volk_32f_sin_32f.h b/kernels/volk/volk_32f_sin_32f.h index 371e424f..a02f2260 100644 --- a/kernels/volk/volk_32f_sin_32f.h +++ b/kernels/volk/volk_32f_sin_32f.h @@ -127,8 +127,9 @@ static inline void volk_32f_sin_32f_a_avx512f(float* sinVector, cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s)); + } s = _mm512_div_ps(s, ftwos); sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s)); @@ -520,8 +521,9 @@ static inline void volk_32f_sin_32f_u_avx512f(float* sinVector, cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s)); + } s = _mm512_div_ps(s, ftwos); sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s)); @@ -893,5 +895,67 @@ volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_poi #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_sin_32f_rvv(float* bVector, const float* aVector, unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + + const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax); + const vfloat32m2_t 
cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax); + const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax); + const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax); + + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax); + + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax); + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05, vlmax); + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl); + vfloat32m2_t s = __riscv_vfabs(v, vl); + vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl); + vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl); + + s = __riscv_vfnmsac(s, cPio4a, r, vl); + s = __riscv_vfnmsac(s, cPio4b, r, vl); + s = __riscv_vfnmsac(s, cPio4c, r, vl); + + s = __riscv_vfmul(s, 1 / 8.0f, vl); + s = __riscv_vfmul(s, s, vl); + vfloat32m2_t t = s; + s = __riscv_vfmsub(s, c5, c4, vl); + s = __riscv_vfmadd(s, t, c3, vl); + s = __riscv_vfmsub(s, t, c2, vl); + s = __riscv_vfmadd(s, t, cf1, vl); + s = __riscv_vfmul(s, t, vl); + s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl); + s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl); + s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl); + s = __riscv_vfmul(s, 1 / 2.0f, vl); + + vfloat32m2_t sine = + __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl); + vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl); + + vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl); + vbool16_t m2 = __riscv_vmxor(__riscv_vmslt(__riscv_vreinterpret_i32m2(v), 0, vl), + __riscv_vmsne(__riscv_vand(q, 4, vl), 0, vl), + vl); + + sine = __riscv_vmerge(sine, cosine, m1, vl); + sine = __riscv_vfneg_mu(m2, sine, sine, vl); + + __riscv_vse32(bVector, sine, vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_sin_32f_u_H */ diff --git a/kernels/volk/volk_32f_sqrt_32f.h b/kernels/volk/volk_32f_sqrt_32f.h index 9d269413..c5672534 100644 --- a/kernels/volk/volk_32f_sqrt_32f.h +++ b/kernels/volk/volk_32f_sqrt_32f.h @@ -205,4 +205,20 @@ volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_p } #endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_sqrt_32f_rvv(float* cVector, const float* aVector, unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl); + __riscv_vse32(cVector, __riscv_vfsqrt(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_sqrt_32f_u_H */ diff --git a/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h b/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h index c71514bb..96535ed6 100644 --- a/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h +++ b/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h @@ -569,4 +569,75 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev, } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_stddev_and_mean_32f_x2_rvv(float* stddev, + float* mean, + const float* inputBuffer, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m4(); + if (num_points < 
vlmax) { + volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points); + return; + } + + vfloat32m4_t vsum = __riscv_vle32_v_f32m4(inputBuffer, vlmax); + inputBuffer += vlmax; + vfloat32m4_t vsumsq = __riscv_vfmv_v_f_f32m4(0, vlmax); + size_t partLen = num_points / vlmax; + + for (size_t i = 1; i < partLen; ++i, inputBuffer += vlmax) { + vfloat32m4_t v = __riscv_vle32_v_f32m4(inputBuffer, vlmax); + vsum = __riscv_vfadd(vsum, v, vlmax); + vfloat32m4_t vaux = __riscv_vfmsub(v, i + 1.0f, vsum, vlmax); + vaux = __riscv_vfmul(vaux, vaux, vlmax); + vaux = __riscv_vfmul(vaux, 1.0f / (i * (i + 1.0f)), vlmax); + vsumsq = __riscv_vfadd(vsumsq, vaux, vlmax); + } + + size_t vl = __riscv_vsetvlmax_e32m2(); + vfloat32m2_t vsum2 = + __riscv_vfadd(__riscv_vget_f32m2(vsum, 0), __riscv_vget_f32m2(vsum, 1), vl); + vfloat32m2_t vfix2 = + __riscv_vfsub(__riscv_vget_f32m2(vsum, 0), __riscv_vget_f32m2(vsum, 1), vl); + vfix2 = __riscv_vfmul(vfix2, vfix2, vl); + vfloat32m2_t vsumsq2 = + __riscv_vfadd(__riscv_vget_f32m2(vsumsq, 0), __riscv_vget_f32m2(vsumsq, 1), vl); + vsumsq2 = __riscv_vfmacc(vsumsq2, 0.5f / (num_points / vlmax), vfix2, vl); + + vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vsum1 = + __riscv_vfadd(__riscv_vget_f32m1(vsum2, 0), __riscv_vget_f32m1(vsum2, 1), vl); + vfloat32m1_t vfix1 = + __riscv_vfsub(__riscv_vget_f32m1(vsum2, 0), __riscv_vget_f32m1(vsum2, 1), vl); + vfix1 = __riscv_vfmul(vfix1, vfix1, vl); + vfloat32m1_t vsumsq1 = + __riscv_vfadd(__riscv_vget_f32m1(vsumsq2, 0), __riscv_vget_f32m1(vsumsq2, 1), vl); + vsumsq1 = __riscv_vfmacc(vsumsq1, 0.5f / (num_points / vlmax * 2), vfix1, vl); + + for (size_t n = num_points / vlmax * 4, vl = vlmax >> 2; vl >>= 1; n *= 2) { + vfloat32m1_t vsumdown = __riscv_vslidedown(vsum1, vl, vl); + vfix1 = __riscv_vfsub(vsum1, vsumdown, vl); + vfix1 = __riscv_vfmul(vfix1, vfix1, vl); + vsum1 = __riscv_vfadd(vsum1, vsumdown, vl); + vsumsq1 = __riscv_vfadd(vsumsq1, __riscv_vslidedown(vsumsq1, vl, vl), vl); + vsumsq1 = __riscv_vfmacc(vsumsq1, 0.5f / n, vfix1, vl); + } + + float sum = __riscv_vfmv_f(vsum1); + float sumsq = __riscv_vfmv_f(vsumsq1); + + for (size_t i = partLen * vlmax; i < num_points; ++i) { + float in = *inputBuffer++; + sum += in; + sumsq = update_square_sum_1_val(sumsq, sum, i, in); + } + + *stddev = sqrtf(sumsq / num_points); + *mean = sum / num_points; +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */ diff --git a/kernels/volk/volk_32f_tan_32f.h b/kernels/volk/volk_32f_tan_32f.h index 1ec0202f..28810c94 100644 --- a/kernels/volk/volk_32f_tan_32f.h +++ b/kernels/volk/volk_32f_tan_32f.h @@ -750,5 +750,72 @@ volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_poi } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_tan_32f_rvv(float* bVector, const float* aVector, unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + + const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax); + const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax); + const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax); + const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax); + + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax); + + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax); + const vfloat32m2_t c4 = 
__riscv_vfmv_v_f_f32m2(4.9603175e-05f, vlmax); + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07f, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl); + vfloat32m2_t s = __riscv_vfabs(v, vl); + vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl); + vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl); + + s = __riscv_vfnmsac(s, cPio4a, r, vl); + s = __riscv_vfnmsac(s, cPio4b, r, vl); + s = __riscv_vfnmsac(s, cPio4c, r, vl); + + s = __riscv_vfmul(s, 1 / 8.0f, vl); + s = __riscv_vfmul(s, s, vl); + vfloat32m2_t t = s; + s = __riscv_vfmsub(s, c5, c4, vl); + s = __riscv_vfmadd(s, t, c3, vl); + s = __riscv_vfmsub(s, t, c2, vl); + s = __riscv_vfmadd(s, t, cf1, vl); + s = __riscv_vfmul(s, t, vl); + s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl); + s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl); + s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl); + s = __riscv_vfmul(s, 1 / 2.0f, vl); + + vfloat32m2_t sine = + __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl); + vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl); + + vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl); + vbool16_t m2 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 2, vl), 4, vl), 0, vl); + vbool16_t m3 = __riscv_vmxor(__riscv_vmslt(__riscv_vreinterpret_i32m2(v), 0, vl), + __riscv_vmsne(__riscv_vand(q, 4, vl), 0, vl), + vl); + + vfloat32m2_t sine0 = sine; + sine = __riscv_vmerge(sine, cosine, m1, vl); + sine = __riscv_vfneg_mu(m3, sine, sine, vl); + + cosine = __riscv_vmerge(cosine, sine0, m1, vl); + cosine = __riscv_vfneg_mu(m2, cosine, cosine, vl); + + __riscv_vse32(bVector, __riscv_vfdiv(sine, cosine, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_tan_32f_u_H */ diff --git a/kernels/volk/volk_32f_tanh_32f.h b/kernels/volk/volk_32f_tanh_32f.h index 3e36adb7..e90e4025 100644 --- a/kernels/volk/volk_32f_tanh_32f.h +++ b/kernels/volk/volk_32f_tanh_32f.h @@ -412,4 +412,38 @@ volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int n } #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_tanh_32f_rvv(float* bVector, const float* aVector, unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(135135.0f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(17325.0f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(378.0f, vlmax); + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(62370.0f, vlmax); + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3150.0f, vlmax); + const vfloat32m2_t c6 = __riscv_vfmv_v_f_f32m2(28.0f, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2_t x = __riscv_vle32_v_f32m2(aVector, vl); + vfloat32m2_t xx = __riscv_vfmul(x, x, vl); + vfloat32m2_t a, b; + a = __riscv_vfadd(xx, c3, vl); + a = __riscv_vfmadd(a, xx, c2, vl); + a = __riscv_vfmadd(a, xx, c1, vl); + a = __riscv_vfmul(a, x, vl); + b = c6; + b = __riscv_vfmadd(b, xx, c5, vl); + b = __riscv_vfmadd(b, xx, c4, vl); + b = __riscv_vfmadd(b, xx, c1, vl); + __riscv_vse32(bVector, __riscv_vfdiv(a, b, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_tanh_32f_u_H */ diff --git a/kernels/volk/volk_32f_x2_add_32f.h 
b/kernels/volk/volk_32f_x2_add_32f.h index f99e6b55..be9f6aa7 100644 --- a/kernels/volk/volk_32f_x2_add_32f.h +++ b/kernels/volk/volk_32f_x2_add_32f.h @@ -391,5 +391,22 @@ static inline void volk_32f_x2_add_32f_u_orc(float* cVector, #endif /* LV_HAVE_ORC */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_x2_add_32f_rvv(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl); + vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl); + __riscv_vse32(cVector, __riscv_vfadd(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_x2_add_32f_a_H */ diff --git a/kernels/volk/volk_32f_x2_divide_32f.h b/kernels/volk/volk_32f_x2_divide_32f.h index bcb9da7c..fbece7d5 100644 --- a/kernels/volk/volk_32f_x2_divide_32f.h +++ b/kernels/volk/volk_32f_x2_divide_32f.h @@ -347,4 +347,22 @@ static inline void volk_32f_x2_divide_32f_u_avx(float* cVector, } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_x2_divide_32f_rvv(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl); + vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl); + __riscv_vse32(cVector, __riscv_vfdiv(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_x2_divide_32f_u_H */ diff --git a/kernels/volk/volk_32f_x2_dot_prod_16i.h b/kernels/volk/volk_32f_x2_dot_prod_16i.h index 3a4b7177..3502b3a5 100644 --- a/kernels/volk/volk_32f_x2_dot_prod_16i.h +++ b/kernels/volk/volk_32f_x2_dot_prod_16i.h @@ -678,5 +678,20 @@ static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result, #endif /*LV_HAVE_AVX512F*/ +#ifdef LV_HAVE_RVV +#include + +#include "volk_32f_x2_dot_prod_32f.h" + +static inline void volk_32f_x2_dot_prod_16i_rvv(int16_t* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + float fresult = 0; + volk_32f_x2_dot_prod_32f_rvv(&fresult, input, taps, num_points); + *result = (int16_t)rintf(fresult); +} +#endif /*LV_HAVE_RVV*/ #endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/ diff --git a/kernels/volk/volk_32f_x2_dot_prod_32f.h b/kernels/volk/volk_32f_x2_dot_prod_32f.h index 5bdb72ce..2d86411e 100644 --- a/kernels/volk/volk_32f_x2_dot_prod_32f.h +++ b/kernels/volk/volk_32f_x2_dot_prod_32f.h @@ -949,4 +949,28 @@ extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, unsigned int num_points); #endif /* LV_HAVE_NEONV7 */ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32f_x2_dot_prod_32f_rvv(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v0 = __riscv_vle32_v_f32m8(input, vl); + vfloat32m8_t v1 = __riscv_vle32_v_f32m8(taps, vl); + vsum = __riscv_vfmacc_tu(vsum, v0, v1, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t v = RISCV_SHRINK8(vfadd, f, 32, vsum); + v = __riscv_vfredusum(v, __riscv_vfmv_s_f_f32m1(0, vl), vl); + *result = __riscv_vfmv_f(v); +} +#endif /*LV_HAVE_RVV*/ 
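The element-wise kernels in this patch share the strip-mining loop used above; a minimal self-contained sketch of that pattern (the helper name is illustrative; non-overloaded intrinsic names and an RVV 1.0 toolchain are assumed):

#include <riscv_vector.h>
#include <stddef.h>

/* c[i] = a[i] + b[i]; vsetvl decides how many lanes each pass handles,
 * so no scalar tail loop is needed. */
static inline void
add_f32_rvv_sketch(float* c, const float* a, const float* b, size_t n)
{
    for (size_t vl; n > 0; n -= vl, a += vl, b += vl, c += vl) {
        vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t va = __riscv_vle32_v_f32m8(a, vl);
        vfloat32m8_t vb = __riscv_vle32_v_f32m8(b, vl);
        __riscv_vse32_v_f32m8(c, __riscv_vfadd_vv_f32m8(va, vb, vl), vl);
    }
}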
+ #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ diff --git a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h index b4901543..62e30ad8 100644 --- a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h +++ b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h @@ -79,4 +79,17 @@ static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector, outputVector, inputVector, bound, saveValue, num_points); } #endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_RVV +static inline void volk_32f_x2_fm_detectpuppet_32f_rvv(float* outputVector, + const float* inputVector, + float* saveValue, + unsigned int num_points) +{ + const float bound = 2.0f; + volk_32f_s32f_32f_fm_detect_32f_rvv( + outputVector, inputVector, bound, saveValue, num_points); +} +#endif /* LV_HAVE_RVV */ + #endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H */ diff --git a/kernels/volk/volk_32f_x2_interleave_32fc.h b/kernels/volk/volk_32f_x2_interleave_32fc.h index 140fa9ff..2190f1a4 100644 --- a/kernels/volk/volk_32f_x2_interleave_32fc.h +++ b/kernels/volk/volk_32f_x2_interleave_32fc.h @@ -255,4 +255,43 @@ static inline void volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector, } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_x2_interleave_32fc_rvv(lv_32fc_t* complexVector, + const float* iBuffer, + const float* qBuffer, + unsigned int num_points) +{ + uint64_t* out = (uint64_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, out += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint32m4_t vr = __riscv_vle32_v_u32m4((const uint32_t*)iBuffer, vl); + vuint32m4_t vi = __riscv_vle32_v_u32m4((const uint32_t*)qBuffer, vl); + vuint64m8_t vc = + __riscv_vwmaccu(__riscv_vwaddu_vv(vr, vi, vl), 0xFFFFFFFF, vi, vl); + __riscv_vse64(out, vc, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32f_x2_interleave_32fc_rvvseg(lv_32fc_t* complexVector, + const float* iBuffer, + const float* qBuffer, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4_t vr = __riscv_vle32_v_f32m4(iBuffer, vl); + vfloat32m4_t vi = __riscv_vle32_v_f32m4(qBuffer, vl); + __riscv_vsseg2e32((float*)complexVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_32f_x2_interleave_32fc_u_H */ diff --git a/kernels/volk/volk_32f_x2_max_32f.h b/kernels/volk/volk_32f_x2_max_32f.h index 0f88ffe6..a0d48f75 100644 --- a/kernels/volk/volk_32f_x2_max_32f.h +++ b/kernels/volk/volk_32f_x2_max_32f.h @@ -330,4 +330,22 @@ static inline void volk_32f_x2_max_32f_u_avx(float* cVector, } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_x2_max_32f_rvv(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl); + vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl); + __riscv_vse32(cVector, __riscv_vfmax(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_x2_max_32f_u_H */ diff --git a/kernels/volk/volk_32f_x2_min_32f.h b/kernels/volk/volk_32f_x2_min_32f.h index 128c7483..2910b1f9 100644 --- a/kernels/volk/volk_32f_x2_min_32f.h +++ 
b/kernels/volk/volk_32f_x2_min_32f.h @@ -334,4 +334,22 @@ static inline void volk_32f_x2_min_32f_u_avx(float* cVector, } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_x2_min_32f_rvv(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl); + vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl); + __riscv_vse32(cVector, __riscv_vfmin(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_x2_min_32f_u_H */ diff --git a/kernels/volk/volk_32f_x2_multiply_32f.h b/kernels/volk/volk_32f_x2_multiply_32f.h index c36adfc2..af266041 100644 --- a/kernels/volk/volk_32f_x2_multiply_32f.h +++ b/kernels/volk/volk_32f_x2_multiply_32f.h @@ -356,5 +356,22 @@ static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector, } #endif /* LV_HAVE_ORC */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_x2_multiply_32f_rvv(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl); + vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl); + __riscv_vse32(cVector, __riscv_vfmul(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_x2_multiply_32f_a_H */ diff --git a/kernels/volk/volk_32f_x2_pow_32f.h b/kernels/volk/volk_32f_x2_pow_32f.h index 637fd4b7..c2b77233 100644 --- a/kernels/volk/volk_32f_x2_pow_32f.h +++ b/kernels/volk/volk_32f_x2_pow_32f.h @@ -976,4 +976,127 @@ static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector, #endif /* LV_HAVE_AVX2 for unaligned */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_x2_pow_32f_rvv(float* cVector, + const float* bVector, + const float* aVector, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m1(); + +#if POW_POLY_DEGREE == 6 + const vfloat32m1_t cl5 = __riscv_vfmv_v_f_f32m1(3.1157899f, vlmax); + const vfloat32m1_t cl4 = __riscv_vfmv_v_f_f32m1(-3.3241990f, vlmax); + const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(2.5988452f, vlmax); + const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(-1.2315303f, vlmax); + const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(3.1821337e-1f, vlmax); + const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(-3.4436006e-2f, vlmax); +#elif POW_POLY_DEGREE == 5 + const vfloat32m1_t cl4 = __riscv_vfmv_v_f_f32m1(2.8882704548164776201f, vlmax); + const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(-2.52074962577807006663f, vlmax); + const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(1.48116647521213171641f, vlmax); + const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(-0.465725644288844778798f, vlmax); + const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(0.0596515482674574969533f, vlmax); +#elif POW_POLY_DEGREE == 4 + const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(2.61761038894603480148f, vlmax); + const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(-1.75647175389045657003f, vlmax); + const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(0.688243882994381274313f, vlmax); + const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(-0.107254423828329604454f, vlmax); +#elif POW_POLY_DEGREE == 3 + const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(2.28330284476918490682f, vlmax); + const vfloat32m1_t cl1 = 
__riscv_vfmv_v_f_f32m1(-1.04913055217340124191f, vlmax); + const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(0.204446009836232697516f, vlmax); +#else +#error +#endif + + const vfloat32m1_t exp_hi = __riscv_vfmv_v_f_f32m1(88.376259f, vlmax); + const vfloat32m1_t exp_lo = __riscv_vfmv_v_f_f32m1(-88.376259f, vlmax); + const vfloat32m1_t log2EF = __riscv_vfmv_v_f_f32m1(1.442695f, vlmax); + const vfloat32m1_t exp_C1 = __riscv_vfmv_v_f_f32m1(-0.6933594f, vlmax); + const vfloat32m1_t exp_C2 = __riscv_vfmv_v_f_f32m1(0.000212194f, vlmax); + const vfloat32m1_t cf1 = __riscv_vfmv_v_f_f32m1(1.0f, vlmax); + const vfloat32m1_t cf1o2 = __riscv_vfmv_v_f_f32m1(0.5f, vlmax); + const vfloat32m1_t ln2 = __riscv_vfmv_v_f_f32m1(0.6931471805f, vlmax); + + const vfloat32m1_t ce0 = __riscv_vfmv_v_f_f32m1(1.9875691500e-4, vlmax); + const vfloat32m1_t ce1 = __riscv_vfmv_v_f_f32m1(1.3981999507e-3, vlmax); + const vfloat32m1_t ce2 = __riscv_vfmv_v_f_f32m1(8.3334519073e-3, vlmax); + const vfloat32m1_t ce3 = __riscv_vfmv_v_f_f32m1(4.1665795894e-2, vlmax); + const vfloat32m1_t ce4 = __riscv_vfmv_v_f_f32m1(1.6666665459e-1, vlmax); + const vfloat32m1_t ce5 = __riscv_vfmv_v_f_f32m1(5.0000001201e-1, vlmax); + + const vint32m1_t m1 = __riscv_vreinterpret_i32m1(cf1); + const vint32m1_t m2 = __riscv_vmv_v_x_i32m1(0x7FFFFF, vlmax); + const vint32m1_t c127 = __riscv_vmv_v_x_i32m1(127, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m1(n); + vfloat32m1_t va = __riscv_vle32_v_f32m1(aVector, vl); + vfloat32m1_t log; + + { /* log(a) */ + vfloat32m1_t a = __riscv_vfabs(va, vl); + vfloat32m1_t exp = __riscv_vfcvt_f( + __riscv_vsub( + __riscv_vsra(__riscv_vreinterpret_i32m1(a), 23, vl), c127, vl), + vl); + vfloat32m1_t frac = __riscv_vreinterpret_f32m1(__riscv_vor( + __riscv_vand(__riscv_vreinterpret_i32m1(va), m2, vl), m1, vl)); + + vfloat32m1_t mant = cl0; + mant = __riscv_vfmadd(mant, frac, cl1, vl); + mant = __riscv_vfmadd(mant, frac, cl2, vl); +#if POW_POLY_DEGREE >= 4 + mant = __riscv_vfmadd(mant, frac, cl3, vl); +#if POW_POLY_DEGREE >= 5 + mant = __riscv_vfmadd(mant, frac, cl4, vl); +#if POW_POLY_DEGREE >= 6 + mant = __riscv_vfmadd(mant, frac, cl5, vl); +#endif +#endif +#endif + log = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl); + log = __riscv_vfmul(log, ln2, vl); + } + + vfloat32m1_t vb = __riscv_vle32_v_f32m1(bVector, vl); + vb = __riscv_vfmul(vb, log, vl); /* b*log(a) */ + vfloat32m1_t exp; + + { /* exp(b*log(a)) */ + vb = __riscv_vfmin(vb, exp_hi, vl); + vb = __riscv_vfmax(vb, exp_lo, vl); + vfloat32m1_t fx = __riscv_vfmadd(vb, log2EF, cf1o2, vl); + + vfloat32m1_t rtz = __riscv_vfcvt_f(__riscv_vfcvt_rtz_x(fx, vl), vl); + fx = __riscv_vfsub_mu(__riscv_vmfgt(rtz, fx, vl), rtz, rtz, cf1, vl); + vb = __riscv_vfmacc(vb, exp_C1, fx, vl); + vb = __riscv_vfmacc(vb, exp_C2, fx, vl); + vfloat32m1_t vv = __riscv_vfmul(vb, vb, vl); + + vfloat32m1_t y = ce0; + y = __riscv_vfmadd(y, vb, ce1, vl); + y = __riscv_vfmadd(y, vb, ce2, vl); + y = __riscv_vfmadd(y, vb, ce3, vl); + y = __riscv_vfmadd(y, vb, ce4, vl); + y = __riscv_vfmadd(y, vb, ce5, vl); + y = __riscv_vfmadd(y, vv, vb, vl); + y = __riscv_vfadd(y, cf1, vl); + + vfloat32m1_t pow2n = __riscv_vreinterpret_f32m1(__riscv_vsll( + __riscv_vadd(__riscv_vfcvt_rtz_x(fx, vl), c127, vl), 23, vl)); + + exp = __riscv_vfmul(y, pow2n, vl); + } + + __riscv_vse32(cVector, exp, vl); + } +} + +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_x2_log2_32f_u_H */ diff --git 
a/kernels/volk/volk_32f_x2_powpuppet_32f.h b/kernels/volk/volk_32f_x2_powpuppet_32f.h index 419ee18e..d4df0b3d 100644 --- a/kernels/volk/volk_32f_x2_powpuppet_32f.h +++ b/kernels/volk/volk_32f_x2_powpuppet_32f.h @@ -111,4 +111,16 @@ static inline void volk_32f_x2_powpuppet_32f_u_avx2(float* cVector, } #endif /* LV_HAVE_AVX2 for unaligned */ +#ifdef LV_HAVE_RVV +static inline void volk_32f_x2_powpuppet_32f_rvv(float* cVector, + const float* bVector, + const float* aVector, + unsigned int num_points) +{ + float* aVectorPos = make_positive(aVector, num_points); + volk_32f_x2_pow_32f_rvv(cVector, bVector, aVectorPos, num_points); + volk_free(aVectorPos); +} +#endif /* LV_HAVE_RVV */ + #endif /* INCLUDED_volk_32f_x2_powpuppet_32f_H */ diff --git a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h index 2ddfb0fd..9a78a01a 100644 --- a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h +++ b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h @@ -326,5 +326,51 @@ static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVec } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_x2_s32f_interleave_16ic_rvv(lv_16sc_t* complexVector, + const float* iBuffer, + const float* qBuffer, + const float scalar, + unsigned int num_points) +{ + uint32_t* out = (uint32_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, out += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t vrf = __riscv_vle32_v_f32m8(iBuffer, vl); + vfloat32m8_t vif = __riscv_vle32_v_f32m8(qBuffer, vl); + vint16m4_t vri = __riscv_vfncvt_x(__riscv_vfmul(vrf, scalar, vl), vl); + vint16m4_t vii = __riscv_vfncvt_x(__riscv_vfmul(vif, scalar, vl), vl); + vuint16m4_t vr = __riscv_vreinterpret_u16m4(vri); + vuint16m4_t vi = __riscv_vreinterpret_u16m4(vii); + vuint32m8_t vc = __riscv_vwmaccu(__riscv_vwaddu_vv(vr, vi, vl), 0xFFFF, vi, vl); + __riscv_vse32(out, vc, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32f_x2_s32f_interleave_16ic_rvvseg(lv_16sc_t* complexVector, + const float* iBuffer, + const float* qBuffer, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t vrf = __riscv_vle32_v_f32m8(iBuffer, vl); + vfloat32m8_t vif = __riscv_vle32_v_f32m8(qBuffer, vl); + vint16m4_t vri = __riscv_vfncvt_x(__riscv_vfmul(vrf, scalar, vl), vl); + vint16m4_t vii = __riscv_vfncvt_x(__riscv_vfmul(vif, scalar, vl), vl); + __riscv_vsseg2e16( + (int16_t*)complexVector, __riscv_vcreate_v_i16m4x2(vri, vii), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ #endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H */ diff --git a/kernels/volk/volk_32f_x2_subtract_32f.h b/kernels/volk/volk_32f_x2_subtract_32f.h index 631b72f8..e3d563fc 100644 --- a/kernels/volk/volk_32f_x2_subtract_32f.h +++ b/kernels/volk/volk_32f_x2_subtract_32f.h @@ -272,4 +272,22 @@ static inline void volk_32f_x2_subtract_32f_u_avx(float* cVector, } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_x2_subtract_32f_rvv(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl); + vfloat32m8_t vb = 
__riscv_vle32_v_f32m8(bVector, vl); + __riscv_vse32(cVector, __riscv_vfsub(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_x2_subtract_32f_u_H */ diff --git a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h index 6afd262a..b9a83714 100644 --- a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h +++ b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h @@ -341,8 +341,9 @@ static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, result[k] += center_point_array[2] * thrd + center_point_array[3] * frth; } } - for (k = 0; k < 8; k += 2) + for (k = 0; k < 8; k += 2) { result[k] = result[k] + result[k + 1]; + } *target = result[0] + result[2] + result[4] + result[6]; @@ -654,4 +655,45 @@ static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, } #endif // LV_HAVE_AVX +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32f_x3_sum_of_poly_32f_rvv(float* target, + float* src0, + float* center_point_array, + float* cutoff, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m4(); + vfloat32m4_t vsum = __riscv_vfmv_v_f_f32m4(0, vlmax); + float mul1 = center_point_array[0]; // scalar to avoid register spills + float mul2 = center_point_array[1]; + vfloat32m4_t vmul3 = __riscv_vfmv_v_f_f32m4(center_point_array[2], vlmax); + vfloat32m4_t vmul4 = __riscv_vfmv_v_f_f32m4(center_point_array[3], vlmax); + vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(*cutoff, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4_t v = __riscv_vle32_v_f32m4(src0, vl); + vfloat32m4_t v1 = __riscv_vfmax(v, vmax, vl); + vfloat32m4_t v2 = __riscv_vfmul(v1, v1, vl); + vfloat32m4_t v3 = __riscv_vfmul(v1, v2, vl); + vfloat32m4_t v4 = __riscv_vfmul(v2, v2, vl); + v2 = __riscv_vfmul(v2, mul2, vl); + v4 = __riscv_vfmul(v4, vmul4, vl); + v1 = __riscv_vfmadd(v1, mul1, v2, vl); + v3 = __riscv_vfmadd(v3, vmul3, v4, vl); + v1 = __riscv_vfadd(v1, v3, vl); + vsum = __riscv_vfadd_tu(vsum, vsum, v1, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t v = RISCV_SHRINK4(vfadd, f, 32, vsum); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + float sum = __riscv_vfmv_f(__riscv_vfredusum(v, z, vl)); + *target = sum + num_points * center_point_array[4]; +} +#endif /*LV_HAVE_RVV*/ + #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/ diff --git a/kernels/volk/volk_32fc_32f_add_32fc.h b/kernels/volk/volk_32fc_32f_add_32fc.h index b820ed5d..24eff2b4 100644 --- a/kernels/volk/volk_32fc_32f_add_32fc.h +++ b/kernels/volk/volk_32fc_32f_add_32fc.h @@ -230,5 +230,24 @@ static inline void volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_32f_add_32fc_rvv(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const float* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, cVector += vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m8_t vc = __riscv_vle32_v_f32m8((const float*)aVector, vl * 2); + vuint32m4_t v = __riscv_vle32_v_u32m4((const uint32_t*)bVector, vl); + vfloat32m8_t vf = __riscv_vreinterpret_f32m8( + __riscv_vreinterpret_u32m8(__riscv_vzext_vf2_u64m8(v, vl))); + __riscv_vse32((float*)cVector, __riscv_vfadd(vc, vf, vl * 2), vl * 2); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32fc_32f_add_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h 
b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h index 363bf657..472d405a 100644 --- a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h +++ b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h @@ -743,5 +743,63 @@ static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result, #endif /*LV_HAVE_SSE*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32fc_32f_dot_prod_32fc_rvv(lv_32fc_t* result, + const lv_32fc_t* input, + const float* taps, + unsigned int num_points) +{ + vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); + vfloat32m4_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)input, vl); + vfloat32m4_t vbr = __riscv_vle32_v_f32m4(taps, vl), vbi = vbr; + vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl)); + vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl)); + vsumr = __riscv_vfmacc_tu(vsumr, var, vbr, vl); + vsumi = __riscv_vfmacc_tu(vsumi, vai, vbi, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr); + vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl))); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void volk_32fc_32f_dot_prod_32fc_rvvseg(lv_32fc_t* result, + const lv_32fc_t* input, + const float* taps, + unsigned int num_points) +{ + vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); + vfloat32m4_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)input, vl); + vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1); + vfloat32m4_t vbr = __riscv_vle32_v_f32m4(taps, vl), vbi = vbr; + vsumr = __riscv_vfmacc_tu(vsumr, var, vbr, vl); + vsumi = __riscv_vfmacc_tu(vsumi, vai, vbi, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr); + vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl))); +} +#endif /*LV_HAVE_RVVSEG*/ #endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_H*/ diff --git a/kernels/volk/volk_32fc_32f_multiply_32fc.h b/kernels/volk/volk_32fc_32f_multiply_32fc.h index 76ed1af7..b731414c 100644 --- a/kernels/volk/volk_32fc_32f_multiply_32fc.h +++ b/kernels/volk/volk_32fc_32f_multiply_32fc.h @@ -224,5 +224,24 @@ static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_32f_multiply_32fc_rvv(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const float* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, cVector += vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m8_t vc = __riscv_vle32_v_f32m8((const float*)aVector, vl * 2); + vuint32m4_t v = __riscv_vle32_v_u32m4((const uint32_t*)bVector, vl); + vfloat32m8_t vf = __riscv_vreinterpret_f32m8(__riscv_vreinterpret_u32m8( + __riscv_vwmaccu(__riscv_vwaddu_vv(v, v, vl), 0xFFFFFFFF, v, 
vl))); + __riscv_vse32((float*)cVector, __riscv_vfmul(vc, vf, vl * 2), vl * 2); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_accumulator_s32fc.h b/kernels/volk/volk_32fc_accumulator_s32fc.h index d7267ea6..72266bd5 100644 --- a/kernels/volk/volk_32fc_accumulator_s32fc.h +++ b/kernels/volk/volk_32fc_accumulator_s32fc.h @@ -276,4 +276,33 @@ static inline void volk_32fc_accumulator_s32fc_neon(lv_32fc_t* result, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32fc_accumulator_s32fc_rvv(lv_32fc_t* result, + const lv_32fc_t* inputBuffer, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m8(); + vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, vlmax); + const float* in = (const float*)inputBuffer; + size_t n = num_points * 2; + for (size_t vl; n > 0; n -= vl, in += vl) { + vl = __riscv_vsetvl_e32m8(n < vlmax ? n : vlmax); /* force exact vl */ + vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl); + vsum = __riscv_vfadd_tu(vsum, vsum, v, vl); + } + vuint64m8_t vsumu = __riscv_vreinterpret_u64m8(__riscv_vreinterpret_u32m8(vsum)); + vfloat32m4_t vsum1 = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vsumu, 0, vlmax)); + vfloat32m4_t vsum2 = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vsumu, 32, vlmax)); + vlmax = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsum1); + vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsum2); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vlmax); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vlmax)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vlmax))); +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_accumulator_s32fc_a_H */ diff --git a/kernels/volk/volk_32fc_conjugate_32fc.h b/kernels/volk/volk_32fc_conjugate_32fc.h index aa1134ab..2edff119 100644 --- a/kernels/volk/volk_32fc_conjugate_32fc.h +++ b/kernels/volk/volk_32fc_conjugate_32fc.h @@ -260,4 +260,21 @@ static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_conjugate_32fc_rvv(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + unsigned int num_points) +{ + size_t n = num_points; + vuint64m8_t m = __riscv_vmv_v_x_u64m8(1ull << 63, __riscv_vsetvlmax_e64m8()); + for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e64m8(n); + vuint64m8_t v = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl); + __riscv_vse64((uint64_t*)cVector, __riscv_vxor(v, m, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_conjugate_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_convert_16ic.h b/kernels/volk/volk_32fc_convert_16ic.h index a38cce64..55768ab0 100644 --- a/kernels/volk/volk_32fc_convert_16ic.h +++ b/kernels/volk/volk_32fc_convert_16ic.h @@ -416,4 +416,23 @@ static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, } } #endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_convert_16ic_rvv(lv_16sc_t* outputVector, + const lv_32fc_t* inputVector, + unsigned int num_points) +{ + int16_t* out = (int16_t*)outputVector; + float* in = (float*)inputVector; + size_t n = num_points * 2; + for (size_t vl; n > 0; n -= vl, in += vl, out += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl); + __riscv_vse16(out, __riscv_vfncvt_x(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */ diff --git 
a/kernels/volk/volk_32fc_deinterleave_32f_x2.h b/kernels/volk/volk_32fc_deinterleave_32f_x2.h index f269d661..569942fe 100644 --- a/kernels/volk/volk_32fc_deinterleave_32f_x2.h +++ b/kernels/volk/volk_32fc_deinterleave_32f_x2.h @@ -254,4 +254,46 @@ static inline void volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer, } } #endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_deinterleave_32f_x2_rvv(float* iBuffer, + float* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl); + vuint32m4_t vr = __riscv_vnsrl(vc, 0, vl); + vuint32m4_t vi = __riscv_vnsrl(vc, 32, vl); + __riscv_vse32((uint32_t*)iBuffer, vr, vl); + __riscv_vse32((uint32_t*)qBuffer, vi, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_deinterleave_32f_x2_rvvseg(float* iBuffer, + float* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint32m4x2_t vc = + __riscv_vlseg2e32_v_u32m4x2((const uint32_t*)complexVector, vl); + vuint32m4_t vr = __riscv_vget_u32m4(vc, 0); + vuint32m4_t vi = __riscv_vget_u32m4(vc, 1); + __riscv_vse32((uint32_t*)iBuffer, vr, vl); + __riscv_vse32((uint32_t*)qBuffer, vi, vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_u_H */ diff --git a/kernels/volk/volk_32fc_deinterleave_64f_x2.h b/kernels/volk/volk_32fc_deinterleave_64f_x2.h index 1af5098f..6599780b 100644 --- a/kernels/volk/volk_32fc_deinterleave_64f_x2.h +++ b/kernels/volk/volk_32fc_deinterleave_64f_x2.h @@ -314,4 +314,44 @@ static inline void volk_32fc_deinterleave_64f_x2_neon(double* iBuffer, } #endif /* LV_HAVE_NEONV8 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_deinterleave_64f_x2_rvv(double* iBuffer, + double* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + __riscv_vse64(iBuffer, __riscv_vfwcvt_f(vr, vl), vl); + __riscv_vse64(qBuffer, __riscv_vfwcvt_f(vi, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_deinterleave_64f_x2_rvvseg(double* iBuffer, + double* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)complexVector, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0); + vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1); + __riscv_vse64(iBuffer, __riscv_vfwcvt_f(vr, vl), vl); + __riscv_vse64(qBuffer, __riscv_vfwcvt_f(vi, vl), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */ diff --git a/kernels/volk/volk_32fc_deinterleave_imag_32f.h 
b/kernels/volk/volk_32fc_deinterleave_imag_32f.h index 9e330d33..bb54411b 100644 --- a/kernels/volk/volk_32fc_deinterleave_imag_32f.h +++ b/kernels/volk/volk_32fc_deinterleave_imag_32f.h @@ -229,4 +229,22 @@ static inline void volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer, } } #endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_deinterleave_imag_32f_rvv(float* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + const uint64_t* in = (const uint64_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e64m8(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8(in, vl); + __riscv_vse32((uint32_t*)qBuffer, __riscv_vnsrl(vc, 32, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_u_H */ diff --git a/kernels/volk/volk_32fc_deinterleave_real_32f.h b/kernels/volk/volk_32fc_deinterleave_real_32f.h index 6fc0679d..f75cdd03 100644 --- a/kernels/volk/volk_32fc_deinterleave_real_32f.h +++ b/kernels/volk/volk_32fc_deinterleave_real_32f.h @@ -234,4 +234,21 @@ static inline void volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_deinterleave_real_32f_rvv(float* iBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + const uint64_t* in = (const uint64_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e64m8(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8(in, vl); + __riscv_vse32((uint32_t*)iBuffer, __riscv_vnsrl(vc, 0, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_deinterleave_real_32f_u_H */ diff --git a/kernels/volk/volk_32fc_deinterleave_real_64f.h b/kernels/volk/volk_32fc_deinterleave_real_64f.h index 31d8f3ec..5c6b0c95 100644 --- a/kernels/volk/volk_32fc_deinterleave_real_64f.h +++ b/kernels/volk/volk_32fc_deinterleave_real_64f.h @@ -240,4 +240,21 @@ static inline void volk_32fc_deinterleave_real_64f_u_avx2(double* iBuffer, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_deinterleave_real_64f_rvv(double* iBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + const uint64_t* in = (const uint64_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e64m8(n); + vuint32m4_t vi = __riscv_vnsrl(__riscv_vle64_v_u64m8(in, vl), 0, vl); + __riscv_vse64(iBuffer, __riscv_vfwcvt_f(__riscv_vreinterpret_f32m4(vi), vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_deinterleave_real_64f_u_H */ diff --git a/kernels/volk/volk_32fc_index_max_16u.h b/kernels/volk/volk_32fc_index_max_16u.h index 28b51766..781876d1 100644 --- a/kernels/volk/volk_32fc_index_max_16u.h +++ b/kernels/volk/volk_32fc_index_max_16u.h @@ -321,7 +321,7 @@ volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_ uint32_t i = 0; - for (; i> 3; ++i) { + for (; i < (num_bytes >> 3); ++i) { sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); @@ -466,4 +466,65 @@ static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target, #endif /*LV_HAVE_AVX2*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void +volk_32fc_index_max_16u_rvv(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) +{ + vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, 
__riscv_vsetvlmax_e32m4()); + vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2()); + vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2()); + size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)src0, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl); + vbool8_t m = __riscv_vmflt(vmax, v, vl); + vmax = __riscv_vfmax_tu(vmax, vmax, v, vl); + vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax), + __riscv_vfmv_v_f_f32m1(0, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmax, max, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void +volk_32fc_index_max_16u_rvvseg(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) +{ + vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); + vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2()); + vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2()); + size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)src0, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl); + vbool8_t m = __riscv_vmflt(vmax, v, vl); + vmax = __riscv_vfmax_tu(vmax, vmax, v, vl); + vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax), + __riscv_vfmv_v_f_f32m1(0, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmax, max, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/ diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h index fafff48c..993187ca 100644 --- a/kernels/volk/volk_32fc_index_max_32u.h +++ b/kernels/volk/volk_32fc_index_max_32u.h @@ -307,7 +307,7 @@ volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_ uint32_t i = 0; - for (; i> 3; ++i) { + for (; i < (num_bytes >> 3); ++i) { sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); @@ -509,4 +509,65 @@ volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_poi #endif /*LV_HAVE_NEON*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void +volk_32fc_index_max_32u_rvv(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +{ + vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { 
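+        /* Per-lane argmax: each pass computes |z|^2 = re*re + im*im, keeps a running
+         * per-lane maximum in vmax, and, under the mask m of lanes that saw a new
+         * maximum, merges the current element indices from vidx into vmaxi. vidx is
+         * advanced by vl every pass so it always holds global element indices; after
+         * the loop the per-lane maxima are reduced to a scalar and the first lane
+         * equal to that maximum supplies the reported index. */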
+ vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)src0, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl); + vbool8_t m = __riscv_vmflt(vmax, v, vl); + vmax = __riscv_vfmax_tu(vmax, vmax, v, vl); + vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax), + __riscv_vfmv_v_f_f32m1(0, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmax, max, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void +volk_32fc_index_max_32u_rvvseg(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +{ + vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)src0, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl); + vbool8_t m = __riscv_vmflt(vmax, v, vl); + vmax = __riscv_vfmax_tu(vmax, vmax, v, vl); + vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax), + __riscv_vfmv_v_f_f32m1(0, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmax, max, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /*INCLUDED_volk_32fc_index_max_32u_u_H*/ diff --git a/kernels/volk/volk_32fc_index_min_16u.h b/kernels/volk/volk_32fc_index_min_16u.h index 6cf6d844..706db915 100644 --- a/kernels/volk/volk_32fc_index_min_16u.h +++ b/kernels/volk/volk_32fc_index_min_16u.h @@ -462,4 +462,67 @@ static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target, #endif /*LV_HAVE_AVX2*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32fc_index_min_16u_rvv(uint16_t* target, + const lv_32fc_t* source, + uint32_t num_points) +{ + vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4()); + vuint16m2_t vmini = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2()); + vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2()); + size_t n = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; + for (size_t vl; n > 0; n -= vl, source += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)source, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl); + vbool8_t m = __riscv_vmfgt(vmin, v, vl); + vmin = __riscv_vfmin_tu(vmin, vmin, v, vl); + vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin), + __riscv_vfmv_v_f_f32m1(FLT_MAX, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmin, min, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void volk_32fc_index_min_16u_rvvseg(uint16_t* target, + const lv_32fc_t* source, + uint32_t num_points) +{ + vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4()); + vuint16m2_t vmini = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2()); + vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2()); + size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + for (size_t vl; n > 0; n -= vl, source += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)source, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl); + vbool8_t m = __riscv_vmfgt(vmin, v, vl); + vmin = __riscv_vfmin_tu(vmin, vmin, v, vl); + vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin), + __riscv_vfmv_v_f_f32m1(FLT_MAX, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmin, min, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/ diff --git a/kernels/volk/volk_32fc_index_min_32u.h b/kernels/volk/volk_32fc_index_min_32u.h index 5e409b99..807a3bb5 100644 --- a/kernels/volk/volk_32fc_index_min_32u.h +++ b/kernels/volk/volk_32fc_index_min_32u.h @@ -504,4 +504,67 @@ static inline void volk_32fc_index_min_32u_neon(uint32_t* target, #endif /*LV_HAVE_NEON*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32fc_index_min_32u_rvv(uint32_t* target, + const lv_32fc_t* source, + uint32_t num_points) +{ + vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, source += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)source, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl); + vbool8_t m = __riscv_vmfgt(vmin, v, vl); + vmin = __riscv_vfmin_tu(vmin, vmin, v, vl); + vmini = 
__riscv_vmerge_tu(vmini, vmini, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin), + __riscv_vfmv_v_f_f32m1(FLT_MAX, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmin, min, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void volk_32fc_index_min_32u_rvvseg(uint32_t* target, + const lv_32fc_t* source, + uint32_t num_points) +{ + vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, source += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)source, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl); + vbool8_t m = __riscv_vmfgt(vmin, v, vl); + vmin = __riscv_vfmin_tu(vmin, vmin, v, vl); + vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin), + __riscv_vfmv_v_f_f32m1(FLT_MAX, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmin, min, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /*INCLUDED_volk_32fc_index_min_32u_u_H*/ diff --git a/kernels/volk/volk_32fc_magnitude_32f.h b/kernels/volk/volk_32fc_magnitude_32f.h index eca00e24..7b4e44a5 100644 --- a/kernels/volk/volk_32fc_magnitude_32f.h +++ b/kernels/volk/volk_32fc_magnitude_32f.h @@ -420,5 +420,42 @@ static inline void volk_32fc_magnitude_32f_neon_fancy_sweet( } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_magnitude_32f_rvv(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + __riscv_vse32(magnitudeVector, __riscv_vfsqrt(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_magnitude_32f_rvvseg(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)complexVector, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0); + vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + __riscv_vse32(magnitudeVector, __riscv_vfsqrt(v, vl), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ #endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */ diff --git 
a/kernels/volk/volk_32fc_magnitude_squared_32f.h b/kernels/volk/volk_32fc_magnitude_squared_32f.h index e7b11ae9..24fa3a9a 100644 --- a/kernels/volk/volk_32fc_magnitude_squared_32f.h +++ b/kernels/volk/volk_32fc_magnitude_squared_32f.h @@ -350,5 +350,42 @@ static inline void volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_magnitude_squared_32f_rvv(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + __riscv_vse32(magnitudeVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_magnitude_squared_32f_rvvseg(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)complexVector, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0); + vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + __riscv_vse32(magnitudeVector, v, vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ #endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */ diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h index 759db24c..7d98b7c2 100644 --- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -344,4 +344,113 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector, } #endif /* LV_HAVE_AVX2 for unaligned */ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32fc_s32f_atan2_32f_rvv(float* outputVector, + const lv_32fc_t* inputVector, + const float normalizeFactor, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + + const vfloat32m2_t norm = __riscv_vfmv_v_f_f32m2(1 / normalizeFactor, vlmax); + const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax); + const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax); + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax); + const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax); + const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax); + const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax); + const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vuint64m4_t v = __riscv_vle64_v_u64m4((const uint64_t*)inputVector, vl); + vfloat32m2_t vr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 0, vl)); + vfloat32m2_t vi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 32, vl)); + vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(vi, vl), __riscv_vfabs(vr, vl), vl); + vfloat32m2_t x = 
__riscv_vfdiv( + __riscv_vmerge(vi, vr, mswap, vl), __riscv_vmerge(vr, vi, mswap, vl), vl); + vbool16_t mnan = __riscv_vmsgtu(__riscv_vfclass(x, vl), 0xFF, vl); + x = __riscv_vreinterpret_f32m2( + __riscv_vmerge(__riscv_vreinterpret_u32m2(x), 0, mnan, vl)); + + vfloat32m2_t xx = __riscv_vfmul(x, x, vl); + vfloat32m2_t p = c13; + p = __riscv_vfmadd(p, xx, c11, vl); + p = __riscv_vfmadd(p, xx, c9, vl); + p = __riscv_vfmadd(p, xx, c7, vl); + p = __riscv_vfmadd(p, xx, c5, vl); + p = __riscv_vfmadd(p, xx, c3, vl); + p = __riscv_vfmadd(p, xx, c1, vl); + p = __riscv_vfmul(p, x, vl); + + x = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl); + p = __riscv_vmerge(p, x, mswap, vl); + p = __riscv_vfadd_mu( + RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl); + + __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void volk_32fc_s32f_atan2_32f_rvvseg(float* outputVector, + const lv_32fc_t* inputVector, + const float normalizeFactor, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + + const vfloat32m2_t norm = __riscv_vfmv_v_f_f32m2(1 / normalizeFactor, vlmax); + const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax); + const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax); + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax); + const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax); + const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax); + const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax); + const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2x2_t v = __riscv_vlseg2e32_v_f32m2x2((const float*)inputVector, vl); + vfloat32m2_t vr = __riscv_vget_f32m2(v, 0), vi = __riscv_vget_f32m2(v, 1); + vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(vi, vl), __riscv_vfabs(vr, vl), vl); + vfloat32m2_t x = __riscv_vfdiv( + __riscv_vmerge(vi, vr, mswap, vl), __riscv_vmerge(vr, vi, mswap, vl), vl); + vbool16_t mnan = __riscv_vmsgtu(__riscv_vfclass(x, vl), 0xFF, vl); + x = __riscv_vreinterpret_f32m2( + __riscv_vmerge(__riscv_vreinterpret_u32m2(x), 0, mnan, vl)); + + vfloat32m2_t xx = __riscv_vfmul(x, x, vl); + vfloat32m2_t p = c13; + p = __riscv_vfmadd(p, xx, c11, vl); + p = __riscv_vfmadd(p, xx, c9, vl); + p = __riscv_vfmadd(p, xx, c7, vl); + p = __riscv_vfmadd(p, xx, c5, vl); + p = __riscv_vfmadd(p, xx, c3, vl); + p = __riscv_vfmadd(p, xx, c1, vl); + p = __riscv_vfmul(p, x, vl); + + x = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl); + p = __riscv_vmerge(p, x, mswap, vl); + p = __riscv_vfadd_mu( + RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl); + + __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_32fc_s32f_atan2_32f_u_H */ diff --git a/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h b/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h index c4bfc28e..51840e3b 100644 --- a/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h +++ b/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h @@ -253,4 +253,24 @@ volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer, #endif 
/* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32fc_s32f_deinterleave_real_16i_rvv(int16_t* iBuffer, + const lv_32fc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + const uint64_t* in = (const uint64_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e64m8(n); + vuint32m4_t vi = __riscv_vnsrl(__riscv_vle64_v_u64m8(in, vl), 0, vl); + vfloat32m4_t vif = __riscv_vfmul(__riscv_vreinterpret_f32m4(vi), scalar, vl); + __riscv_vse16(iBuffer, __riscv_vncvt_x(__riscv_vfcvt_x(vif, vl), vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H */ diff --git a/kernels/volk/volk_32fc_s32f_magnitude_16i.h b/kernels/volk/volk_32fc_s32f_magnitude_16i.h index 21e12e2d..f699ed72 100644 --- a/kernels/volk/volk_32fc_s32f_magnitude_16i.h +++ b/kernels/volk/volk_32fc_s32f_magnitude_16i.h @@ -302,4 +302,46 @@ static inline void volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_s32f_magnitude_16i_rvv(int16_t* magnitudeVector, + const lv_32fc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + v = __riscv_vfmul(__riscv_vfsqrt(v, vl), scalar, vl); + __riscv_vse16(magnitudeVector, __riscv_vfncvt_x(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_s32f_magnitude_16i_rvvseg(int16_t* magnitudeVector, + const lv_32fc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)complexVector, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0); + vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + v = __riscv_vfmul(__riscv_vfsqrt(v, vl), scalar, vl); + __riscv_vse16(magnitudeVector, __riscv_vfncvt_x(v, vl), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_u_H */ diff --git a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h index be9aa88a..f676758e 100644 --- a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h +++ b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h @@ -142,4 +142,167 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, #endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_s32f_power_spectrum_32f_rvv(float* logPowerOutput, + const lv_32fc_t* complexFFTInput, + const float normalizationFactor, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + +#if LOG_POLY_DEGREE == 6 + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3.1157899f, vlmax); + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(-3.3241990f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.5988452f, vlmax); + const vfloat32m2_t c2 
= __riscv_vfmv_v_f_f32m2(-1.2315303f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(3.1821337e-1f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-3.4436006e-2f, vlmax); +#elif LOG_POLY_DEGREE == 5 + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(2.8882704548164776201f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-2.52074962577807006663f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(1.48116647521213171641f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-0.465725644288844778798f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.0596515482674574969533f, vlmax); +#elif LOG_POLY_DEGREE == 4 + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.61761038894603480148f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.75647175389045657003f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(0.688243882994381274313f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-0.107254423828329604454f, vlmax); +#elif LOG_POLY_DEGREE == 3 + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(2.28330284476918490682f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-1.04913055217340124191f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.204446009836232697516f, vlmax); +#else +#error +#endif + + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vint32m2_t m1 = __riscv_vreinterpret_i32m2(cf1); + const vint32m2_t m2 = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax); + const vint32m2_t c127 = __riscv_vmv_v_x_i32m2(127, vlmax); + + const float normFactSq = 1.0 / (normalizationFactor * normalizationFactor); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexFFTInput += vl, logPowerOutput += vl) { + vl = __riscv_vsetvl_e32m2(n); + vuint64m4_t vc = __riscv_vle64_v_u64m4((const uint64_t*)complexFFTInput, vl); + vfloat32m2_t vr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 0, vl)); + vfloat32m2_t vi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 32, vl)); + vfloat32m2_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + v = __riscv_vfmul(v, normFactSq, vl); + + vfloat32m2_t a = __riscv_vfabs(v, vl); + vfloat32m2_t exp = __riscv_vfcvt_f( + __riscv_vsub(__riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), c127, vl), + vl); + vfloat32m2_t frac = __riscv_vreinterpret_f32m2( + __riscv_vor(__riscv_vand(__riscv_vreinterpret_i32m2(v), m2, vl), m1, vl)); + + vfloat32m2_t mant = c0; + mant = __riscv_vfmadd(mant, frac, c1, vl); + mant = __riscv_vfmadd(mant, frac, c2, vl); +#if LOG_POLY_DEGREE >= 4 + mant = __riscv_vfmadd(mant, frac, c3, vl); +#if LOG_POLY_DEGREE >= 5 + mant = __riscv_vfmadd(mant, frac, c4, vl); +#if LOG_POLY_DEGREE >= 6 + mant = __riscv_vfmadd(mant, frac, c5, vl); +#endif +#endif +#endif + v = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl); + v = __riscv_vfmul(v, volk_log2to10factor, vl); + + __riscv_vse32(logPowerOutput, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void +volk_32fc_s32f_power_spectrum_32f_rvvseg(float* logPowerOutput, + const lv_32fc_t* complexFFTInput, + const float normalizationFactor, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + +#if LOG_POLY_DEGREE == 6 + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3.1157899f, vlmax); + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(-3.3241990f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.5988452f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.2315303f, vlmax); + const vfloat32m2_t c1 = 
__riscv_vfmv_v_f_f32m2(3.1821337e-1f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-3.4436006e-2f, vlmax); +#elif LOG_POLY_DEGREE == 5 + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(2.8882704548164776201f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-2.52074962577807006663f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(1.48116647521213171641f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-0.465725644288844778798f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.0596515482674574969533f, vlmax); +#elif LOG_POLY_DEGREE == 4 + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.61761038894603480148f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.75647175389045657003f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(0.688243882994381274313f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-0.107254423828329604454f, vlmax); +#elif LOG_POLY_DEGREE == 3 + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(2.28330284476918490682f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-1.04913055217340124191f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.204446009836232697516f, vlmax); +#else +#error +#endif + + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vint32m2_t m1 = __riscv_vreinterpret_i32m2(cf1); + const vint32m2_t m2 = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax); + const vint32m2_t c127 = __riscv_vmv_v_x_i32m2(127, vlmax); + + const float normFactSq = 1.0 / (normalizationFactor * normalizationFactor); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexFFTInput += vl, logPowerOutput += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2x2_t vc = + __riscv_vlseg2e32_v_f32m2x2((const float*)complexFFTInput, vl); + vfloat32m2_t vr = __riscv_vget_f32m2(vc, 0); + vfloat32m2_t vi = __riscv_vget_f32m2(vc, 1); + vfloat32m2_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + v = __riscv_vfmul(v, normFactSq, vl); + + vfloat32m2_t a = __riscv_vfabs(v, vl); + vfloat32m2_t exp = __riscv_vfcvt_f( + __riscv_vsub(__riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), c127, vl), + vl); + vfloat32m2_t frac = __riscv_vreinterpret_f32m2( + __riscv_vor(__riscv_vand(__riscv_vreinterpret_i32m2(v), m2, vl), m1, vl)); + + vfloat32m2_t mant = c0; + mant = __riscv_vfmadd(mant, frac, c1, vl); + mant = __riscv_vfmadd(mant, frac, c2, vl); +#if LOG_POLY_DEGREE >= 4 + mant = __riscv_vfmadd(mant, frac, c3, vl); +#if LOG_POLY_DEGREE >= 5 + mant = __riscv_vfmadd(mant, frac, c4, vl); +#if LOG_POLY_DEGREE >= 6 + mant = __riscv_vfmadd(mant, frac, c5, vl); +#endif +#endif +#endif + v = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl); + v = __riscv_vfmul(v, volk_log2to10factor, vl); + + __riscv_vse32(logPowerOutput, v, vl); + } +} + +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H */ diff --git a/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h b/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h index 3ce071ca..1ae8ad92 100644 --- a/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h +++ b/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h @@ -170,4 +170,34 @@ volk_32fc_s32fc_rotator2puppet_32fc_u_avx_fma(lv_32fc_t* outVector, #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ +#ifdef LV_HAVE_RVV +static inline void volk_32fc_s32fc_rotator2puppet_32fc_rvv(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; + (*phase) /= 
hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_rvv( + outVector, inVector, &phase_inc_n, phase, num_points); +} +#endif /*LV_HAVE_RVV*/ + + +#ifdef LV_HAVE_RVVSEG +static inline void volk_32fc_s32fc_rotator2puppet_32fc_rvvseg(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_rvv( + outVector, inVector, &phase_inc_n, phase, num_points); +} +#endif /*LV_HAVE_RVVSEG*/ #endif /* INCLUDED_volk_32fc_s32fc_rotator2puppet_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h b/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h index bee1f068..e668e3c5 100644 --- a/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h +++ b/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h @@ -779,4 +779,158 @@ static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma(lv_32fc_t* outVect #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ +/* Note on the RVV implementation: + * The complex multiply was expanded, because we don't care about the corner cases. + * Otherwise, without -ffast-math, the compiler would inserts function calls, + * which invalidates all vector registers and spills them on each loop iteration. */ + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvv(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + vlmax = vlmax < ROTATOR_RELOAD ? vlmax : ROTATOR_RELOAD; + + lv_32fc_t inc = 1.0f; + vfloat32m2_t phr = __riscv_vfmv_v_f_f32m2(0, vlmax), phi = phr; + for (size_t i = 0; i < vlmax; ++i) { + lv_32fc_t ph = + lv_cmake(lv_creal(*phase) * lv_creal(inc) - lv_cimag(*phase) * lv_cimag(inc), + lv_creal(*phase) * lv_cimag(inc) + lv_cimag(*phase) * lv_creal(inc)); + phr = __riscv_vfslide1down(phr, lv_creal(ph), vlmax); + phi = __riscv_vfslide1down(phi, lv_cimag(ph), vlmax); + inc = lv_cmake( + lv_creal(*phase_inc) * lv_creal(inc) - lv_cimag(*phase_inc) * lv_cimag(inc), + lv_creal(*phase_inc) * lv_cimag(inc) + lv_cimag(*phase_inc) * lv_creal(inc)); + } + vfloat32m2_t incr = __riscv_vfmv_v_f_f32m2(lv_creal(inc), vlmax); + vfloat32m2_t inci = __riscv_vfmv_v_f_f32m2(lv_cimag(inc), vlmax); + + size_t vl = 0; + if (num_points > 0) + while (1) { + size_t n = num_points < ROTATOR_RELOAD ? num_points : ROTATOR_RELOAD; + num_points -= n; + + for (; n > 0; n -= vl, inVector += vl, outVector += vl) { + // vl + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvvseg(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + vlmax = vlmax < ROTATOR_RELOAD ? 
vlmax : ROTATOR_RELOAD; + + lv_32fc_t inc = 1.0f; + vfloat32m2_t phr = __riscv_vfmv_v_f_f32m2(0, vlmax), phi = phr; + for (size_t i = 0; i < vlmax; ++i) { + lv_32fc_t ph = + lv_cmake(lv_creal(*phase) * lv_creal(inc) - lv_cimag(*phase) * lv_cimag(inc), + lv_creal(*phase) * lv_cimag(inc) + lv_cimag(*phase) * lv_creal(inc)); + phr = __riscv_vfslide1down(phr, lv_creal(ph), vlmax); + phi = __riscv_vfslide1down(phi, lv_cimag(ph), vlmax); + inc = lv_cmake( + lv_creal(*phase_inc) * lv_creal(inc) - lv_cimag(*phase_inc) * lv_cimag(inc), + lv_creal(*phase_inc) * lv_cimag(inc) + lv_cimag(*phase_inc) * lv_creal(inc)); + } + vfloat32m2_t incr = __riscv_vfmv_v_f_f32m2(lv_creal(inc), vlmax); + vfloat32m2_t inci = __riscv_vfmv_v_f_f32m2(lv_cimag(inc), vlmax); + + size_t vl = 0; + if (num_points > 0) + while (1) { + size_t n = num_points < ROTATOR_RELOAD ? num_points : ROTATOR_RELOAD; + num_points -= n; + + for (; n > 0; n -= vl, inVector += vl, outVector += vl) { + // vl + +static inline void volk_32fc_x2_add_32fc_rvv(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) +{ + const float* ina = (const float*)aVector; + const float* inb = (const float*)bVector; + float* out = (float*)cVector; + size_t n = num_points * 2; + for (size_t vl; n > 0; n -= vl, ina += vl, inb += vl, out += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t va = __riscv_vle32_v_f32m8(ina, vl); + vfloat32m8_t vb = __riscv_vle32_v_f32m8(inb, vl); + __riscv_vse32(out, __riscv_vfadd(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32fc_x2_add_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h index 7b9aae3a..a5a4a9df 100644 --- a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h +++ b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h @@ -421,5 +421,72 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result #endif /*LV_HAVE_SSE3*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_rvv(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + vfloat32m2_t vsumr = __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2()); + vfloat32m2_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m2(n); + vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)input, vl); + vuint64m4_t vb = __riscv_vle64_v_u64m4((const uint64_t*)taps, vl); + vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl)); + vfloat32m2_t vbr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 0, vl)); + vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl)); + vfloat32m2_t vbi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 32, vl)); + vbi = __riscv_vfneg(vbi, vl); + vfloat32m2_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl); + vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl); + vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK2(vfadd, f, 32, vsumr); + vfloat32m1_t vi = RISCV_SHRINK2(vfadd, f, 32, vsumi); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl))); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include 
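+/* The rvvseg variant below lets vlseg2e32 deinterleave the complex samples into
+ * separate real/imaginary registers instead of the vle64/vnsrl split used by the
+ * plain RVV kernel above; RISCV_SHRINK2 (presumably from the volk_rvv_intrinsics.h
+ * helpers) folds the LMUL=2 accumulators to LMUL=1 before the final vfredusum. */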
+#include + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_rvvseg(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + vfloat32m2_t vsumr = __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2()); + vfloat32m2_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2x2_t va = __riscv_vlseg2e32_v_f32m2x2((const float*)input, vl); + vfloat32m2x2_t vb = __riscv_vlseg2e32_v_f32m2x2((const float*)taps, vl); + vfloat32m2_t var = __riscv_vget_f32m2(va, 0), vai = __riscv_vget_f32m2(va, 1); + vfloat32m2_t vbr = __riscv_vget_f32m2(vb, 0), vbi = __riscv_vget_f32m2(vb, 1); + vbi = __riscv_vfneg(vbi, vl); + vfloat32m2_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl); + vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl); + vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK2(vfadd, f, 32, vsumr); + vfloat32m1_t vi = RISCV_SHRINK2(vfadd, f, 32, vsumi); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl))); +} +#endif /*LV_HAVE_RVVSEG*/ #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/ diff --git a/kernels/volk/volk_32fc_x2_divide_32fc.h b/kernels/volk/volk_32fc_x2_divide_32fc.h index 3a013cb0..ceee6559 100644 --- a/kernels/volk/volk_32fc_x2_divide_32fc.h +++ b/kernels/volk/volk_32fc_x2_divide_32fc.h @@ -414,5 +414,66 @@ static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + + +static inline void volk_32fc_x2_divide_32fc_rvv(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) +{ + uint64_t* out = (uint64_t*)cVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, out += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl); + vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)bVector, vl); + vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl)); + vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl)); + vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl)); + vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl)); + vfloat32m4_t mul = __riscv_vfrdiv( + __riscv_vfmacc(__riscv_vfmul(vbi, vbi, vl), vbr, vbr, vl), 1.0f, vl); + vfloat32m4_t vr = __riscv_vfmul( + __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl), mul, vl); + vfloat32m4_t vi = __riscv_vfmul( + __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl), mul, vl); + vuint32m4_t vru = __riscv_vreinterpret_u32m4(vr); + vuint32m4_t viu = __riscv_vreinterpret_u32m4(vi); + vuint64m8_t v = + __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl); + __riscv_vse64(out, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_x2_divide_32fc_rvvseg(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl); + 
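+        // vlseg2e32 deinterleaves on load: tuple field 0 holds the real parts,
+        // field 1 the imaginary parts of the complex inputs.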
vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl); + vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1); + vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1); + vfloat32m4_t mul = __riscv_vfrdiv( + __riscv_vfmacc(__riscv_vfmul(vbi, vbi, vl), vbr, vbr, vl), 1.0f, vl); + vfloat32m4_t vr = __riscv_vfmul( + __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl), mul, vl); + vfloat32m4_t vi = __riscv_vfmul( + __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl), mul, vl); + __riscv_vsseg2e32_v_f32m4x2( + (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl); + } +} + +#endif /*LV_HAVE_RVVSEG*/ #endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h index 47d6f697..d4acab3a 100644 --- a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h +++ b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h @@ -730,5 +730,70 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result, #endif /*LV_HAVE_AVX && LV_HAVE_FMA*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32fc_x2_dot_prod_32fc_rvv(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + vfloat32m2_t vsumr = __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2()); + vfloat32m2_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m2(n); + vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)input, vl); + vuint64m4_t vb = __riscv_vle64_v_u64m4((const uint64_t*)taps, vl); + vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl)); + vfloat32m2_t vbr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 0, vl)); + vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl)); + vfloat32m2_t vbi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 32, vl)); + vfloat32m2_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl); + vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl); + vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK2(vfadd, f, 32, vsumr); + vfloat32m1_t vi = RISCV_SHRINK2(vfadd, f, 32, vsumi); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl))); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void volk_32fc_x2_dot_prod_32fc_rvvseg(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); + vfloat32m4_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)input, vl); + vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl); + vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1); + vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1); + vfloat32m4_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m4_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl); + vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl); + vsumi = __riscv_vfadd_tu(vsumi, vsumi, 
vi, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr); + vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl))); +} +#endif /*LV_HAVE_RVVSEG*/ #endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/ diff --git a/kernels/volk/volk_32fc_x2_multiply_32fc.h b/kernels/volk/volk_32fc_x2_multiply_32fc.h index 96cefed5..2db2929b 100644 --- a/kernels/volk/volk_32fc_x2_multiply_32fc.h +++ b/kernels/volk/volk_32fc_x2_multiply_32fc.h @@ -460,4 +460,55 @@ static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, #endif /* LV_HAVE_ORC */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_x2_multiply_32fc_rvv(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl); + vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)bVector, vl); + vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl)); + vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl)); + vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl)); + vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl)); + vfloat32m4_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m4_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl); + vuint32m4_t vru = __riscv_vreinterpret_u32m4(vr); + vuint32m4_t viu = __riscv_vreinterpret_u32m4(vi); + vuint64m8_t v = + __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl); + __riscv_vse64((uint64_t*)cVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_x2_multiply_32fc_rvvseg(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl); + vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl); + vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1); + vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1); + vfloat32m4_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m4_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl); + __riscv_vsseg2e32_v_f32m4x2( + (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h index 12e4948a..ce01d6d6 100644 --- a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h +++ b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h @@ -287,5 +287,56 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_x2_multiply_conjugate_32fc_rvv(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) +{ + 
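+    // Each complex pair is loaded as one 64-bit element and split into real/imag
+    // halves with narrowing shifts; a * conj(b) is then computed as
+    //   re = ar*br + ai*bi,  im = ai*br - ar*bi
+    // and re-interleaved through the widening add / multiply-accumulate below.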
size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl); + vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)bVector, vl); + vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl)); + vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl)); + vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl)); + vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl)); + vfloat32m4_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m4_t vi = __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl); + vuint32m4_t vru = __riscv_vreinterpret_u32m4(vr); + vuint32m4_t viu = __riscv_vreinterpret_u32m4(vi); + vuint64m8_t v = + __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl); + __riscv_vse64((uint64_t*)cVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_x2_multiply_conjugate_32fc_rvvseg(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl); + vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl); + vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1); + vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1); + vfloat32m4_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m4_t vi = __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl); + __riscv_vsseg2e32_v_f32m4x2( + (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl); + } +} + +#endif /*LV_HAVE_RVVSEG*/ #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h index 54ffbf0f..0b956c20 100644 --- a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h +++ b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h @@ -535,4 +535,62 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target, } #endif // LV_HAVE_SSE +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvv(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + float scalar, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m4(); + vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax); + vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax); + vfloat32m4_t vscale = __riscv_vfmv_v_f_f32m4(scalar, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, target += vl, points += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)points, vl); + vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl)); + vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl)); + vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl); + vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + __riscv_vse32(target, __riscv_vfmul(v, vscale, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void 
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvvseg(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + float scalar, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m4(); + vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax); + vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax); + vfloat32m4_t vscale = __riscv_vfmv_v_f_f32m4(scalar, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, target += vl, points += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)points, vl); + vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0); + vfloat32m4_t vbi = __riscv_vget_f32m4(vb, 1); + vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl); + vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + __riscv_vse32(target, __riscv_vfmul(v, vscale, vl), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H*/ diff --git a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h index b35bed5e..b27f7b7b 100644 --- a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h +++ b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h @@ -342,4 +342,69 @@ volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_neon(lv_32fc_t* cVector, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_rvv(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + vfloat32m2_t vbr = + __riscv_vfmv_v_f_f32m2(lv_creal(*scalar), __riscv_vsetvlmax_e32m2()); + vfloat32m2_t vbi = + __riscv_vfmv_v_f_f32m2(lv_cimag(*scalar), __riscv_vsetvlmax_e32m2()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, bVector += vl, aVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)bVector, vl); + vuint64m4_t vc = __riscv_vle64_v_u64m4((const uint64_t*)aVector, vl); + vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl)); + vfloat32m2_t vcr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 0, vl)); + vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl)); + vfloat32m2_t vci = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 32, vl)); + vfloat32m2_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m2_t vi = __riscv_vfnmsac(__riscv_vfmul(var, vbi, vl), vai, vbr, vl); + vuint32m2_t vru = __riscv_vreinterpret_u32m2(__riscv_vfadd(vr, vcr, vl)); + vuint32m2_t viu = __riscv_vreinterpret_u32m2(__riscv_vfadd(vi, vci, vl)); + vuint64m4_t v = + __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl); + __riscv_vse64((uint64_t*)cVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_rvvseg(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + vfloat32m4_t vbr = + __riscv_vfmv_v_f_f32m4(lv_creal(*scalar), __riscv_vsetvlmax_e32m4()); + vfloat32m4_t vbi = + __riscv_vfmv_v_f_f32m4(lv_cimag(*scalar), __riscv_vsetvlmax_e32m4()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = 
__riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl); + vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl); + vfloat32m4_t vcr = __riscv_vget_f32m4(vc, 0), vci = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1); + vfloat32m4_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m4_t vi = __riscv_vfnmsac(__riscv_vfmul(var, vbi, vl), vai, vbr, vl); + vr = __riscv_vfadd(vr, vcr, vl); + vi = __riscv_vfadd(vi, vci, vl); + __riscv_vsseg2e32_v_f32m4x2( + (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H */ diff --git a/kernels/volk/volk_32fc_x2_square_dist_32f.h b/kernels/volk/volk_32fc_x2_square_dist_32f.h index 4a93d5bf..b711bcf1 100644 --- a/kernels/volk/volk_32fc_x2_square_dist_32f.h +++ b/kernels/volk/volk_32fc_x2_square_dist_32f.h @@ -277,7 +277,7 @@ static inline void volk_32fc_x2_square_dist_32f_generic(float* target, float sq_dist; unsigned int i = 0; - for (; i> 3; ++i) { + for (; i < (num_bytes >> 3); ++i) { diff = src0[0] - points[i]; sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); @@ -374,4 +374,56 @@ static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target, #endif /*LV_HAVE_AVX2*/ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_x2_square_dist_32f_rvv(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m4(); + vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax); + vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, target += vl, points += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)points, vl); + vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl)); + vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl)); + vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl); + vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + __riscv_vse32(target, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_x2_square_dist_32f_rvvseg(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m4(); + vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax); + vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, target += vl, points += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)points, vl); + vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0); + vfloat32m4_t vbi = __riscv_vget_f32m4(vb, 1); + vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl); + vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + __riscv_vse32(target, v, vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/ diff --git a/kernels/volk/volk_32i_s32f_convert_32f.h b/kernels/volk/volk_32i_s32f_convert_32f.h index 678290fc..749cb1af 100644 --- a/kernels/volk/volk_32i_s32f_convert_32f.h +++ b/kernels/volk/volk_32i_s32f_convert_32f.h @@ -313,5 +313,21 @@ static inline void volk_32i_s32f_convert_32f_a_sse2(float* 
outputVector, } #endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32i_s32f_convert_32f_rvv(float* outputVector, + const int32_t* inputVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vfcvt_f(__riscv_vle32_v_i32m8(inputVector, vl), vl); + __riscv_vse32(outputVector, __riscv_vfmul(v, 1.0f / scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */ diff --git a/kernels/volk/volk_32i_x2_and_32i.h b/kernels/volk/volk_32i_x2_and_32i.h index d2bcf6b8..79e4f221 100644 --- a/kernels/volk/volk_32i_x2_and_32i.h +++ b/kernels/volk/volk_32i_x2_and_32i.h @@ -337,5 +337,22 @@ static inline void volk_32i_x2_and_32i_u_avx2(int32_t* cVector, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32i_x2_and_32i_rvv(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vint32m8_t va = __riscv_vle32_v_i32m8(aVector, vl); + vint32m8_t vb = __riscv_vle32_v_i32m8(bVector, vl); + __riscv_vse32(cVector, __riscv_vand(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32i_x2_and_32i_u_H */ diff --git a/kernels/volk/volk_32i_x2_or_32i.h b/kernels/volk/volk_32i_x2_or_32i.h index f3e4b769..3642f13d 100644 --- a/kernels/volk/volk_32i_x2_or_32i.h +++ b/kernels/volk/volk_32i_x2_or_32i.h @@ -336,5 +336,22 @@ static inline void volk_32i_x2_or_32i_u_avx2(int32_t* cVector, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32i_x2_or_32i_rvv(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vint32m8_t va = __riscv_vle32_v_i32m8(aVector, vl); + vint32m8_t vb = __riscv_vle32_v_i32m8(bVector, vl); + __riscv_vse32(cVector, __riscv_vor(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32i_x2_or_32i_u_H */ diff --git a/kernels/volk/volk_32u_byteswap.h b/kernels/volk/volk_32u_byteswap.h index a6ec86f8..d5d0613e 100644 --- a/kernels/volk/volk_32u_byteswap.h +++ b/kernels/volk/volk_32u_byteswap.h @@ -343,5 +343,53 @@ static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int n } #endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32u_byteswap_rvv(uint32_t* intsToSwap, unsigned int num_points) +{ + size_t n = num_points; + size_t vlmax = __riscv_vsetvlmax_e8m1(); + if (vlmax <= 256) { + vuint8m1_t vidx = __riscv_vreinterpret_u8m1( + __riscv_vsub(__riscv_vreinterpret_u32m1(__riscv_vid_v_u8m1(vlmax)), + 0x3020100 - 0x10203, + vlmax / 4)); + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e32m8(n); + vuint8m8_t v = + __riscv_vreinterpret_u8m8(__riscv_vle32_v_u32m8(intsToSwap, vl)); + v = RISCV_PERM8(__riscv_vrgather, v, vidx); + __riscv_vse32(intsToSwap, __riscv_vreinterpret_u32m8(v), vl); + } + } else { + vuint16m2_t vidx = __riscv_vreinterpret_u16m2( + __riscv_vsub(__riscv_vreinterpret_u64m2(__riscv_vid_v_u16m2(vlmax)), + 0x3000200010000 - 0x100020003, + vlmax / 4)); + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = 
__riscv_vsetvl_e32m8(n); + vuint8m8_t v = + __riscv_vreinterpret_u8m8(__riscv_vle32_v_u32m8(intsToSwap, vl)); + v = RISCV_PERM8(__riscv_vrgatherei16, v, vidx); + __riscv_vse32(intsToSwap, __riscv_vreinterpret_u32m8(v), vl); + } + } +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVA23 +#include + +static inline void volk_32u_byteswap_rva23(uint32_t* intsToSwap, unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e32m8(n); + vuint32m8_t v = __riscv_vle32_v_u32m8(intsToSwap, vl); + __riscv_vse32(intsToSwap, __riscv_vrev8(v, vl), vl); + } +} +#endif /* LV_HAVE_RVA23 */ #endif /* INCLUDED_volk_32u_byteswap_a_H */ diff --git a/kernels/volk/volk_32u_byteswappuppet_32u.h b/kernels/volk/volk_32u_byteswappuppet_32u.h index a6ef921f..4ad3deac 100644 --- a/kernels/volk/volk_32u_byteswappuppet_32u.h +++ b/kernels/volk/volk_32u_byteswappuppet_32u.h @@ -91,4 +91,26 @@ static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output, } #endif +#ifdef LV_HAVE_RVV +static inline void volk_32u_byteswappuppet_32u_rvv(uint32_t* output, + uint32_t* intsToSwap, + unsigned int num_points) +{ + + volk_32u_byteswap_rvv((uint32_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); +} +#endif + +#ifdef LV_HAVE_RVA23 +static inline void volk_32u_byteswappuppet_32u_rva23(uint32_t* output, + uint32_t* intsToSwap, + unsigned int num_points) +{ + + volk_32u_byteswap_rva23((uint32_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); +} +#endif + #endif diff --git a/kernels/volk/volk_32u_popcnt.h b/kernels/volk/volk_32u_popcnt.h index b8c371fb..3ad2f0aa 100644 --- a/kernels/volk/volk_32u_popcnt.h +++ b/kernels/volk/volk_32u_popcnt.h @@ -76,4 +76,22 @@ static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value) #endif /*LV_HAVE_SSE4_2*/ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32u_popcnt_rvv(uint32_t* ret, const uint32_t value) +{ + *ret = __riscv_vcpop(__riscv_vreinterpret_b4(__riscv_vmv_s_x_u64m1(value, 1)), 32); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVA22V +#include + +static inline void volk_32u_popcnt_rva22(uint32_t* ret, const uint32_t value) +{ + *ret = __riscv_cpop_32(value); +} +#endif /*LV_HAVE_RVA22V*/ + #endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/ diff --git a/kernels/volk/volk_32u_popcntpuppet_32u.h b/kernels/volk/volk_32u_popcntpuppet_32u.h index 19a17f56..b808eb00 100644 --- a/kernels/volk/volk_32u_popcntpuppet_32u.h +++ b/kernels/volk/volk_32u_popcntpuppet_32u.h @@ -18,9 +18,8 @@ static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points) { - unsigned int ii; - for (ii = 0; ii < num_points; ++ii) { - volk_32u_popcnt_generic(outVector + ii, *(inVector + ii)); + for (size_t i = 0; i < num_points; ++i) { + volk_32u_popcnt_generic(outVector + i, inVector[i]); } } #endif /* LV_HAVE_GENERIC */ @@ -30,11 +29,32 @@ static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points) { - unsigned int ii; - for (ii = 0; ii < num_points; ++ii) { - volk_32u_popcnt_a_sse4_2(outVector + ii, *(inVector + ii)); + for (size_t i = 0; i < num_points; ++i) { + volk_32u_popcnt_a_sse4_2(outVector + i, inVector[i]); } } #endif /* LV_HAVE_SSE4_2 */ +#ifdef LV_HAVE_RVV +static inline void volk_32u_popcntpuppet_32u_rvv(uint32_t* outVector, + const uint32_t* inVector, + unsigned int num_points) +{ 
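+    // Puppet wrapper: apply the single-word popcnt kernel element by element so
+    // the test harness can exercise it through the usual kernel interface.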
+ for (size_t i = 0; i < num_points; ++i) { + volk_32u_popcnt_rvv(outVector + i, inVector[i]); + } +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVA22V +static inline void volk_32u_popcntpuppet_32u_rva22(uint32_t* outVector, + const uint32_t* inVector, + unsigned int num_points) +{ + for (size_t i = 0; i < num_points; ++i) { + volk_32u_popcnt_rva22(outVector + i, inVector[i]); + } +} +#endif /* LV_HAVE_RVA22V */ + #endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */ diff --git a/kernels/volk/volk_32u_reverse_32u.h b/kernels/volk/volk_32u_reverse_32u.h index 62150ac6..ece8f48b 100644 --- a/kernels/volk/volk_32u_reverse_32u.h +++ b/kernels/volk/volk_32u_reverse_32u.h @@ -337,4 +337,57 @@ volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_poi #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32u_reverse_32u_rvv(uint32_t* out, const uint32_t* in, unsigned int num_points) +{ + size_t n = num_points; + + static const uint64_t tblLo[] = { + 0xE060A020C0408000, + 0xF070B030D0509010, + }; + static const uint64_t tblHi[] = { + 0x0E060A020C040800, + 0x0F070B030D050901, + }; + vuint8m1_t vtblLo = __riscv_vreinterpret_u8m1(__riscv_vle64_v_u64m1(tblLo, 2)); + vuint8m1_t vtblHi = __riscv_vreinterpret_u8m1(__riscv_vle64_v_u64m1(tblHi, 2)); + + size_t vlmax = __riscv_vsetvlmax_e8m1(); + vuint16m2_t vidx = __riscv_vreinterpret_u16m2( + __riscv_vsub(__riscv_vreinterpret_u64m2(__riscv_vid_v_u16m2(vlmax)), + 0x3000200010000 - 0x100020003, + vlmax / 4)); + for (size_t vl; n > 0; n -= vl, in += vl, out += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint8m4_t v = __riscv_vreinterpret_u8m4(__riscv_vle32_v_u32m4(in, vl)); + v = RISCV_PERM4(__riscv_vrgatherei16, v, vidx); + vuint8m4_t lo = __riscv_vand(v, 0xF, vl * 4); + lo = RISCV_LUT4(__riscv_vrgather, vtblLo, lo); + vuint8m4_t hi = __riscv_vsrl(v, 4, vl * 4); + hi = RISCV_LUT4(__riscv_vrgather, vtblHi, hi); + v = __riscv_vor(hi, lo, vl * 4); + __riscv_vse32(out, __riscv_vreinterpret_u32m4(v), vl); + } +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVA23 +#include + +static inline void +volk_32u_reverse_32u_rva23(uint32_t* out, const uint32_t* in, unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, out += vl) { + vl = __riscv_vsetvl_e32m8(n); + vuint32m8_t v = __riscv_vle32_v_u32m8(in, vl); + __riscv_vse32(out, __riscv_vbrev(v, vl), vl); + } +} +#endif /* LV_HAVE_RVA23 */ + #endif /* INCLUDED_volk_32u_reverse_32u_u_H */ diff --git a/kernels/volk/volk_64f_convert_32f.h b/kernels/volk/volk_64f_convert_32f.h index b5f9b507..67f6ae48 100644 --- a/kernels/volk/volk_64f_convert_32f.h +++ b/kernels/volk/volk_64f_convert_32f.h @@ -315,5 +315,20 @@ static inline void volk_64f_convert_32f_a_sse2(float* outputVector, } #endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_64f_convert_32f_rvv(float* outputVector, + const double* inputVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e64m8(n); + vfloat64m8_t v = __riscv_vle64_v_f64m8(inputVector, vl); + __riscv_vse32(outputVector, __riscv_vfncvt_f(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_64f_convert_32f_a_H */ diff --git a/kernels/volk/volk_64f_x2_add_64f.h b/kernels/volk/volk_64f_x2_add_64f.h index 867a5d3b..bf9024e8 100644 --- a/kernels/volk/volk_64f_x2_add_64f.h +++ b/kernels/volk/volk_64f_x2_add_64f.h @@ -244,4 +244,22 @@ static inline void 
volk_64f_x2_add_64f_a_avx(double* cVector, #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_64f_x2_add_64f_rvv(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e64m8(n); + vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl); + vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl); + __riscv_vse64(cVector, __riscv_vfadd(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_64f_x2_add_64f_u_H */ diff --git a/kernels/volk/volk_64f_x2_max_64f.h b/kernels/volk/volk_64f_x2_max_64f.h index 973605c7..e9ca3ef6 100644 --- a/kernels/volk/volk_64f_x2_max_64f.h +++ b/kernels/volk/volk_64f_x2_max_64f.h @@ -290,5 +290,22 @@ static inline void volk_64f_x2_max_64f_u_avx(double* cVector, } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_64f_x2_max_64f_rvv(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e64m8(n); + vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl); + vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl); + __riscv_vse64(cVector, __riscv_vfmax(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_64f_x2_max_64f_u_H */ diff --git a/kernels/volk/volk_64f_x2_min_64f.h b/kernels/volk/volk_64f_x2_min_64f.h index 970b843f..7652ef72 100644 --- a/kernels/volk/volk_64f_x2_min_64f.h +++ b/kernels/volk/volk_64f_x2_min_64f.h @@ -290,5 +290,22 @@ static inline void volk_64f_x2_min_64f_u_avx(double* cVector, } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_64f_x2_min_64f_rvv(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e64m8(n); + vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl); + vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl); + __riscv_vse64(cVector, __riscv_vfmin(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_64f_x2_min_64f_u_H */ diff --git a/kernels/volk/volk_64f_x2_multiply_64f.h b/kernels/volk/volk_64f_x2_multiply_64f.h index caab3aaa..57eb468a 100644 --- a/kernels/volk/volk_64f_x2_multiply_64f.h +++ b/kernels/volk/volk_64f_x2_multiply_64f.h @@ -244,4 +244,22 @@ static inline void volk_64f_x2_multiply_64f_a_avx(double* cVector, #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_64f_x2_multiply_64f_rvv(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e64m8(n); + vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl); + vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl); + __riscv_vse64(cVector, __riscv_vfmul(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_64f_x2_multiply_64f_u_H */ diff --git a/kernels/volk/volk_64u_byteswap.h b/kernels/volk/volk_64u_byteswap.h index 2fbf3cce..a8da031c 100644 --- a/kernels/volk/volk_64u_byteswap.h +++ b/kernels/volk/volk_64u_byteswap.h @@ -383,4 +383,53 @@ static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap, #endif /* 
LV_HAVE_SSSE3 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_64u_byteswap_rvv(uint64_t* intsToSwap, unsigned int num_points) +{ + size_t n = num_points; + size_t vlmax = __riscv_vsetvlmax_e8m1(); + if (vlmax <= 256) { + vuint8m1_t vidx = __riscv_vreinterpret_u8m1( + __riscv_vsub(__riscv_vreinterpret_u64m1(__riscv_vid_v_u8m1(vlmax)), + 0x0706050403020100 - 0x1020304050607, + vlmax / 8)); + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e64m8(n); + vuint8m8_t v = + __riscv_vreinterpret_u8m8(__riscv_vle64_v_u64m8(intsToSwap, vl)); + v = RISCV_PERM8(__riscv_vrgather, v, vidx); + __riscv_vse64(intsToSwap, __riscv_vreinterpret_u64m8(v), vl); + } + } else { + vuint16m2_t vid = __riscv_vid_v_u16m2(vlmax); + vuint16m2_t voff1 = __riscv_vand(vid, 0x7, vlmax); + vuint16m2_t voff2 = __riscv_vrsub(voff1, 0x7, vlmax); + vuint16m2_t vidx = __riscv_vadd(__riscv_vsub(vid, voff1, vlmax), voff2, vlmax); + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e64m8(n); + vuint8m8_t v = + __riscv_vreinterpret_u8m8(__riscv_vle64_v_u64m8(intsToSwap, vl)); + v = RISCV_PERM8(__riscv_vrgatherei16, v, vidx); + __riscv_vse64(intsToSwap, __riscv_vreinterpret_u64m8(v), vl); + } + } +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVA23 +#include + +static inline void volk_64u_byteswap_rva23(uint64_t* intsToSwap, unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e64m8(n); + vuint64m8_t v = __riscv_vle64_v_u64m8(intsToSwap, vl); + __riscv_vse64(intsToSwap, __riscv_vrev8(v, vl), vl); + } +} +#endif /* LV_HAVE_RVA23 */ + #endif /* INCLUDED_volk_64u_byteswap_a_H */ diff --git a/kernels/volk/volk_64u_byteswappuppet_64u.h b/kernels/volk/volk_64u_byteswappuppet_64u.h index c2b55bf4..2be3b0b7 100644 --- a/kernels/volk/volk_64u_byteswappuppet_64u.h +++ b/kernels/volk/volk_64u_byteswappuppet_64u.h @@ -92,4 +92,26 @@ static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output, } #endif +#ifdef LV_HAVE_RVV +static inline void volk_64u_byteswappuppet_64u_rvv(uint64_t* output, + uint64_t* intsToSwap, + unsigned int num_points) +{ + + volk_64u_byteswap_rvv((uint64_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); +} +#endif + +#ifdef LV_HAVE_RVA23 +static inline void volk_64u_byteswappuppet_64u_rva23(uint64_t* output, + uint64_t* intsToSwap, + unsigned int num_points) +{ + + volk_64u_byteswap_rva23((uint64_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); +} +#endif + #endif diff --git a/kernels/volk/volk_64u_popcnt.h b/kernels/volk/volk_64u_popcnt.h index 5c9b2a3a..fb12bbe1 100644 --- a/kernels/volk/volk_64u_popcnt.h +++ b/kernels/volk/volk_64u_popcnt.h @@ -116,5 +116,22 @@ static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value) } #endif /*LV_HAVE_NEON*/ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_64u_popcnt_rvv(uint64_t* ret, const uint64_t value) +{ + *ret = __riscv_vcpop(__riscv_vreinterpret_b2(__riscv_vmv_s_x_u64m1(value, 1)), 64); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVA22V +#include + +static inline void volk_64u_popcnt_rva22(uint64_t* ret, const uint64_t value) +{ + *ret = __riscv_cpop_64(value); +} +#endif /*LV_HAVE_RVA22V*/ #endif /*INCLUDED_volk_64u_popcnt_a_H*/ diff --git a/kernels/volk/volk_64u_popcntpuppet_64u.h b/kernels/volk/volk_64u_popcntpuppet_64u.h index 300d4fd1..245aeba1 100644 --- 
a/kernels/volk/volk_64u_popcntpuppet_64u.h +++ b/kernels/volk/volk_64u_popcntpuppet_64u.h @@ -19,11 +19,9 @@ static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points) { - unsigned int ii; - for (ii = 0; ii < num_points; ++ii) { - volk_64u_popcnt_generic(outVector + ii, num_points); + for (size_t i = 0; i < num_points; ++i) { + volk_64u_popcnt_generic(outVector + i, inVector[i]); } - memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t)); } #endif /* LV_HAVE_GENERIC */ @@ -32,11 +30,9 @@ static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points) { - unsigned int ii; - for (ii = 0; ii < num_points; ++ii) { - volk_64u_popcnt_a_sse4_2(outVector + ii, num_points); + for (size_t i = 0; i < num_points; ++i) { + volk_64u_popcnt_a_sse4_2(outVector + i, inVector[i]); } - memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t)); } #endif /* LV_HAVE_SSE4_2 */ @@ -45,12 +41,32 @@ static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points) { - unsigned int ii; - for (ii = 0; ii < num_points; ++ii) { - volk_64u_popcnt_neon(outVector + ii, num_points); + for (size_t i = 0; i < num_points; ++i) { + volk_64u_popcnt_neon(outVector + i, inVector[i]); } - memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t)); } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +static inline void volk_64u_popcntpuppet_64u_rvv(uint64_t* outVector, + const uint64_t* inVector, + unsigned int num_points) +{ + for (size_t i = 0; i < num_points; ++i) { + volk_64u_popcnt_rvv(outVector + i, inVector[i]); + } +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVA22V +static inline void volk_64u_popcntpuppet_64u_rva22(uint64_t* outVector, + const uint64_t* inVector, + unsigned int num_points) +{ + for (size_t i = 0; i < num_points; ++i) { + volk_64u_popcnt_rva22(outVector + i, inVector[i]); + } +} +#endif /* LV_HAVE_RVA22V */ + #endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */ diff --git a/kernels/volk/volk_8i_convert_16i.h b/kernels/volk/volk_8i_convert_16i.h index 36e929bb..0800f7c5 100644 --- a/kernels/volk/volk_8i_convert_16i.h +++ b/kernels/volk/volk_8i_convert_16i.h @@ -266,5 +266,20 @@ static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector, } #endif /* LV_HAVE_ORC */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8i_convert_16i_rvv(int16_t* outputVector, + const int8_t* inputVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e8m4(n); + vint16m8_t v = __riscv_vsext_vf2(__riscv_vle8_v_i8m4(inputVector, vl), vl); + __riscv_vse16(outputVector, __riscv_vsll(v, 8, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/kernels/volk/volk_8i_s32f_convert_32f.h b/kernels/volk/volk_8i_s32f_convert_32f.h index d904d25d..cd2c325e 100644 --- a/kernels/volk/volk_8i_s32f_convert_32f.h +++ b/kernels/volk/volk_8i_s32f_convert_32f.h @@ -350,5 +350,22 @@ static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector, } #endif /* LV_HAVE_ORC */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8i_s32f_convert_32f_rvv(float* outputVector, + const int8_t* inputVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += 
vl) { + vl = __riscv_vsetvl_e8m2(n); + vint16m4_t v = __riscv_vsext_vf2(__riscv_vle8_v_i8m2(inputVector, vl), vl); + __riscv_vse32( + outputVector, __riscv_vfmul(__riscv_vfwcvt_f(v, vl), 1.0f / scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ diff --git a/kernels/volk/volk_8ic_deinterleave_16i_x2.h b/kernels/volk/volk_8ic_deinterleave_16i_x2.h index 46b2e2e4..87d745b8 100644 --- a/kernels/volk/volk_8ic_deinterleave_16i_x2.h +++ b/kernels/volk/volk_8ic_deinterleave_16i_x2.h @@ -392,4 +392,26 @@ static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, } } #endif /* LV_HAVE_AVX2 */ + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8ic_deinterleave_16i_x2_rvv(int16_t* iBuffer, + int16_t* qBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) +{ + const uint16_t* in = (const uint16_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e16m8(n); + vuint16m8_t vc = __riscv_vle16_v_u16m8(in, vl); + vuint16m8_t vr = __riscv_vsll(vc, 8, vl); + vuint16m8_t vi = __riscv_vand(vc, 0xFF00, vl); + __riscv_vse16((uint16_t*)iBuffer, vr, vl); + __riscv_vse16((uint16_t*)qBuffer, vi, vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */ diff --git a/kernels/volk/volk_8ic_deinterleave_real_16i.h b/kernels/volk/volk_8ic_deinterleave_real_16i.h index bef47592..8814e5e1 100644 --- a/kernels/volk/volk_8ic_deinterleave_real_16i.h +++ b/kernels/volk/volk_8ic_deinterleave_real_16i.h @@ -300,4 +300,22 @@ static inline void volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, } } #endif /* LV_HAVE_AVX2 */ + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8ic_deinterleave_real_16i_rvv(int16_t* iBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) +{ + const int16_t* in = (const int16_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e16m8(n); + vint16m8_t v = __riscv_vle16_v_i16m8(in, vl); + __riscv_vse16(iBuffer, __riscv_vsra(__riscv_vsll(v, 8, vl), 1, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_8ic_deinterleave_real_16i_u_H */ diff --git a/kernels/volk/volk_8ic_deinterleave_real_8i.h b/kernels/volk/volk_8ic_deinterleave_real_8i.h index 116b1afb..2c409c69 100644 --- a/kernels/volk/volk_8ic_deinterleave_real_8i.h +++ b/kernels/volk/volk_8ic_deinterleave_real_8i.h @@ -402,4 +402,21 @@ static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8ic_deinterleave_real_8i_rvv(int8_t* iBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) +{ + const uint16_t* in = (const uint16_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e16m8(n); + vuint16m8_t vc = __riscv_vle16_v_u16m8(in, vl); + __riscv_vse8((uint8_t*)iBuffer, __riscv_vnsrl(vc, 0, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H */ diff --git a/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h b/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h index 8936a169..e0234b16 100644 --- a/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h +++ b/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h @@ -441,4 +441,28 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, } #endif /* 
LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8ic_s32f_deinterleave_32f_x2_rvv(float* iBuffer, + float* qBuffer, + const lv_8sc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + const uint16_t* in = (const uint16_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e16m4(n); + vuint16m4_t vc = __riscv_vle16_v_u16m4(in, vl); + vint8m2_t vr = __riscv_vreinterpret_i8m2(__riscv_vnsrl(vc, 0, vl)); + vint8m2_t vi = __riscv_vreinterpret_i8m2(__riscv_vnsrl(vc, 8, vl)); + vfloat32m8_t vrf = __riscv_vfwcvt_f(__riscv_vsext_vf2(vr, vl), vl); + vfloat32m8_t vif = __riscv_vfwcvt_f(__riscv_vsext_vf2(vi, vl), vl); + __riscv_vse32(iBuffer, __riscv_vfmul(vrf, 1.0f / scalar, vl), vl); + __riscv_vse32(qBuffer, __riscv_vfmul(vif, 1.0f / scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H */ diff --git a/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h b/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h index 37cb2555..7ec8958d 100644 --- a/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h +++ b/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h @@ -349,5 +349,24 @@ volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8ic_s32f_deinterleave_real_32f_rvv(float* iBuffer, + const lv_8sc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + const uint16_t* in = (const uint16_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e16m4(n); + vuint16m4_t vc = __riscv_vle16_v_u16m4(in, vl); + vint8m2_t vr = __riscv_vreinterpret_i8m2(__riscv_vnsrl(vc, 0, vl)); + vfloat32m8_t vrf = __riscv_vfwcvt_f(__riscv_vsext_vf2(vr, vl), vl); + __riscv_vse32(iBuffer, __riscv_vfmul(vrf, 1.0f / scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H */ diff --git a/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h b/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h index 5462ea67..5de0e312 100644 --- a/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h +++ b/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h @@ -274,4 +274,55 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8ic_x2_multiply_conjugate_16ic_rvv(lv_16sc_t* cVector, + const lv_8sc_t* aVector, + const lv_8sc_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e8m2(n); + vint16m4_t va = __riscv_vle16_v_i16m4((const int16_t*)aVector, vl); + vint16m4_t vb = __riscv_vle16_v_i16m4((const int16_t*)bVector, vl); + vint8m2_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 8, vl); + vint8m2_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 8, vl); + vint16m4_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl); + vint16m4_t vi = + __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl); + vuint16m4_t vru = __riscv_vreinterpret_u16m4(vr); + vuint16m4_t viu = __riscv_vreinterpret_u16m4(vi); + vuint32m8_t v = __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFF, viu, vl); + __riscv_vse32((uint32_t*)cVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + 
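+/* Segmented variant: vlseg2e8 splits the interleaved 8-bit I/Q on load, the
+ * products are widened to 16 bit, and vsseg2e16 re-interleaves on store. */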
+static inline void volk_8ic_x2_multiply_conjugate_16ic_rvvseg(lv_16sc_t* cVector, + const lv_8sc_t* aVector, + const lv_8sc_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e8m2(n); + vint8m2x2_t va = __riscv_vlseg2e8_v_i8m2x2((const int8_t*)aVector, vl); + vint8m2x2_t vb = __riscv_vlseg2e8_v_i8m2x2((const int8_t*)bVector, vl); + vint8m2_t var = __riscv_vget_i8m2(va, 0), vai = __riscv_vget_i8m2(va, 1); + vint8m2_t vbr = __riscv_vget_i8m2(vb, 0), vbi = __riscv_vget_i8m2(vb, 1); + vint16m4_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl); + vint16m4_t vi = + __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl); + __riscv_vsseg2e16_v_i16m4x2( + (int16_t*)cVector, __riscv_vcreate_v_i16m4x2(vr, vi), vl); + } +} + +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H */ diff --git a/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h b/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h index 318a7819..5316ada0 100644 --- a/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h +++ b/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h @@ -341,4 +341,63 @@ volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector, #endif /* LV_HAVE_AVX2*/ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_rvv(lv_32fc_t* cVector, + const lv_8sc_t* aVector, + const lv_8sc_t* bVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e8m1(n); + vint16m2_t va = __riscv_vle16_v_i16m2((const int16_t*)aVector, vl); + vint16m2_t vb = __riscv_vle16_v_i16m2((const int16_t*)bVector, vl); + vint8m1_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 8, vl); + vint8m1_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 8, vl); + vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl); + vint16m2_t vi = + __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl); + vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl); + vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl); + vuint32m4_t vru = __riscv_vreinterpret_u32m4(vrf); + vuint32m4_t viu = __riscv_vreinterpret_u32m4(vif); + vuint64m8_t v = + __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl); + __riscv_vse64((uint64_t*)cVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void +volk_8ic_x2_s32f_multiply_conjugate_32fc_rvvseg(lv_32fc_t* cVector, + const lv_8sc_t* aVector, + const lv_8sc_t* bVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e8m1(n); + vint8m1x2_t va = __riscv_vlseg2e8_v_i8m1x2((const int8_t*)aVector, vl); + vint8m1x2_t vb = __riscv_vlseg2e8_v_i8m1x2((const int8_t*)bVector, vl); + vint8m1_t var = __riscv_vget_i8m1(va, 0), vai = __riscv_vget_i8m1(va, 1); + vint8m1_t vbr = __riscv_vget_i8m1(vb, 0), vbi = __riscv_vget_i8m1(vb, 1); + vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl); + vint16m2_t vi = + __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl); + vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl); + 
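+        // The imaginary part gets the same 1.0 / scalar scaling as the real part,
+        // trading a per-element division for a multiply by the reciprocal.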
vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl); + __riscv_vsseg2e32_v_f32m4x2( + (float*)cVector, __riscv_vcreate_v_f32m4x2(vrf, vif), vl); + } +} + +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */ diff --git a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h index 51963efd..5314622b 100644 --- a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h +++ b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h @@ -20,11 +20,14 @@ typedef union { unsigned int* w; } p_decision_t; -static inline int parity(int x, unsigned char* Partab) +static inline int parity(int x) { - x ^= (x >> 16); - x ^= (x >> 8); - return Partab[x]; + x ^= x >> 16; + x ^= x >> 8; + x ^= x >> 4; + x ^= x >> 2; + x ^= x >> 1; + return x & 1; } static inline int chainback_viterbi(unsigned char* data, @@ -113,7 +116,6 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* dec, static unsigned char* X; static unsigned int excess = 6; static unsigned char* Branchtab; - static unsigned char Partab[256]; int d_polys[2] = { 79, 109 }; @@ -127,24 +129,12 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* dec, D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6), volk_get_alignment()); int state, i; - int cnt, ti; - - /* Initialize parity lookup table */ - for (i = 0; i < 256; i++) { - cnt = 0; - ti = i; - while (ti) { - if (ti & 1) - cnt++; - ti >>= 1; - } - Partab[i] = cnt & 1; - } + /* Initialize the branch table */ for (state = 0; state < d_numstates / 2; state++) { for (i = 0; i < rate; i++) { Branchtab[i * d_numstates / 2 + state] = - parity((2 * state) & d_polys[i], Partab) ? 255 : 0; + parity((2 * state) & d_polys[i]) ? 255 : 0; } } @@ -195,7 +185,6 @@ static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* dec, static unsigned char* X; static unsigned int excess = 6; static unsigned char* Branchtab; - static unsigned char Partab[256]; int d_polys[2] = { 79, 109 }; @@ -209,24 +198,12 @@ static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* dec, D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6), volk_get_alignment()); int state, i; - int cnt, ti; - - /* Initialize parity lookup table */ - for (i = 0; i < 256; i++) { - cnt = 0; - ti = i; - while (ti) { - if (ti & 1) - cnt++; - ti >>= 1; - } - Partab[i] = cnt & 1; - } + /* Initialize the branch table */ for (state = 0; state < d_numstates / 2; state++) { for (i = 0; i < rate; i++) { Branchtab[i * d_numstates / 2 + state] = - parity((2 * state) & d_polys[i], Partab) ? 255 : 0; + parity((2 * state) & d_polys[i]) ? 255 : 0; } } @@ -280,7 +257,6 @@ static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* dec, static unsigned char* X; static unsigned int excess = 6; static unsigned char* Branchtab; - static unsigned char Partab[256]; int d_polys[2] = { 79, 109 }; @@ -294,24 +270,12 @@ static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* dec, D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6), volk_get_alignment()); int state, i; - int cnt, ti; - - /* Initialize parity lookup table */ - for (i = 0; i < 256; i++) { - cnt = 0; - ti = i; - while (ti) { - if (ti & 1) - cnt++; - ti >>= 1; - } - Partab[i] = cnt & 1; - } + /* Initialize the branch table */ for (state = 0; state < d_numstates / 2; state++) { for (i = 0; i < rate; i++) { Branchtab[i * d_numstates / 2 + state] = - parity((2 * state) & d_polys[i], Partab) ? 
255 : 0; + parity((2 * state) & d_polys[i]) ? 255 : 0; } } @@ -363,7 +327,6 @@ static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* dec, static unsigned char* D; static unsigned int excess = 6; static unsigned char* Branchtab; - static unsigned char Partab[256]; int d_polys[2] = { 79, 109 }; @@ -378,24 +341,12 @@ static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* dec, volk_get_alignment()); int state, i; - int cnt, ti; - - /* Initialize parity lookup table */ - for (i = 0; i < 256; i++) { - cnt = 0; - ti = i; - while (ti) { - if (ti & 1) - cnt++; - ti >>= 1; - } - Partab[i] = cnt & 1; - } + /* Initialize the branch table */ for (state = 0; state < d_numstates / 2; state++) { for (i = 0; i < rate; i++) { Branchtab[i * d_numstates / 2 + state] = - parity((2 * state) & d_polys[i], Partab) ? 255 : 0; + parity((2 * state) & d_polys[i]) ? 255 : 0; } } @@ -427,4 +378,59 @@ static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* dec, #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_RVV +#include + +static inline void volk_8u_conv_k7_r2puppet_8u_rvv(unsigned char* dec, + unsigned char* syms, + unsigned int framebits) +{ + if (framebits < 12) + return; + + int d_numstates = (1 << 6); + static unsigned char* D; + static unsigned char* Y; + static unsigned char* X; + static unsigned int excess = 6; + static unsigned char* Branchtab; + + static int once = 1; + if (once) { + once = 0; + + X = (unsigned char*)volk_malloc(3 * d_numstates, volk_get_alignment()); + Y = X + d_numstates; + Branchtab = Y + d_numstates; + D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6), + volk_get_alignment()); + + /* Initialize the branch table */ + for (size_t state = 0; state < d_numstates / 2; state++) { + Branchtab[state] = parity(state & 39) * 255; + Branchtab[state + d_numstates / 2] = parity(state & 54) * 255; + } + } + + memset(X, 31, d_numstates); // unbias the old_metrics + memset(D, 0, (d_numstates / 8) * (framebits + 6)); // initialize decisions + + volk_8u_x4_conv_k7_r2_8u_rvv( + Y, X, syms, D, framebits / 2 - excess, excess, Branchtab); + + unsigned int min = X[0]; + int i = 0, state = 0; + for (i = 0; i < d_numstates; ++i) { + if (X[i] < min) { + min = X[i]; + state = i; + } + } + + chainback_viterbi(dec, framebits / 2 - excess, state, excess, D); + + return; +} +#endif /*LV_HAVE_RVV*/ + #endif /*INCLUDED_volk_8u_conv_k7_r2puppet_8u_H*/ diff --git a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h index 1464218a..5d03f03d 100644 --- a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h +++ b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h @@ -1153,5 +1153,84 @@ static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8u_x2_encodeframepolar_8u_rvv(unsigned char* frame, + unsigned char* temp, + unsigned int frame_size) +{ + unsigned int stage = log2_of_power_of_2(frame_size); + unsigned int frame_half = frame_size >> 1; + unsigned int num_branches = 1; + + while (stage) { + // encode stage + if (frame_half < 8) { + encodepolar_single_stage(frame, temp, num_branches, frame_half); + } else { + unsigned char *in = temp, *out = frame; + for (size_t branch = 0; branch < num_branches; ++branch) { + size_t n = frame_half; + for (size_t vl; n > 0; n -= vl, in += vl * 2, out += vl) { + vl = __riscv_vsetvl_e8m1(n); + vuint16m2_t vc = __riscv_vle16_v_u16m2((uint16_t*)in, vl); + vuint8m1_t v1 = __riscv_vnsrl(vc, 0, vl); + 
vuint8m1_t v2 = __riscv_vnsrl(vc, 8, vl); + __riscv_vse8(out, __riscv_vxor(v1, v2, vl), vl); + __riscv_vse8(out + frame_half, v2, vl); + } + out += frame_half; + } + } + memcpy(temp, frame, sizeof(unsigned char) * frame_size); + + // update all the parameters. + num_branches = num_branches << 1; + frame_half = frame_half >> 1; + --stage; + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_8u_x2_encodeframepolar_8u_rvvseg(unsigned char* frame, + unsigned char* temp, + unsigned int frame_size) +{ + unsigned int stage = log2_of_power_of_2(frame_size); + unsigned int frame_half = frame_size >> 1; + unsigned int num_branches = 1; + + while (stage) { + // encode stage + if (frame_half < 8) { + encodepolar_single_stage(frame, temp, num_branches, frame_half); + } else { + unsigned char *in = temp, *out = frame; + for (size_t branch = 0; branch < num_branches; ++branch) { + size_t n = frame_half; + for (size_t vl; n > 0; n -= vl, in += vl * 2, out += vl) { + vl = __riscv_vsetvl_e8m1(n); + vuint8m1x2_t vc = __riscv_vlseg2e8_v_u8m1x2(in, vl); + vuint8m1_t v1 = __riscv_vget_u8m1(vc, 0); + vuint8m1_t v2 = __riscv_vget_u8m1(vc, 1); + __riscv_vse8(out, __riscv_vxor(v1, v2, vl), vl); + __riscv_vse8(out + frame_half, v2, vl); + } + out += frame_half; + } + } + memcpy(temp, frame, sizeof(unsigned char) * frame_size); + + // update all the parameters. + num_branches = num_branches << 1; + frame_half = frame_half >> 1; + --stage; + } +} +#endif /*LV_HAVE_RVVSEG*/ #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */ diff --git a/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h b/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h index 4c45f757..e54befa4 100644 --- a/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h +++ b/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h @@ -169,4 +169,33 @@ volk_8u_x3_encodepolar_8u_x2_a_avx2(unsigned char* frame, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +static inline void volk_8u_x3_encodepolar_8u_x2_rvv(unsigned char* frame, + unsigned char* temp, + const unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) +{ + interleave_frozen_and_info_bits( + temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_8u_x2_encodeframepolar_8u_rvv(frame, temp, frame_size); +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVVSEG +static inline void +volk_8u_x3_encodepolar_8u_x2_rvvseg(unsigned char* frame, + unsigned char* temp, + const unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) +{ + interleave_frozen_and_info_bits( + temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_8u_x2_encodeframepolar_8u_rvvseg(frame, temp, frame_size); +} +#endif /* LV_HAVE_RVVSEG */ + #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLAR_8U_X2_A_H_ */ diff --git a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h index 496ca2e5..792168e0 100644 --- a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h +++ b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h @@ -156,5 +156,47 @@ volk_8u_x3_encodepolarpuppet_8u_a_avx2(unsigned char* frame, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +static inline void volk_8u_x3_encodepolarpuppet_8u_rvv(unsigned char* frame, + unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) +{ + if (frame_size < 1) { + return; + } + + frame_size = 
next_lower_power_of_two(frame_size); + unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, + volk_get_alignment()); + adjust_frozen_mask(frozen_bit_mask, frame_size); + volk_8u_x3_encodepolar_8u_x2_rvv( + frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_free(temp); +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVVSEG +static inline void +volk_8u_x3_encodepolarpuppet_8u_rvvseg(unsigned char* frame, + unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) +{ + if (frame_size < 1) { + return; + } + + frame_size = next_lower_power_of_two(frame_size); + unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, + volk_get_alignment()); + adjust_frozen_mask(frozen_bit_mask, frame_size); + volk_8u_x3_encodepolar_8u_x2_rvvseg( + frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_free(temp); +} +#endif /* LV_HAVE_RVVSEG */ #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLARPUPPET_8U_A_H_ */ diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h index 9750b665..cb2db11a 100644 --- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h +++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h @@ -63,11 +63,14 @@ static inline void renormalize(unsigned char* X) int i; unsigned char min = X[0]; - for (i = 0; i < NUMSTATES; i++) - if (min > X[i]) + for (i = 0; i < NUMSTATES; i++) { + if (min > X[i]) { min = X[i]; - for (i = 0; i < NUMSTATES; i++) + } + } + for (i = 0; i < NUMSTATES; i++) { X[i] -= min; + } } @@ -91,8 +94,9 @@ static inline void BFLY(int i, int PRECISIONSHIFT = 2; metricsum = 1; - for (j = 0; j < RATE; j++) + for (j = 0; j < RATE; j++) { metricsum += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]); + } metric = (metricsum >> METRICSHIFT) >> PRECISIONSHIFT; unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT); @@ -465,4 +469,210 @@ static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_RVV +#include + +static inline void volk_8u_x4_conv_k7_r2_8u_rvv(unsigned char* Y, + unsigned char* X, + unsigned char* syms, + unsigned char* dec, + unsigned int framebits, + unsigned int excess, + unsigned char* Branchtab) +{ + size_t vl = 256 / 8; + + size_t n = framebits + excess; + + if (__riscv_vlenb() == 128 / 8) { + vuint8m2_t vX0 = __riscv_vle8_v_u8m2(X, vl), + vX1 = __riscv_vle8_v_u8m2(X + vl, vl); + vuint8m2_t vY0 = __riscv_vle8_v_u8m2(Y, vl), + vY1 = __riscv_vle8_v_u8m2(Y + vl, vl); + vuint8m2_t vB0 = __riscv_vle8_v_u8m2(Branchtab, vl); + vuint8m2_t vB1 = __riscv_vle8_v_u8m2(Branchtab + vl, vl); + vuint8m2_t v63 = __riscv_vmv_v_x_u8m2(63, vl); + + for (size_t i = 0; i < n; ++i) { + // Butterfly + vuint8m2_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl); + vuint8m2_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl); + vuint8m2_t va = __riscv_vaaddu(va0, va1, 0, vl); + va = __riscv_vreinterpret_u8m2( + __riscv_vsrl(__riscv_vreinterpret_u16m2(va), 2, vl / 2)); + va = __riscv_vand(va, v63, vl); + vuint8m2_t vb = __riscv_vssubu(v63, va, vl); + vuint8m2_t vX0a = __riscv_vsaddu(vX0, va, vl); + vuint8m2_t vX1b = __riscv_vsaddu(vX1, vb, vl); + vuint8m2_t vX0b = __riscv_vsaddu(vX0, vb, vl); + vuint8m2_t vX1a = __riscv_vsaddu(vX1, va, vl); + vY0 = __riscv_vminu(vX1b, vX0a, vl); + vY1 = __riscv_vminu(vX1a, vX0b, vl); + + vuint16m4_t vX1ba = + __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl); + vX1b = 
__riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vX1ba), 0); + vX1a = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vX1ba), 1); + + vuint16m4_t vm = + __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl); + vY0 = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vm), 0); + vY1 = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vm), 1); + + __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY0, vX1b, vl), vl); + __riscv_vsm(&dec[8 * i + 4], __riscv_vmseq(vY1, vX1a, vl), vl); + + // Renormalize + vuint8m2_t vmin = __riscv_vminu(vY0, vY1, vl); + vmin = __riscv_vlmul_ext_u8m2( + __riscv_vredminu(vmin, __riscv_vlmul_trunc_u8m1(vmin), vl)); + vmin = __riscv_vrgather(vmin, 0, vl); + vY0 = __riscv_vsub(vY0, vmin, vl); + vY1 = __riscv_vsub(vY1, vmin, vl); + + vuint8m2_t tmp; // Swap pointers to old and new metrics + tmp = vX0; + vX0 = vY0; + vY0 = tmp; + tmp = vX1; + vX1 = vY1; + vY1 = tmp; + } + if (n & 1) { + __riscv_vse8(X, vY0, vl); + __riscv_vse8(X + vl, vY1, vl); + __riscv_vse8(Y, vX0, vl); + __riscv_vse8(Y + vl, vX1, vl); + } else { + __riscv_vse8(X, vX0, vl); + __riscv_vse8(X + vl, vX1, vl); + __riscv_vse8(Y, vY0, vl); + __riscv_vse8(Y + vl, vY1, vl); + } + } else if (__riscv_vlenb() == 256 / 8) { + vuint8m1_t vX0 = __riscv_vle8_v_u8m1(X, vl), + vX1 = __riscv_vle8_v_u8m1(X + vl, vl); + vuint8m1_t vY0 = __riscv_vle8_v_u8m1(Y, vl), + vY1 = __riscv_vle8_v_u8m1(Y + vl, vl); + vuint8m1_t vB0 = __riscv_vle8_v_u8m1(Branchtab, vl); + vuint8m1_t vB1 = __riscv_vle8_v_u8m1(Branchtab + vl, vl); + vuint8m1_t v63 = __riscv_vmv_v_x_u8m1(63, vl); + + for (size_t i = 0; i < n; ++i) { + // Butterfly + vuint8m1_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl); + vuint8m1_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl); + vuint8m1_t va = __riscv_vaaddu(va0, va1, 0, vl); + va = __riscv_vreinterpret_u8m1( + __riscv_vsrl(__riscv_vreinterpret_u16m1(va), 2, vl / 2)); + va = __riscv_vand(va, v63, vl); + vuint8m1_t vb = __riscv_vssubu(v63, va, vl); + vuint8m1_t vX0a = __riscv_vsaddu(vX0, va, vl); + vuint8m1_t vX1b = __riscv_vsaddu(vX1, vb, vl); + vuint8m1_t vX0b = __riscv_vsaddu(vX0, vb, vl); + vuint8m1_t vX1a = __riscv_vsaddu(vX1, va, vl); + vY0 = __riscv_vminu(vX1b, vX0a, vl); + vY1 = __riscv_vminu(vX1a, vX0b, vl); + + vuint16m2_t vX1ba = + __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl); + vX1b = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vX1ba), 0); + vX1a = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vX1ba), 1); + + vuint16m2_t vm = + __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl); + vY0 = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vm), 0); + vY1 = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vm), 1); + + __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY0, vX1b, vl), vl); + __riscv_vsm(&dec[8 * i + 4], __riscv_vmseq(vY1, vX1a, vl), vl); + + // Renormalize + vuint8m1_t vmin = __riscv_vminu(vY0, vY1, vl); + vmin = __riscv_vrgather(__riscv_vredminu(vmin, vmin, vl), 0, vl); + vY0 = __riscv_vsub(vY0, vmin, vl); + vY1 = __riscv_vsub(vY1, vmin, vl); + + vuint8m1_t tmp; // Swap pointers to old and new metrics + tmp = vX0; + vX0 = vY0; + vY0 = tmp; + tmp = vX1; + vX1 = vY1; + vY1 = tmp; + } + if (n & 1) { + __riscv_vse8(X, vY0, vl); + __riscv_vse8(X + vl, vY1, vl); + __riscv_vse8(Y, vX0, vl); + __riscv_vse8(Y + vl, vX1, vl); + } else { + __riscv_vse8(X, vX0, vl); + __riscv_vse8(X + vl, vX1, vl); + __riscv_vse8(Y, vY0, vl); + __riscv_vse8(Y + vl, vY1, vl); + } + } else { + vuint8mf2_t vX0 = __riscv_vle8_v_u8mf2(X, vl), + vX1 = __riscv_vle8_v_u8mf2(X + vl, vl); + vuint8mf2_t vY0 = __riscv_vle8_v_u8mf2(Y, 
vl), + vY1 = __riscv_vle8_v_u8mf2(Y + vl, vl); + vuint8mf2_t vB0 = __riscv_vle8_v_u8mf2(Branchtab, vl); + vuint8mf2_t vB1 = __riscv_vle8_v_u8mf2(Branchtab + vl, vl); + vuint8mf2_t v63 = __riscv_vmv_v_x_u8mf2(63, vl); + + for (size_t i = 0; i < n; ++i) { + // Butterfly + vuint8mf2_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl); + vuint8mf2_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl); + vuint8mf2_t va = __riscv_vaaddu(va0, va1, 0, vl); + va = __riscv_vreinterpret_u8mf2( + __riscv_vsrl(__riscv_vreinterpret_u16mf2(va), 2, vl / 2)); + va = __riscv_vand(va, v63, vl); + vuint8mf2_t vb = __riscv_vssubu(v63, va, vl); + vuint8mf2_t vX0a = __riscv_vsaddu(vX0, va, vl); + vuint8mf2_t vX1b = __riscv_vsaddu(vX1, vb, vl); + vuint8mf2_t vX0b = __riscv_vsaddu(vX0, vb, vl); + vuint8mf2_t vX1a = __riscv_vsaddu(vX1, va, vl); + vY0 = __riscv_vminu(vX1b, vX0a, vl); + vY1 = __riscv_vminu(vX1a, vX0b, vl); + + vuint8m1_t vX1ba = __riscv_vreinterpret_u8m1( + __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl)); + vuint8m1_t vY01 = __riscv_vreinterpret_u8m1( + __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl)); + + __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY01, vX1ba, vl * 2), vl * 2); + + // Renormalize + vuint8m1_t vmin = + __riscv_vrgather(__riscv_vredminu(vY01, vY01, vl * 2), 0, vl * 2); + vY01 = __riscv_vsub(vY01, vmin, vl * 2); + + vY0 = __riscv_vlmul_trunc_u8mf2(vY01); + vY1 = __riscv_vlmul_trunc_u8mf2(__riscv_vslidedown(vY01, vl, vl)); + + vuint8mf2_t tmp; // Swap pointers to old and new metrics + tmp = vX0; + vX0 = vY0; + vY0 = tmp; + tmp = vX1; + vX1 = vY1; + vY1 = tmp; + } + if (n & 1) { + __riscv_vse8(X, vY0, vl); + __riscv_vse8(X + vl, vY1, vl); + __riscv_vse8(Y, vX0, vl); + __riscv_vse8(Y + vl, vX1, vl); + } else { + __riscv_vse8(X, vX0, vl); + __riscv_vse8(X + vl, vX1, vl); + __riscv_vse8(Y, vY0, vl); + __riscv_vse8(Y + vl, vY1, vl); + } + } +} +#endif /*LV_HAVE_RVV*/ + #endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/ diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 2c160b2f..588db44f 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -93,12 +93,28 @@ execute_process( OUTPUT_VARIABLE arch_flag_lines OUTPUT_STRIP_TRAILING_WHITESPACE) +try_compile( + HAVE_RVV_INTRINSICS + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/cmake/Checks/check-rvv-intrinsics.c +) +if(HAVE_RVV_INTRINSICS) + message(STATUS "Checking RVV intrinsics - found") +else() + message(STATUS "Checking RVV intrinsics - not found") +endif() + macro(check_arch arch_name) set(flags ${ARGN}) set(have_${arch_name} TRUE) + + string(SUBSTRING "${arch_name}" 0 2 arch_prefix) foreach(flag ${flags}) if(MSVC AND (${flag} STREQUAL "/arch:SSE2" OR ${flag} STREQUAL "/arch:SSE")) # SSE/SSE2 is supported in MSVC since VS 2005 but flag not available when compiling 64-bit so do not check + elseif("${arch_prefix}" STREQUAL "rv" AND NOT HAVE_RVV_INTRINSICS) + message(STATUS "Skipping ${arch_name} due to missing RVV intrinsics support") + set(have_${arch_name} FALSE) else() include(CheckCXXCompilerFlag) set(have_flag have${flag}) diff --git a/tmpl/volk_cpu.tmpl.c b/tmpl/volk_cpu.tmpl.c index a4a06b0f..2cf2fa34 100644 --- a/tmpl/volk_cpu.tmpl.c +++ b/tmpl/volk_cpu.tmpl.c @@ -49,7 +49,7 @@ static int i_can_has_${arch.name} (void) { #if defined(CPU_FEATURES_ARCH_MIPS) if (GetMipsInfo().features.${check} == 0){ return 0; } #endif - %elif "riscv" in arch.name: + %elif "riscv" in arch.name or arch.name[:2] == "rv": #if defined(CPU_FEATURES_ARCH_RISCV) if (GetRiscvInfo().features.${check} == 0){ return 0; } #endif
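
Note on the parity rewrite in volk_8u_conv_k7_r2puppet_8u.h above: the patch drops the 256-entry Partab lookup and its per-call initialization loop in favour of a branchless xor-fold, so the puppet kernels no longer carry the static table. Below is a minimal standalone sketch (a hypothetical test harness, not part of this patch) that checks the folded parity against GCC/Clang's __builtin_parity; the helper name parity_fold is an assumption for illustration only.

#include <assert.h>

/* Same xor-fold as the patched parity(): each step folds the upper half of
 * the remaining bits into the lower half, so bit 0 ends up holding the
 * parity of the whole word. */
static int parity_fold(int x)
{
    x ^= x >> 16;
    x ^= x >> 8;
    x ^= x >> 4;
    x ^= x >> 2;
    x ^= x >> 1;
    return x & 1;
}

int main(void)
{
    for (int i = 0; i < (1 << 16); i++)
        assert(parity_fold(i) == (int)__builtin_parity((unsigned)i));
    return 0;
}

Since the branch-table setup masks its argument with the 7-bit polynomials 79 and 109 before calling parity(), only the low folds actually contribute there, but keeping the full fold makes the helper correct for any non-negative int.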