diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
index f0581eeaf..0b79cb82b 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -114,7 +114,6 @@ jobs:
submodules: 'recursive'
- uses: uraimo/run-on-arch-action@v2.5.0
name: Build in non-x86 container
- continue-on-error: ${{ contains(fromJson('["ppc64le", "s390x"]'), matrix.arch) }}
id: build
with:
arch: ${{ matrix.arch }}
@@ -153,7 +152,9 @@ jobs:
cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON ..
echo "Build with $(nproc) thread(s)"
make -j$(nproc)
- ./cpu_features/list_cpu_features
+ if [ -f ./cpu_features/list_cpu_features ]; then
+ ./cpu_features/list_cpu_features
+ fi
./apps/volk-config-info --alignment
./apps/volk-config-info --avail-machines
./apps/volk-config-info --all-machines
diff --git a/docs/kernels.dox b/docs/kernels.dox
index e6930ad2f..f67e44440 100644
--- a/docs/kernels.dox
+++ b/docs/kernels.dox
@@ -50,7 +50,9 @@
\li \subpage volk_32f_cos_32f
\li \subpage volk_32fc_s32f_atan2_32f
\li \subpage volk_32fc_s32fc_multiply_32fc
+\li \subpage volk_32fc_s32fc_multiply2_32fc
\li \subpage volk_32fc_s32fc_x2_rotator_32fc
+\li \subpage volk_32fc_s32fc_x2_rotator2_32fc
\li \subpage volk_32fc_s32f_deinterleave_real_16i
\li \subpage volk_32fc_s32f_magnitude_16i
\li \subpage volk_32fc_s32f_power_32fc
@@ -63,6 +65,7 @@
\li \subpage volk_32fc_x2_multiply_32fc
\li \subpage volk_32fc_x2_multiply_conjugate_32fc
\li \subpage volk_32fc_x2_s32fc_multiply_conjugate_add_32fc
+\li \subpage volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc
\li \subpage volk_32fc_x2_s32f_square_dist_scalar_mult_32f
\li \subpage volk_32fc_x2_square_dist_32f
\li \subpage volk_32f_exp_32f
diff --git a/kernels/volk/volk_32fc_s32fc_multiply2_32fc.h b/kernels/volk/volk_32fc_s32fc_multiply2_32fc.h
new file mode 100644
index 000000000..c1704041f
--- /dev/null
+++ b/kernels/volk/volk_32fc_s32fc_multiply2_32fc.h
@@ -0,0 +1,413 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_s32fc_multiply2_32fc
+ *
+ * \b Overview
+ *
+ * Multiplies the input complex vector by a complex scalar and returns
+ * the results.
+ *
+ * Dispatcher Prototype
+ * \code
+ * void volk_32fc_s32fc_multiply2_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * lv_32fc_t* scalar, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector to be multiplied.
+ * \li scalar: The complex scalar to multiply against aVector.
+ * \li num_points: The number of complex values in aVector.
+ *
+ * \b Outputs
+ * \li cVector: The vector where the results will be stored.
+ *
+ * \b Example
+ * Generate points around the unit circle and shift the phase pi/3 rad.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t scalar = lv_cmake((float)std::cos(M_PI/3.f), (float)std::sin(M_PI/3.f));
+ *
+ * float delta = 2.f*M_PI / (float)N;
+ * for(unsigned int ii = 0; ii < N/2; ++ii){
+ * // Generate points around the unit circle
+ * float real = std::cos(delta * (float)ii);
+ * float imag = std::sin(delta * (float)ii);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii+N/2] = lv_cmake(-real, -imag);
+ * }
+ *
+ * volk_32fc_s32fc_multiply2_32fc(out, in, &scalar, N);
+ *
+ * printf(" mag phase | mag phase\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("%+1.2f %+1.2f | %+1.2f %+1.2f\n",
+ * std::abs(in[ii]), std::arg(in[ii]),
+ * std::abs(out[ii]), std::arg(out[ii]));
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_s32fc_multiply2_32fc_u_H
+#define INCLUDED_volk_32fc_s32fc_multiply2_32fc_u_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_multiply2_32fc_u_avx_fma(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+ __m256 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm256_set1_ps(lv_creal(*scalar));
+ yh = _mm256_set1_ps(lv_cimag(*scalar));
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = x;
+
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_fmaddsub_ps(
+ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) * (*scalar);
+ }
+}
+#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_multiply2_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+ __m256 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm256_set1_ps(lv_creal(*scalar));
+ yh = _mm256_set1_ps(lv_cimag(*scalar));
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) * (*scalar);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void volk_32fc_s32fc_multiply2_32fc_u_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm_set_ps1(lv_creal(*scalar));
+ yh = _mm_set_ps1(lv_cimag(*scalar));
+
+ for (; number < halfPoints; number++) {
+
+ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm_storeu_ps((float*)c, z); // Store the results back into the C container
+
+ a += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = (*a) * (*scalar);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_s32fc_multiply2_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* scalar,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = num_points;
+
+ // unwrap loop
+ while (number >= 8) {
+ *cPtr++ = (*aPtr++) * (*scalar);
+ *cPtr++ = (*aPtr++) * (*scalar);
+ *cPtr++ = (*aPtr++) * (*scalar);
+ *cPtr++ = (*aPtr++) * (*scalar);
+ *cPtr++ = (*aPtr++) * (*scalar);
+ *cPtr++ = (*aPtr++) * (*scalar);
+ *cPtr++ = (*aPtr++) * (*scalar);
+ *cPtr++ = (*aPtr++) * (*scalar);
+ number -= 8;
+ }
+
+ // clean up any remaining
+ while (number-- > 0)
+ *cPtr++ = *aPtr++ * (*scalar);
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_s32fc_multiply2_32fc_u_H */
+#ifndef INCLUDED_volk_32fc_s32fc_multiply2_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_multiply2_32fc_a_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_multiply2_32fc_a_avx_fma(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+ __m256 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm256_set1_ps(lv_creal(*scalar));
+ yh = _mm256_set1_ps(lv_cimag(*scalar));
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = x;
+
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_fmaddsub_ps(
+ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm256_store_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) * (*scalar);
+ }
+}
+#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_multiply2_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+ __m256 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm256_set1_ps(lv_creal(*scalar));
+ yh = _mm256_set1_ps(lv_cimag(*scalar));
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm256_store_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) * (*scalar);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void volk_32fc_s32fc_multiply2_32fc_a_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm_set_ps1(lv_creal(*scalar));
+ yh = _mm_set_ps1(lv_cimag(*scalar));
+
+ for (; number < halfPoints; number++) {
+
+ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm_store_ps((float*)c, z); // Store the results back into the C container
+
+ a += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = (*a) * (*scalar);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_s32fc_multiply2_32fc_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* scalar,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = num_points;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4x2_t a_val, scalar_val;
+ float32x4x2_t tmp_imag;
+
+ scalar_val.val[0] = vld1q_dup_f32((const float*)scalar);
+ scalar_val.val[1] = vld1q_dup_f32(((const float*)scalar) + 1);
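+ // De-interleaved complex multiply per lane: re = ar*sr - ai*si, im = ai*sr + ar*si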
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)aPtr);
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]);
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]);
+
+ tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]);
+ tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]);
+
+ vst2q_f32((float*)cPtr, tmp_imag);
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = *aPtr++ * (*scalar);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#endif /* INCLUDED_volk_32fc_s32fc_multiply2_32fc_a_H */
diff --git a/kernels/volk/volk_32fc_s32fc_multiply_32fc.h b/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
index 1593b7cbe..cd5d1e97e 100644
--- a/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
+++ b/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
@@ -10,6 +10,12 @@
/*!
* \page volk_32fc_s32fc_multiply_32fc
*
+ * \b Deprecation
+ *
+ * This kernel is deprecated because passing an `lv_32fc_t` by value results in
+ * undefined behaviour, which causes segmentation faults on some architectures.
+ * Use `volk_32fc_s32fc_multiply2_32fc`, which takes the scalar by pointer, instead.
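+ *
+ * A minimal migration sketch (out, in, and num_points are illustrative names,
+ * not part of the API):
+ * \code
+ * lv_32fc_t scalar = lv_cmake(0.5f, -0.5f);
+ * // deprecated: scalar passed by value
+ * // volk_32fc_s32fc_multiply_32fc(out, in, scalar, num_points);
+ * // preferred: scalar passed by pointer
+ * volk_32fc_s32fc_multiply2_32fc(out, in, &scalar, num_points);
+ * \endcode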
+ *
* \b Overview
*
* Multiplies the input complex vector by a complex scalar and returns
@@ -66,137 +72,39 @@
#include <float.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_32fc_s32fc_multiply2_32fc.h>
#include <volk/volk_complex.h>
#if LV_HAVE_AVX && LV_HAVE_FMA
-#include <immintrin.h>
static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points)
{
- unsigned int number = 0;
- unsigned int i = 0;
- const unsigned int quarterPoints = num_points / 4;
- unsigned int isodd = num_points & 3;
- __m256 x, yl, yh, z, tmp1, tmp2;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
-
- // Set up constant scalar vector
- yl = _mm256_set1_ps(lv_creal(scalar));
- yh = _mm256_set1_ps(lv_cimag(scalar));
-
- for (; number < quarterPoints; number++) {
- x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-
- tmp1 = x;
-
- x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
-
- tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
- z = _mm256_fmaddsub_ps(
- tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
- _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
-
- a += 4;
- c += 4;
- }
-
- for (i = num_points - isodd; i < num_points; i++) {
- *c++ = (*a++) * scalar;
- }
+ volk_32fc_s32fc_multiply2_32fc_u_avx_fma(cVector, aVector, &scalar, num_points);
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX
-#include <immintrin.h>
static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points)
{
- unsigned int number = 0;
- unsigned int i = 0;
- const unsigned int quarterPoints = num_points / 4;
- unsigned int isodd = num_points & 3;
- __m256 x, yl, yh, z, tmp1, tmp2;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
-
- // Set up constant scalar vector
- yl = _mm256_set1_ps(lv_creal(scalar));
- yh = _mm256_set1_ps(lv_cimag(scalar));
-
- for (; number < quarterPoints; number++) {
- x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-
- tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
- x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
-
- tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
- z = _mm256_addsub_ps(tmp1,
- tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
- _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
-
- a += 4;
- c += 4;
- }
-
- for (i = num_points - isodd; i < num_points; i++) {
- *c++ = (*a++) * scalar;
- }
+ volk_32fc_s32fc_multiply2_32fc_u_avx(cVector, aVector, &scalar, num_points);
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
-
- __m128 x, yl, yh, z, tmp1, tmp2;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
-
- // Set up constant scalar vector
- yl = _mm_set_ps1(lv_creal(scalar));
- yh = _mm_set_ps1(lv_cimag(scalar));
-
- for (; number < halfPoints; number++) {
-
- x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-
- tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
- x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
-
- tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
- z = _mm_addsub_ps(tmp1,
- tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
- _mm_storeu_ps((float*)c, z); // Store the results back into the C container
-
- a += 2;
- c += 2;
- }
-
- if ((num_points % 2) != 0) {
- *c = (*a) * scalar;
- }
+ volk_32fc_s32fc_multiply2_32fc_u_sse3(cVector, aVector, &scalar, num_points);
}
#endif /* LV_HAVE_SSE */
@@ -207,26 +115,7 @@ static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector,
const lv_32fc_t scalar,
unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- unsigned int number = num_points;
-
- // unwrap loop
- while (number >= 8) {
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- *cPtr++ = (*aPtr++) * scalar;
- number -= 8;
- }
-
- // clean up any remaining
- while (number-- > 0)
- *cPtr++ = *aPtr++ * scalar;
+ volk_32fc_s32fc_multiply2_32fc_generic(cVector, aVector, &scalar, num_points);
}
#endif /* LV_HAVE_GENERIC */
@@ -241,172 +130,47 @@ static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector,
#include <volk/volk_complex.h>
#if LV_HAVE_AVX && LV_HAVE_FMA
-#include <immintrin.h>
static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points)
{
- unsigned int number = 0;
- unsigned int i = 0;
- const unsigned int quarterPoints = num_points / 4;
- unsigned int isodd = num_points & 3;
- __m256 x, yl, yh, z, tmp1, tmp2;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
-
- // Set up constant scalar vector
- yl = _mm256_set1_ps(lv_creal(scalar));
- yh = _mm256_set1_ps(lv_cimag(scalar));
-
- for (; number < quarterPoints; number++) {
- x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-
- tmp1 = x;
-
- x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
-
- tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
- z = _mm256_fmaddsub_ps(
- tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
- _mm256_store_ps((float*)c, z); // Store the results back into the C container
-
- a += 4;
- c += 4;
- }
-
- for (i = num_points - isodd; i < num_points; i++) {
- *c++ = (*a++) * scalar;
- }
+ volk_32fc_s32fc_multiply2_32fc_a_avx_fma(cVector, aVector, &scalar, num_points);
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX
-#include <immintrin.h>
static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points)
{
- unsigned int number = 0;
- unsigned int i = 0;
- const unsigned int quarterPoints = num_points / 4;
- unsigned int isodd = num_points & 3;
- __m256 x, yl, yh, z, tmp1, tmp2;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
-
- // Set up constant scalar vector
- yl = _mm256_set1_ps(lv_creal(scalar));
- yh = _mm256_set1_ps(lv_cimag(scalar));
-
- for (; number < quarterPoints; number++) {
- x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-
- tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
- x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
-
- tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
- z = _mm256_addsub_ps(tmp1,
- tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
- _mm256_store_ps((float*)c, z); // Store the results back into the C container
-
- a += 4;
- c += 4;
- }
-
- for (i = num_points - isodd; i < num_points; i++) {
- *c++ = (*a++) * scalar;
- }
+ volk_32fc_s32fc_multiply2_32fc_a_avx(cVector, aVector, &scalar, num_points);
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
-
- __m128 x, yl, yh, z, tmp1, tmp2;
- lv_32fc_t* c = cVector;
- const lv_32fc_t* a = aVector;
-
- // Set up constant scalar vector
- yl = _mm_set_ps1(lv_creal(scalar));
- yh = _mm_set_ps1(lv_cimag(scalar));
-
- for (; number < halfPoints; number++) {
-
- x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-
- tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-
- x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
-
- tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-
- z = _mm_addsub_ps(tmp1,
- tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
- _mm_store_ps((float*)c, z); // Store the results back into the C container
-
- a += 2;
- c += 2;
- }
-
- if ((num_points % 2) != 0) {
- *c = (*a) * scalar;
- }
+ volk_32fc_s32fc_multiply2_32fc_a_sse3(cVector, aVector, &scalar, num_points);
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
-#include <arm_neon.h>
static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector,
const lv_32fc_t* aVector,
const lv_32fc_t scalar,
unsigned int num_points)
{
- lv_32fc_t* cPtr = cVector;
- const lv_32fc_t* aPtr = aVector;
- unsigned int number = num_points;
- unsigned int quarter_points = num_points / 4;
-
- float32x4x2_t a_val, scalar_val;
- float32x4x2_t tmp_imag;
-
- scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
- scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
- for (number = 0; number < quarter_points; ++number) {
- a_val = vld2q_f32((float*)aPtr);
- tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]);
- tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]);
-
- tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]);
- tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]);
-
- vst2q_f32((float*)cPtr, tmp_imag);
- aPtr += 4;
- cPtr += 4;
- }
-
- for (number = quarter_points * 4; number < num_points; number++) {
- *cPtr++ = *aPtr++ * scalar;
- }
+ volk_32fc_s32fc_multiply2_32fc_neon(cVector, aVector, &scalar, num_points);
}
#endif /* LV_HAVE_NEON */
diff --git a/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h b/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h
new file mode 100644
index 000000000..3ce071ca4
--- /dev/null
+++ b/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h
@@ -0,0 +1,173 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+
+#ifndef INCLUDED_volk_32fc_s32fc_rotator2puppet_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_rotator2puppet_32fc_a_H
+
+
+#include <stdio.h>
+#include <volk/volk_32fc_s32fc_x2_rotator2_32fc.h>
+#include <volk/volk_complex.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_s32fc_rotator2puppet_32fc_generic(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, 0.95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
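+ // Normalize the phase increment to unit magnitude so the rotator under test
+ // neither decays nor grows the signal.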
+ const lv_32fc_t phase_inc_n =
+ *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
+ volk_32fc_s32fc_x2_rotator2_32fc_generic(
+ outVector, inVector, &phase_inc_n, phase, num_points);
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+#include <volk/volk_neon_intrinsics.h>
+
+static inline void volk_32fc_s32fc_rotator2puppet_32fc_neon(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, 0.95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
+ volk_32fc_s32fc_x2_rotator2_32fc_neon(
+ outVector, inVector, &phase_inc_n, phase, num_points);
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32fc_s32fc_rotator2puppet_32fc_a_sse4_1(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
+ volk_32fc_s32fc_x2_rotator2_32fc_a_sse4_1(
+ outVector, inVector, &phase_inc_n, phase, num_points);
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+static inline void
+volk_32fc_s32fc_rotator2puppet_32fc_u_sse4_1(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
+ volk_32fc_s32fc_x2_rotator2_32fc_u_sse4_1(
+ outVector, inVector, &phase_inc_n, phase, num_points);
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_rotator2puppet_32fc_a_avx(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
+ volk_32fc_s32fc_x2_rotator2_32fc_a_avx(
+ outVector, inVector, &phase_inc_n, phase, num_points);
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_rotator2puppet_32fc_u_avx(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
+ volk_32fc_s32fc_x2_rotator2_32fc_u_avx(
+ outVector, inVector, &phase_inc_n, phase, num_points);
+}
+
+#endif /* LV_HAVE_AVX */
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void
+volk_32fc_s32fc_rotator2puppet_32fc_a_avx_fma(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
+ volk_32fc_s32fc_x2_rotator2_32fc_a_avx_fma(
+ outVector, inVector, &phase_inc_n, phase, num_points);
+}
+
+#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
+
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void
+volk_32fc_s32fc_rotator2puppet_32fc_u_avx_fma(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc));
+ volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma(
+ outVector, inVector, &phase_inc_n, phase, num_points);
+}
+
+#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
+
+#endif /* INCLUDED_volk_32fc_s32fc_rotator2puppet_32fc_a_H */
diff --git a/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h b/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h
deleted file mode 100644
index e328a3112..000000000
--- a/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h
+++ /dev/null
@@ -1,169 +0,0 @@
-/* -*- c++ -*- */
-/*
- * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
- *
- * This file is part of VOLK
- *
- * SPDX-License-Identifier: LGPL-3.0-or-later
- */
-
-
-#ifndef INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H
-#define INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H
-
-
-#include <stdio.h>
-#include <volk/volk_32fc_s32fc_x2_rotator_32fc.h>
-#include <volk/volk_complex.h>
-
-
-#ifdef LV_HAVE_GENERIC
-
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector,
- const lv_32fc_t* inVector,
- const lv_32fc_t phase_inc,
- unsigned int num_points)
-{
- lv_32fc_t phase[1] = { lv_cmake(.3f, 0.95393f) };
- (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- const lv_32fc_t phase_inc_n =
- phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
- volk_32fc_s32fc_x2_rotator_32fc_generic(
- outVector, inVector, phase_inc_n, phase, num_points);
-}
-
-#endif /* LV_HAVE_GENERIC */
-
-
-#ifdef LV_HAVE_NEON
-#include <arm_neon.h>
-#include <volk/volk_neon_intrinsics.h>
-
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector,
- const lv_32fc_t* inVector,
- const lv_32fc_t phase_inc,
- unsigned int num_points)
-{
- lv_32fc_t phase[1] = { lv_cmake(.3f, 0.95393f) };
- (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- const lv_32fc_t phase_inc_n =
- phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
- volk_32fc_s32fc_x2_rotator_32fc_neon(
- outVector, inVector, phase_inc_n, phase, num_points);
-}
-
-#endif /* LV_HAVE_NEON */
-
-
-#ifdef LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector,
- const lv_32fc_t* inVector,
- const lv_32fc_t phase_inc,
- unsigned int num_points)
-{
- lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
- (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- const lv_32fc_t phase_inc_n =
- phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
- volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(
- outVector, inVector, phase_inc_n, phase, num_points);
-}
-
-#endif /* LV_HAVE_SSE4_1 */
-
-
-#ifdef LV_HAVE_SSE4_1
-#include <smmintrin.h>
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector,
- const lv_32fc_t* inVector,
- const lv_32fc_t phase_inc,
- unsigned int num_points)
-{
- lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
- (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- const lv_32fc_t phase_inc_n =
- phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
- volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(
- outVector, inVector, phase_inc_n, phase, num_points);
-}
-
-#endif /* LV_HAVE_SSE4_1 */
-
-
-#ifdef LV_HAVE_AVX
-#include <immintrin.h>
-
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector,
- const lv_32fc_t* inVector,
- const lv_32fc_t phase_inc,
- unsigned int num_points)
-{
- lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
- (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- const lv_32fc_t phase_inc_n =
- phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
- volk_32fc_s32fc_x2_rotator_32fc_a_avx(
- outVector, inVector, phase_inc_n, phase, num_points);
-}
-
-#endif /* LV_HAVE_AVX */
-
-
-#ifdef LV_HAVE_AVX
-#include <immintrin.h>
-
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector,
- const lv_32fc_t* inVector,
- const lv_32fc_t phase_inc,
- unsigned int num_points)
-{
- lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
- (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- const lv_32fc_t phase_inc_n =
- phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
- volk_32fc_s32fc_x2_rotator_32fc_u_avx(
- outVector, inVector, phase_inc_n, phase, num_points);
-}
-
-#endif /* LV_HAVE_AVX */
-
-#if LV_HAVE_AVX && LV_HAVE_FMA
-#include <immintrin.h>
-
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVector,
- const lv_32fc_t* inVector,
- const lv_32fc_t phase_inc,
- unsigned int num_points)
-{
- lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
- (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- const lv_32fc_t phase_inc_n =
- phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
- volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(
- outVector, inVector, phase_inc_n, phase, num_points);
-}
-
-#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
-
-
-#if LV_HAVE_AVX && LV_HAVE_FMA
-#include <immintrin.h>
-
-static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx_fma(lv_32fc_t* outVector,
- const lv_32fc_t* inVector,
- const lv_32fc_t phase_inc,
- unsigned int num_points)
-{
- lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
- (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- const lv_32fc_t phase_inc_n =
- phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
- volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(
- outVector, inVector, phase_inc_n, phase, num_points);
-}
-
-#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
-
-#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */
diff --git a/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h b/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h
new file mode 100644
index 000000000..bee1f0682
--- /dev/null
+++ b/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h
@@ -0,0 +1,782 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_s32fc_x2_rotator2_32fc
+ *
+ * \b Overview
+ *
+ * Rotate input vector at fixed rate per sample from initial phase
+ * offset.
+ *
+ * Dispatcher Prototype
+ * \code
+ * void volk_32fc_s32fc_x2_rotator2_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector,
+ * const lv_32fc_t* phase_inc, lv_32fc_t* phase, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inVector: Vector to be rotated.
+ * \li phase_inc: rotational velocity (scalar, input).
+ * \li phase: phase offset (scalar, input & output).
+ * \li num_points: The number of values in inVector to be rotated and stored into
+ * outVector.
+ *
+ * \b Outputs
+ * \li outVector: The vector where the results will be stored.
+ *
+ * \b Example
+ * Generate a tone at f=0.3 (normalized frequency) and use the rotator with
+ * f=0.1 to shift the tone to f=0.4. Change this example to start with a DC
+ * tone (initialize in with lv_cmake(1, 0)) to observe rotator signal generation.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * // Generate a tone at f=0.3
+ * float real = std::cos(0.3f * (float)ii);
+ * float imag = std::sin(0.3f * (float)ii);
+ * in[ii] = lv_cmake(real, imag);
+ * }
+ * // The oscillator rotates at f=0.1
+ * float frequency = 0.1f;
+ * lv_32fc_t phase_increment = lv_cmake(std::cos(frequency), std::sin(frequency));
+ * lv_32fc_t phase= lv_cmake(1.f, 0.0f); // start at 1 (0 rad phase)
+ *
+ * // rotate so the output is a tone at f=0.4
+ * volk_32fc_s32fc_x2_rotator2_32fc(out, in, &phase_increment, &phase, N);
+ *
+ * // print results for inspection
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %+1.2f %+1.2fj\n",
+ * ii, lv_creal(out[ii]), lv_cimag(out[ii]));
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H
+
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <volk/volk_complex.h>
+#define ROTATOR_RELOAD 512
+#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
+#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)
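+// The kernels below re-normalize the running phase every ROTATOR_RELOAD samples
+// so that floating point rounding error does not grow its magnitude.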
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_s32fc_x2_rotator2_32fc_generic(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
+ unsigned int i = 0;
+ int j = 0;
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
+ for (j = 0; j < ROTATOR_RELOAD; ++j) {
+ *outVector++ = *inVector++ * (*phase);
+ (*phase) *= *phase_inc;
+ }
+
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ }
+ for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
+ *outVector++ = *inVector++ * (*phase);
+ (*phase) *= *phase_inc;
+ }
+ if (i) {
+ // Make sure we normalize the phase on every call!
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+#include <volk/volk_neon_intrinsics.h>
+
+static inline void volk_32fc_s32fc_x2_rotator2_32fc_neon(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+
+{
+ lv_32fc_t* outputVectorPtr = outVector;
+ const lv_32fc_t* inputVectorPtr = inVector;
+ lv_32fc_t incr = 1;
+ lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
+ float32x4x2_t input_vec;
+ float32x4x2_t output_vec;
+
+ unsigned int i = 0, j = 0;
+ // const unsigned int quarter_points = num_points / 4;
+
+ for (i = 0; i < 4; ++i) {
+ phasePtr[i] *= incr;
+ incr *= (*phase_inc);
+ }
+
+ // Notice that incr has been incremented in the previous loop
+ const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
+ const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
+ float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);
+
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD_4; j++) {
+ input_vec = vld2q_f32((float*)inputVectorPtr);
+ // Prefetch next one, speeds things up
+ __VOLK_PREFETCH(inputVectorPtr + 4);
+ // Rotate
+ output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
+ // Increase phase
+ phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
+ // Store output
+ vst2q_f32((float*)outputVectorPtr, output_vec);
+
+ outputVectorPtr += 4;
+ inputVectorPtr += 4;
+ }
+ // normalize phase so magnitude doesn't grow because of
+ // floating point rounding error
+ const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
+ const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
+ // Multiply complex with real
+ phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
+ phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
+ }
+
+ for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) {
+ input_vec = vld2q_f32((float*)inputVectorPtr);
+ // Prefetch next one, speeds things up
+ __VOLK_PREFETCH(inputVectorPtr + 4);
+ // Rotate
+ output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
+ // Increase phase
+ phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
+ // Store output
+ vst2q_f32((float*)outputVectorPtr, output_vec);
+
+ outputVectorPtr += 4;
+ inputVectorPtr += 4;
+ }
+ // if(i) == true means we looped above
+ if (i) {
+ // normalize phase so magnitude doesn't grow because of
+ // floating point rounding error
+ const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
+ const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
+ // Multiply complex with real
+ phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
+ phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
+ }
+ // Store current phase
+ vst2q_f32((float*)phasePtr, phase_vec);
+
+ // Deal with the rest
+ for (i = 0; i < num_points % 4; i++) {
+ *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
+ phasePtr[0] *= (*phase_inc);
+ }
+
+ // For continuous phase next time we need to call this function
+ (*phase) = phasePtr[0];
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_sse4_1(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+ lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+ for (i = 0; i < 2; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (*phase_inc);
+ }
+
+ __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm_loadu_ps((float*)phase_Ptr);
+ inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
+
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD_2; ++j) {
+
+ aVal = _mm_load_ps((float*)aPtr);
+
+ yl = _mm_moveldup_ps(phase_Val);
+ yh = _mm_movehdup_ps(phase_Val);
+ ylp = _mm_moveldup_ps(inc_Val);
+ yhp = _mm_movehdup_ps(inc_Val);
+
+ tmp1 = _mm_mul_ps(aVal, yl);
+ tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+ aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm_mul_ps(aVal, yh);
+ tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+ z = _mm_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+ _mm_store_ps((float*)cPtr, z);
+
+ aPtr += 2;
+ cPtr += 2;
+ }
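+ // Re-normalize phase_Val: compute |phase|^2 for each complex lane, spread it
+ // across that lane's re/im slots, then divide by its square root.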
+ tmp1 = _mm_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm_sqrt_ps(tmp1);
+ phase_Val = _mm_div_ps(phase_Val, tmp2);
+ }
+ for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
+ aVal = _mm_load_ps((float*)aPtr);
+
+ yl = _mm_moveldup_ps(phase_Val);
+ yh = _mm_movehdup_ps(phase_Val);
+ ylp = _mm_moveldup_ps(inc_Val);
+ yhp = _mm_movehdup_ps(inc_Val);
+
+ tmp1 = _mm_mul_ps(aVal, yl);
+
+ tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+ aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm_mul_ps(aVal, yh);
+ tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+ z = _mm_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+ _mm_store_ps((float*)cPtr, z);
+
+ aPtr += 2;
+ cPtr += 2;
+ }
+ if (i) {
+ tmp1 = _mm_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm_sqrt_ps(tmp1);
+ phase_Val = _mm_div_ps(phase_Val, tmp2);
+ }
+
+ _mm_storeu_ps((float*)phase_Ptr, phase_Val);
+ if (num_points & 1) {
+ *cPtr++ = *aPtr++ * phase_Ptr[0];
+ phase_Ptr[0] *= (*phase_inc);
+ }
+
+ (*phase) = phase_Ptr[0];
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_sse4_1(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+ lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+ for (i = 0; i < 2; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (*phase_inc);
+ }
+
+ /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
+ printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
+ printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
+ __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm_loadu_ps((float*)phase_Ptr);
+ inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
+
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD_2; ++j) {
+
+ aVal = _mm_loadu_ps((float*)aPtr);
+
+ yl = _mm_moveldup_ps(phase_Val);
+ yh = _mm_movehdup_ps(phase_Val);
+ ylp = _mm_moveldup_ps(inc_Val);
+ yhp = _mm_movehdup_ps(inc_Val);
+
+ tmp1 = _mm_mul_ps(aVal, yl);
+ tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+ aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm_mul_ps(aVal, yh);
+ tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+ z = _mm_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+ _mm_storeu_ps((float*)cPtr, z);
+
+ aPtr += 2;
+ cPtr += 2;
+ }
+ tmp1 = _mm_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm_sqrt_ps(tmp1);
+ phase_Val = _mm_div_ps(phase_Val, tmp2);
+ }
+ for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
+ aVal = _mm_loadu_ps((float*)aPtr);
+
+ yl = _mm_moveldup_ps(phase_Val);
+ yh = _mm_movehdup_ps(phase_Val);
+ ylp = _mm_moveldup_ps(inc_Val);
+ yhp = _mm_movehdup_ps(inc_Val);
+
+ tmp1 = _mm_mul_ps(aVal, yl);
+
+ tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+ aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm_mul_ps(aVal, yh);
+ tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+ z = _mm_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+ _mm_storeu_ps((float*)cPtr, z);
+
+ aPtr += 2;
+ cPtr += 2;
+ }
+ if (i) {
+ tmp1 = _mm_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm_sqrt_ps(tmp1);
+ phase_Val = _mm_div_ps(phase_Val, tmp2);
+ }
+
+ _mm_storeu_ps((float*)phase_Ptr, phase_Val);
+ if (num_points & 1) {
+ *cPtr++ = *aPtr++ * phase_Ptr[0];
+ phase_Ptr[0] *= (*phase_inc);
+ }
+
+ (*phase) = phase_Ptr[0];
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
+ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+ for (i = 0; i < 4; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (*phase_inc);
+ }
+
+ __m256 aVal, phase_Val, z;
+
+ phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
+
+ const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr));
+
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
+
+ aVal = _mm256_load_ps((float*)aPtr);
+
+ z = _mm256_complexmul_ps(aVal, phase_Val);
+ phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
+
+ _mm256_store_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ phase_Val = _mm256_normalize_ps(phase_Val);
+ }
+
+ for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
+ aVal = _mm256_load_ps((float*)aPtr);
+
+ z = _mm256_complexmul_ps(aVal, phase_Val);
+ phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
+
+ _mm256_store_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ if (i) {
+ phase_Val = _mm256_normalize_ps(phase_Val);
+ }
+
+ _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
+ (*phase) = phase_Ptr[0];
+ volk_32fc_s32fc_x2_rotator2_32fc_generic(
+ cPtr, aPtr, phase_inc, phase, num_points % 4);
+}
+
+#endif /* LV_HAVE_AVX for aligned */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
+ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+ for (i = 0; i < 4; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (*phase_inc);
+ }
+
+ __m256 aVal, phase_Val, z;
+
+ phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
+
+ const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr));
+
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
+ for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
+
+ aVal = _mm256_loadu_ps((float*)aPtr);
+
+ z = _mm256_complexmul_ps(aVal, phase_Val);
+ phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
+
+ _mm256_storeu_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ phase_Val = _mm256_normalize_ps(phase_Val);
+ }
+
+ for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
+ aVal = _mm256_loadu_ps((float*)aPtr);
+
+ z = _mm256_complexmul_ps(aVal, phase_Val);
+ phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
+
+ _mm256_storeu_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ if (i) {
+ phase_Val = _mm256_normalize_ps(phase_Val);
+ }
+
+ _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
+ (*phase) = phase_Ptr[0];
+ volk_32fc_s32fc_x2_rotator2_32fc_generic(
+ cPtr, aPtr, phase_inc, phase, num_points % 4);
+}
+
+#endif /* LV_HAVE_AVX */
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx_fma(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+ __VOLK_ATTR_ALIGNED(32)
+ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+ for (i = 0; i < 4; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (*phase_inc);
+ }
+
+ __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm256_load_ps((float*)phase_Ptr);
+ inc_Val = _mm256_set_ps(lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr));
+
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
+
+ aVal = _mm256_load_ps((float*)aPtr);
+
+ yl = _mm256_moveldup_ps(phase_Val);
+ yh = _mm256_movehdup_ps(phase_Val);
+ ylp = _mm256_moveldup_ps(inc_Val);
+ yhp = _mm256_movehdup_ps(inc_Val);
+
+ tmp1 = aVal;
+ tmp1p = phase_Val;
+
+ aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm256_mul_ps(aVal, yh);
+ tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+ z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
+ phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
+
+ _mm256_store_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm256_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm256_sqrt_ps(tmp1);
+ phase_Val = _mm256_div_ps(phase_Val, tmp2);
+ }
+ for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
+ aVal = _mm256_load_ps((float*)aPtr);
+
+ yl = _mm256_moveldup_ps(phase_Val);
+ yh = _mm256_movehdup_ps(phase_Val);
+ ylp = _mm256_moveldup_ps(inc_Val);
+ yhp = _mm256_movehdup_ps(inc_Val);
+
+ tmp1 = aVal;
+ tmp1p = phase_Val;
+
+ aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm256_mul_ps(aVal, yh);
+ tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+ z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
+ phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
+
+ _mm256_store_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ if (i) {
+ tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm256_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm256_sqrt_ps(tmp1);
+ phase_Val = _mm256_div_ps(phase_Val, tmp2);
+ }
+
+ _mm256_store_ps((float*)phase_Ptr, phase_Val);
+ for (i = 0; i < num_points % 4; ++i) {
+ *cPtr++ = *aPtr++ * phase_Ptr[0];
+ phase_Ptr[0] *= (*phase_inc);
+ }
+
+ (*phase) = phase_Ptr[0];
+}
+
+#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned*/
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t* phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+ for (i = 0; i < 4; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (*phase_inc);
+ }
+
+ __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
+ inc_Val = _mm256_set_ps(lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr));
+
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
+
+ aVal = _mm256_loadu_ps((float*)aPtr);
+
+ yl = _mm256_moveldup_ps(phase_Val);
+ yh = _mm256_movehdup_ps(phase_Val);
+ ylp = _mm256_moveldup_ps(inc_Val);
+ yhp = _mm256_movehdup_ps(inc_Val);
+
+ tmp1 = aVal;
+ tmp1p = phase_Val;
+
+ aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm256_mul_ps(aVal, yh);
+ tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+ z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
+ phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
+
+ _mm256_storeu_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm256_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm256_sqrt_ps(tmp1);
+ phase_Val = _mm256_div_ps(phase_Val, tmp2);
+ }
+ for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
+ aVal = _mm256_loadu_ps((float*)aPtr);
+
+ yl = _mm256_moveldup_ps(phase_Val);
+ yh = _mm256_movehdup_ps(phase_Val);
+ ylp = _mm256_moveldup_ps(inc_Val);
+ yhp = _mm256_movehdup_ps(inc_Val);
+
+ tmp1 = aVal;
+ tmp1p = phase_Val;
+
+ aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm256_mul_ps(aVal, yh);
+ tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+ z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
+ phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
+
+ _mm256_storeu_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ if (i) {
+ tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm256_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm256_sqrt_ps(tmp1);
+ phase_Val = _mm256_div_ps(phase_Val, tmp2);
+ }
+
+ _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
+ for (i = 0; i < num_points % 4; ++i) {
+ *cPtr++ = *aPtr++ * phase_Ptr[0];
+ phase_Ptr[0] *= (*phase_inc);
+ }
+
+ (*phase) = phase_Ptr[0];
+}
+
+#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
+
+#endif /* INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H */
diff --git a/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h b/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
index aebf46d53..c0def35dd 100644
--- a/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
+++ b/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h
@@ -10,6 +10,12 @@
/*!
* \page volk_32fc_s32fc_x2_rotator_32fc
*
+ * \b Deprecation
+ *
+ * This kernel is deprecated because passing an `lv_32fc_t` by value results in
+ * undefined behaviour, which causes segmentation faults on some architectures.
+ * Use `volk_32fc_s32fc_x2_rotator2_32fc`, which takes the phase increment by
+ * pointer, instead.
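+ *
+ * A minimal migration sketch (out, in, and num_points are illustrative names,
+ * not part of the API):
+ * \code
+ * lv_32fc_t phase_inc = lv_cmake(std::cos(0.1f), std::sin(0.1f));
+ * lv_32fc_t phase = lv_cmake(1.f, 0.f);
+ * // deprecated: phase_inc passed by value
+ * // volk_32fc_s32fc_x2_rotator_32fc(out, in, phase_inc, &phase, num_points);
+ * // preferred: phase_inc passed by pointer
+ * volk_32fc_s32fc_x2_rotator2_32fc(out, in, &phase_inc, &phase, num_points);
+ * \endcode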
+ *
* \b Overview
*
* Rotate input vector at fixed rate per sample from initial phase
@@ -72,10 +78,8 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
+#include <volk/volk_32fc_s32fc_x2_rotator2_32fc.h>
#include <volk/volk_complex.h>
-#define ROTATOR_RELOAD 512
-#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
-#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)
#ifdef LV_HAVE_GENERIC
@@ -86,32 +90,14 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
lv_32fc_t* phase,
unsigned int num_points)
{
- unsigned int i = 0;
- int j = 0;
- for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
- for (j = 0; j < ROTATOR_RELOAD; ++j) {
- *outVector++ = *inVector++ * (*phase);
- (*phase) *= phase_inc;
- }
-
- (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- }
- for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
- *outVector++ = *inVector++ * (*phase);
- (*phase) *= phase_inc;
- }
- if (i) {
- // Make sure, we normalize phase on every call!
- (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
- }
+ volk_32fc_s32fc_x2_rotator2_32fc_generic(
+ outVector, inVector, &phase_inc, phase, num_points);
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEON
-#include <arm_neon.h>
-#include <volk/volk_neon_intrinsics.h>
static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector,
const lv_32fc_t* inVector,
@@ -120,92 +106,14 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector,
unsigned int num_points)
{
- lv_32fc_t* outputVectorPtr = outVector;
- const lv_32fc_t* inputVectorPtr = inVector;
- lv_32fc_t incr = 1;
- lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
- float32x4x2_t input_vec;
- float32x4x2_t output_vec;
-
- unsigned int i = 0, j = 0;
- // const unsigned int quarter_points = num_points / 4;
-
- for (i = 0; i < 4; ++i) {
- phasePtr[i] *= incr;
- incr *= (phase_inc);
- }
-
- // Notice that incr has be incremented in the previous loop
- const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
- const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
- float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);
-
- for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
- for (j = 0; j < ROTATOR_RELOAD_4; j++) {
- input_vec = vld2q_f32((float*)inputVectorPtr);
- // Prefetch next one, speeds things up
- __VOLK_PREFETCH(inputVectorPtr + 4);
- // Rotate
- output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
- // Increase phase
- phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
- // Store output
- vst2q_f32((float*)outputVectorPtr, output_vec);
-
- outputVectorPtr += 4;
- inputVectorPtr += 4;
- }
- // normalize phase so magnitude doesn't grow because of
- // floating point rounding error
- const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
- const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
- // Multiply complex with real
- phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
- phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
- }
-
- for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) {
- input_vec = vld2q_f32((float*)inputVectorPtr);
- // Prefetch next one, speeds things up
- __VOLK_PREFETCH(inputVectorPtr + 4);
- // Rotate
- output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
- // Increase phase
- phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
- // Store output
- vst2q_f32((float*)outputVectorPtr, output_vec);
-
- outputVectorPtr += 4;
- inputVectorPtr += 4;
- }
- // if(i) == true means we looped above
- if (i) {
- // normalize phase so magnitude doesn't grow because of
- // floating point rounding error
- const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
- const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
- // Multiply complex with real
- phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
- phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
- }
- // Store current phase
- vst2q_f32((float*)phasePtr, phase_vec);
-
- // Deal with the rest
- for (i = 0; i < num_points % 4; i++) {
- *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
- phasePtr[0] *= (phase_inc);
- }
-
- // For continuous phase next time we need to call this function
- (*phase) = phasePtr[0];
+ volk_32fc_s32fc_x2_rotator2_32fc_neon(
+ outVector, inVector, &phase_inc, phase, num_points);
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_SSE4_1
-#include <smmintrin.h>
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
const lv_32fc_t* inVector,
@@ -213,102 +121,14 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector
lv_32fc_t* phase,
unsigned int num_points)
{
- lv_32fc_t* cPtr = outVector;
- const lv_32fc_t* aPtr = inVector;
- lv_32fc_t incr = 1;
- lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
-
- unsigned int i, j = 0;
-
- for (i = 0; i < 2; ++i) {
- phase_Ptr[i] *= incr;
- incr *= (phase_inc);
- }
-
- __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
-
- phase_Val = _mm_loadu_ps((float*)phase_Ptr);
- inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
-
- for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
- for (j = 0; j < ROTATOR_RELOAD_2; ++j) {
-
- aVal = _mm_load_ps((float*)aPtr);
-
- yl = _mm_moveldup_ps(phase_Val);
- yh = _mm_movehdup_ps(phase_Val);
- ylp = _mm_moveldup_ps(inc_Val);
- yhp = _mm_movehdup_ps(inc_Val);
-
- tmp1 = _mm_mul_ps(aVal, yl);
- tmp1p = _mm_mul_ps(phase_Val, ylp);
-
- aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
- phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
- tmp2 = _mm_mul_ps(aVal, yh);
- tmp2p = _mm_mul_ps(phase_Val, yhp);
-
- z = _mm_addsub_ps(tmp1, tmp2);
- phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
-
- _mm_store_ps((float*)cPtr, z);
-
- aPtr += 2;
- cPtr += 2;
- }
- tmp1 = _mm_mul_ps(phase_Val, phase_Val);
- tmp2 = _mm_hadd_ps(tmp1, tmp1);
- tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
- tmp2 = _mm_sqrt_ps(tmp1);
- phase_Val = _mm_div_ps(phase_Val, tmp2);
- }
- for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
- aVal = _mm_load_ps((float*)aPtr);
-
- yl = _mm_moveldup_ps(phase_Val);
- yh = _mm_movehdup_ps(phase_Val);
- ylp = _mm_moveldup_ps(inc_Val);
- yhp = _mm_movehdup_ps(inc_Val);
-
- tmp1 = _mm_mul_ps(aVal, yl);
-
- tmp1p = _mm_mul_ps(phase_Val, ylp);
-
- aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
- phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
- tmp2 = _mm_mul_ps(aVal, yh);
- tmp2p = _mm_mul_ps(phase_Val, yhp);
-
- z = _mm_addsub_ps(tmp1, tmp2);
- phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
-
- _mm_store_ps((float*)cPtr, z);
-
- aPtr += 2;
- cPtr += 2;
- }
- if (i) {
- tmp1 = _mm_mul_ps(phase_Val, phase_Val);
- tmp2 = _mm_hadd_ps(tmp1, tmp1);
- tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
- tmp2 = _mm_sqrt_ps(tmp1);
- phase_Val = _mm_div_ps(phase_Val, tmp2);
- }
-
- _mm_storeu_ps((float*)phase_Ptr, phase_Val);
- if (num_points & 1) {
- *cPtr++ = *aPtr++ * phase_Ptr[0];
- phase_Ptr[0] *= (phase_inc);
- }
-
- (*phase) = phase_Ptr[0];
+ volk_32fc_s32fc_x2_rotator2_32fc_a_sse4_1(
+ outVector, inVector, &phase_inc, phase, num_points);
}
#endif /* LV_HAVE_SSE4_1 for aligned */
#ifdef LV_HAVE_SSE4_1
-#include <smmintrin.h>
static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,
const lv_32fc_t* inVector,
@@ -316,106 +136,14 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector
lv_32fc_t* phase,
unsigned int num_points)
{
- lv_32fc_t* cPtr = outVector;
- const lv_32fc_t* aPtr = inVector;
- lv_32fc_t incr = 1;
- lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
-
- unsigned int i, j = 0;
-
- for (i = 0; i < 2; ++i) {
- phase_Ptr[i] *= incr;
- incr *= (phase_inc);
- }
-
- /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
- printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
- printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
- __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
-
- phase_Val = _mm_loadu_ps((float*)phase_Ptr);
- inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
-
- for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
- for (j = 0; j < ROTATOR_RELOAD_2; ++j) {
-
- aVal = _mm_loadu_ps((float*)aPtr);
-
- yl = _mm_moveldup_ps(phase_Val);
- yh = _mm_movehdup_ps(phase_Val);
- ylp = _mm_moveldup_ps(inc_Val);
- yhp = _mm_movehdup_ps(inc_Val);
-
- tmp1 = _mm_mul_ps(aVal, yl);
- tmp1p = _mm_mul_ps(phase_Val, ylp);
-
- aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
- phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
- tmp2 = _mm_mul_ps(aVal, yh);
- tmp2p = _mm_mul_ps(phase_Val, yhp);
-
- z = _mm_addsub_ps(tmp1, tmp2);
- phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
-
- _mm_storeu_ps((float*)cPtr, z);
-
- aPtr += 2;
- cPtr += 2;
- }
- tmp1 = _mm_mul_ps(phase_Val, phase_Val);
- tmp2 = _mm_hadd_ps(tmp1, tmp1);
- tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
- tmp2 = _mm_sqrt_ps(tmp1);
- phase_Val = _mm_div_ps(phase_Val, tmp2);
- }
- for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
- aVal = _mm_loadu_ps((float*)aPtr);
-
- yl = _mm_moveldup_ps(phase_Val);
- yh = _mm_movehdup_ps(phase_Val);
- ylp = _mm_moveldup_ps(inc_Val);
- yhp = _mm_movehdup_ps(inc_Val);
-
- tmp1 = _mm_mul_ps(aVal, yl);
-
- tmp1p = _mm_mul_ps(phase_Val, ylp);
-
- aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
- phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
- tmp2 = _mm_mul_ps(aVal, yh);
- tmp2p = _mm_mul_ps(phase_Val, yhp);
-
- z = _mm_addsub_ps(tmp1, tmp2);
- phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
-
- _mm_storeu_ps((float*)cPtr, z);
-
- aPtr += 2;
- cPtr += 2;
- }
- if (i) {
- tmp1 = _mm_mul_ps(phase_Val, phase_Val);
- tmp2 = _mm_hadd_ps(tmp1, tmp1);
- tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
- tmp2 = _mm_sqrt_ps(tmp1);
- phase_Val = _mm_div_ps(phase_Val, tmp2);
- }
-
- _mm_storeu_ps((float*)phase_Ptr, phase_Val);
- if (num_points & 1) {
- *cPtr++ = *aPtr++ * phase_Ptr[0];
- phase_Ptr[0] *= (phase_inc);
- }
-
- (*phase) = phase_Ptr[0];
+ volk_32fc_s32fc_x2_rotator2_32fc_u_sse4_1(
+ outVector, inVector, &phase_inc, phase, num_points);
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_AVX
-#include <immintrin.h>
-#include <volk/volk_avx_intrinsics.h>
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
const lv_32fc_t* inVector,
@@ -423,73 +151,14 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
lv_32fc_t* phase,
unsigned int num_points)
{
- lv_32fc_t* cPtr = outVector;
- const lv_32fc_t* aPtr = inVector;
- lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
- lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
-
- unsigned int i, j = 0;
-
- for (i = 0; i < 4; ++i) {
- phase_Ptr[i] *= incr;
- incr *= (phase_inc);
- }
-
- __m256 aVal, phase_Val, z;
-
- phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
-
- const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
- lv_creal(incr),
- lv_cimag(incr),
- lv_creal(incr),
- lv_cimag(incr),
- lv_creal(incr),
- lv_cimag(incr),
- lv_creal(incr));
-
- for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
- for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
-
- aVal = _mm256_load_ps((float*)aPtr);
-
- z = _mm256_complexmul_ps(aVal, phase_Val);
- phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
-
- _mm256_store_ps((float*)cPtr, z);
-
- aPtr += 4;
- cPtr += 4;
- }
- phase_Val = _mm256_normalize_ps(phase_Val);
- }
-
- for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
- aVal = _mm256_load_ps((float*)aPtr);
-
- z = _mm256_complexmul_ps(aVal, phase_Val);
- phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
-
- _mm256_store_ps((float*)cPtr, z);
-
- aPtr += 4;
- cPtr += 4;
- }
- if (i) {
- phase_Val = _mm256_normalize_ps(phase_Val);
- }
-
- _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
- (*phase) = phase_Ptr[0];
- volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
+ volk_32fc_s32fc_x2_rotator2_32fc_a_avx(
+ outVector, inVector, &phase_inc, phase, num_points);
}
#endif /* LV_HAVE_AVX for aligned */
#ifdef LV_HAVE_AVX
-#include <immintrin.h>
-#include <volk/volk_avx_intrinsics.h>
static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
const lv_32fc_t* inVector,
@@ -497,71 +166,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
lv_32fc_t* phase,
unsigned int num_points)
{
- lv_32fc_t* cPtr = outVector;
- const lv_32fc_t* aPtr = inVector;
- lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
- lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
-
- unsigned int i, j = 0;
-
- for (i = 0; i < 4; ++i) {
- phase_Ptr[i] *= incr;
- incr *= (phase_inc);
- }
-
- __m256 aVal, phase_Val, z;
-
- phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
-
- const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
- lv_creal(incr),
- lv_cimag(incr),
- lv_creal(incr),
- lv_cimag(incr),
- lv_creal(incr),
- lv_cimag(incr),
- lv_creal(incr));
-
- for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
- for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
-
- aVal = _mm256_loadu_ps((float*)aPtr);
-
- z = _mm256_complexmul_ps(aVal, phase_Val);
- phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
-
- _mm256_storeu_ps((float*)cPtr, z);
-
- aPtr += 4;
- cPtr += 4;
- }
- phase_Val = _mm256_normalize_ps(phase_Val);
- }
-
- for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
- aVal = _mm256_loadu_ps((float*)aPtr);
-
- z = _mm256_complexmul_ps(aVal, phase_Val);
- phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
-
- _mm256_storeu_ps((float*)cPtr, z);
-
- aPtr += 4;
- cPtr += 4;
- }
- if (i) {
- phase_Val = _mm256_normalize_ps(phase_Val);
- }
-
- _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
- (*phase) = phase_Ptr[0];
- volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
+ volk_32fc_s32fc_x2_rotator2_32fc_u_avx(
+ outVector, inVector, &phase_inc, phase, num_points);
}
#endif /* LV_HAVE_AVX */
#if LV_HAVE_AVX && LV_HAVE_FMA
-#include <immintrin.h>
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector,
const lv_32fc_t* inVector,
@@ -569,108 +180,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto
lv_32fc_t* phase,
unsigned int num_points)
{
- lv_32fc_t* cPtr = outVector;
- const lv_32fc_t* aPtr = inVector;
- lv_32fc_t incr = 1;
- __VOLK_ATTR_ALIGNED(32)
- lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
-
- unsigned int i, j = 0;
-
- for (i = 0; i < 4; ++i) {
- phase_Ptr[i] *= incr;
- incr *= (phase_inc);
- }
-
- __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
-
- phase_Val = _mm256_load_ps((float*)phase_Ptr);
- inc_Val = _mm256_set_ps(lv_cimag(incr),
- lv_creal(incr),
- lv_cimag(incr),
- lv_creal(incr),
- lv_cimag(incr),
- lv_creal(incr),
- lv_cimag(incr),
- lv_creal(incr));
-
- for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
- for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
-
- aVal = _mm256_load_ps((float*)aPtr);
-
- yl = _mm256_moveldup_ps(phase_Val);
- yh = _mm256_movehdup_ps(phase_Val);
- ylp = _mm256_moveldup_ps(inc_Val);
- yhp = _mm256_movehdup_ps(inc_Val);
-
- tmp1 = aVal;
- tmp1p = phase_Val;
-
- aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
- phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
- tmp2 = _mm256_mul_ps(aVal, yh);
- tmp2p = _mm256_mul_ps(phase_Val, yhp);
-
- z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
- phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
-
- _mm256_store_ps((float*)cPtr, z);
-
- aPtr += 4;
- cPtr += 4;
- }
- tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
- tmp2 = _mm256_hadd_ps(tmp1, tmp1);
- tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
- tmp2 = _mm256_sqrt_ps(tmp1);
- phase_Val = _mm256_div_ps(phase_Val, tmp2);
- }
- for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
- aVal = _mm256_load_ps((float*)aPtr);
-
- yl = _mm256_moveldup_ps(phase_Val);
- yh = _mm256_movehdup_ps(phase_Val);
- ylp = _mm256_moveldup_ps(inc_Val);
- yhp = _mm256_movehdup_ps(inc_Val);
-
- tmp1 = aVal;
- tmp1p = phase_Val;
-
- aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
- phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
- tmp2 = _mm256_mul_ps(aVal, yh);
- tmp2p = _mm256_mul_ps(phase_Val, yhp);
-
- z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
- phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
-
- _mm256_store_ps((float*)cPtr, z);
-
- aPtr += 4;
- cPtr += 4;
- }
- if (i) {
- tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
- tmp2 = _mm256_hadd_ps(tmp1, tmp1);
- tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
- tmp2 = _mm256_sqrt_ps(tmp1);
- phase_Val = _mm256_div_ps(phase_Val, tmp2);
- }
-
- _mm256_store_ps((float*)phase_Ptr, phase_Val);
- for (i = 0; i < num_points % 4; ++i) {
- *cPtr++ = *aPtr++ * phase_Ptr[0];
- phase_Ptr[0] *= (phase_inc);
- }
-
- (*phase) = phase_Ptr[0];
+ volk_32fc_s32fc_x2_rotator2_32fc_a_avx_fma(
+ outVector, inVector, &phase_inc, phase, num_points);
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned*/
#if LV_HAVE_AVX && LV_HAVE_FMA
-#include <immintrin.h>
static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector,
const lv_32fc_t* inVector,
@@ -678,101 +194,8 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto
lv_32fc_t* phase,
unsigned int num_points)
{
- lv_32fc_t* cPtr = outVector;
- const lv_32fc_t* aPtr = inVector;
- lv_32fc_t incr = 1;
- lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
-
- unsigned int i, j = 0;
-
- for (i = 0; i < 4; ++i) {
- phase_Ptr[i] *= incr;
- incr *= (phase_inc);
- }
-
- __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
-
- phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
- inc_Val = _mm256_set_ps(lv_cimag(incr),
- lv_creal(incr),
- lv_cimag(incr),
- lv_creal(incr),
- lv_cimag(incr),
- lv_creal(incr),
- lv_cimag(incr),
- lv_creal(incr));
-
- for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
- for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
-
- aVal = _mm256_loadu_ps((float*)aPtr);
-
- yl = _mm256_moveldup_ps(phase_Val);
- yh = _mm256_movehdup_ps(phase_Val);
- ylp = _mm256_moveldup_ps(inc_Val);
- yhp = _mm256_movehdup_ps(inc_Val);
-
- tmp1 = aVal;
- tmp1p = phase_Val;
-
- aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
- phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
- tmp2 = _mm256_mul_ps(aVal, yh);
- tmp2p = _mm256_mul_ps(phase_Val, yhp);
-
- z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
- phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
-
- _mm256_storeu_ps((float*)cPtr, z);
-
- aPtr += 4;
- cPtr += 4;
- }
- tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
- tmp2 = _mm256_hadd_ps(tmp1, tmp1);
- tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
- tmp2 = _mm256_sqrt_ps(tmp1);
- phase_Val = _mm256_div_ps(phase_Val, tmp2);
- }
- for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
- aVal = _mm256_loadu_ps((float*)aPtr);
-
- yl = _mm256_moveldup_ps(phase_Val);
- yh = _mm256_movehdup_ps(phase_Val);
- ylp = _mm256_moveldup_ps(inc_Val);
- yhp = _mm256_movehdup_ps(inc_Val);
-
- tmp1 = aVal;
- tmp1p = phase_Val;
-
- aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
- phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
- tmp2 = _mm256_mul_ps(aVal, yh);
- tmp2p = _mm256_mul_ps(phase_Val, yhp);
-
- z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
- phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
-
- _mm256_storeu_ps((float*)cPtr, z);
-
- aPtr += 4;
- cPtr += 4;
- }
- if (i) {
- tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
- tmp2 = _mm256_hadd_ps(tmp1, tmp1);
- tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
- tmp2 = _mm256_sqrt_ps(tmp1);
- phase_Val = _mm256_div_ps(phase_Val, tmp2);
- }
-
- _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
- for (i = 0; i < num_points % 4; ++i) {
- *cPtr++ = *aPtr++ * phase_Ptr[0];
- phase_Ptr[0] *= (phase_inc);
- }
-
- (*phase) = phase_Ptr[0];
+ volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma(
+ outVector, inVector, &phase_inc, phase, num_points);
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
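For callers, the visible change is only that the phase increment is now passed by pointer. A minimal migration sketch, assuming the dispatcher prototypes documented above and the usual volk_malloc buffer setup (not part of the patch):

#include <math.h>
#include <volk/volk.h>

static void rotate_block(lv_32fc_t* out, const lv_32fc_t* in, unsigned int num_points)
{
    lv_32fc_t phase_inc = lv_cmake(cosf(0.1f), sinf(0.1f));
    lv_32fc_t phase = lv_cmake(1.f, 0.f);

    /* deprecated: phase increment passed by value
     * volk_32fc_s32fc_x2_rotator_32fc(out, in, phase_inc, &phase, num_points); */

    /* replacement: phase increment passed by pointer */
    volk_32fc_s32fc_x2_rotator2_32fc(out, in, &phase_inc, &phase, num_points);
}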
diff --git a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h
new file mode 100644
index 000000000..b35bed5eb
--- /dev/null
+++ b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h
@@ -0,0 +1,345 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2019 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc
+ *
+ * \b Overview
+ *
+ * Conjugates one input complex vector, multiplies it by a complex scalar,
+ * adds the other input complex vector, and returns the results.
+ *
+ * c[i] = a[i] + conj(b[i]) * (*scalar)
+ *
+ * Dispatcher Prototype
+ * \code
+ * void volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc(lv_32fc_t* cVector, const
+ * lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t* scalar, unsigned int
+ * num_points); \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector to be added.
+ * \li bVector: The input vector to be conjugated and multiplied.
+ * \li scalar: The complex scalar to multiply against the conjugated bVector.
+ * \li num_points: The number of complex values in aVector and bVector to be conjugated,
+ * multiplied, added and stored into cVector.
+ *
+ * \b Outputs
+ * \li cVector: The vector where the results will be stored.
+ *
+ * \b Example
+ * Update the coefficients of an adaptive filter.
+ *
+ * \code
+ * int n_filter = 2 * N + 1;
+ * unsigned int alignment = volk_get_alignment();
+ *
+ * // state is a queue of input IQ data.
+ * lv_32fc_t* state = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * n_filter, alignment);
+ * // weight and next one.
+ * lv_32fc_t* weight = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * n_filter, alignment);
+ * lv_32fc_t* next = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * n_filter, alignment);
+ * ...
+ * // push back input IQ data into state.
+ * foo_push_back_queue(state, input);
+ *
+ * // get filtered output.
+ * lv_32fc_t output = lv_cmake(0.f,0.f);
+ * for (int i = 0; i < n_filter; i++) {
+ * output += state[i] * weight[i];
+ * }
+ *
+ * // update weight using output.
+ * float real = lv_creal(output) * (1.0 - std::norm(output)) * MU;
+ * lv_32fc_t factor = lv_cmake(real, 0.f);
+ * volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc(
+ * next, weight, state, &factor, n_filter);
+ * lv_32fc_t *tmp = next;
+ * next = weight;
+ * weight = tmp;
+ * weight[N + 1] = lv_cmake(lv_creal(weight[N + 1]), 0.f);
+ * ...
+ * volk_free(state);
+ * volk_free(weight);
+ * volk_free(next);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H
+#define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t* scalar,
+ unsigned int num_points)
+{
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ lv_32fc_t* cPtr = cVector;
+ unsigned int number = num_points;
+
+ // unwrap loop
+ while (number >= 8) {
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
+ number -= 8;
+ }
+
+ // clean up any remaining
+ while (number-- > 0) {
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t* scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+
+ __m256 x, y, s, z;
+ lv_32fc_t v_scalar[4] = { *scalar, *scalar, *scalar, *scalar };
+
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+ lv_32fc_t* c = cVector;
+
+ // Set up constant scalar vector
+ s = _mm256_loadu_ps((float*)v_scalar);
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_loadu_ps((float*)b);
+ y = _mm256_loadu_ps((float*)a);
+ z = _mm256_complexconjugatemul_ps(s, x);
+ z = _mm256_add_ps(y, z);
+ _mm256_storeu_ps((float*)c, z);
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) + lv_conj(*b++) * (*scalar);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t* scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, s, z;
+ lv_32fc_t v_scalar[2] = { *scalar, *scalar };
+
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+ lv_32fc_t* c = cVector;
+
+ // Set up constant scalar vector
+ s = _mm_loadu_ps((float*)v_scalar);
+
+ for (; number < halfPoints; number++) {
+ x = _mm_loadu_ps((float*)b);
+ y = _mm_loadu_ps((float*)a);
+ z = _mm_complexconjugatemul_ps(s, x);
+ z = _mm_add_ps(y, z);
+ _mm_storeu_ps((float*)c, z);
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = *a + lv_conj(*b) * (*scalar);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t* scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+
+ __m256 x, y, s, z;
+ lv_32fc_t v_scalar[4] = { *scalar, *scalar, *scalar, *scalar };
+
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+ lv_32fc_t* c = cVector;
+
+ // Set up constant scalar vector
+ s = _mm256_loadu_ps((float*)v_scalar);
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_load_ps((float*)b);
+ y = _mm256_load_ps((float*)a);
+ z = _mm256_complexconjugatemul_ps(s, x);
+ z = _mm256_add_ps(y, z);
+ _mm256_store_ps((float*)c, z);
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) + lv_conj(*b++) * (*scalar);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t* scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, s, z;
+ lv_32fc_t v_scalar[2] = { *scalar, *scalar };
+
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+ lv_32fc_t* c = cVector;
+
+ // Set up constant scalar vector
+ s = _mm_loadu_ps((float*)v_scalar);
+
+ for (; number < halfPoints; number++) {
+ x = _mm_load_ps((float*)b);
+ y = _mm_load_ps((float*)a);
+ z = _mm_complexconjugatemul_ps(s, x);
+ z = _mm_add_ps(y, z);
+ _mm_store_ps((float*)c, z);
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = *a + lv_conj(*b) * (*scalar);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t* scalar,
+ unsigned int num_points)
+{
+ const lv_32fc_t* bPtr = bVector;
+ const lv_32fc_t* aPtr = aVector;
+ lv_32fc_t* cPtr = cVector;
+ unsigned int number = num_points;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4x2_t a_val, b_val, c_val, scalar_val;
+ float32x4x2_t tmp_val;
+
+ scalar_val.val[0] = vld1q_dup_f32((const float*)scalar);
+ scalar_val.val[1] = vld1q_dup_f32(((const float*)scalar) + 1);
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)aPtr);
+ b_val = vld2q_f32((float*)bPtr);
+ b_val.val[1] = vnegq_f32(b_val.val[1]);
+ __VOLK_PREFETCH(aPtr + 8);
+ __VOLK_PREFETCH(bPtr + 8);
+
+ tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
+ tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);
+
+ tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
+ tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);
+
+ c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
+ c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);
+
+ vst2q_f32((float*)cPtr, c_val);
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H */
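Beyond the adaptive-filter example in the header comment, a more minimal usage sketch of the new kernel, assuming only the dispatcher prototype documented above (scalar value and helper name are illustrative):

#include <volk/volk.h>

/* c[i] = a[i] + conj(b[i]) * scalar; buffers are assumed to come from
 * volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment()). */
static void conjugate_scale_accumulate(lv_32fc_t* c,
                                       const lv_32fc_t* a,
                                       const lv_32fc_t* b,
                                       unsigned int N)
{
    lv_32fc_t scalar = lv_cmake(0.5f, -0.25f);
    volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc(c, a, b, &scalar, N);
}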
diff --git a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
index 85cdaf16a..78694044e 100644
--- a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
+++ b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
@@ -10,6 +10,12 @@
/*!
* \page volk_32fc_x2_s32fc_multiply_conjugate_add_32fc
*
+ * \b Deprecation
+ *
+ * This kernel is deprecated because passing an `lv_32fc_t` by value results in
+ * undefined behaviour, causing a segmentation fault on some architectures.
+ * Use `volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc` instead.
+ *
* \b Overview
*
* Conjugate the input complex vector, multiply them by a complex scalar,
@@ -76,6 +82,7 @@
#include <float.h>
#include <inttypes.h>
#include <stdio.h>
+#include <volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h>
#include <volk/volk_complex.h>
@@ -88,35 +95,13 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector,
const lv_32fc_t scalar,
unsigned int num_points)
{
- const lv_32fc_t* aPtr = aVector;
- const lv_32fc_t* bPtr = bVector;
- lv_32fc_t* cPtr = cVector;
- unsigned int number = num_points;
-
- // unwrap loop
- while (number >= 8) {
- *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
- *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
- *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
- *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
- *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
- *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
- *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
- *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
- number -= 8;
- }
-
- // clean up any remaining
- while (number-- > 0) {
- *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
- }
+ volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_generic(
+ cVector, aVector, bVector, &scalar, num_points);
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX
-#include <immintrin.h>
-#include <volk/volk_avx_intrinsics.h>
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector,
@@ -125,43 +110,13 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector,
const lv_32fc_t scalar,
unsigned int num_points)
{
- unsigned int number = 0;
- unsigned int i = 0;
- const unsigned int quarterPoints = num_points / 4;
- unsigned int isodd = num_points & 3;
-
- __m256 x, y, s, z;
- lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };
-
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
- lv_32fc_t* c = cVector;
-
- // Set up constant scalar vector
- s = _mm256_loadu_ps((float*)v_scalar);
-
- for (; number < quarterPoints; number++) {
- x = _mm256_loadu_ps((float*)b);
- y = _mm256_loadu_ps((float*)a);
- z = _mm256_complexconjugatemul_ps(s, x);
- z = _mm256_add_ps(y, z);
- _mm256_storeu_ps((float*)c, z);
-
- a += 4;
- b += 4;
- c += 4;
- }
-
- for (i = num_points - isodd; i < num_points; i++) {
- *c++ = (*a++) + lv_conj(*b++) * scalar;
- }
+ volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_avx(
+ cVector, aVector, bVector, &scalar, num_points);
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
-#include <volk/volk_sse3_intrinsics.h>
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector,
@@ -170,41 +125,13 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector,
const lv_32fc_t scalar,
unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
-
- __m128 x, y, s, z;
- lv_32fc_t v_scalar[2] = { scalar, scalar };
-
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
- lv_32fc_t* c = cVector;
-
- // Set up constant scalar vector
- s = _mm_loadu_ps((float*)v_scalar);
-
- for (; number < halfPoints; number++) {
- x = _mm_loadu_ps((float*)b);
- y = _mm_loadu_ps((float*)a);
- z = _mm_complexconjugatemul_ps(s, x);
- z = _mm_add_ps(y, z);
- _mm_storeu_ps((float*)c, z);
-
- a += 2;
- b += 2;
- c += 2;
- }
-
- if ((num_points % 2) != 0) {
- *c = *a + lv_conj(*b) * scalar;
- }
+ volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_sse3(
+ cVector, aVector, bVector, &scalar, num_points);
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_AVX
-#include <immintrin.h>
-#include <volk/volk_avx_intrinsics.h>
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector,
@@ -213,43 +140,13 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector,
const lv_32fc_t scalar,
unsigned int num_points)
{
- unsigned int number = 0;
- unsigned int i = 0;
- const unsigned int quarterPoints = num_points / 4;
- unsigned int isodd = num_points & 3;
-
- __m256 x, y, s, z;
- lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };
-
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
- lv_32fc_t* c = cVector;
-
- // Set up constant scalar vector
- s = _mm256_loadu_ps((float*)v_scalar);
-
- for (; number < quarterPoints; number++) {
- x = _mm256_load_ps((float*)b);
- y = _mm256_load_ps((float*)a);
- z = _mm256_complexconjugatemul_ps(s, x);
- z = _mm256_add_ps(y, z);
- _mm256_store_ps((float*)c, z);
-
- a += 4;
- b += 4;
- c += 4;
- }
-
- for (i = num_points - isodd; i < num_points; i++) {
- *c++ = (*a++) + lv_conj(*b++) * scalar;
- }
+ volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_avx(
+ cVector, aVector, bVector, &scalar, num_points);
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
-#include <pmmintrin.h>
-#include <volk/volk_sse3_intrinsics.h>
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector,
@@ -258,40 +155,13 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector,
const lv_32fc_t scalar,
unsigned int num_points)
{
- unsigned int number = 0;
- const unsigned int halfPoints = num_points / 2;
-
- __m128 x, y, s, z;
- lv_32fc_t v_scalar[2] = { scalar, scalar };
-
- const lv_32fc_t* a = aVector;
- const lv_32fc_t* b = bVector;
- lv_32fc_t* c = cVector;
-
- // Set up constant scalar vector
- s = _mm_loadu_ps((float*)v_scalar);
-
- for (; number < halfPoints; number++) {
- x = _mm_load_ps((float*)b);
- y = _mm_load_ps((float*)a);
- z = _mm_complexconjugatemul_ps(s, x);
- z = _mm_add_ps(y, z);
- _mm_store_ps((float*)c, z);
-
- a += 2;
- b += 2;
- c += 2;
- }
-
- if ((num_points % 2) != 0) {
- *c = *a + lv_conj(*b) * scalar;
- }
+ volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_sse3(
+ cVector, aVector, bVector, &scalar, num_points);
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
-#include <arm_neon.h>
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector,
@@ -300,44 +170,8 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector,
const lv_32fc_t scalar,
unsigned int num_points)
{
- const lv_32fc_t* bPtr = bVector;
- const lv_32fc_t* aPtr = aVector;
- lv_32fc_t* cPtr = cVector;
- unsigned int number = num_points;
- unsigned int quarter_points = num_points / 4;
-
- float32x4x2_t a_val, b_val, c_val, scalar_val;
- float32x4x2_t tmp_val;
-
- scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
- scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
-
- for (number = 0; number < quarter_points; ++number) {
- a_val = vld2q_f32((float*)aPtr);
- b_val = vld2q_f32((float*)bPtr);
- b_val.val[1] = vnegq_f32(b_val.val[1]);
- __VOLK_PREFETCH(aPtr + 8);
- __VOLK_PREFETCH(bPtr + 8);
-
- tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
- tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);
-
- tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
- tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);
-
- c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
- c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);
-
- vst2q_f32((float*)cPtr, c_val);
-
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
-
- for (number = quarter_points * 4; number < num_points; number++) {
- *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
- }
+ volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_neon(
+ cVector, aVector, bVector, &scalar, num_points);
}
#endif /* LV_HAVE_NEON */
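Since the deprecated by-value kernel now simply forwards to its *2 counterpart, results can be sanity-checked against the scalar definition c[i] = a[i] + conj(b[i]) * scalar. A sketch of such a check (helper name and tolerance are illustrative, not from the patch):

#include <math.h>
#include <volk/volk.h>

static int matches_scalar_reference(const lv_32fc_t* c,
                                    const lv_32fc_t* a,
                                    const lv_32fc_t* b,
                                    lv_32fc_t scalar,
                                    unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        const lv_32fc_t expected = a[i] + lv_conj(b[i]) * scalar;
        if (fabsf(lv_creal(expected) - lv_creal(c[i])) > 1e-4f ||
            fabsf(lv_cimag(expected) - lv_cimag(c[i])) > 1e-4f) {
            return 0; /* mismatch beyond the illustrative tolerance */
        }
    }
    return 1;
}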
diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h
index cd8efb981..9da8b399b 100644
--- a/lib/kernel_tests.h
+++ b/lib/kernel_tests.h
@@ -55,8 +55,8 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
QA(VOLK_INIT_PUPP(volk_32u_byteswappuppet_32u, volk_32u_byteswap, test_params))
QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params))
QA(VOLK_INIT_PUPP(volk_64u_byteswappuppet_64u, volk_64u_byteswap, test_params))
- QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc,
- volk_32fc_s32fc_x2_rotator_32fc,
+ QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotator2puppet_32fc,
+ volk_32fc_s32fc_x2_rotator2_32fc,
test_params_rotator))
QA(VOLK_INIT_PUPP(
volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, test_params.make_tol(0)))
@@ -158,14 +158,14 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params))
QA(VOLK_INIT_TEST(volk_8i_convert_16i, test_params))
QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params))
- QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply2_32fc, test_params))
QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params))
QA(VOLK_INIT_TEST(volk_32f_s32f_add_32f, test_params))
QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params))
QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params))
QA(VOLK_INIT_TEST(volk_32u_reverse_32u, test_params))
QA(VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc))
- QA(VOLK_INIT_TEST(volk_32fc_x2_s32fc_multiply_conjugate_add_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc, test_params))
QA(VOLK_INIT_TEST(volk_32f_exp_32f, test_params))
QA(VOLK_INIT_PUPP(volk_32f_x2_powpuppet_32f, volk_32f_x2_pow_32f, test_params_inacc))
QA(VOLK_INIT_PUPP(
diff --git a/lib/qa_utils.cc b/lib/qa_utils.cc
index 4be7b8ad9..a94d895c4 100644
--- a/lib/qa_utils.cc
+++ b/lib/qa_utils.cc
@@ -355,7 +355,7 @@ inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func,
std::string arch)
{
while (iter--)
- func(buffs[0], scalar, vlen, arch.c_str());
+ func(buffs[0], &scalar, vlen, arch.c_str());
}
inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func,
@@ -366,7 +366,7 @@ inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func,
std::string arch)
{
while (iter--)
- func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+ func(buffs[0], buffs[1], &scalar, vlen, arch.c_str());
}
inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func,
@@ -377,7 +377,7 @@ inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func,
std::string arch)
{
while (iter--)
- func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+ func(buffs[0], buffs[1], buffs[2], &scalar, vlen, arch.c_str());
}
template <class t>
diff --git a/lib/qa_utils.h b/lib/qa_utils.h
index 40f549cfd..a65677203 100644
--- a/lib/qa_utils.h
+++ b/lib/qa_utils.h
@@ -199,11 +199,11 @@ typedef void (*volk_fn_2arg_s32f)(void*, void*, float, unsigned int, const char*
typedef void (*volk_fn_3arg_s32f)(void*, void*, void*, float, unsigned int, const char*);
typedef void (*volk_fn_1arg_s32fc)(
void*,
- lv_32fc_t,
+ lv_32fc_t*,
unsigned int,
const char*); // one input vector, one scalar float input
-typedef void (*volk_fn_2arg_s32fc)(void*, void*, lv_32fc_t, unsigned int, const char*);
+typedef void (*volk_fn_2arg_s32fc)(void*, void*, lv_32fc_t*, unsigned int, const char*);
typedef void (*volk_fn_3arg_s32fc)(
- void*, void*, void*, lv_32fc_t, unsigned int, const char*);
+ void*, void*, void*, lv_32fc_t*, unsigned int, const char*);
#endif // VOLK_QA_UTILS_H
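With the volk_fn_*_s32fc typedefs now taking lv_32fc_t*, the QA harness matches the by-pointer kernels directly. A sketch of the calling pattern used by run_cast_test2_s32fc, assuming the *_manual entry point follows VOLK's usual generated naming (the cast mirrors qa_utils.cc; buffer setup is omitted):

#include <volk/volk.h>

typedef void (*volk_fn_2arg_s32fc)(void*, void*, lv_32fc_t*, unsigned int, const char*);

static void run_generic_impl(lv_32fc_t* out, lv_32fc_t* in, unsigned int vlen)
{
    lv_32fc_t scalar = lv_cmake(2.f, -1.f);
    volk_fn_2arg_s32fc func = (volk_fn_2arg_s32fc)volk_32fc_s32fc_multiply2_32fc_manual;
    func((void*)out, (void*)in, &scalar, vlen, "generic");
}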
diff --git a/tmpl/volk.tmpl.h b/tmpl/volk.tmpl.h
index ad5595b24..b26c542cf 100644
--- a/tmpl/volk.tmpl.h
+++ b/tmpl/volk.tmpl.h
@@ -63,9 +63,11 @@ VOLK_API bool volk_is_aligned(const void *ptr);
// Just drop the deprecated attribute in case we are on Windows. Clang and GCC support `__attribute__`.
// We just assume the compiler and the system are tight together as far as Mako templates are concerned.
<%
-deprecated_kernels = ('volk_16i_x5_add_quad_16i_x4', 'volk_16i_branch_4_state_8',
- 'volk_16i_max_star_16i', 'volk_16i_max_star_horizontal_16i',
- 'volk_16i_permute_and_scalar_add', 'volk_16i_x4_quad_max_star_16i')
+deprecated_kernels = ('volk_16i_x5_add_quad_16i_x4', 'volk_16i_branch_4_state_8',
+ 'volk_16i_max_star_16i', 'volk_16i_max_star_horizontal_16i',
+ 'volk_16i_permute_and_scalar_add', 'volk_16i_x4_quad_max_star_16i',
+ 'volk_32fc_s32fc_multiply_32fc', 'volk_32fc_s32fc_x2_rotator_32fc',
+ 'volk_32fc_x2_s32fc_multiply_conjugate_add_32fc')
from platform import system
if system() == 'Windows':
deprecated_kernels = ()
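On non-Windows platforms the three kernels appended to deprecated_kernels above are emitted with a GCC/Clang deprecation attribute, so calling them produces a compile-time warning rather than an error. A standalone illustration of that mechanism (placeholder names, not the template's actual output):

/* Illustration only: how __attribute__((deprecated)) behaves under GCC/Clang. */
static void old_style_kernel(void) __attribute__((deprecated("use the *2 variant instead")));

static void old_style_kernel(void) {}

int main(void)
{
    old_style_kernel(); /* warning: 'old_style_kernel' is deprecated */
    return 0;
}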