From 23d5871f4a0084a62cbeb4a289bacccc5e914b69 Mon Sep 17 00:00:00 2001 From: Andrej Rode Date: Sat, 24 Oct 2020 13:20:57 +0200 Subject: [PATCH 1/2] archs: MSVC 2013 and greater don't have an SSE flag, but they still support SSE built-in. Add FORCE_ARCH macro to enable SIMD architectures for compilers which don't have a compiler flag to check for them. --- gen/archs.xml | 3 --- lib/CMakeLists.txt | 15 +++++++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/gen/archs.xml b/gen/archs.xml index 56689f1ac..cd6905f12 100644 --- a/gen/archs.xml +++ b/gen/archs.xml @@ -66,7 +66,6 @@ -mmmx -mmmx - /arch:SSE 8 @@ -82,7 +81,6 @@ -msse -msse - /arch:SSE _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); xmmintrin.h 16 @@ -92,7 +90,6 @@ -msse2 -msse2 - /arch:SSE2 16 diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 37aaed5e5..74585300c 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -138,6 +138,11 @@ macro(OVERRULE_ARCH arch reason) list(REMOVE_ITEM available_archs ${arch}) endmacro(OVERRULE_ARCH) +macro(FORCE_ARCH arch reason) + message(STATUS "${reason}, Forced arch ${arch}") + list(APPEND available_archs ${arch}) +endmacro(FORCE_ARCH) + ######################################################################## # eliminate AVX on if not on x86, or if the compiler does not accept # the xgetbv instruction, or {if not cross-compiling and the xgetbv @@ -257,13 +262,15 @@ if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86) endif() #MSVC 64 bit does not have MMX, overrule it - if (${SIZEOF_CPU} EQUAL 64 AND MSVC) + if (MSVC) + if (${SIZEOF_CPU} EQUAL 64) OVERRULE_ARCH(mmx "No MMX for Win64") - if (MSVC_VERSION GREATER 1700) - OVERRULE_ARCH(sse "No SSE for Win64 Visual Studio 2013") - endif() + endif() + FORCE_ARCH(sse "Built-in for MSVC > 2013") + FORCE_ARCH(sse2 "Built-in for MSVC > 2013") endif() + endif() ######################################################################## From f65b44d2704b37032d00ff0348fce1514afd2b02 Mon Sep 17 00:00:00 2001 From:
Andrej Rode Date: Mon, 9 Nov 2020 01:40:25 +0100 Subject: [PATCH 2/2] volk_32fc_x2_dot_prod_32fc: disable slow & broken SSE4.1 kernel --- kernels/volk/volk_32fc_x2_dot_prod_32fc.h | 244 +++++++++++----------- 1 file changed, 122 insertions(+), 122 deletions(-) diff --git a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h index b0b7fee3f..e0c585791 100644 --- a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h +++ b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h @@ -302,89 +302,89 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, #endif /*LV_HAVE_SSE3*/ -#ifdef LV_HAVE_SSE4_1 +// #ifdef LV_HAVE_SSE4_1 -#include +// #include -static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, - const lv_32fc_t* input, - const lv_32fc_t* taps, - unsigned int num_points) -{ +// static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, +// const lv_32fc_t* input, +// const lv_32fc_t* taps, +// unsigned int num_points) +// { - unsigned int i = 0; - const unsigned int qtr_points = num_points / 4; - const unsigned int isodd = num_points & 3; +// unsigned int i = 0; +// const unsigned int qtr_points = num_points / 4; +// const unsigned int isodd = num_points & 3; - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; - float *p_input, *p_taps; - __m64* p_result; +// __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; +// float *p_input, *p_taps; +// __m64* p_result; - p_result = (__m64*)result; - p_input = (float*)input; - p_taps = (float*)taps; +// p_result = (__m64*)result; +// p_input = (float*)input; +// p_taps = (float*)taps; - static const __m128i neg = { 0x000000000000000080000000 }; +// static const __m128i neg = { 0x000000000000000080000000 }; - real0 = _mm_setzero_ps(); - real1 = _mm_setzero_ps(); - im0 = _mm_setzero_ps(); - im1 = _mm_setzero_ps(); +// real0 = _mm_setzero_ps(); +// real1 = _mm_setzero_ps(); +// im0 = _mm_setzero_ps(); +// im1 
= _mm_setzero_ps(); - for (; i < qtr_points; ++i) { - xmm0 = _mm_loadu_ps(p_input); - xmm1 = _mm_loadu_ps(p_taps); +// for (; i < qtr_points; ++i) { +// xmm0 = _mm_loadu_ps(p_input); +// xmm1 = _mm_loadu_ps(p_taps); - p_input += 4; - p_taps += 4; +// p_input += 4; +// p_taps += 4; - xmm2 = _mm_loadu_ps(p_input); - xmm3 = _mm_loadu_ps(p_taps); +// xmm2 = _mm_loadu_ps(p_input); +// xmm3 = _mm_loadu_ps(p_taps); - p_input += 4; - p_taps += 4; +// p_input += 4; +// p_taps += 4; - xmm4 = _mm_unpackhi_ps(xmm0, xmm2); - xmm5 = _mm_unpackhi_ps(xmm1, xmm3); - xmm0 = _mm_unpacklo_ps(xmm0, xmm2); - xmm2 = _mm_unpacklo_ps(xmm1, xmm3); +// xmm4 = _mm_unpackhi_ps(xmm0, xmm2); +// xmm5 = _mm_unpackhi_ps(xmm1, xmm3); +// xmm0 = _mm_unpacklo_ps(xmm0, xmm2); +// xmm2 = _mm_unpacklo_ps(xmm1, xmm3); - // imaginary vector from input - xmm1 = _mm_unpackhi_ps(xmm0, xmm4); - // real vector from input - xmm3 = _mm_unpacklo_ps(xmm0, xmm4); - // imaginary vector from taps - xmm0 = _mm_unpackhi_ps(xmm2, xmm5); - // real vector from taps - xmm2 = _mm_unpacklo_ps(xmm2, xmm5); +// // imaginary vector from input +// xmm1 = _mm_unpackhi_ps(xmm0, xmm4); +// // real vector from input +// xmm3 = _mm_unpacklo_ps(xmm0, xmm4); +// // imaginary vector from taps +// xmm0 = _mm_unpackhi_ps(xmm2, xmm5); +// // real vector from taps +// xmm2 = _mm_unpacklo_ps(xmm2, xmm5); - xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); - xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); +// xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); +// xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); - xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); - xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); +// xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); +// xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); - real0 = _mm_add_ps(xmm4, real0); - real1 = _mm_add_ps(xmm5, real1); - im0 = _mm_add_ps(xmm6, im0); - im1 = _mm_add_ps(xmm7, im1); - } +// real0 = _mm_add_ps(xmm4, real0); +// real1 = _mm_add_ps(xmm5, real1); +// im0 = _mm_add_ps(xmm6, im0); +// im1 = _mm_add_ps(xmm7, im1); +// } - real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); 
+// real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); - im0 = _mm_add_ps(im0, im1); - real0 = _mm_add_ps(real0, real1); +// im0 = _mm_add_ps(im0, im1); +// real0 = _mm_add_ps(real0, real1); - im0 = _mm_add_ps(im0, real0); +// im0 = _mm_add_ps(im0, real0); - _mm_storel_pi(p_result, im0); +// _mm_storel_pi(p_result, im0); - for (i = num_points - isodd; i < num_points; i++) { - *result += input[i] * taps[i]; - } -} +// for (i = num_points - isodd; i < num_points; i++) { +// *result += input[i] * taps[i]; +// } +// } -#endif /*LV_HAVE_SSE4_1*/ +// #endif /*LV_HAVE_SSE4_1*/ #ifdef LV_HAVE_AVX @@ -895,89 +895,89 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, #endif /*LV_HAVE_SSE3*/ -#ifdef LV_HAVE_SSE4_1 +// #ifdef LV_HAVE_SSE4_1 -#include +// #include -static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, - const lv_32fc_t* input, - const lv_32fc_t* taps, - unsigned int num_points) -{ +// static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, +// const lv_32fc_t* input, +// const lv_32fc_t* taps, +// unsigned int num_points) +// { - unsigned int i = 0; - const unsigned int qtr_points = num_points / 4; - const unsigned int isodd = num_points & 3; +// unsigned int i = 0; +// const unsigned int qtr_points = num_points / 4; +// const unsigned int isodd = num_points & 3; - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; - float *p_input, *p_taps; - __m64* p_result; +// __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; +// float *p_input, *p_taps; +// __m64* p_result; - static const __m128i neg = { 0x000000000000000080000000 }; +// static const __m128i neg = { 0x000000000000000080000000 }; - p_result = (__m64*)result; - p_input = (float*)input; - p_taps = (float*)taps; +// p_result = (__m64*)result; +// p_input = (float*)input; +// p_taps = (float*)taps; - real0 = _mm_setzero_ps(); - real1 = _mm_setzero_ps(); - im0 = _mm_setzero_ps(); - im1 = 
_mm_setzero_ps(); +// real0 = _mm_setzero_ps(); +// real1 = _mm_setzero_ps(); +// im0 = _mm_setzero_ps(); +// im1 = _mm_setzero_ps(); - for (; i < qtr_points; ++i) { - xmm0 = _mm_load_ps(p_input); - xmm1 = _mm_load_ps(p_taps); +// for (; i < qtr_points; ++i) { +// xmm0 = _mm_load_ps(p_input); +// xmm1 = _mm_load_ps(p_taps); - p_input += 4; - p_taps += 4; +// p_input += 4; +// p_taps += 4; - xmm2 = _mm_load_ps(p_input); - xmm3 = _mm_load_ps(p_taps); +// xmm2 = _mm_load_ps(p_input); +// xmm3 = _mm_load_ps(p_taps); - p_input += 4; - p_taps += 4; +// p_input += 4; +// p_taps += 4; - xmm4 = _mm_unpackhi_ps(xmm0, xmm2); - xmm5 = _mm_unpackhi_ps(xmm1, xmm3); - xmm0 = _mm_unpacklo_ps(xmm0, xmm2); - xmm2 = _mm_unpacklo_ps(xmm1, xmm3); +// xmm4 = _mm_unpackhi_ps(xmm0, xmm2); +// xmm5 = _mm_unpackhi_ps(xmm1, xmm3); +// xmm0 = _mm_unpacklo_ps(xmm0, xmm2); +// xmm2 = _mm_unpacklo_ps(xmm1, xmm3); - // imaginary vector from input - xmm1 = _mm_unpackhi_ps(xmm0, xmm4); - // real vector from input - xmm3 = _mm_unpacklo_ps(xmm0, xmm4); - // imaginary vector from taps - xmm0 = _mm_unpackhi_ps(xmm2, xmm5); - // real vector from taps - xmm2 = _mm_unpacklo_ps(xmm2, xmm5); +// // imaginary vector from input +// xmm1 = _mm_unpackhi_ps(xmm0, xmm4); +// // real vector from input +// xmm3 = _mm_unpacklo_ps(xmm0, xmm4); +// // imaginary vector from taps +// xmm0 = _mm_unpackhi_ps(xmm2, xmm5); +// // real vector from taps +// xmm2 = _mm_unpacklo_ps(xmm2, xmm5); - xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); - xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); +// xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); +// xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); - xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); - xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); +// xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); +// xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); - real0 = _mm_add_ps(xmm4, real0); - real1 = _mm_add_ps(xmm5, real1); - im0 = _mm_add_ps(xmm6, im0); - im1 = _mm_add_ps(xmm7, im1); - } +// real0 = _mm_add_ps(xmm4, real0); +// real1 = _mm_add_ps(xmm5, real1); +// im0 = 
_mm_add_ps(xmm6, im0); +// im1 = _mm_add_ps(xmm7, im1); +// } - real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); +// real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); - im0 = _mm_add_ps(im0, im1); - real0 = _mm_add_ps(real0, real1); +// im0 = _mm_add_ps(im0, im1); +// real0 = _mm_add_ps(real0, real1); - im0 = _mm_add_ps(im0, real0); +// im0 = _mm_add_ps(im0, real0); - _mm_storel_pi(p_result, im0); +// _mm_storel_pi(p_result, im0); - for (i = num_points - isodd; i < num_points; i++) { - *result += input[i] * taps[i]; - } -} +// for (i = num_points - isodd; i < num_points; i++) { +// *result += input[i] * taps[i]; +// } +// } -#endif /*LV_HAVE_SSE4_1*/ +// #endif /*LV_HAVE_SSE4_1*/ #ifdef LV_HAVE_NEON #include