From 23d5871f4a0084a62cbeb4a289bacccc5e914b69 Mon Sep 17 00:00:00 2001
From: Andrej Rode <mail@andrejro.de>
Date: Sat, 24 Oct 2020 13:20:57 +0200
Subject: [PATCH 1/2] archs: MSVC 2013 and greater don't have a SSE flag

But still they support SSE built-in.
Add FORCE_ARCH macro to enable SIMD architecture for
compilers which don't have a compiler flag to check
for them.
---
 gen/archs.xml      |  3 ---
 lib/CMakeLists.txt | 15 +++++++++++----
 2 files changed, 11 insertions(+), 7 deletions(-)
diff --git a/gen/archs.xml b/gen/archs.xml
index 56689f1ac..cd6905f12 100644
--- a/gen/archs.xml
+++ b/gen/archs.xml
@@ -66,7 +66,6 @@
   <check name="mmx"></check>
   <flag compiler="gnu">-mmmx</flag>
   <flag compiler="clang">-mmmx</flag>
-  <flag compiler="msvc">/arch:SSE</flag>
   <alignment>8</alignment>
 </arch>
 
@@ -82,7 +81,6 @@
   <check name="sse"></check>
   <flag compiler="gnu">-msse</flag>
   <flag compiler="clang">-msse</flag>
-  <flag compiler="msvc">/arch:SSE</flag>
   <environment>_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);</environment>
   <include>xmmintrin.h</include>
   <alignment>16</alignment>
@@ -92,7 +90,6 @@
   <check name="sse2"></check>
   <flag compiler="gnu">-msse2</flag>
   <flag compiler="clang">-msse2</flag>
-  <flag compiler="msvc">/arch:SSE2</flag>
   <alignment>16</alignment>
 </arch>
 
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 37aaed5e5..74585300c 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -138,6 +138,11 @@ macro(OVERRULE_ARCH arch reason)
     list(REMOVE_ITEM available_archs ${arch})
 endmacro(OVERRULE_ARCH)
 
+macro(FORCE_ARCH arch reason)
+  message(STATUS "${reason}, Forced arch ${arch}")
+  list(APPEND available_archs ${arch})
+endmacro(FORCE_ARCH)
+
 ########################################################################
 # eliminate AVX on if not on x86, or if the compiler does not accept
 # the xgetbv instruction, or {if not cross-compiling and the xgetbv
@@ -257,13 +262,15 @@ if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86)
     endif()
 
     #MSVC 64 bit does not have MMX, overrule it
-    if (${SIZEOF_CPU} EQUAL 64 AND MSVC)
+    if (MSVC)
+      if (${SIZEOF_CPU} EQUAL 64)
         OVERRULE_ARCH(mmx "No MMX for Win64")
-	if (MSVC_VERSION GREATER 1700)
-            OVERRULE_ARCH(sse "No SSE for Win64 Visual Studio 2013")
-        endif()
+      endif()
+      FORCE_ARCH(sse "Built-in for MSVC > 2013")
+      FORCE_ARCH(sse2 "Built-in for MSVC > 2013")
     endif()
 
+
 endif()
 
 ########################################################################

From f65b44d2704b37032d00ff0348fce1514afd2b02 Mon Sep 17 00:00:00 2001
From: Andrej Rode <mail@andrejro.de>
Date: Mon, 9 Nov 2020 01:40:25 +0100
Subject: [PATCH 2/2] volk_32fc_x2_dot_prod_32fc: disable slow & broken SSE4.1
 kernel

---
 kernels/volk/volk_32fc_x2_dot_prod_32fc.h | 244 +++++++++++-----------
 1 file changed, 122 insertions(+), 122 deletions(-)

diff --git a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
index b0b7fee3f..e0c585791 100644
--- a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
+++ b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -302,89 +302,89 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result,
 
 #endif /*LV_HAVE_SSE3*/
 
-#ifdef LV_HAVE_SSE4_1
+// #ifdef LV_HAVE_SSE4_1
 
-#include <smmintrin.h>
+// #include <smmintrin.h>
 
-static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result,
-                                                       const lv_32fc_t* input,
-                                                       const lv_32fc_t* taps,
-                                                       unsigned int num_points)
-{
+// static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result,
+//                                                        const lv_32fc_t* input,
+//                                                        const lv_32fc_t* taps,
+//                                                        unsigned int num_points)
+// {
 
-    unsigned int i = 0;
-    const unsigned int qtr_points = num_points / 4;
-    const unsigned int isodd = num_points & 3;
+//     unsigned int i = 0;
+//     const unsigned int qtr_points = num_points / 4;
+//     const unsigned int isodd = num_points & 3;
 
-    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
-    float *p_input, *p_taps;
-    __m64* p_result;
+//     __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+//     float *p_input, *p_taps;
+//     __m64* p_result;
 
-    p_result = (__m64*)result;
-    p_input = (float*)input;
-    p_taps = (float*)taps;
+//     p_result = (__m64*)result;
+//     p_input = (float*)input;
+//     p_taps = (float*)taps;
 
-    static const __m128i neg = { 0x000000000000000080000000 };
+//     static const __m128i neg = { 0x000000000000000080000000 };
 
-    real0 = _mm_setzero_ps();
-    real1 = _mm_setzero_ps();
-    im0 = _mm_setzero_ps();
-    im1 = _mm_setzero_ps();
+//     real0 = _mm_setzero_ps();
+//     real1 = _mm_setzero_ps();
+//     im0 = _mm_setzero_ps();
+//     im1 = _mm_setzero_ps();
 
-    for (; i < qtr_points; ++i) {
-        xmm0 = _mm_loadu_ps(p_input);
-        xmm1 = _mm_loadu_ps(p_taps);
+//     for (; i < qtr_points; ++i) {
+//         xmm0 = _mm_loadu_ps(p_input);
+//         xmm1 = _mm_loadu_ps(p_taps);
 
-        p_input += 4;
-        p_taps += 4;
+//         p_input += 4;
+//         p_taps += 4;
 
-        xmm2 = _mm_loadu_ps(p_input);
-        xmm3 = _mm_loadu_ps(p_taps);
+//         xmm2 = _mm_loadu_ps(p_input);
+//         xmm3 = _mm_loadu_ps(p_taps);
 
-        p_input += 4;
-        p_taps += 4;
+//         p_input += 4;
+//         p_taps += 4;
 
-        xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
-        xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
-        xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
-        xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+//         xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+//         xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+//         xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+//         xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
 
-        // imaginary vector from input
-        xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
-        // real vector from input
-        xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
-        // imaginary vector from taps
-        xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
-        // real vector from taps
-        xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+//         // imaginary vector from input
+//         xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+//         // real vector from input
+//         xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+//         // imaginary vector from taps
+//         xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+//         // real vector from taps
+//         xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
 
-        xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
-        xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+//         xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+//         xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
 
-        xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
-        xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+//         xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+//         xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
 
-        real0 = _mm_add_ps(xmm4, real0);
-        real1 = _mm_add_ps(xmm5, real1);
-        im0 = _mm_add_ps(xmm6, im0);
-        im1 = _mm_add_ps(xmm7, im1);
-    }
+//         real0 = _mm_add_ps(xmm4, real0);
+//         real1 = _mm_add_ps(xmm5, real1);
+//         im0 = _mm_add_ps(xmm6, im0);
+//         im1 = _mm_add_ps(xmm7, im1);
+//     }
 
-    real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
+//     real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
 
-    im0 = _mm_add_ps(im0, im1);
-    real0 = _mm_add_ps(real0, real1);
+//     im0 = _mm_add_ps(im0, im1);
+//     real0 = _mm_add_ps(real0, real1);
 
-    im0 = _mm_add_ps(im0, real0);
+//     im0 = _mm_add_ps(im0, real0);
 
-    _mm_storel_pi(p_result, im0);
+//     _mm_storel_pi(p_result, im0);
 
-    for (i = num_points - isodd; i < num_points; i++) {
-        *result += input[i] * taps[i];
-    }
-}
+//     for (i = num_points - isodd; i < num_points; i++) {
+//         *result += input[i] * taps[i];
+//     }
+// }
 
-#endif /*LV_HAVE_SSE4_1*/
+// #endif /*LV_HAVE_SSE4_1*/
 
 #ifdef LV_HAVE_AVX
 
@@ -895,89 +895,89 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result,
 #endif /*LV_HAVE_SSE3*/
 
 
-#ifdef LV_HAVE_SSE4_1
+// #ifdef LV_HAVE_SSE4_1
 
-#include <smmintrin.h>
+// #include <smmintrin.h>
 
-static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result,
-                                                       const lv_32fc_t* input,
-                                                       const lv_32fc_t* taps,
-                                                       unsigned int num_points)
-{
+// static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result,
+//                                                        const lv_32fc_t* input,
+//                                                        const lv_32fc_t* taps,
+//                                                        unsigned int num_points)
+// {
 
-    unsigned int i = 0;
-    const unsigned int qtr_points = num_points / 4;
-    const unsigned int isodd = num_points & 3;
+//     unsigned int i = 0;
+//     const unsigned int qtr_points = num_points / 4;
+//     const unsigned int isodd = num_points & 3;
 
-    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
-    float *p_input, *p_taps;
-    __m64* p_result;
+//     __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+//     float *p_input, *p_taps;
+//     __m64* p_result;
 
-    static const __m128i neg = { 0x000000000000000080000000 };
+//     static const __m128i neg = { 0x000000000000000080000000 };
 
-    p_result = (__m64*)result;
-    p_input = (float*)input;
-    p_taps = (float*)taps;
+//     p_result = (__m64*)result;
+//     p_input = (float*)input;
+//     p_taps = (float*)taps;
 
-    real0 = _mm_setzero_ps();
-    real1 = _mm_setzero_ps();
-    im0 = _mm_setzero_ps();
-    im1 = _mm_setzero_ps();
+//     real0 = _mm_setzero_ps();
+//     real1 = _mm_setzero_ps();
+//     im0 = _mm_setzero_ps();
+//     im1 = _mm_setzero_ps();
 
-    for (; i < qtr_points; ++i) {
-        xmm0 = _mm_load_ps(p_input);
-        xmm1 = _mm_load_ps(p_taps);
+//     for (; i < qtr_points; ++i) {
+//         xmm0 = _mm_load_ps(p_input);
+//         xmm1 = _mm_load_ps(p_taps);
 
-        p_input += 4;
-        p_taps += 4;
+//         p_input += 4;
+//         p_taps += 4;
 
-        xmm2 = _mm_load_ps(p_input);
-        xmm3 = _mm_load_ps(p_taps);
+//         xmm2 = _mm_load_ps(p_input);
+//         xmm3 = _mm_load_ps(p_taps);
 
-        p_input += 4;
-        p_taps += 4;
+//         p_input += 4;
+//         p_taps += 4;
 
-        xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
-        xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
-        xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
-        xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+//         xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+//         xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+//         xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+//         xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
 
-        // imaginary vector from input
-        xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
-        // real vector from input
-        xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
-        // imaginary vector from taps
-        xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
-        // real vector from taps
-        xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+//         // imaginary vector from input
+//         xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+//         // real vector from input
+//         xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+//         // imaginary vector from taps
+//         xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+//         // real vector from taps
+//         xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
 
-        xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
-        xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+//         xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+//         xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
 
-        xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
-        xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+//         xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+//         xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
 
-        real0 = _mm_add_ps(xmm4, real0);
-        real1 = _mm_add_ps(xmm5, real1);
-        im0 = _mm_add_ps(xmm6, im0);
-        im1 = _mm_add_ps(xmm7, im1);
-    }
+//         real0 = _mm_add_ps(xmm4, real0);
+//         real1 = _mm_add_ps(xmm5, real1);
+//         im0 = _mm_add_ps(xmm6, im0);
+//         im1 = _mm_add_ps(xmm7, im1);
+//     }
 
-    real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
+//     real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
 
-    im0 = _mm_add_ps(im0, im1);
-    real0 = _mm_add_ps(real0, real1);
+//     im0 = _mm_add_ps(im0, im1);
+//     real0 = _mm_add_ps(real0, real1);
 
-    im0 = _mm_add_ps(im0, real0);
+//     im0 = _mm_add_ps(im0, real0);
 
-    _mm_storel_pi(p_result, im0);
+//     _mm_storel_pi(p_result, im0);
 
-    for (i = num_points - isodd; i < num_points; i++) {
-        *result += input[i] * taps[i];
-    }
-}
+//     for (i = num_points - isodd; i < num_points; i++) {
+//         *result += input[i] * taps[i];
+//     }
+// }
 
-#endif /*LV_HAVE_SSE4_1*/
+// #endif /*LV_HAVE_SSE4_1*/
 
 #ifdef LV_HAVE_NEON
 #include <arm_neon.h>