Skip to content

Commit

Permalink
Merge pull request #411 from noc0lour/enable_sse_on_MSVC
Browse files Browse the repository at this point in the history
archs: MSVC 2013 and greater don't have a SSE flag
  • Loading branch information
michaelld authored Nov 9, 2020
2 parents 24c7c84 + f65b44d commit d838ba7
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 129 deletions.
3 changes: 0 additions & 3 deletions gen/archs.xml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@
<check name="mmx"></check>
<flag compiler="gnu">-mmmx</flag>
<flag compiler="clang">-mmmx</flag>
<flag compiler="msvc">/arch:SSE</flag>
<alignment>8</alignment>
</arch>

Expand All @@ -82,7 +81,6 @@
<check name="sse"></check>
<flag compiler="gnu">-msse</flag>
<flag compiler="clang">-msse</flag>
<flag compiler="msvc">/arch:SSE</flag>
<environment>_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);</environment>
<include>xmmintrin.h</include>
<alignment>16</alignment>
Expand All @@ -92,7 +90,6 @@
<check name="sse2"></check>
<flag compiler="gnu">-msse2</flag>
<flag compiler="clang">-msse2</flag>
<flag compiler="msvc">/arch:SSE2</flag>
<alignment>16</alignment>
</arch>

Expand Down
244 changes: 122 additions & 122 deletions kernels/volk/volk_32fc_x2_dot_prod_32fc.h
Original file line number Diff line number Diff line change
Expand Up @@ -302,89 +302,89 @@ static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result,

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_SSE4_1
// #ifdef LV_HAVE_SSE4_1

#include <smmintrin.h>
// #include <smmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result,
const lv_32fc_t* input,
const lv_32fc_t* taps,
unsigned int num_points)
{
// static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result,
// const lv_32fc_t* input,
// const lv_32fc_t* taps,
// unsigned int num_points)
// {

unsigned int i = 0;
const unsigned int qtr_points = num_points / 4;
const unsigned int isodd = num_points & 3;
// unsigned int i = 0;
// const unsigned int qtr_points = num_points / 4;
// const unsigned int isodd = num_points & 3;

__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
float *p_input, *p_taps;
__m64* p_result;
// __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
// float *p_input, *p_taps;
// __m64* p_result;

p_result = (__m64*)result;
p_input = (float*)input;
p_taps = (float*)taps;
// p_result = (__m64*)result;
// p_input = (float*)input;
// p_taps = (float*)taps;

static const __m128i neg = { 0x000000000000000080000000 };
// static const __m128i neg = { 0x000000000000000080000000 };

real0 = _mm_setzero_ps();
real1 = _mm_setzero_ps();
im0 = _mm_setzero_ps();
im1 = _mm_setzero_ps();
// real0 = _mm_setzero_ps();
// real1 = _mm_setzero_ps();
// im0 = _mm_setzero_ps();
// im1 = _mm_setzero_ps();

for (; i < qtr_points; ++i) {
xmm0 = _mm_loadu_ps(p_input);
xmm1 = _mm_loadu_ps(p_taps);
// for (; i < qtr_points; ++i) {
// xmm0 = _mm_loadu_ps(p_input);
// xmm1 = _mm_loadu_ps(p_taps);

p_input += 4;
p_taps += 4;
// p_input += 4;
// p_taps += 4;

xmm2 = _mm_loadu_ps(p_input);
xmm3 = _mm_loadu_ps(p_taps);
// xmm2 = _mm_loadu_ps(p_input);
// xmm3 = _mm_loadu_ps(p_taps);

p_input += 4;
p_taps += 4;
// p_input += 4;
// p_taps += 4;

xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
// xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
// xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
// xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
// xmm2 = _mm_unpacklo_ps(xmm1, xmm3);

// imaginary vector from input
xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
// real vector from input
xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
// imaginary vector from taps
xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
// real vector from taps
xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
// // imaginary vector from input
// xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
// // real vector from input
// xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
// // imaginary vector from taps
// xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
// // real vector from taps
// xmm2 = _mm_unpacklo_ps(xmm2, xmm5);

xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
// xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
// xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);

xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
// xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
// xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);

real0 = _mm_add_ps(xmm4, real0);
real1 = _mm_add_ps(xmm5, real1);
im0 = _mm_add_ps(xmm6, im0);
im1 = _mm_add_ps(xmm7, im1);
}
// real0 = _mm_add_ps(xmm4, real0);
// real1 = _mm_add_ps(xmm5, real1);
// im0 = _mm_add_ps(xmm6, im0);
// im1 = _mm_add_ps(xmm7, im1);
// }

real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
// real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);

im0 = _mm_add_ps(im0, im1);
real0 = _mm_add_ps(real0, real1);
// im0 = _mm_add_ps(im0, im1);
// real0 = _mm_add_ps(real0, real1);

im0 = _mm_add_ps(im0, real0);
// im0 = _mm_add_ps(im0, real0);

_mm_storel_pi(p_result, im0);
// _mm_storel_pi(p_result, im0);

for (i = num_points - isodd; i < num_points; i++) {
*result += input[i] * taps[i];
}
}
// for (i = num_points - isodd; i < num_points; i++) {
// *result += input[i] * taps[i];
// }
// }

#endif /*LV_HAVE_SSE4_1*/
// #endif /*LV_HAVE_SSE4_1*/

#ifdef LV_HAVE_AVX

Expand Down Expand Up @@ -895,89 +895,89 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result,
#endif /*LV_HAVE_SSE3*/


#ifdef LV_HAVE_SSE4_1
// #ifdef LV_HAVE_SSE4_1

#include <smmintrin.h>
// #include <smmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result,
const lv_32fc_t* input,
const lv_32fc_t* taps,
unsigned int num_points)
{
// static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result,
// const lv_32fc_t* input,
// const lv_32fc_t* taps,
// unsigned int num_points)
// {

unsigned int i = 0;
const unsigned int qtr_points = num_points / 4;
const unsigned int isodd = num_points & 3;
// unsigned int i = 0;
// const unsigned int qtr_points = num_points / 4;
// const unsigned int isodd = num_points & 3;

__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
float *p_input, *p_taps;
__m64* p_result;
// __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
// float *p_input, *p_taps;
// __m64* p_result;

static const __m128i neg = { 0x000000000000000080000000 };
// static const __m128i neg = { 0x000000000000000080000000 };

p_result = (__m64*)result;
p_input = (float*)input;
p_taps = (float*)taps;
// p_result = (__m64*)result;
// p_input = (float*)input;
// p_taps = (float*)taps;

real0 = _mm_setzero_ps();
real1 = _mm_setzero_ps();
im0 = _mm_setzero_ps();
im1 = _mm_setzero_ps();
// real0 = _mm_setzero_ps();
// real1 = _mm_setzero_ps();
// im0 = _mm_setzero_ps();
// im1 = _mm_setzero_ps();

for (; i < qtr_points; ++i) {
xmm0 = _mm_load_ps(p_input);
xmm1 = _mm_load_ps(p_taps);
// for (; i < qtr_points; ++i) {
// xmm0 = _mm_load_ps(p_input);
// xmm1 = _mm_load_ps(p_taps);

p_input += 4;
p_taps += 4;
// p_input += 4;
// p_taps += 4;

xmm2 = _mm_load_ps(p_input);
xmm3 = _mm_load_ps(p_taps);
// xmm2 = _mm_load_ps(p_input);
// xmm3 = _mm_load_ps(p_taps);

p_input += 4;
p_taps += 4;
// p_input += 4;
// p_taps += 4;

xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
// xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
// xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
// xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
// xmm2 = _mm_unpacklo_ps(xmm1, xmm3);

// imaginary vector from input
xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
// real vector from input
xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
// imaginary vector from taps
xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
// real vector from taps
xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
// // imaginary vector from input
// xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
// // real vector from input
// xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
// // imaginary vector from taps
// xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
// // real vector from taps
// xmm2 = _mm_unpacklo_ps(xmm2, xmm5);

xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
// xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
// xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);

xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
// xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
// xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);

real0 = _mm_add_ps(xmm4, real0);
real1 = _mm_add_ps(xmm5, real1);
im0 = _mm_add_ps(xmm6, im0);
im1 = _mm_add_ps(xmm7, im1);
}
// real0 = _mm_add_ps(xmm4, real0);
// real1 = _mm_add_ps(xmm5, real1);
// im0 = _mm_add_ps(xmm6, im0);
// im1 = _mm_add_ps(xmm7, im1);
// }

real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
// real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);

im0 = _mm_add_ps(im0, im1);
real0 = _mm_add_ps(real0, real1);
// im0 = _mm_add_ps(im0, im1);
// real0 = _mm_add_ps(real0, real1);

im0 = _mm_add_ps(im0, real0);
// im0 = _mm_add_ps(im0, real0);

_mm_storel_pi(p_result, im0);
// _mm_storel_pi(p_result, im0);

for (i = num_points - isodd; i < num_points; i++) {
*result += input[i] * taps[i];
}
}
// for (i = num_points - isodd; i < num_points; i++) {
// *result += input[i] * taps[i];
// }
// }

#endif /*LV_HAVE_SSE4_1*/
// #endif /*LV_HAVE_SSE4_1*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
Expand Down
15 changes: 11 additions & 4 deletions lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,11 @@ macro(OVERRULE_ARCH arch reason)
list(REMOVE_ITEM available_archs ${arch})
endmacro(OVERRULE_ARCH)

# Force-enable an arch regardless of compiler flag detection, logging why.
# Used for archs that a toolchain supports intrinsically without a dedicated
# flag (e.g. SSE/SSE2 are built in for 64-bit MSVC, which has no /arch:SSE).
#   arch   - arch name as listed in gen/archs.xml (e.g. "sse", "sse2")
#   reason - human-readable justification printed to the configure log
macro(FORCE_ARCH arch reason)
message(STATUS "${reason}, Forced arch ${arch}")
# Remove first so a forced arch that auto-detection already added does not
# end up duplicated in available_archs (REMOVE_ITEM is a no-op if absent).
list(REMOVE_ITEM available_archs ${arch})
list(APPEND available_archs ${arch})
endmacro(FORCE_ARCH)

########################################################################
# eliminate AVX on if not on x86, or if the compiler does not accept
# the xgetbv instruction, or {if not cross-compiling and the xgetbv
Expand Down Expand Up @@ -257,13 +262,15 @@ if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86)
endif()

#MSVC 64 bit does not have MMX, overrule it
if (${SIZEOF_CPU} EQUAL 64 AND MSVC)
if (MSVC)
if (${SIZEOF_CPU} EQUAL 64)
OVERRULE_ARCH(mmx "No MMX for Win64")
if (MSVC_VERSION GREATER 1700)
OVERRULE_ARCH(sse "No SSE for Win64 Visual Studio 2013")
endif()
endif()
FORCE_ARCH(sse "Built-in for MSVC > 2013")
FORCE_ARCH(sse2 "Built-in for MSVC > 2013")
endif()


endif()

########################################################################
Expand Down

0 comments on commit d838ba7

Please sign in to comment.