From d6eb61717755254ff3fd598a8590658ad0f88b94 Mon Sep 17 00:00:00 2001
From: Jessica Iwamoto
Date: Mon, 2 Oct 2017 11:58:51 -0700
Subject: [PATCH 1/2] Added AVX2 to fm_detect, still working on adding to
 conv_k7_r2

---
 .../volk/volk_32f_s32f_32f_fm_detect_32f.h | 145 +++++++++
 .../volk/volk_32f_x2_fm_detectpuppet_32f.h |  34 +-
 kernels/volk/volk_8u_conv_k7_r2puppet_8u.h |  75 ++++-
 kernels/volk/volk_8u_x4_conv_k7_r2_8u.h    | 295 ++++++++++++++++++
 lib/kernel_tests.h                         |  33 +-
 5 files changed, 560 insertions(+), 22 deletions(-)

diff --git a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
index 276bfe34d..668e7a432 100644
--- a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
+++ b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
@@ -59,6 +59,74 @@
 #include <inttypes.h>
 #include <stdio.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+  if (num_points < 1) {
+    return;
+  }
+  unsigned int number = 1;
+  unsigned int j = 0;
+  // num_points-1 keeps Fedora 7's gcc from crashing...
+  // num_points won't work. :(
+  const unsigned int eighthPoints = (num_points-1) / 8;
+
+  float* outPtr = outputVector;
+  const float* inPtr = inputVector;
+  __m256 upperBound = _mm256_set1_ps(bound);
+  __m256 lowerBound = _mm256_set1_ps(-bound);
+  __m256 next3old1;
+  __m256 next4;
+  __m256 boundAdjust;
+  __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above.
+  __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below.
+  // Do the first 8 by hand since we're going in from the saveValue:
+  *outPtr = *inPtr - *saveValue;
+  if (*outPtr > bound) *outPtr -= 2*bound;
+  if (*outPtr < -bound) *outPtr += 2*bound;
+  inPtr++;
+  outPtr++;
+  for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr > bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+
+  for (; number < eighthPoints; number++) {
+    // Load data
+    next3old1 = _mm256_loadu_ps((float*) (inPtr-1));
+    next4 = _mm256_load_ps(inPtr);
+    inPtr += 8;
+    // Subtract and store:
+    next3old1 = _mm256_sub_ps(next4, next3old1);
+    // Bound:
+    boundAdjust = _mm256_cmp_ps(next3old1, upperBound, 14);
+    boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
+    next4 = _mm256_cmp_ps(next3old1, lowerBound, 1);
+    next4 = _mm256_and_ps(next4, negBoundAdjust);
+    boundAdjust = _mm256_or_ps(next4, boundAdjust);
+    // Make sure we're in the bounding interval:
+    next3old1 = _mm256_add_ps(next3old1, boundAdjust);
+    _mm256_store_ps(outPtr,next3old1); // Store the results back into the output
+    outPtr += 8;
+  }
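+  // Note on the loop above: _mm256_cmp_ps with predicate 14 (_CMP_GT_OS)
+  // yields an all-ones lane wherever the difference exceeded +bound, and
+  // predicate 1 (_CMP_LT_OS) marks lanes below -bound. ANDing each mask with
+  // -2*bound or +2*bound and ORing the results builds the per-lane
+  // correction in one pass, a branchless form of the scalar wrap:
+  // if (d > bound) d -= 2*bound; if (d < -bound) d += 2*bound;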
+  for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr > bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+
+  *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_AVX */
+
+
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
@@ -159,3 +227,80 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector,
 
 
 #endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
+#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+  if (num_points < 1) {
+    return;
+  }
+  unsigned int number = 1;
+  unsigned int j = 0;
+  // num_points-1 keeps Fedora 7's gcc from crashing...
+  // num_points won't work. :(
+  const unsigned int eighthPoints = (num_points-1) / 8;
+
+  float* outPtr = outputVector;
+  const float* inPtr = inputVector;
+  __m256 upperBound = _mm256_set1_ps(bound);
+  __m256 lowerBound = _mm256_set1_ps(-bound);
+  __m256 next3old1;
+  __m256 next4;
+  __m256 boundAdjust;
+  __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above.
+  __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below.
+  // Do the first 8 by hand since we're going in from the saveValue:
+  *outPtr = *inPtr - *saveValue;
+  if (*outPtr > bound) *outPtr -= 2*bound;
+  if (*outPtr < -bound) *outPtr += 2*bound;
+  inPtr++;
+  outPtr++;
+  for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr > bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+
+  for (; number < eighthPoints; number++) {
+    // Load data
+    next3old1 = _mm256_loadu_ps((float*) (inPtr-1));
+    next4 = _mm256_loadu_ps(inPtr);
+    inPtr += 8;
+    // Subtract and store:
+    next3old1 = _mm256_sub_ps(next4, next3old1);
+    // Bound:
+    boundAdjust = _mm256_cmp_ps(next3old1, upperBound, 14);
+    boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
+    next4 = _mm256_cmp_ps(next3old1, lowerBound, 1);
+    next4 = _mm256_and_ps(next4, negBoundAdjust);
+    boundAdjust = _mm256_or_ps(next4, boundAdjust);
+    // Make sure we're in the bounding interval:
+    next3old1 = _mm256_add_ps(next3old1, boundAdjust);
+    _mm256_storeu_ps(outPtr,next3old1); // Store the results back into the output
+    outPtr += 8;
+  }
+
+  for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr > bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+
+  *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_AVX */
+
+
+#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H */
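For context, a minimal usage sketch of the dispatcher these new kernels sit behind; the buffer size and fill here are illustrative, not from the patch:

#include <volk/volk.h>

int main(void)
{
  unsigned int num_points = 1024;
  float* in = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());
  float* out = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());
  float save = 0.0f; /* carries the last input sample into the next call */
  /* ... fill `in` with phase samples, e.g. the atan2 of an FM baseband ... */
  volk_32f_s32f_32f_fm_detect_32f(out, in, 3.1415926f, &save, num_points);
  volk_free(in);
  volk_free(out);
  return 0;
}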
diff --git a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h
index a1f3e38c9..e1da18557 100644
--- a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h
+++ b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h
@@ -20,11 +20,21 @@
  * Boston, MA 02110-1301, USA.
  */
 
-#ifndef INCLUDED_volk_32f_x2_fm_detectpuppet_32f_H
-#define INCLUDED_volk_32f_x2_fm_detectpuppet_32f_H
+#ifndef INCLUDED_volk_32f_x2_fm_detectpuppet_32f_a_H
+#define INCLUDED_volk_32f_x2_fm_detectpuppet_32f_a_H
 
 #include "volk_32f_s32f_32f_fm_detect_32f.h"
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
+{
+  const float bound = 1.0f;
+
+  volk_32f_s32f_32f_fm_detect_32f_a_avx(outputVector, inputVector, bound, saveValue, num_points);
+}
+#endif /* LV_HAVE_AVX */
 
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
@@ -48,4 +58,22 @@ static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector,
 
 #endif /* LV_HAVE_GENERIC */
 
-#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_H */
+#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H
+#define INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H
+
+#include "volk_32f_s32f_32f_fm_detect_32f.h"
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
+{
+  const float bound = 1.0f;
+
+  volk_32f_s32f_32f_fm_detect_32f_u_avx(outputVector, inputVector, bound, saveValue, num_points);
+}
+#endif /* LV_HAVE_AVX */
+#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H */
diff --git a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
index b398556ca..a84d7e700 100644
--- a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
+++ b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
@@ -109,10 +109,6 @@ static inline int chainback_viterbi(unsigned char* data,
 #include <pmmintrin.h>
 #include <emmintrin.h>
 
-
-
-
-
 static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
 
 
@@ -181,6 +177,77 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsig
 
 #endif /*LV_HAVE_SSE3*/
 
+#if LV_HAVE_AVX2
+
+#include <immintrin.h>
+#include <stdio.h>
+
+static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
+
+
+  static int once = 1;
+  int d_numstates = (1 << 6);
+  int rate = 2;
+  static unsigned char* D;
+  static unsigned char* Y;
+  static unsigned char* X;
+  static unsigned int excess = 6;
+  static unsigned char* Branchtab;
+  static unsigned char Partab[256];
+
+  int d_polys[2] = {79, 109};
+
+
+  if(once) {
+
+    X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment());
+    Y = X + d_numstates;
+    Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment());
+    D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment());
+    int state, i;
+    int cnt,ti;
+
+    /* Initialize parity lookup table */
+    for(i=0;i<256;i++){
+      cnt = 0;
+      ti = i;
+      while(ti){
+        if(ti & 1)
+          cnt++;
+        ti >>= 1;
+      }
+      Partab[i] = cnt & 1;
+    }
+    /* Initialize the branch table */
+    for(state=0;state < d_numstates/2;state++){
+      for(i=0; i<rate; i++){
+        Branchtab[i*d_numstates/2+state] = Partab[(2*state) & d_polys[i]] ? 255 : 0;
+      }
+    }
+
+    once = 0;
+  }
+
+  //unbias the old_metrics
+  memset(X, 31, d_numstates);
+
+  volk_8u_x4_conv_k7_r2_8u_avx2(Y, X, syms, D, framebits/2 - excess, excess, Branchtab);
+
+  unsigned int min = X[0];
+  int i = 0, state = 0;
+  for(i = 0; i < d_numstates; ++i) {
+    if(X[i] < min) {
+      min = X[i];
+      state = i;
+    }
+  }
+
+  chainback_viterbi(dec, framebits/2 - excess, state, excess, D);
+
+  return;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
 
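For context (not from the patch): the decoder above expects symbols from a K=7, rate-1/2 convolutional encoder with the polynomials {79, 109} it loads into d_polys. A hedged reference sketch, with hypothetical helper names:

/* Each input bit shifts into a 7-bit register; each output symbol is the
 * parity of (register & polynomial), emitted as a hard 0/255 soft byte. */
static unsigned char parity7(unsigned int x)
{
  x ^= x >> 4;
  x ^= x >> 2;
  x ^= x >> 1;
  return x & 1;
}

static void encode_k7_r2(const unsigned char* bits, unsigned int nbits, unsigned char* syms)
{
  unsigned int sr = 0; /* 7-bit shift register */
  unsigned int i;
  for (i = 0; i < nbits; i++) {
    sr = ((sr << 1) | (bits[i] & 1)) & 0x7f;
    *syms++ = parity7(sr & 79) ? 255 : 0;
    *syms++ = parity7(sr & 109) ? 255 : 0;
  }
}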
diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
--- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
+++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
@@ ... @@
+#if LV_HAVE_AVX2
+
+#include <immintrin.h>
+#include <stdio.h>
+
+static inline void
+volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X,
+                              unsigned char* syms, unsigned char* dec,
+                              unsigned int framebits, unsigned int excess,
+                              unsigned char* Branchtab)
+{
+  unsigned int i9;
+  for(i9 = 0; i9 < ((framebits + excess) >> 2); i9++) { // >>1
+    unsigned char a75, a81;
+    int a73, a92;
+    short int s20, s21, s26, s27;
+    unsigned char *a74, *a80, *b6;
+    short int *a110, *a111, *a91, *a93, *a94;
+    __m256i *a102, *a112, *a113, *a71, *a72, *a77, *a83
+      , *a95, *a96, *a97, *a98, *a99;
+    __m256i a105, a106, a86, a87;
+    __m256i a100, a101, a103, a104, a107, a108, a109
+      , a76, a78, a79, a82, a84, a85, a88, a89
+      , a90, d10, d11, d12, d9, m23, m24, m25
+      , m26, m27, m28, m29, m30, s18, s19, s22
+      , s23, s24, s25, s28, s29, t13, t14, t15
+      , t16, t17, t18;
+    a71 = ((__m256i *) X);
+    s18 = *(a71);
+    a72 = (a71 + 2); //?
+    s19 = *(a72);
+    a73 = (8 * i9); //4
+    a74 = (syms + a73);
+    a75 = *(a74);
+    a76 = _mm256_set1_epi8(a75);
+    a77 = ((__m256i *) Branchtab);
+    a78 = *(a77);
+    a79 = _mm256_xor_si256(a76, a78);
+    b6 = (a73 + syms);
+    a80 = (b6 + 1);
+    a81 = *(a80);
+    a82 = _mm256_set1_epi8(a81);
+    a83 = (a77 + 2); //?
+    a84 = *(a83);
+    a85 = _mm256_xor_si256(a82, a84);
+    t13 = _mm256_avg_epu8(a79,a85);
+    a86 = ((__m256i ) t13);
+    a87 = _mm256_srli_epi16(a86, 2);
+    a88 = ((__m256i ) a87);
+    t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
+    t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
+    m23 = _mm256_adds_epu8(s18, t14);
+    m24 = _mm256_adds_epu8(s19, t15);
+    m25 = _mm256_adds_epu8(s18, t15);
+    m26 = _mm256_adds_epu8(s19, t14);
+    a89 = _mm256_min_epu8(m24, m23);
+    d9 = _mm256_cmpeq_epi8(a89, m24);
+    a90 = _mm256_min_epu8(m26, m25);
+    d10 = _mm256_cmpeq_epi8(a90, m26);
+    s22 = _mm256_unpacklo_epi8(d9,d10); //could go wrong here; also, addresses
+    s23 = _mm256_unpackhi_epi8(d9,d10);
+    s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
+    a91 = ((short int *) dec);
+    a92 = (8 * i9); //8
+    a93 = (a91 + a92);
+    *(a93) = s20;
+    s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
+    a94 = (a93 + 1);
+    *(a94) = s21;
+    s22 = _mm256_unpacklo_epi8(a89, a90);
+    s23 = _mm256_unpackhi_epi8(a89, a90);
+    a95 = ((__m256i *) Y);
+    s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
+    *(a95) = s24;
+    s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
+    a96 = (a95 + 1);
+    *(a96) = s23;
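+    // Branch metrics above: XORing the broadcast soft symbol with Branchtab
+    // gives per-state distances, _mm256_avg_epu8 averages the two rate-1/2
+    // distances, and the shift plus the &63 mask rescales the result to a
+    // 6-bit metric; the 63-complement is the metric of the opposite branch.
+    // The adds/min/cmpeq sequence is the add-compare-select step, and each
+    // _mm256_movemask_epi8 packs 32 survivor decisions into one word. Note
+    // that movemask returns 32 bits, so storing through a short int* keeps
+    // only 16 of them; the second patch below switches dec to an int*.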
+    a97 = (a71 + 1);
+    s24 = *(a97);
+    a98 = (a71 + 3);
+    s25 = *(a98);
+    a99 = (a77 + 1);
+    a100 = *(a99);
+    a101 = _mm256_xor_si256(a76, a100);
+    a102 = (a77 + 3);
+    a103 = *(a102);
+    a104 = _mm256_xor_si256(a82, a103);
+    t16 = _mm256_avg_epu8(a101,a104);
+    a105 = ((__m256i ) t16);
+    a106 = _mm256_srli_epi16(a105, 2);
+    a107 = ((__m256i ) a106);
+    t17 = _mm256_and_si256(a107, _mm256_set1_epi8(63));
+    t18 = _mm256_subs_epu8(_mm256_set1_epi8(63), t17);
+    m27 = _mm256_adds_epu8(s24, t17);
+    m28 = _mm256_adds_epu8(s25, t18);
+    m29 = _mm256_adds_epu8(s24, t18);
+    m30 = _mm256_adds_epu8(s25, t17);
+    a108 = _mm256_min_epu8(m28, m27);
+    d11 = _mm256_cmpeq_epi8(a108, m28);
+    a109 = _mm256_min_epu8(m30, m29);
+    d12 = _mm256_cmpeq_epi8(a109, m30);
+    s24 = _mm256_unpacklo_epi8(d11,d12);
+    s25 = _mm256_unpackhi_epi8(d11,d12);
+    s26 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
+    a110 = (a93 + 2);
+    *(a110) = s26;
+    s27 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
+    a111 = (a93 + 3);
+    *(a111) = s27;
+    s28 = _mm256_unpacklo_epi8(a108, a109);
+    s29 = _mm256_unpackhi_epi8(a108, a109);
+    s25 = _mm256_permute2x128_si256(s28, s29, 0x20);
+    a112 = (a95 + 2);
+    *(a112) = s25;
+    s29 = _mm256_permute2x128_si256(s28, s29, 0x31);
+    a113 = (a95 + 3);
+/*  *(a113) = s29;
+    if ((((unsigned char *) Y)[0]>210)) {
+      __m256i m5, m6;
+      m5 = ((__m256i *) Y)[0];
+      m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[1]);
+      m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[2]);
+      m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[3]);
+      __m256i m7;
+      m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
+      m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 32)), ((__m256i ) m7)));
+      m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 16)), ((__m256i ) m7)));
+      m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 8)), ((__m256i ) m7)));
+      m7 = _mm256_unpacklo_epi8(m7, m7);
+      m7 = _mm256_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
+      m6 = _mm256_unpacklo_epi64(m7, m7);
+      m6 = _mm256_permute2x128_si256(m6, m6, 0); //copy lower half of m6 to upper half, since above ops operate on 128 bit lanes
+      ((__m256i *) Y)[0] = _mm256_subs_epu8(((__m256i *) Y)[0], m6);
+      ((__m256i *) Y)[1] = _mm256_subs_epu8(((__m256i *) Y)[1], m6);
+      ((__m256i *) Y)[2] = _mm256_subs_epu8(((__m256i *) Y)[2], m6);
+      ((__m256i *) Y)[3] = _mm256_subs_epu8(((__m256i *) Y)[3], m6);
+    }
+    unsigned char a188, a194;
+    int a186, a205;
+    short int s48, s49, s54, s55;
+    unsigned char *a187, *a193, *b15;
+    short int *a204, *a206, *a207, *a223, *a224, *b16;
+    __m256i *a184, *a185, *a190, *a196, *a208, *a209, *a210
+      , *a211, *a212, *a215, *a225, *a226;
+    __m256i a199, a200, a218, a219;
+    __m256i a189, a191, a192, a195, a197, a198, a201
+      , a202, a203, a213, a214, a216, a217, a220, a221
+      , a222, d17, d18, d19, d20, m39, m40, m41
+      , m42, m43, m44, m45, m46, s46, s47, s50
+      , s51, s52, s53, s56, s57, t25, t26, t27
+      , t28, t29, t30;
+    a184 = ((__m256i *) Y);
+    s46 = *(a184);
+    a185 = (a184 + 2);
+    s47 = *(a185);
+    a186 = (4 * i9); //4
+    b15 = (a186 + syms);
+    a187 = (b15 + 4);
+    a188 = *(a187);
+    a189 = _mm256_set1_epi8(a188);
+    a190 = ((__m256i *) Branchtab);
+    a191 = *(a190);
+    a192 = _mm256_xor_si256(a189, a191);
+    a193 = (b15 + 3);
+    a194 = *(a193);
+    a195 = _mm256_set1_epi8(a194);
+    a196 = (a190 + 2);
+    a197 = *(a196);
+    a198 = _mm256_xor_si256(a195, a197);
+    t25 = _mm256_avg_epu8(a192,a198);
+    a199 = ((__m256i ) t25);
+    a200 = _mm256_srli_epi16(a199, 2);
+    a201 = ((__m256i ) a200);
+    t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
+    t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
+    m39 = _mm256_adds_epu8(s46, t26);
+    m40 = _mm256_adds_epu8(s47, t27);
+    m41 = _mm256_adds_epu8(s46, t27);
+    m42 = _mm256_adds_epu8(s47, t26);
+    a202 = _mm256_min_epu8(m40, m39);
+    d17 = _mm256_cmpeq_epi8(a202, m40);
+    a203 = _mm256_min_epu8(m42, m41);
+    d18 = _mm256_cmpeq_epi8(a203, m42);
+    s24 = _mm256_unpacklo_epi8(d17,d18);
+    s25 = _mm256_unpackhi_epi8(d17,d18);
+    s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
+    a204 = ((short int *) dec);
+    a205 = (8 * i9); //8
+    b16 = (a204 + a205);
+    a206 = (b16 + 4);
+    *(a206) = s48;
+    s49 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
+    a207 = (b16 + 5);
+    *(a207) = s49;
+    s50 = _mm256_unpacklo_epi8(a202, a203);
+    s51 = _mm256_unpackhi_epi8(a202, a203);
+    s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
+    s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
+    a208 = ((__m256i *) X);
+    *(a208) = s25;
+    a209 = (a208 + 1);
+    *(a209) = s51;
+    a210 = (a184 + 1);
+    s52 = *(a210);
+    a211 = (a184 + 3);
+    s53 = *(a211);
+    a212 = (a190 + 1);
+    a213 = *(a212);
+    a214 = _mm256_xor_si256(a189, a213);
+    a215 = (a190 + 3);
+    a216 = *(a215);
+    a217 = _mm256_xor_si256(a195, a216);
+    t28 = _mm256_avg_epu8(a214,a217);
+    a218 = ((__m256i ) t28);
+    a219 = _mm256_srli_epi16(a218, 2);
+    a220 = ((__m256i ) a219);
+    t29 = _mm256_and_si256(a220, _mm256_set1_epi8(63));
+    t30 = _mm256_subs_epu8(_mm256_set1_epi8(63), t29);
+    m43 = _mm256_adds_epu8(s52, t29);
+    m44 = _mm256_adds_epu8(s53, t30);
+    m45 = _mm256_adds_epu8(s52, t30);
+    m46 = _mm256_adds_epu8(s53, t29);
+    a221 = _mm256_min_epu8(m44, m43);
+    d19 = _mm256_cmpeq_epi8(a221, m44);
+    a222 = _mm256_min_epu8(m46, m45);
+    d20 = _mm256_cmpeq_epi8(a222, m46);
+    s24 = _mm256_unpacklo_epi8(d19,d20);
+    s25 = _mm256_unpackhi_epi8(d19,d20);
+    s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
+    a223 = (b16 + 6);
+    *(a223) = s54;
+    s55 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
+    a224 = (b16 + 7);
+    *(a224) = s55;
+    s56 = _mm256_unpacklo_epi8(a221, a222);
+    s57 = _mm256_unpackhi_epi8(a221, a222);
+    s25 = _mm256_permute2x128_si256(s56, s57, 0x20);
+    s57 = _mm256_permute2x128_si256(s56, s57, 0x32);
+    a225 = (a208 + 2);
+    *(a225) = s25;
+    a226 = (a208 + 3);
+    *(a226) = s57;
+    if ((((unsigned char *) X)[0]>210)) {
+      __m256i m12, m13;
+      m12 = ((__m256i *) X)[0];
+      m12 = _mm256_min_epu8(m12, ((__m256i *) X)[1]);
+      m12 = _mm256_min_epu8(m12, ((__m256i *) X)[2]);
+      m12 = _mm256_min_epu8(m12, ((__m256i *) X)[3]);
+      __m256i m14;
+      m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
+      m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 32)), ((__m256i ) m14)));
+      m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 16)), ((__m256i ) m14)));
+      m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 8)), ((__m256i ) m14)));
+      m14 = _mm256_unpacklo_epi8(m14, m14);
+      m14 = _mm256_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
+      m13 = _mm256_unpacklo_epi64(m14, m14);
+      m13 = _mm256_permute2x128_si256(m13, m13, 0);
+      ((__m256i *) X)[0] = _mm256_subs_epu8(((__m256i *) X)[0], m13);
+      ((__m256i *) X)[1] = _mm256_subs_epu8(((__m256i *) X)[1], m13);
+      ((__m256i *) X)[2] = _mm256_subs_epu8(((__m256i *) X)[2], m13);
+      ((__m256i *) X)[3] = _mm256_subs_epu8(((__m256i *) X)[3], m13);
+    } */
+  }
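+  // Everything from "*(a113) = s29;" onward is the second trellis step of
+  // each pass plus the in-loop renormalization, left commented out while the
+  // AVX2 port is still in progress (see the commit message); only the X
+  // renormalization below runs for now.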
+
+  renormalize(X, 210);
+
+  /*int ch;
+  for(ch = 0; ch < 64; ch++) {
+    printf("%d,", X[ch]);
+  }
+  printf("\n");*/
+
+  unsigned int j;
+  for(j=0; j < (framebits + excess) % 2; ++j) {
+    int i;
+    for(i=0;i<64/2;i++){
+      BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab);
+    }
+
+
+    renormalize(Y, 210);
+
+    /*printf("\n");
+    for(ch = 0; ch < 64; ch++) {
+      printf("%d,", Y[ch]);
+    }
+    printf("\n");*/
+
+  }
+  /*skip*/
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+
 #if LV_HAVE_SSE3
 #include <pmmintrin.h>
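For reference, the renormalization the vector code performs is equivalent to this scalar sketch (helper name hypothetical):

static void renormalize_scalar(unsigned char* metrics, unsigned char threshold)
{
  /* When the reference metric drifts past the threshold, subtract the
   * minimum from all 64 states; survivor decisions are unaffected because
   * only metric differences matter. This keeps epu8 metrics from saturating. */
  if (metrics[0] > threshold) {
    unsigned char min = metrics[0];
    int i;
    for (i = 1; i < 64; i++)
      if (metrics[i] < min)
        min = metrics[i];
    for (i = 0; i < 64; i++)
      metrics[i] -= min;
  }
}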
diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h
index dfb70d4ae..96f27a881 100644
--- a/lib/kernel_tests.h
+++ b/lib/kernel_tests.h
@@ -30,6 +30,16 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
         test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
 
     std::vector<volk_test_case_t> test_cases = boost::assign::list_of
+        (VOLK_INIT_PUPP(volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, volk_test_params_t(0, test_params.scalar(), test_params.vlen(), test_params.iter()/10, test_params.benchmark_mode(), test_params.kernel_regex())))
+        (VOLK_INIT_PUPP(volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params))
+        (VOLK_INIT_TEST(volk_32f_expfast_32f, volk_test_params_t(1e-1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+        (VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc))
+        (VOLK_INIT_TEST(volk_32f_log2_32f, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+        (VOLK_INIT_TEST(volk_32f_x2_pow_32f, volk_test_params_t(1e-2, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+        (VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params))
+        (VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params))
+        (VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params))
+        (VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc))
         (VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, volk_test_params_t(1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
         (VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params))
         (VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc))
@@ -115,27 +125,20 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
         (VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params))
         (VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params))
         (VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc, volk_32fc_s32fc_x2_rotator_32fc, test_params))
-        (VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params))
         (VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params))
-        (VOLK_INIT_PUPP(volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, volk_test_params_t(0, test_params.scalar(), test_params.vlen(), test_params.iter()/10, test_params.benchmark_mode(), test_params.kernel_regex())))
-        (VOLK_INIT_PUPP(volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params))
-        (VOLK_INIT_TEST(volk_32f_index_max_16u, test_params))
-        (VOLK_INIT_TEST(volk_32f_index_max_32u, test_params))
-        (VOLK_INIT_TEST(volk_32f_log2_32f, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
-        (VOLK_INIT_TEST(volk_32f_expfast_32f, volk_test_params_t(1e-1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
-        (VOLK_INIT_TEST(volk_32f_x2_pow_32f, volk_test_params_t(1e-2, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+
+        // These use the simdmath library, which isn't supported on my machine, so AVX2 can't be added and tested
         (VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params))
-        (VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc))
+        (VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
+        (VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params))
         (VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params))
+
+        // These still need AVX2 intrinsics
+        (VOLK_INIT_TEST(volk_32f_index_max_16u, test_params))
+        (VOLK_INIT_TEST(volk_32f_index_max_32u, test_params))
         (VOLK_INIT_TEST(volk_32fc_index_max_16u, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
         (VOLK_INIT_TEST(volk_32fc_index_max_32u, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
-        (VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params))
-        (VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params))
-        (VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params))
-        (VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params))
-        (VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
-        (VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc))
         (VOLK_INIT_PUPP(volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params))
         (VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, volk_32f_8u_polarbutterfly_32f, test_params))
         // no one uses these, so don't test them
From bd8580b5857f59d4d356d369f9aad12a59877456 Mon Sep 17 00:00:00 2001
From: Jessica Iwamoto
Date: Mon, 15 Jan 2018 17:19:50 -0800
Subject: [PATCH 2/2] Added AVX2 to volk 8u x4 conv k7 r2 kernel

---
 kernels/volk/volk_8u_x4_conv_k7_r2_8u.h | 205 ++++++------------------
 1 file changed, 51 insertions(+), 154 deletions(-)

diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
index 69ca0cad6..6321269d8 100644
--- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
+++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
@@ -134,37 +134,36 @@ volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X,
                               unsigned char* Branchtab)
 {
   unsigned int i9;
-  for(i9 = 0; i9 < ((framebits + excess) >> 2); i9++) { // >>1
+  for(i9 = 0; i9 < ((framebits + excess)>>1); i9++) {
     unsigned char a75, a81;
     int a73, a92;
-    short int s20, s21, s26, s27;
-    unsigned char *a74, *a80, *b6;
-    short int *a110, *a111, *a91, *a93, *a94;
-    __m256i *a102, *a112, *a113, *a71, *a72, *a77, *a83
-      , *a95, *a96, *a97, *a98, *a99;
-    __m256i a105, a106, a86, a87;
-    __m256i a100, a101, a103, a104, a107, a108, a109
-      , a76, a78, a79, a82, a84, a85, a88, a89
-      , a90, d10, d11, d12, d9, m23, m24, m25
-      , m26, m27, m28, m29, m30, s18, s19, s22
-      , s23, s24, s25, s28, s29, t13, t14, t15
-      , t16, t17, t18;
+    int s20, s21;
+    unsigned char *a80, *b6;
+    int *a110, *a91, *a93;
+    __m256i *a112, *a71, *a72, *a77, *a83, *a95;
+    __m256i a86, a87;
+    __m256i a76, a78, a79, a82, a84, a85, a88, a89
+      , a90, d10, d9, m23, m24, m25
+      , m26, s18, s19, s22
+      , s23, s24, s25, t13, t14, t15;
     a71 = ((__m256i *) X);
     s18 = *(a71);
-    a72 = (a71 + 2); //?
+    a72 = (a71 + 1);
     s19 = *(a72);
+    s22 = _mm256_permute2x128_si256(s18,s19,0x20);
+    s19 = _mm256_permute2x128_si256(s18,s19,0x31);
+    s18 = s22;
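+    // The permute2x128 pairs with selectors 0x20/0x31 splice the low and the
+    // high 128-bit halves of two registers. They are needed because the
+    // 256-bit unpack intrinsics interleave within each 128-bit lane rather
+    // than across the whole register, so state ordering must be repaired
+    // explicitly.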
-    a73 = (8 * i9); //4
-    a74 = (syms + a73);
-    a75 = *(a74);
+    a73 = (4 * i9);
+    b6 = (syms + a73);
+    a75 = *(b6);
     a76 = _mm256_set1_epi8(a75);
     a77 = ((__m256i *) Branchtab);
     a78 = *(a77);
     a79 = _mm256_xor_si256(a76, a78);
-    b6 = (a73 + syms);
     a80 = (b6 + 1);
     a81 = *(a80);
     a82 = _mm256_set1_epi8(a81);
-    a83 = (a77 + 2); //?
+    a83 = (a77 + 1);
     a84 = *(a83);
     a85 = _mm256_xor_si256(a82, a84);
     t13 = _mm256_avg_epu8(a79,a85);
@@ -181,114 +180,68 @@ volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X,
     d9 = _mm256_cmpeq_epi8(a89, m24);
     a90 = _mm256_min_epu8(m26, m25);
     d10 = _mm256_cmpeq_epi8(a90, m26);
-    s22 = _mm256_unpacklo_epi8(d9,d10); //could go wrong here; also, addresses
+    s22 = _mm256_unpacklo_epi8(d9,d10);
     s23 = _mm256_unpackhi_epi8(d9,d10);
     s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
-    a91 = ((short int *) dec);
-    a92 = (8 * i9); //8
+    a91 = ((int *) dec);
+    a92 = (4 * i9);
     a93 = (a91 + a92);
     *(a93) = s20;
     s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
-    a94 = (a93 + 1);
-    *(a94) = s21;
+    a110 = (a93 + 1);
+    *(a110) = s21;
     s22 = _mm256_unpacklo_epi8(a89, a90);
     s23 = _mm256_unpackhi_epi8(a89, a90);
     a95 = ((__m256i *) Y);
     s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
     *(a95) = s24;
     s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
-    a96 = (a95 + 1);
-    *(a96) = s23;
-    a97 = (a71 + 1);
-    s24 = *(a97);
-    a98 = (a71 + 3);
-    s25 = *(a98);
-    a99 = (a77 + 1);
-    a100 = *(a99);
-    a101 = _mm256_xor_si256(a76, a100);
-    a102 = (a77 + 3);
-    a103 = *(a102);
-    a104 = _mm256_xor_si256(a82, a103);
-    t16 = _mm256_avg_epu8(a101,a104);
-    a105 = ((__m256i ) t16);
-    a106 = _mm256_srli_epi16(a105, 2);
-    a107 = ((__m256i ) a106);
-    t17 = _mm256_and_si256(a107, _mm256_set1_epi8(63));
-    t18 = _mm256_subs_epu8(_mm256_set1_epi8(63), t17);
-    m27 = _mm256_adds_epu8(s24, t17);
-    m28 = _mm256_adds_epu8(s25, t18);
-    m29 = _mm256_adds_epu8(s24, t18);
-    m30 = _mm256_adds_epu8(s25, t17);
-    a108 = _mm256_min_epu8(m28, m27);
-    d11 = _mm256_cmpeq_epi8(a108, m28);
-    a109 = _mm256_min_epu8(m30, m29);
-    d12 = _mm256_cmpeq_epi8(a109, m30);
-    s24 = _mm256_unpacklo_epi8(d11,d12);
-    s25 = _mm256_unpackhi_epi8(d11,d12);
-    s26 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
-    a110 = (a93 + 2);
-    *(a110) = s26;
-    s27 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
-    a111 = (a93 + 3);
-    *(a111) = s27;
-    s28 = _mm256_unpacklo_epi8(a108, a109);
-    s29 = _mm256_unpackhi_epi8(a108, a109);
-    s25 = _mm256_permute2x128_si256(s28, s29, 0x20);
-    a112 = (a95 + 2);
-    *(a112) = s25;
-    s29 = _mm256_permute2x128_si256(s28, s29, 0x31);
-    a113 = (a95 + 3);
-/*  *(a113) = s29;
+    a112 = (a95 + 1);
+    *(a112) = s23;
     if ((((unsigned char *) Y)[0]>210)) {
       __m256i m5, m6;
       m5 = ((__m256i *) Y)[0];
       m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[1]);
-      m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[2]);
-      m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[3]);
       __m256i m7;
       m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
       m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 32)), ((__m256i ) m7)));
       m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 16)), ((__m256i ) m7)));
       m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 8)), ((__m256i ) m7)));
       m7 = _mm256_unpacklo_epi8(m7, m7);
-      m7 = _mm256_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
+      m7 = _mm256_shufflelo_epi16(m7, 0);
       m6 = _mm256_unpacklo_epi64(m7, m7);
       m6 = _mm256_permute2x128_si256(m6, m6, 0); //copy lower half of m6 to upper half, since above ops operate on 128 bit lanes
       ((__m256i *) Y)[0] = _mm256_subs_epu8(((__m256i *) Y)[0], m6);
       ((__m256i *) Y)[1] = _mm256_subs_epu8(((__m256i *) Y)[1], m6);
-      ((__m256i *) Y)[2] = _mm256_subs_epu8(((__m256i *) Y)[2], m6);
-      ((__m256i *) Y)[3] = _mm256_subs_epu8(((__m256i *) Y)[3], m6);
     }
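+    // Horizontal byte-minimum above: the srli/min ladder folds 8-, 4-, 2-,
+    // and 1-byte distances, the unpack/shufflelo broadcast the minimum
+    // within a lane, and permute2x128 with selector 0 copies it across
+    // lanes before it is subtracted from every state metric.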
     unsigned char a188, a194;
     int a186, a205;
-    short int s48, s49, s54, s55;
+    int s48, s54;
     unsigned char *a187, *a193, *b15;
-    short int *a204, *a206, *a207, *a223, *a224, *b16;
-    __m256i *a184, *a185, *a190, *a196, *a208, *a209, *a210
-      , *a211, *a212, *a215, *a225, *a226;
-    __m256i a199, a200, a218, a219;
+    int *a204, *a206, *a223, *b16;
+    __m256i *a184, *a185, *a190, *a196, *a208, *a225;
+    __m256i a199, a200;
     __m256i a189, a191, a192, a195, a197, a198, a201
-      , a202, a203, a213, a214, a216, a217, a220, a221
-      , a222, d17, d18, d19, d20, m39, m40, m41
-      , m42, m43, m44, m45, m46, s46, s47, s50
-      , s51, s52, s53, s56, s57, t25, t26, t27
-      , t28, t29, t30;
+      , a202, a203, d17, d18, m39, m40, m41
+      , m42, s46, s47, s50
+      , s51, t25, t26, t27;
     a184 = ((__m256i *) Y);
     s46 = *(a184);
-    a185 = (a184 + 2);
+    a185 = (a184 + 1);
     s47 = *(a185);
-    a186 = (4 * i9); //4
-    b15 = (a186 + syms);
-    a187 = (b15 + 4);
+    s50 = _mm256_permute2x128_si256(s46,s47,0x20);
+    s47 = _mm256_permute2x128_si256(s46,s47,0x31);
+    s46 = s50;
+    a187 = (b6 + 2);
     a188 = *(a187);
     a189 = _mm256_set1_epi8(a188);
     a190 = ((__m256i *) Branchtab);
     a191 = *(a190);
     a192 = _mm256_xor_si256(a189, a191);
-    a193 = (b15 + 3);
+    a193 = (b6 + 3);
     a194 = *(a193);
     a195 = _mm256_set1_epi8(a194);
-    a196 = (a190 + 2);
+    a196 = (a190 + 1);
     a197 = *(a196);
     a198 = _mm256_xor_si256(a195, a197);
     t25 = _mm256_avg_epu8(a192,a198);
@@ -308,92 +261,43 @@ volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X,
     s24 = _mm256_unpacklo_epi8(d17,d18);
     s25 = _mm256_unpackhi_epi8(d17,d18);
     s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
-    a204 = ((short int *) dec);
-    a205 = (8 * i9); //8
+    a204 = ((int *) dec);
+    a205 = (4 * i9);
     b16 = (a204 + a205);
-    a206 = (b16 + 4);
+    a206 = (b16 + 2);
     *(a206) = s48;
-    s49 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
-    a207 = (b16 + 5);
-    *(a207) = s49;
+    s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
+    a223 = (b16 + 3);
+    *(a223) = s54;
     s50 = _mm256_unpacklo_epi8(a202, a203);
     s51 = _mm256_unpackhi_epi8(a202, a203);
     s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
     s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
     a208 = ((__m256i *) X);
     *(a208) = s25;
-    a209 = (a208 + 1);
-    *(a209) = s51;
-    a210 = (a184 + 1);
-    s52 = *(a210);
-    a211 = (a184 + 3);
-    s53 = *(a211);
-    a212 = (a190 + 1);
-    a213 = *(a212);
-    a214 = _mm256_xor_si256(a189, a213);
-    a215 = (a190 + 3);
-    a216 = *(a215);
-    a217 = _mm256_xor_si256(a195, a216);
-    t28 = _mm256_avg_epu8(a214,a217);
-    a218 = ((__m256i ) t28);
-    a219 = _mm256_srli_epi16(a218, 2);
-    a220 = ((__m256i ) a219);
-    t29 = _mm256_and_si256(a220, _mm256_set1_epi8(63));
-    t30 = _mm256_subs_epu8(_mm256_set1_epi8(63), t29);
-    m43 = _mm256_adds_epu8(s52, t29);
-    m44 = _mm256_adds_epu8(s53, t30);
-    m45 = _mm256_adds_epu8(s52, t30);
-    m46 = _mm256_adds_epu8(s53, t29);
-    a221 = _mm256_min_epu8(m44, m43);
-    d19 = _mm256_cmpeq_epi8(a221, m44);
-    a222 = _mm256_min_epu8(m46, m45);
-    d20 = _mm256_cmpeq_epi8(a222, m46);
-    s24 = _mm256_unpacklo_epi8(d19,d20);
-    s25 = _mm256_unpackhi_epi8(d19,d20);
-    s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
-    a223 = (b16 + 6);
-    *(a223) = s54;
-    s55 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
-    a224 = (b16 + 7);
-    *(a224) = s55;
-    s56 = _mm256_unpacklo_epi8(a221, a222);
-    s57 = _mm256_unpackhi_epi8(a221, a222);
-    s25 = _mm256_permute2x128_si256(s56, s57, 0x20);
-    s57 = _mm256_permute2x128_si256(s56, s57, 0x32);
-    a225 = (a208 + 2);
-    *(a225) = s25;
-    a226 = (a208 + 3);
-    *(a226) = s57;
+    a225 = (a208 + 1);
+    *(a225) = s51;
+
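+    // The second trellis step of the pass reads its old metrics from Y and
+    // writes the survivors back into X, so X and Y ping-pong as the old/new
+    // metric buffers across the two steps handled per loop iteration.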
     if ((((unsigned char *) X)[0]>210)) {
       __m256i m12, m13;
       m12 = ((__m256i *) X)[0];
       m12 = _mm256_min_epu8(m12, ((__m256i *) X)[1]);
-      m12 = _mm256_min_epu8(m12, ((__m256i *) X)[2]);
-      m12 = _mm256_min_epu8(m12, ((__m256i *) X)[3]);
       __m256i m14;
       m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
       m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 32)), ((__m256i ) m14)));
       m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 16)), ((__m256i ) m14)));
       m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 8)), ((__m256i ) m14)));
       m14 = _mm256_unpacklo_epi8(m14, m14);
-      m14 = _mm256_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
+      m14 = _mm256_shufflelo_epi16(m14, 0);
       m13 = _mm256_unpacklo_epi64(m14, m14);
       m13 = _mm256_permute2x128_si256(m13, m13, 0);
       ((__m256i *) X)[0] = _mm256_subs_epu8(((__m256i *) X)[0], m13);
       ((__m256i *) X)[1] = _mm256_subs_epu8(((__m256i *) X)[1], m13);
-      ((__m256i *) X)[2] = _mm256_subs_epu8(((__m256i *) X)[2], m13);
-      ((__m256i *) X)[3] = _mm256_subs_epu8(((__m256i *) X)[3], m13);
-    } */
+    }
   }
 
   renormalize(X, 210);
 
-  /*int ch;
-  for(ch = 0; ch < 64; ch++) {
-    printf("%d,", X[ch]);
-  }
-  printf("\n");*/
-
   unsigned int j;
   for(j=0; j < (framebits + excess) % 2; ++j) {
     int i;
@@ -401,15 +305,8 @@ volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X,
       BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab);
     }
 
-
     renormalize(Y, 210);
 
-    /*printf("\n");
-    for(ch = 0; ch < 64; ch++) {
-      printf("%d,", Y[ch]);
-    }
-    printf("\n");*/
-
   }
   /*skip*/
 }
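For completeness, a minimal usage sketch of the decoder path these patches touch; the dispatcher name comes from the puppet header above, while the frame length and buffer sizes here are illustrative assumptions:

#include <volk/volk.h>

int main(void)
{
  unsigned int framebits = 2048; /* assumed frame size */
  /* one soft byte per channel symbol; sizes are illustrative */
  unsigned char* syms = (unsigned char*)volk_malloc(framebits, volk_get_alignment());
  unsigned char* dec = (unsigned char*)volk_malloc(framebits, volk_get_alignment());
  /* ... fill syms with soft decisions (0 = strong 0, 255 = strong 1) ... */
  volk_8u_conv_k7_r2puppet_8u(syms, dec, framebits); /* picks AVX2 when available */
  volk_free(syms);
  volk_free(dec);
  return 0;
}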