Merge pull request gnuradio#29 in GNSSSDR/volk from feature/GNSSSDR-7…

…1-add-avx2-to-fm_detect-conv_k7_r2 to next * commit '028652df04a978772b9aa61ac2f5b2ed351dc98c': added avx2 to volk 8u x4 conv k7 r2 kernel Added AVX2 to fm_detect, still working on adding to conv_k7_r2
jessica-iwamoto · Jan 18, 2018 · 9dc890a · 9dc890a
2 parents 0c1536d + 028652d
commit 9dc890a
Show file tree

Hide file tree

Showing 5 changed files with 444 additions and 13 deletions.
diff --git a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
@@ -59,6 +59,74 @@
 #include <inttypes.h>
 #include <stdio.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+  if (num_points < 1) {
+    return;
+  }
+  unsigned int number = 1;
+  unsigned int j = 0;
+  // num_points-1 keeps Fedora 7's gcc from crashing...
+  // num_points won't work.  :(
+  const unsigned int eighthPoints = (num_points-1) / 8;
+
+  float* outPtr = outputVector;
+  const float* inPtr = inputVector;
+  __m256 upperBound = _mm256_set1_ps(bound);
+  __m256 lowerBound = _mm256_set1_ps(-bound);
+  __m256 next3old1;
+  __m256 next4;
+  __m256 boundAdjust;
+  __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above.
+  __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below.
+  // Do the first 8 by hand since we're going in from the saveValue:
+  *outPtr = *inPtr - *saveValue;
+  if (*outPtr >  bound) *outPtr -= 2*bound;
+  if (*outPtr < -bound) *outPtr += 2*bound;
+  inPtr++;
+  outPtr++;
+  for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr >  bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+
+  for (; number < eighthPoints; number++) {
+    // Load data
+    next3old1 = _mm256_loadu_ps((float*) (inPtr-1));
+    next4 = _mm256_load_ps(inPtr);
+    inPtr += 8;
+    // Subtract and store:
+    next3old1 = _mm256_sub_ps(next4, next3old1);
+    // Bound:
+    boundAdjust = _mm256_cmp_ps(next3old1, upperBound, 14);
+    boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
+    next4 = _mm256_cmp_ps(next3old1, lowerBound, 1);
+    next4 = _mm256_and_ps(next4, negBoundAdjust);
+    boundAdjust = _mm256_or_ps(next4, boundAdjust);
+    // Make sure we're in the bounding interval:
+    next3old1 = _mm256_add_ps(next3old1, boundAdjust);
+    _mm256_store_ps(outPtr,next3old1); // Store the results back into the output
+    outPtr += 8;
+  }
+
+  for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr >  bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+
+  *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_AVX */
+
+
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
 
@@ -159,3 +227,80 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector,
 
 
 #endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
+#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+  if (num_points < 1) {
+    return;
+  }
+  unsigned int number = 1;
+  unsigned int j = 0;
+  // num_points-1 keeps Fedora 7's gcc from crashing...
+  // num_points won't work.  :(
+  const unsigned int eighthPoints = (num_points-1) / 8;
+
+  float* outPtr = outputVector;
+  const float* inPtr = inputVector;
+  __m256 upperBound = _mm256_set1_ps(bound);
+  __m256 lowerBound = _mm256_set1_ps(-bound);
+  __m256 next3old1;
+  __m256 next4;
+  __m256 boundAdjust;
+  __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above.
+  __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below.
+  // Do the first 8 by hand since we're going in from the saveValue:
+  *outPtr = *inPtr - *saveValue;
+  if (*outPtr >  bound) *outPtr -= 2*bound;
+  if (*outPtr < -bound) *outPtr += 2*bound;
+  inPtr++;
+  outPtr++;
+  for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr >  bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+
+  for (; number < eighthPoints; number++) {
+    // Load data
+    next3old1 = _mm256_loadu_ps((float*) (inPtr-1));
+    next4 = _mm256_loadu_ps(inPtr);
+    inPtr += 8;
+    // Subtract and store:
+    next3old1 = _mm256_sub_ps(next4, next3old1);
+    // Bound:
+    boundAdjust = _mm256_cmp_ps(next3old1, upperBound, 14);
+    boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
+    next4 = _mm256_cmp_ps(next3old1, lowerBound, 1);
+    next4 = _mm256_and_ps(next4, negBoundAdjust);
+    boundAdjust = _mm256_or_ps(next4, boundAdjust);
+    // Make sure we're in the bounding interval:
+    next3old1 = _mm256_add_ps(next3old1, boundAdjust);
+    _mm256_storeu_ps(outPtr,next3old1); // Store the results back into the output
+    outPtr += 8;
+  }
+
+  for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr >  bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+
+  *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_AVX */
+
+
+#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H */
diff --git a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h
@@ -20,11 +20,21 @@
  * Boston, MA 02110-1301, USA.
  */
 
-#ifndef INCLUDED_volk_32f_x2_fm_detectpuppet_32f_H
-#define INCLUDED_volk_32f_x2_fm_detectpuppet_32f_H
+#ifndef INCLUDED_volk_32f_x2_fm_detectpuppet_32f_a_H
+#define INCLUDED_volk_32f_x2_fm_detectpuppet_32f_a_H
 
 #include "volk_32f_s32f_32f_fm_detect_32f.h"
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
+{
+  const float bound = 1.0f;
+
+  volk_32f_s32f_32f_fm_detect_32f_a_avx(outputVector, inputVector, bound, saveValue, num_points);
+}
+#endif /* LV_HAVE_AVX */
 
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
@@ -48,4 +58,22 @@ static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector,
 #endif /* LV_HAVE_GENERIC */
 
 
-#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_H */
+#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H
+#define INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H
+
+#include "volk_32f_s32f_32f_fm_detect_32f.h"
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
+{
+  const float bound = 1.0f;
+
+  volk_32f_s32f_32f_fm_detect_32f_u_avx(outputVector, inputVector, bound, saveValue, num_points);
+}
+#endif /* LV_HAVE_AVX */
+#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H */
diff --git a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
@@ -109,10 +109,6 @@ static inline int chainback_viterbi(unsigned char* data,
 #include <mmintrin.h>
 #include <stdio.h>
 
-
-
-
-
 static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
 
 
@@ -181,6 +177,77 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsig
 #endif /*LV_HAVE_SSE3*/
 
 
+#if LV_HAVE_AVX2
+
+#include <immintrin.h>
+#include <stdio.h>
+
+static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
+
+
+  static int once = 1;
+  int d_numstates = (1 << 6);
+  int rate = 2;
+  static unsigned char* D;
+  static unsigned char* Y;
+  static unsigned char* X;
+  static unsigned int excess = 6;
+  static unsigned char* Branchtab;
+  static unsigned char Partab[256];
+
+  int d_polys[2] = {79, 109};
+
+
+  if(once) {
+
+    X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment());
+    Y = X + d_numstates;
+    Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment());
+    D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment());
+    int state, i;
+    int cnt,ti;
+
+    /* Initialize parity lookup table */
+    for(i=0;i<256;i++){
+      cnt = 0;
+      ti = i;
+      while(ti){
+        if(ti & 1)
+          cnt++;
+        ti >>= 1;
+      }
+      Partab[i] = cnt & 1;
+    }
+    /*  Initialize the branch table */
+    for(state=0;state < d_numstates/2;state++){
+      for(i=0; i<rate; i++){
+        Branchtab[i*d_numstates/2+state] = (d_polys[i] < 0) ^ parity((2*state) & abs(d_polys[i]), Partab) ? 255 : 0;
+      }
+    }
+
+    once = 0;
+  }
+
+    //unbias the old_metrics
+  memset(X, 31, d_numstates);
+
+  volk_8u_x4_conv_k7_r2_8u_avx2(Y, X, syms, D, framebits/2 - excess, excess, Branchtab);
+
+  unsigned int min = X[0];
+  int i = 0, state = 0;
+  for(i = 0; i < (d_numstates); ++i) {
+    if(X[i] < min) {
+      min = X[i];
+      state = i;
+    }
+  }
+
+  chainback_viterbi(dec, framebits/2 -excess, state, excess, D);
+
+  return;
+}
+
+#endif /*LV_HAVE_AVX2*/