From d6eb61717755254ff3fd598a8590658ad0f88b94 Mon Sep 17 00:00:00 2001
From: Jessica Iwamoto
Date: Mon, 2 Oct 2017 11:58:51 -0700
Subject: [PATCH 1/2] Added AVX2 to fm_detect, still working on adding to
 conv_k7_r2

---
 .../volk/volk_32f_s32f_32f_fm_detect_32f.h | 145 +++++++++
 .../volk/volk_32f_x2_fm_detectpuppet_32f.h |  34 +-
 kernels/volk/volk_8u_conv_k7_r2puppet_8u.h |  75 ++++-
 kernels/volk/volk_8u_x4_conv_k7_r2_8u.h    | 295 ++++++++++++++++++
 lib/kernel_tests.h                         |  33 +-
 5 files changed, 560 insertions(+), 22 deletions(-)

diff --git a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
index 276bfe34d..668e7a432 100644
--- a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
+++ b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
@@ -59,6 +59,74 @@
 #include <inttypes.h>
 #include <stdio.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+  if (num_points < 1) {
+    return;
+  }
+  unsigned int number = 1;
+  unsigned int j = 0;
+  // num_points-1 keeps Fedora 7's gcc from crashing...
+  // num_points won't work. :(
+  const unsigned int eighthPoints = (num_points-1) / 8;
+
+  float* outPtr = outputVector;
+  const float* inPtr = inputVector;
+  __m256 upperBound = _mm256_set1_ps(bound);
+  __m256 lowerBound = _mm256_set1_ps(-bound);
+  __m256 next3old1;
+  __m256 next4;
+  __m256 boundAdjust;
+  __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above.
+  __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below.
+  // Do the first 8 by hand since we're going in from the saveValue:
+  *outPtr = *inPtr - *saveValue;
+  if (*outPtr > bound) *outPtr -= 2*bound;
+  if (*outPtr < -bound) *outPtr += 2*bound;
+  inPtr++;
+  outPtr++;
+  for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr > bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+
+  for (; number < eighthPoints; number++) {
+    // Load data
+    next3old1 = _mm256_loadu_ps((float*) (inPtr-1));
+    next4 = _mm256_load_ps(inPtr);
+    inPtr += 8;
+    // Subtract and store:
+    next3old1 = _mm256_sub_ps(next4, next3old1);
+    // Bound:
+    boundAdjust = _mm256_cmp_ps(next3old1, upperBound, 14);
+    boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
+    next4 = _mm256_cmp_ps(next3old1, lowerBound, 1);
+    next4 = _mm256_and_ps(next4, negBoundAdjust);
+    boundAdjust = _mm256_or_ps(next4, boundAdjust);
+    // Make sure we're in the bounding interval:
+    next3old1 = _mm256_add_ps(next3old1, boundAdjust);
+    _mm256_store_ps(outPtr,next3old1); // Store the results back into the output
+    outPtr += 8;
+  }
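+  // Note on the loop above: _mm256_cmp_ps with predicate 14 (_CMP_GT_OS)
+  // yields an all-ones lane wherever the difference exceeded +bound, and
+  // predicate 1 (_CMP_LT_OS) marks lanes below -bound. ANDing each mask with
+  // -2*bound or +2*bound and ORing the results builds the per-lane
+  // correction in one pass, a branchless form of the scalar wrap:
+  // if (d > bound) d -= 2*bound; if (d < -bound) d += 2*bound;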
+  for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr > bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+
+  *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_AVX */
+
+
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
@@ -159,3 +227,80 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector,
 
 
 #endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
+#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+  if (num_points < 1) {
+    return;
+  }
+  unsigned int number = 1;
+  unsigned int j = 0;
+  // num_points-1 keeps Fedora 7's gcc from crashing...
+  // num_points won't work. :(
+  const unsigned int eighthPoints = (num_points-1) / 8;
+
+  float* outPtr = outputVector;
+  const float* inPtr = inputVector;
+  __m256 upperBound = _mm256_set1_ps(bound);
+  __m256 lowerBound = _mm256_set1_ps(-bound);
+  __m256 next3old1;
+  __m256 next4;
+  __m256 boundAdjust;
+  __m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above.
+  __m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below.
+  // Do the first 8 by hand since we're going in from the saveValue:
+  *outPtr = *inPtr - *saveValue;
+  if (*outPtr > bound) *outPtr -= 2*bound;
+  if (*outPtr < -bound) *outPtr += 2*bound;
+  inPtr++;
+  outPtr++;
+  for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr > bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+
+  for (; number < eighthPoints; number++) {
+    // Load data
+    next3old1 = _mm256_loadu_ps((float*) (inPtr-1));
+    next4 = _mm256_loadu_ps(inPtr);
+    inPtr += 8;
+    // Subtract and store:
+    next3old1 = _mm256_sub_ps(next4, next3old1);
+    // Bound:
+    boundAdjust = _mm256_cmp_ps(next3old1, upperBound, 14);
+    boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
+    next4 = _mm256_cmp_ps(next3old1, lowerBound, 1);
+    next4 = _mm256_and_ps(next4, negBoundAdjust);
+    boundAdjust = _mm256_or_ps(next4, boundAdjust);
+    // Make sure we're in the bounding interval:
+    next3old1 = _mm256_add_ps(next3old1, boundAdjust);
+    _mm256_storeu_ps(outPtr,next3old1); // Store the results back into the output
+    outPtr += 8;
+  }
+
+  for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr > bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+
+  *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_AVX */
+
+
+#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H */
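For context, a minimal usage sketch of the dispatcher these new kernels sit behind; the buffer size and fill here are illustrative, not from the patch:

#include <volk/volk.h>

int main(void)
{
  unsigned int num_points = 1024;
  float* in = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());
  float* out = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());
  float save = 0.0f; /* carries the last input sample into the next call */
  /* ... fill `in` with phase samples, e.g. the atan2 of an FM baseband ... */
  volk_32f_s32f_32f_fm_detect_32f(out, in, 3.1415926f, &save, num_points);
  volk_free(in);
  volk_free(out);
  return 0;
}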
diff --git a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h
index a1f3e38c9..e1da18557 100644
--- a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h
+++ b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h
@@ -20,11 +20,21 @@
  * Boston, MA 02110-1301, USA.
  */
 
-#ifndef INCLUDED_volk_32f_x2_fm_detectpuppet_32f_H
-#define INCLUDED_volk_32f_x2_fm_detectpuppet_32f_H
+#ifndef INCLUDED_volk_32f_x2_fm_detectpuppet_32f_a_H
+#define INCLUDED_volk_32f_x2_fm_detectpuppet_32f_a_H
 
 #include "volk_32f_s32f_32f_fm_detect_32f.h"
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
+{
+  const float bound = 1.0f;
+
+  volk_32f_s32f_32f_fm_detect_32f_a_avx(outputVector, inputVector, bound, saveValue, num_points);
+}
+#endif /* LV_HAVE_AVX */
 
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
@@ -48,4 +58,22 @@ static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector,
 
 #endif /* LV_HAVE_GENERIC */
 
-#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_H */
+#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H
+#define INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H
+
+#include "volk_32f_s32f_32f_fm_detect_32f.h"
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
+{
+  const float bound = 1.0f;
+
+  volk_32f_s32f_32f_fm_detect_32f_u_avx(outputVector, inputVector, bound, saveValue, num_points);
+}
+#endif /* LV_HAVE_AVX */
+#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H */
diff --git a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
index b398556ca..a84d7e700 100644
--- a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
+++ b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
@@ -109,10 +109,6 @@ static inline int chainback_viterbi(unsigned char* data,
 #include <pmmintrin.h>
 #include <emmintrin.h>
 
-
-
-
-
 static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
 
 
@@ -181,6 +177,77 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsig
 
 #endif /*LV_HAVE_SSE3*/
 
+#if LV_HAVE_AVX2
+
+#include <immintrin.h>
+#include <stdio.h>
+
+static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms, unsigned char* dec, unsigned int framebits) {
+
+
+  static int once = 1;
+  int d_numstates = (1 << 6);
+  int rate = 2;
+  static unsigned char* D;
+  static unsigned char* Y;
+  static unsigned char* X;
+  static unsigned int excess = 6;
+  static unsigned char* Branchtab;
+  static unsigned char Partab[256];
+
+  int d_polys[2] = {79, 109};
+
+
+  if(once) {
+
+    X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment());
+    Y = X + d_numstates;
+    Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment());
+    D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment());
+    int state, i;
+    int cnt,ti;
+
+    /* Initialize parity lookup table */
+    for(i=0;i<256;i++){
+      cnt = 0;
+      ti = i;
+      while(ti){
+        if(ti & 1)
+          cnt++;
+        ti >>= 1;
+      }
+      Partab[i] = cnt & 1;
+    }
+    /* Initialize the branch table */
+    for(state=0;state < d_numstates/2;state++){
+      for(i=0; i<rate; i++){
+        Branchtab[i*d_numstates/2+state] = Partab[(2*state) & d_polys[i]] ? 255 : 0;
+      }
+    }
+
+    once = 0;
+  }
+
+  //unbias the old_metrics
+  memset(X, 31, d_numstates);
+
+  volk_8u_x4_conv_k7_r2_8u_avx2(Y, X, syms, D, framebits/2 - excess, excess, Branchtab);
+
+  unsigned int min = X[0];
+  int i = 0, state = 0;
+  for(i = 0; i < d_numstates; ++i) {
+    if(X[i] < min) {
+      min = X[i];
+      state = i;
+    }
+  }
+
+  chainback_viterbi(dec, framebits/2 - excess, state, excess, D);
+
+  return;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
 
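For context (not from the patch): the decoder above expects symbols from a K=7, rate-1/2 convolutional encoder with the polynomials {79, 109} it loads into d_polys. A hedged reference sketch, with hypothetical helper names:

/* Each input bit shifts into a 7-bit register; each output symbol is the
 * parity of (register & polynomial), emitted as a hard 0/255 soft byte. */
static unsigned char parity7(unsigned int x)
{
  x ^= x >> 4;
  x ^= x >> 2;
  x ^= x >> 1;
  return x & 1;
}

static void encode_k7_r2(const unsigned char* bits, unsigned int nbits, unsigned char* syms)
{
  unsigned int sr = 0; /* 7-bit shift register */
  unsigned int i;
  for (i = 0; i < nbits; i++) {
    sr = ((sr << 1) | (bits[i] & 1)) & 0x7f;
    *syms++ = parity7(sr & 79) ? 255 : 0;
    *syms++ = parity7(sr & 109) ? 255 : 0;
  }
}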
diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
--- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
+++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
@@ ... @@
+#if LV_HAVE_AVX2
+
+#include <immintrin.h>
+#include <stdio.h>
+
+static inline void
+volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X,
+                              unsigned char* syms, unsigned char* dec,
+                              unsigned int framebits, unsigned int excess,
+                              unsigned char* Branchtab)
+{
+  unsigned int i9;
+  for(i9 = 0; i9 < ((framebits + excess) >> 2); i9++) { // >>1
+    unsigned char a75, a81;
+    int a73, a92;
+    short int s20, s21, s26, s27;
+    unsigned char *a74, *a80, *b6;
+    short int *a110, *a111, *a91, *a93, *a94;
+    __m256i *a102, *a112, *a113, *a71, *a72, *a77, *a83
+      , *a95, *a96, *a97, *a98, *a99;
+    __m256i a105, a106, a86, a87;
+    __m256i a100, a101, a103, a104, a107, a108, a109
+      , a76, a78, a79, a82, a84, a85, a88, a89
+      , a90, d10, d11, d12, d9, m23, m24, m25
+      , m26, m27, m28, m29, m30, s18, s19, s22
+      , s23, s24, s25, s28, s29, t13, t14, t15
+      , t16, t17, t18;
+    a71 = ((__m256i *) X);
+    s18 = *(a71);
+    a72 = (a71 + 2); //?
+    s19 = *(a72);
+    a73 = (8 * i9); //4
+    a74 = (syms + a73);
+    a75 = *(a74);
+    a76 = _mm256_set1_epi8(a75);
+    a77 = ((__m256i *) Branchtab);
+    a78 = *(a77);
+    a79 = _mm256_xor_si256(a76, a78);
+    b6 = (a73 + syms);
+    a80 = (b6 + 1);
+    a81 = *(a80);
+    a82 = _mm256_set1_epi8(a81);
+    a83 = (a77 + 2); //?
+    a84 = *(a83);
+    a85 = _mm256_xor_si256(a82, a84);
+    t13 = _mm256_avg_epu8(a79,a85);
+    a86 = ((__m256i ) t13);
+    a87 = _mm256_srli_epi16(a86, 2);
+    a88 = ((__m256i ) a87);
+    t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
+    t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
+    m23 = _mm256_adds_epu8(s18, t14);
+    m24 = _mm256_adds_epu8(s19, t15);
+    m25 = _mm256_adds_epu8(s18, t15);
+    m26 = _mm256_adds_epu8(s19, t14);
+    a89 = _mm256_min_epu8(m24, m23);
+    d9 = _mm256_cmpeq_epi8(a89, m24);
+    a90 = _mm256_min_epu8(m26, m25);
+    d10 = _mm256_cmpeq_epi8(a90, m26);
+    s22 = _mm256_unpacklo_epi8(d9,d10); //could go wrong here; also, addresses
+    s23 = _mm256_unpackhi_epi8(d9,d10);
+    s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
+    a91 = ((short int *) dec);
+    a92 = (8 * i9); //8
+    a93 = (a91 + a92);
+    *(a93) = s20;
+    s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
+    a94 = (a93 + 1);
+    *(a94) = s21;
+    s22 = _mm256_unpacklo_epi8(a89, a90);
+    s23 = _mm256_unpackhi_epi8(a89, a90);
+    a95 = ((__m256i *) Y);
+    s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
+    *(a95) = s24;
+    s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
+    a96 = (a95 + 1);
+    *(a96) = s23;
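+    // Branch metrics above: XORing the broadcast soft symbol with Branchtab
+    // gives per-state distances, _mm256_avg_epu8 averages the two rate-1/2
+    // distances, and the shift plus the &63 mask rescales the result to a
+    // 6-bit metric; the 63-complement is the metric of the opposite branch.
+    // The adds/min/cmpeq sequence is the add-compare-select step, and each
+    // _mm256_movemask_epi8 packs 32 survivor decisions into one word. Note
+    // that movemask returns 32 bits, so storing through a short int* keeps
+    // only 16 of them; the second patch below switches dec to an int*.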
+    a97 = (a71 + 1);
+    s24 = *(a97);
+    a98 = (a71 + 3);
+    s25 = *(a98);
+    a99 = (a77 + 1);
+    a100 = *(a99);
+    a101 = _mm256_xor_si256(a76, a100);
+    a102 = (a77 + 3);
+    a103 = *(a102);
+    a104 = _mm256_xor_si256(a82, a103);
+    t16 = _mm256_avg_epu8(a101,a104);
+    a105 = ((__m256i ) t16);
+    a106 = _mm256_srli_epi16(a105, 2);
+    a107 = ((__m256i ) a106);
+    t17 = _mm256_and_si256(a107, _mm256_set1_epi8(63));
+    t18 = _mm256_subs_epu8(_mm256_set1_epi8(63), t17);
+    m27 = _mm256_adds_epu8(s24, t17);
+    m28 = _mm256_adds_epu8(s25, t18);
+    m29 = _mm256_adds_epu8(s24, t18);
+    m30 = _mm256_adds_epu8(s25, t17);
+    a108 = _mm256_min_epu8(m28, m27);
+    d11 = _mm256_cmpeq_epi8(a108, m28);
+    a109 = _mm256_min_epu8(m30, m29);
+    d12 = _mm256_cmpeq_epi8(a109, m30);
+    s24 = _mm256_unpacklo_epi8(d11,d12);
+    s25 = _mm256_unpackhi_epi8(d11,d12);
+    s26 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
+    a110 = (a93 + 2);
+    *(a110) = s26;
+    s27 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
+    a111 = (a93 + 3);
+    *(a111) = s27;
+    s28 = _mm256_unpacklo_epi8(a108, a109);
+    s29 = _mm256_unpackhi_epi8(a108, a109);
+    s25 = _mm256_permute2x128_si256(s28, s29, 0x20);
+    a112 = (a95 + 2);
+    *(a112) = s25;
+    s29 = _mm256_permute2x128_si256(s28, s29, 0x31);
+    a113 = (a95 + 3);
+/*  *(a113) = s29;
+    if ((((unsigned char *) Y)[0]>210)) {
+      __m256i m5, m6;
+      m5 = ((__m256i *) Y)[0];
+      m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[1]);
+      m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[2]);
+      m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[3]);
+      __m256i m7;
+      m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
+      m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 32)), ((__m256i ) m7)));
+      m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 16)), ((__m256i ) m7)));
+      m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 8)), ((__m256i ) m7)));
+      m7 = _mm256_unpacklo_epi8(m7, m7);
+      m7 = _mm256_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
+      m6 = _mm256_unpacklo_epi64(m7, m7);
+      m6 = _mm256_permute2x128_si256(m6, m6, 0); //copy lower half of m6 to upper half, since above ops operate on 128 bit lanes
+      ((__m256i *) Y)[0] = _mm256_subs_epu8(((__m256i *) Y)[0], m6);
+      ((__m256i *) Y)[1] = _mm256_subs_epu8(((__m256i *) Y)[1], m6);
+      ((__m256i *) Y)[2] = _mm256_subs_epu8(((__m256i *) Y)[2], m6);
+      ((__m256i *) Y)[3] = _mm256_subs_epu8(((__m256i *) Y)[3], m6);
+    }
+    unsigned char a188, a194;
+    int a186, a205;
+    short int s48, s49, s54, s55;
+    unsigned char *a187, *a193, *b15;
+    short int *a204, *a206, *a207, *a223, *a224, *b16;
+    __m256i *a184, *a185, *a190, *a196, *a208, *a209, *a210
+      , *a211, *a212, *a215, *a225, *a226;
+    __m256i a199, a200, a218, a219;
+    __m256i a189, a191, a192, a195, a197, a198, a201
+      , a202, a203, a213, a214, a216, a217, a220, a221
+      , a222, d17, d18, d19, d20, m39, m40, m41
+      , m42, m43, m44, m45, m46, s46, s47, s50
+      , s51, s52, s53, s56, s57, t25, t26, t27
+      , t28, t29, t30;
+    a184 = ((__m256i *) Y);
+    s46 = *(a184);
+    a185 = (a184 + 2);
+    s47 = *(a185);
+    a186 = (4 * i9); //4
+    b15 = (a186 + syms);
+    a187 = (b15 + 4);
+    a188 = *(a187);
+    a189 = _mm256_set1_epi8(a188);
+    a190 = ((__m256i *) Branchtab);
+    a191 = *(a190);
+    a192 = _mm256_xor_si256(a189, a191);
+    a193 = (b15 + 3);
+    a194 = *(a193);
+    a195 = _mm256_set1_epi8(a194);
+    a196 = (a190 + 2);
+    a197 = *(a196);
+    a198 = _mm256_xor_si256(a195, a197);
+    t25 = _mm256_avg_epu8(a192,a198);
+    a199 = ((__m256i ) t25);
+    a200 = _mm256_srli_epi16(a199, 2);
+    a201 = ((__m256i ) a200);
+    t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
+    t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
+    m39 = _mm256_adds_epu8(s46, t26);
+    m40 = _mm256_adds_epu8(s47, t27);
+    m41 = _mm256_adds_epu8(s46, t27);
+    m42 = _mm256_adds_epu8(s47, t26);
+    a202 = _mm256_min_epu8(m40, m39);
+    d17 = _mm256_cmpeq_epi8(a202, m40);
+    a203 = _mm256_min_epu8(m42, m41);
+    d18 = _mm256_cmpeq_epi8(a203, m42);
+    s24 = _mm256_unpacklo_epi8(d17,d18);
+    s25 = _mm256_unpackhi_epi8(d17,d18);
+    s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
+    a204 = ((short int *) dec);
+    a205 = (8 * i9); //8
+    b16 = (a204 + a205);
+    a206 = (b16 + 4);
+    *(a206) = s48;
+    s49 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
+    a207 = (b16 + 5);
+    *(a207) = s49;
+    s50 = _mm256_unpacklo_epi8(a202, a203);
+    s51 = _mm256_unpackhi_epi8(a202, a203);
+    s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
+    s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
+    a208 = ((__m256i *) X);
+    *(a208) = s25;
+    a209 = (a208 + 1);
+    *(a209) = s51;
+    a210 = (a184 + 1);
+    s52 = *(a210);
+    a211 = (a184 + 3);
+    s53 = *(a211);
+    a212 = (a190 + 1);
+    a213 = *(a212);
+    a214 = _mm256_xor_si256(a189, a213);
+    a215 = (a190 + 3);
+    a216 = *(a215);
+    a217 = _mm256_xor_si256(a195, a216);
+    t28 = _mm256_avg_epu8(a214,a217);
+    a218 = ((__m256i ) t28);
+    a219 = _mm256_srli_epi16(a218, 2);
+    a220 = ((__m256i ) a219);
+    t29 = _mm256_and_si256(a220, _mm256_set1_epi8(63));
+    t30 = _mm256_subs_epu8(_mm256_set1_epi8(63), t29);
+    m43 = _mm256_adds_epu8(s52, t29);
+    m44 = _mm256_adds_epu8(s53, t30);
+    m45 = _mm256_adds_epu8(s52, t30);
+    m46 = _mm256_adds_epu8(s53, t29);
+    a221 = _mm256_min_epu8(m44, m43);
+    d19 = _mm256_cmpeq_epi8(a221, m44);
+    a222 = _mm256_min_epu8(m46, m45);
+    d20 = _mm256_cmpeq_epi8(a222, m46);
+    s24 = _mm256_unpacklo_epi8(d19,d20);
+    s25 = _mm256_unpackhi_epi8(d19,d20);
+    s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
+    a223 = (b16 + 6);
+    *(a223) = s54;
+    s55 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
+    a224 = (b16 + 7);
+    *(a224) = s55;
+    s56 = _mm256_unpacklo_epi8(a221, a222);
+    s57 = _mm256_unpackhi_epi8(a221, a222);
+    s25 = _mm256_permute2x128_si256(s56, s57, 0x20);
+    s57 = _mm256_permute2x128_si256(s56, s57, 0x32);
+    a225 = (a208 + 2);
+    *(a225) = s25;
+    a226 = (a208 + 3);
+    *(a226) = s57;
+    if ((((unsigned char *) X)[0]>210)) {
+      __m256i m12, m13;
+      m12 = ((__m256i *) X)[0];
+      m12 = _mm256_min_epu8(m12, ((__m256i *) X)[1]);
+      m12 = _mm256_min_epu8(m12, ((__m256i *) X)[2]);
+      m12 = _mm256_min_epu8(m12, ((__m256i *) X)[3]);
+      __m256i m14;
+      m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
+      m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 32)), ((__m256i ) m14)));
+      m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 16)), ((__m256i ) m14)));
+      m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 8)), ((__m256i ) m14)));
+      m14 = _mm256_unpacklo_epi8(m14, m14);
+      m14 = _mm256_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
+      m13 = _mm256_unpacklo_epi64(m14, m14);
+      m13 = _mm256_permute2x128_si256(m13, m13, 0);
+      ((__m256i *) X)[0] = _mm256_subs_epu8(((__m256i *) X)[0], m13);
+      ((__m256i *) X)[1] = _mm256_subs_epu8(((__m256i *) X)[1], m13);
+      ((__m256i *) X)[2] = _mm256_subs_epu8(((__m256i *) X)[2], m13);
+      ((__m256i *) X)[3] = _mm256_subs_epu8(((__m256i *) X)[3], m13);
+    } */
+  }
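+  // Everything from "*(a113) = s29;" onward is the second trellis step of
+  // each pass plus the in-loop renormalization, left commented out while the
+  // AVX2 port is still in progress (see the commit message); only the X
+  // renormalization below runs for now.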
+
+  renormalize(X, 210);
+
+  /*int ch;
+  for(ch = 0; ch < 64; ch++) {
+    printf("%d,", X[ch]);
+  }
+  printf("\n");*/
+
+  unsigned int j;
+  for(j=0; j < (framebits + excess) % 2; ++j) {
+    int i;
+    for(i=0;i<64/2;i++){
+      BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab);
+    }
+
+
+    renormalize(Y, 210);
+
+    /*printf("\n");
+    for(ch = 0; ch < 64; ch++) {
+      printf("%d,", Y[ch]);
+    }
+    printf("\n");*/
+
+  }
+  /*skip*/
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+
 #if LV_HAVE_SSE3
 #include <pmmintrin.h>
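For reference, the renormalization the vector code performs is equivalent to this scalar sketch (helper name hypothetical):

static void renormalize_scalar(unsigned char* metrics, unsigned char threshold)
{
  /* When the reference metric drifts past the threshold, subtract the
   * minimum from all 64 states; survivor decisions are unaffected because
   * only metric differences matter. This keeps epu8 metrics from saturating. */
  if (metrics[0] > threshold) {
    unsigned char min = metrics[0];
    int i;
    for (i = 1; i < 64; i++)
      if (metrics[i] < min)
        min = metrics[i];
    for (i = 0; i < 64; i++)
      metrics[i] -= min;
  }
}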
diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h
index dfb70d4ae..96f27a881 100644
--- a/lib/kernel_tests.h
+++ b/lib/kernel_tests.h
@@ -30,6 +30,16 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
         test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
 
     std::vector<volk_test_case_t> test_cases = boost::assign::list_of
+        (VOLK_INIT_PUPP(volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, volk_test_params_t(0, test_params.scalar(), test_params.vlen(), test_params.iter()/10, test_params.benchmark_mode(), test_params.kernel_regex())))
+        (VOLK_INIT_PUPP(volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params))
+        (VOLK_INIT_TEST(volk_32f_expfast_32f, volk_test_params_t(1e-1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+        (VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc))
+        (VOLK_INIT_TEST(volk_32f_log2_32f, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+        (VOLK_INIT_TEST(volk_32f_x2_pow_32f, volk_test_params_t(1e-2, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+        (VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params))
+        (VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params))
+        (VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params))
+        (VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc))
         (VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, volk_test_params_t(1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
         (VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params))
         (VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc))
@@ -115,27 +125,20 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
         (VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params))
         (VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params))
         (VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc, volk_32fc_s32fc_x2_rotator_32fc, test_params))
-        (VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params))
         (VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params))
-        (VOLK_INIT_PUPP(volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, volk_test_params_t(0, test_params.scalar(), test_params.vlen(), test_params.iter()/10, test_params.benchmark_mode(), test_params.kernel_regex())))
-        (VOLK_INIT_PUPP(volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params))
-        (VOLK_INIT_TEST(volk_32f_index_max_16u, test_params))
-        (VOLK_INIT_TEST(volk_32f_index_max_32u, test_params))
-        (VOLK_INIT_TEST(volk_32f_log2_32f, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
-        (VOLK_INIT_TEST(volk_32f_expfast_32f, volk_test_params_t(1e-1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
-        (VOLK_INIT_TEST(volk_32f_x2_pow_32f, volk_test_params_t(1e-2, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+
+        // These use the simdmath library, which isn't supported on my machine, so AVX2 can't be added and tested
         (VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params))
-        (VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc))
+        (VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
+        (VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params))
         (VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params))
+
+        // These still need AVX2 intrinsics
+        (VOLK_INIT_TEST(volk_32f_index_max_16u, test_params))
+        (VOLK_INIT_TEST(volk_32f_index_max_32u, test_params))
         (VOLK_INIT_TEST(volk_32fc_index_max_16u, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
         (VOLK_INIT_TEST(volk_32fc_index_max_32u, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
-        (VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params))
-        (VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params))
-        (VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params))
-        (VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params))
-        (VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
-        (VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc))
         (VOLK_INIT_PUPP(volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params))
         (VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, volk_32f_8u_polarbutterfly_32f, test_params))
         // no one uses these, so don't test them
From bd8580b5857f59d4d356d369f9aad12a59877456 Mon Sep 17 00:00:00 2001
From: Jessica Iwamoto
Date: Mon, 15 Jan 2018 17:19:50 -0800
Subject: [PATCH 2/2] Added AVX2 to volk 8u x4 conv k7 r2 kernel

---
 kernels/volk/volk_8u_x4_conv_k7_r2_8u.h | 205 ++++++------------------
 1 file changed, 51 insertions(+), 154 deletions(-)

diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
index 69ca0cad6..6321269d8 100644
--- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
+++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
@@ -134,37 +134,36 @@ volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X,
                               unsigned char* Branchtab)
 {
   unsigned int i9;
-  for(i9 = 0; i9 < ((framebits + excess) >> 2); i9++) { // >>1
+  for(i9 = 0; i9 < ((framebits + excess)>>1); i9++) {
     unsigned char a75, a81;
     int a73, a92;
-    short int s20, s21, s26, s27;
-    unsigned char *a74, *a80, *b6;
-    short int *a110, *a111, *a91, *a93, *a94;
-    __m256i *a102, *a112, *a113, *a71, *a72, *a77, *a83
-      , *a95, *a96, *a97, *a98, *a99;
-    __m256i a105, a106, a86, a87;
-    __m256i a100, a101, a103, a104, a107, a108, a109
-      , a76, a78, a79, a82, a84, a85, a88, a89
-      , a90, d10, d11, d12, d9, m23, m24, m25
-      , m26, m27, m28, m29, m30, s18, s19, s22
-      , s23, s24, s25, s28, s29, t13, t14, t15
-      , t16, t17, t18;
+    int s20, s21;
+    unsigned char *a80, *b6;
+    int *a110, *a91, *a93;
+    __m256i *a112, *a71, *a72, *a77, *a83, *a95;
+    __m256i a86, a87;
+    __m256i a76, a78, a79, a82, a84, a85, a88, a89
+      , a90, d10, d9, m23, m24, m25
+      , m26, s18, s19, s22
+      , s23, s24, s25, t13, t14, t15;
     a71 = ((__m256i *) X);
     s18 = *(a71);
-    a72 = (a71 + 2); //?
+    a72 = (a71 + 1);
     s19 = *(a72);
+    s22 = _mm256_permute2x128_si256(s18,s19,0x20);
+    s19 = _mm256_permute2x128_si256(s18,s19,0x31);
+    s18 = s22;
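+    // The permute2x128 pairs with selectors 0x20/0x31 splice the low and the
+    // high 128-bit halves of two registers. They are needed because the
+    // 256-bit unpack intrinsics interleave within each 128-bit lane rather
+    // than across the whole register, so state ordering must be repaired
+    // explicitly.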
-    a73 = (8 * i9); //4
-    a74 = (syms + a73);
-    a75 = *(a74);
+    a73 = (4 * i9);
+    b6 = (syms + a73);
+    a75 = *(b6);
     a76 = _mm256_set1_epi8(a75);
     a77 = ((__m256i *) Branchtab);
     a78 = *(a77);
     a79 = _mm256_xor_si256(a76, a78);
-    b6 = (a73 + syms);
     a80 = (b6 + 1);
     a81 = *(a80);
     a82 = _mm256_set1_epi8(a81);
-    a83 = (a77 + 2); //?
+    a83 = (a77 + 1);
     a84 = *(a83);
     a85 = _mm256_xor_si256(a82, a84);
     t13 = _mm256_avg_epu8(a79,a85);
@@ -181,114 +180,68 @@ volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X,
     d9 = _mm256_cmpeq_epi8(a89, m24);
     a90 = _mm256_min_epu8(m26, m25);
     d10 = _mm256_cmpeq_epi8(a90, m26);
-    s22 = _mm256_unpacklo_epi8(d9,d10); //could go wrong here; also, addresses
+    s22 = _mm256_unpacklo_epi8(d9,d10);
     s23 = _mm256_unpackhi_epi8(d9,d10);
     s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
-    a91 = ((short int *) dec);
-    a92 = (8 * i9); //8
+    a91 = ((int *) dec);
+    a92 = (4 * i9);
     a93 = (a91 + a92);
     *(a93) = s20;
     s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
-    a94 = (a93 + 1);
-    *(a94) = s21;
+    a110 = (a93 + 1);
+    *(a110) = s21;
     s22 = _mm256_unpacklo_epi8(a89, a90);
     s23 = _mm256_unpackhi_epi8(a89, a90);
     a95 = ((__m256i *) Y);
     s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
     *(a95) = s24;
     s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
-    a96 = (a95 + 1);
-    *(a96) = s23;
-    a97 = (a71 + 1);
-    s24 = *(a97);
-    a98 = (a71 + 3);
-    s25 = *(a98);
-    a99 = (a77 + 1);
-    a100 = *(a99);
-    a101 = _mm256_xor_si256(a76, a100);
-    a102 = (a77 + 3);
-    a103 = *(a102);
-    a104 = _mm256_xor_si256(a82, a103);
-    t16 = _mm256_avg_epu8(a101,a104);
-    a105 = ((__m256i ) t16);
-    a106 = _mm256_srli_epi16(a105, 2);
-    a107 = ((__m256i ) a106);
-    t17 = _mm256_and_si256(a107, _mm256_set1_epi8(63));
-    t18 = _mm256_subs_epu8(_mm256_set1_epi8(63), t17);
-    m27 = _mm256_adds_epu8(s24, t17);
-    m28 = _mm256_adds_epu8(s25, t18);
-    m29 = _mm256_adds_epu8(s24, t18);
-    m30 = _mm256_adds_epu8(s25, t17);
-    a108 = _mm256_min_epu8(m28, m27);
-    d11 = _mm256_cmpeq_epi8(a108, m28);
-    a109 = _mm256_min_epu8(m30, m29);
-    d12 = _mm256_cmpeq_epi8(a109, m30);
-    s24 = _mm256_unpacklo_epi8(d11,d12);
-    s25 = _mm256_unpackhi_epi8(d11,d12);
-    s26 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
-    a110 = (a93 + 2);
-    *(a110) = s26;
-    s27 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
-    a111 = (a93 + 3);
-    *(a111) = s27;
-    s28 = _mm256_unpacklo_epi8(a108, a109);
-    s29 = _mm256_unpackhi_epi8(a108, a109);
-    s25 = _mm256_permute2x128_si256(s28, s29, 0x20);
-    a112 = (a95 + 2);
-    *(a112) = s25;
-    s29 = _mm256_permute2x128_si256(s28, s29, 0x31);
-    a113 = (a95 + 3);
-/*  *(a113) = s29;
+    a112 = (a95 + 1);
+    *(a112) = s23;
     if ((((unsigned char *) Y)[0]>210)) {
       __m256i m5, m6;
       m5 = ((__m256i *) Y)[0];
       m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[1]);
-      m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[2]);
-      m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[3]);
       __m256i m7;
       m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
       m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 32)), ((__m256i ) m7)));
       m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 16)), ((__m256i ) m7)));
       m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 8)), ((__m256i ) m7)));
       m7 = _mm256_unpacklo_epi8(m7, m7);
-      m7 = _mm256_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
+      m7 = _mm256_shufflelo_epi16(m7, 0);
       m6 = _mm256_unpacklo_epi64(m7, m7);
       m6 = _mm256_permute2x128_si256(m6, m6, 0); //copy lower half of m6 to upper half, since above ops operate on 128 bit lanes
       ((__m256i *) Y)[0] = _mm256_subs_epu8(((__m256i *) Y)[0], m6);
       ((__m256i *) Y)[1] = _mm256_subs_epu8(((__m256i *) Y)[1], m6);
-      ((__m256i *) Y)[2] = _mm256_subs_epu8(((__m256i *) Y)[2], m6);
-      ((__m256i *) Y)[3] = _mm256_subs_epu8(((__m256i *) Y)[3], m6);
     }
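+    // Horizontal byte-minimum above: the srli/min ladder folds 8-, 4-, 2-,
+    // and 1-byte distances, the unpack/shufflelo broadcast the minimum
+    // within a lane, and permute2x128 with selector 0 copies it across
+    // lanes before it is subtracted from every state metric.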
     unsigned char a188, a194;
     int a186, a205;
-    short int s48, s49, s54, s55;
+    int s48, s54;
     unsigned char *a187, *a193, *b15;
-    short int *a204, *a206, *a207, *a223, *a224, *b16;
-    __m256i *a184, *a185, *a190, *a196, *a208, *a209, *a210
-      , *a211, *a212, *a215, *a225, *a226;
-    __m256i a199, a200, a218, a219;
+    int *a204, *a206, *a223, *b16;
+    __m256i *a184, *a185, *a190, *a196, *a208, *a225;
+    __m256i a199, a200;
     __m256i a189, a191, a192, a195, a197, a198, a201
-      , a202, a203, a213, a214, a216, a217, a220, a221
-      , a222, d17, d18, d19, d20, m39, m40, m41
-      , m42, m43, m44, m45, m46, s46, s47, s50
-      , s51, s52, s53, s56, s57, t25, t26, t27
-      , t28, t29, t30;
+      , a202, a203, d17, d18, m39, m40, m41
+      , m42, s46, s47, s50
+      , s51, t25, t26, t27;
     a184 = ((__m256i *) Y);
     s46 = *(a184);
-    a185 = (a184 + 2);
+    a185 = (a184 + 1);
     s47 = *(a185);
-    a186 = (4 * i9); //4
-    b15 = (a186 + syms);
-    a187 = (b15 + 4);
+    s50 = _mm256_permute2x128_si256(s46,s47,0x20);
+    s47 = _mm256_permute2x128_si256(s46,s47,0x31);
+    s46 = s50;
+    a187 = (b6 + 2);
     a188 = *(a187);
     a189 = _mm256_set1_epi8(a188);
     a190 = ((__m256i *) Branchtab);
     a191 = *(a190);
     a192 = _mm256_xor_si256(a189, a191);
-    a193 = (b15 + 3);
+    a193 = (b6 + 3);
     a194 = *(a193);
     a195 = _mm256_set1_epi8(a194);
-    a196 = (a190 + 2);
+    a196 = (a190 + 1);
     a197 = *(a196);
     a198 = _mm256_xor_si256(a195, a197);
     t25 = _mm256_avg_epu8(a192,a198);
@@ -308,92 +261,43 @@ volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X,
     s24 = _mm256_unpacklo_epi8(d17,d18);
     s25 = _mm256_unpackhi_epi8(d17,d18);
     s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
-    a204 = ((short int *) dec);
-    a205 = (8 * i9); //8
+    a204 = ((int *) dec);
+    a205 = (4 * i9);
     b16 = (a204 + a205);
-    a206 = (b16 + 4);
+    a206 = (b16 + 2);
     *(a206) = s48;
-    s49 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
-    a207 = (b16 + 5);
-    *(a207) = s49;
+    s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
+    a223 = (b16 + 3);
+    *(a223) = s54;
     s50 = _mm256_unpacklo_epi8(a202, a203);
     s51 = _mm256_unpackhi_epi8(a202, a203);
     s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
     s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
     a208 = ((__m256i *) X);
     *(a208) = s25;
-    a209 = (a208 + 1);
-    *(a209) = s51;
-    a210 = (a184 + 1);
-    s52 = *(a210);
-    a211 = (a184 + 3);
-    s53 = *(a211);
-    a212 = (a190 + 1);
-    a213 = *(a212);
-    a214 = _mm256_xor_si256(a189, a213);
-    a215 = (a190 + 3);
-    a216 = *(a215);
-    a217 = _mm256_xor_si256(a195, a216);
-    t28 = _mm256_avg_epu8(a214,a217);
-    a218 = ((__m256i ) t28);
-    a219 = _mm256_srli_epi16(a218, 2);
-    a220 = ((__m256i ) a219);
-    t29 = _mm256_and_si256(a220, _mm256_set1_epi8(63));
-    t30 = _mm256_subs_epu8(_mm256_set1_epi8(63), t29);
-    m43 = _mm256_adds_epu8(s52, t29);
-    m44 = _mm256_adds_epu8(s53, t30);
-    m45 = _mm256_adds_epu8(s52, t30);
-    m46 = _mm256_adds_epu8(s53, t29);
-    a221 = _mm256_min_epu8(m44, m43);
-    d19 = _mm256_cmpeq_epi8(a221, m44);
-    a222 = _mm256_min_epu8(m46, m45);
-    d20 = _mm256_cmpeq_epi8(a222, m46);
-    s24 = _mm256_unpacklo_epi8(d19,d20);
-    s25 = _mm256_unpackhi_epi8(d19,d20);
-    s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
-    a223 = (b16 + 6);
-    *(a223) = s54;
-    s55 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
-    a224 = (b16 + 7);
-    *(a224) = s55;
-    s56 = _mm256_unpacklo_epi8(a221, a222);
-    s57 = _mm256_unpackhi_epi8(a221, a222);
-    s25 = _mm256_permute2x128_si256(s56, s57, 0x20);
-    s57 = _mm256_permute2x128_si256(s56, s57, 0x32);
-    a225 = (a208 + 2);
-    *(a225) = s25;
-    a226 = (a208 + 3);
-    *(a226) = s57;
+    a225 = (a208 + 1);
+    *(a225) = s51;
+
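+    // The second trellis step of the pass reads its old metrics from Y and
+    // writes the survivors back into X, so X and Y ping-pong as the old/new
+    // metric buffers across the two steps handled per loop iteration.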
     if ((((unsigned char *) X)[0]>210)) {
       __m256i m12, m13;
       m12 = ((__m256i *) X)[0];
       m12 = _mm256_min_epu8(m12, ((__m256i *) X)[1]);
-      m12 = _mm256_min_epu8(m12, ((__m256i *) X)[2]);
-      m12 = _mm256_min_epu8(m12, ((__m256i *) X)[3]);
       __m256i m14;
       m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
       m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 32)), ((__m256i ) m14)));
       m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 16)), ((__m256i ) m14)));
       m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 8)), ((__m256i ) m14)));
       m14 = _mm256_unpacklo_epi8(m14, m14);
-      m14 = _mm256_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
+      m14 = _mm256_shufflelo_epi16(m14, 0);
       m13 = _mm256_unpacklo_epi64(m14, m14);
       m13 = _mm256_permute2x128_si256(m13, m13, 0);
       ((__m256i *) X)[0] = _mm256_subs_epu8(((__m256i *) X)[0], m13);
       ((__m256i *) X)[1] = _mm256_subs_epu8(((__m256i *) X)[1], m13);
-      ((__m256i *) X)[2] = _mm256_subs_epu8(((__m256i *) X)[2], m13);
-      ((__m256i *) X)[3] = _mm256_subs_epu8(((__m256i *) X)[3], m13);
-    } */
+    }
   }
 
   renormalize(X, 210);
 
-  /*int ch;
-  for(ch = 0; ch < 64; ch++) {
-    printf("%d,", X[ch]);
-  }
-  printf("\n");*/
-
   unsigned int j;
   for(j=0; j < (framebits + excess) % 2; ++j) {
     int i;
@@ -401,15 +305,8 @@ volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X,
       BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (decision_t *)dec, Branchtab);
     }
 
-
     renormalize(Y, 210);
 
-    /*printf("\n");
-    for(ch = 0; ch < 64; ch++) {
-      printf("%d,", Y[ch]);
-    }
-    printf("\n");*/
-
   }
   /*skip*/
 }
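For completeness, a minimal usage sketch of the decoder path these patches touch; the dispatcher name comes from the puppet header above, while the frame length and buffer sizes here are illustrative assumptions:

#include <volk/volk.h>

int main(void)
{
  unsigned int framebits = 2048; /* assumed frame size */
  /* one soft byte per channel symbol; sizes are illustrative */
  unsigned char* syms = (unsigned char*)volk_malloc(framebits, volk_get_alignment());
  unsigned char* dec = (unsigned char*)volk_malloc(framebits, volk_get_alignment());
  /* ... fill syms with soft decisions (0 = strong 0, 255 = strong 1) ... */
  volk_8u_conv_k7_r2puppet_8u(syms, dec, framebits); /* picks AVX2 when available */
  volk_free(syms);
  volk_free(dec);
  return 0;
}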