Skip to content

Commit

Permalink
Merge pull request gnuradio#29 in GNSSSDR/volk from feature/GNSSSDR-7…
Browse files Browse the repository at this point in the history
…1-add-avx2-to-fm_detect-conv_k7_r2 to next

* commit '028652df04a978772b9aa61ac2f5b2ed351dc98c':
  added avx2 to volk 8u x4 conv k7 r2 kernel
  Added AVX2 to fm_detect, still working on adding to conv_k7_r2
  • Loading branch information
jessica-iwamoto committed Jan 18, 2018
2 parents 0c1536d + 028652d commit 9dc890a
Show file tree
Hide file tree
Showing 5 changed files with 444 additions and 13 deletions.
145 changes: 145 additions & 0 deletions kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,74 @@
#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
if (num_points < 1) {
return;
}
unsigned int number = 1;
unsigned int j = 0;
// num_points-1 keeps Fedora 7's gcc from crashing...
// num_points won't work. :(
const unsigned int eighthPoints = (num_points-1) / 8;

float* outPtr = outputVector;
const float* inPtr = inputVector;
__m256 upperBound = _mm256_set1_ps(bound);
__m256 lowerBound = _mm256_set1_ps(-bound);
__m256 next3old1;
__m256 next4;
__m256 boundAdjust;
__m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above.
__m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below.
// Do the first 8 by hand since we're going in from the saveValue:
*outPtr = *inPtr - *saveValue;
if (*outPtr > bound) *outPtr -= 2*bound;
if (*outPtr < -bound) *outPtr += 2*bound;
inPtr++;
outPtr++;
for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) {
*outPtr = *(inPtr) - *(inPtr-1);
if (*outPtr > bound) *outPtr -= 2*bound;
if (*outPtr < -bound) *outPtr += 2*bound;
inPtr++;
outPtr++;
}

for (; number < eighthPoints; number++) {
// Load data
next3old1 = _mm256_loadu_ps((float*) (inPtr-1));
next4 = _mm256_load_ps(inPtr);
inPtr += 8;
// Subtract and store:
next3old1 = _mm256_sub_ps(next4, next3old1);
// Bound:
boundAdjust = _mm256_cmp_ps(next3old1, upperBound, 14);
boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
next4 = _mm256_cmp_ps(next3old1, lowerBound, 1);
next4 = _mm256_and_ps(next4, negBoundAdjust);
boundAdjust = _mm256_or_ps(next4, boundAdjust);
// Make sure we're in the bounding interval:
next3old1 = _mm256_add_ps(next3old1, boundAdjust);
_mm256_store_ps(outPtr,next3old1); // Store the results back into the output
outPtr += 8;
}

for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) {
*outPtr = *(inPtr) - *(inPtr-1);
if (*outPtr > bound) *outPtr -= 2*bound;
if (*outPtr < -bound) *outPtr += 2*bound;
inPtr++;
outPtr++;
}

*saveValue = inputVector[num_points-1];
}
#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

Expand Down Expand Up @@ -159,3 +227,80 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector,


#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */


#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H

#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
if (num_points < 1) {
return;
}
unsigned int number = 1;
unsigned int j = 0;
// num_points-1 keeps Fedora 7's gcc from crashing...
// num_points won't work. :(
const unsigned int eighthPoints = (num_points-1) / 8;

float* outPtr = outputVector;
const float* inPtr = inputVector;
__m256 upperBound = _mm256_set1_ps(bound);
__m256 lowerBound = _mm256_set1_ps(-bound);
__m256 next3old1;
__m256 next4;
__m256 boundAdjust;
__m256 posBoundAdjust = _mm256_set1_ps(-2*bound); // Subtract when we're above.
__m256 negBoundAdjust = _mm256_set1_ps(2*bound); // Add when we're below.
// Do the first 8 by hand since we're going in from the saveValue:
*outPtr = *inPtr - *saveValue;
if (*outPtr > bound) *outPtr -= 2*bound;
if (*outPtr < -bound) *outPtr += 2*bound;
inPtr++;
outPtr++;
for (j = 1; j < ( (8 < num_points) ? 8 : num_points); j++) {
*outPtr = *(inPtr) - *(inPtr-1);
if (*outPtr > bound) *outPtr -= 2*bound;
if (*outPtr < -bound) *outPtr += 2*bound;
inPtr++;
outPtr++;
}

for (; number < eighthPoints; number++) {
// Load data
next3old1 = _mm256_loadu_ps((float*) (inPtr-1));
next4 = _mm256_loadu_ps(inPtr);
inPtr += 8;
// Subtract and store:
next3old1 = _mm256_sub_ps(next4, next3old1);
// Bound:
boundAdjust = _mm256_cmp_ps(next3old1, upperBound, 14);
boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
next4 = _mm256_cmp_ps(next3old1, lowerBound, 1);
next4 = _mm256_and_ps(next4, negBoundAdjust);
boundAdjust = _mm256_or_ps(next4, boundAdjust);
// Make sure we're in the bounding interval:
next3old1 = _mm256_add_ps(next3old1, boundAdjust);
_mm256_storeu_ps(outPtr,next3old1); // Store the results back into the output
outPtr += 8;
}

for (number = (8 > (eighthPoints*8) ? 8 : (8 * eighthPoints)); number < num_points; number++) {
*outPtr = *(inPtr) - *(inPtr-1);
if (*outPtr > bound) *outPtr -= 2*bound;
if (*outPtr < -bound) *outPtr += 2*bound;
inPtr++;
outPtr++;
}

*saveValue = inputVector[num_points-1];
}
#endif /* LV_HAVE_AVX */


#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H */
34 changes: 31 additions & 3 deletions kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,21 @@
* Boston, MA 02110-1301, USA.
*/

#ifndef INCLUDED_volk_32f_x2_fm_detectpuppet_32f_H
#define INCLUDED_volk_32f_x2_fm_detectpuppet_32f_H
#ifndef INCLUDED_volk_32f_x2_fm_detectpuppet_32f_a_H
#define INCLUDED_volk_32f_x2_fm_detectpuppet_32f_a_H

#include "volk_32f_s32f_32f_fm_detect_32f.h"

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
{
const float bound = 1.0f;

volk_32f_s32f_32f_fm_detect_32f_a_avx(outputVector, inputVector, bound, saveValue, num_points);
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
Expand All @@ -48,4 +58,22 @@ static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector,
#endif /* LV_HAVE_GENERIC */


#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_H */
#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_a_H */


#ifndef INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H
#define INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H

#include "volk_32f_s32f_32f_fm_detect_32f.h"

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector, const float* inputVector, float* saveValue, unsigned int num_points)
{
const float bound = 1.0f;

volk_32f_s32f_32f_fm_detect_32f_u_avx(outputVector, inputVector, bound, saveValue, num_points);
}
#endif /* LV_HAVE_AVX */
#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H */
75 changes: 71 additions & 4 deletions kernels/volk/volk_8u_conv_k7_r2puppet_8u.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,10 +109,6 @@ static inline int chainback_viterbi(unsigned char* data,
#include <mmintrin.h>
#include <stdio.h>





static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsigned char* dec, unsigned int framebits) {


Expand Down Expand Up @@ -181,6 +177,77 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, unsig
#endif /*LV_HAVE_SSE3*/


#if LV_HAVE_AVX2

#include <immintrin.h>
#include <stdio.h>

static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms, unsigned char* dec, unsigned int framebits) {


static int once = 1;
int d_numstates = (1 << 6);
int rate = 2;
static unsigned char* D;
static unsigned char* Y;
static unsigned char* X;
static unsigned int excess = 6;
static unsigned char* Branchtab;
static unsigned char Partab[256];

int d_polys[2] = {79, 109};


if(once) {

X = (unsigned char*)volk_malloc(2*d_numstates, volk_get_alignment());
Y = X + d_numstates;
Branchtab = (unsigned char*)volk_malloc(d_numstates/2*rate, volk_get_alignment());
D = (unsigned char*)volk_malloc((d_numstates/8) * (framebits + 6), volk_get_alignment());
int state, i;
int cnt,ti;

/* Initialize parity lookup table */
for(i=0;i<256;i++){
cnt = 0;
ti = i;
while(ti){
if(ti & 1)
cnt++;
ti >>= 1;
}
Partab[i] = cnt & 1;
}
/* Initialize the branch table */
for(state=0;state < d_numstates/2;state++){
for(i=0; i<rate; i++){
Branchtab[i*d_numstates/2+state] = (d_polys[i] < 0) ^ parity((2*state) & abs(d_polys[i]), Partab) ? 255 : 0;
}
}

once = 0;
}

//unbias the old_metrics
memset(X, 31, d_numstates);

volk_8u_x4_conv_k7_r2_8u_avx2(Y, X, syms, D, framebits/2 - excess, excess, Branchtab);

unsigned int min = X[0];
int i = 0, state = 0;
for(i = 0; i < (d_numstates); ++i) {
if(X[i] < min) {
min = X[i];
state = i;
}
}

chainback_viterbi(dec, framebits/2 -excess, state, excess, D);

return;
}

#endif /*LV_HAVE_AVX2*/



Expand Down
Loading

0 comments on commit 9dc890a

Please sign in to comment.