Merge pull request #620 from ThomasHabets/volk_32fc_x2_dot_prod_32fc_…

…sifive_u74 Add volk_32fc_x2_dot_prod_32fc_sifive_u74
gnuradio · Apr 17, 2023 · 73c2580 · 73c2580
2 parents 4992330 + 5982821
commit 73c2580
Show file tree

Hide file tree

Showing 2 changed files with 108 additions and 0 deletions.
diff --git a/kernels/volk/asm/riscv/volk_32fc_x2_dot_prod_32fc_sifive_u74.s b/kernels/volk/asm/riscv/volk_32fc_x2_dot_prod_32fc_sifive_u74.s
@@ -0,0 +1,101 @@
+        .text
+        .align 2
+        .type   volk_32fc_x2_dot_prod_32fc_sifive_u74, @function
+        .global volk_32fc_x2_dot_prod_32fc_sifive_u74
+
+        # Complex dot product: *out = sum of in[i] * taps[i] over all points.
+        # RV64 implementation using only I and F sets. Leaf function with
+        # no stack use; clobbers a1-a5, ft0-ft11 and fa0-fa3.
+        #
+        # About 41% less CPU use than GCC, measured with volk_profile,
+        # and a test gnuradio graph using Freq XLAT FIR filter.
+        # The generic C code is also 2x unrolled, but its main flaw
+        # seems to be not properly fusing into fmadd and fnmsub.
+        #
+        # Focus of this hand coded assembly:
+        # * Better use of fused multiply.
+        # * Try to maximize space between write and read.
+        #
+        # Instruction order has been done manually and benchmarked,
+        # and may not be optimal.
+volk_32fc_x2_dot_prod_32fc_sifive_u74:
+        # a0: out  (real float at offset 0, imaginary float at offset 4)
+        # a1: in   (same layout, 8 bytes per point)
+        # a2: taps (same layout, 8 bytes per point)
+        # a3: number of points (unsigned int)
+
+        # Calculate end of main loop.
+        slli    a3,a3,32                # zero-extend the count: RV64 passes
+        srli    a3,a3,32                # unsigned int sign-extended.
+        andi    a4,a3,-2                # points handled in pairs (round down)
+        slli    a5,a4,3                 # 8 bytes per complex point
+        add     a5,a5,a1
+
+        # Output regs: ft0-ft3 first point of a pair, ft4-ft7 second and tail.
+        fmv.w.x ft0,zero
+        fmv.w.x ft1,zero
+        fmv.w.x ft2,zero
+        fmv.w.x ft3,zero
+        fmv.w.x ft4,zero
+        fmv.w.x ft5,zero
+        fmv.w.x ft6,zero
+        fmv.w.x ft7,zero
+        beq     a1,a5,.Lendloop         # fewer than two points: skip loop
+
+        # Main loop two complexes at a time.
+.Lloop:
+        # Load input in order of when it'll be used.
+        # flw has 2 cycle latency, 1 cycle repeat.
+        flw     ft8,0(a1)               # in0
+        flw     ft9,0(a2)               # tp0
+        flw     ft10,4(a2)              # tp1
+        flw     ft11,4(a1)              # in1
+
+        # None of the fused multiply-adds have a write-read stall.
+        # FMA, like mul and add, have 5 cycle latency, 1 cycle repeat.
+        fmadd.s  ft0,ft8, ft9, ft0      # in0*tp0
+        flw      fa0,8(a1)              # in0
+        fmadd.s  ft1,ft8, ft10,ft1      # in0*tp1
+        flw      fa1,8(a2)              # tp0
+        fnmsub.s ft2,ft11,ft10,ft2      # -in1*tp1
+        flw      fa2,12(a2)             # tp1
+        fmadd.s  ft3,ft11,ft9, ft3      # in1*tp0
+        flw      fa3,12(a1)             # in1
+
+        fmadd.s  ft4,fa0,fa1,ft4        # in0*tp0
+        addi     a1,a1,16               # free ride in pipeline A.
+        fmadd.s  ft5,fa0,fa2,ft5        # in0*tp1
+        addi     a2,a2,16               # free ride in pipeline A.
+        fnmsub.s ft6,fa3,fa2,ft6        # -in1*tp1
+        fmadd.s  ft7,fa3,fa1,ft7        # in1*tp0
+        bne      a1,a5,.Lloop
+
+.Lendloop:
+        # Check if odd number of inputs.
+        andi    a3,a3,1
+        beqz    a3,.Ldone
+
+        # Do odd one complex.
+        flw     fa0,0(a1) # in0
+        flw     fa1,0(a2) # tp0
+        flw     fa2,4(a2) # tp1
+        flw     fa3,4(a1) # in1
+
+        fmadd.s  ft4,fa0,fa1,ft4   # in0*tp0
+        fmadd.s  ft5,fa0,fa2,ft5   # in0*tp1
+        fnmsub.s ft6,fa3,fa2,ft6   # -in1*tp1
+        fmadd.s  ft7,fa3,fa1,ft7   # in1*tp0
+.Ldone:
+        # Some one-time stalling here: fadd has latency 5, repeat 1.
+        fadd.s  ft0,ft0,ft2
+        fadd.s  ft1,ft1,ft3
+        fadd.s  ft0,ft0,ft4
+        fadd.s  ft1,ft1,ft5
+        fadd.s  ft0,ft0,ft6
+        fadd.s  ft1,ft1,ft7
+        # fsw has latency 4, repeat 1.
+        fsw     ft0,0(a0)
+        fsw     ft1,4(a0)
+        ret
+
+        .size volk_32fc_x2_dot_prod_32fc_sifive_u74, .-volk_32fc_x2_dot_prod_32fc_sifive_u74
diff --git a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -51,6 +51,13 @@
 #include <volk/volk_complex.h>
 
 
+#ifdef LV_HAVE_RISCV64 /* prototype for asm/riscv/volk_32fc_x2_dot_prod_32fc_sifive_u74.s */
+extern void volk_32fc_x2_dot_prod_32fc_sifive_u74(lv_32fc_t* result,
+                                                  const lv_32fc_t* input,
+                                                  const lv_32fc_t* taps,
+                                                  unsigned int num_points);
+#endif /* LV_HAVE_RISCV64 */
+
 #ifdef LV_HAVE_GENERIC