From bb1ed3ba7cd08f558d9f09aa53b63e51c89d2af5 Mon Sep 17 00:00:00 2001
From: Magnus Lundmark <magnuslundmark@gmail.com>
Date: Tue, 27 Feb 2024 22:15:55 +0100
Subject: [PATCH 1/3] New AVX512F implementation

Signed-off-by: Magnus Lundmark <magnuslundmark@gmail.com>
---
 kernels/volk/volk_32f_reciprocal_32f.h | 189 +++++++++++++++++++++++++
 lib/kernel_tests.h                     |   1 +
 2 files changed, 190 insertions(+)
 create mode 100644 kernels/volk/volk_32f_reciprocal_32f.h
diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h
new file mode 100644
index 00000000..5fba6926
--- /dev/null
+++ b/kernels/volk/volk_32f_reciprocal_32f.h
@@ -0,0 +1,189 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_reciprocal_32f
+ *
+ * \b Overview
+ *
+ * Computes the reciprocal of the input vector and stores the results
+ * in the output vector. For the AVX512F implementation the relative
+ * error is < 2**(-14) = 6.1e-05
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_reciprocal_32f(float* out, const float* in, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li in: A pointer to the input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: A pointer to the output vector of floats.
+ *
+ * \b Example
+ * \code
+    int N = 10;
+    unsigned int alignment = volk_get_alignment();
+    float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+    float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+
+    for(unsigned int ii = 1; ii < N; ++ii){
+        in[ii] = (float)(ii*ii);
+    }
+
+    volk_32f_reciprocal_32f(out, in, N);
+
+    for(unsigned int ii = 0; ii < N; ++ii){
+        printf("out(%i) = %f\n", ii, out[ii]);
+    }
+
+    volk_free(in);
+    volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_reciprocal_32f_a_H
+#define INCLUDED_volk_32f_reciprocal_32f_a_H
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32f_reciprocal_32f_generic(float* out, const float* in, unsigned int num_points)
+{
+    for (unsigned int i = 0; i < num_points; i++) {
+        out[i] = 1.f / in[i];
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+static inline void
+volk_32f_reciprocal_32f_a_sse(float* out, const float* in, unsigned int num_points)
+{
+    const __m128 ONE = _mm_set_ps1(1.f);
+    const unsigned int quarter_points = num_points / 4;
+    for (unsigned int number = 0; number < quarter_points; number++) {
+        __m128 x = _mm_load_ps(in);
+        in += 4;
+        __m128 r = _mm_div_ps(ONE, x);
+        _mm_store_ps(out, r);
+        out += 4;
+    }
+
+    const unsigned int done = quarter_points * 4;
+    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+static inline void
+volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_points)
+{
+    const __m256 ONE = _mm256_set1_ps(1.f);
+    const unsigned int eighth_points = num_points / 8;
+    for (unsigned int number = 0; number < eighth_points; number++) {
+        __m256 x = _mm256_load_ps(in);
+        in += 8;
+        __m256 r = _mm256_div_ps(ONE, x);
+        _mm256_store_ps(out, r);
+        out += 8;
+    }
+
+    const unsigned int done = eighth_points * 8;
+    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+static inline void
+volk_32f_reciprocal_32f_a_avx512(float* out, const float* in, unsigned int num_points)
+{
+    const unsigned int sixteenth_points = num_points / 16;
+    for (unsigned int number = 0; number < sixteenth_points; number++) {
+        __m512 x = _mm512_load_ps(in);
+        in += 16;
+        __m512 r = _mm512_rcp14_ps(x);
+        _mm512_store_ps(out, r);
+        out += 16;
+    }
+
+    const unsigned int done = sixteenth_points * 16;
+    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
+}
+#endif /* LV_HAVE_AVX512F */
+
+#endif /* INCLUDED_volk_32f_reciprocal_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_reciprocal_32f_u_H
+#define INCLUDED_volk_32f_reciprocal_32f_u_H
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+static inline void
+volk_32f_reciprocal_32f_u_sse(float* out, const float* in, unsigned int num_points)
+{
+    const __m128 ONE = _mm_set_ps1(1.f);
+    const unsigned int quarter_points = num_points / 4;
+    for (unsigned int number = 0; number < quarter_points; number++) {
+        __m128 x = _mm_loadu_ps(in);
+        in += 4;
+        __m128 r = _mm_div_ps(ONE, x);
+        _mm_storeu_ps(out, r);
+        out += 4;
+    }
+
+    const unsigned int done = quarter_points * 4;
+    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+static inline void
+volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_points)
+{
+    const __m256 ONE = _mm256_set1_ps(1.f);
+    const unsigned int eighth_points = num_points / 8;
+    for (unsigned int number = 0; number < eighth_points; number++) {
+        __m256 x = _mm256_loadu_ps(in);
+        in += 8;
+        __m256 r = _mm256_div_ps(ONE, x);
+        _mm256_storeu_ps(out, r);
+        out += 8;
+    }
+
+    const unsigned int done = eighth_points * 8;
+    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+static inline void
+volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_points)
+{
+    const unsigned int sixteenth_points = num_points / 16;
+    for (unsigned int number = 0; number < sixteenth_points; number++) {
+        __m512 x = _mm512_loadu_ps(in);
+        in += 16;
+        __m512 r = _mm512_rcp14_ps(x);
+        _mm512_storeu_ps(out, r);
+        out += 16;
+    }
+
+    const unsigned int done = sixteenth_points * 16;
+    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
+}
+#endif /* LV_HAVE_AVX512F */
+
+#endif /* INCLUDED_volk_32f_reciprocal_32f_u_H */
diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h
index 16c79c36..a22a4027 100644
--- a/lib/kernel_tests.h
+++ b/lib/kernel_tests.h
@@ -141,6 +141,7 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
     QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params))
     QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params))
     QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_reciprocal_32f, test_params.make_tol(6.15e-5)))
     QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc))
     QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc))
     QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params.make_absolute(1e-5)))

From 7fcb2054402d2d65632da4c689785ef7fd538275 Mon Sep 17 00:00:00 2001
From: Magnus Lundmark <magnuslundmark@gmail.com>
Date: Tue, 27 Feb 2024 22:38:26 +0100
Subject: [PATCH 2/3] Updated copyright year

Signed-off-by: Magnus Lundmark <magnuslundmark@gmail.com>
---
 kernels/volk/volk_32f_reciprocal_32f.h | 2 +-
 lib/kernel_tests.h                     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h
index 5fba6926..42363d3f 100644
--- a/kernels/volk/volk_32f_reciprocal_32f.h
+++ b/kernels/volk/volk_32f_reciprocal_32f.h
@@ -1,6 +1,6 @@
 /* -*- c++ -*- */
 /*
- * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
+ * Copyright 2024 Magnus Lundmark <magnuslundmark@gmail.com>
  *
  * This file is part of VOLK
  *
diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h
index a22a4027..57a296dc 100644
--- a/lib/kernel_tests.h
+++ b/lib/kernel_tests.h
@@ -1,7 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2014 - 2021 Free Software Foundation, Inc.
- * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
+ * Copyright 2023, 2024 Magnus Lundmark <magnuslundmark@gmail.com>
  *
  * This file is part of VOLK
  *

From 89bfc55317365847360951a468d399ed077b525d Mon Sep 17 00:00:00 2001
From: Magnus Lundmark <magnuslundmark@gmail.com>
Date: Fri, 1 Mar 2024 13:37:29 +0100
Subject: [PATCH 3/3] formatting

Signed-off-by: Magnus Lundmark <magnuslundmark@gmail.com>
---
 kernels/volk/volk_32f_reciprocal_32f.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h
index 42363d3f..37bd16a8 100644
--- a/kernels/volk/volk_32f_reciprocal_32f.h
+++ b/kernels/volk/volk_32f_reciprocal_32f.h
@@ -70,6 +70,7 @@ volk_32f_reciprocal_32f_a_sse(float* out, const float* in, unsigned int num_poin
 {
     const __m128 ONE = _mm_set_ps1(1.f);
     const unsigned int quarter_points = num_points / 4;
+
     for (unsigned int number = 0; number < quarter_points; number++) {
         __m128 x = _mm_load_ps(in);
         in += 4;
@@ -79,6 +80,7 @@ volk_32f_reciprocal_32f_a_sse(float* out, const float* in, unsigned int num_poin
     }
 
     const unsigned int done = quarter_points * 4;
+
     volk_32f_reciprocal_32f_generic(out, in, num_points - done);
 }
 #endif /* LV_HAVE_SSE */
@@ -90,6 +92,7 @@ volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_poin
 {
     const __m256 ONE = _mm256_set1_ps(1.f);
     const unsigned int eighth_points = num_points / 8;
+
     for (unsigned int number = 0; number < eighth_points; number++) {
         __m256 x = _mm256_load_ps(in);
         in += 8;
@@ -99,6 +102,7 @@ volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_poin
     }
 
     const unsigned int done = eighth_points * 8;
+
     volk_32f_reciprocal_32f_generic(out, in, num_points - done);
 }
 #endif /* LV_HAVE_AVX */
@@ -109,6 +113,7 @@ static inline void
 volk_32f_reciprocal_32f_a_avx512(float* out, const float* in, unsigned int num_points)
 {
     const unsigned int sixteenth_points = num_points / 16;
+
     for (unsigned int number = 0; number < sixteenth_points; number++) {
         __m512 x = _mm512_load_ps(in);
         in += 16;
@@ -118,6 +123,7 @@ volk_32f_reciprocal_32f_a_avx512(float* out, const float* in, unsigned int num_p
     }
 
     const unsigned int done = sixteenth_points * 16;
+
     volk_32f_reciprocal_32f_generic(out, in, num_points - done);
 }
 #endif /* LV_HAVE_AVX512F */
@@ -134,6 +140,7 @@ volk_32f_reciprocal_32f_u_sse(float* out, const float* in, unsigned int num_poin
 {
     const __m128 ONE = _mm_set_ps1(1.f);
     const unsigned int quarter_points = num_points / 4;
+
     for (unsigned int number = 0; number < quarter_points; number++) {
         __m128 x = _mm_loadu_ps(in);
         in += 4;
@@ -143,6 +150,7 @@ volk_32f_reciprocal_32f_u_sse(float* out, const float* in, unsigned int num_poin
     }
 
     const unsigned int done = quarter_points * 4;
+
     volk_32f_reciprocal_32f_generic(out, in, num_points - done);
 }
 #endif /* LV_HAVE_SSE */
@@ -154,6 +162,7 @@ volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_poin
 {
     const __m256 ONE = _mm256_set1_ps(1.f);
     const unsigned int eighth_points = num_points / 8;
+
     for (unsigned int number = 0; number < eighth_points; number++) {
         __m256 x = _mm256_loadu_ps(in);
         in += 8;
@@ -163,6 +172,7 @@ volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_poin
     }
 
     const unsigned int done = eighth_points * 8;
+
     volk_32f_reciprocal_32f_generic(out, in, num_points - done);
 }
 #endif /* LV_HAVE_AVX */
@@ -173,6 +183,7 @@ static inline void
 volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_points)
 {
     const unsigned int sixteenth_points = num_points / 16;
+
     for (unsigned int number = 0; number < sixteenth_points; number++) {
         __m512 x = _mm512_loadu_ps(in);
         in += 16;
@@ -182,6 +193,7 @@ volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_p
     }
 
     const unsigned int done = sixteenth_points * 16;
+
     volk_32f_reciprocal_32f_generic(out, in, num_points - done);
 }
 #endif /* LV_HAVE_AVX512F */