From bb1ed3ba7cd08f558d9f09aa53b63e51c89d2af5 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Tue, 27 Feb 2024 22:15:55 +0100 Subject: [PATCH 1/3] New AVX512F implementation Signed-off-by: Magnus Lundmark --- kernels/volk/volk_32f_reciprocal_32f.h | 189 +++++++++++++++++++++++++ lib/kernel_tests.h | 1 + 2 files changed, 190 insertions(+) create mode 100644 kernels/volk/volk_32f_reciprocal_32f.h diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h new file mode 100644 index 00000000..5fba6926 --- /dev/null +++ b/kernels/volk/volk_32f_reciprocal_32f.h @@ -0,0 +1,189 @@ +/* -*- c++ -*- */ +/* + * Copyright 2023 Magnus Lundmark + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +/*! + * \page volk_32f_reciprocal_32f + * + * \b Overview + * + * Computes the reciprocal of the input vector and stores the results + * in the output vector. For the AVX512F implementation the relative + * error is < 2**(-14) = 6.1e-05 + * + * Dispatcher Prototype + * \code + * void volk_32f_reciprocal_32f(float* out, const float* in, unsigned int num_points) + * \endcode + * + * \b Inputs + * \li in: A pointer to the input vector of floats. + * \li num_points: The number of data points. + * + * \b Outputs + * \li bVector: A pointer to the output vector of floats. + * + * \b Example + * \code + int N = 10; + unsigned int alignment = volk_get_alignment(); + float* in = (float*)volk_malloc(sizeof(float)*N, alignment); + float* out = (float*)volk_malloc(sizeof(float)*N, alignment); + + for(unsigned int ii = 1; ii < N; ++ii){ + in[ii] = (float)(ii*ii); + } + + volk_32f_reciprocal_32f(out, in, N); + + for(unsigned int ii = 0; ii < N; ++ii){ + printf("out(%i) = %f\n", ii, out[ii]); + } + + volk_free(in); + volk_free(out); + * \endcode + */ + +#ifndef INCLUDED_volk_32f_reciprocal_32f_a_H +#define INCLUDED_volk_32f_reciprocal_32f_a_H + +#ifdef LV_HAVE_GENERIC +static inline void +volk_32f_reciprocal_32f_generic(float* out, const float* in, unsigned int num_points) +{ + for (unsigned int i = 0; i < num_points; i++) { + out[i] = 1.f / in[i]; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_SSE +#include +static inline void +volk_32f_reciprocal_32f_a_sse(float* out, const float* in, unsigned int num_points) +{ + const __m128 ONE = _mm_set_ps1(1.f); + const unsigned int quarter_points = num_points / 4; + for (unsigned int number = 0; number < quarter_points; number++) { + __m128 x = _mm_load_ps(in); + in += 4; + __m128 r = _mm_div_ps(ONE, x); + _mm_store_ps(out, r); + out += 4; + } + + const unsigned int done = quarter_points * 4; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include +static inline void +volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_points) +{ + const __m256 ONE = _mm256_set1_ps(1.f); + const unsigned int eighth_points = num_points / 8; + for (unsigned int number = 0; number < eighth_points; number++) { + __m256 x = _mm256_load_ps(in); + in += 8; + __m256 r = _mm256_div_ps(ONE, x); + _mm256_store_ps(out, r); + out += 8; + } + + const unsigned int done = eighth_points * 8; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_AVX512F +#include +static inline void +volk_32f_reciprocal_32f_a_avx512(float* out, const float* in, unsigned int num_points) +{ + const unsigned int sixteenth_points = num_points / 16; + for (unsigned int number = 0; number < sixteenth_points; number++) { + __m512 x = _mm512_load_ps(in); + in += 16; + __m512 r = _mm512_rcp14_ps(x); + _mm512_store_ps(out, r); + out += 16; + } + + const unsigned int done = sixteenth_points * 16; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_AVX512F */ + +#endif /* INCLUDED_volk_32f_reciprocal_32f_a_H */ + +#ifndef INCLUDED_volk_32f_reciprocal_32f_u_H +#define INCLUDED_volk_32f_reciprocal_32f_u_H + +#ifdef LV_HAVE_SSE +#include +static inline void +volk_32f_reciprocal_32f_u_sse(float* out, const float* in, unsigned int num_points) +{ + const __m128 ONE = _mm_set_ps1(1.f); + const unsigned int quarter_points = num_points / 4; + for (unsigned int number = 0; number < quarter_points; number++) { + __m128 x = _mm_loadu_ps(in); + in += 4; + __m128 r = _mm_div_ps(ONE, x); + _mm_storeu_ps(out, r); + out += 4; + } + + const unsigned int done = quarter_points * 4; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include +static inline void +volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_points) +{ + const __m256 ONE = _mm256_set1_ps(1.f); + const unsigned int eighth_points = num_points / 8; + for (unsigned int number = 0; number < eighth_points; number++) { + __m256 x = _mm256_loadu_ps(in); + in += 8; + __m256 r = _mm256_div_ps(ONE, x); + _mm256_storeu_ps(out, r); + out += 8; + } + + const unsigned int done = eighth_points * 8; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_AVX512F +#include +static inline void +volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_points) +{ + const unsigned int sixteenth_points = num_points / 16; + for (unsigned int number = 0; number < sixteenth_points; number++) { + __m512 x = _mm512_loadu_ps(in); + in += 16; + __m512 r = _mm512_rcp14_ps(x); + _mm512_storeu_ps(out, r); + out += 16; + } + + const unsigned int done = sixteenth_points * 16; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_AVX512F */ + +#endif /* INCLUDED_volk_32f_reciprocal_32f_u_H */ diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h index 16c79c36..a22a4027 100644 --- a/lib/kernel_tests.h +++ b/lib/kernel_tests.h @@ -141,6 +141,7 @@ std::vector init_test_list(volk_test_params_t test_params) QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params)) QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params)) QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_reciprocal_32f, test_params.make_tol(6.15e-5))) QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc)) QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc)) QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params.make_absolute(1e-5))) From 7fcb2054402d2d65632da4c689785ef7fd538275 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Tue, 27 Feb 2024 22:38:26 +0100 Subject: [PATCH 2/3] Updated copyright year Signed-off-by: Magnus Lundmark --- kernels/volk/volk_32f_reciprocal_32f.h | 2 +- lib/kernel_tests.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h index 5fba6926..42363d3f 100644 --- a/kernels/volk/volk_32f_reciprocal_32f.h +++ b/kernels/volk/volk_32f_reciprocal_32f.h @@ -1,6 +1,6 @@ /* -*- c++ -*- */ /* - * Copyright 2023 Magnus Lundmark + * Copyright 2024 Magnus Lundmark * * This file is part of VOLK * diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h index a22a4027..57a296dc 100644 --- a/lib/kernel_tests.h +++ b/lib/kernel_tests.h @@ -1,7 +1,7 @@ /* -*- c++ -*- */ /* * Copyright 2014 - 2021 Free Software Foundation, Inc. - * Copyright 2023 Magnus Lundmark + * Copyright 2023, 2024 Magnus Lundmark * * This file is part of VOLK * From 89bfc55317365847360951a468d399ed077b525d Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Fri, 1 Mar 2024 13:37:29 +0100 Subject: [PATCH 3/3] formatting Signed-off-by: Magnus Lundmark --- kernels/volk/volk_32f_reciprocal_32f.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h index 42363d3f..37bd16a8 100644 --- a/kernels/volk/volk_32f_reciprocal_32f.h +++ b/kernels/volk/volk_32f_reciprocal_32f.h @@ -70,6 +70,7 @@ volk_32f_reciprocal_32f_a_sse(float* out, const float* in, unsigned int num_poin { const __m128 ONE = _mm_set_ps1(1.f); const unsigned int quarter_points = num_points / 4; + for (unsigned int number = 0; number < quarter_points; number++) { __m128 x = _mm_load_ps(in); in += 4; @@ -79,6 +80,7 @@ volk_32f_reciprocal_32f_a_sse(float* out, const float* in, unsigned int num_poin } const unsigned int done = quarter_points * 4; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); } #endif /* LV_HAVE_SSE */ @@ -90,6 +92,7 @@ volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_poin { const __m256 ONE = _mm256_set1_ps(1.f); const unsigned int eighth_points = num_points / 8; + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_load_ps(in); in += 8; @@ -99,6 +102,7 @@ volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_poin } const unsigned int done = eighth_points * 8; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); } #endif /* LV_HAVE_AVX */ @@ -109,6 +113,7 @@ static inline void volk_32f_reciprocal_32f_a_avx512(float* out, const float* in, unsigned int num_points) { const unsigned int sixteenth_points = num_points / 16; + for (unsigned int number = 0; number < sixteenth_points; number++) { __m512 x = _mm512_load_ps(in); in += 16; @@ -118,6 +123,7 @@ volk_32f_reciprocal_32f_a_avx512(float* out, const float* in, unsigned int num_p } const unsigned int done = sixteenth_points * 16; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); } #endif /* LV_HAVE_AVX512F */ @@ -134,6 +140,7 @@ volk_32f_reciprocal_32f_u_sse(float* out, const float* in, unsigned int num_poin { const __m128 ONE = _mm_set_ps1(1.f); const unsigned int quarter_points = num_points / 4; + for (unsigned int number = 0; number < quarter_points; number++) { __m128 x = _mm_loadu_ps(in); in += 4; @@ -143,6 +150,7 @@ volk_32f_reciprocal_32f_u_sse(float* out, const float* in, unsigned int num_poin } const unsigned int done = quarter_points * 4; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); } #endif /* LV_HAVE_SSE */ @@ -154,6 +162,7 @@ volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_poin { const __m256 ONE = _mm256_set1_ps(1.f); const unsigned int eighth_points = num_points / 8; + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_loadu_ps(in); in += 8; @@ -163,6 +172,7 @@ volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_poin } const unsigned int done = eighth_points * 8; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); } #endif /* LV_HAVE_AVX */ @@ -173,6 +183,7 @@ static inline void volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_points) { const unsigned int sixteenth_points = num_points / 16; + for (unsigned int number = 0; number < sixteenth_points; number++) { __m512 x = _mm512_loadu_ps(in); in += 16; @@ -182,6 +193,7 @@ volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_p } const unsigned int done = sixteenth_points * 16; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); } #endif /* LV_HAVE_AVX512F */