Skip to content

Commit

Permalink
Merge pull request #774 from camel-cdr/main
Browse files Browse the repository at this point in the history
Implement RISC-V Vector 1.0 kernels
  • Loading branch information
jdemel authored Nov 3, 2024
2 parents 7fdde55 + 8e88f1e commit 5aaac67
Show file tree
Hide file tree
Showing 147 changed files with 5,396 additions and 116 deletions.
55 changes: 55 additions & 0 deletions .github/workflows/run-tests-rvv.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#
# Copyright 2020 - 2022 Free Software Foundation, Inc.
#
# This file is part of VOLK
#
# SPDX-License-Identifier: LGPL-3.0-or-later
#

name: Run VOLK tests on different RVV configurations

on: [push, pull_request]

jobs:
Tests:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Install packages
run: |
sudo apt-get update -q -y
sudo apt-get install -y python3-mako cmake qemu-user-static g++-14-riscv64-linux-gnu clang-18
mkdir build
cd build
- name: Test gcc-14 VLEN=128
run: |
cd build; rm -rf *
CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=128 \
cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake ..
make -j$(nproc)
ARGS=-V make test
- name: Test gcc-14 VLEN=256
run: |
cd build; rm -rf *
CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=256 \
cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release
make -j$(nproc)
ARGS=-V make test
- name: Test clang-18 VLEN=512
run: |
cd build; rm -rf *
CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=512 \
cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake ..
make -j$(nproc)
ARGS=-V make test
- name: Test clang-18 VLEN=1024
run: |
cd build; rm -rf *
CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=1024 \
cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release
make -j$(nproc)
ARGS=-V make test
5 changes: 5 additions & 0 deletions cmake/Checks/check-rvv-intrinsics.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#if (__riscv_v_intrinsic >= 1000000 || __clang_major__ >= 18 || __GNUC__ >= 14)
int main() { return 0; }
#else
#error "rvv intrinsics aren't supported"
#endif
34 changes: 34 additions & 0 deletions cmake/Toolchains/rv64gcv-linux-gnu.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#
# Copyright 2024 Free Software Foundation, Inc.
#
# This file is part of VOLK
#
# SPDX-License-Identifier: LGPL-3.0-or-later
#

set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR riscv64)

set(CMAKE_C_COMPILER $ENV{CC})
set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER})
set(CMAKE_CXX_COMPILER $ENV{CXX})

set(CMAKE_C_FLAGS "$ENV{CFLAGS} -march=rv64gcv" CACHE STRING "" FORCE)
set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS} CACHE STRING "" FORCE)
set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -g" CACHE STRING "" FORCE)

set(CMAKE_OBJCOPY
${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}objcopy
CACHE INTERNAL "objcopy tool")
set(CMAKE_SIZE_UTIL
${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}size
CACHE INTERNAL "size tool")

set(CMAKE_FIND_ROOT_PATH ${BINUTILS_PATH})

set(QEMU_VLEN $ENV{VLEN})
if(NOT QEMU_VLEN)
set(QEMU_VLEN "128")
endif()

set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-riscv64-static -L /usr/riscv64-linux-gnu/ -cpu rv64,zba=true,zbb=true,v=on,vlen=${QEMU_VLEN},rvv_ta_all_1s=on,rvv_ma_all_1s=on")
44 changes: 44 additions & 0 deletions gen/archs.xml
Original file line number Diff line number Diff line change
Expand Up @@ -181,4 +181,48 @@ at the top, as a last resort.
<arch name="riscv64">
</arch>

<!-->
tmpl/ currently assumes that every arch.name starting with "rv" requires
RVV intrinsics
</-->
<!-->
There is currently no mechanism in RISC-V to append extensions,
so each arch needs to specify all of them, and the order needs in the
machine definition needs to be from the fewest to the most extensions.
Fortunately, this maps quite well to the profiles concept.
</-->
<arch name="rvv">
<check name="V"></check>
<flag compiler="gnu">-march=rv64gcv</flag>
<flag compiler="clang">-march=rv64gcv</flag>
</arch>

<arch name="rvvseg">
<check name="V"></check>
<flag compiler="gnu">-march=rv64gcv</flag>
<flag compiler="clang">-march=rv64gcv</flag>
<!-->
It's unclear how performance portable segmented load/stores are, so the
default rvv implementations avoid using them.
This is a pseudo arch for separate segmented load/store implementations,
and is expected to never be used standalone without "rvv".
</-->
</arch>

<!-->
google/cpu_features currently doesn't support these extensions and profiles.
</-->
<!--arch name="rva22v">
<check name="V"></check>
<check name="B"></check>
<flag compiler="gnu">-march=rv64gcv_zba_zbb_zbs</flag>
<flag compiler="clang">-march=rv64gcv_zba_zbb_zbs</flag>
</arch-->

<!--arch name="rva23">
<check name="rva23"></check>
<flag compiler="gnu">-march=rva23u64</flag>
<flag compiler="clang">-march=rva23u64</flag>
</arch-->

</grammar>
12 changes: 12 additions & 0 deletions gen/machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,18 @@
<archs>generic riscv64 orc|</archs>
</machine>

<machine name="rv64gcv">
<archs>generic riscv64 rvv rvvseg orc|</archs>
</machine>

<!--machine name="rva22v">
<archs>generic riscv64 rvv rvvseg rva22v orc|</archs>
</machine-->

<!--machine name="rva23">
<archs>generic riscv64 rvv rvvseg rva22v rva23 orc|</archs>
</machine-->

<machine name="sse4_a">
<archs>generic 32|64| mmx| sse sse2 sse3 sse4_a popcount orc|</archs>
</machine>
Expand Down
77 changes: 77 additions & 0 deletions include/volk/volk_rvv_intrinsics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/* -*- c++ -*- */
/*
* Copyright 2024 Free Software Foundation, Inc.
*
* This file is part of VOLK
*
* SPDX-License-Identifier: LGPL-3.0-or-later
*/

/*
* This file is intended to hold RVV intrinsics of intrinsics.
* They should be used in VOLK kernels to avoid copy-paste.
*/

#ifndef INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_
#include <riscv_vector.h>

#define RISCV_SHRINK2(op, T, S, v) \
__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \
__riscv_vget_##T##S##m1(v, 1), \
__riscv_vsetvlmax_e##S##m1())

#define RISCV_SHRINK4(op, T, S, v) \
__riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \
__riscv_vget_##T##S##m1(v, 1), \
__riscv_vsetvlmax_e##S##m1()), \
__riscv_##op(__riscv_vget_##T##S##m1(v, 2), \
__riscv_vget_##T##S##m1(v, 3), \
__riscv_vsetvlmax_e##S##m1()), \
__riscv_vsetvlmax_e##S##m1())

#define RISCV_SHRINK8(op, T, S, v) \
__riscv_##op(__riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \
__riscv_vget_##T##S##m1(v, 1), \
__riscv_vsetvlmax_e##S##m1()), \
__riscv_##op(__riscv_vget_##T##S##m1(v, 2), \
__riscv_vget_##T##S##m1(v, 3), \
__riscv_vsetvlmax_e##S##m1()), \
__riscv_vsetvlmax_e##S##m1()), \
__riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 4), \
__riscv_vget_##T##S##m1(v, 5), \
__riscv_vsetvlmax_e##S##m1()), \
__riscv_##op(__riscv_vget_##T##S##m1(v, 6), \
__riscv_vget_##T##S##m1(v, 7), \
__riscv_vsetvlmax_e##S##m1()), \
__riscv_vsetvlmax_e##S##m1()), \
__riscv_vsetvlmax_e##S##m1())

#define RISCV_PERM4(f, v, vidx) \
__riscv_vcreate_v_u8m1_u8m4( \
f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \
f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \
f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \
f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1()))

#define RISCV_LUT4(f, vtbl, v) \
__riscv_vcreate_v_u8m1_u8m4( \
f(vtbl, __riscv_vget_u8m1(v, 0), __riscv_vsetvlmax_e8m1()), \
f(vtbl, __riscv_vget_u8m1(v, 1), __riscv_vsetvlmax_e8m1()), \
f(vtbl, __riscv_vget_u8m1(v, 2), __riscv_vsetvlmax_e8m1()), \
f(vtbl, __riscv_vget_u8m1(v, 3), __riscv_vsetvlmax_e8m1()))

#define RISCV_PERM8(f, v, vidx) \
__riscv_vcreate_v_u8m1_u8m8( \
f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \
f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \
f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \
f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1()), \
f(__riscv_vget_u8m1(v, 4), vidx, __riscv_vsetvlmax_e8m1()), \
f(__riscv_vget_u8m1(v, 5), vidx, __riscv_vsetvlmax_e8m1()), \
f(__riscv_vget_u8m1(v, 6), vidx, __riscv_vsetvlmax_e8m1()), \
f(__riscv_vget_u8m1(v, 7), vidx, __riscv_vsetvlmax_e8m1()))

#define RISCV_VMFLTZ(T, v, vl) __riscv_vmslt(__riscv_vreinterpret_i##T(v), 0, vl)

#endif /* INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ */
61 changes: 61 additions & 0 deletions kernels/volk/volk_16i_32fc_dot_prod_32fc.h
Original file line number Diff line number Diff line change
Expand Up @@ -668,5 +668,66 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,

#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_16i_32fc_dot_prod_32fc_rvv(lv_32fc_t* result,
const short* input,
const lv_32fc_t* taps,
unsigned int num_points)
{
vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
vfloat32m4_t vsumi = vsumr;
size_t n = num_points;
for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
vl = __riscv_vsetvl_e32m4(n);
vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)taps, vl);
vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
vfloat32m4_t v =
__riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
}
size_t vl = __riscv_vsetvlmax_e32m1();
vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
*result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
__riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /*LV_HAVE_RVV*/

#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_16i_32fc_dot_prod_32fc_rvvseg(lv_32fc_t* result,
const short* input,
const lv_32fc_t* taps,
unsigned int num_points)
{
vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
vfloat32m4_t vsumi = vsumr;
size_t n = num_points;
for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
vl = __riscv_vsetvl_e32m4(n);
vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl);
vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
vfloat32m4_t v =
__riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
}
size_t vl = __riscv_vsetvlmax_e32m1();
vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
*result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
__riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /*LV_HAVE_RVVSEG*/

#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
4 changes: 4 additions & 0 deletions kernels/volk/volk_16i_branch_4_state_8.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
/*!
* \page volk_16i_branch_4_state_8
*
* \b Deprecation
*
* This kernel is deprecated.
*
* \b Overview
*
* <FIXME>
Expand Down
15 changes: 15 additions & 0 deletions kernels/volk/volk_16i_convert_8i.h
Original file line number Diff line number Diff line change
Expand Up @@ -275,5 +275,20 @@ static inline void volk_16i_convert_8i_neon(int8_t* outputVector,
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void volk_16i_convert_8i_rvv(int8_t* outputVector,
const int16_t* inputVector,
unsigned int num_points)
{
size_t n = num_points;
for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
vl = __riscv_vsetvl_e16m8(n);
vint16m8_t v = __riscv_vle16_v_i16m8(inputVector, vl);
__riscv_vse8(outputVector, __riscv_vnsra(v, 8, vl), vl);
}
}
#endif /*LV_HAVE_RVV*/

#endif /* INCLUDED_volk_16i_convert_8i_a_H */
4 changes: 4 additions & 0 deletions kernels/volk/volk_16i_max_star_16i.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
/*!
* \page volk_16i_max_star_16i
*
* \b Deprecation
*
* This kernel is deprecated.
*
* \b Overview
*
* <FIXME>
Expand Down
4 changes: 4 additions & 0 deletions kernels/volk/volk_16i_max_star_horizontal_16i.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
/*!
* \page volk_16i_max_star_horizontal_16i
*
* \b Deprecation
*
* This kernel is deprecated.
*
* \b Overview
*
* <FIXME>
Expand Down
4 changes: 4 additions & 0 deletions kernels/volk/volk_16i_permute_and_scalar_add.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
/*!
* \page volk_16i_permute_and_scalar_add
*
* \b Deprecation
*
* This kernel is deprecated.
*
* \b Overview
*
* <FIXME>
Expand Down
17 changes: 17 additions & 0 deletions kernels/volk/volk_16i_s32f_convert_32f.h
Original file line number Diff line number Diff line change
Expand Up @@ -483,4 +483,21 @@ static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector,
}
#endif /* LV_HAVE_SSE */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void volk_16i_s32f_convert_32f_rvv(float* outputVector,
const int16_t* inputVector,
const float scalar,
unsigned int num_points)
{
size_t n = num_points;
for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
vl = __riscv_vsetvl_e16m4(n);
vfloat32m8_t v = __riscv_vfwcvt_f(__riscv_vle16_v_i16m4(inputVector, vl), vl);
__riscv_vse32(outputVector, __riscv_vfmul(v, 1.0f / scalar, vl), vl);
}
}
#endif /*LV_HAVE_RVV*/

#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
Loading

0 comments on commit 5aaac67

Please sign in to comment.