Skip to content

Commit acdc582

Browse files
add backend based on gcc vector extension
1 parent 89facb7 commit acdc582

File tree

7 files changed

+3018
-8
lines changed

7 files changed

+3018
-8
lines changed

CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ project(vs-dfttest2 VERSION 0.1 LANGUAGES CXX)
44

55
set(ENABLE_CUDA ON CACHE BOOL "Whether to compile with CUDA backends")
66
set(ENABLE_CPU ON CACHE BOOL "Whether to compile with x86 backends")
7+
set(ENABLE_GCC OFF CACHE BOOL "Whether to compile with gcc vector extension backends")
78

89
if(NOT CMAKE_BUILD_TYPE)
910
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build" FORCE)
@@ -137,3 +138,7 @@ endif() # ENABLE_CUDA
137138
if(ENABLE_CPU)
138139
add_subdirectory(cpu_source)
139140
endif() # ENABLE_CPU
141+
142+
if(ENABLE_GCC)
143+
add_subdirectory(gcc_source)
144+
endif() # ENABLE_GCC

dfttest2.py

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,11 @@ class NVRTC:
2727
class CPU:
2828
opt: int = 0
2929

30-
backendT = typing.Union[Backend.cuFFT, Backend.NVRTC, Backend.CPU]
30+
@dataclass(frozen=False)
31+
class GCC:
32+
pass
33+
34+
backendT = typing.Union[Backend.cuFFT, Backend.NVRTC, Backend.CPU, Backend.GCC]
3135

3236

3337
def init_backend(backend: backendT) -> backendT:
@@ -37,6 +41,8 @@ def init_backend(backend: backendT) -> backendT:
3741
backend = Backend.NVRTC()
3842
elif backend is Backend.CPU: # type: ignore
3943
backend = Backend.CPU()
44+
elif backend is Backend.GCC: # type: ignore
45+
backend = Backend.GCC()
4046
return backend
4147

4248

@@ -252,7 +258,7 @@ def DFTTest2(
252258
zero_mean = zmean
253259
backend = init_backend(backend)
254260

255-
if isinstance(backend, (Backend.CPU, Backend.NVRTC)):
261+
if isinstance(backend, (Backend.CPU, Backend.NVRTC, Backend.GCC)):
256262
if radius not in range(4):
257263
raise ValueError("invalid radius (tbsize)")
258264
if block_size != 16:
@@ -328,6 +334,8 @@ def DFTTest2(
328334
rdft = core.dfttest2_nvrtc.RDFT
329335
elif isinstance(backend, Backend.CPU):
330336
rdft = core.dfttest2_cpu.RDFT
337+
elif isinstance(backend, Backend.GCC):
338+
rdft = core.dfttest2_gcc.RDFT
331339
else:
332340
raise TypeError("unknown backend")
333341

@@ -358,6 +366,21 @@ def DFTTest2(
358366
window_freq=window_freq,
359367
opt=backend.opt
360368
)
369+
elif isinstance(backend, Backend.GCC):
370+
return core.dfttest2_gcc.DFTTest(
371+
clip,
372+
window=window,
373+
sigma=[sigma_scalar] * (2 * radius + 1) * block_size * (block_size // 2 + 1) if sigma_is_scalar else sigma_array,
374+
sigma2=sigma2,
375+
pmin=pmin,
376+
pmax=pmax,
377+
radius=radius,
378+
block_size=block_size,
379+
block_step=block_step,
380+
planes=planes,
381+
filter_type=filter_type,
382+
window_freq=window_freq
383+
)
361384

362385
if isinstance(backend, Backend.cuFFT):
363386
to_single = core.dfttest2_cuda.ToSingle
@@ -482,8 +505,10 @@ def select_backend(
482505
return Backend.NVRTC()
483506
elif hasattr(core, "dfttest2_cuda"):
484507
return Backend.cuFFT()
485-
else:
508+
elif hasattr(core, "dfttest2_cpu"):
486509
return Backend.CPU()
510+
else:
511+
return Backend.GCC()
487512
else:
488513
return Backend.cuFFT()
489514

@@ -677,12 +702,12 @@ def DFTTest(
677702
678703
backend: Backend implementation to use.
679704
All available backends can be found in the dfttest2.Backend "namespace":
680-
dfttest2.Backend.{CPU, cuFFT, NVRTC}
681-
682-
The CPU and NVRTC backend require sbsize=16.
705+
dfttest2.Backend.{CPU, cuFFT, NVRTC, GCC}
706+
707+
The CPU, NVRTC and GCC backends require sbsize=16.
683708
The cuFFT and NVRTC backends require a CUDA-enabled system.
684-
685-
Speed: NVRTC >> cuFFT > CPU
709+
710+
Speed: NVRTC >> cuFFT > CPU == GCC
686711
"""
687712

688713
if (

gcc_source/CMakeLists.txt

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Build script for the dfttest2 backend based on the gcc vector extension.
#
# Targets defined here:
#   dfttest2_gcc   - loadable MODULE library built from source.cpp
#   getframe_impl  - OBJECT library built from getframe_impl.cpp; its objects
#                    are linked into dfttest2_gcc
#
# VCL_HOME, PKG_CONFIG_FOUND, VS_FOUND, VS_INCLUDE_DIRS, VS_INCLUDE_DIR and
# install_dir are not set in this file; they are presumably provided by the
# parent CMakeLists.txt (TODO confirm).

add_library(dfttest2_gcc MODULE source.cpp)

set_target_properties(dfttest2_gcc PROPERTIES
    CXX_EXTENSIONS OFF
    CXX_STANDARD 20
    CXX_STANDARD_REQUIRED ON
)

target_include_directories(dfttest2_gcc PRIVATE ${VCL_HOME})

# Pick the VapourSynth include path and the install destination depending on
# whether VapourSynth was located through pkg-config.
if(PKG_CONFIG_FOUND AND VS_FOUND)
    target_include_directories(dfttest2_gcc PRIVATE ${VS_INCLUDE_DIRS})
    install(TARGETS dfttest2_gcc LIBRARY DESTINATION ${install_dir})
else()
    target_include_directories(dfttest2_gcc PRIVATE ${VS_INCLUDE_DIR})
    install(TARGETS dfttest2_gcc LIBRARY DESTINATION lib)
endif()

# Expose the parent of this target's binary directory to the plugin sources.
# NOTE(review): presumably this is where a generated header lives - confirm.
target_include_directories(dfttest2_gcc PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/..)


add_library(getframe_impl OBJECT getframe_impl.cpp)

set_target_properties(getframe_impl PROPERTIES
    CXX_EXTENSIONS OFF
    CXX_STANDARD 20
    CXX_STANDARD_REQUIRED ON
    # OBJECT libraries are not position independent by default, but these
    # objects end up inside a MODULE (shared) library, so they must be built
    # as position-independent code to link reliably on ELF platforms.
    POSITION_INDEPENDENT_CODE ON
)

target_include_directories(getframe_impl PRIVATE ${VCL_HOME})

if(PKG_CONFIG_FOUND AND VS_FOUND)
    target_include_directories(getframe_impl PRIVATE ${VS_INCLUDE_DIRS})
else()
    target_include_directories(getframe_impl PRIVATE ${VS_INCLUDE_DIR})
endif()

# Link the object files of getframe_impl into the plugin module.
target_link_libraries(dfttest2_gcc PRIVATE getframe_impl)

gcc_source/dfttest2_cpu.h

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#ifndef DFTTEST2_CPU_H
2+
#define DFTTEST2_CPU_H
3+
4+
#include <array>
5+
#include <atomic>
6+
#include <cstdint>
7+
#include <memory>
8+
#include <shared_mutex>
9+
#include <thread>
10+
#include <unordered_map>
11+
12+
#include <VapourSynth.h>
13+
#include <VSHelper.h>
14+
15+
16+
/// Frees a float buffer by forwarding it to vs_aligned_free().
///
/// Provides a function taking `float *` so that it can be named as the
/// deleter type in `std::unique_ptr<float [], decltype(&vs_aligned_free_float)>`.
///
/// Declared `inline` instead of `static`: this function is defined in a
/// header and its address is used by default member initializers of a struct
/// in the same header. With internal linkage every translation unit would get
/// a distinct function, so the struct definition would refer to a different
/// entity in each translation unit. `inline` yields a single shared entity.
///
/// @param ptr buffer to release; passed to vs_aligned_free() unchanged.
inline void vs_aligned_free_float(float * ptr) {
    vs_aligned_free(static_cast<void *>(ptr));
}
19+
20+
21+
/// Per-thread buffers; DFTTestData keeps one instance per std::thread::id.
///
/// Both pointers default to nullptr so that a default-initialized instance
/// never carries indeterminate pointer values. The struct remains an
/// aggregate, so brace initialization with explicit values still works.
struct DFTTestThreadData {
    uint8_t * padded { nullptr }; // shape: (pad_height, pad_width)
    float * padded2 { nullptr }; // shape: (pad_height, pad_width)
};
25+
26+
27+
struct DFTTestData {
28+
VSNodeRef * node;
29+
int radius;
30+
int block_size;
31+
int block_step;
32+
std::array<bool, 3> process;
33+
bool zero_mean;
34+
std::unique_ptr<float [], decltype(&vs_aligned_free_float)> window { nullptr, &vs_aligned_free_float };
35+
std::unique_ptr<float [], decltype(&vs_aligned_free_float)> window_freq { nullptr, &vs_aligned_free_float };
36+
std::unique_ptr<float [], decltype(&vs_aligned_free_float)> sigma { nullptr, &vs_aligned_free_float };
37+
int filter_type;
38+
float sigma2;
39+
float pmin;
40+
float pmax;
41+
42+
std::atomic<int> num_uninitialized_threads;
43+
std::unordered_map<std::thread::id, DFTTestThreadData> thread_data;
44+
std::shared_mutex thread_data_lock;
45+
};
46+
47+
/// Frame-producing entry point of the filter. Only declared in this header;
/// the definition is not part of it.
/// NOTE(review): the parameter list looks like VapourSynth's get-frame
/// callback signature - confirm against VapourSynth.h.
extern const VSFrameRef * VS_CC DFTTestGetFrame(
    int n,
    int activationReason,
    void ** instanceData,
    void ** frameData,
    VSFrameContext * frameCtx,
    VSCore * core,
    const VSAPI * vsapi
) noexcept;
51+
52+
#endif // DFTTEST2_CPU_H

0 commit comments

Comments
 (0)