Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/cmake-linux-amd64.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
- name: Configure CMake
run: |
IFS="," read -r -a cpp_compiler <<< "${{matrix.cpp_compiler}}"
export PATH=/usr/lib/llvm-21/bin/:$PATH
export PATH=/home/cudeiro/cmake-4.1.2-linux-x86_64/bin/:/usr/lib/llvm-21/bin/:$PATH
export CUDACXX=/usr/local/cuda-${cpp_compiler[2]}/bin/nvcc
cmake -G "Ninja" -B ${{steps.strings.outputs.build-output-dir}} -DCMAKE_CXX_COMPILER="${cpp_compiler[0]}" -DCMAKE_CUDA_COMPILER="${cpp_compiler[1]}" -DCUDAToolkit_ROOT="/usr/local/cuda-${cpp_compiler[2]}" -DCMAKE_BUILD_TYPE="Release" -S ${{github.workspace}}

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/cmake-linux-arm64.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
- name: Configure CMake
run: |
IFS="," read -r -a cpp_compiler <<< "${{matrix.cpp_compiler}}"
export PATH=/usr/lib/llvm-21/bin/:$PATH
export PATH=/home/cudeiro/cmake-4.1.2-linux-aarch64/bin/:/usr/lib/llvm-21/bin/:$PATH
export CUDACXX=/usr/local/cuda-${cpp_compiler[2]}/bin/nvcc
cmake -G "Ninja" -B ${{steps.strings.outputs.build-output-dir}} -DCMAKE_CXX_COMPILER="${cpp_compiler[0]}" -DCMAKE_CUDA_COMPILER="${cpp_compiler[1]}" -DCUDAToolkit_ROOT="/usr/local/cuda-${cpp_compiler[2]}" -DCUDA_ARCH="87;" -DCMAKE_BUILD_TYPE="Release" -S ${{github.workspace}}

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/cmake-windows-amd64.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
matrix:
#host compiler, version, cuda compiler, cuda version,
#disabling ,"llvm,21.1.0,llvm,12.1", not working on windows yet
toolset: ["cl,14.16,nvcc,11.8","cl,14.44,nvcc,12.9"]
toolset: ["cl,14.44,nvcc,12.9"]


steps:
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ function (enable_intellisense TARGET_NAME)
target_compile_definitions(${TARGET_NAME} PRIVATE ENABLE_BENCHMARK)
endif()

set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO)
set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 20 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO)

target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_SOURCE_DIR}")
add_optimization_flags(${TARGET_NAME})
Expand Down
2 changes: 1 addition & 1 deletion cmake/discover_tests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ function (discover_tests DIR)
add_nvtx_support_to_target(${cuda_target})
endif()

set_target_properties(${cuda_target} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO)
set_target_properties(${cuda_target} PROPERTIES CXX_STANDARD 20 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO)
target_include_directories(${cuda_target} PRIVATE "${CMAKE_SOURCE_DIR}/")
target_include_directories(${cuda_target} PRIVATE "${CMAKE_SOURCE_DIR}/include")
#target_link_libraries(${cuda_target} PRIVATE ${PROJECT_NAME})
Expand Down
10 changes: 4 additions & 6 deletions cmake/libs/cuda/target_generation.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ function(set_default_cuda_target_properties TARGET_NAME)
endif()
target_compile_options(${TARGET_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${COMPILER_CUDA_FLAGS}>)

set_target_properties(${TARGET_NAME} PROPERTIES CUDA_STANDARD_REQUIRED ON CUDA_STANDARD 17 CUDA_RUNTIME_LIBRARY
set_target_properties(${TARGET_NAME} PROPERTIES CUDA_STANDARD_REQUIRED ON CUDA_STANDARD 20 CUDA_RUNTIME_LIBRARY
Shared)
set_target_cuda_arch_flags(${TARGET_NAME})

Expand All @@ -16,13 +16,11 @@ function(set_default_cuda_target_properties TARGET_NAME)
#see https://forums.developer.nvidia.com/t/the-cost-of-relocatable-device-code-rdc-true/47665
target_compile_options(${TARGET_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=false>)

#cuda 12 can compile in parallel, so let's use this
if (${CUDA_VERSION_MAJOR} GREATER_EQUAL 12)
#split compile does not work with gpu debug code
if(NOT ${ENABLE_DEBUG})
target_compile_options(${TARGET_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-split-compile=0>)
endif()
if(NOT ${ENABLE_DEBUG})
target_compile_options(${TARGET_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-split-compile=0>)
endif()


if (NOT(${TEMPLATE_DEPTH} STREQUAL "default"))
#bugfix for windows compilation of tests with more than 200 recursions
Expand Down
4 changes: 2 additions & 2 deletions cmake/libs/fkl.cmake
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
set(FKL_VERSION_MAJOR 0)
set(FKL_VERSION_MINOR 1)
set(FKL_VERSION_RELEASE 6)
set(FKL_VERSION_MINOR 2)
set(FKL_VERSION_RELEASE 0)
set(FKL_VERSION ${FKL_VERSION_MAJOR}.${FKL_VERSION_MINOR}.${FKL_VERSION_RELEASE})

list(APPEND CMAKE_PREFIX_PATH "${CMAKE_SOURCE_DIR}/fkl/lib/export")
Expand Down
2 changes: 1 addition & 1 deletion fkl
Submodule fkl updated from 1f3322 to 5d5b1f
2 changes: 1 addition & 1 deletion include/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ add_library(headers INTERFACE ${CUDA_SOURCES})
# Set virtual folders for MSVC
source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${CUDA_SOURCES})
#target_include_directories(include INTERFACE .)
set_target_properties(${cuda_target} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO)
set_target_properties(${cuda_target} PROPERTIES CXX_STANDARD 20 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO)
28 changes: 14 additions & 14 deletions include/fast_npp.cuh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright 2025 Oscar Amoros Huguet
/* Copyright 2025 Oscar Amoros Huguet

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand All @@ -19,10 +19,10 @@
#include <nppi_geometry_transforms.h>

#include <fused_kernel/core/utils/utils.h>
#include <fused_kernel/fused_kernel.cuh>
#include <fused_kernel/algorithms/image_processing/resize.cuh>
#include <fused_kernel/algorithms/basic_ops/cuda_vector.cuh>
#include <fused_kernel/algorithms/basic_ops/arithmetic.cuh>
#include <fused_kernel/fused_kernel.h>
#include <fused_kernel/algorithms/image_processing/resize.h>
#include <fused_kernel/algorithms/basic_ops/vector_ops.h>
#include <fused_kernel/algorithms/basic_ops/arithmetic.h>

namespace fastNPP {

Expand All @@ -34,17 +34,17 @@ namespace fastNPP {
// currently expecting the destination ROI's to be equal to nMaxWidth and nMaxHeight
int currentDevice{ 0 };
gpuErrchk(cudaGetDevice(&currentDevice));
std::array<fk::Ptr2D<uchar3>, BATCH> srcBatch;
std::array<fk::Ptr2D<uchar3>, BATCH> srcBatch;
Copy link

Copilot AI Oct 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Trailing whitespace should be removed from this line.

Suggested change
std::array<fk::Ptr2D<uchar3>, BATCH> srcBatch;
std::array<fk::Ptr2D<uchar3>, BATCH> srcBatch;

Copilot uses AI. Check for mistakes.
for (int i = 0; i < BATCH; ++i) {
srcBatch[i] = fk::Ptr2D<uchar3>(reinterpret_cast<uchar3*>(h_pBatchSrc[i].pData),
h_pBatchSrc[i].oSize.width,
h_pBatchSrc[i].oSize.height,
h_pBatchSrc[i].nStep,
fk::Device, currentDevice);
h_pBatchSrc[i].nStep, fk::MemType::Device,
currentDevice);
Comment on lines +42 to +43
Copy link

Copilot AI Oct 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The line break after the comma creates inconsistent formatting. Consider keeping the parameter list on fewer lines or aligning the continuation properly.

Copilot uses AI. Check for mistakes.
}
const fk::Size dstSize(nMaxWidth, nMaxHeight);
return fk::PerThreadRead<fk::_2D, uchar3>::build(srcBatch)
.then(fk::Resize<fk::INTER_LINEAR>::build(dstSize));
return fk::PerThreadRead<fk::ND::_2D, uchar3>::build(srcBatch)
.then(fk::Resize<fk::InterpolationType::INTER_LINEAR>::build(dstSize));
}

constexpr inline auto SwapChannels_32f_C3R_Ctx(const int(&dstOrder)[3]) {
Expand Down Expand Up @@ -76,20 +76,20 @@ namespace fastNPP {
template <size_t BATCH>
constexpr inline auto CopyBatch_32f_C3P3R_Ctx(const std::array<Npp32f*, BATCH> (&aDst)[3],
const int& nDstStep, const NppiSize& oSizeROI) {
std::array<fk::SplitWriteParams<fk::_2D, float3>, BATCH> params;
std::array<fk::SplitWriteParams<fk::ND::_2D, float3>, BATCH> params;
for (int i = 0; i < BATCH; ++i) {
const uint width = static_cast<uint>(oSizeROI.width);
const uint height = static_cast<uint>(oSizeROI.height);
const uint step = static_cast<uint>(nDstStep);
const fk::PtrDims<fk::_2D> dims{ width, height, step };
const fk::SplitWriteParams<fk::_2D, float3> param{
const fk::PtrDims<fk::ND::_2D> dims{ width, height, step };
const fk::SplitWriteParams<fk::ND::_2D, float3> param{
{reinterpret_cast<float*>(aDst[0][i]), dims},
{reinterpret_cast<float*>(aDst[1][i]), dims},
{reinterpret_cast<float*>(aDst[2][i]), dims}
};
params[i] = param;
}
return fk::SplitWrite<fk::_2D, float3>::build(params);
return fk::SplitWrite<fk::ND::_2D, float3>::build(params);
}

template <typename... IOps>
Expand Down