Libraries-Openly-Fused · albertandaluz · Oct 30, 2025 · Oct 30, 2025 · Oct 30, 2025 · Oct 30, 2025
diff --git a/.github/workflows/cmake-linux-amd64.yml b/.github/workflows/cmake-linux-amd64.yml
@@ -31,7 +31,7 @@ jobs:
     - name: Configure CMake      
       run: |          
         IFS="," read -r -a cpp_compiler <<< "${{matrix.cpp_compiler}}"   
-        export PATH=/usr/lib/llvm-21/bin/:$PATH  
+        export PATH=/home/cudeiro/cmake-4.1.2-linux-x86_64/bin/:/usr/lib/llvm-21/bin/:$PATH  
         export CUDACXX=/usr/local/cuda-${cpp_compiler[2]}/bin/nvcc
         cmake -G "Ninja" -B ${{steps.strings.outputs.build-output-dir}} -DCMAKE_CXX_COMPILER="${cpp_compiler[0]}" -DCMAKE_CUDA_COMPILER="${cpp_compiler[1]}"  -DCUDAToolkit_ROOT="/usr/local/cuda-${cpp_compiler[2]}" -DCMAKE_BUILD_TYPE="Release" -S ${{github.workspace}}       
 

diff --git a/.github/workflows/cmake-linux-arm64.yml b/.github/workflows/cmake-linux-arm64.yml
@@ -32,7 +32,7 @@ jobs:
     - name: Configure CMake      
       run: |          
         IFS="," read -r -a cpp_compiler <<< "${{matrix.cpp_compiler}}"   
-        export PATH=/usr/lib/llvm-21/bin/:$PATH  
+        export PATH=/home/cudeiro/cmake-4.1.2-linux-aarch64/bin/:/usr/lib/llvm-21/bin/:$PATH  
         export CUDACXX=/usr/local/cuda-${cpp_compiler[2]}/bin/nvcc
         cmake -G "Ninja" -B ${{steps.strings.outputs.build-output-dir}} -DCMAKE_CXX_COMPILER="${cpp_compiler[0]}" -DCMAKE_CUDA_COMPILER="${cpp_compiler[1]}"  -DCUDAToolkit_ROOT="/usr/local/cuda-${cpp_compiler[2]}" -DCUDA_ARCH="87;" -DCMAKE_BUILD_TYPE="Release" -S ${{github.workspace}}       
 

diff --git a/.github/workflows/cmake-windows-amd64.yml b/.github/workflows/cmake-windows-amd64.yml
@@ -16,7 +16,7 @@ jobs:
       matrix:         
       #host compiler, version, cuda compiler, cuda version,
       #disabling ,"llvm,21.1.0,llvm,12.1", not working on windows yet
-        toolset: ["cl,14.16,nvcc,11.8","cl,14.44,nvcc,12.9"]
+        toolset: ["cl,14.44,nvcc,12.9"]
 
 
     steps:

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
@@ -20,7 +20,7 @@ function (enable_intellisense TARGET_NAME)
         target_compile_definitions(${TARGET_NAME} PRIVATE ENABLE_BENCHMARK)
     endif()
 
-    set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO)
+    set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 20 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO)
 
     target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_SOURCE_DIR}")
     add_optimization_flags(${TARGET_NAME})

diff --git a/cmake/discover_tests.cmake b/cmake/discover_tests.cmake
@@ -32,7 +32,7 @@ function (discover_tests DIR)
             add_nvtx_support_to_target(${cuda_target})
         endif()
 
-        set_target_properties(${cuda_target} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO)            
+        set_target_properties(${cuda_target} PROPERTIES CXX_STANDARD 20 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO)            
         target_include_directories(${cuda_target} PRIVATE "${CMAKE_SOURCE_DIR}/")      
         target_include_directories(${cuda_target} PRIVATE "${CMAKE_SOURCE_DIR}/include")          
         #target_link_libraries(${cuda_target} PRIVATE ${PROJECT_NAME})        

diff --git a/cmake/libs/cuda/target_generation.cmake b/cmake/libs/cuda/target_generation.cmake
@@ -4,7 +4,7 @@ function(set_default_cuda_target_properties TARGET_NAME)
     endif()
     target_compile_options(${TARGET_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${COMPILER_CUDA_FLAGS}>)
 
-    set_target_properties(${TARGET_NAME} PROPERTIES CUDA_STANDARD_REQUIRED ON CUDA_STANDARD 17 CUDA_RUNTIME_LIBRARY
+    set_target_properties(${TARGET_NAME} PROPERTIES CUDA_STANDARD_REQUIRED ON CUDA_STANDARD 20 CUDA_RUNTIME_LIBRARY
                                                                                                Shared)
     set_target_cuda_arch_flags(${TARGET_NAME})
 
@@ -16,13 +16,11 @@ function(set_default_cuda_target_properties TARGET_NAME)
     #see https://forums.developer.nvidia.com/t/the-cost-of-relocatable-device-code-rdc-true/47665
     target_compile_options(${TARGET_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=false>)
 
-    #cuda 12 can compile in parallel, so let's use this 
-    if (${CUDA_VERSION_MAJOR} GREATER_EQUAL 12)
         #split compile does not work with gpu debug code
-        if(NOT ${ENABLE_DEBUG})
-            target_compile_options(${TARGET_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-split-compile=0>)
-        endif()
+    if(NOT ${ENABLE_DEBUG})
+        target_compile_options(${TARGET_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-split-compile=0>)
     endif()
+
 
     if (NOT(${TEMPLATE_DEPTH} STREQUAL  "default"))
     #bugfix for windows compilation of tests with more than 200 recursions

diff --git a/cmake/libs/fkl.cmake b/cmake/libs/fkl.cmake
@@ -1,6 +1,6 @@
 set(FKL_VERSION_MAJOR 0)
-set(FKL_VERSION_MINOR 1)
-set(FKL_VERSION_RELEASE 6)
+set(FKL_VERSION_MINOR 2)
+set(FKL_VERSION_RELEASE 0)
 set(FKL_VERSION ${FKL_VERSION_MAJOR}.${FKL_VERSION_MINOR}.${FKL_VERSION_RELEASE})
 
 list(APPEND CMAKE_PREFIX_PATH "${CMAKE_SOURCE_DIR}/fkl/lib/export")

diff --git a/fkl b/fkl
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
@@ -10,4 +10,4 @@ add_library(headers INTERFACE ${CUDA_SOURCES})
 # Set virtual folders for MSVC
 source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${CUDA_SOURCES})
 #target_include_directories(include INTERFACE .)
-set_target_properties(${cuda_target} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO)
+set_target_properties(headers PROPERTIES CXX_STANDARD 20 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO) # configure the 'headers' target declared above
diff --git a/include/fast_npp.cuh b/include/fast_npp.cuh
@@ -1,4 +1,4 @@
-/* Copyright 2025 Oscar Amoros Huguet
+/* Copyright 2025 Oscar Amoros Huguet
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
@@ -19,10 +19,10 @@
 #include <nppi_geometry_transforms.h>
 
 #include <fused_kernel/core/utils/utils.h>
-#include <fused_kernel/fused_kernel.cuh>
-#include <fused_kernel/algorithms/image_processing/resize.cuh>
-#include <fused_kernel/algorithms/basic_ops/cuda_vector.cuh>
-#include <fused_kernel/algorithms/basic_ops/arithmetic.cuh>
+#include <fused_kernel/fused_kernel.h>
+#include <fused_kernel/algorithms/image_processing/resize.h>
+#include <fused_kernel/algorithms/basic_ops/vector_ops.h>
+#include <fused_kernel/algorithms/basic_ops/arithmetic.h>
 
 namespace fastNPP {
 
@@ -34,17 +34,17 @@ namespace fastNPP {
         // currently expecting the destination ROI's to be equal to nMaxWidth and nMaxHeight
         int currentDevice{ 0 };
         gpuErrchk(cudaGetDevice(&currentDevice));
-        std::array<fk::Ptr2D<uchar3>, BATCH> srcBatch;
+        std::array<fk::Ptr2D<uchar3>, BATCH> srcBatch;  
-        std::array<fk::Ptr2D<uchar3>, BATCH> srcBatch;  
+        std::array<fk::Ptr2D<uchar3>, BATCH> srcBatch;
-        std::array<fk::Ptr2D<uchar3>, BATCH> srcBatch;  
+        std::array<fk::Ptr2D<uchar3>, BATCH> srcBatch;
         for (int i = 0; i < BATCH; ++i) {
             srcBatch[i] = fk::Ptr2D<uchar3>(reinterpret_cast<uchar3*>(h_pBatchSrc[i].pData),
                                                                       h_pBatchSrc[i].oSize.width,
                                                                       h_pBatchSrc[i].oSize.height,
                                                                       h_pBatchSrc[i].nStep,
-                                                                      fk::Device, currentDevice);
+                                                                      fk::MemType::Device, currentDevice);
         }
         const fk::Size dstSize(nMaxWidth, nMaxHeight);
-        return fk::PerThreadRead<fk::_2D, uchar3>::build(srcBatch)
-               .then(fk::Resize<fk::INTER_LINEAR>::build(dstSize));
+        return fk::PerThreadRead<fk::ND::_2D, uchar3>::build(srcBatch)
+               .then(fk::Resize<fk::InterpolationType::INTER_LINEAR>::build(dstSize));
     }
 
     constexpr inline auto SwapChannels_32f_C3R_Ctx(const int(&dstOrder)[3]) {
@@ -76,20 +76,20 @@ namespace fastNPP {
     template <size_t BATCH>
     constexpr inline auto CopyBatch_32f_C3P3R_Ctx(const std::array<Npp32f*, BATCH>  (&aDst)[3],
                                                   const int& nDstStep, const NppiSize& oSizeROI) {
-        std::array<fk::SplitWriteParams<fk::_2D, float3>, BATCH> params;
+        std::array<fk::SplitWriteParams<fk::ND::_2D, float3>, BATCH> params;
         for (int i = 0; i < BATCH; ++i) {
             const uint width = static_cast<uint>(oSizeROI.width);
             const uint height = static_cast<uint>(oSizeROI.height);
             const uint step = static_cast<uint>(nDstStep);
-            const fk::PtrDims<fk::_2D> dims{ width, height, step };
-            const fk::SplitWriteParams<fk::_2D, float3> param{
+            const fk::PtrDims<fk::ND::_2D> dims{ width, height, step };
+            const fk::SplitWriteParams<fk::ND::_2D, float3> param{
                 {reinterpret_cast<float*>(aDst[0][i]), dims},
                 {reinterpret_cast<float*>(aDst[1][i]), dims},
                 {reinterpret_cast<float*>(aDst[2][i]), dims}
             };
             params[i] = param;
         }
-        return fk::SplitWrite<fk::_2D, float3>::build(params);
+        return fk::SplitWrite<fk::ND::_2D, float3>::build(params);
     }
 
     template <typename... IOps>