From 11b91cc079bd33a25c7b3d582be75f3f364b36d4 Mon Sep 17 00:00:00 2001 From: Anthony Lombardi Date: Fri, 13 Jan 2023 15:41:45 -0500 Subject: [PATCH] ENH: Add support for 16 and 32-bit radiograph images Prior to these changes, the supported CUDA texture types (and by extension radiograph images) was hard-coded to 8-bit (unsigned char) and managed as a static global variable. Since texture references needed to be known at compile time and the current type was hard-coded to 8-bit, the use of them prevented from being able to easily support 8, 16, and 32 bit radiograph images known at runtime. To address this, this commit removes the use of `cudaBindTextureToArray` (Deprecated since CUDA 11.3 and removed in CUDA 12.x) and switch to using texture object instead by introducing a convenience helper function `createTextureObjectFromArray`. Approach implemented here is based of recommendation the post titled "Kepler Texture Objects Improve Performance and Flexibility" published in 2013. See https://developer.nvidia.com/blog/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/ For reference, before this changes, the error message reported at build time while attempting to use autoscoper built against CUDA 12.x was: ``` 3 Errors under file path `...\libautoscoper\src\gpu\cuda\RadRenderer_kernels.cu: - no instance of overloaded function "tex2D" matches the argument list - identifier "cudaBindTextureToArray" is undefined - texture is not a template ``` --- .../src/gpu/cuda/BackgroundRenderer.cpp | 8 +++-- .../gpu/cuda/BackgroundRenderer_kernels.cu | 26 +++------------ .../src/gpu/cuda/BackgroundRenderer_kernels.h | 3 +- .../src/gpu/cuda/Compositor_kernels.cu | 4 +-- .../src/gpu/cuda/DRRBackground_kernels.cu | 4 +-- libautoscoper/src/gpu/cuda/Merger_kernels.cu | 4 +-- libautoscoper/src/gpu/cuda/Mult_kernels.cu | 4 +-- libautoscoper/src/gpu/cuda/RadRenderer.cpp | 8 +++-- .../src/gpu/cuda/RadRenderer_kernels.cu | 33 +++---------------- .../src/gpu/cuda/RadRenderer_kernels.h | 6 +--- libautoscoper/src/gpu/cuda/RayCaster.cpp | 13 ++++++-- .../src/gpu/cuda/RayCaster_kernels.cu | 28 ++++------------ .../src/gpu/cuda/RayCaster_kernels.h | 4 +-- .../src/gpu/cuda/cutil/cutil_create_tex_obj.h | 26 +++++++++++++++ 14 files changed, 76 insertions(+), 95 deletions(-) create mode 100644 libautoscoper/src/gpu/cuda/cutil/cutil_create_tex_obj.h diff --git a/libautoscoper/src/gpu/cuda/BackgroundRenderer.cpp b/libautoscoper/src/gpu/cuda/BackgroundRenderer.cpp index d4bd6713..c2eb1c9b 100644 --- a/libautoscoper/src/gpu/cuda/BackgroundRenderer.cpp +++ b/libautoscoper/src/gpu/cuda/BackgroundRenderer.cpp @@ -43,6 +43,7 @@ #include #include +#include #include #include @@ -111,8 +112,9 @@ BackgroundRenderer::set_viewport(float x, float y, float width, float height) void BackgroundRenderer::render(float* buffer, size_t width, size_t height, float threshold) const { - background_bind_array(array_); - background_render(buffer, + cudaTextureObject_t tex = createTexureObjectFromArray(array_, cudaReadModeElementType); + + background_render(tex,buffer, (int)width, (int)height, image_plane_[0], @@ -124,6 +126,8 @@ BackgroundRenderer::render(float* buffer, size_t width, size_t height, float thr viewport_[2], viewport_[3], threshold); + + cudaDestroyTextureObject(tex); } } } // namespace xromm::cuda diff --git a/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.cu b/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.cu index 5d28209d..e03953f8 100644 --- a/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.cu +++ b/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.cu @@ -43,14 +43,10 @@ #include -/////// Global Variables //////// - -static texture tex; - //////// Image Rendering Kernel //////// __global__ -void background_render_kernel(float* output, int width, int height, float u0, +void background_render_kernel(cudaTextureObject_t tex,float* output, int width, int height, float u0, float v0, float u1, float v1, float u2, float v2, float u3, float v3, float threshold); @@ -60,20 +56,8 @@ namespace xromm namespace gpu { -void background_bind_array(const cudaArray* array) -{ - // Setup 2D texture. - tex.normalized = true; - tex.filterMode = cudaFilterModeLinear; - tex.addressMode[0] = cudaAddressModeClamp; - tex.addressMode[1] = cudaAddressModeClamp; - - // Bind array to 3D texture. - cutilSafeCall(cudaBindTextureToArray(tex, array)); -} - -void background_render(float* output, int width, int height, float u0, +void background_render(cudaTextureObject_t tex,float* output, int width, int height, float u0, float v0, float u1, float v1, float u2, float v2, float u3, float v3, float threshold) { @@ -82,7 +66,7 @@ void background_render(float* output, int width, int height, float u0, dim3 gridDim((width+blockDim.x-1)/blockDim.x, (height+blockDim.y-1)/blockDim.y); - background_render_kernel << > >(output, width, height, + background_render_kernel << > >(tex,output, width, height, u0, v0, u1, v1, u2, v2, u3, v3, threshold); } @@ -92,7 +76,7 @@ void background_render(float* output, int width, int height, float u0, } // namespace xromm __global__ -void background_render_kernel(float* output, int width, int height, float u0, +void background_render_kernel(cudaTextureObject_t tex,float* output, int width, int height, float u0, float v0, float u1, float v1, float u2, float v2, float u3, float v3, float threshold) { @@ -113,7 +97,7 @@ void background_render_kernel(float* output, int width, int height, float u0, output[width*y+x] = 0.0f; } else { - output[width*y + x] = (threshold <= tex2D(tex, s, t)) ? 1.0f : 0.0f; + output[width*y + x] = (threshold <= tex2D(tex, s, t)) ? 1.0f : 0.0f; } } diff --git a/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.h b/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.h index 42d8a340..988767c9 100644 --- a/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.h +++ b/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.h @@ -48,9 +48,8 @@ namespace xromm namespace gpu { -void background_bind_array(const cudaArray* array); -void background_render(float* output, int width, int height, float u0, +void background_render(cudaTextureObject_t tex,float* output, int width, int height, float u0, float v0, float u1, float v1, float u2, float v2, float u3, float v3, float threshold); diff --git a/libautoscoper/src/gpu/cuda/Compositor_kernels.cu b/libautoscoper/src/gpu/cuda/Compositor_kernels.cu index c3315e16..aefd7646 100644 --- a/libautoscoper/src/gpu/cuda/Compositor_kernels.cu +++ b/libautoscoper/src/gpu/cuda/Compositor_kernels.cu @@ -67,8 +67,8 @@ void composite(float* src1, { // Calculate the block and grid sizes. dim3 blockDim(32, 32); - dim3 gridDim((width+blockDim.x-1)/blockDim.x, - (height+blockDim.y-1)/blockDim.y); + dim3 gridDim(((unsigned int)width+blockDim.x-1)/blockDim.x, + ((unsigned int)height+blockDim.y-1)/blockDim.y); // Call the kernel composite_kernel<<>>(src1,src2,src3,src4,dest,width,height); diff --git a/libautoscoper/src/gpu/cuda/DRRBackground_kernels.cu b/libautoscoper/src/gpu/cuda/DRRBackground_kernels.cu index 29857715..22bb2ecf 100644 --- a/libautoscoper/src/gpu/cuda/DRRBackground_kernels.cu +++ b/libautoscoper/src/gpu/cuda/DRRBackground_kernels.cu @@ -58,8 +58,8 @@ namespace xromm { { // Calculate the block and grid sizes. dim3 blockDim(32, 32); - dim3 gridDim((width+blockDim.x-1)/blockDim.x, - (height+blockDim.y-1)/blockDim.y); + dim3 gridDim(((unsigned int)width+blockDim.x-1)/blockDim.x, + ((unsigned int)height+blockDim.y-1)/blockDim.y); // Call the kernel drr_background_kernel << > >(src1, dest, width, height); diff --git a/libautoscoper/src/gpu/cuda/Merger_kernels.cu b/libautoscoper/src/gpu/cuda/Merger_kernels.cu index 9fad7b62..9cbb3d92 100644 --- a/libautoscoper/src/gpu/cuda/Merger_kernels.cu +++ b/libautoscoper/src/gpu/cuda/Merger_kernels.cu @@ -60,8 +60,8 @@ void merge(float* src1, { // Calculate the block and grid sizes. dim3 blockDim(32, 32); - dim3 gridDim((width+blockDim.x-1)/blockDim.x, - (height+blockDim.y-1)/blockDim.y); + dim3 gridDim(((unsigned int)width+blockDim.x-1)/blockDim.x, + ((unsigned int)height+blockDim.y-1)/blockDim.y); // Call the kernel merge_kernel<<>>(src1,src2,dest,width,height); diff --git a/libautoscoper/src/gpu/cuda/Mult_kernels.cu b/libautoscoper/src/gpu/cuda/Mult_kernels.cu index dd1f330d..a3c1db94 100644 --- a/libautoscoper/src/gpu/cuda/Mult_kernels.cu +++ b/libautoscoper/src/gpu/cuda/Mult_kernels.cu @@ -60,8 +60,8 @@ namespace xromm { { // Calculate the block and grid sizes. dim3 blockDim(32, 32); - dim3 gridDim((width + blockDim.x - 1) / blockDim.x, - (height + blockDim.y - 1) / blockDim.y); + dim3 gridDim(((unsigned int)width + blockDim.x - 1) / blockDim.x, + ((unsigned int)height + blockDim.y - 1) / blockDim.y); // Call the kernel mult_kernel<<>>(src1, src2, dest, width, height); diff --git a/libautoscoper/src/gpu/cuda/RadRenderer.cpp b/libautoscoper/src/gpu/cuda/RadRenderer.cpp index b44651b5..a472614c 100644 --- a/libautoscoper/src/gpu/cuda/RadRenderer.cpp +++ b/libautoscoper/src/gpu/cuda/RadRenderer.cpp @@ -43,6 +43,7 @@ #include #include +#include #include #include @@ -125,8 +126,9 @@ RadRenderer::set_viewport(float x, float y, float width, float height) void RadRenderer::render(float* buffer, size_t width, size_t height) const { - video_bind_array(array_); - video_render(buffer, + cudaTextureObject_t tex = createTexureObjectFromArray(array_, cudaReadModeNormalizedFloat); + + video_render(tex,buffer, (int)width, (int)height, image_plane_[0], @@ -137,6 +139,8 @@ RadRenderer::render(float* buffer, size_t width, size_t height) const viewport_[1], viewport_[2], viewport_[3]); + + cudaDestroyTextureObject(tex); } } } // namespace xromm::cuda diff --git a/libautoscoper/src/gpu/cuda/RadRenderer_kernels.cu b/libautoscoper/src/gpu/cuda/RadRenderer_kernels.cu index ccaa0cd5..54e2544f 100644 --- a/libautoscoper/src/gpu/cuda/RadRenderer_kernels.cu +++ b/libautoscoper/src/gpu/cuda/RadRenderer_kernels.cu @@ -43,14 +43,11 @@ #include -/////// Global Variables //////// - -static texture tex; //////// Image Rendering Kernel //////// __global__ -void image_render_kernel(float* output, int width, int height, float u0, +void image_render_kernel(cudaTextureObject_t tex,float* output, int width, int height, float u0, float v0, float u1, float v1, float u2, float v2, float u3, float v3); @@ -60,27 +57,7 @@ namespace xromm namespace gpu { -void video_bind_array(const cudaArray* array) -{ - // Setup 2D texture. - tex.normalized = true; - tex.filterMode = cudaFilterModeLinear; - tex.addressMode[0] = cudaAddressModeClamp; - tex.addressMode[1] = cudaAddressModeClamp; - - // Bind array to 3D texture. - cutilSafeCall(cudaBindTextureToArray(tex, array)); -} - -/* -void image_deinit() -{ - cutilSafeCall(cudaUnbindTexture(tex)); - cutilSafeCall(cudaFreeArray(array)); -} -*/ - -void video_render(float* output, int width, int height, float u0, +void video_render(cudaTextureObject_t tex,float* output, int width, int height, float u0, float v0, float u1, float v1, float u2, float v2, float u3, float v3) { @@ -89,7 +66,7 @@ void video_render(float* output, int width, int height, float u0, dim3 gridDim((width+blockDim.x-1)/blockDim.x, (height+blockDim.y-1)/blockDim.y); - image_render_kernel<<>>(output, width, height, + image_render_kernel<<>>(tex,output, width, height, u0, v0, u1, v1, u2, v2, u3, v3); } @@ -99,7 +76,7 @@ void video_render(float* output, int width, int height, float u0, } // namespace xromm __global__ -void image_render_kernel(float* output, int width, int height, float u0, +void image_render_kernel(cudaTextureObject_t tex,float* output, int width, int height, float u0, float v0, float u1, float v1, float u2, float v2, float u3, float v3) { @@ -120,7 +97,7 @@ void image_render_kernel(float* output, int width, int height, float u0, output[width*y+x] = 0.0f; } else { - output[width*y+x] = 1.0f-tex2D(tex, s, t); + output[width*y+x] = 1.0f-tex2D(tex, s, t); } } diff --git a/libautoscoper/src/gpu/cuda/RadRenderer_kernels.h b/libautoscoper/src/gpu/cuda/RadRenderer_kernels.h index 2d4d2470..8fad0f2f 100644 --- a/libautoscoper/src/gpu/cuda/RadRenderer_kernels.h +++ b/libautoscoper/src/gpu/cuda/RadRenderer_kernels.h @@ -47,11 +47,7 @@ namespace xromm namespace gpu { -void video_bind_array(const cudaArray* array); - -//void cuda_image_deinit(); - -void video_render(float* output, int width, int height, float u0, +void video_render(cudaTextureObject_t tex,float* output, int width, int height, float u0, float v0, float u1, float v1, float u2, float v2, float u3, float v3); diff --git a/libautoscoper/src/gpu/cuda/RayCaster.cpp b/libautoscoper/src/gpu/cuda/RayCaster.cpp index 182eb480..92c267c5 100644 --- a/libautoscoper/src/gpu/cuda/RayCaster.cpp +++ b/libautoscoper/src/gpu/cuda/RayCaster.cpp @@ -43,6 +43,11 @@ #include #include +#include +#include +#include +#include + #include "RayCaster.hpp" #include "RayCaster_kernels.h" #include "VolumeDescription.hpp" @@ -142,16 +147,18 @@ RayCaster::render(float* buffer, size_t width, size_t height) return; } - //float aspectRatio = (float)width/(float)height; - volume_bind_array(volumeDescription_->image()); + cudaTextureObject_t tex = createTexureObjectFromArray((cudaArray_t)volumeDescription_->image(), cudaReadModeNormalizedFloat); + volume_viewport(viewport_[0], viewport_[1], viewport_[2], viewport_[3]); - volume_render(buffer, + volume_render(tex,buffer, width, height, invModelView_, sampleDistance_, rayIntensity_, cutoff_); + + cudaDestroyTextureObject(tex); } /* diff --git a/libautoscoper/src/gpu/cuda/RayCaster_kernels.cu b/libautoscoper/src/gpu/cuda/RayCaster_kernels.cu index e87852d4..0d188992 100644 --- a/libautoscoper/src/gpu/cuda/RayCaster_kernels.cu +++ b/libautoscoper/src/gpu/cuda/RayCaster_kernels.cu @@ -58,13 +58,11 @@ struct float3x4 // Forward declarations __global__ -void cuda_volume_render_kernel(float* output, size_t width, size_t height, +void cuda_volume_render_kernel(cudaTextureObject_t tex,float* output, size_t width, size_t height, float step, float intensity, float cutoff); // Global variables -static texture tex; - static __constant__ float4 d_viewport; static __constant__ float3x4 d_invModelView; @@ -74,25 +72,13 @@ namespace xromm namespace gpu { -void volume_bind_array(const cudaArray* array) -{ - // Setup 3D texture. - tex.normalized = true; - tex.filterMode = cudaFilterModeLinear; - tex.addressMode[0] = cudaAddressModeClamp; - tex.addressMode[1] = cudaAddressModeClamp; - - // Bind array to 3D texture. - cutilSafeCall(cudaBindTextureToArray(tex, array)); -} - void volume_viewport(float x, float y, float width, float height) { float4 viewport = make_float4(x, y, width, height); cutilSafeCall(cudaMemcpyToSymbol(d_viewport, &viewport, sizeof(float4))); } -void volume_render(float* buffer, size_t width, size_t height, +void volume_render(cudaTextureObject_t tex,float* buffer, size_t width, size_t height, const float* invModelView, float step, float intensity, float cutoff) { @@ -103,11 +89,11 @@ void volume_render(float* buffer, size_t width, size_t height, // Calculate the block and grid sizes. dim3 blockDim(32, 32); - dim3 gridDim((width+blockDim.x-1)/blockDim.x, - (height+blockDim.y-1)/blockDim.y); + dim3 gridDim(((unsigned int)width+blockDim.x-1)/blockDim.x, + ((unsigned int)height+blockDim.y-1)/blockDim.y); // Call the kernel - cuda_volume_render_kernel<<>>(buffer, width, height, + cuda_volume_render_kernel<<>>(tex,buffer, width, height, step, intensity, cutoff); //This crashes it under windows @@ -168,7 +154,7 @@ float4 mul(const float3x4 &M, const float4 &v) // Render the volume using ray marching. __global__ -void cuda_volume_render_kernel(float* buffer, size_t width, size_t height, +void cuda_volume_render_kernel(cudaTextureObject_t tex,float* buffer, size_t width, size_t height, float step, float intensity, float cutoff) { uint x = blockIdx.x*blockDim.x+threadIdx.x; @@ -208,7 +194,7 @@ void cuda_volume_render_kernel(float* buffer, size_t width, size_t height, float density = 0.0f; while (t > _near) { float3 point = ray.origin+t*ray.direction; - float sample = tex3D(tex, point.x, 1.0f-point.y, -point.z); + float sample = tex3D(tex, point.x, 1.0f-point.y, -point.z); density += sample > cutoff? step*sample: 0.0f; t -= 1.0f; } diff --git a/libautoscoper/src/gpu/cuda/RayCaster_kernels.h b/libautoscoper/src/gpu/cuda/RayCaster_kernels.h index de04032b..e87349e5 100644 --- a/libautoscoper/src/gpu/cuda/RayCaster_kernels.h +++ b/libautoscoper/src/gpu/cuda/RayCaster_kernels.h @@ -50,11 +50,9 @@ namespace xromm namespace gpu { -void volume_bind_array(const cudaArray* array); - void volume_viewport(float x, float y, float width, float height); -void volume_render(float* buffer, +void volume_render(cudaTextureObject_t tex, float* buffer, size_t width, size_t height, const float* invModelViewMat, diff --git a/libautoscoper/src/gpu/cuda/cutil/cutil_create_tex_obj.h b/libautoscoper/src/gpu/cuda/cutil/cutil_create_tex_obj.h new file mode 100644 index 00000000..f9c35dd7 --- /dev/null +++ b/libautoscoper/src/gpu/cuda/cutil/cutil_create_tex_obj.h @@ -0,0 +1,26 @@ +#pragma once +#include +#include +#include + +inline cudaTextureObject_t createTexureObjectFromArray(cudaArray* arr, cudaTextureReadMode readMode) { + // Approach implemented below is based off of + // https://developer.nvidia.com/blog/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/ + + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypeArray; + resDesc.res.array.array = arr; + + cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.normalizedCoords = true; + texDesc.filterMode = cudaFilterModeLinear; + texDesc.addressMode[0] = cudaAddressModeClamp; + texDesc.addressMode[1] = cudaAddressModeClamp; + texDesc.readMode = readMode; + + cudaTextureObject_t tex = 0; + cutilSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL)); + return tex; +}