ENH: Add support for 16 and 32-bit radiograph images

Prior to these changes, the supported CUDA texture types (and by extension radiograph images) was hard-coded to 8-bit (unsigned char) and managed as a static global variable. Since texture references needed to be known at compile time and the current type was hard-coded to 8-bit, the use of them prevented from being able to easily support 8, 16, and 32 bit radiograph images known at runtime. To address this, this commit removes the use of `cudaBindTextureToArray` (Deprecated since CUDA 11.3 and removed in CUDA 12.x) and switch to using texture object instead by introducing a convenience helper function `createTextureObjectFromArray`. Approach implemented here is based of recommendation the post titled "Kepler Texture Objects Improve Performance and Flexibility" published in 2013. See https://developer.nvidia.com/blog/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/ For reference, before this changes, the error message reported at build time while attempting to use autoscoper built against CUDA 12.x was: ``` 3 Errors under file path `...\libautoscoper\src\gpu\cuda\RadRenderer_kernels.cu: - no instance of overloaded function "tex2D" matches the argument list - identifier "cudaBindTextureToArray" is undefined - texture is not a template ```
BrownBiomechanics · Dec 20, 2023 · 11b91cc · 11b91cc
1 parent 6444737
commit 11b91cc
Show file tree

Hide file tree

Showing 14 changed files with 76 additions and 95 deletions.
diff --git a/libautoscoper/src/gpu/cuda/BackgroundRenderer.cpp b/libautoscoper/src/gpu/cuda/BackgroundRenderer.cpp
@@ -43,6 +43,7 @@
 #include <sstream>
 
 #include <cuda.h>
+#include <cutil_create_tex_obj.h>
 #include <cutil_inline.h>
 #include <cutil_math.h>
 
@@ -111,8 +112,9 @@ BackgroundRenderer::set_viewport(float x, float y, float width, float height)
 void
 BackgroundRenderer::render(float* buffer, size_t width, size_t height, float threshold) const
 {
-    background_bind_array(array_);
-    background_render(buffer,
+    cudaTextureObject_t tex = createTexureObjectFromArray(array_, cudaReadModeElementType);
+
+    background_render(tex,buffer,
                  (int)width,
                  (int)height,
                  image_plane_[0],
@@ -124,6 +126,8 @@ BackgroundRenderer::render(float* buffer, size_t width, size_t height, float thr
                  viewport_[2],
                  viewport_[3],
                  threshold);
+
+    cudaDestroyTextureObject(tex);
 }
 
 } } // namespace xromm::cuda
diff --git a/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.cu b/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.cu
@@ -43,14 +43,10 @@
 
 #include <cutil_inline.h>
 
-/////// Global Variables ////////
-
-static texture<float,2> tex;
-
 //////// Image Rendering Kernel ////////
 
 __global__
-void background_render_kernel(float* output, int width, int height, float u0,
+void background_render_kernel(cudaTextureObject_t tex,float* output, int width, int height, float u0,
                          float v0, float u1, float v1, float u2, float v2,
              float u3, float v3, float threshold);
 
@@ -60,20 +56,8 @@ namespace xromm
 namespace gpu
 {
 
-void background_bind_array(const cudaArray* array)
-{
-    // Setup 2D texture.
-    tex.normalized = true;
-    tex.filterMode = cudaFilterModeLinear;
-    tex.addressMode[0] = cudaAddressModeClamp;
-    tex.addressMode[1] = cudaAddressModeClamp;
-
-    // Bind array to 3D texture.
-    cutilSafeCall(cudaBindTextureToArray(tex, array));
-}
-
 
-void background_render(float* output, int width, int height, float u0,
+void background_render(cudaTextureObject_t tex,float* output, int width, int height, float u0,
                   float v0, float u1, float v1, float u2, float v2,
                   float u3, float v3, float threshold)
 {
@@ -82,7 +66,7 @@ void background_render(float* output, int width, int height, float u0,
     dim3 gridDim((width+blockDim.x-1)/blockDim.x,
                  (height+blockDim.y-1)/blockDim.y);
 
-  background_render_kernel << <gridDim, blockDim >> >(output, width, height,
+  background_render_kernel << <gridDim, blockDim >> >(tex,output, width, height,
                                                u0, v0, u1, v1, u2, v2,
                                                u3, v3, threshold);
 }
@@ -92,7 +76,7 @@ void background_render(float* output, int width, int height, float u0,
 } // namespace xromm
 
 __global__
-void background_render_kernel(float* output, int width, int height, float u0,
+void background_render_kernel(cudaTextureObject_t tex,float* output, int width, int height, float u0,
                          float v0, float u1, float v1, float u2, float v2,
              float u3, float v3, float threshold)
 {
@@ -113,7 +97,7 @@ void background_render_kernel(float* output, int width, int height, float u0,
         output[width*y+x] = 0.0f;
     }
     else {
-    output[width*y + x] = (threshold <= tex2D(tex, s, t)) ? 1.0f : 0.0f;
+    output[width*y + x] = (threshold <= tex2D<float>(tex, s, t)) ? 1.0f : 0.0f;
     }
 }
 
diff --git a/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.h b/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.h
@@ -48,9 +48,8 @@ namespace xromm
 namespace gpu
 {
 
-void background_bind_array(const cudaArray* array);
 
-void background_render(float* output, int width, int height, float u0,
+void background_render(cudaTextureObject_t tex,float* output, int width, int height, float u0,
                   float v0, float u1, float v1, float u2, float v2,
                   float u3, float v3, float threshold);
 

diff --git a/libautoscoper/src/gpu/cuda/Compositor_kernels.cu b/libautoscoper/src/gpu/cuda/Compositor_kernels.cu
@@ -67,8 +67,8 @@ void composite(float* src1,
 {
     // Calculate the block and grid sizes.
     dim3 blockDim(32, 32);
-    dim3 gridDim((width+blockDim.x-1)/blockDim.x,
-                 (height+blockDim.y-1)/blockDim.y);
+    dim3 gridDim(((unsigned int)width+blockDim.x-1)/blockDim.x,
+                 ((unsigned int)height+blockDim.y-1)/blockDim.y);
 
     // Call the kernel
     composite_kernel<<<gridDim, blockDim>>>(src1,src2,src3,src4,dest,width,height);

diff --git a/libautoscoper/src/gpu/cuda/DRRBackground_kernels.cu b/libautoscoper/src/gpu/cuda/DRRBackground_kernels.cu
@@ -58,8 +58,8 @@ namespace xromm {
 {
     // Calculate the block and grid sizes.
     dim3 blockDim(32, 32);
-    dim3 gridDim((width+blockDim.x-1)/blockDim.x,
-                 (height+blockDim.y-1)/blockDim.y);
+    dim3 gridDim(((unsigned int)width+blockDim.x-1)/blockDim.x,
+                 ((unsigned int)height+blockDim.y-1)/blockDim.y);
 
     // Call the kernel
   drr_background_kernel << <gridDim, blockDim >> >(src1, dest, width, height);

diff --git a/libautoscoper/src/gpu/cuda/Merger_kernels.cu b/libautoscoper/src/gpu/cuda/Merger_kernels.cu
@@ -60,8 +60,8 @@ void merge(float* src1,
 {
     // Calculate the block and grid sizes.
     dim3 blockDim(32, 32);
-    dim3 gridDim((width+blockDim.x-1)/blockDim.x,
-                 (height+blockDim.y-1)/blockDim.y);
+    dim3 gridDim(((unsigned int)width+blockDim.x-1)/blockDim.x,
+                 ((unsigned int)height+blockDim.y-1)/blockDim.y);
 
     // Call the kernel
     merge_kernel<<<gridDim, blockDim>>>(src1,src2,dest,width,height);

diff --git a/libautoscoper/src/gpu/cuda/Mult_kernels.cu b/libautoscoper/src/gpu/cuda/Mult_kernels.cu
@@ -60,8 +60,8 @@ namespace xromm {
     {
       // Calculate the block and grid sizes.
       dim3 blockDim(32, 32);
-      dim3 gridDim((width + blockDim.x - 1) / blockDim.x,
-        (height + blockDim.y - 1) / blockDim.y);
+      dim3 gridDim(((unsigned int)width + blockDim.x - 1) / blockDim.x,
+        ((unsigned int)height + blockDim.y - 1) / blockDim.y);
 
       // Call the kernel
       mult_kernel<<<gridDim, blockDim>>>(src1, src2, dest, width, height);

diff --git a/libautoscoper/src/gpu/cuda/RadRenderer.cpp b/libautoscoper/src/gpu/cuda/RadRenderer.cpp
@@ -43,6 +43,7 @@
 #include <sstream>
 
 #include <cuda.h>
+#include <cutil_create_tex_obj.h>
 #include <cutil_inline.h>
 #include <cutil_math.h>
 
@@ -125,8 +126,9 @@ RadRenderer::set_viewport(float x, float y, float width, float height)
 void
 RadRenderer::render(float* buffer, size_t width, size_t height) const
 {
-    video_bind_array(array_);
-    video_render(buffer,
+    cudaTextureObject_t tex = createTexureObjectFromArray(array_, cudaReadModeNormalizedFloat);
+
+    video_render(tex,buffer,
                  (int)width,
                  (int)height,
                  image_plane_[0],
@@ -137,6 +139,8 @@ RadRenderer::render(float* buffer, size_t width, size_t height) const
                  viewport_[1],
                  viewport_[2],
                  viewport_[3]);
+
+    cudaDestroyTextureObject(tex);
 }
 
 } } // namespace xromm::cuda
diff --git a/libautoscoper/src/gpu/cuda/RadRenderer_kernels.cu b/libautoscoper/src/gpu/cuda/RadRenderer_kernels.cu
@@ -43,14 +43,11 @@
 
 #include <cutil_inline.h>
 
-/////// Global Variables ////////
-
-static texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex;
 
 //////// Image Rendering Kernel ////////
 
 __global__
-void image_render_kernel(float* output, int width, int height, float u0,
+void image_render_kernel(cudaTextureObject_t tex,float* output, int width, int height, float u0,
                          float v0, float u1, float v1, float u2, float v2,
                          float u3, float v3);
 
@@ -60,27 +57,7 @@ namespace xromm
 namespace gpu
 {
 
-void video_bind_array(const cudaArray* array)
-{
-    // Setup 2D texture.
-    tex.normalized = true;
-    tex.filterMode = cudaFilterModeLinear;
-    tex.addressMode[0] = cudaAddressModeClamp;
-    tex.addressMode[1] = cudaAddressModeClamp;
-
-    // Bind array to 3D texture.
-    cutilSafeCall(cudaBindTextureToArray(tex, array));
-}
-
-/*
-void image_deinit()
-{
-    cutilSafeCall(cudaUnbindTexture(tex));
-    cutilSafeCall(cudaFreeArray(array));
-}
-*/
-
-void video_render(float* output, int width, int height, float u0,
+void video_render(cudaTextureObject_t tex,float* output, int width, int height, float u0,
                   float v0, float u1, float v1, float u2, float v2,
                   float u3, float v3)
 {
@@ -89,7 +66,7 @@ void video_render(float* output, int width, int height, float u0,
     dim3 gridDim((width+blockDim.x-1)/blockDim.x,
                  (height+blockDim.y-1)/blockDim.y);
 
-    image_render_kernel<<<gridDim, blockDim>>>(output, width, height,
+    image_render_kernel<<<gridDim, blockDim>>>(tex,output, width, height,
                                                u0, v0, u1, v1, u2, v2,
                                                u3, v3);
 }
@@ -99,7 +76,7 @@ void video_render(float* output, int width, int height, float u0,
 } // namespace xromm
 
 __global__
-void image_render_kernel(float* output, int width, int height, float u0,
+void image_render_kernel(cudaTextureObject_t tex,float* output, int width, int height, float u0,
                          float v0, float u1, float v1, float u2, float v2,
                          float u3, float v3)
 {
@@ -120,7 +97,7 @@ void image_render_kernel(float* output, int width, int height, float u0,
         output[width*y+x] = 0.0f;
     }
     else {
-        output[width*y+x] = 1.0f-tex2D(tex, s, t);
+        output[width*y+x] = 1.0f-tex2D<float>(tex, s, t);
     }
 }
 
diff --git a/libautoscoper/src/gpu/cuda/RadRenderer_kernels.h b/libautoscoper/src/gpu/cuda/RadRenderer_kernels.h
@@ -47,11 +47,7 @@ namespace xromm
 namespace gpu
 {
 
-void video_bind_array(const cudaArray* array);
-
-//void cuda_image_deinit();
-
-void video_render(float* output, int width, int height, float u0,
+void video_render(cudaTextureObject_t tex,float* output, int width, int height, float u0,
                   float v0, float u1, float v1, float u2, float v2,
                   float u3, float v3);
 

diff --git a/libautoscoper/src/gpu/cuda/RayCaster.cpp b/libautoscoper/src/gpu/cuda/RayCaster.cpp
@@ -43,6 +43,11 @@
 #include <iostream>
 #include <sstream>
 
+#include <cuda.h>
+#include <cutil_create_tex_obj.h>
+#include <cutil_inline.h>
+#include <cutil_math.h>
+
 #include "RayCaster.hpp"
 #include "RayCaster_kernels.h"
 #include "VolumeDescription.hpp"
@@ -142,16 +147,18 @@ RayCaster::render(float* buffer, size_t width, size_t height)
         return;
     }
 
-    //float aspectRatio = (float)width/(float)height;
-    volume_bind_array(volumeDescription_->image());
+    cudaTextureObject_t tex = createTexureObjectFromArray((cudaArray_t)volumeDescription_->image(), cudaReadModeNormalizedFloat);
+
     volume_viewport(viewport_[0], viewport_[1], viewport_[2], viewport_[3]);
-    volume_render(buffer,
+    volume_render(tex,buffer,
                   width,
                   height,
                   invModelView_,
                   sampleDistance_,
                   rayIntensity_,
                   cutoff_);
+
+    cudaDestroyTextureObject(tex);
 }
 
 /*

diff --git a/libautoscoper/src/gpu/cuda/RayCaster_kernels.cu b/libautoscoper/src/gpu/cuda/RayCaster_kernels.cu
@@ -58,13 +58,11 @@ struct float3x4
 // Forward declarations
 
 __global__
-void cuda_volume_render_kernel(float* output, size_t width, size_t height,
+void cuda_volume_render_kernel(cudaTextureObject_t tex,float* output, size_t width, size_t height,
                                float step, float intensity, float cutoff);
 
 // Global variables
 
-static texture<unsigned short, 3, cudaReadModeNormalizedFloat> tex;
-
 static __constant__ float4 d_viewport;
 static __constant__ float3x4 d_invModelView;
 
@@ -74,25 +72,13 @@ namespace xromm
 namespace gpu
 {
 
-void volume_bind_array(const cudaArray* array)
-{
-    // Setup 3D texture.
-    tex.normalized = true;
-    tex.filterMode = cudaFilterModeLinear;
-    tex.addressMode[0] = cudaAddressModeClamp;
-    tex.addressMode[1] = cudaAddressModeClamp;
-
-    // Bind array to 3D texture.
-    cutilSafeCall(cudaBindTextureToArray(tex, array));
-}
-
 void volume_viewport(float x, float y, float width, float height)
 {
     float4 viewport = make_float4(x, y, width, height);
     cutilSafeCall(cudaMemcpyToSymbol(d_viewport, &viewport, sizeof(float4)));
 }
 
-void volume_render(float* buffer, size_t width, size_t height,
+void volume_render(cudaTextureObject_t tex,float* buffer, size_t width, size_t height,
                    const float* invModelView, float step, float intensity,
                    float cutoff)
 {
@@ -103,11 +89,11 @@ void volume_render(float* buffer, size_t width, size_t height,
 
     // Calculate the block and grid sizes.
     dim3 blockDim(32, 32);
-    dim3 gridDim((width+blockDim.x-1)/blockDim.x,
-                 (height+blockDim.y-1)/blockDim.y);
+    dim3 gridDim(((unsigned int)width+blockDim.x-1)/blockDim.x,
+                 ((unsigned int)height+blockDim.y-1)/blockDim.y);
 
     // Call the kernel
-    cuda_volume_render_kernel<<<gridDim, blockDim>>>(buffer, width, height,
+    cuda_volume_render_kernel<<<gridDim, blockDim>>>(tex,buffer, width, height,
                                                 step, intensity, cutoff);
 
     //This crashes it under windows
@@ -168,7 +154,7 @@ float4 mul(const float3x4 &M, const float4 &v)
 
 // Render the volume using ray marching.
 __global__
-void cuda_volume_render_kernel(float* buffer, size_t width, size_t height,
+void cuda_volume_render_kernel(cudaTextureObject_t tex,float* buffer, size_t width, size_t height,
                                float step, float intensity, float cutoff)
 {
   uint x = blockIdx.x*blockDim.x+threadIdx.x;
@@ -208,7 +194,7 @@ void cuda_volume_render_kernel(float* buffer, size_t width, size_t height,
     float density = 0.0f;
     while (t > _near) {
         float3 point = ray.origin+t*ray.direction;
-        float sample = tex3D(tex, point.x, 1.0f-point.y, -point.z);
+        float sample = tex3D<float>(tex, point.x, 1.0f-point.y, -point.z);
         density += sample > cutoff? step*sample: 0.0f;
         t -= 1.0f;
     }

diff --git a/libautoscoper/src/gpu/cuda/RayCaster_kernels.h b/libautoscoper/src/gpu/cuda/RayCaster_kernels.h
@@ -50,11 +50,9 @@ namespace xromm
 namespace gpu
 {
 
-void volume_bind_array(const cudaArray* array);
-
 void volume_viewport(float x, float y, float width, float height);
 
-void volume_render(float* buffer,
+void volume_render(cudaTextureObject_t tex, float* buffer,
                    size_t width,
                    size_t height,
                    const float* invModelViewMat,