From 11b91cc079bd33a25c7b3d582be75f3f364b36d4 Mon Sep 17 00:00:00 2001
From: Anthony Lombardi <anthony.lombardi@kitware.com>
Date: Fri, 13 Jan 2023 15:41:45 -0500
Subject: [PATCH] ENH: Add support for 16 and 32-bit radiograph images

Prior to these changes, the supported CUDA texture types (and by extension
radiograph images) was hard-coded to 8-bit (unsigned char) and managed as
a static global variable.

Since texture references needed to be known at compile time and the current
type was hard-coded to 8-bit, the use of them prevented from being able to
easily support 8, 16, and 32 bit radiograph images known at runtime.

To address this, this commit removes the use of `cudaBindTextureToArray`
(Deprecated since CUDA 11.3 and removed in CUDA 12.x)
and switch to using texture object instead by introducing a convenience helper
function `createTextureObjectFromArray`.

Approach implemented here is based of recommendation the post titled
"Kepler Texture Objects Improve Performance and Flexibility" published
in 2013.
See https://developer.nvidia.com/blog/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/

For reference, before this changes, the error message reported at build time
while attempting to use autoscoper built against CUDA 12.x was:

```
3 Errors under file path `...\libautoscoper\src\gpu\cuda\RadRenderer_kernels.cu:
- no instance of overloaded function "tex2D" matches the argument list
- identifier "cudaBindTextureToArray" is undefined
- texture is not a template
```
---
 .../src/gpu/cuda/BackgroundRenderer.cpp       |  8 +++--
 .../gpu/cuda/BackgroundRenderer_kernels.cu    | 26 +++------------
 .../src/gpu/cuda/BackgroundRenderer_kernels.h |  3 +-
 .../src/gpu/cuda/Compositor_kernels.cu        |  4 +--
 .../src/gpu/cuda/DRRBackground_kernels.cu     |  4 +--
 libautoscoper/src/gpu/cuda/Merger_kernels.cu  |  4 +--
 libautoscoper/src/gpu/cuda/Mult_kernels.cu    |  4 +--
 libautoscoper/src/gpu/cuda/RadRenderer.cpp    |  8 +++--
 .../src/gpu/cuda/RadRenderer_kernels.cu       | 33 +++----------------
 .../src/gpu/cuda/RadRenderer_kernels.h        |  6 +---
 libautoscoper/src/gpu/cuda/RayCaster.cpp      | 13 ++++++--
 .../src/gpu/cuda/RayCaster_kernels.cu         | 28 ++++------------
 .../src/gpu/cuda/RayCaster_kernels.h          |  4 +--
 .../src/gpu/cuda/cutil/cutil_create_tex_obj.h | 26 +++++++++++++++
 14 files changed, 76 insertions(+), 95 deletions(-)
 create mode 100644 libautoscoper/src/gpu/cuda/cutil/cutil_create_tex_obj.h
diff --git a/libautoscoper/src/gpu/cuda/BackgroundRenderer.cpp b/libautoscoper/src/gpu/cuda/BackgroundRenderer.cpp
index d4bd6713..c2eb1c9b 100644
--- a/libautoscoper/src/gpu/cuda/BackgroundRenderer.cpp
+++ b/libautoscoper/src/gpu/cuda/BackgroundRenderer.cpp
@@ -43,6 +43,7 @@
 #include <sstream>
 
 #include <cuda.h>
+#include <cutil_create_tex_obj.h>
 #include <cutil_inline.h>
 #include <cutil_math.h>
 
@@ -111,8 +112,9 @@ BackgroundRenderer::set_viewport(float x, float y, float width, float height)
 void
 BackgroundRenderer::render(float* buffer, size_t width, size_t height, float threshold) const
 {
-    background_bind_array(array_);
-    background_render(buffer,
+    cudaTextureObject_t tex = createTexureObjectFromArray(array_, cudaReadModeElementType);
+
+    background_render(tex,buffer,
                  (int)width,
                  (int)height,
                  image_plane_[0],
@@ -124,6 +126,8 @@ BackgroundRenderer::render(float* buffer, size_t width, size_t height, float thr
                  viewport_[2],
                  viewport_[3],
                  threshold);
+
+    cudaDestroyTextureObject(tex);
 }
 
 } } // namespace xromm::cuda
diff --git a/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.cu b/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.cu
index 5d28209d..e03953f8 100644
--- a/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.cu
+++ b/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.cu
@@ -43,14 +43,10 @@
 
 #include <cutil_inline.h>
 
-/////// Global Variables ////////
-
-static texture<float,2> tex;
-
 //////// Image Rendering Kernel ////////
 
 __global__
-void background_render_kernel(float* output, int width, int height, float u0,
+void background_render_kernel(cudaTextureObject_t tex,float* output, int width, int height, float u0,
                          float v0, float u1, float v1, float u2, float v2,
              float u3, float v3, float threshold);
 
@@ -60,20 +56,8 @@ namespace xromm
 namespace gpu
 {
 
-void background_bind_array(const cudaArray* array)
-{
-    // Setup 2D texture.
-    tex.normalized = true;
-    tex.filterMode = cudaFilterModeLinear;
-    tex.addressMode[0] = cudaAddressModeClamp;
-    tex.addressMode[1] = cudaAddressModeClamp;
-
-    // Bind array to 3D texture.
-    cutilSafeCall(cudaBindTextureToArray(tex, array));
-}
-
 
-void background_render(float* output, int width, int height, float u0,
+void background_render(cudaTextureObject_t tex,float* output, int width, int height, float u0,
                   float v0, float u1, float v1, float u2, float v2,
                   float u3, float v3, float threshold)
 {
@@ -82,7 +66,7 @@ void background_render(float* output, int width, int height, float u0,
     dim3 gridDim((width+blockDim.x-1)/blockDim.x,
                  (height+blockDim.y-1)/blockDim.y);
 
-  background_render_kernel << <gridDim, blockDim >> >(output, width, height,
+  background_render_kernel << <gridDim, blockDim >> >(tex,output, width, height,
                                                u0, v0, u1, v1, u2, v2,
                                                u3, v3, threshold);
 }
@@ -92,7 +76,7 @@ void background_render(float* output, int width, int height, float u0,
 } // namespace xromm
 
 __global__
-void background_render_kernel(float* output, int width, int height, float u0,
+void background_render_kernel(cudaTextureObject_t tex,float* output, int width, int height, float u0,
                          float v0, float u1, float v1, float u2, float v2,
              float u3, float v3, float threshold)
 {
@@ -113,7 +97,7 @@ void background_render_kernel(float* output, int width, int height, float u0,
         output[width*y+x] = 0.0f;
     }
     else {
-    output[width*y + x] = (threshold <= tex2D(tex, s, t)) ? 1.0f : 0.0f;
+    output[width*y + x] = (threshold <= tex2D<float>(tex, s, t)) ? 1.0f : 0.0f;
     }
 }
 
diff --git a/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.h b/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.h
index 42d8a340..988767c9 100644
--- a/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.h
+++ b/libautoscoper/src/gpu/cuda/BackgroundRenderer_kernels.h
@@ -48,9 +48,8 @@ namespace xromm
 namespace gpu
 {
 
-void background_bind_array(const cudaArray* array);
 
-void background_render(float* output, int width, int height, float u0,
+void background_render(cudaTextureObject_t tex,float* output, int width, int height, float u0,
                   float v0, float u1, float v1, float u2, float v2,
                   float u3, float v3, float threshold);
 
diff --git a/libautoscoper/src/gpu/cuda/Compositor_kernels.cu b/libautoscoper/src/gpu/cuda/Compositor_kernels.cu
index c3315e16..aefd7646 100644
--- a/libautoscoper/src/gpu/cuda/Compositor_kernels.cu
+++ b/libautoscoper/src/gpu/cuda/Compositor_kernels.cu
@@ -67,8 +67,8 @@ void composite(float* src1,
 {
     // Calculate the block and grid sizes.
     dim3 blockDim(32, 32);
-    dim3 gridDim((width+blockDim.x-1)/blockDim.x,
-                 (height+blockDim.y-1)/blockDim.y);
+    dim3 gridDim(((unsigned int)width+blockDim.x-1)/blockDim.x,
+                 ((unsigned int)height+blockDim.y-1)/blockDim.y);
 
     // Call the kernel
     composite_kernel<<<gridDim, blockDim>>>(src1,src2,src3,src4,dest,width,height);
diff --git a/libautoscoper/src/gpu/cuda/DRRBackground_kernels.cu b/libautoscoper/src/gpu/cuda/DRRBackground_kernels.cu
index 29857715..22bb2ecf 100644
--- a/libautoscoper/src/gpu/cuda/DRRBackground_kernels.cu
+++ b/libautoscoper/src/gpu/cuda/DRRBackground_kernels.cu
@@ -58,8 +58,8 @@ namespace xromm {
 {
     // Calculate the block and grid sizes.
     dim3 blockDim(32, 32);
-    dim3 gridDim((width+blockDim.x-1)/blockDim.x,
-                 (height+blockDim.y-1)/blockDim.y);
+    dim3 gridDim(((unsigned int)width+blockDim.x-1)/blockDim.x,
+                 ((unsigned int)height+blockDim.y-1)/blockDim.y);
 
     // Call the kernel
   drr_background_kernel << <gridDim, blockDim >> >(src1, dest, width, height);
diff --git a/libautoscoper/src/gpu/cuda/Merger_kernels.cu b/libautoscoper/src/gpu/cuda/Merger_kernels.cu
index 9fad7b62..9cbb3d92 100644
--- a/libautoscoper/src/gpu/cuda/Merger_kernels.cu
+++ b/libautoscoper/src/gpu/cuda/Merger_kernels.cu
@@ -60,8 +60,8 @@ void merge(float* src1,
 {
     // Calculate the block and grid sizes.
     dim3 blockDim(32, 32);
-    dim3 gridDim((width+blockDim.x-1)/blockDim.x,
-                 (height+blockDim.y-1)/blockDim.y);
+    dim3 gridDim(((unsigned int)width+blockDim.x-1)/blockDim.x,
+                 ((unsigned int)height+blockDim.y-1)/blockDim.y);
 
     // Call the kernel
     merge_kernel<<<gridDim, blockDim>>>(src1,src2,dest,width,height);
diff --git a/libautoscoper/src/gpu/cuda/Mult_kernels.cu b/libautoscoper/src/gpu/cuda/Mult_kernels.cu
index dd1f330d..a3c1db94 100644
--- a/libautoscoper/src/gpu/cuda/Mult_kernels.cu
+++ b/libautoscoper/src/gpu/cuda/Mult_kernels.cu
@@ -60,8 +60,8 @@ namespace xromm {
     {
       // Calculate the block and grid sizes.
       dim3 blockDim(32, 32);
-      dim3 gridDim((width + blockDim.x - 1) / blockDim.x,
-        (height + blockDim.y - 1) / blockDim.y);
+      dim3 gridDim(((unsigned int)width + blockDim.x - 1) / blockDim.x,
+        ((unsigned int)height + blockDim.y - 1) / blockDim.y);
 
       // Call the kernel
       mult_kernel<<<gridDim, blockDim>>>(src1, src2, dest, width, height);
diff --git a/libautoscoper/src/gpu/cuda/RadRenderer.cpp b/libautoscoper/src/gpu/cuda/RadRenderer.cpp
index b44651b5..a472614c 100644
--- a/libautoscoper/src/gpu/cuda/RadRenderer.cpp
+++ b/libautoscoper/src/gpu/cuda/RadRenderer.cpp
@@ -43,6 +43,7 @@
 #include <sstream>
 
 #include <cuda.h>
+#include <cutil_create_tex_obj.h>
 #include <cutil_inline.h>
 #include <cutil_math.h>
 
@@ -125,8 +126,9 @@ RadRenderer::set_viewport(float x, float y, float width, float height)
 void
 RadRenderer::render(float* buffer, size_t width, size_t height) const
 {
-    video_bind_array(array_);
-    video_render(buffer,
+    cudaTextureObject_t tex = createTexureObjectFromArray(array_, cudaReadModeNormalizedFloat);
+
+    video_render(tex,buffer,
                  (int)width,
                  (int)height,
                  image_plane_[0],
@@ -137,6 +139,8 @@ RadRenderer::render(float* buffer, size_t width, size_t height) const
                  viewport_[1],
                  viewport_[2],
                  viewport_[3]);
+
+    cudaDestroyTextureObject(tex);
 }
 
 } } // namespace xromm::cuda
diff --git a/libautoscoper/src/gpu/cuda/RadRenderer_kernels.cu b/libautoscoper/src/gpu/cuda/RadRenderer_kernels.cu
index ccaa0cd5..54e2544f 100644
--- a/libautoscoper/src/gpu/cuda/RadRenderer_kernels.cu
+++ b/libautoscoper/src/gpu/cuda/RadRenderer_kernels.cu
@@ -43,14 +43,11 @@
 
 #include <cutil_inline.h>
 
-/////// Global Variables ////////
-
-static texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex;
 
 //////// Image Rendering Kernel ////////
 
 __global__
-void image_render_kernel(float* output, int width, int height, float u0,
+void image_render_kernel(cudaTextureObject_t tex,float* output, int width, int height, float u0,
                          float v0, float u1, float v1, float u2, float v2,
                          float u3, float v3);
 
@@ -60,27 +57,7 @@ namespace xromm
 namespace gpu
 {
 
-void video_bind_array(const cudaArray* array)
-{
-    // Setup 2D texture.
-    tex.normalized = true;
-    tex.filterMode = cudaFilterModeLinear;
-    tex.addressMode[0] = cudaAddressModeClamp;
-    tex.addressMode[1] = cudaAddressModeClamp;
-
-    // Bind array to 3D texture.
-    cutilSafeCall(cudaBindTextureToArray(tex, array));
-}
-
-/*
-void image_deinit()
-{
-    cutilSafeCall(cudaUnbindTexture(tex));
-    cutilSafeCall(cudaFreeArray(array));
-}
-*/
-
-void video_render(float* output, int width, int height, float u0,
+void video_render(cudaTextureObject_t tex,float* output, int width, int height, float u0,
                   float v0, float u1, float v1, float u2, float v2,
                   float u3, float v3)
 {
@@ -89,7 +66,7 @@ void video_render(float* output, int width, int height, float u0,
     dim3 gridDim((width+blockDim.x-1)/blockDim.x,
                  (height+blockDim.y-1)/blockDim.y);
 
-    image_render_kernel<<<gridDim, blockDim>>>(output, width, height,
+    image_render_kernel<<<gridDim, blockDim>>>(tex,output, width, height,
                                                u0, v0, u1, v1, u2, v2,
                                                u3, v3);
 }
@@ -99,7 +76,7 @@ void video_render(float* output, int width, int height, float u0,
 } // namespace xromm
 
 __global__
-void image_render_kernel(float* output, int width, int height, float u0,
+void image_render_kernel(cudaTextureObject_t tex,float* output, int width, int height, float u0,
                          float v0, float u1, float v1, float u2, float v2,
                          float u3, float v3)
 {
@@ -120,7 +97,7 @@ void image_render_kernel(float* output, int width, int height, float u0,
         output[width*y+x] = 0.0f;
     }
     else {
-        output[width*y+x] = 1.0f-tex2D(tex, s, t);
+        output[width*y+x] = 1.0f-tex2D<float>(tex, s, t);
     }
 }
 
diff --git a/libautoscoper/src/gpu/cuda/RadRenderer_kernels.h b/libautoscoper/src/gpu/cuda/RadRenderer_kernels.h
index 2d4d2470..8fad0f2f 100644
--- a/libautoscoper/src/gpu/cuda/RadRenderer_kernels.h
+++ b/libautoscoper/src/gpu/cuda/RadRenderer_kernels.h
@@ -47,11 +47,7 @@ namespace xromm
 namespace gpu
 {
 
-void video_bind_array(const cudaArray* array);
-
-//void cuda_image_deinit();
-
-void video_render(float* output, int width, int height, float u0,
+void video_render(cudaTextureObject_t tex,float* output, int width, int height, float u0,
                   float v0, float u1, float v1, float u2, float v2,
                   float u3, float v3);
 
diff --git a/libautoscoper/src/gpu/cuda/RayCaster.cpp b/libautoscoper/src/gpu/cuda/RayCaster.cpp
index 182eb480..92c267c5 100644
--- a/libautoscoper/src/gpu/cuda/RayCaster.cpp
+++ b/libautoscoper/src/gpu/cuda/RayCaster.cpp
@@ -43,6 +43,11 @@
 #include <iostream>
 #include <sstream>
 
+#include <cuda.h>
+#include <cutil_create_tex_obj.h>
+#include <cutil_inline.h>
+#include <cutil_math.h>
+
 #include "RayCaster.hpp"
 #include "RayCaster_kernels.h"
 #include "VolumeDescription.hpp"
@@ -142,16 +147,18 @@ RayCaster::render(float* buffer, size_t width, size_t height)
         return;
     }
 
-    //float aspectRatio = (float)width/(float)height;
-    volume_bind_array(volumeDescription_->image());
+    cudaTextureObject_t tex = createTexureObjectFromArray((cudaArray_t)volumeDescription_->image(), cudaReadModeNormalizedFloat);
+
     volume_viewport(viewport_[0], viewport_[1], viewport_[2], viewport_[3]);
-    volume_render(buffer,
+    volume_render(tex,buffer,
                   width,
                   height,
                   invModelView_,
                   sampleDistance_,
                   rayIntensity_,
                   cutoff_);
+
+    cudaDestroyTextureObject(tex);
 }
 
 /*
diff --git a/libautoscoper/src/gpu/cuda/RayCaster_kernels.cu b/libautoscoper/src/gpu/cuda/RayCaster_kernels.cu
index e87852d4..0d188992 100644
--- a/libautoscoper/src/gpu/cuda/RayCaster_kernels.cu
+++ b/libautoscoper/src/gpu/cuda/RayCaster_kernels.cu
@@ -58,13 +58,11 @@ struct float3x4
 // Forward declarations
 
 __global__
-void cuda_volume_render_kernel(float* output, size_t width, size_t height,
+void cuda_volume_render_kernel(cudaTextureObject_t tex,float* output, size_t width, size_t height,
                                float step, float intensity, float cutoff);
 
 // Global variables
 
-static texture<unsigned short, 3, cudaReadModeNormalizedFloat> tex;
-
 static __constant__ float4 d_viewport;
 static __constant__ float3x4 d_invModelView;
 
@@ -74,25 +72,13 @@ namespace xromm
 namespace gpu
 {
 
-void volume_bind_array(const cudaArray* array)
-{
-    // Setup 3D texture.
-    tex.normalized = true;
-    tex.filterMode = cudaFilterModeLinear;
-    tex.addressMode[0] = cudaAddressModeClamp;
-    tex.addressMode[1] = cudaAddressModeClamp;
-
-    // Bind array to 3D texture.
-    cutilSafeCall(cudaBindTextureToArray(tex, array));
-}
-
 void volume_viewport(float x, float y, float width, float height)
 {
     float4 viewport = make_float4(x, y, width, height);
     cutilSafeCall(cudaMemcpyToSymbol(d_viewport, &viewport, sizeof(float4)));
 }
 
-void volume_render(float* buffer, size_t width, size_t height,
+void volume_render(cudaTextureObject_t tex,float* buffer, size_t width, size_t height,
                    const float* invModelView, float step, float intensity,
                    float cutoff)
 {
@@ -103,11 +89,11 @@ void volume_render(float* buffer, size_t width, size_t height,
 
     // Calculate the block and grid sizes.
     dim3 blockDim(32, 32);
-    dim3 gridDim((width+blockDim.x-1)/blockDim.x,
-                 (height+blockDim.y-1)/blockDim.y);
+    dim3 gridDim(((unsigned int)width+blockDim.x-1)/blockDim.x,
+                 ((unsigned int)height+blockDim.y-1)/blockDim.y);
 
     // Call the kernel
-    cuda_volume_render_kernel<<<gridDim, blockDim>>>(buffer, width, height,
+    cuda_volume_render_kernel<<<gridDim, blockDim>>>(tex,buffer, width, height,
                                                 step, intensity, cutoff);
 
     //This crashes it under windows
@@ -168,7 +154,7 @@ float4 mul(const float3x4 &M, const float4 &v)
 
 // Render the volume using ray marching.
 __global__
-void cuda_volume_render_kernel(float* buffer, size_t width, size_t height,
+void cuda_volume_render_kernel(cudaTextureObject_t tex,float* buffer, size_t width, size_t height,
                                float step, float intensity, float cutoff)
 {
   uint x = blockIdx.x*blockDim.x+threadIdx.x;
@@ -208,7 +194,7 @@ void cuda_volume_render_kernel(float* buffer, size_t width, size_t height,
     float density = 0.0f;
     while (t > _near) {
         float3 point = ray.origin+t*ray.direction;
-        float sample = tex3D(tex, point.x, 1.0f-point.y, -point.z);
+        float sample = tex3D<float>(tex, point.x, 1.0f-point.y, -point.z);
         density += sample > cutoff? step*sample: 0.0f;
         t -= 1.0f;
     }
diff --git a/libautoscoper/src/gpu/cuda/RayCaster_kernels.h b/libautoscoper/src/gpu/cuda/RayCaster_kernels.h
index de04032b..e87349e5 100644
--- a/libautoscoper/src/gpu/cuda/RayCaster_kernels.h
+++ b/libautoscoper/src/gpu/cuda/RayCaster_kernels.h
@@ -50,11 +50,9 @@ namespace xromm
 namespace gpu
 {
 
-void volume_bind_array(const cudaArray* array);
-
 void volume_viewport(float x, float y, float width, float height);
 
-void volume_render(float* buffer,
+void volume_render(cudaTextureObject_t tex, float* buffer,
                    size_t width,
                    size_t height,
                    const float* invModelViewMat,
diff --git a/libautoscoper/src/gpu/cuda/cutil/cutil_create_tex_obj.h b/libautoscoper/src/gpu/cuda/cutil/cutil_create_tex_obj.h
new file mode 100644
index 00000000..f9c35dd7
--- /dev/null
+++ b/libautoscoper/src/gpu/cuda/cutil/cutil_create_tex_obj.h
@@ -0,0 +1,26 @@
+#pragma once
+#include <cuda.h>
+#include <cutil_inline.h>
+#include <cutil_inline_runtime.h>
+
+inline cudaTextureObject_t createTexureObjectFromArray(cudaArray* arr, cudaTextureReadMode readMode) {
+    // Approach implemented below is based off of
+    // https://developer.nvidia.com/blog/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
+
+    cudaResourceDesc resDesc;
+    memset(&resDesc, 0, sizeof(resDesc));
+    resDesc.resType = cudaResourceTypeArray;
+    resDesc.res.array.array = arr;
+
+    cudaTextureDesc texDesc;
+    memset(&texDesc, 0, sizeof(texDesc));
+    texDesc.normalizedCoords = true;
+    texDesc.filterMode = cudaFilterModeLinear;
+    texDesc.addressMode[0] = cudaAddressModeClamp;
+    texDesc.addressMode[1] = cudaAddressModeClamp;
+    texDesc.readMode = readMode;
+
+    cudaTextureObject_t tex = 0;
+    cutilSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
+    return tex;
+}