From 19974332df356fe1135bb3b2bc6adf919990ae78 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Fri, 9 Mar 2018 18:00:43 -0800
Subject: [PATCH] CUDA support for ImageBufAlgo (experimental and very
 incomplete)

First stab at this, it's experimental, the general organization may change
as we extend it.

* To get these features, you must build with `USE_CUDA=1`, in which case
  it will look for Cuda toolkit. For simplicity, we're setting a version
  floor of Cuda 7.0 and sm_30.

* To enable at runtime (duh, still only if you built with Cuda support
  enabled), you can either set `OIIO::attribute("cuda",1)` or use the
  magic environment variable `OPENIMAGEIO_CUDA=1`. When running oiiotool,
  the command line argument `--cuda` turns the attribut on (or cheat with
  the aforementioned env variable).

* When the attribute is set, ImageBuf of "local" (not ImageCache-backed)
  float (no other data types yet) buffers will allocate and free with
  cudaMallocManaged/cudaFree (other cases will use the usual malloc/free).
  We are thus heavily leveraging Unified Memory, never do any explicit
  copying of data back and forth.

* Certain ImageBufAlgo functions, then, have the options of calling
  Cuda implementations when all the stars align -- Cuda support enabled,
  Cuda turned on, the ImageBufs in question all have local storage that
  was allocated as visible to Cuda, the buffers are all float, and other
  restrctions to just the most common cases (all image inputs have
  identical ROIs, etc.).

* Implemented this for IBA::add() and sub() initially. Will extend to
  other operations in the future and as the need arises.

Results and discussion:

Perf: add and sub operations on 1920x1080 3 channel float images, on my
workstation (16 core Xeon Silver 4110, it's ISA is AVX-512 but I'm only
compiling for SSE4.2 support at the moment) runs in about 20ms single
threaded, ~3.8ms multithreaded. With Cuda enabled (NVIDIA Quadro P5000,
Pascal architecture), I am getting about 12ms (i.e., moderately faster
than single core, quite a bit slower than fully using all the CPU
cores).

Now, this is not an especially good case for GPU -- the compute-to-memory
ratio is very poor, just a single math op for every 12 bytes of transfer
on or off the GPU. When I contrive to do an example with about 10x more
math per pixel, the Cuda times are approximately equal to the CPU times
when I take advantage of all the CPU cores. Maybe it only helps if we
do a bunch of IBA operations in a row before needing the results. Maybe
it's only worth Cuda-accelerating the most expensive operations (resize,
area ops, etc.), but we'll never get gain from something simple like add?

If anybody can point out ways in which I'm being very wasteful, please do
let me know!

Even after we flesh out many more image operations to be
Cuda-accelerated, and even we see an improvement in all cases over CPU,
I don't expect people to see much practical improvement in a typical
oiiotool command line, since disk/network to read input images and write
results are almost certain to dominate runtime, compared to the
math. But if you have a program that's doing a whole bunch of repeated
image math via IBA calls themselves, that's where the bigger payoff is
going to be, I think.

Note that CUDA is extremely finicky about what compilers it can use,
with an especially narrow idea of which "host compiler" is required by
each version of the Cuda Toolkit/nvcc. I'm still working through those
issues, and am considering the merits of compiling the cuda itself with
clang (if available) rather than nvcc, just to ease up on these
requirements. We'll be making the rest of the build issues more robust
over time as well.
---
 Makefile                                   |   5 +
 site/spi/Makefile-bits-arnold              |   5 +
 src/cmake/compiler.cmake                   |   2 +
 src/cmake/externalpackages.cmake           |  24 +++
 src/include/OpenImageIO/imagebuf.h         |   3 +
 src/include/OpenImageIO/platform.h         |   2 +
 src/libOpenImageIO/CMakeLists.txt          |  26 ++-
 src/libOpenImageIO/imagebuf.cpp            |  76 +++++++--
 src/libOpenImageIO/imagebufalgo_addsub.cpp |  18 +++
 src/libOpenImageIO/imagebufalgo_cuda.cu    | 110 +++++++++++++
 src/libOpenImageIO/imagebufalgo_cuda.h     |  45 ++++++
 src/libOpenImageIO/imageio.cpp             |  11 ++
 src/libOpenImageIO/imageio_cuda.cpp        | 174 +++++++++++++++++++++
 src/libOpenImageIO/imageio_pvt.h.in        |  16 ++
 src/oiiotool/oiiotool.cpp                  |  23 +++
 15 files changed, 521 insertions(+), 19 deletions(-)
 create mode 100644 src/libOpenImageIO/imagebufalgo_cuda.cu
 create mode 100644 src/libOpenImageIO/imagebufalgo_cuda.h
 create mode 100644 src/libOpenImageIO/imageio_cuda.cpp

diff --git a/Makefile b/Makefile
index 6b7185eea1..44ce82903b 100644
--- a/Makefile
+++ b/Makefile
@@ -223,6 +223,10 @@ ifneq (${BUILD_OIIOUTIL_ONLY},)
 MY_CMAKE_FLAGS += -DBUILD_OIIOUTIL_ONLY:BOOL=${BUILD_OIIOUTIL_ONLY}
 endif
 
+ifneq (${USE_CUDA},)
+MY_CMAKE_FLAGS += -DUSE_CUDA:BOOL=${USE_CUDA}
+endif
+
 ifdef DEBUG
 MY_CMAKE_FLAGS += -DCMAKE_BUILD_TYPE:STRING=Debug
 endif
@@ -484,6 +488,7 @@ help:
 	@echo "      USE_OPENCV=0             Skip anything that needs OpenCV"
 	@echo "      USE_PTEX=0               Skip anything that needs PTex"
 	@echo "      USE_FREETYPE=0           Skip anything that needs Freetype"
+	@echo "      USE_CUDA=1               Build NVIDIA CUDA support (if found)"
 	@echo "  OIIO build-time options:"
 	@echo "      INSTALL_PREFIX=path      Set installation prefix (default: ./${INSTALL_PREFIX_BRIEF})"
 	@echo "      NAMESPACE=name           Override namespace base name (default: OpenImageIO)"
diff --git a/site/spi/Makefile-bits-arnold b/site/spi/Makefile-bits-arnold
index 6429220085..f4ad1226c8 100644
--- a/site/spi/Makefile-bits-arnold
+++ b/site/spi/Makefile-bits-arnold
@@ -87,6 +87,11 @@ ifeq (${SP_OS}, rhel7)
            -DCMAKE_CXX_COMPILER=${LLVM_DIRECTORY}/bin/clang++
     endif
 
+    # CUDA customizations
+    MY_CMAKE_FLAGS += \
+	-DCUDA_TOOLKIT_ROOT_DIR=/net/soft_scratch/apps/arnold/tools/nvidia/cuda9.1 \
+	-DCUDA_HOST_COMPILER=/shots/spi/home/lib/arnold/rhel7/llvm_4.0_final/bin/clang++
+
     MY_CMAKE_FLAGS += \
 	-DOPENEXR_CUSTOM_INCLUDE_DIR=/usr/include/OpenEXR2 \
 	-DOPENEXR_CUSTOM_LIB_DIR=/usr/lib64/OpenEXR2 \
diff --git a/src/cmake/compiler.cmake b/src/cmake/compiler.cmake
index efec452b21..96f401c4bf 100644
--- a/src/cmake/compiler.cmake
+++ b/src/cmake/compiler.cmake
@@ -21,6 +21,8 @@ option (CLANG_TIDY "Enable clang-tidy" OFF)
 set (CLANG_TIDY_CHECKS "-*" CACHE STRING "clang-tidy checks to perform")
 set (CLANG_TIDY_ARGS "" CACHE STRING "clang-tidy args")
 option (CLANG_TIDY_FIX "Have clang-tidy fix source" OFF)
+set (USE_CUDA OFF CACHE BOOL "Use CUDA if found")
+set (CUDA_TARGET_ARCH "sm_30" CACHE STRING "CUDA GPU architecture (e.g. sm_35)")
 
 # Figure out which compiler we're using
 if (CMAKE_COMPILER_IS_GNUCC)
diff --git a/src/cmake/externalpackages.cmake b/src/cmake/externalpackages.cmake
index 9508f4d1b8..becd67dad6 100644
--- a/src/cmake/externalpackages.cmake
+++ b/src/cmake/externalpackages.cmake
@@ -4,6 +4,7 @@
 # When not in VERBOSE mode, try to make things as quiet as possible
 if (NOT VERBOSE)
     set (Boost_FIND_QUIETLY true)
+    set (CUDA_FIND_QUIETLY true)
     set (DCMTK_FIND_QUIETLY true)
     set (FFmpeg_FIND_QUIETLY true)
     set (Field3D_FIND_QUIETLY true)
@@ -557,4 +558,27 @@ endmacro()
 
 
 ###########################################################################
+if (USE_CUDA)
+    if (NOT CUDA_TOOLKIT_ROOT_DIR AND NOT $ENV{CUDA_TOOLKIT_ROOT_DIR} STREQUAL "")
+        set (CUDA_TOOLKIT_ROOT_DIR $ENV{CUDA_TOOLKIT_ROOT_DIR})
+    endif ()
+    if (NOT CUDA_FIND_QUIETLY)
+        message (STATUS "CUDA_TOOLKIT_ROOT_DIR = ${CUDA_TOOLKIT_ROOT_DIR}")
+    endif ()
+    set (CUDA_PROPAGATE_HOST_FLAGS ON)
+    set (CUDA_VERBOSE_BUILD ${VERBOSE})
+    find_package (CUDA 7.0 REQUIRED)
+    list (APPEND CUDA_NVCC_FLAGS ${CSTD_FLAGS} -expt-relaxed-constexpr)
+    set (CUDA_INCLUDE_DIR ${CUDA_TOOLKIT_ROOT_DIR}/include)
+    message (STATUS "CUDA version = ${CUDA_VERSION}")
+    if (NOT CUDA_FIND_QUIETLY)
+        message (STATUS "CUDA includes  = ${CUDA_INCLUDE_DIR}")
+        message (STATUS "CUDA libraries = ${CUDA_LIBRARIES}")
+        message (STATUS "CUDA host compiler = ${CUDA_HOST_COMPILER}")
+        message (STATUS "CUDA nvcc flags = ${CUDA_NVCC_FLAGS}")
+    endif ()
+endif ()
+
+# end Cuda
+###########################################################################
 
diff --git a/src/include/OpenImageIO/imagebuf.h b/src/include/OpenImageIO/imagebuf.h
index 0ef29610a5..ac66825d81 100644
--- a/src/include/OpenImageIO/imagebuf.h
+++ b/src/include/OpenImageIO/imagebuf.h
@@ -172,6 +172,9 @@ class OIIO_API ImageBuf {
     /// Which type of storage is being used for the pixels?
     IBStorage storage () const;
 
+    /// Is the pixel memory of this ImageBuf visible to Cuda?
+    bool cuda_storage () const;
+
     /// Is this ImageBuf object initialized?
     bool initialized () const;
 
diff --git a/src/include/OpenImageIO/platform.h b/src/include/OpenImageIO/platform.h
index 6e6987ba09..85516e315e 100644
--- a/src/include/OpenImageIO/platform.h
+++ b/src/include/OpenImageIO/platform.h
@@ -323,8 +323,10 @@
 // OIIO_HOSTDEVICE is used to supply the function decorators needed when
 // compiling for CUDA devices.
 #ifdef __CUDACC__
+#  define OIIO_HOST __host__
 #  define OIIO_HOSTDEVICE __host__ __device__
 #else
+#  define OIIO_HOST
 #  define OIIO_HOSTDEVICE
 #endif
 
diff --git a/src/libOpenImageIO/CMakeLists.txt b/src/libOpenImageIO/CMakeLists.txt
index cb08e763f8..3c82c88f23 100644
--- a/src/libOpenImageIO/CMakeLists.txt
+++ b/src/libOpenImageIO/CMakeLists.txt
@@ -5,6 +5,12 @@ file (TO_NATIVE_PATH "${PLUGIN_SEARCH_PATH}" PLUGIN_SEARCH_PATH_NATIVE)
 configure_file (imageio_pvt.h.in "${CMAKE_CURRENT_BINARY_DIR}/imageio_pvt.h" @ONLY)
 include_directories("${CMAKE_CURRENT_BINARY_DIR}")
 
+if (BUILDSTATIC)
+    set (OIIO_LIB_TYPE "STATIC")
+else ()
+    set (OIIO_LIB_TYPE "SHARED")
+endif ()
+
 file (GLOB libOpenImageIO_hdrs ../include/OpenImageIO/*.h)
 
 if (NOT USE_EXTERNAL_PUGIXML)
@@ -48,6 +54,7 @@ set (libOpenImageIO_srcs
                           imageoutput.cpp iptc.cpp xmp.cpp
                           color_ocio.cpp
                           maketexture.cpp
+                          imageio_cuda.cpp
                           ../libutil/argparse.cpp
                           ../libutil/benchmark.cpp
                           ../libutil/errorhandler.cpp 
@@ -74,6 +81,13 @@ set (libOpenImageIO_srcs
                           ${libOpenImageIO_hdrs}
                          )
 
+#set (cuda_using_srcs imagebufalgo_addsub.cpp)
+
+if (USE_CUDA)
+    file (GLOB gpu_source_files "*.cu")
+    message (STATUS "Extra cuda files: ${gpu_source_files}")
+    list (APPEND libOpenImageIO_srcs ${gpu_source_files})
+endif ()
 
 # If the 'EMBEDPLUGINS' option is set, we want to compile the source for
 # all the plugins into libOpenImageIO.
@@ -101,10 +115,12 @@ endif ()
 source_group ("libutil"    REGULAR_EXPRESSION ".+/libutil/.+")
 source_group ("libtexture" REGULAR_EXPRESSION ".+/libtexture/.+")
 
-if (BUILDSTATIC)
-    add_library (OpenImageIO STATIC ${libOpenImageIO_srcs})
+if (USE_CUDA)
+    add_definitions ("-DOIIO_USE_CUDA=1")
+    cuda_add_library (OpenImageIO ${OIIO_LIB_TYPE} ${libOpenImageIO_srcs}
+                      OPTIONS -arch ${CUDA_TARGET_ARCH})
 else ()
-    add_library (OpenImageIO SHARED ${libOpenImageIO_srcs})
+    add_library (OpenImageIO ${OIIO_LIB_TYPE} ${libOpenImageIO_srcs})
 endif ()
 
 # if (SANITIZE AND ${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
@@ -207,7 +223,9 @@ if (EXTRA_DSO_LINK_ARGS)
     set_target_properties (OpenImageIO PROPERTIES LINK_FLAGS ${EXTRA_DSO_LINK_ARGS})
 endif()
 
-oiio_install_targets (OpenImageIO)
+oiio_install_targets (OpenImageIO 
+#${OIIO_CUDA_LIB}
+)
 
 
 # Testing
diff --git a/src/libOpenImageIO/imagebuf.cpp b/src/libOpenImageIO/imagebuf.cpp
index 0dbafa6456..97be4a1f81 100644
--- a/src/libOpenImageIO/imagebuf.cpp
+++ b/src/libOpenImageIO/imagebuf.cpp
@@ -50,6 +50,8 @@
 #include <OpenImageIO/fmath.h>
 #include <OpenImageIO/thread.h>
 #include <OpenImageIO/simd.h>
+#include "imageio_pvt.h"
+
 
 OIIO_NAMESPACE_BEGIN
 
@@ -143,6 +145,7 @@ class ImageBufImpl {
     void append_error (const std::string& message) const;
 
     ImageBuf::IBStorage storage () const { return m_storage; }
+    bool cuda_storage () const { return m_cuda_storage; }
 
     TypeDesc pixeltype () const {
         validate_spec ();
@@ -193,6 +196,7 @@ class ImageBufImpl {
                               m_current_subimage, m_current_miplevel);
     }
 
+    // Make sure the pixels are ready to read with an iterator.
     bool validate_pixels () const {
         if (m_pixels_valid)
             return true;
@@ -242,6 +246,21 @@ class ImageBufImpl {
         return (z * m_spec.height + y) * m_spec.width + x;
     }
 
+    void release_pixels () {
+        IB_local_mem_current -= m_allocated_size;
+#if OIIO_USE_CUDA
+        if (m_cuda_storage) {
+            OIIO::debug ("IB Cuda free %p\n", (void*)m_pixels.get());
+            OIIO::pvt::cuda_free (m_pixels.release());
+            m_cuda_storage = false;
+        }
+#endif
+        m_pixels.reset();
+        m_localpixels = nullptr;
+        m_pixels_valid = false;
+        m_allocated_size = 0;
+    }
+
 private:
     ImageBuf::IBStorage m_storage; ///< Pixel storage class
     ustring m_name;              ///< Filename of the image
@@ -258,6 +277,7 @@ class ImageBufImpl {
     mutable spin_mutex m_valid_mutex;
     mutable bool m_spec_valid;   ///< Is the spec valid
     mutable bool m_pixels_valid; ///< Image is valid
+    mutable bool m_cuda_storage = false; ///< Is the pixel memory visible to Cuda?
     bool m_badfile;              ///< File not found
     float m_pixelaspect;         ///< Pixel aspect ratio of the image
     size_t m_pixel_bytes;
@@ -358,23 +378,23 @@ ImageBufImpl::ImageBufImpl (const ImageBufImpl &src)
 {
     m_spec_valid = src.m_spec_valid;
     m_pixels_valid = src.m_pixels_valid;
-    m_allocated_size = src.m_localpixels ? src.spec().image_bytes() : 0;
+    m_allocated_size = 0;
     IB_local_mem_current += m_allocated_size;
     if (src.m_localpixels) {
         // Source had the image fully in memory (no cache)
         if (m_storage == ImageBuf::APPBUFFER) {
             // Source just wrapped the client app's pixels, we do the same
+            m_allocated_size = src.m_localpixels ? src.spec().image_bytes() : 0;
             m_localpixels = src.m_localpixels;
         } else {
             // We own our pixels -- copy from source
-            m_pixels.reset (new char [src.m_spec.image_bytes()]);
+            realloc ();
             memcpy (m_pixels.get(), src.m_pixels.get(), m_spec.image_bytes());
-            m_localpixels = m_pixels.get();
         }
     } else {
         // Source was cache-based or deep
         // nothing else to do
-        m_localpixels = NULL;
+        m_localpixels = nullptr;
     }
     if (src.m_configspec)
         m_configspec.reset (new ImageSpec(*src.m_configspec));
@@ -388,7 +408,7 @@ ImageBufImpl::~ImageBufImpl ()
     // externally and passed to the ImageBuf ctr or reset() method, or
     // else init_spec requested the system-wide shared cache, which
     // does not need to be destroyed.
-    IB_local_mem_current -= m_allocated_size;
+    release_pixels ();
 }
 
 
@@ -513,6 +533,14 @@ ImageBuf::storage () const
 
 
 
+bool
+ImageBuf::cuda_storage () const
+{
+    return impl()->cuda_storage ();
+}
+
+
+
 void
 ImageBufImpl::clear ()
 {
@@ -524,10 +552,8 @@ ImageBufImpl::clear ()
     m_current_miplevel = -1;
     m_spec = ImageSpec ();
     m_nativespec = ImageSpec ();
-    m_pixels.reset ();
-    m_localpixels = NULL;
+    release_pixels ();
     m_spec_valid = false;
-    m_pixels_valid = false;
     m_badfile = false;
     m_pixelaspect = 1;
     m_pixel_bytes = 0;
@@ -629,20 +655,41 @@ ImageBuf::reset (const ImageSpec &spec)
 void
 ImageBufImpl::realloc ()
 {
-    IB_local_mem_current -= m_allocated_size;
+    release_pixels ();
     m_allocated_size = m_spec.deep ? size_t(0) : m_spec.image_bytes ();
     IB_local_mem_current += m_allocated_size;
-    m_pixels.reset (m_allocated_size ? new char [m_allocated_size] : NULL);
-    m_localpixels = m_pixels.get();
-    m_storage = m_allocated_size ? ImageBuf::LOCALBUFFER : ImageBuf::UNINITIALIZED;
+    m_cuda_storage = false;
+    if (m_allocated_size) {
+#ifdef OIIO_USE_CUDA
+        if (OIIO::get_int_attribute("cuda") && m_spec.format == TypeFloat) {
+            char *cudaptr = (char *)OIIO::pvt::cuda_malloc (m_allocated_size);
+            if (cudaptr) {
+                OIIO::debug ("IB Cuda allocated %p\n", (void*)cudaptr);
+                m_pixels.reset (cudaptr);
+                m_cuda_storage = true;
+            }
+            else {
+                OIIO::debug ("Requested cudaMallocManaged of %s FAILED\n",
+                             m_allocated_size);
+            }
+        }
+#endif
+        if (! m_pixels)   // no cuda, or cuda failed
+            m_pixels.reset (new char [m_allocated_size]);
+        m_pixels_valid = true;
+        m_storage = ImageBuf::LOCALBUFFER;
+        m_localpixels = m_pixels.get();
+    } else {
+        m_pixels_valid = false;
+        m_storage = ImageBuf::UNINITIALIZED;
+        m_localpixels = nullptr;
+    }
     m_pixel_bytes = m_spec.pixel_bytes();
     m_scanline_bytes = m_spec.scanline_bytes();
     m_plane_bytes = clamped_mult64 (m_scanline_bytes, (imagesize_t)m_spec.height);
     m_channel_bytes = m_spec.format.size();
     m_blackpixel.resize (round_to_multiple (m_pixel_bytes, OIIO_SIMD_MAX_SIZE_BYTES), 0);
     // NB make it big enough for SSE
-    if (m_allocated_size)
-        m_pixels_valid = true;
     if (m_spec.deep) {
         m_deepdata.init (m_spec);
         m_storage = ImageBuf::LOCALBUFFER;
@@ -2372,5 +2419,4 @@ ImageBuf::retile (int x, int y, int z, ImageCache::Tile* &tile,
 }
 
 
-
 OIIO_NAMESPACE_END
diff --git a/src/libOpenImageIO/imagebufalgo_addsub.cpp b/src/libOpenImageIO/imagebufalgo_addsub.cpp
index fdf7067fa5..fbd88ad10c 100644
--- a/src/libOpenImageIO/imagebufalgo_addsub.cpp
+++ b/src/libOpenImageIO/imagebufalgo_addsub.cpp
@@ -44,6 +44,7 @@
 #include <OpenImageIO/deepdata.h>
 #include <OpenImageIO/dassert.h>
 #include "imageio_pvt.h"
+#include "imagebufalgo_cuda.h"
 
 
 
@@ -119,6 +120,15 @@ ImageBufAlgo::add (ImageBuf &dst, const ImageBuf &A, const ImageBuf &B,
         return false;
     ROI origroi = roi;
     roi.chend = std::min (roi.chend, std::min (A.nchannels(), B.nchannels()));
+
+#ifdef OIIO_USE_CUDA
+    if (dst.cuda_storage() && A.cuda_storage() && B.cuda_storage() &&
+            dst.roi() == roi && A.roi() == roi && B.roi() == roi) {
+        return pvt::add_impl_cuda (dst, A, B, dst.roi());
+    }
+// make >/dev/null && OPENIMAGEIO_LOG_TIMES=2 oiiotool -cuda -frames 1-1 -pattern fill:topleft=0,0,0:topright=0.5,0,0:bottomleft=0,0.5,0:bottomright=0.5,0.5,0.5 1920x1080 3 -pattern fill:topleft=0.5,0,0:topright=0,0.5,0:bottomleft=0.5,0.5,0.5:bottomright=0,.5,.5 1920x1080 3 -add -o out.exr
+#endif
+
     bool ok;
     OIIO_DISPATCH_COMMON_TYPES3 (ok, "add", add_impl, dst.spec().format,
                                  A.spec().format, B.spec().format,
@@ -211,6 +221,14 @@ ImageBufAlgo::sub (ImageBuf &dst, const ImageBuf &A, const ImageBuf &B,
         return false;
     ROI origroi = roi;
     roi.chend = std::min (roi.chend, std::min (A.nchannels(), B.nchannels()));
+
+#ifdef OIIO_USE_CUDA
+    if (dst.cuda_storage() && A.cuda_storage() && B.cuda_storage() &&
+            dst.roi() == roi && A.roi() == roi && B.roi() == roi) {
+        return pvt::sub_impl_cuda (dst, A, B, dst.roi());
+    }
+#endif
+
     bool ok;
     OIIO_DISPATCH_COMMON_TYPES3 (ok, "sub", sub_impl, dst.spec().format,
                                  A.spec().format, B.spec().format,
diff --git a/src/libOpenImageIO/imagebufalgo_cuda.cu b/src/libOpenImageIO/imagebufalgo_cuda.cu
new file mode 100644
index 0000000000..a474230d3c
--- /dev/null
+++ b/src/libOpenImageIO/imagebufalgo_cuda.cu
@@ -0,0 +1,110 @@
+/*
+  Copyright 2018 Larry Gritz and the other authors and contributors.
+  All Rights Reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of the software's owners nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  (This is the Modified BSD License)
+*/
+
+#include <OpenImageIO/imagebuf.h>
+#include <OpenImageIO/timer.h>
+#include "imagebufalgo_cuda.h"
+
+
+OIIO_NAMESPACE_BEGIN
+namespace pvt {
+
+
+__global__
+void add_cuda (float *R, const float *A, const float *B, ROI roi)
+{
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int stride = blockDim.x * gridDim.x;
+    int n = int(roi.npixels());
+    int nc = roi.nchannels();
+    for (int p = index; p < n; p += stride) {
+        int i = p*nc;
+        for (int c = roi.chbegin; c < roi.chend; ++c)
+            R[i+c] = A[i+c] + B[i+c];
+    }
+}
+
+
+
+bool
+add_impl_cuda (ImageBuf &R, const ImageBuf &A, const ImageBuf &B,
+               ROI roi)
+{
+    Timer timer;
+    int blockSize = 1024;
+    int numBlocks = (int(roi.npixels()) + blockSize - 1) / blockSize;
+    add_cuda<<<numBlocks, blockSize>>>((float *)R.localpixels(),
+                                       (const float *)A.localpixels(),
+                                       (const float *)B.localpixels(), roi);
+    cudaDeviceSynchronize();
+    OIIO::debug ("Running cuda ImageBufAlgo::add, %d blocks of %d: %gms\n",
+                 numBlocks, blockSize, timer()*1000.0f);
+    return true;
+}
+
+
+
+
+__global__
+void sub_cuda (float *R, const float *A, const float *B, ROI roi)
+{
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int stride = blockDim.x * gridDim.x;
+    int n = int(roi.npixels());
+    int nc = roi.nchannels();
+    for (int p = index; p < n; p += stride) {
+        int i = p*nc;
+        for (int c = roi.chbegin; c < roi.chend; ++c)
+            R[i+c] = A[i+c] - B[i+c];
+    }
+}
+
+
+
+bool
+sub_impl_cuda (ImageBuf &R, const ImageBuf &A, const ImageBuf &B,
+               ROI roi)
+{
+    Timer timer;
+    int blockSize = 1024;
+    int numBlocks = (int(roi.npixels()) + blockSize - 1) / blockSize;
+    sub_cuda<<<numBlocks, blockSize>>>((float *)R.localpixels(),
+                                       (const float *)A.localpixels(),
+                                       (const float *)B.localpixels(), roi);
+    cudaDeviceSynchronize();
+    OIIO::debug ("Running cuda ImageBufAlgo::sub, %d blocks of %d: %gms\n",
+                 numBlocks, blockSize, timer()*1000.0f);
+    return true;
+}
+
+
+}  // end namespace pvt
+OIIO_NAMESPACE_END
+
diff --git a/src/libOpenImageIO/imagebufalgo_cuda.h b/src/libOpenImageIO/imagebufalgo_cuda.h
new file mode 100644
index 0000000000..e071b76bef
--- /dev/null
+++ b/src/libOpenImageIO/imagebufalgo_cuda.h
@@ -0,0 +1,45 @@
+/*
+  Copyright 2018 Larry Gritz and the other authors and contributors.
+  All Rights Reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of the software's owners nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  (This is the Modified BSD License)
+*/
+
+#include <OpenImageIO/oiioversion.h>
+
+
+
+OIIO_NAMESPACE_BEGIN
+namespace pvt {
+
+
+bool add_impl_cuda (ImageBuf &R, const ImageBuf &A, const ImageBuf &B, ROI roi);
+bool sub_impl_cuda (ImageBuf &R, const ImageBuf &A, const ImageBuf &B, ROI roi);
+
+
+}  // end namespace pvt
+OIIO_NAMESPACE_END
+
diff --git a/src/libOpenImageIO/imageio.cpp b/src/libOpenImageIO/imageio.cpp
index c3fc9ca49c..1c0b6bb635 100644
--- a/src/libOpenImageIO/imageio.cpp
+++ b/src/libOpenImageIO/imageio.cpp
@@ -74,6 +74,7 @@ std::string output_format_list;  // comma-separated list of writeable formats
 std::string extension_list;   // list of all extensions for all formats
 std::string library_list;   // list of all libraries for all formats
 int oiio_log_times = Strutil::from_string<int>(Sysutil::getenv("OPENIMAGEIO_LOG_TIMES"));
+atomic_int use_cuda (Strutil::from_string<int>(Sysutil::getenv("OPENIMAGEIO_CUDA")));
 }
 
 using namespace pvt;
@@ -282,6 +283,11 @@ attribute (string_view name, TypeDesc type, const void *val)
         default_thread_pool()->resize (ot-1);
         return true;
     }
+    if (name == "cuda" && type == TypeDesc::TypeInt) {
+        use_cuda = (*(const int *)val);
+        return true;
+    }
+
     spin_lock lock (attrib_mutex);
     if (name == "read_chunk" && type == TypeInt) {
         oiio_read_chunk = *(const int *)val;
@@ -323,6 +329,11 @@ getattribute (string_view name, TypeDesc type, void *val)
         *(int *)val = oiio_threads;
         return true;
     }
+    if (name == "cuda" && type == TypeDesc::TypeInt) {
+        *(int *)val = openimageio_cuda();
+        return true;
+    }
+
     spin_lock lock (attrib_mutex);
     if (name == "read_chunk" && type == TypeInt) {
         *(int *)val = oiio_read_chunk;
diff --git a/src/libOpenImageIO/imageio_cuda.cpp b/src/libOpenImageIO/imageio_cuda.cpp
new file mode 100644
index 0000000000..413097da29
--- /dev/null
+++ b/src/libOpenImageIO/imageio_cuda.cpp
@@ -0,0 +1,174 @@
+/*
+  Copyright 2018 Larry Gritz and the other authors and contributors.
+  All Rights Reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of the software's owners nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  (This is the Modified BSD License)
+*/
+
+#include <cstdlib>
+#include <cstdio>
+#include <mutex>
+
+#ifdef OIIO_USE_CUDA
+// #include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+
+#include <OpenImageIO/imageio.h>
+#include <OpenImageIO/strutil.h>
+#include <OpenImageIO/thread.h>
+#include "imageio_pvt.h"
+
+
+OIIO_NAMESPACE_BEGIN
+
+// Global private data
+namespace pvt {
+
+spin_mutex cuda_mutex;
+bool cuda_supported = false;
+std::string cuda_device_name;
+int cuda_driver_version = 0;
+int cuda_runtime_version = 0;
+int cuda_compatibility = 0;
+size_t cuda_total_memory = 0;
+
+
+
+#ifdef OIIO_USE_CUDA
+
+// This will output the proper CUDA error strings in the event that a
+// CUDA host call returns an error
+#define checkCudaErrors(err)  __checkCudaErrors (err, __FILE__, __LINE__)
+
+inline bool __checkCudaErrors(cudaError_t err, const char *file, const int line)
+{
+    if (cudaSuccess != err) {
+        Strutil::fprintf (stderr, "Cuda error %d (%s) at %s:%d\n",
+                          (int)err, cudaGetErrorString(err), file, line);
+    }
+    return true;
+    return (err == cudaSuccess);
+}
+
+
+
+static void
+initialize_cuda ()
+{
+    // Environment OPENIMAGEIO_CUDA=0 trumps everything else, turns off
+    // Cuda functionality.
+    const char *env = getenv ("OPENIMAGEIO_CUDA");
+    if (env && strtol(env,NULL,10) == 0)
+        return;
+
+    // if (! checkCudaErrors (cuInit (0)))
+    //     return;
+
+    // Get number of devices supporting CUDA
+    int deviceCount = 0;
+    if (! checkCudaErrors (cudaGetDeviceCount(&deviceCount))) {
+        return;
+    }
+
+    OIIO::debug ("Number of Cuda devices: %d\n", deviceCount);
+#if 0
+    for (int dev = 0; dev < deviceCount; ++dev) {
+        CUdevice device;
+        cudaGetDevice (&device, dev);
+        cudaSetDevice(dev);
+        cudaDeviceProp deviceProp;
+        cudaGetDeviceProperties(&deviceProp, dev);
+        cuda_device_name = deviceProp.name;
+        cuDriverGetVersion (&cuda_driver_version);
+        cudaRuntimeGetVersion (&cuda_runtime_version);
+        cuda_compatibility = 100 * deviceProp.major + deviceProp.minor;
+        cuda_total_memory = deviceProp.totalGlobalMem;
+        OIIO::debug ("Cuda device \"%s\": driver %s, runtime %s, Cuda compat %s\n",
+                     cuda_device_name, cuda_driver_version,
+                     cuda_runtime_version, cuda_compatibility);
+        OIIO::debug (" total mem %g MB\n", cuda_total_memory/(1024.0*1024.0));
+        break;  // only inventory the first Cuda device. FIXME? 
+    }
+#endif
+    cuda_supported = true;
+}
+
+#endif /* defined(OIIO_USE_CUDA) */
+
+
+
+bool
+openimageio_cuda ()
+{
+    if (! use_cuda)
+        return false;
+#ifdef OIIO_USE_CUDA
+    static std::once_flag cuda_initialized;
+    std::call_once (cuda_initialized, initialize_cuda);
+#endif
+    return cuda_supported;
+}
+
+
+struct cuda_force_initializer {
+    cuda_force_initializer() { (void) openimageio_cuda(); }
+};
+cuda_force_initializer init;
+
+
+
+void* cuda_malloc (size_t size)
+{
+#ifdef OIIO_USE_CUDA
+    if (use_cuda) {
+        char *cudaptr = nullptr;
+        checkCudaErrors (cudaMallocManaged (&cudaptr, size));
+        cudaDeviceSynchronize();
+        return cudaptr;
+    }
+#endif
+    return malloc (size);
+}
+
+
+
+void cuda_free (void *mem)
+{
+#ifdef OIIO_USE_CUDA
+    if (use_cuda) {
+        cudaDeviceSynchronize();
+        checkCudaErrors (cudaFree (mem));
+        return;
+    }
+#endif
+    return free (mem);
+}
+
+
+} // end namespace pvt
+
+OIIO_NAMESPACE_END
diff --git a/src/libOpenImageIO/imageio_pvt.h.in b/src/libOpenImageIO/imageio_pvt.h.in
index 5a5a02108a..8d84eaaab9 100644
--- a/src/libOpenImageIO/imageio_pvt.h.in
+++ b/src/libOpenImageIO/imageio_pvt.h.in
@@ -59,6 +59,14 @@ extern std::string extension_list;
 extern std::string library_list;
 extern int oiio_log_times;
 
+extern atomic_int use_cuda;
+extern bool cuda_supported;
+extern std::string cuda_device_name;
+extern int cuda_driver_version;
+extern int cuda_runtime_version;
+extern int cuda_compatibility;
+extern size_t cuda_total_memory;
+
 
 // For internal use - use error() below for a nicer interface.
 void seterror (string_view message);
@@ -74,6 +82,13 @@ inline void error (string_view fmt, const Args&... args) {
 // imageio_mutex is held.  For internal use only.
 void catalog_all_plugins (std::string searchpath);
 
+// Is Cuda available to OpenImageIO?
+bool openimageio_cuda ();
+
+/// Allocate unified Cuda/CPU memory
+void* cuda_malloc (size_t size);
+void cuda_free (void *mem);
+
 /// Given the format, set the default quantization range.
 void get_default_quantize (TypeDesc format,
                            long long &quant_min, long long &quant_max);
@@ -135,6 +150,7 @@ public:
     }
     void stop () { m_timer.stop(); }
     void rename (string_view name) { m_name = name; }
+    Timer& timer () { return m_timer; }
 private:
     Timer m_timer;
     std::string m_name;
diff --git a/src/oiiotool/oiiotool.cpp b/src/oiiotool/oiiotool.cpp
index f8572b24ff..052749d2ac 100644
--- a/src/oiiotool/oiiotool.cpp
+++ b/src/oiiotool/oiiotool.cpp
@@ -452,6 +452,27 @@ unset_autopremult (int argc, const char *argv[])
 }
 
 
+static int
+enable_cuda (int argc, const char *argv[])
+{
+    ASSERT (argc == 1);
+    OIIO::attribute ("cuda", 1);
+    int r = OIIO::get_int_attribute ("cuda");   // force initialization
+    if (ot.debug)
+        std::cout << "Enable_cuda: " << r << "\n";
+    return 0;
+}
+
+
+static int
+disable_cuda (int argc, const char *argv[])
+{
+    ASSERT (argc == 1);
+    OIIO::attribute ("cuda", 0);
+    return 0;
+}
+
+
 
 static int
 action_label (int argc, const char *argv[])
@@ -5105,6 +5126,8 @@ getargs (int argc, char *argv[])
                 "-a", &ot.allsubimages, "Do operations on all subimages/miplevels",
                 "--debug", &ot.debug, "Debug mode",
                 "--runstats", &ot.runstats, "Print runtime statistics",
+                "--cuda %@", &enable_cuda, NULL, "Use Cuda if available",
+                "--nocuda %@", &disable_cuda, NULL, "Don't use Cuda, even if available",
                 "--info %@", set_printinfo, NULL, "Print resolution and basic info on all inputs, detailed metadata if -v is also used (options: format=xml:verbose=1)",
                 "--echo %@ %s", do_echo, NULL, "Echo message to console (options: newline=0)",
                 "--metamatch %s", &ot.printinfo_metamatch,