diff --git a/dlib/cuda/cuda_dlib.cu b/dlib/cuda/cuda_dlib.cu
index 5d6ec4052c..921bbdbd35 100644
--- a/dlib/cuda/cuda_dlib.cu
+++ b/dlib/cuda/cuda_dlib.cu
@@ -9,6 +9,17 @@
 namespace dlib
 {
+    namespace
+    {
+        bool& use_cuda_impl (
+        )
+        {
+            thread_local bool var(cuda::is_available());
+            return var;
+        }
+
+    }
+
     namespace cuda
     {
@@ -18,14 +29,16 @@
             int dev
         )
        {
-            CHECK_CUDA(cudaSetDevice(dev));
+            if (is_available())
+                CHECK_CUDA(cudaSetDevice(dev));
        }

        int get_device (
        )
        {
-            int dev = 0;
-            CHECK_CUDA(cudaGetDevice(&dev));
+            int dev = -1;
+            if (is_available())
+                CHECK_CUDA(cudaGetDevice(&dev));
            return dev;
        }

@@ -44,6 +57,27 @@
             CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
         }

+        bool is_available(
+        )
+        {
+            int num_devices;
+            return cudaGetDeviceCount(&num_devices) == cudaSuccess && num_devices > 0;
+        }
+
+        bool use_cuda(
+        )
+        {
+            return use_cuda_impl();
+        }
+
+        void set_use_cuda(
+            bool flag
+        )
+        {
+            if (is_available())
+                use_cuda_impl() = flag;
+        }
+
         int get_num_devices (
         )
         {
diff --git a/dlib/cuda/cuda_dlib.h b/dlib/cuda/cuda_dlib.h
index 2f22b7e23e..289b2c6ec3 100644
--- a/dlib/cuda/cuda_dlib.h
+++ b/dlib/cuda/cuda_dlib.h
@@ -25,6 +25,16 @@ namespace dlib
         int get_num_devices (
         );

+        bool is_available (
+        );
+
+        bool use_cuda(
+        );
+
+        void set_use_cuda(
+            bool flag
+        );
+
         std::string get_device_name (
             int device
         );
@@ -892,6 +902,16 @@ namespace dlib
         inline int get_num_devices (
         ) { return 1; }

+        inline bool is_available (
+        ) { return false; }
+
+        inline bool use_cuda(
+        ) { return false; }
+
+        inline void set_use_cuda(
+            bool flag
+        ) {}
+
         inline std::string get_device_name (
             int device
         )
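Usage note: the new runtime switch above is meant to be driven from user code roughly as follows. This is a minimal sketch, not part of the patch; it assumes the declarations land in dlib::cuda and are visible through <dlib/dnn.h>:

    #include <dlib/dnn.h>
    #include <iostream>

    int main()
    {
        // True when the process can see at least one CUDA device, independent of the flag.
        std::cout << "CUDA available: " << dlib::cuda::is_available() << '\n';

        // The flag defaults to is_available() and is thread local.  Turning it off forces
        // the CPU code paths for the calling thread only.
        dlib::cuda::set_use_cuda(false);
        std::cout << "use_cuda: " << dlib::cuda::use_cuda() << '\n';   // prints 0
    }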
diff --git a/dlib/cuda/curand_dlibapi.cpp b/dlib/cuda/curand_dlibapi.cpp
index 67828e6640..6f8b00006b 100644
--- a/dlib/cuda/curand_dlibapi.cpp
+++ b/dlib/cuda/curand_dlibapi.cpp
@@ -6,6 +6,7 @@
 #ifdef DLIB_USE_CUDA

 #include "curand_dlibapi.h"
+#include "cuda_dlib.h"
 #include <curand.h>
 #include "../string.h"
@@ -47,11 +48,14 @@ namespace dlib
             unsigned long long seed
         ) : handle(nullptr)
         {
-            curandGenerator_t gen;
-            CHECK_CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
-            handle = gen;
+            if (is_available())
+            {
+                curandGenerator_t gen;
+                CHECK_CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
+                handle = gen;

-            CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(gen, seed));
+                CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(gen, seed));
+            }
         }

         curand_generator::
diff --git a/dlib/cuda/gpu_data.cpp b/dlib/cuda/gpu_data.cpp
index 64f184aede..413c3996ee 100644
--- a/dlib/cuda/gpu_data.cpp
+++ b/dlib/cuda/gpu_data.cpp
@@ -58,10 +58,16 @@ namespace dlib
             if (dest_offset == 0 && num == dest.size())
             {
                 // copy the memory efficiently based on which copy is current in each object.
-                if (src.device_ready())
+                if (dest.device_id() >= 0 && src.device_ready())
                     CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice));
-                else
+                else if (dest.device_id() < 0 && src.device_ready())
+                    CHECK_CUDA(cudaMemcpy(dest.host_write_only(), src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToHost));
+                else if (dest.device_id() >= 0 && !src.device_ready())
                     CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice));
+                else if (dest.device_id() >= 0 || src.device_id() >= 0)
+                    CHECK_CUDA(cudaMemcpy(dest.host_write_only(), src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToHost));
+                else
+                    std::memcpy(dest.host_write_only(), src.host()+src_offset, num*sizeof(float));
             }
             else
             {
@@ -72,8 +78,11 @@ namespace dlib
                     CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToHost));
                 else if (dest.device_ready() && !src.device_ready())
                     CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice));
-                else
+                else if (dest.device_id() >= 0 || src.device_id() >= 0)
                     CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToHost));
+                else
+                    std::memcpy(dest.host()+dest_offset, src.host()+src_offset, num*sizeof(float));
+
             }
         }
     }
@@ -199,6 +208,13 @@ namespace dlib
         device_current = true;
         device_in_use = false;

+        if (!cuda::use_cuda())
+        {
+            data_host.reset(new float[new_size], std::default_delete<float[]>());
+            the_device_id = -1;
+            return;
+        }
+
         try
         {
             CHECK_CUDA(cudaGetDevice(&the_device_id));
diff --git a/dlib/cuda/gpu_data.h b/dlib/cuda/gpu_data.h
index 022a05f71c..4d37315a40 100644
--- a/dlib/cuda/gpu_data.h
+++ b/dlib/cuda/gpu_data.h
@@ -12,6 +12,14 @@
 namespace dlib
 {

+// ----------------------------------------------------------------------------------------
+
+    namespace cuda
+    {
+        bool use_cuda(
+        );
+    }
+
// ----------------------------------------------------------------------------------------

     class gpu_data
@@ -93,14 +101,16 @@ namespace dlib
         float* host()
         {
             copy_to_host();
-            device_current = false;
+            if (device_id() >= 0)
+                device_current = false;
             return data_host.get();
         }

         float* host_write_only()
         {
             host_current = true;
-            device_current = false;
+            if (device_id() >= 0)
+                device_current = false;
             return data_host.get();
         }
@@ -109,6 +119,7 @@ namespace dlib
 #ifndef DLIB_USE_CUDA
             DLIB_CASSERT(false, "CUDA NOT ENABLED");
 #endif
+            DLIB_CASSERT(device_id() >= 0, "This data is host only");
             copy_to_device();
             device_in_use = true;
             return data_device.get();
@@ -119,6 +130,7 @@ namespace dlib
 #ifndef DLIB_USE_CUDA
             DLIB_CASSERT(false, "CUDA NOT ENABLED");
 #endif
+            DLIB_CASSERT(device_id() >= 0, "This data is host only");
             copy_to_device();
             host_current = false;
             device_in_use = true;
@@ -130,6 +142,7 @@ namespace dlib
 #ifndef DLIB_USE_CUDA
             DLIB_CASSERT(false, "CUDA NOT ENABLED");
 #endif
+            DLIB_CASSERT(device_id() >= 0, "This data is host only");
             wait_for_transfer_to_finish();
             host_current = false;
             device_current = true;
@@ -141,7 +154,7 @@ namespace dlib
         ) const { return host_current; }

         bool device_ready (
-        ) const { return device_current && !have_active_transfer; }
+        ) const { return device_current && !have_active_transfer && device_id() >= 0; }

         size_t size() const { return data_size; }
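To illustrate what the gpu_data changes are after (again a sketch, not part of the patch): with the flag off, set_size() takes the new !cuda::use_cuda() branch, so the buffer is allocated host-only and device_id() reports -1, and copies between two such buffers fall through to std::memcpy instead of cudaMemcpy:

    #include <dlib/dnn.h>

    void host_only_copy()
    {
        dlib::cuda::set_use_cuda(false);   // this thread now allocates host-only buffers

        dlib::gpu_data a, b;
        a.set_size(16);                    // takes the !cuda::use_cuda() branch above
        b.set_size(16);
        a.host()[0] = 3.14f;               // touch the host copy

        dlib::memcpy(b, a);                // both device_id() == -1, so plain std::memcpy
        // Calling b.device() here would fire the new "This data is host only" assert.
    }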
diff --git a/dlib/cuda/tensor.h b/dlib/cuda/tensor.h
index 6a893df311..aaae4e836e 100644
--- a/dlib/cuda/tensor.h
+++ b/dlib/cuda/tensor.h
@@ -12,6 +12,18 @@
 #include <memory>
 #include "../any.h"

+#ifdef DLIB_USE_CUDA
+#define IF_DLIB_USE_CUDA(...) if (cuda::use_cuda()) { __VA_ARGS__ }
+#else
+#define IF_DLIB_USE_CUDA(...)
+#endif
+
+#ifdef DLIB_USE_CUDA
+#define IF_DLIB_NOT_USE_CUDA(...) if (!cuda::use_cuda()) { __VA_ARGS__ }
+#else
+#define IF_DLIB_NOT_USE_CUDA(...) __VA_ARGS__
+#endif
+
 namespace dlib
 {
@@ -77,17 +89,18 @@ namespace dlib

         tensor& operator= (float val)
         {
-#ifdef DLIB_USE_CUDA
-            // If you are using CUDA then presumably you will be mostly using tensors on
-            // the GPU.  So unless you seem to be actively working with the host side's
-            // data then we do this initialization on the device side since this avoids a
-            // host to device transfer that would likely immediately follow.
-            if (data().device_ready())
-            {
-                cuda::set_tensor(*this, val);
-                return *this;
-            }
-#endif
+            IF_DLIB_USE_CUDA(
+                // If you are using CUDA then presumably you will be mostly using tensors on
+                // the GPU.  So unless you seem to be actively working with the host side's
+                // data then we do this initialization on the device side since this avoids a
+                // host to device transfer that would likely immediately follow.
+                if (data().device_ready())
+                {
+                    cuda::set_tensor(*this, val);
+                    return *this;
+                }
+            )
+
             auto d = host_write_only();
             for (size_t i = 0; i < size(); ++i)
                 d[i] = val;
@@ -97,15 +110,16 @@ namespace dlib

         tensor& operator*= (float val)
         {
-#ifdef DLIB_USE_CUDA
-            cuda::scale_tensor(*this, val);
-            return *this;
-#else
-            for (auto& d : *this)
-                d *= val;
+            IF_DLIB_USE_CUDA(
+                cuda::scale_tensor(*this, val);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                for (auto& d : *this)
+                    d *= val;
+            )

             return *this;
-#endif
         }

         tensor& operator/= (float val)
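The two macros above carry the whole patch, so it is worth spelling out their expansion, using operator*= as the running example (a sketch): in a CUDA build both branches are compiled and picked per call at runtime; in a CPU-only build the CUDA branch disappears entirely and the CPU branch is emitted unguarded, so there is no runtime cost.

    // What the compiler sees for tensor::operator*= above:

    // 1) DLIB_USE_CUDA defined:
    if (cuda::use_cuda()) { cuda::scale_tensor(*this, val); }
    if (!cuda::use_cuda()) { for (auto& d : *this) d *= val; }
    return *this;

    // 2) DLIB_USE_CUDA not defined -- IF_DLIB_USE_CUDA(...) expands to nothing:
    for (auto& d : *this) d *= val;
    return *this;

Note that each branch is its own if statement, so a return inside IF_DLIB_USE_CUDA(...) only returns when the runtime flag is set; operator= above relies on exactly that to fall through to the host-side loop.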
diff --git a/dlib/cuda/tensor_tools.cpp b/dlib/cuda/tensor_tools.cpp
index 8dece8369f..ce3d323c92 100644
--- a/dlib/cuda/tensor_tools.cpp
+++ b/dlib/cuda/tensor_tools.cpp
@@ -49,11 +49,13 @@ namespace dlib { namespace tt
         const double eps
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::inverse_norms(invnorms, data, eps);
-#else
-        invnorms = reciprocal(sqrt(sum_cols(squared(mat(data))) + eps));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::inverse_norms(invnorms, data, eps);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            invnorms = reciprocal(sqrt(sum_cols(squared(mat(data))) + eps));
+        )
     }

     void dot_prods (
@@ -62,11 +64,13 @@ namespace dlib { namespace tt
         const tensor& rhs
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::dot_prods(out, lhs, rhs);
-#else
-        out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::dot_prods(out, lhs, rhs);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+        )
     }

     void dot_prods (
@@ -76,14 +80,16 @@ namespace dlib { namespace tt
         const tensor& rhs
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::dot_prods(add_to, out, lhs, rhs);
-#else
-        if (add_to)
-            out += sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
-        else
-            out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::dot_prods(add_to, out, lhs, rhs);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            if (add_to)
+                out += sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+            else
+                out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+        )
     }

     void scale_columns (
@@ -99,11 +105,13 @@ namespace dlib { namespace tt
         DLIB_CASSERT(m.size() != 0);
         DLIB_CASSERT(m.size()/m.num_samples() == v.size());

-#ifdef DLIB_USE_CUDA
-        cuda::scale_columns(out, m, v);
-#else
-        out = scale_columns(mat(m), mat(v));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::scale_columns(out, m, v);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            out = scale_columns(mat(m), mat(v));
+        )
     }

     void scale_rows (
@@ -119,11 +127,13 @@ namespace dlib { namespace tt
         DLIB_CASSERT(m.size() != 0);
         DLIB_CASSERT(m.num_samples() == static_cast<long>(v.size()));

-#ifdef DLIB_USE_CUDA
-        cuda::scale_rows(out, m, v);
-#else
-        out = scale_rows(mat(m), mat(v));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::scale_rows(out, m, v);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            out = scale_rows(mat(m), mat(v));
+        )
     }

     void scale_rows2 (
@@ -141,14 +151,16 @@ namespace dlib { namespace tt
         DLIB_CASSERT(is_vector(mat(v1)));
         DLIB_CASSERT(static_cast<long>(v1.size()) == m1.num_samples());

-#ifdef DLIB_USE_CUDA
-        cuda::scale_rows2(beta, out, m1, m2, v1, v2);
-#else
-        if (beta == 0)
-            out = scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2));
-        else
-            out = beta*mat(out) + scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::scale_rows2(beta, out, m1, m2, v1, v2);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            if (beta == 0)
+                out = scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2));
+            else
+                out = beta*mat(out) + scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2));
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -160,11 +172,13 @@ namespace dlib { namespace tt
     {
         DLIB_CASSERT(dest.size() == src.size());

-#ifdef DLIB_USE_CUDA
-        cuda::exp(dest,src);
-#else
-        dest = exp(mat(src));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::exp(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            dest = exp(mat(src));
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -176,11 +190,13 @@ namespace dlib { namespace tt
     {
         DLIB_CASSERT(dest.size() == src.size());

-#ifdef DLIB_USE_CUDA
-        cuda::log(dest,src);
-#else
-        dest = log(mat(src));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::log(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            dest = log(mat(src));
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -192,11 +208,13 @@ namespace dlib { namespace tt
     {
         DLIB_CASSERT(dest.size() == src.size());

-#ifdef DLIB_USE_CUDA
-        cuda::log10(dest,src);
-#else
-        dest = log10(mat(src));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::log10(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            dest = log10(mat(src));
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -212,97 +230,99 @@ namespace dlib { namespace tt
         operation_mode mode
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::gemm(beta, dest, alpha, lhs, trans_lhs, rhs, trans_rhs, mode);
-#else
-        if (mode == operation_mode::CHANNEL_WISE)
-        {
-            if (beta != 0)
+        IF_DLIB_USE_CUDA(
+            cuda::gemm(beta, dest, alpha, lhs, trans_lhs, rhs, trans_rhs, mode);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            if (mode == operation_mode::CHANNEL_WISE)
             {
-                if (trans_lhs && trans_rhs)
-                    dest = alpha * trans(mat(lhs)) * trans(mat(rhs)) + beta * mat(dest);
-                else if (!trans_lhs && trans_rhs)
-                    dest = alpha * mat(lhs) * trans(mat(rhs)) + beta * mat(dest);
-                else if (trans_lhs && !trans_rhs)
-                    dest = alpha * trans(mat(lhs)) * mat(rhs) + beta * mat(dest);
+                if (beta != 0)
+                {
+                    if (trans_lhs && trans_rhs)
+                        dest = alpha * trans(mat(lhs)) * trans(mat(rhs)) + beta * mat(dest);
+                    else if (!trans_lhs && trans_rhs)
+                        dest = alpha * mat(lhs) * trans(mat(rhs)) + beta * mat(dest);
+                    else if (trans_lhs && !trans_rhs)
+                        dest = alpha * trans(mat(lhs)) * mat(rhs) + beta * mat(dest);
+                    else
+                        dest = alpha * mat(lhs) * mat(rhs) + beta * mat(dest);
+                }
                 else
-                    dest = alpha * mat(lhs) * mat(rhs) + beta * mat(dest);
+                {
+                    if (trans_lhs && trans_rhs)
+                        dest = alpha * trans(mat(lhs)) * trans(mat(rhs));
+                    else if (!trans_lhs && trans_rhs)
+                        dest = alpha * mat(lhs) * trans(mat(rhs));
+                    else if (trans_lhs && !trans_rhs)
+                        dest = alpha * trans(mat(lhs)) * mat(rhs);
+                    else
+                        dest = alpha * mat(lhs) * mat(rhs);
+                }
             }
-            else
+            else if (mode == operation_mode::PLANE_WISE)
             {
-                if (trans_lhs && trans_rhs)
-                    dest = alpha * trans(mat(lhs)) * trans(mat(rhs));
-                else if (!trans_lhs && trans_rhs)
-                    dest = alpha * mat(lhs) * trans(mat(rhs));
-                else if (trans_lhs && !trans_rhs)
-                    dest = alpha * trans(mat(lhs)) * mat(rhs);
-                else
-                    dest = alpha * mat(lhs) * mat(rhs);
-            }
-        }
-        else if (mode == operation_mode::PLANE_WISE)
-        {
-            auto is_matrix = [](const auto& tensor) {
-                return ((tensor.num_samples() * tensor.k() == 1 && tensor.nr() * tensor.nc() > 1) ||
-                    (tensor.num_samples() * tensor.k() > 1 && tensor.nr() * tensor.nc() == 1));
-            };
+                auto is_matrix = [](const auto& tensor) {
+                    return ((tensor.num_samples() * tensor.k() == 1 && tensor.nr() * tensor.nc() > 1) ||
+                        (tensor.num_samples() * tensor.k() > 1 && tensor.nr() * tensor.nc() == 1));
+                };

-            long num_samples = std::min({ lhs.num_samples(), rhs.num_samples(), dest.num_samples() });
-            long num_channels = std::min({ lhs.k(), rhs.k(), dest.k() });
-            const bool lhs_is_matrix = is_matrix(lhs), rhs_is_matrix = is_matrix(rhs), dest_is_matrix = is_matrix(dest);
+                long num_samples = std::min({ lhs.num_samples(), rhs.num_samples(), dest.num_samples() });
+                long num_channels = std::min({ lhs.k(), rhs.k(), dest.k() });
+                const bool lhs_is_matrix = is_matrix(lhs), rhs_is_matrix = is_matrix(rhs), dest_is_matrix = is_matrix(dest);

-            if (lhs_is_matrix && rhs_is_matrix && dest_is_matrix) {
-                num_samples = num_channels = 1;
-            }
+                if (lhs_is_matrix && rhs_is_matrix && dest_is_matrix) {
+                    num_samples = num_channels = 1;
+                }

-            long lhs_rows = (lhs_is_matrix && lhs.num_samples() > 1) ? lhs.num_samples() : lhs.nr();
-            long lhs_cols = (lhs_is_matrix && lhs.k() > 1) ? lhs.k() : lhs.nc();
-            long rhs_rows = (rhs_is_matrix && rhs.num_samples() > 1) ? rhs.num_samples() : rhs.nr();
-            long rhs_cols = (rhs_is_matrix && rhs.k() > 1) ? rhs.k() : rhs.nc();
-            long dest_rows = (dest_is_matrix && dest.num_samples() > 1) ? dest.num_samples() : dest.nr();
-            long dest_cols = (dest_is_matrix && dest.k() > 1) ? dest.k() : dest.nc();
+                long lhs_rows = (lhs_is_matrix && lhs.num_samples() > 1) ? lhs.num_samples() : lhs.nr();
+                long lhs_cols = (lhs_is_matrix && lhs.k() > 1) ? lhs.k() : lhs.nc();
+                long rhs_rows = (rhs_is_matrix && rhs.num_samples() > 1) ? rhs.num_samples() : rhs.nr();
+                long rhs_cols = (rhs_is_matrix && rhs.k() > 1) ? rhs.k() : rhs.nc();
+                long dest_rows = (dest_is_matrix && dest.num_samples() > 1) ? dest.num_samples() : dest.nr();
+                long dest_cols = (dest_is_matrix && dest.k() > 1) ? dest.k() : dest.nc();

-            const size_t lhs_plane_size = lhs_rows * lhs_cols;
-            const size_t rhs_plane_size = rhs_rows * rhs_cols;
-            const size_t dest_plane_size = dest_rows * dest_cols;
+                const size_t lhs_plane_size = lhs_rows * lhs_cols;
+                const size_t rhs_plane_size = rhs_rows * rhs_cols;
+                const size_t dest_plane_size = dest_rows * dest_cols;

-            for (long b = 0; b < num_samples; ++b)
-            {
-                for (long c = 0; c < num_channels; ++c)
+                for (long b = 0; b < num_samples; ++b)
                 {
-                    auto lhs_slice = lhs_is_matrix ? alias_tensor(lhs_rows, lhs_cols)(lhs, 0) :
-                        alias_tensor(lhs_rows, lhs_cols)(lhs, (b * num_channels + c) * lhs_plane_size);
-                    auto rhs_slice = rhs_is_matrix ? alias_tensor(rhs_rows, rhs_cols)(rhs, 0) :
-                        alias_tensor(rhs_rows, rhs_cols)(rhs, (b * num_channels + c) * rhs_plane_size);
-                    auto dest_slice = dest_is_matrix ? alias_tensor(dest_rows, dest_cols)(dest, 0) :
-                        alias_tensor(dest_rows, dest_cols)(dest, (b * num_channels + c) * dest_plane_size);
-
-                    if (beta != 0)
+                    for (long c = 0; c < num_channels; ++c)
                     {
-                        if (trans_lhs && trans_rhs)
-                            dest_slice = alpha * trans(mat(lhs_slice)) * trans(mat(rhs_slice)) + beta * mat(dest_slice);
-                        else if (!trans_lhs && trans_rhs)
-                            dest_slice = alpha * mat(lhs_slice) * trans(mat(rhs_slice)) + beta * mat(dest_slice);
-                        else if (trans_lhs && !trans_rhs)
-                            dest_slice = alpha * trans(mat(lhs_slice)) * mat(rhs_slice) + beta * mat(dest_slice);
+                        auto lhs_slice = lhs_is_matrix ? alias_tensor(lhs_rows, lhs_cols)(lhs, 0) :
+                            alias_tensor(lhs_rows, lhs_cols)(lhs, (b * num_channels + c) * lhs_plane_size);
+                        auto rhs_slice = rhs_is_matrix ? alias_tensor(rhs_rows, rhs_cols)(rhs, 0) :
+                            alias_tensor(rhs_rows, rhs_cols)(rhs, (b * num_channels + c) * rhs_plane_size);
+                        auto dest_slice = dest_is_matrix ? alias_tensor(dest_rows, dest_cols)(dest, 0) :
+                            alias_tensor(dest_rows, dest_cols)(dest, (b * num_channels + c) * dest_plane_size);
+
+                        if (beta != 0)
+                        {
+                            if (trans_lhs && trans_rhs)
+                                dest_slice = alpha * trans(mat(lhs_slice)) * trans(mat(rhs_slice)) + beta * mat(dest_slice);
+                            else if (!trans_lhs && trans_rhs)
+                                dest_slice = alpha * mat(lhs_slice) * trans(mat(rhs_slice)) + beta * mat(dest_slice);
+                            else if (trans_lhs && !trans_rhs)
+                                dest_slice = alpha * trans(mat(lhs_slice)) * mat(rhs_slice) + beta * mat(dest_slice);
+                            else
+                                dest_slice = alpha * mat(lhs_slice) * mat(rhs_slice) + beta * mat(dest_slice);
+                        }
                         else
-                            dest_slice = alpha * mat(lhs_slice) * mat(rhs_slice) + beta * mat(dest_slice);
-                    }
-                    else
-                    {
-                        if (trans_lhs && trans_rhs)
-                            dest_slice = alpha * trans(mat(lhs_slice)) * trans(mat(rhs_slice));
-                        else if (!trans_lhs && trans_rhs)
-                            dest_slice = alpha * mat(lhs_slice) * trans(mat(rhs_slice));
-                        else if (trans_lhs && !trans_rhs)
-                            dest_slice = alpha * trans(mat(lhs_slice)) * mat(rhs_slice);
-                        else
-                            dest_slice = alpha * mat(lhs_slice) * mat(rhs_slice);
+                        {
+                            if (trans_lhs && trans_rhs)
+                                dest_slice = alpha * trans(mat(lhs_slice)) * trans(mat(rhs_slice));
+                            else if (!trans_lhs && trans_rhs)
+                                dest_slice = alpha * mat(lhs_slice) * trans(mat(rhs_slice));
+                            else if (trans_lhs && !trans_rhs)
+                                dest_slice = alpha * trans(mat(lhs_slice)) * mat(rhs_slice);
+                            else
+                                dest_slice = alpha * mat(lhs_slice) * mat(rhs_slice);
+                        }
                     }
                 }
             }
-        }
-#endif
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -313,10 +333,9 @@ namespace dlib { namespace tt
         unsigned long long seed
     )
 #ifdef DLIB_USE_CUDA
-    :rnd(seed){}
-#else
-    {rnd.set_seed(cast_to_string(seed)); }
+    :cuda_impl(seed)
 #endif
+    {cpu_impl.set_seed(cast_to_string(seed)); }

     void tensor_rand::
     fill_gaussian (
@@ -326,12 +345,15 @@ namespace dlib { namespace tt
         float stddev
     )
     {
         DLIB_CASSERT(data.size()%2 == 0);
-#ifdef DLIB_USE_CUDA
-        rnd.fill_gaussian(data, mean, stddev);
-#else
-        for (auto& x : data)
-            x = rnd.get_random_gaussian()*stddev + mean;
-#endif
+
+        IF_DLIB_USE_CUDA(
+            cuda_impl.fill_gaussian(data, mean, stddev);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            for (auto& x : data)
+                x = cpu_impl.get_random_gaussian()*stddev + mean;
+        )
     }

     void tensor_rand::
@@ -339,12 +361,14 @@ namespace dlib { namespace tt
         tensor& data
     )
     {
-#ifdef DLIB_USE_CUDA
-        rnd.fill_uniform(data);
-#else
-        for (auto& x : data)
-            x = rnd.get_random_float();
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda_impl.fill_uniform(data);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            for (auto& x : data)
+                x = cpu_impl.get_random_float();
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -364,11 +388,14 @@ namespace dlib { namespace tt
         DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) &&
                     (src1.num_samples()==1 || src1.num_samples()==MD) &&
                     (src2.num_samples()==1 || src2.num_samples()==MD) );
-#ifdef DLIB_USE_CUDA
-        cuda::multiply(add_to, dest, src1, src2);
-#else
-        cpu::multiply(add_to, dest, src1, src2);
-#endif
+
+        IF_DLIB_USE_CUDA(
+            cuda::multiply(add_to, dest, src1, src2);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::multiply(add_to, dest, src1, src2);
+        )
     }

     void scale_channels (
@@ -379,11 +406,13 @@ namespace dlib { namespace tt
         const tensor& scales
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::scale_channels(add_to, dest, src, scales);
-#else
-        cpu::scale_channels(add_to, dest, src, scales);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::scale_channels(add_to, dest, src, scales);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::scale_channels(add_to, dest, src, scales);
+        )
     }

     void multiply_conv (
@@ -393,11 +422,13 @@ namespace dlib { namespace tt
         const tensor& src2
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::multiply_conv(add_to, dest, src1, src2);
-#else
-        cpu::multiply_conv(add_to, dest, src1, src2);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::multiply_conv(add_to, dest, src1, src2);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::multiply_conv(add_to, dest, src1, src2);
+        )
     }

     void multiply_zero_padded (
@@ -407,11 +438,13 @@ namespace dlib { namespace tt
         const tensor& src2
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::multiply_zero_padded(add_to, dest, src1, src2);
-#else
-        cpu::multiply_zero_padded(add_to, dest, src1, src2);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::multiply_zero_padded(add_to, dest, src1, src2);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::multiply_zero_padded(add_to, dest, src1, src2);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -423,11 +456,13 @@ namespace dlib { namespace tt
         const float B
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform(dest,src,A,B);
-#else
-        cpu::affine_transform(dest,src,A,B);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform(dest,src,A,B);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform(dest,src,A,B);
+        )
     }

     void affine_transform(
@@ -436,11 +471,13 @@ namespace dlib { namespace tt
         const float A
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform(dest,src,A);
-#else
-        cpu::affine_transform(dest,src,A,0);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform(dest,src,A);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform(dest,src,A,0);
+        )
     }

     void affine_transform(
@@ -452,11 +489,13 @@ namespace dlib { namespace tt
         const float C
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform(dest,src1,src2,A,B,C);
-#else
-        cpu::affine_transform(dest,src1,src2,A,B,C);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform(dest,src1,src2,A,B,C);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform(dest,src1,src2,A,B,C);
+        )
     }

     void affine_transform(
@@ -467,11 +506,13 @@ namespace dlib { namespace tt
         const float B
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform(dest,src1,src2,A,B);
-#else
-        cpu::affine_transform(dest,src1,src2,A,B,0);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform(dest,src1,src2,A,B);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform(dest,src1,src2,A,B,0);
+        )
     }

     void affine_transform(
@@ -485,11 +526,13 @@ namespace dlib { namespace tt
         const float D
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform(dest,src1,src2,src3,A,B,C,D);
-#else
-        cpu::affine_transform(dest,src1,src2,src3,A,B,C,D);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform(dest,src1,src2,src3,A,B,C,D);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform(dest,src1,src2,src3,A,B,C,D);
+        )
     }

     void affine_transform_range(
@@ -504,11 +547,13 @@ namespace dlib { namespace tt
         const float C
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
-#else
-        cpu::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
+        )
     }

     void affine_transform(
@@ -522,11 +567,13 @@ namespace dlib { namespace tt
         float C
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform(rect, dest,src1,src2,src3,A,B,C);
-#else
-        cpu::affine_transform(rect, dest,src1,src2,src3,A,B,C);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform(rect, dest,src1,src2,src3,A,B,C);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform(rect, dest,src1,src2,src3,A,B,C);
+        )
     }

     void affine_transform(
@@ -539,11 +586,13 @@ namespace dlib { namespace tt
         const float C
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
-#else
-        cpu::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -555,11 +604,13 @@ namespace dlib { namespace tt
         const tensor& B
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform(dest,src,A,B);
-#else
-        cpu::affine_transform(dest,src,A,B);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform(dest,src,A,B);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform(dest,src,A,B);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -571,11 +622,13 @@ namespace dlib { namespace tt
         const tensor& B
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform_conv(dest,src,A,B);
-#else
-        cpu::affine_transform_conv(dest,src,A,B);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform_conv(dest,src,A,B);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform_conv(dest,src,A,B);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -595,13 +648,15 @@ namespace dlib { namespace tt
         const tensor& params_grad
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
-            momentum2, params, params_grad);
-#else
-        cpu::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
-            momentum2, params, params_grad);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
+                momentum2, params, params_grad);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
+                momentum2, params, params_grad);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -616,11 +671,13 @@ namespace dlib { namespace tt
         const tensor& running_variances
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
-#else
-        cpu::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
+        )
     }

     void batch_normalize (
@@ -636,11 +693,13 @@ namespace dlib { namespace tt
         const tensor& beta
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
-#else
-        cpu::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+        )
     }

     void batch_normalize_gradient (
@@ -656,11 +715,13 @@ namespace dlib { namespace tt
     )
     {

-#ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
-#else
-        cpu::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -675,11 +736,13 @@ namespace dlib { namespace tt
         const tensor& running_variances
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
-#else
-        cpu::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
+        )
     }

     void batch_normalize_conv (
@@ -695,11 +758,13 @@ namespace dlib { namespace tt
         const tensor& beta
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
-#else
-        cpu::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+        )
     }

     void batch_normalize_conv_gradient (
@@ -715,11 +780,13 @@ namespace dlib { namespace tt
     )
     {

-#ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
-#else
-        cpu::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -734,11 +801,13 @@ namespace dlib { namespace tt
         const tensor& beta
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::layer_normalize(eps, dest, means, vars, src, gamma, beta);
-#else
-        cpu::layer_normalize(eps, dest, means, vars, src, gamma, beta);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::layer_normalize(eps, dest, means, vars, src, gamma, beta);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::layer_normalize(eps, dest, means, vars, src, gamma, beta);
+        )
     }

     void layer_normalize_gradient (
@@ -755,11 +824,13 @@ namespace dlib { namespace tt
         resizable_tensor& dvars
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::layer_normalize_gradient(eps, gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad, dmeans, dvars);
-#else
-        cpu::layer_normalize_gradient(eps, gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad, dmeans, dvars);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::layer_normalize_gradient(eps, gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad, dmeans, dvars);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::layer_normalize_gradient(eps, gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad, dmeans, dvars);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -772,11 +843,13 @@ namespace dlib { namespace tt
         const tensor& gamma
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::rms_normalize(eps, dest, scale, src, gamma);
-#else
-        cpu::rms_normalize(eps, dest, scale, src, gamma);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::rms_normalize(eps, dest, scale, src, gamma);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::rms_normalize(eps, dest, scale, src, gamma);
+        )
     }

     void rms_normalize_gradient(
@@ -789,11 +862,13 @@ namespace dlib { namespace tt
         resizable_tensor& dscale
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::rms_normalize_gradient(gradient_input, scale, src, gamma, src_grad, gamma_grad, dscale);
-#else
-        cpu::rms_normalize_gradient(gradient_input, scale, src, gamma, src_grad, gamma_grad, dscale);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::rms_normalize_gradient(gradient_input, scale, src, gamma, src_grad, gamma_grad, dscale);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::rms_normalize_gradient(gradient_input, scale, src, gamma, src_grad, gamma_grad, dscale);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -803,11 +878,13 @@ namespace dlib { namespace tt
         float thresh
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::threshold(data,thresh);
-#else
-        cpu::threshold(data,thresh);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::threshold(data,thresh);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::threshold(data,thresh);
+        )
     }

     void dot (
@@ -817,11 +894,13 @@ namespace dlib { namespace tt
         size_t idx
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::dot(a,b,result,idx);
-#else
-        cpu::dot(a,b,result,idx);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::dot(a,b,result,idx);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::dot(a,b,result,idx);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -833,11 +912,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::add(beta,dest,alpha,src);
-#else
-        cpu::add(beta,dest,alpha,src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::add(beta,dest,alpha,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::add(beta,dest,alpha,src);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -848,11 +929,13 @@ namespace dlib { namespace tt
         const tensor& src2
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::add(dest, src1, src2);
-#else
-        cpu::add(dest, src1, src2);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::add(dest, src1, src2);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::add(dest, src1, src2);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -862,11 +945,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::assign_conv_bias_gradient(grad,gradient_input);
-#else
-        cpu::assign_conv_bias_gradient(grad,gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::assign_conv_bias_gradient(grad,gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::assign_conv_bias_gradient(grad,gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -876,11 +961,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::assign_bias_gradient(grad,gradient_input);
-#else
-        cpu::assign_bias_gradient(grad,gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::assign_bias_gradient(grad,gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::assign_bias_gradient(grad,gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -891,11 +978,13 @@ namespace dlib { namespace tt
         operation_mode mode
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::softmax(dest, src, mode);
-#else
-        cpu::softmax(dest, src, mode);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::softmax(dest, src, mode);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::softmax(dest, src, mode);
+        )
     }

     void softmax_gradient(
@@ -905,11 +994,13 @@ namespace dlib { namespace tt
         operation_mode mode
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::softmax_gradient(grad, dest, gradient_input, mode);
-#else
-        cpu::softmax_gradient(grad, dest, gradient_input, mode);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::softmax_gradient(grad, dest, gradient_input, mode);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::softmax_gradient(grad, dest, gradient_input, mode);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -919,11 +1010,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::softmax_all(dest,src);
-#else
-        cpu::softmax_all(dest,src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::softmax_all(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::softmax_all(dest,src);
+        )
     }

     void softmax_all_gradient (
@@ -932,11 +1025,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::softmax_all_gradient(grad, dest, gradient_input);
-#else
-        cpu::softmax_all_gradient(grad, dest, gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::softmax_all_gradient(grad, dest, gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::softmax_all_gradient(grad, dest, gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -946,11 +1041,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::sigmoid(dest,src);
-#else
-        cpu::sigmoid(dest,src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::sigmoid(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::sigmoid(dest,src);
+        )
     }

     void sigmoid_gradient (
@@ -959,11 +1056,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::sigmoid_gradient(grad, dest, gradient_input);
-#else
-        cpu::sigmoid_gradient(grad, dest, gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::sigmoid_gradient(grad, dest, gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::sigmoid_gradient(grad, dest, gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -973,11 +1072,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::mish(dest,src);
-#else
-        cpu::mish(dest,src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::mish(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::mish(dest,src);
+        )
     }

     void mish_gradient (
@@ -986,11 +1087,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::mish_gradient(grad, src, gradient_input);
-#else
-        cpu::mish_gradient(grad, src, gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::mish_gradient(grad, src, gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::mish_gradient(grad, src, gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1000,11 +1103,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::relu(dest,src);
-#else
-        cpu::relu(dest,src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::relu(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::relu(dest,src);
+        )
     }

     void relu_gradient (
@@ -1013,11 +1118,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::relu_gradient(grad, dest, gradient_input);
-#else
-        cpu::relu_gradient(grad, dest, gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::relu_gradient(grad, dest, gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::relu_gradient(grad, dest, gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1028,11 +1135,13 @@ namespace dlib { namespace tt
         const tensor& param
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::prelu(dest, src, param);
-#else
-        cpu::prelu(dest, src, param);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::prelu(dest, src, param);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::prelu(dest, src, param);
+        )
     }

     void prelu_gradient (
@@ -1043,11 +1152,13 @@ namespace dlib { namespace tt
         tensor& params_grad
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::prelu_gradient(grad, src, gradient_input, param, params_grad);
-#else
-        cpu::prelu_gradient(grad, src, gradient_input, param, params_grad);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::prelu_gradient(grad, src, gradient_input, param, params_grad);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::prelu_gradient(grad, src, gradient_input, param, params_grad);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1058,11 +1169,13 @@ namespace dlib { namespace tt
         const float alpha
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::leaky_relu(dest, src, alpha);
-#else
-        cpu::leaky_relu(dest, src, alpha);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::leaky_relu(dest, src, alpha);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::leaky_relu(dest, src, alpha);
+        )
     }

     void leaky_relu_gradient (
@@ -1072,11 +1185,13 @@ namespace dlib { namespace tt
         const float alpha
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::leaky_relu_gradient(grad, dest, gradient_input, alpha);
-#else
-        cpu::leaky_relu_gradient(grad, dest, gradient_input, alpha);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::leaky_relu_gradient(grad, dest, gradient_input, alpha);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::leaky_relu_gradient(grad, dest, gradient_input, alpha);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1086,11 +1201,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::tanh(dest,src);
-#else
-        cpu::tanh(dest,src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::tanh(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::tanh(dest,src);
+        )
     }

     void tanh_gradient (
@@ -1099,11 +1216,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::tanh_gradient(grad, dest, gradient_input);
-#else
-        cpu::tanh_gradient(grad, dest, gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::tanh_gradient(grad, dest, gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::tanh_gradient(grad, dest, gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1114,11 +1233,13 @@ namespace dlib { namespace tt
         const float ceiling
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::clipped_relu(dest, src, ceiling);
-#else
-        cpu::clipped_relu(dest, src, ceiling);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::clipped_relu(dest, src, ceiling);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::clipped_relu(dest, src, ceiling);
+        )
     }

     void clipped_relu_gradient (
@@ -1128,11 +1249,13 @@ namespace dlib { namespace tt
         const float ceiling
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::clipped_relu_gradient(grad, dest, gradient_input, ceiling);
-#else
-        cpu::clipped_relu_gradient(grad, dest, gradient_input, ceiling);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::clipped_relu_gradient(grad, dest, gradient_input, ceiling);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::clipped_relu_gradient(grad, dest, gradient_input, ceiling);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1143,11 +1266,13 @@ namespace dlib { namespace tt
         const float alpha
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::elu(dest, src, alpha);
-#else
-        cpu::elu(dest, src, alpha);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::elu(dest, src, alpha);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::elu(dest, src, alpha);
+        )
     }

     void elu_gradient (
@@ -1157,11 +1282,13 @@ namespace dlib { namespace tt
         const float alpha
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::elu_gradient(grad, dest, gradient_input, alpha);
-#else
-        cpu::elu_gradient(grad, dest, gradient_input, alpha);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::elu_gradient(grad, dest, gradient_input, alpha);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::elu_gradient(grad, dest, gradient_input, alpha);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1171,11 +1298,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::gelu(dest,src);
-#else
-        cpu::gelu(dest,src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::gelu(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::gelu(dest,src);
+        )
     }

     void gelu_gradient (
@@ -1184,11 +1313,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::gelu_gradient(grad, src, gradient_input);
-#else
-        cpu::gelu_gradient(grad, src, gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::gelu_gradient(grad, src, gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::gelu_gradient(grad, src, gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1200,11 +1331,14 @@ namespace dlib { namespace tt
     )
     {
         DLIB_CASSERT(beta > 0);
-#ifdef DLIB_USE_CUDA
-        cuda::smelu(dest, src, beta);
-#else
-        cpu::smelu(dest, src, beta);
-#endif
+
+        IF_DLIB_USE_CUDA(
+            cuda::smelu(dest, src, beta);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::smelu(dest, src, beta);
+        )
     }

     void smelu_gradient (
@@ -1215,11 +1349,14 @@ namespace dlib { namespace tt
     )
     {
         DLIB_CASSERT(beta > 0);
-#ifdef DLIB_USE_CUDA
-        cuda::smelu_gradient(grad, dest, gradient_input, beta);
-#else
-        cpu::smelu_gradient(grad, dest, gradient_input, beta);
-#endif
+
+        IF_DLIB_USE_CUDA(
+            cuda::smelu_gradient(grad, dest, gradient_input, beta);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::smelu_gradient(grad, dest, gradient_input, beta);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1229,11 +1366,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::silu(dest,src);
-#else
-        cpu::silu(dest,src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::silu(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::silu(dest,src);
+        )
     }

     void silu_gradient (
@@ -1242,11 +1381,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::silu_gradient(grad, src, gradient_input);
-#else
-        cpu::silu_gradient(grad, src, gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::silu_gradient(grad, src, gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::silu_gradient(grad, src, gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1260,11 +1401,13 @@ namespace dlib { namespace tt
         long src_channel_stride
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::resize_bilinear(dest,dest_row_stride,dest_channel_stride, src,src_row_stride,src_channel_stride);
-#else
-        cpu::resize_bilinear(dest,dest_row_stride,dest_channel_stride, src,src_row_stride,src_channel_stride);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::resize_bilinear(dest,dest_row_stride,dest_channel_stride, src,src_row_stride,src_channel_stride);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::resize_bilinear(dest,dest_row_stride,dest_channel_stride, src,src_row_stride,src_channel_stride);
+        )
     }

     void resize_bilinear_gradient (
@@ -1276,11 +1419,13 @@ namespace dlib { namespace tt
         long gradient_input_channel_stride
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::resize_bilinear_gradient(grad,grad_row_stride,grad_channel_stride, gradient_input,gradient_input_row_stride,gradient_input_channel_stride);
-#else
-        cpu::resize_bilinear_gradient(grad,grad_row_stride,grad_channel_stride, gradient_input,gradient_input_row_stride,gradient_input_channel_stride);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::resize_bilinear_gradient(grad,grad_row_stride,grad_channel_stride, gradient_input,gradient_input_row_stride,gradient_input_channel_stride);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::resize_bilinear_gradient(grad,grad_row_stride,grad_channel_stride, gradient_input,gradient_input_row_stride,gradient_input_channel_stride);
+        )
     }

// ------------------------------------------------------------------------------------
@@ -1293,11 +1438,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::reorg(add_to, dest, row_stride, col_stride, src);
-#else
-        cpu::reorg(add_to, dest, row_stride, col_stride, src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::reorg(add_to, dest, row_stride, col_stride, src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::reorg(add_to, dest, row_stride, col_stride, src);
+        )
     }

     void reorg_gradient (
@@ -1308,11 +1455,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::reorg_gradient(add_to, grad, row_stride, col_stride, gradient_input);
-#else
-        cpu::reorg_gradient(add_to, grad, row_stride, col_stride, gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::reorg_gradient(add_to, grad, row_stride, col_stride, gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::reorg_gradient(add_to, grad, row_stride, col_stride, gradient_input);
+        )
     }

// ------------------------------------------------------------------------------------
@@ -1326,11 +1475,13 @@ namespace dlib { namespace tt
         size_t count_k
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
-#else
-        cpu::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1344,11 +1495,13 @@ namespace dlib { namespace tt
         size_t k, size_t nr, size_t nc
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::copy_tensor(add_to, dest, dk, dnr, dnc , src, sk, snr, snc, k, nr, nc);
-#else
-        cpu::copy_tensor(add_to, dest, dk, dnr, dnc, src, sk, snr, snc, k, nr, nc);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::copy_tensor(add_to, dest, dk, dnr, dnc , src, sk, snr, snc, k, nr, nc);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::copy_tensor(add_to, dest, dk, dnr, dnc, src, sk, snr, snc, k, nr, nc);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1359,11 +1512,13 @@ namespace dlib { namespace tt
         resizable_tensor& out
     )
     {
-#ifdef DLIB_USE_CUDA
-        finv(m,out);
-#else
-        out = dlib::inv(mat(m));
-#endif
+        IF_DLIB_USE_CUDA(
+            finv(m,out);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            out = dlib::inv(mat(m));
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1374,11 +1529,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::transpose(add_to, dest, src);
-#else
-        cpu::transpose(add_to, dest, src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::transpose(add_to, dest, src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::transpose(add_to, dest, src);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1389,11 +1546,13 @@ namespace dlib { namespace tt
         const tensor& embs
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::embeddings(dest, src, embs);
-#else
-        cpu::embeddings(dest, src, embs);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::embeddings(dest, src, embs);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::embeddings(dest, src, embs);
+        )
     }

     void embeddings_gradient(
@@ -1405,11 +1564,13 @@ namespace dlib { namespace tt
         bool scale
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::embeddings_gradient(prev, gradient_input, grads, freqs, learning_rate, scale);
-#else
-        cpu::embeddings_gradient(prev, gradient_input, grads, freqs, learning_rate, scale);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::embeddings_gradient(prev, gradient_input, grads, freqs, learning_rate, scale);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::embeddings_gradient(prev, gradient_input, grads, freqs, learning_rate, scale);
+        )
     }

// ----------------------------------------------------------------------------------------
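A note on the header changes that follow: objects with per-instance state (tensor_rand, and tensor_conv/pooling below) cannot be switched by the macros alone, so the single #ifdef-selected impl/rnd member is replaced by an always-present CPU member plus a CUDA member that only exists in CUDA builds. Schematically (not the literal dlib code):

    #ifdef DLIB_USE_CUDA
        cuda::tensor_conv cuda_impl;   // compiled only in CUDA builds
    #endif
        cpu::tensor_conv  cpu_impl;    // always compiled, so the runtime flag can fall back to it

The cost is that CUDA builds now carry both objects in every instance; the CPU-side state of these classes is small, which presumably makes the trade-off acceptable.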
diff --git a/dlib/cuda/tensor_tools.h b/dlib/cuda/tensor_tools.h
index 18a5564f98..e551555260 100644
--- a/dlib/cuda/tensor_tools.h
+++ b/dlib/cuda/tensor_tools.h
@@ -292,10 +292,9 @@ namespace dlib { namespace tt
     !*/

 #ifdef DLIB_USE_CUDA
-        cuda::curand_generator rnd;
-#else
-        dlib::rand rnd;
+        cuda::curand_generator cuda_impl;
 #endif
+        dlib::rand cpu_impl;
     };

// ----------------------------------------------------------------------------------------
@@ -1074,14 +1073,32 @@ namespace dlib { namespace tt
         tensor_conv() {}

         void clear(
-        ) { impl.clear(); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl.clear();
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl.clear();
+            )
+        }

         void operator() (
             const bool add_to_output,
             tensor& output,
             const tensor& data,
             const tensor& filters
-        ) { impl(add_to_output,output,data,filters); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl(add_to_output,output,data,filters);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl(add_to_output,output,data,filters);
+            )
+        }
         /*!
             requires
                 - setup() has been called.  Specifically, setup() has been called like this:
@@ -1107,7 +1124,16 @@ namespace dlib { namespace tt
             resizable_tensor& output,
             const tensor& data,
             const tensor& filters
-        ) { impl(add_to_output,output,data,filters); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl(add_to_output,output,data,filters);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl(add_to_output,output,data,filters);
+            )
+        }
         /*!
             requires
                 - setup() has been called.  Specifically, setup() has been called like this:
@@ -1135,7 +1161,16 @@ namespace dlib { namespace tt
             const tensor& filters,
             const tensor& biases,
             bool use_relu
-        ) { impl(add_to_output,output,data,filters,biases,use_relu); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl(add_to_output,output,data,filters,biases,use_relu);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl(add_to_output,output,data,filters,biases,use_relu);
+            )
+        }
         /*!
             requires
                 - setup() has been called.  Specifically, setup() has been called like this:
@@ -1167,7 +1202,16 @@ namespace dlib { namespace tt
             const tensor& filters,
             const tensor& biases,
             bool use_relu
-        ) { impl(add_to_output,output,data,filters,biases,use_relu); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl(add_to_output,output,data,filters,biases,use_relu);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl(add_to_output,output,data,filters,biases,use_relu);
+            )
+        }
         /*!
             requires
                 - setup() has been called.  Specifically, setup() has been called like this:
@@ -1195,7 +1239,16 @@ namespace dlib { namespace tt
             const tensor& gradient_input,
             const tensor& filters,
             tensor& data_gradient
-        ) { impl.get_gradient_for_data(add_to_output,gradient_input,filters,data_gradient); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl.get_gradient_for_data(add_to_output,gradient_input,filters,data_gradient);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl.get_gradient_for_data(add_to_output,gradient_input,filters,data_gradient);
+            )
+        }
         /*!
             requires
                 - One of the following must be true:
@@ -1230,7 +1283,16 @@ namespace dlib { namespace tt
             const tensor& gradient_input,
             const tensor& data,
             tensor& filters_gradient
-        ) { impl.get_gradient_for_filters(add_to_output,gradient_input,data,filters_gradient); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl.get_gradient_for_filters(add_to_output,gradient_input,data,filters_gradient);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl.get_gradient_for_filters(add_to_output,gradient_input,data,filters_gradient);
+            )
+        }
         /*!
             requires
                 - One of the following must be true:
@@ -1268,7 +1330,16 @@ namespace dlib { namespace tt
             int stride_x,
             int padding_y,
             int padding_x
-        ) {impl.setup(data,filters,stride_y,stride_x,padding_y,padding_x); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl.setup(data,filters,stride_y,stride_x,padding_y,padding_x);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl.setup(data,filters,stride_y,stride_x,padding_y,padding_x);
+            )
+        }
         /*!
             requires
                 - filters.k() == data.k()
@@ -1292,11 +1363,9 @@ namespace dlib { namespace tt

     private:
 #ifdef DLIB_USE_CUDA
-        cuda::tensor_conv impl;
-#else
-        cpu::tensor_conv impl;
+        cuda::tensor_conv cuda_impl;
 #endif
-
+        cpu::tensor_conv cpu_impl;
     };

// ----------------------------------------------------------------------------------------
@@ -1317,7 +1386,16 @@ namespace dlib { namespace tt
         ) = default;

         void clear(
-        ) { impl.clear(); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl.clear();
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl.clear();
+            )
+        }

         void setup_max_pooling(
             int window_height,
@@ -1326,7 +1404,16 @@ namespace dlib { namespace tt
             int stride_x,
             int padding_y,
             int padding_x
-        ) { impl.setup_max_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl.setup_max_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl.setup_max_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x);
+            )
+        }
         /*!
             requires
                 - window_height > 0
@@ -1347,7 +1434,16 @@ namespace dlib { namespace tt
             int stride_x,
             int padding_y,
             int padding_x
-        ) { impl.setup_avg_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl.setup_avg_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl.setup_avg_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x);
+            )
+        }
         /*!
             requires
                 - window_height > 0
@@ -1362,12 +1458,30 @@ namespace dlib { namespace tt
         !*/

         bool does_max_pooling(
-        ) const { return impl.does_max_pooling(); }
+        ) const
+        {
+            IF_DLIB_USE_CUDA(
+                return cuda_impl.does_max_pooling();
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                return cpu_impl.does_max_pooling();
+            )
+        }

         void operator() (
             resizable_tensor& dest,
             const tensor& src
-        ) { impl(dest, src); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl(dest, src);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl(dest, src);
+            )
+        }
         /*!
             requires
                 - is_same_object(dest,src) == false
@@ -1395,7 +1509,16 @@ namespace dlib { namespace tt
             const tensor& dest,
             const tensor& src,
             tensor& grad
-        ) { impl.get_gradient(gradient_input, dest, src, grad); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl.get_gradient(gradient_input, dest, src, grad);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl.get_gradient(gradient_input, dest, src, grad);
+            )
+        }
         /*!
             requires
                 - have_same_dimensions(gradient_input,dest) == true
@@ -1413,10 +1536,9 @@ namespace dlib { namespace tt

     private:
 #ifdef DLIB_USE_CUDA
-        cuda::pooling impl;
-#else
-        cpu::pooling impl;
+        cuda::pooling cuda_impl;
 #endif
+        cpu::pooling cpu_impl;
     };

// ----------------------------------------------------------------------------------------
diff --git a/dlib/dnn/core_abstract.h b/dlib/dnn/core_abstract.h
index 491183a685..9d61c716b3 100644
--- a/dlib/dnn/core_abstract.h
+++ b/dlib/dnn/core_abstract.h
@@ -196,6 +196,28 @@ namespace dlib
             - #dnn_prefer_fastest_algorithms() == false
     !*/

+    bool use_cuda(
+    );
+    /*!
+        ensures
+            - returns true if dlib should use the CUDA implementation of the deep neural
+              network tools, and false otherwise.
+            - On program startup this function returns true if DLIB_USE_CUDA is defined and
+              there is an available GPU device to use.
+            - This function always returns false if DLIB_USE_CUDA is not defined.
+            - This setting is a thread local variable.  That is, each thread has its own
+              value for use_cuda().  This means that one thread may use CUDA while another
+              thread does not, depending on each thread's setting of use_cuda().
+    !*/
+
+    void set_use_cuda(
+        bool flag
+    );
+    /*!
+        ensures
+            - #use_cuda() == flag for the calling thread.
+    !*/
+
// ----------------------------------------------------------------------------------------

     template <
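Because the flag is thread local, one process can mix backends; a sketch of the intended pattern (illustrative only, not part of the patch):

    #include <dlib/dnn.h>
    #include <thread>

    void mixed_backends()
    {
        std::thread cpu_worker([]{
            dlib::cuda::set_use_cuda(false);   // affects only this thread
            // ... run this thread's networks on the CPU code paths ...
        });

        std::thread gpu_worker([]{
            // untouched: use_cuda() here still starts out as cuda::is_available()
            // ... run this thread's networks on the CUDA code paths ...
        });

        cpu_worker.join();
        gpu_worker.join();
    }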
diff --git a/dlib/dnn/loss.h b/dlib/dnn/loss.h index 36b37a2956..6a8f257bec 100644 --- a/dlib/dnn/loss.h +++ b/dlib/dnn/loss.h @@ -2823,11 +2823,13 @@ namespace dlib } double loss; -#ifdef DLIB_USE_CUDA - cuda_compute(truth, output_tensor, grad, loss); -#else - cpu_compute(truth, output_tensor, grad, loss); -#endif + IF_DLIB_USE_CUDA( + cuda_compute(truth, output_tensor, grad, loss); + ) + + IF_DLIB_NOT_USE_CUDA( + cpu_compute(truth, output_tensor, grad, loss); + ) return loss; } @@ -2859,9 +2861,8 @@ namespace dlib #ifdef DLIB_USE_CUDA cuda::compute_loss_binary_log_per_pixel cuda_compute; -#else - cpu::compute_loss_binary_log_per_pixel cpu_compute; #endif + cpu::compute_loss_binary_log_per_pixel cpu_compute; }; template @@ -2968,11 +2969,13 @@ namespace dlib double loss; -#ifdef DLIB_USE_CUDA - cuda_compute(truth, output_tensor, grad, loss); -#else - cpu_compute(truth, output_tensor, grad, loss); -#endif + IF_DLIB_USE_CUDA( + cuda_compute(truth, output_tensor, grad, loss); + ) + + IF_DLIB_NOT_USE_CUDA( + cpu_compute(truth, output_tensor, grad, loss); + ) return loss; } @@ -3004,9 +3007,8 @@ namespace dlib #ifdef DLIB_USE_CUDA cuda::compute_loss_multiclass_log_per_pixel cuda_compute; -#else - cpu::compute_loss_multiclass_log_per_pixel cpu_compute; #endif + cpu::compute_loss_multiclass_log_per_pixel cpu_compute; }; template @@ -3068,11 +3070,13 @@ namespace dlib } double loss; -#ifdef DLIB_USE_CUDA - cuda_compute(truth, output_tensor, grad, loss); -#else - cpu_compute(truth, output_tensor, grad, loss); -#endif + IF_DLIB_USE_CUDA( + cuda_compute(truth, output_tensor, grad, loss); + ) + + IF_DLIB_NOT_USE_CUDA( + cpu_compute(truth, output_tensor, grad, loss); + ) return loss; } @@ -3104,9 +3108,8 @@ namespace dlib #ifdef DLIB_USE_CUDA cuda::compute_loss_multiclass_log_per_pixel_weighted cuda_compute; -#else - cpu::compute_loss_multiclass_log_per_pixel_weighted cpu_compute; #endif + cpu::compute_loss_multiclass_log_per_pixel_weighted cpu_compute; }; @@ -3319,11 +3322,13 @@ namespace dlib } } double loss; -#ifdef DLIB_USE_CUDA - cuda_compute(truth, output_tensor, grad, loss); -#else - cpu_compute(truth, output_tensor, grad, loss); -#endif + IF_DLIB_USE_CUDA( + cuda_compute(truth, output_tensor, grad, loss); + ) + + IF_DLIB_NOT_USE_CUDA( + cpu_compute(truth, output_tensor, grad, loss); + ) return loss; } @@ -3355,9 +3360,8 @@ namespace dlib #ifdef DLIB_USE_CUDA cuda::compute_loss_mean_squared_per_channel_and_pixel cuda_compute; -#else - cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute; #endif + cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute; }; template diff --git a/dlib/test/dnn.cpp b/dlib/test/dnn.cpp index c564e277e1..09169e6dd2 100644 --- a/dlib/test/dnn.cpp +++ b/dlib/test/dnn.cpp @@ -154,16 +154,16 @@ namespace dlog << LINFO << "src error: " << grad_error; DLIB_TEST(grad_error < 0.001); -#ifdef DLIB_USE_CUDA - resizable_tensor src1 = src; - resizable_tensor src2 = src; - resizable_tensor dest1, dest2; - dest1.copy_size(src); - dest2.copy_size(src); - cuda::softmax_all(dest1, src1); - cpu::softmax_all(dest2, src2); - DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2))) < 1e-5, max(abs(mat(dest1)-mat(dest2)))); -#endif + 
IF_DLIB_USE_CUDA( + resizable_tensor src1 = src; + resizable_tensor src2 = src; + resizable_tensor dest1, dest2; + dest1.copy_size(src); + dest2.copy_size(src); + cuda::softmax_all(dest1, src1); + cpu::softmax_all(dest2, src2); + DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2))) < 1e-5, max(abs(mat(dest1)-mat(dest2)))); + ) } void test_softmaxm() @@ -234,15 +234,16 @@ namespace cpu::softmax(output_tensor, input_tensor, operation_mode::PLANE_WISE); cpu::softmax_gradient(cpu_grad, output_tensor, gradient_input, operation_mode::PLANE_WISE); DLIB_TEST(max(abs(mat(output_tensor) - mat(expected_output))) < 1e-5); -#ifdef DLIB_USE_CUDA - resizable_tensor cuda_grad; - cuda_grad.copy_size(input_tensor); - cuda_grad = 0; - cuda::softmax(output_tensor, input_tensor, operation_mode::PLANE_WISE); - cpu::softmax_gradient(cuda_grad, output_tensor, gradient_input, operation_mode::PLANE_WISE); - DLIB_TEST(max(abs(mat(output_tensor) - mat(expected_output))) < 1e-5); - DLIB_TEST(max(abs(mat(cuda_grad) - mat(cpu_grad))) < 1e-5); -#endif + + IF_DLIB_USE_CUDA( + resizable_tensor cuda_grad; + cuda_grad.copy_size(input_tensor); + cuda_grad = 0; + cuda::softmax(output_tensor, input_tensor, operation_mode::PLANE_WISE); + cpu::softmax_gradient(cuda_grad, output_tensor, gradient_input, operation_mode::PLANE_WISE); + DLIB_TEST(max(abs(mat(output_tensor) - mat(expected_output))) < 1e-5); + DLIB_TEST(max(abs(mat(cuda_grad) - mat(cpu_grad))) < 1e-5); + ) } void test_softmax_all() @@ -284,222 +285,222 @@ namespace dlog << LINFO << "src error: " << grad_error; DLIB_TEST(grad_error < 0.001); -#ifdef DLIB_USE_CUDA - resizable_tensor src1 = src; - resizable_tensor src2 = src; - resizable_tensor dest1, dest2; - dest1.copy_size(src); - dest2.copy_size(src); - cuda::softmax_all(dest1, src1); - cpu::softmax_all(dest2, src2); - DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2))) < 1e-5, max(abs(mat(dest1)-mat(dest2)))); -#endif + IF_DLIB_USE_CUDA( + resizable_tensor src1 = src; + resizable_tensor src2 = src; + resizable_tensor dest1, dest2; + dest1.copy_size(src); + dest2.copy_size(src); + cuda::softmax_all(dest1, src1); + cpu::softmax_all(dest2, src2); + DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2))) < 1e-5, max(abs(mat(dest1)-mat(dest2)))); + ) } void test_mish() { -#ifdef DLIB_USE_CUDA - // make sure that cuda::mish and cpu::mish return the same results - using namespace dlib::tt; - print_spinner(); - const long n = 4; - const long k = 5; - const long nr = 3; - const long nc = 3; - resizable_tensor src(n,k,nr,nc); - tt::tensor_rand rnd; - rnd.fill_gaussian(src); + IF_DLIB_USE_CUDA( + // make sure that cuda::mish and cpu::mish return the same results + using namespace dlib::tt; + print_spinner(); + const long n = 4; + const long k = 5; + const long nr = 3; + const long nc = 3; + resizable_tensor src(n,k,nr,nc); + tt::tensor_rand rnd; + rnd.fill_gaussian(src); - resizable_tensor dest1, dest2; - dest1.copy_size(src); - dest2.copy_size(src); - // initialize to different values in order to make sure the output is actually changed - dest1 = 1; - dest2 = 2; - cuda::mish(dest1, src); - cpu::mish(dest2, src); - DLIB_TEST_MSG(max(abs(mat(dest1) - mat(dest2))) < 1e-6, max(abs(mat(dest1) - mat(dest2)))); -#endif // DLIB_USE_CUDA + resizable_tensor dest1, dest2; + dest1.copy_size(src); + dest2.copy_size(src); + // initialize to different values in order to make sure the output is actually changed + dest1 = 1; + dest2 = 2; + cuda::mish(dest1, src); + cpu::mish(dest2, src); + DLIB_TEST_MSG(max(abs(mat(dest1) - mat(dest2))) < 1e-6, max(abs(mat(dest1) - 
mat(dest2)))); + ) } void test_leaky_relu() { -#ifdef DLIB_USE_CUDA - using namespace dlib::tt; - print_spinner(); - const long n = 4; - const long k = 5; - const long nr = 3; - const long nc = 3; - const float alpha = 0.01; - resizable_tensor src(n, k, nr, nc); - tt::tensor_rand rnd; - rnd.fill_gaussian(src); - resizable_tensor dest_cuda, dest_cpu; - dest_cuda.copy_size(src); - dest_cpu.copy_size(src); - // initialize to different values in order to make sure the output is actually changed - dest_cuda = 1; - dest_cpu = 2; - cuda::leaky_relu(dest_cuda, src, alpha); - cpu::leaky_relu(dest_cpu, src, alpha); - - DLIB_TEST_MSG(max(abs(mat(dest_cuda) - mat(dest_cpu))) < 1e-7, max(abs(mat(dest_cuda) - mat(dest_cpu)))); -#endif // DLIB_USE_CUDA + IF_DLIB_USE_CUDA( + using namespace dlib::tt; + print_spinner(); + const long n = 4; + const long k = 5; + const long nr = 3; + const long nc = 3; + const float alpha = 0.01; + resizable_tensor src(n, k, nr, nc); + tt::tensor_rand rnd; + rnd.fill_gaussian(src); + resizable_tensor dest_cuda, dest_cpu; + dest_cuda.copy_size(src); + dest_cpu.copy_size(src); + // initialize to different values in order to make sure the output is actually changed + dest_cuda = 1; + dest_cpu = 2; + cuda::leaky_relu(dest_cuda, src, alpha); + cpu::leaky_relu(dest_cpu, src, alpha); + + DLIB_TEST_MSG(max(abs(mat(dest_cuda) - mat(dest_cpu))) < 1e-7, max(abs(mat(dest_cuda) - mat(dest_cpu)))); + ) } void test_clipped_relu() { -#ifdef DLIB_USE_CUDA - using namespace dlib::tt; - print_spinner(); - const long n = 4; - const long k = 5; - const long nr = 3; - const long nc = 3; - const float ceiling = 6.0f; - resizable_tensor src(n, k, nr, nc); - tt::tensor_rand rnd; - rnd.fill_gaussian(src, 0, 3); - resizable_tensor dest_cuda, dest_cpu; - dest_cuda.copy_size(src); - dest_cpu.copy_size(src); - // initialize to different values in order to make sure the output is actually changed - dest_cuda = 1; - dest_cpu = 2; - cuda::clipped_relu(dest_cuda, src, ceiling); - cpu::clipped_relu(dest_cpu, src, ceiling); - auto error = max(abs(mat(dest_cuda) - mat(dest_cpu))); - DLIB_TEST_MSG(error < 1e-7, "error: " << error); - - // test gradients - resizable_tensor grad_cuda, grad_cpu, grad_input; - grad_cuda.copy_size(src); - grad_cpu.copy_size(src); - grad_input.copy_size(src); - rnd.fill_uniform(grad_input); - grad_cuda = 0; - grad_cpu = 0; - cuda::clipped_relu_gradient(grad_cuda, dest_cuda, grad_input, ceiling); - cpu::clipped_relu_gradient(grad_cpu, dest_cpu, grad_input, ceiling); - error = max(abs(mat(grad_cuda) - mat(grad_cpu))); - DLIB_TEST_MSG(error < 1e-7, "error: " << error); -#endif // DLIB_USE_CUDA + IF_DLIB_USE_CUDA( + using namespace dlib::tt; + print_spinner(); + const long n = 4; + const long k = 5; + const long nr = 3; + const long nc = 3; + const float ceiling = 6.0f; + resizable_tensor src(n, k, nr, nc); + tt::tensor_rand rnd; + rnd.fill_gaussian(src, 0, 3); + resizable_tensor dest_cuda, dest_cpu; + dest_cuda.copy_size(src); + dest_cpu.copy_size(src); + // initialize to different values in order to make sure the output is actually changed + dest_cuda = 1; + dest_cpu = 2; + cuda::clipped_relu(dest_cuda, src, ceiling); + cpu::clipped_relu(dest_cpu, src, ceiling); + auto error = max(abs(mat(dest_cuda) - mat(dest_cpu))); + DLIB_TEST_MSG(error < 1e-7, "error: " << error); + + // test gradients + resizable_tensor grad_cuda, grad_cpu, grad_input; + grad_cuda.copy_size(src); + grad_cpu.copy_size(src); + grad_input.copy_size(src); + rnd.fill_uniform(grad_input); + grad_cuda = 0; + grad_cpu = 0; 
+ cuda::clipped_relu_gradient(grad_cuda, dest_cuda, grad_input, ceiling); + cpu::clipped_relu_gradient(grad_cpu, dest_cpu, grad_input, ceiling); + error = max(abs(mat(grad_cuda) - mat(grad_cpu))); + DLIB_TEST_MSG(error < 1e-7, "error: " << error); + ) } void test_elu() { -#ifdef DLIB_USE_CUDA - using namespace dlib::tt; - print_spinner(); - const long n = 4; - const long k = 5; - const long nr = 3; - const long nc = 3; - const float alpha = 1.0f; - resizable_tensor src(n, k, nr, nc); - tt::tensor_rand rnd; - rnd.fill_gaussian(src); - resizable_tensor dest_cuda, dest_cpu; - dest_cuda.copy_size(src); - dest_cpu.copy_size(src); - // initialize to different values in order to make sure the output is actually changed - dest_cuda = 1; - dest_cpu = 2; - cuda::elu(dest_cuda, src, alpha); - cpu::elu(dest_cpu, src, alpha); - auto error = max(abs(mat(dest_cuda) - mat(dest_cpu))); - DLIB_TEST_MSG(error < 1e-7, "error: " << error); - // test gradients - resizable_tensor grad_cuda, grad_cpu, grad_input; - grad_cuda.copy_size(src); - grad_cpu.copy_size(src); - grad_input.copy_size(src); - rnd.fill_gaussian(grad_input); - grad_cuda = 0; - grad_cpu = 0; - cuda::elu_gradient(grad_cuda, dest_cuda, grad_input, alpha); - cpu::elu_gradient(grad_cpu, dest_cpu, grad_input, alpha); - error = max(abs(mat(grad_cuda) - mat(grad_cpu))); - DLIB_TEST_MSG(error < 1e-6, "error: " << error); -#endif // DLIB_USE_CUDA + IF_DLIB_USE_CUDA( + using namespace dlib::tt; + print_spinner(); + const long n = 4; + const long k = 5; + const long nr = 3; + const long nc = 3; + const float alpha = 1.0f; + resizable_tensor src(n, k, nr, nc); + tt::tensor_rand rnd; + rnd.fill_gaussian(src); + resizable_tensor dest_cuda, dest_cpu; + dest_cuda.copy_size(src); + dest_cpu.copy_size(src); + // initialize to different values in order to make sure the output is actually changed + dest_cuda = 1; + dest_cpu = 2; + cuda::elu(dest_cuda, src, alpha); + cpu::elu(dest_cpu, src, alpha); + auto error = max(abs(mat(dest_cuda) - mat(dest_cpu))); + DLIB_TEST_MSG(error < 1e-7, "error: " << error); + // test gradients + resizable_tensor grad_cuda, grad_cpu, grad_input; + grad_cuda.copy_size(src); + grad_cpu.copy_size(src); + grad_input.copy_size(src); + rnd.fill_gaussian(grad_input); + grad_cuda = 0; + grad_cpu = 0; + cuda::elu_gradient(grad_cuda, dest_cuda, grad_input, alpha); + cpu::elu_gradient(grad_cpu, dest_cpu, grad_input, alpha); + error = max(abs(mat(grad_cuda) - mat(grad_cpu))); + DLIB_TEST_MSG(error < 1e-6, "error: " << error); + ) } void test_gelu() { -#ifdef DLIB_USE_CUDA - // make sure that cuda::gelu and cpu::gelu return the same results - using namespace dlib::tt; - print_spinner(); - const long n = 4; - const long k = 5; - const long nr = 3; - const long nc = 3; - resizable_tensor src(n,k,nr,nc); - tt::tensor_rand rnd; - rnd.fill_gaussian(src); + IF_DLIB_USE_CUDA( + // make sure that cuda::gelu and cpu::gelu return the same results + using namespace dlib::tt; + print_spinner(); + const long n = 4; + const long k = 5; + const long nr = 3; + const long nc = 3; + resizable_tensor src(n,k,nr,nc); + tt::tensor_rand rnd; + rnd.fill_gaussian(src); - resizable_tensor dest1, dest2; - dest1.copy_size(src); - dest2.copy_size(src); - // initialize to different values in order to make sure the output is actually changed - dest1 = 1; - dest2 = 2; - cuda::gelu(dest1, src); - cpu::gelu(dest2, src); - DLIB_TEST_MSG(max(abs(mat(dest1) - mat(dest2))) < 1e-6, max(abs(mat(dest1) - mat(dest2)))); -#endif // DLIB_USE_CUDA + resizable_tensor dest1, dest2; + 
dest1.copy_size(src); + dest2.copy_size(src); + // initialize to different values in order to make sure the output is actually changed + dest1 = 1; + dest2 = 2; + cuda::gelu(dest1, src); + cpu::gelu(dest2, src); + DLIB_TEST_MSG(max(abs(mat(dest1) - mat(dest2))) < 1e-6, max(abs(mat(dest1) - mat(dest2)))); + ) } void test_smelu() { -#ifdef DLIB_USE_CUDA - using namespace dlib::tt; - print_spinner(); - const long n = 4; - const long k = 5; - const long nr = 3; - const long nc = 3; - const float beta = 1; - resizable_tensor src(n, k, nr, nc); - tt::tensor_rand rnd; - rnd.fill_gaussian(src); - resizable_tensor dest_cuda, dest_cpu; - dest_cuda.copy_size(src); - dest_cpu.copy_size(src); - // initialize to different values in order to make sure the output is actually changed - dest_cuda = 1; - dest_cpu = 2; - cuda::smelu(dest_cuda, src, beta); - cpu::smelu(dest_cpu, src, beta); - - DLIB_TEST_MSG(max(abs(mat(dest_cuda) - mat(dest_cpu))) < 1e-7, max(abs(mat(dest_cuda) - mat(dest_cpu)))); -#endif // DLIB_USE_CUDA + IF_DLIB_USE_CUDA( + using namespace dlib::tt; + print_spinner(); + const long n = 4; + const long k = 5; + const long nr = 3; + const long nc = 3; + const float beta = 1; + resizable_tensor src(n, k, nr, nc); + tt::tensor_rand rnd; + rnd.fill_gaussian(src); + resizable_tensor dest_cuda, dest_cpu; + dest_cuda.copy_size(src); + dest_cpu.copy_size(src); + // initialize to different values in order to make sure the output is actually changed + dest_cuda = 1; + dest_cpu = 2; + cuda::smelu(dest_cuda, src, beta); + cpu::smelu(dest_cpu, src, beta); + + DLIB_TEST_MSG(max(abs(mat(dest_cuda) - mat(dest_cpu))) < 1e-7, max(abs(mat(dest_cuda) - mat(dest_cpu)))); + ) } void test_silu() { -#ifdef DLIB_USE_CUDA - using namespace dlib::tt; - print_spinner(); - const long n = 4; - const long k = 5; - const long nr = 3; - const long nc = 3; - resizable_tensor src(n, k, nr, nc); - tt::tensor_rand rnd; - rnd.fill_gaussian(src); - resizable_tensor dest_cuda, dest_cpu; - dest_cuda.copy_size(src); - dest_cpu.copy_size(src); - // initialize to different values in order to make sure the output is actually changed - dest_cuda = 1; - dest_cpu = 2; - cuda::silu(dest_cuda, src); - cpu::silu(dest_cpu, src); - - DLIB_TEST_MSG(max(abs(mat(dest_cuda) - mat(dest_cpu))) < 1e-6, max(abs(mat(dest_cuda) - mat(dest_cpu)))); -#endif // DLIB_USE_CUDA + IF_DLIB_USE_CUDA( + using namespace dlib::tt; + print_spinner(); + const long n = 4; + const long k = 5; + const long nr = 3; + const long nc = 3; + resizable_tensor src(n, k, nr, nc); + tt::tensor_rand rnd; + rnd.fill_gaussian(src); + resizable_tensor dest_cuda, dest_cpu; + dest_cuda.copy_size(src); + dest_cpu.copy_size(src); + // initialize to different values in order to make sure the output is actually changed + dest_cuda = 1; + dest_cpu = 2; + cuda::silu(dest_cuda, src); + cpu::silu(dest_cpu, src); + + DLIB_TEST_MSG(max(abs(mat(dest_cuda) - mat(dest_cpu))) < 1e-6, max(abs(mat(dest_cuda) - mat(dest_cpu)))); + ) } void test_batch_normalize() @@ -710,28 +711,28 @@ namespace DLIB_TEST(::std::abs(rs.stddev() - 1.0f) < 0.01); } // check that the CPU and the CUDA implementation are equivalent -#ifdef DLIB_USE_CUDA - resizable_tensor y_cuda(x); - resizable_tensor means_cuda(x.num_samples()), invstds_cuda(x.num_samples()); - cuda::layer_normalize(eps, y_cuda, means_cuda, invstds_cuda, x, gamma, beta); - DLIB_TEST(max(abs(mat(y_cpu) - mat(y_cuda))) < 1e-5); - DLIB_TEST(max(abs(mat(means_cpu) - mat(means_cuda))) < 1e-5); - DLIB_TEST(max(abs(mat(invstds_cpu) - mat(invstds_cuda))) < 1e-5); - 
resizable_tensor gradient_input(x); - resizable_tensor src_grad_cpu(x), gamma_grad_cpu(1, x.k(), 1, 1), beta_grad_cpu(1, x.k(), 1, 1); - resizable_tensor src_grad_cuda(x), gamma_grad_cuda(1, x.k(), 1, 1), beta_grad_cuda(1, x.k(), 1, 1); - resizable_tensor dmeans_cpu, dvars_cpu, dmeans_cuda, dvars_cuda; - rnd.fill_gaussian(gradient_input); - src_grad_cpu = 0; - src_grad_cuda = 0; - cpu::layer_normalize_gradient(eps, gradient_input, means_cpu, invstds_cpu, x, gamma, src_grad_cpu, gamma_grad_cpu, beta_grad_cpu, dmeans_cpu, dvars_cpu); - cuda::layer_normalize_gradient(eps, gradient_input, means_cuda, invstds_cuda, x, gamma, src_grad_cuda, gamma_grad_cuda, beta_grad_cuda, dmeans_cuda, dvars_cuda); - DLIB_TEST(max(abs(mat(src_grad_cpu) - mat(src_grad_cuda))) < 1e-5); - DLIB_TEST(max(abs(mat(gamma_grad_cpu) - mat(gamma_grad_cuda))) < 1e-5); - DLIB_TEST(max(abs(mat(beta_grad_cpu) - mat(beta_grad_cuda))) < 1e-5); - DLIB_TEST(max(abs(mat(dmeans_cpu) - mat(dmeans_cuda))) < 1e-4); - DLIB_TEST(max(abs(mat(dvars_cpu) - mat(dvars_cuda))) < 1e-4); -#endif + IF_DLIB_USE_CUDA( + resizable_tensor y_cuda(x); + resizable_tensor means_cuda(x.num_samples()), invstds_cuda(x.num_samples()); + cuda::layer_normalize(eps, y_cuda, means_cuda, invstds_cuda, x, gamma, beta); + DLIB_TEST(max(abs(mat(y_cpu) - mat(y_cuda))) < 1e-5); + DLIB_TEST(max(abs(mat(means_cpu) - mat(means_cuda))) < 1e-5); + DLIB_TEST(max(abs(mat(invstds_cpu) - mat(invstds_cuda))) < 1e-5); + resizable_tensor gradient_input(x); + resizable_tensor src_grad_cpu(x), gamma_grad_cpu(1, x.k(), 1, 1), beta_grad_cpu(1, x.k(), 1, 1); + resizable_tensor src_grad_cuda(x), gamma_grad_cuda(1, x.k(), 1, 1), beta_grad_cuda(1, x.k(), 1, 1); + resizable_tensor dmeans_cpu, dvars_cpu, dmeans_cuda, dvars_cuda; + rnd.fill_gaussian(gradient_input); + src_grad_cpu = 0; + src_grad_cuda = 0; + cpu::layer_normalize_gradient(eps, gradient_input, means_cpu, invstds_cpu, x, gamma, src_grad_cpu, gamma_grad_cpu, beta_grad_cpu, dmeans_cpu, dvars_cpu); + cuda::layer_normalize_gradient(eps, gradient_input, means_cuda, invstds_cuda, x, gamma, src_grad_cuda, gamma_grad_cuda, beta_grad_cuda, dmeans_cuda, dvars_cuda); + DLIB_TEST(max(abs(mat(src_grad_cpu) - mat(src_grad_cuda))) < 1e-5); + DLIB_TEST(max(abs(mat(gamma_grad_cpu) - mat(gamma_grad_cuda))) < 1e-5); + DLIB_TEST(max(abs(mat(beta_grad_cpu) - mat(beta_grad_cuda))) < 1e-5); + DLIB_TEST(max(abs(mat(dmeans_cpu) - mat(dmeans_cuda))) < 1e-4); + DLIB_TEST(max(abs(mat(dvars_cpu) - mat(dvars_cuda))) < 1e-4); + ) } // ---------------------------------------------------------------------------------------- @@ -810,21 +811,21 @@ namespace DLIB_TEST(!backward_error_found); // check that the CPU and the CUDA implementation are equivalent -#ifdef DLIB_USE_CUDA - resizable_tensor y_cuda(x); - resizable_tensor scale_cuda; - cuda::rms_normalize(eps, y_cuda, scale_cuda, x, gamma); - DLIB_TEST(max(abs(mat(y_cpu) - mat(y_cuda))) < 1e-5); - DLIB_TEST(max(abs(mat(scale_cpu) - mat(scale_cuda))) < 1e-5); - - resizable_tensor src_grad_cuda(x), gamma_grad_cuda(1, x.k()); - resizable_tensor dscale_cuda(x.num_samples()); - src_grad_cuda = 0; - cuda::rms_normalize_gradient(gradient_input, scale_cuda, x, gamma, src_grad_cuda, gamma_grad_cuda, dscale_cuda); - DLIB_TEST(max(abs(mat(src_grad_cpu) - mat(src_grad_cuda))) < 1e-5); - DLIB_TEST(max(abs(mat(gamma_grad_cpu) - mat(gamma_grad_cuda))) < 1e-5); - DLIB_TEST(max(abs(mat(dscale_cpu) - mat(dscale_cuda))) < 1e-5); -#endif + IF_DLIB_USE_CUDA( + resizable_tensor y_cuda(x); + resizable_tensor scale_cuda; + 
cuda::rms_normalize(eps, y_cuda, scale_cuda, x, gamma); + DLIB_TEST(max(abs(mat(y_cpu) - mat(y_cuda))) < 1e-5); + DLIB_TEST(max(abs(mat(scale_cpu) - mat(scale_cuda))) < 1e-5); + + resizable_tensor src_grad_cuda(x), gamma_grad_cuda(1, x.k()); + resizable_tensor dscale_cuda(x.num_samples()); + src_grad_cuda = 0; + cuda::rms_normalize_gradient(gradient_input, scale_cuda, x, gamma, src_grad_cuda, gamma_grad_cuda, dscale_cuda); + DLIB_TEST(max(abs(mat(src_grad_cpu) - mat(src_grad_cuda))) < 1e-5); + DLIB_TEST(max(abs(mat(gamma_grad_cpu) - mat(gamma_grad_cuda))) < 1e-5); + DLIB_TEST(max(abs(mat(dscale_cpu) - mat(dscale_cuda))) < 1e-5); + ) } // ---------------------------------------------------------------------------------------- @@ -847,15 +848,15 @@ namespace input *= 2; DLIB_TEST(max(abs(mat(output_cpu_b) - mat(input))) < 1e-5); -#ifdef DLIB_USE_CUDA - input /= 2; - resizable_tensor output_cuda_a, output_cuda_b(input); - output_cuda_a.copy_size(output_cpu_a); - cuda::transpose(false, output_cuda_a, input); - cuda::transpose(true, output_cuda_b, output_cuda_a); - DLIB_TEST(max(abs(mat(output_cpu_a) - mat(output_cuda_a))) < 1e-5); - DLIB_TEST(max(abs(mat(output_cpu_b) - mat(output_cuda_b))) < 1e-5); -#endif + IF_DLIB_USE_CUDA( + input /= 2; + resizable_tensor output_cuda_a, output_cuda_b(input); + output_cuda_a.copy_size(output_cpu_a); + cuda::transpose(false, output_cuda_a, input); + cuda::transpose(true, output_cuda_b, output_cuda_a); + DLIB_TEST(max(abs(mat(output_cpu_a) - mat(output_cuda_a))) < 1e-5); + DLIB_TEST(max(abs(mat(output_cpu_b) - mat(output_cuda_b))) < 1e-5); + ) } // ---------------------------------------------------------------------------------------- @@ -1093,31 +1094,31 @@ void test_embeddings() memcpy(A, truth); DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); -#ifdef DLIB_USE_CUDA - A = 4; - A.device(); - B.host(); - memcpy(A, truth); - DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); + IF_DLIB_USE_CUDA( + A = 4; + A.device(); + B.host(); + memcpy(A, truth); + DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); - A = 4; - A.device(); - B.device(); - memcpy(A, truth); - DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); + A = 4; + A.device(); + B.device(); + memcpy(A, truth); + DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); - A = 4; - A.host(); - B.device(); - memcpy(A, truth); - DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); + A = 4; + A.host(); + B.device(); + memcpy(A, truth); + DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); - A = 4; - A.host_write_only(); - B.device(); - memcpy(A, truth); - DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); -#endif + A = 4; + A.host_write_only(); + B.device(); + memcpy(A, truth); + DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); + ) } { @@ -1166,69 +1167,69 @@ void test_embeddings() } -#ifdef DLIB_USE_CUDA - A = 4; - A.device(); - B.host(); - { - // non-aliasing test - auto aA = at(A,5); - auto aB = at(B,5); - memcpy(aA, aB); - truth = {4,4,4,4,4, 1,1,1,1,1, 4}; - DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); - } - { - // aliasing test - auto aA = at(A,1); - auto aB = at(A,6); - memcpy(aA, aB); - truth = {4,1,1,1,1, 4,1,1,1,1, 4}; - DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); - } + IF_DLIB_USE_CUDA( + A = 4; + A.device(); + B.host(); + { + // non-aliasing test + auto aA = at(A,5); + auto aB = at(B,5); + memcpy(aA, aB); + truth = {4,4,4,4,4, 1,1,1,1,1, 4}; + DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); + } + { + // aliasing test + auto aA = at(A,1); + auto aB = at(A,6); + memcpy(aA, aB); + truth = {4,1,1,1,1, 4,1,1,1,1, 4}; + 
DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); + } - A = 4; - A.device(); - B.device(); - { - // non-aliasing test - auto aA = at(A,5); - auto aB = at(B,5); - memcpy(aA, aB); - truth = {4,4,4,4,4, 1,1,1,1,1, 4}; - DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); - } - { - // aliasing test - auto aA = at(A,1); - auto aB = at(A,6); - memcpy(aA, aB); - truth = {4,1,1,1,1, 4,1,1,1,1, 4}; - DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); - } + A = 4; + A.device(); + B.device(); + { + // non-aliasing test + auto aA = at(A,5); + auto aB = at(B,5); + memcpy(aA, aB); + truth = {4,4,4,4,4, 1,1,1,1,1, 4}; + DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); + } + { + // aliasing test + auto aA = at(A,1); + auto aB = at(A,6); + memcpy(aA, aB); + truth = {4,1,1,1,1, 4,1,1,1,1, 4}; + DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); + } - A = 4; - A.host(); - B.device(); - { - // non-aliasing test - auto aA = at(A,5); - auto aB = at(B,5); - memcpy(aA, aB); - truth = {4,4,4,4,4, 1,1,1,1,1, 4}; - DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); - } - { - // aliasing test - auto aA = at(A,1); - auto aB = at(A,6); - memcpy(aA, aB); - truth = {4,1,1,1,1, 4,1,1,1,1, 4}; - DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); - } + A = 4; + A.host(); + B.device(); + { + // non-aliasing test + auto aA = at(A,5); + auto aB = at(B,5); + memcpy(aA, aB); + truth = {4,4,4,4,4, 1,1,1,1,1, 4}; + DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); + } + { + // aliasing test + auto aA = at(A,1); + auto aB = at(A,6); + memcpy(aA, aB); + truth = {4,1,1,1,1, 4,1,1,1,1, 4}; + DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); + } -#endif + ) } { @@ -3671,22 +3672,22 @@ void test_multm_prev() trainer.train(inputs, labels); const auto error_after = compute_error(); DLIB_TEST_MSG(error_after < error_before, "multi channel error increased after training"); -#if DLIB_USE_CUDA - cuda::compute_loss_mean_squared_per_channel_and_pixel cuda_compute; - cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute; - double cuda_loss, cpu_loss; - const tensor& output_tensor = net.subnet().get_output(); - resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor); - cuda_compute(labels.begin(), output_tensor, cuda_grad, cuda_loss); - cpu_compute(labels.begin(), output_tensor, cpu_grad, cpu_loss); - DLIB_TEST(cuda_grad.size() == cpu_grad.size()); - for (size_t i = 0; i < cuda_grad.size(); ++i) - { - DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8); - } - const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; - DLIB_TEST_MSG(err < 1e-6, "multi channel cuda and cpu losses differ"); -#endif + IF_DLIB_USE_CUDA( + cuda::compute_loss_mean_squared_per_channel_and_pixel cuda_compute; + cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute; + double cuda_loss, cpu_loss; + const tensor& output_tensor = net.subnet().get_output(); + resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor); + cuda_compute(labels.begin(), output_tensor, cuda_grad, cuda_loss); + cpu_compute(labels.begin(), output_tensor, cpu_grad, cpu_loss); + DLIB_TEST(cuda_grad.size() == cpu_grad.size()); + for (size_t i = 0; i < cuda_grad.size(); ++i) + { + DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8); + } + const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; + DLIB_TEST_MSG(err < 1e-6, "multi channel cuda and cpu losses differ"); + ) } // ---------------------------------------------------------------------------------------- @@ -3882,22 +3883,22 @@ void test_multm_prev() DLIB_TEST_MSG(num_correct >= num_correct_required, "Number of correctly 
classified elements = " << num_correct << ", required = " << num_correct_required); -#if DLIB_USE_CUDA - cuda::compute_loss_binary_log_per_pixel cuda_compute; - cpu::compute_loss_binary_log_per_pixel cpu_compute; - double cuda_loss, cpu_loss; - const tensor& output_tensor = net.subnet().get_output(); - resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor); - cuda_compute(y.begin(), output_tensor, cuda_grad, cuda_loss); - cpu_compute(y.begin(), output_tensor, cpu_grad, cpu_loss); - DLIB_TEST(cuda_grad.size() == cpu_grad.size()); - for (size_t i = 0; i < cuda_grad.size(); ++i) - { - DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8); - } - const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; - DLIB_TEST_MSG(err < 1e-6, "binary log per pixel cuda and cpu losses differ"); -#endif + IF_DLIB_USE_CUDA( + cuda::compute_loss_binary_log_per_pixel cuda_compute; + cpu::compute_loss_binary_log_per_pixel cpu_compute; + double cuda_loss, cpu_loss; + const tensor& output_tensor = net.subnet().get_output(); + resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor); + cuda_compute(y.begin(), output_tensor, cuda_grad, cuda_loss); + cpu_compute(y.begin(), output_tensor, cpu_grad, cpu_loss); + DLIB_TEST(cuda_grad.size() == cpu_grad.size()); + for (size_t i = 0; i < cuda_grad.size(); ++i) + { + DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8); + } + const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; + DLIB_TEST_MSG(err < 1e-6, "binary log per pixel cuda and cpu losses differ"); + ) } // ---------------------------------------------------------------------------------------- @@ -4233,22 +4234,22 @@ void test_multm_prev() DLIB_TEST_MSG(num_correct >= num_correct_required, "Number of correctly classified elements = " << num_correct << ", required = " << num_correct_required); -#if DLIB_USE_CUDA - cuda::compute_loss_multiclass_log_per_pixel cuda_compute; - cpu::compute_loss_multiclass_log_per_pixel cpu_compute; - double cuda_loss, cpu_loss; - const tensor& output_tensor = net.subnet().get_output(); - resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor); - cuda_compute(y.begin(), output_tensor, cuda_grad, cuda_loss); - cpu_compute(y.begin(), output_tensor, cpu_grad, cpu_loss); - DLIB_TEST(cuda_grad.size() == cpu_grad.size()); - for (size_t i = 0; i < cuda_grad.size(); ++i) - { - DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8); - } - const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; - DLIB_TEST_MSG(err < 1e-6, "multiclass log per pixel cuda and cpu losses differ"); -#endif + IF_DLIB_USE_CUDA( + cuda::compute_loss_multiclass_log_per_pixel cuda_compute; + cpu::compute_loss_multiclass_log_per_pixel cpu_compute; + double cuda_loss, cpu_loss; + const tensor& output_tensor = net.subnet().get_output(); + resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor); + cuda_compute(y.begin(), output_tensor, cuda_grad, cuda_loss); + cpu_compute(y.begin(), output_tensor, cpu_grad, cpu_loss); + DLIB_TEST(cuda_grad.size() == cpu_grad.size()); + for (size_t i = 0; i < cuda_grad.size(); ++i) + { + DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8); + } + const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; + DLIB_TEST_MSG(err < 1e-6, "multiclass log per pixel cuda and cpu losses differ"); + ) } // ---------------------------------------------------------------------------------------- @@ -4344,22 +4345,22 @@ void test_multm_prev() "The weighted class (" 
<< weighted_class << ") does not dominate: " << num_weighted_class << " <= " << num_not_weighted_class); -#if DLIB_USE_CUDA - cuda::compute_loss_multiclass_log_per_pixel_weighted cuda_compute; - cpu::compute_loss_multiclass_log_per_pixel_weighted cpu_compute; - double cuda_loss, cpu_loss; - const tensor& output_tensor = net.subnet().get_output(); - resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor); - cuda_compute(y_weighted.begin(), output_tensor, cuda_grad, cuda_loss); - cpu_compute(y_weighted.begin(), output_tensor, cpu_grad, cpu_loss); - DLIB_TEST(cuda_grad.size() == cpu_grad.size()); - for (size_t i = 0; i < cuda_grad.size(); ++i) - { - DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8); - } - const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; - DLIB_TEST_MSG(err < 1e-5, "multi class log per pixel weighted cuda and cpu losses differ: " << err); -#endif + IF_DLIB_USE_CUDA( + cuda::compute_loss_multiclass_log_per_pixel_weighted cuda_compute; + cpu::compute_loss_multiclass_log_per_pixel_weighted cpu_compute; + double cuda_loss, cpu_loss; + const tensor& output_tensor = net.subnet().get_output(); + resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor); + cuda_compute(y_weighted.begin(), output_tensor, cuda_grad, cuda_loss); + cpu_compute(y_weighted.begin(), output_tensor, cpu_grad, cpu_loss); + DLIB_TEST(cuda_grad.size() == cpu_grad.size()); + for (size_t i = 0; i < cuda_grad.size(); ++i) + { + DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8); + } + const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; + DLIB_TEST_MSG(err < 1e-5, "multi class log per pixel weighted cuda and cpu losses differ: " << err); + ) } } @@ -4526,10 +4527,10 @@ void test_multm_prev() img = 1; img.host()[idx] = 2; cpu::resize_bilinear(out, img); -#ifdef DLIB_USE_CUDA - cuda::resize_bilinear(out2, img); - DLIB_TEST(max(abs(mat(out)-mat(out2))) < 1e-5); -#endif + IF_DLIB_USE_CUDA( + cuda::resize_bilinear(out2, img); + DLIB_TEST(max(abs(mat(out)-mat(out2))) < 1e-5); + ) resizable_tensor gradient_input; gradient_input.copy_size(out); @@ -4560,12 +4561,12 @@ void test_multm_prev() dlog << LINFO << "analytic grad: "<< grad2.host()[idx]-0.1; DLIB_TEST_MSG(std::abs(numerical_grad - grad2.host()[idx]+0.1) < 1e-2, std::abs(numerical_grad - grad2.host()[idx]+0.1) << " numerical_grad: " << numerical_grad); -#ifdef DLIB_USE_CUDA - cuda::resize_bilinear_gradient(grad, gradient_input); - dlog << LINFO << "analytic grad: "<< grad.host()[idx]-0.1; - DLIB_TEST_MSG(std::abs(numerical_grad - grad.host()[idx]+0.1) < 1e-2, std::abs(numerical_grad - grad.host()[idx]+0.1) << " numerical_grad: " << numerical_grad); - DLIB_TEST(max(abs(mat(grad)-mat(grad2))) < 1e-5); -#endif + IF_DLIB_USE_CUDA( + cuda::resize_bilinear_gradient(grad, gradient_input); + dlog << LINFO << "analytic grad: "<< grad.host()[idx]-0.1; + DLIB_TEST_MSG(std::abs(numerical_grad - grad.host()[idx]+0.1) < 1e-2, std::abs(numerical_grad - grad.host()[idx]+0.1) << " numerical_grad: " << numerical_grad); + DLIB_TEST(max(abs(mat(grad)-mat(grad2))) < 1e-5); + ) } @@ -4586,11 +4587,11 @@ void test_multm_prev() auto wout = aout(out, out.nc()*1+1); auto wimg = aimg(img, img.nc()*1+1); cpu::resize_bilinear(wout,out.nc(),out.nr()*out.nc(), wimg,img.nc(),img.nr()*img.nc()); -#ifdef DLIB_USE_CUDA - auto wout2 = aout(out2, out2.nc()*1+1); - cuda::resize_bilinear(wout2,out2.nc(),out2.nr()*out2.nc(), wimg,img.nc(),img.nr()*img.nc()); - DLIB_TEST(max(abs(mat(out)-mat(out2))) < 1e-5); -#endif + 
IF_DLIB_USE_CUDA( + auto wout2 = aout(out2, out2.nc()*1+1); + cuda::resize_bilinear(wout2,out2.nc(),out2.nr()*out2.nc(), wimg,img.nc(),img.nr()*img.nc()); + DLIB_TEST(max(abs(mat(out)-mat(out2))) < 1e-5); + ) resizable_tensor gradient_input; @@ -4628,15 +4629,14 @@ void test_multm_prev() dlog << LINFO << "analytic grad: "<< grad2.host()[idx]-0.1; DLIB_TEST_MSG(std::abs(numerical_grad - grad2.host()[idx]+0.1) < 1e-2, std::abs(numerical_grad - grad2.host()[idx]+0.1) << " numerical_grad: " << numerical_grad); -#ifdef DLIB_USE_CUDA - wgrad2 = aimg(grad, grad.nc()*1+1); - wgradient_input = aout(gradient_input, gradient_input.nc()*1+1); - cuda::resize_bilinear_gradient(wgrad2,grad.nc(),grad.nr()*grad.nc(), wgradient_input,gradient_input.nc(),gradient_input.nr()*gradient_input.nc()); - dlog << LINFO << "analytic grad: "<< grad.host()[idx]-0.1; - DLIB_TEST_MSG(std::abs(numerical_grad - grad.host()[idx]+0.1) < 1e-2, std::abs(numerical_grad - grad.host()[idx]+0.1) << " numerical_grad: " << numerical_grad); - DLIB_TEST_MSG(max(abs(mat(grad)-mat(grad2))) < 1e-5, max(abs(mat(grad)-mat(grad2)))); -#endif - + IF_DLIB_USE_CUDA( + wgrad2 = aimg(grad, grad.nc()*1+1); + wgradient_input = aout(gradient_input, gradient_input.nc()*1+1); + cuda::resize_bilinear_gradient(wgrad2,grad.nc(),grad.nr()*grad.nc(), wgradient_input,gradient_input.nc(),gradient_input.nr()*gradient_input.nc()); + dlog << LINFO << "analytic grad: "<< grad.host()[idx]-0.1; + DLIB_TEST_MSG(std::abs(numerical_grad - grad.host()[idx]+0.1) < 1e-2, std::abs(numerical_grad - grad.host()[idx]+0.1) << " numerical_grad: " << numerical_grad); + DLIB_TEST_MSG(max(abs(mat(grad)-mat(grad2))) < 1e-5, max(abs(mat(grad)-mat(grad2)))); + ) } } @@ -5036,20 +5036,20 @@ void test_multm_prev() void test_reorg() { -#ifdef DLIB_USE_CUDA - print_spinner(); - resizable_tensor x(2, 4, 8, 16); - resizable_tensor out_cpu(2, 16, 4, 8), out_cuda(2, 16, 4, 8); - resizable_tensor grad_cpu(x), grad_cuda(x); - tt::tensor_rand rnd; - rnd.fill_gaussian(x); - cpu::reorg(false, out_cpu, 2, 2, x); - cuda::reorg(false, out_cuda, 2, 2, x); - DLIB_TEST(max(squared(mat(out_cuda) - mat(out_cpu))) == 0); - cpu::reorg_gradient(false, grad_cpu, 2, 2, out_cpu); - cuda::reorg_gradient(false, grad_cuda, 2, 2, out_cuda); - DLIB_TEST(max(squared(mat(out_cuda) - mat(out_cpu))) == 0); -#endif + IF_DLIB_USE_CUDA( + print_spinner(); + resizable_tensor x(2, 4, 8, 16); + resizable_tensor out_cpu(2, 16, 4, 8), out_cuda(2, 16, 4, 8); + resizable_tensor grad_cpu(x), grad_cuda(x); + tt::tensor_rand rnd; + rnd.fill_gaussian(x); + cpu::reorg(false, out_cpu, 2, 2, x); + cuda::reorg(false, out_cuda, 2, 2, x); + DLIB_TEST(max(squared(mat(out_cuda) - mat(out_cpu))) == 0); + cpu::reorg_gradient(false, grad_cpu, 2, 2, out_cpu); + cuda::reorg_gradient(false, grad_cuda, 2, 2, out_cuda); + DLIB_TEST(max(squared(mat(out_cuda) - mat(out_cpu))) == 0); + ) } void test_input_tensor() @@ -5143,27 +5143,28 @@ void test_multm_prev() srand(1234); test_tagging(); -#ifdef DLIB_USE_CUDA - test_affine_rect(); - test_conv(); - test_more_ops2(); - test_more_ops(1,1); - test_more_ops(3,4); - test_more_ops(4,3); - test_more_ops(4,1); - test_more_ops(1,4); - test_more_ops(10000,4); - compare_bn_gpu_and_cpu(); - compare_bn_conv_gpu_and_cpu(); - test_add(); - test_multiply_zero_padded(); - compare_adam(); - test_copy_tensor_gpu(); - test_copy_tensor_add_to_gpu(); - test_copy_tensor_gpu(); - test_copy_tensor_add_to_gpu(); - test_scale_channels(); -#endif + IF_DLIB_USE_CUDA( + test_affine_rect(); + test_conv(); + test_more_ops2(); + 
test_more_ops(1,1); + test_more_ops(3,4); + test_more_ops(4,3); + test_more_ops(4,1); + test_more_ops(1,4); + test_more_ops(10000,4); + compare_bn_gpu_and_cpu(); + compare_bn_conv_gpu_and_cpu(); + test_add(); + test_multiply_zero_padded(); + compare_adam(); + test_copy_tensor_gpu(); + test_copy_tensor_add_to_gpu(); + test_copy_tensor_gpu(); + test_copy_tensor_add_to_gpu(); + test_scale_channels(); + ) + test_tensor_resize_bilinear(2, 3, 6,6, 11, 11); test_tensor_resize_bilinear(2, 3, 6,6, 3, 4); test_tensor_resize_bilinear(2, 3, 5,6, 12, 21);
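Since CUDA dispatch is now a runtime decision, parity checks like the ones above no longer have to be gated purely on DLIB_USE_CUDA; a single CUDA-enabled test binary could exercise both code paths by flipping the thread-local flag. A hypothetical harness along these lines (run_on_both_paths is not part of this patch; it only uses the is_available()/set_use_cuda() API the patch introduces):

    // Hypothetical helper: run a test body on the CUDA path (when a
    // device exists) and then again on the CPU fallback path.
    template <typename TestFn>
    void run_on_both_paths(TestFn&& test)
    {
        if (dlib::cuda::is_available())
        {
            dlib::cuda::set_use_cuda(true);   // exercise the CUDA branches
            test();
        }
        dlib::cuda::set_use_cuda(false);      // then the CPU branches
        test();
    }

    // e.g. run_on_both_paths([]{ test_tensor_resize_bilinear(2, 3, 6,6, 11, 11); });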