diff --git a/dlib/cuda/cuda_dlib.cu b/dlib/cuda/cuda_dlib.cu
index 5d6ec4052c..921bbdbd35 100644
--- a/dlib/cuda/cuda_dlib.cu
+++ b/dlib/cuda/cuda_dlib.cu
@@ -9,6 +9,17 @@
 namespace dlib
 {
+    namespace
+    {
+        bool& use_cuda_impl (
+        )
+        {
+            thread_local bool var(cuda::is_available());
+            return var;
+        }
+
+    }
+
     namespace cuda
     {
@@ -18,14 +29,16 @@
             int dev
         )
        {
-            CHECK_CUDA(cudaSetDevice(dev));
+            if (is_available())
+                CHECK_CUDA(cudaSetDevice(dev));
        }

        int get_device (
        )
        {
-            int dev = 0;
-            CHECK_CUDA(cudaGetDevice(&dev));
+            int dev = -1;
+            if (is_available())
+                CHECK_CUDA(cudaGetDevice(&dev));
            return dev;
        }

@@ -44,6 +57,27 @@
             CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
         }

+        bool is_available(
+        )
+        {
+            int num_devices;
+            return cudaGetDeviceCount(&num_devices) == cudaSuccess && num_devices > 0;
+        }
+
+        bool use_cuda(
+        )
+        {
+            return use_cuda_impl();
+        }
+
+        void set_use_cuda(
+            bool flag
+        )
+        {
+            if (is_available())
+                use_cuda_impl() = flag;
+        }
+
         int get_num_devices (
         )
         {
diff --git a/dlib/cuda/cuda_dlib.h b/dlib/cuda/cuda_dlib.h
index 2f22b7e23e..289b2c6ec3 100644
--- a/dlib/cuda/cuda_dlib.h
+++ b/dlib/cuda/cuda_dlib.h
@@ -25,6 +25,16 @@ namespace dlib
         int get_num_devices (
         );

+        bool is_available (
+        );
+
+        bool use_cuda(
+        );
+
+        void set_use_cuda(
+            bool flag
+        );
+
         std::string get_device_name (
             int device
         );
@@ -892,6 +902,16 @@ namespace dlib
         inline int get_num_devices (
         ) { return 1; }

+        inline bool is_available (
+        ) { return false; }
+
+        inline bool use_cuda(
+        ) { return false; }
+
+        inline void set_use_cuda(
+            bool flag
+        ) {}
+
         inline std::string get_device_name (
             int device
         )
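Usage note: the new runtime switch above is meant to be driven from user code roughly as follows. This is a minimal sketch, not part of the patch; it assumes the declarations land in dlib::cuda and are visible through <dlib/dnn.h>:

    #include <dlib/dnn.h>
    #include <iostream>

    int main()
    {
        // True when the process can see at least one CUDA device, independent of the flag.
        std::cout << "CUDA available: " << dlib::cuda::is_available() << '\n';

        // The flag defaults to is_available() and is thread local.  Turning it off forces
        // the CPU code paths for the calling thread only.
        dlib::cuda::set_use_cuda(false);
        std::cout << "use_cuda: " << dlib::cuda::use_cuda() << '\n';   // prints 0
    }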
diff --git a/dlib/cuda/curand_dlibapi.cpp b/dlib/cuda/curand_dlibapi.cpp
index 67828e6640..6f8b00006b 100644
--- a/dlib/cuda/curand_dlibapi.cpp
+++ b/dlib/cuda/curand_dlibapi.cpp
@@ -6,6 +6,7 @@
 #ifdef DLIB_USE_CUDA

 #include "curand_dlibapi.h"
+#include "cuda_dlib.h"
 #include <curand.h>
 #include "../string.h"
@@ -47,11 +48,14 @@ namespace dlib
             unsigned long long seed
         ) : handle(nullptr)
         {
-            curandGenerator_t gen;
-            CHECK_CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
-            handle = gen;
+            if (is_available())
+            {
+                curandGenerator_t gen;
+                CHECK_CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
+                handle = gen;

-            CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(gen, seed));
+                CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(gen, seed));
+            }
         }

         curand_generator::
diff --git a/dlib/cuda/gpu_data.cpp b/dlib/cuda/gpu_data.cpp
index 64f184aede..413c3996ee 100644
--- a/dlib/cuda/gpu_data.cpp
+++ b/dlib/cuda/gpu_data.cpp
@@ -58,10 +58,16 @@ namespace dlib
             if (dest_offset == 0 && num == dest.size())
             {
                 // copy the memory efficiently based on which copy is current in each object.
-                if (src.device_ready())
+                if (dest.device_id() >= 0 && src.device_ready())
                     CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToDevice));
-                else
+                else if (dest.device_id() < 0 && src.device_ready())
+                    CHECK_CUDA(cudaMemcpy(dest.host_write_only(), src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToHost));
+                else if (dest.device_id() >= 0 && !src.device_ready())
                     CHECK_CUDA(cudaMemcpy(dest.device_write_only(), src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice));
+                else if (dest.device_id() >= 0 || src.device_id() >= 0)
+                    CHECK_CUDA(cudaMemcpy(dest.host_write_only(), src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToHost));
+                else
+                    std::memcpy(dest.host_write_only(), src.host()+src_offset, num*sizeof(float));
             }
             else
             {
@@ -72,8 +78,11 @@ namespace dlib
                     CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.device()+src_offset, num*sizeof(float), cudaMemcpyDeviceToHost));
                 else if (dest.device_ready() && !src.device_ready())
                     CHECK_CUDA(cudaMemcpy(dest.device()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToDevice));
-                else
+                else if (dest.device_id() >= 0 || src.device_id() >= 0)
                     CHECK_CUDA(cudaMemcpy(dest.host()+dest_offset, src.host()+src_offset, num*sizeof(float), cudaMemcpyHostToHost));
+                else
+                    std::memcpy(dest.host()+dest_offset, src.host()+src_offset, num*sizeof(float));
+
             }
         }
     }
@@ -199,6 +208,13 @@ namespace dlib
         device_current = true;
         device_in_use = false;

+        if (!cuda::use_cuda())
+        {
+            data_host.reset(new float[new_size], std::default_delete<float[]>());
+            the_device_id = -1;
+            return;
+        }
+
         try
         {
             CHECK_CUDA(cudaGetDevice(&the_device_id));
diff --git a/dlib/cuda/gpu_data.h b/dlib/cuda/gpu_data.h
index 022a05f71c..4d37315a40 100644
--- a/dlib/cuda/gpu_data.h
+++ b/dlib/cuda/gpu_data.h
@@ -12,6 +12,14 @@
 namespace dlib
 {

+// ----------------------------------------------------------------------------------------
+
+    namespace cuda
+    {
+        bool use_cuda(
+        );
+    }
+
// ----------------------------------------------------------------------------------------

     class gpu_data
@@ -93,14 +101,16 @@ namespace dlib
         float* host()
         {
             copy_to_host();
-            device_current = false;
+            if (device_id() >= 0)
+                device_current = false;
             return data_host.get();
         }

         float* host_write_only()
         {
             host_current = true;
-            device_current = false;
+            if (device_id() >= 0)
+                device_current = false;
             return data_host.get();
         }
@@ -109,6 +119,7 @@ namespace dlib
 #ifndef DLIB_USE_CUDA
             DLIB_CASSERT(false, "CUDA NOT ENABLED");
 #endif
+            DLIB_CASSERT(device_id() >= 0, "This data is host only");
             copy_to_device();
             device_in_use = true;
             return data_device.get();
@@ -119,6 +130,7 @@ namespace dlib
 #ifndef DLIB_USE_CUDA
             DLIB_CASSERT(false, "CUDA NOT ENABLED");
 #endif
+            DLIB_CASSERT(device_id() >= 0, "This data is host only");
             copy_to_device();
             host_current = false;
             device_in_use = true;
@@ -130,6 +142,7 @@ namespace dlib
 #ifndef DLIB_USE_CUDA
             DLIB_CASSERT(false, "CUDA NOT ENABLED");
 #endif
+            DLIB_CASSERT(device_id() >= 0, "This data is host only");
             wait_for_transfer_to_finish();
             host_current = false;
             device_current = true;
@@ -141,7 +154,7 @@ namespace dlib
         ) const { return host_current; }

         bool device_ready (
-        ) const { return device_current && !have_active_transfer; }
+        ) const { return device_current && !have_active_transfer && device_id() >= 0; }

         size_t size() const { return data_size; }
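To illustrate what the gpu_data changes are after (again a sketch, not part of the patch): with the flag off, set_size() takes the new !cuda::use_cuda() branch, so the buffer is allocated host-only and device_id() reports -1, and copies between two such buffers fall through to std::memcpy instead of cudaMemcpy:

    #include <dlib/dnn.h>

    void host_only_copy()
    {
        dlib::cuda::set_use_cuda(false);   // this thread now allocates host-only buffers

        dlib::gpu_data a, b;
        a.set_size(16);                    // takes the !cuda::use_cuda() branch above
        b.set_size(16);
        a.host()[0] = 3.14f;               // touch the host copy

        dlib::memcpy(b, a);                // both device_id() == -1, so plain std::memcpy
        // Calling b.device() here would fire the new "This data is host only" assert.
    }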
diff --git a/dlib/cuda/tensor.h b/dlib/cuda/tensor.h
index 6a893df311..aaae4e836e 100644
--- a/dlib/cuda/tensor.h
+++ b/dlib/cuda/tensor.h
@@ -12,6 +12,18 @@
 #include <memory>
 #include "../any.h"

+#ifdef DLIB_USE_CUDA
+#define IF_DLIB_USE_CUDA(...) if (cuda::use_cuda()) { __VA_ARGS__ }
+#else
+#define IF_DLIB_USE_CUDA(...)
+#endif
+
+#ifdef DLIB_USE_CUDA
+#define IF_DLIB_NOT_USE_CUDA(...) if (!cuda::use_cuda()) { __VA_ARGS__ }
+#else
+#define IF_DLIB_NOT_USE_CUDA(...) __VA_ARGS__
+#endif
+
 namespace dlib
 {
@@ -77,17 +89,18 @@ namespace dlib

         tensor& operator= (float val)
         {
-#ifdef DLIB_USE_CUDA
-            // If you are using CUDA then presumably you will be mostly using tensors on
-            // the GPU.  So unless you seem to be actively working with the host side's
-            // data then we do this initialization on the device side since this avoids a
-            // host to device transfer that would likely immediately follow.
-            if (data().device_ready())
-            {
-                cuda::set_tensor(*this, val);
-                return *this;
-            }
-#endif
+            IF_DLIB_USE_CUDA(
+                // If you are using CUDA then presumably you will be mostly using tensors on
+                // the GPU.  So unless you seem to be actively working with the host side's
+                // data then we do this initialization on the device side since this avoids a
+                // host to device transfer that would likely immediately follow.
+                if (data().device_ready())
+                {
+                    cuda::set_tensor(*this, val);
+                    return *this;
+                }
+            )
+
             auto d = host_write_only();
             for (size_t i = 0; i < size(); ++i)
                 d[i] = val;
@@ -97,15 +110,16 @@ namespace dlib

         tensor& operator*= (float val)
         {
-#ifdef DLIB_USE_CUDA
-            cuda::scale_tensor(*this, val);
-            return *this;
-#else
-            for (auto& d : *this)
-                d *= val;
+            IF_DLIB_USE_CUDA(
+                cuda::scale_tensor(*this, val);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                for (auto& d : *this)
+                    d *= val;
+            )

             return *this;
-#endif
         }

         tensor& operator/= (float val)
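The two macros above carry the whole patch, so it is worth spelling out their expansion, using operator*= as the running example (a sketch): in a CUDA build both branches are compiled and picked per call at runtime; in a CPU-only build the CUDA branch disappears entirely and the CPU branch is emitted unguarded, so there is no runtime cost.

    // What the compiler sees for tensor::operator*= above:

    // 1) DLIB_USE_CUDA defined:
    if (cuda::use_cuda()) { cuda::scale_tensor(*this, val); }
    if (!cuda::use_cuda()) { for (auto& d : *this) d *= val; }
    return *this;

    // 2) DLIB_USE_CUDA not defined -- IF_DLIB_USE_CUDA(...) expands to nothing:
    for (auto& d : *this) d *= val;
    return *this;

Note that each branch is its own if statement, so a return inside IF_DLIB_USE_CUDA(...) only returns when the runtime flag is set; operator= above relies on exactly that to fall through to the host-side loop.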
diff --git a/dlib/cuda/tensor_tools.cpp b/dlib/cuda/tensor_tools.cpp
index 8dece8369f..ce3d323c92 100644
--- a/dlib/cuda/tensor_tools.cpp
+++ b/dlib/cuda/tensor_tools.cpp
@@ -49,11 +49,13 @@ namespace dlib { namespace tt
         const double eps
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::inverse_norms(invnorms, data, eps);
-#else
-        invnorms = reciprocal(sqrt(sum_cols(squared(mat(data))) + eps));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::inverse_norms(invnorms, data, eps);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            invnorms = reciprocal(sqrt(sum_cols(squared(mat(data))) + eps));
+        )
     }

     void dot_prods (
@@ -62,11 +64,13 @@ namespace dlib { namespace tt
         const tensor& rhs
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::dot_prods(out, lhs, rhs);
-#else
-        out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::dot_prods(out, lhs, rhs);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+        )
     }

     void dot_prods (
@@ -76,14 +80,16 @@ namespace dlib { namespace tt
         const tensor& rhs
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::dot_prods(add_to, out, lhs, rhs);
-#else
-        if (add_to)
-            out += sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
-        else
-            out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::dot_prods(add_to, out, lhs, rhs);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            if (add_to)
+                out += sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+            else
+                out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+        )
     }

     void scale_columns (
@@ -99,11 +105,13 @@ namespace dlib { namespace tt
         DLIB_CASSERT(m.size() != 0);
         DLIB_CASSERT(m.size()/m.num_samples() == v.size());

-#ifdef DLIB_USE_CUDA
-        cuda::scale_columns(out, m, v);
-#else
-        out = scale_columns(mat(m), mat(v));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::scale_columns(out, m, v);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            out = scale_columns(mat(m), mat(v));
+        )
     }

     void scale_rows (
@@ -119,11 +127,13 @@ namespace dlib { namespace tt
         DLIB_CASSERT(m.size() != 0);
         DLIB_CASSERT(m.num_samples() == static_cast<long>(v.size()));

-#ifdef DLIB_USE_CUDA
-        cuda::scale_rows(out, m, v);
-#else
-        out = scale_rows(mat(m), mat(v));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::scale_rows(out, m, v);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            out = scale_rows(mat(m), mat(v));
+        )
     }

     void scale_rows2 (
@@ -141,14 +151,16 @@ namespace dlib { namespace tt
         DLIB_CASSERT(is_vector(mat(v1)));
         DLIB_CASSERT(static_cast<long>(v1.size()) == m1.num_samples());

-#ifdef DLIB_USE_CUDA
-        cuda::scale_rows2(beta, out, m1, m2, v1, v2);
-#else
-        if (beta == 0)
-            out = scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2));
-        else
-            out = beta*mat(out) + scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::scale_rows2(beta, out, m1, m2, v1, v2);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            if (beta == 0)
+                out = scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2));
+            else
+                out = beta*mat(out) + scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2));
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -160,11 +172,13 @@ namespace dlib { namespace tt
     {
         DLIB_CASSERT(dest.size() == src.size());

-#ifdef DLIB_USE_CUDA
-        cuda::exp(dest,src);
-#else
-        dest = exp(mat(src));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::exp(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            dest = exp(mat(src));
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -176,11 +190,13 @@ namespace dlib { namespace tt
     {
         DLIB_CASSERT(dest.size() == src.size());

-#ifdef DLIB_USE_CUDA
-        cuda::log(dest,src);
-#else
-        dest = log(mat(src));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::log(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            dest = log(mat(src));
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -192,11 +208,13 @@ namespace dlib { namespace tt
     {
         DLIB_CASSERT(dest.size() == src.size());

-#ifdef DLIB_USE_CUDA
-        cuda::log10(dest,src);
-#else
-        dest = log10(mat(src));
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::log10(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            dest = log10(mat(src));
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -212,97 +230,99 @@ namespace dlib { namespace tt
         operation_mode mode
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::gemm(beta, dest, alpha, lhs, trans_lhs, rhs, trans_rhs, mode);
-#else
-        if (mode == operation_mode::CHANNEL_WISE)
-        {
-            if (beta != 0)
+        IF_DLIB_USE_CUDA(
+            cuda::gemm(beta, dest, alpha, lhs, trans_lhs, rhs, trans_rhs, mode);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            if (mode == operation_mode::CHANNEL_WISE)
             {
-                if (trans_lhs && trans_rhs)
-                    dest = alpha * trans(mat(lhs)) * trans(mat(rhs)) + beta * mat(dest);
-                else if (!trans_lhs && trans_rhs)
-                    dest = alpha * mat(lhs) * trans(mat(rhs)) + beta * mat(dest);
-                else if (trans_lhs && !trans_rhs)
-                    dest = alpha * trans(mat(lhs)) * mat(rhs) + beta * mat(dest);
+                if (beta != 0)
+                {
+                    if (trans_lhs && trans_rhs)
+                        dest = alpha * trans(mat(lhs)) * trans(mat(rhs)) + beta * mat(dest);
+                    else if (!trans_lhs && trans_rhs)
+                        dest = alpha * mat(lhs) * trans(mat(rhs)) + beta * mat(dest);
+                    else if (trans_lhs && !trans_rhs)
+                        dest = alpha * trans(mat(lhs)) * mat(rhs) + beta * mat(dest);
+                    else
+                        dest = alpha * mat(lhs) * mat(rhs) + beta * mat(dest);
+                }
                 else
-                    dest = alpha * mat(lhs) * mat(rhs) + beta * mat(dest);
+                {
+                    if (trans_lhs && trans_rhs)
+                        dest = alpha * trans(mat(lhs)) * trans(mat(rhs));
+                    else if (!trans_lhs && trans_rhs)
+                        dest = alpha * mat(lhs) * trans(mat(rhs));
+                    else if (trans_lhs && !trans_rhs)
+                        dest = alpha * trans(mat(lhs)) * mat(rhs);
+                    else
+                        dest = alpha * mat(lhs) * mat(rhs);
+                }
             }
-            else
+            else if (mode == operation_mode::PLANE_WISE)
             {
-                if (trans_lhs && trans_rhs)
-                    dest = alpha * trans(mat(lhs)) * trans(mat(rhs));
-                else if (!trans_lhs && trans_rhs)
-                    dest = alpha * mat(lhs) * trans(mat(rhs));
-                else if (trans_lhs && !trans_rhs)
-                    dest = alpha * trans(mat(lhs)) * mat(rhs);
-                else
-                    dest = alpha * mat(lhs) * mat(rhs);
-            }
-        }
-        else if (mode == operation_mode::PLANE_WISE)
-        {
-            auto is_matrix = [](const auto& tensor) {
-                return ((tensor.num_samples() * tensor.k() == 1 && tensor.nr() * tensor.nc() > 1) ||
-                    (tensor.num_samples() * tensor.k() > 1 && tensor.nr() * tensor.nc() == 1));
-            };
+                auto is_matrix = [](const auto& tensor) {
+                    return ((tensor.num_samples() * tensor.k() == 1 && tensor.nr() * tensor.nc() > 1) ||
+                        (tensor.num_samples() * tensor.k() > 1 && tensor.nr() * tensor.nc() == 1));
+                };

-            long num_samples = std::min({ lhs.num_samples(), rhs.num_samples(), dest.num_samples() });
-            long num_channels = std::min({ lhs.k(), rhs.k(), dest.k() });
-            const bool lhs_is_matrix = is_matrix(lhs), rhs_is_matrix = is_matrix(rhs), dest_is_matrix = is_matrix(dest);
+                long num_samples = std::min({ lhs.num_samples(), rhs.num_samples(), dest.num_samples() });
+                long num_channels = std::min({ lhs.k(), rhs.k(), dest.k() });
+                const bool lhs_is_matrix = is_matrix(lhs), rhs_is_matrix = is_matrix(rhs), dest_is_matrix = is_matrix(dest);

-            if (lhs_is_matrix && rhs_is_matrix && dest_is_matrix) {
-                num_samples = num_channels = 1;
-            }
+                if (lhs_is_matrix && rhs_is_matrix && dest_is_matrix) {
+                    num_samples = num_channels = 1;
+                }

-            long lhs_rows = (lhs_is_matrix && lhs.num_samples() > 1) ? lhs.num_samples() : lhs.nr();
-            long lhs_cols = (lhs_is_matrix && lhs.k() > 1) ? lhs.k() : lhs.nc();
-            long rhs_rows = (rhs_is_matrix && rhs.num_samples() > 1) ? rhs.num_samples() : rhs.nr();
-            long rhs_cols = (rhs_is_matrix && rhs.k() > 1) ? rhs.k() : rhs.nc();
-            long dest_rows = (dest_is_matrix && dest.num_samples() > 1) ? dest.num_samples() : dest.nr();
-            long dest_cols = (dest_is_matrix && dest.k() > 1) ? dest.k() : dest.nc();
+                long lhs_rows = (lhs_is_matrix && lhs.num_samples() > 1) ? lhs.num_samples() : lhs.nr();
+                long lhs_cols = (lhs_is_matrix && lhs.k() > 1) ? lhs.k() : lhs.nc();
+                long rhs_rows = (rhs_is_matrix && rhs.num_samples() > 1) ? rhs.num_samples() : rhs.nr();
+                long rhs_cols = (rhs_is_matrix && rhs.k() > 1) ? rhs.k() : rhs.nc();
+                long dest_rows = (dest_is_matrix && dest.num_samples() > 1) ? dest.num_samples() : dest.nr();
+                long dest_cols = (dest_is_matrix && dest.k() > 1) ? dest.k() : dest.nc();

-            const size_t lhs_plane_size = lhs_rows * lhs_cols;
-            const size_t rhs_plane_size = rhs_rows * rhs_cols;
-            const size_t dest_plane_size = dest_rows * dest_cols;
+                const size_t lhs_plane_size = lhs_rows * lhs_cols;
+                const size_t rhs_plane_size = rhs_rows * rhs_cols;
+                const size_t dest_plane_size = dest_rows * dest_cols;

-            for (long b = 0; b < num_samples; ++b)
-            {
-                for (long c = 0; c < num_channels; ++c)
+                for (long b = 0; b < num_samples; ++b)
                 {
-                    auto lhs_slice = lhs_is_matrix ? alias_tensor(lhs_rows, lhs_cols)(lhs, 0) :
-                        alias_tensor(lhs_rows, lhs_cols)(lhs, (b * num_channels + c) * lhs_plane_size);
-                    auto rhs_slice = rhs_is_matrix ? alias_tensor(rhs_rows, rhs_cols)(rhs, 0) :
-                        alias_tensor(rhs_rows, rhs_cols)(rhs, (b * num_channels + c) * rhs_plane_size);
-                    auto dest_slice = dest_is_matrix ? alias_tensor(dest_rows, dest_cols)(dest, 0) :
-                        alias_tensor(dest_rows, dest_cols)(dest, (b * num_channels + c) * dest_plane_size);
-
-                    if (beta != 0)
+                    for (long c = 0; c < num_channels; ++c)
                     {
-                        if (trans_lhs && trans_rhs)
-                            dest_slice = alpha * trans(mat(lhs_slice)) * trans(mat(rhs_slice)) + beta * mat(dest_slice);
-                        else if (!trans_lhs && trans_rhs)
-                            dest_slice = alpha * mat(lhs_slice) * trans(mat(rhs_slice)) + beta * mat(dest_slice);
-                        else if (trans_lhs && !trans_rhs)
-                            dest_slice = alpha * trans(mat(lhs_slice)) * mat(rhs_slice) + beta * mat(dest_slice);
+                        auto lhs_slice = lhs_is_matrix ? alias_tensor(lhs_rows, lhs_cols)(lhs, 0) :
+                            alias_tensor(lhs_rows, lhs_cols)(lhs, (b * num_channels + c) * lhs_plane_size);
+                        auto rhs_slice = rhs_is_matrix ? alias_tensor(rhs_rows, rhs_cols)(rhs, 0) :
+                            alias_tensor(rhs_rows, rhs_cols)(rhs, (b * num_channels + c) * rhs_plane_size);
+                        auto dest_slice = dest_is_matrix ? alias_tensor(dest_rows, dest_cols)(dest, 0) :
+                            alias_tensor(dest_rows, dest_cols)(dest, (b * num_channels + c) * dest_plane_size);
+
+                        if (beta != 0)
+                        {
+                            if (trans_lhs && trans_rhs)
+                                dest_slice = alpha * trans(mat(lhs_slice)) * trans(mat(rhs_slice)) + beta * mat(dest_slice);
+                            else if (!trans_lhs && trans_rhs)
+                                dest_slice = alpha * mat(lhs_slice) * trans(mat(rhs_slice)) + beta * mat(dest_slice);
+                            else if (trans_lhs && !trans_rhs)
+                                dest_slice = alpha * trans(mat(lhs_slice)) * mat(rhs_slice) + beta * mat(dest_slice);
+                            else
+                                dest_slice = alpha * mat(lhs_slice) * mat(rhs_slice) + beta * mat(dest_slice);
+                        }
                         else
-                            dest_slice = alpha * mat(lhs_slice) * mat(rhs_slice) + beta * mat(dest_slice);
-                    }
-                    else
-                    {
-                        if (trans_lhs && trans_rhs)
-                            dest_slice = alpha * trans(mat(lhs_slice)) * trans(mat(rhs_slice));
-                        else if (!trans_lhs && trans_rhs)
-                            dest_slice = alpha * mat(lhs_slice) * trans(mat(rhs_slice));
-                        else if (trans_lhs && !trans_rhs)
-                            dest_slice = alpha * trans(mat(lhs_slice)) * mat(rhs_slice);
-                        else
-                            dest_slice = alpha * mat(lhs_slice) * mat(rhs_slice);
+                        {
+                            if (trans_lhs && trans_rhs)
+                                dest_slice = alpha * trans(mat(lhs_slice)) * trans(mat(rhs_slice));
+                            else if (!trans_lhs && trans_rhs)
+                                dest_slice = alpha * mat(lhs_slice) * trans(mat(rhs_slice));
+                            else if (trans_lhs && !trans_rhs)
+                                dest_slice = alpha * trans(mat(lhs_slice)) * mat(rhs_slice);
+                            else
+                                dest_slice = alpha * mat(lhs_slice) * mat(rhs_slice);
+                        }
                     }
                 }
             }
-        }
-#endif
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -313,10 +333,9 @@ namespace dlib { namespace tt
         unsigned long long seed
     )
 #ifdef DLIB_USE_CUDA
-    :rnd(seed){}
-#else
-    {rnd.set_seed(cast_to_string(seed)); }
+    :cuda_impl(seed)
 #endif
+    {cpu_impl.set_seed(cast_to_string(seed)); }

     void tensor_rand::
     fill_gaussian (
@@ -326,12 +345,15 @@ namespace dlib { namespace tt
         float stddev
     )
     {
         DLIB_CASSERT(data.size()%2 == 0);
-#ifdef DLIB_USE_CUDA
-        rnd.fill_gaussian(data, mean, stddev);
-#else
-        for (auto& x : data)
-            x = rnd.get_random_gaussian()*stddev + mean;
-#endif
+
+        IF_DLIB_USE_CUDA(
+            cuda_impl.fill_gaussian(data, mean, stddev);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            for (auto& x : data)
+                x = cpu_impl.get_random_gaussian()*stddev + mean;
+        )
     }

     void tensor_rand::
@@ -339,12 +361,14 @@ namespace dlib { namespace tt
         tensor& data
     )
     {
-#ifdef DLIB_USE_CUDA
-        rnd.fill_uniform(data);
-#else
-        for (auto& x : data)
-            x = rnd.get_random_float();
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda_impl.fill_uniform(data);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            for (auto& x : data)
+                x = cpu_impl.get_random_float();
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -364,11 +388,14 @@ namespace dlib { namespace tt
         DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) &&
                     (src1.num_samples()==1 || src1.num_samples()==MD) &&
                     (src2.num_samples()==1 || src2.num_samples()==MD) );
-#ifdef DLIB_USE_CUDA
-        cuda::multiply(add_to, dest, src1, src2);
-#else
-        cpu::multiply(add_to, dest, src1, src2);
-#endif
+
+        IF_DLIB_USE_CUDA(
+            cuda::multiply(add_to, dest, src1, src2);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::multiply(add_to, dest, src1, src2);
+        )
     }

     void scale_channels (
@@ -379,11 +406,13 @@ namespace dlib { namespace tt
         const tensor& scales
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::scale_channels(add_to, dest, src, scales);
-#else
-        cpu::scale_channels(add_to, dest, src, scales);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::scale_channels(add_to, dest, src, scales);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::scale_channels(add_to, dest, src, scales);
+        )
     }

     void multiply_conv (
@@ -393,11 +422,13 @@ namespace dlib { namespace tt
         const tensor& src2
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::multiply_conv(add_to, dest, src1, src2);
-#else
-        cpu::multiply_conv(add_to, dest, src1, src2);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::multiply_conv(add_to, dest, src1, src2);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::multiply_conv(add_to, dest, src1, src2);
+        )
     }

     void multiply_zero_padded (
@@ -407,11 +438,13 @@ namespace dlib { namespace tt
         const tensor& src2
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::multiply_zero_padded(add_to, dest, src1, src2);
-#else
-        cpu::multiply_zero_padded(add_to, dest, src1, src2);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::multiply_zero_padded(add_to, dest, src1, src2);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::multiply_zero_padded(add_to, dest, src1, src2);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -423,11 +456,13 @@ namespace dlib { namespace tt
         const float B
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform(dest,src,A,B);
-#else
-        cpu::affine_transform(dest,src,A,B);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform(dest,src,A,B);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform(dest,src,A,B);
+        )
     }

     void affine_transform(
@@ -436,11 +471,13 @@ namespace dlib { namespace tt
         const float A
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform(dest,src,A);
-#else
-        cpu::affine_transform(dest,src,A,0);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform(dest,src,A);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform(dest,src,A,0);
+        )
     }

     void affine_transform(
@@ -452,11 +489,13 @@ namespace dlib { namespace tt
         const float C
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform(dest,src1,src2,A,B,C);
-#else
-        cpu::affine_transform(dest,src1,src2,A,B,C);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform(dest,src1,src2,A,B,C);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform(dest,src1,src2,A,B,C);
+        )
     }

     void affine_transform(
@@ -467,11 +506,13 @@ namespace dlib { namespace tt
         const float B
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform(dest,src1,src2,A,B);
-#else
-        cpu::affine_transform(dest,src1,src2,A,B,0);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform(dest,src1,src2,A,B);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform(dest,src1,src2,A,B,0);
+        )
     }

     void affine_transform(
@@ -485,11 +526,13 @@ namespace dlib { namespace tt
         const float D
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform(dest,src1,src2,src3,A,B,C,D);
-#else
-        cpu::affine_transform(dest,src1,src2,src3,A,B,C,D);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform(dest,src1,src2,src3,A,B,C,D);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform(dest,src1,src2,src3,A,B,C,D);
+        )
     }

     void affine_transform_range(
@@ -504,11 +547,13 @@ namespace dlib { namespace tt
         const float C
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
-#else
-        cpu::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
+        )
     }

     void affine_transform(
@@ -522,11 +567,13 @@ namespace dlib { namespace tt
         float C
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform(rect, dest,src1,src2,src3,A,B,C);
-#else
-        cpu::affine_transform(rect, dest,src1,src2,src3,A,B,C);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform(rect, dest,src1,src2,src3,A,B,C);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform(rect, dest,src1,src2,src3,A,B,C);
+        )
     }

     void affine_transform(
@@ -539,11 +586,13 @@ namespace dlib { namespace tt
         const float C
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
-#else
-        cpu::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -555,11 +604,13 @@ namespace dlib { namespace tt
         const tensor& B
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform(dest,src,A,B);
-#else
-        cpu::affine_transform(dest,src,A,B);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform(dest,src,A,B);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform(dest,src,A,B);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -571,11 +622,13 @@ namespace dlib { namespace tt
         const tensor& B
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::affine_transform_conv(dest,src,A,B);
-#else
-        cpu::affine_transform_conv(dest,src,A,B);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::affine_transform_conv(dest,src,A,B);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::affine_transform_conv(dest,src,A,B);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -595,13 +648,15 @@ namespace dlib { namespace tt
         const tensor& params_grad
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
-            momentum2, params, params_grad);
-#else
-        cpu::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
-            momentum2, params, params_grad);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
+                momentum2, params, params_grad);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
+                momentum2, params, params_grad);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -616,11 +671,13 @@ namespace dlib { namespace tt
         const tensor& running_variances
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
-#else
-        cpu::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
+        )
     }

     void batch_normalize (
@@ -636,11 +693,13 @@ namespace dlib { namespace tt
         const tensor& beta
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
-#else
-        cpu::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+        )
     }

     void batch_normalize_gradient (
@@ -656,11 +715,13 @@ namespace dlib { namespace tt
     )
     {

-#ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
-#else
-        cpu::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -675,11 +736,13 @@ namespace dlib { namespace tt
         const tensor& running_variances
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
-#else
-        cpu::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
+        )
     }

     void batch_normalize_conv (
@@ -695,11 +758,13 @@ namespace dlib { namespace tt
         const tensor& beta
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
-#else
-        cpu::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+        )
     }

     void batch_normalize_conv_gradient (
@@ -715,11 +780,13 @@ namespace dlib { namespace tt
     )
     {

-#ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
-#else
-        cpu::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -734,11 +801,13 @@ namespace dlib { namespace tt
         const tensor& beta
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::layer_normalize(eps, dest, means, vars, src, gamma, beta);
-#else
-        cpu::layer_normalize(eps, dest, means, vars, src, gamma, beta);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::layer_normalize(eps, dest, means, vars, src, gamma, beta);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::layer_normalize(eps, dest, means, vars, src, gamma, beta);
+        )
     }

     void layer_normalize_gradient (
@@ -755,11 +824,13 @@ namespace dlib { namespace tt
         resizable_tensor& dvars
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::layer_normalize_gradient(eps, gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad, dmeans, dvars);
-#else
-        cpu::layer_normalize_gradient(eps, gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad, dmeans, dvars);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::layer_normalize_gradient(eps, gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad, dmeans, dvars);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::layer_normalize_gradient(eps, gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad, dmeans, dvars);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -772,11 +843,13 @@ namespace dlib { namespace tt
         const tensor& gamma
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::rms_normalize(eps, dest, scale, src, gamma);
-#else
-        cpu::rms_normalize(eps, dest, scale, src, gamma);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::rms_normalize(eps, dest, scale, src, gamma);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::rms_normalize(eps, dest, scale, src, gamma);
+        )
     }

     void rms_normalize_gradient(
@@ -789,11 +862,13 @@ namespace dlib { namespace tt
         resizable_tensor& dscale
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::rms_normalize_gradient(gradient_input, scale, src, gamma, src_grad, gamma_grad, dscale);
-#else
-        cpu::rms_normalize_gradient(gradient_input, scale, src, gamma, src_grad, gamma_grad, dscale);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::rms_normalize_gradient(gradient_input, scale, src, gamma, src_grad, gamma_grad, dscale);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::rms_normalize_gradient(gradient_input, scale, src, gamma, src_grad, gamma_grad, dscale);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -803,11 +878,13 @@ namespace dlib { namespace tt
         float thresh
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::threshold(data,thresh);
-#else
-        cpu::threshold(data,thresh);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::threshold(data,thresh);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::threshold(data,thresh);
+        )
     }

     void dot (
@@ -817,11 +894,13 @@ namespace dlib { namespace tt
         size_t idx
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::dot(a,b,result,idx);
-#else
-        cpu::dot(a,b,result,idx);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::dot(a,b,result,idx);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::dot(a,b,result,idx);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -833,11 +912,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::add(beta,dest,alpha,src);
-#else
-        cpu::add(beta,dest,alpha,src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::add(beta,dest,alpha,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::add(beta,dest,alpha,src);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -848,11 +929,13 @@ namespace dlib { namespace tt
         const tensor& src2
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::add(dest, src1, src2);
-#else
-        cpu::add(dest, src1, src2);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::add(dest, src1, src2);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::add(dest, src1, src2);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -862,11 +945,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::assign_conv_bias_gradient(grad,gradient_input);
-#else
-        cpu::assign_conv_bias_gradient(grad,gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::assign_conv_bias_gradient(grad,gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::assign_conv_bias_gradient(grad,gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -876,11 +961,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::assign_bias_gradient(grad,gradient_input);
-#else
-        cpu::assign_bias_gradient(grad,gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::assign_bias_gradient(grad,gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::assign_bias_gradient(grad,gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -891,11 +978,13 @@ namespace dlib { namespace tt
         operation_mode mode
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::softmax(dest, src, mode);
-#else
-        cpu::softmax(dest, src, mode);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::softmax(dest, src, mode);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::softmax(dest, src, mode);
+        )
     }

     void softmax_gradient(
@@ -905,11 +994,13 @@ namespace dlib { namespace tt
         operation_mode mode
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::softmax_gradient(grad, dest, gradient_input, mode);
-#else
-        cpu::softmax_gradient(grad, dest, gradient_input, mode);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::softmax_gradient(grad, dest, gradient_input, mode);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::softmax_gradient(grad, dest, gradient_input, mode);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -919,11 +1010,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::softmax_all(dest,src);
-#else
-        cpu::softmax_all(dest,src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::softmax_all(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::softmax_all(dest,src);
+        )
     }

     void softmax_all_gradient (
@@ -932,11 +1025,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::softmax_all_gradient(grad, dest, gradient_input);
-#else
-        cpu::softmax_all_gradient(grad, dest, gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::softmax_all_gradient(grad, dest, gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::softmax_all_gradient(grad, dest, gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -946,11 +1041,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::sigmoid(dest,src);
-#else
-        cpu::sigmoid(dest,src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::sigmoid(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::sigmoid(dest,src);
+        )
     }

     void sigmoid_gradient (
@@ -959,11 +1056,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::sigmoid_gradient(grad, dest, gradient_input);
-#else
-        cpu::sigmoid_gradient(grad, dest, gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::sigmoid_gradient(grad, dest, gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::sigmoid_gradient(grad, dest, gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -973,11 +1072,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::mish(dest,src);
-#else
-        cpu::mish(dest,src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::mish(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::mish(dest,src);
+        )
     }

     void mish_gradient (
@@ -986,11 +1087,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::mish_gradient(grad, src, gradient_input);
-#else
-        cpu::mish_gradient(grad, src, gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::mish_gradient(grad, src, gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::mish_gradient(grad, src, gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1000,11 +1103,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::relu(dest,src);
-#else
-        cpu::relu(dest,src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::relu(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::relu(dest,src);
+        )
     }

     void relu_gradient (
@@ -1013,11 +1118,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::relu_gradient(grad, dest, gradient_input);
-#else
-        cpu::relu_gradient(grad, dest, gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::relu_gradient(grad, dest, gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::relu_gradient(grad, dest, gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1028,11 +1135,13 @@ namespace dlib { namespace tt
         const tensor& param
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::prelu(dest, src, param);
-#else
-        cpu::prelu(dest, src, param);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::prelu(dest, src, param);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::prelu(dest, src, param);
+        )
     }

     void prelu_gradient (
@@ -1043,11 +1152,13 @@ namespace dlib { namespace tt
         tensor& params_grad
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::prelu_gradient(grad, src, gradient_input, param, params_grad);
-#else
-        cpu::prelu_gradient(grad, src, gradient_input, param, params_grad);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::prelu_gradient(grad, src, gradient_input, param, params_grad);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::prelu_gradient(grad, src, gradient_input, param, params_grad);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1058,11 +1169,13 @@ namespace dlib { namespace tt
         const float alpha
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::leaky_relu(dest, src, alpha);
-#else
-        cpu::leaky_relu(dest, src, alpha);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::leaky_relu(dest, src, alpha);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::leaky_relu(dest, src, alpha);
+        )
     }

     void leaky_relu_gradient (
@@ -1072,11 +1185,13 @@ namespace dlib { namespace tt
         const float alpha
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::leaky_relu_gradient(grad, dest, gradient_input, alpha);
-#else
-        cpu::leaky_relu_gradient(grad, dest, gradient_input, alpha);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::leaky_relu_gradient(grad, dest, gradient_input, alpha);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::leaky_relu_gradient(grad, dest, gradient_input, alpha);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1086,11 +1201,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::tanh(dest,src);
-#else
-        cpu::tanh(dest,src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::tanh(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::tanh(dest,src);
+        )
     }

     void tanh_gradient (
@@ -1099,11 +1216,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::tanh_gradient(grad, dest, gradient_input);
-#else
-        cpu::tanh_gradient(grad, dest, gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::tanh_gradient(grad, dest, gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::tanh_gradient(grad, dest, gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1114,11 +1233,13 @@ namespace dlib { namespace tt
         const float ceiling
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::clipped_relu(dest, src, ceiling);
-#else
-        cpu::clipped_relu(dest, src, ceiling);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::clipped_relu(dest, src, ceiling);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::clipped_relu(dest, src, ceiling);
+        )
     }

     void clipped_relu_gradient (
@@ -1128,11 +1249,13 @@ namespace dlib { namespace tt
         const float ceiling
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::clipped_relu_gradient(grad, dest, gradient_input, ceiling);
-#else
-        cpu::clipped_relu_gradient(grad, dest, gradient_input, ceiling);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::clipped_relu_gradient(grad, dest, gradient_input, ceiling);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::clipped_relu_gradient(grad, dest, gradient_input, ceiling);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1143,11 +1266,13 @@ namespace dlib { namespace tt
         const float alpha
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::elu(dest, src, alpha);
-#else
-        cpu::elu(dest, src, alpha);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::elu(dest, src, alpha);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::elu(dest, src, alpha);
+        )
     }

     void elu_gradient (
@@ -1157,11 +1282,13 @@ namespace dlib { namespace tt
         const float alpha
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::elu_gradient(grad, dest, gradient_input, alpha);
-#else
-        cpu::elu_gradient(grad, dest, gradient_input, alpha);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::elu_gradient(grad, dest, gradient_input, alpha);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::elu_gradient(grad, dest, gradient_input, alpha);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1171,11 +1298,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::gelu(dest,src);
-#else
-        cpu::gelu(dest,src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::gelu(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::gelu(dest,src);
+        )
     }

     void gelu_gradient (
@@ -1184,11 +1313,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::gelu_gradient(grad, src, gradient_input);
-#else
-        cpu::gelu_gradient(grad, src, gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::gelu_gradient(grad, src, gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::gelu_gradient(grad, src, gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1200,11 +1331,14 @@ namespace dlib { namespace tt
     )
     {
         DLIB_CASSERT(beta > 0);
-#ifdef DLIB_USE_CUDA
-        cuda::smelu(dest, src, beta);
-#else
-        cpu::smelu(dest, src, beta);
-#endif
+
+        IF_DLIB_USE_CUDA(
+            cuda::smelu(dest, src, beta);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::smelu(dest, src, beta);
+        )
     }

     void smelu_gradient (
@@ -1215,11 +1349,14 @@ namespace dlib { namespace tt
     )
     {
         DLIB_CASSERT(beta > 0);
-#ifdef DLIB_USE_CUDA
-        cuda::smelu_gradient(grad, dest, gradient_input, beta);
-#else
-        cpu::smelu_gradient(grad, dest, gradient_input, beta);
-#endif
+
+        IF_DLIB_USE_CUDA(
+            cuda::smelu_gradient(grad, dest, gradient_input, beta);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::smelu_gradient(grad, dest, gradient_input, beta);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1229,11 +1366,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::silu(dest,src);
-#else
-        cpu::silu(dest,src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::silu(dest,src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::silu(dest,src);
+        )
     }

     void silu_gradient (
@@ -1242,11 +1381,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::silu_gradient(grad, src, gradient_input);
-#else
-        cpu::silu_gradient(grad, src, gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::silu_gradient(grad, src, gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::silu_gradient(grad, src, gradient_input);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1260,11 +1401,13 @@ namespace dlib { namespace tt
         long src_channel_stride
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::resize_bilinear(dest,dest_row_stride,dest_channel_stride, src,src_row_stride,src_channel_stride);
-#else
-        cpu::resize_bilinear(dest,dest_row_stride,dest_channel_stride, src,src_row_stride,src_channel_stride);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::resize_bilinear(dest,dest_row_stride,dest_channel_stride, src,src_row_stride,src_channel_stride);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::resize_bilinear(dest,dest_row_stride,dest_channel_stride, src,src_row_stride,src_channel_stride);
+        )
     }

     void resize_bilinear_gradient (
@@ -1276,11 +1419,13 @@ namespace dlib { namespace tt
         long gradient_input_channel_stride
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::resize_bilinear_gradient(grad,grad_row_stride,grad_channel_stride, gradient_input,gradient_input_row_stride,gradient_input_channel_stride);
-#else
-        cpu::resize_bilinear_gradient(grad,grad_row_stride,grad_channel_stride, gradient_input,gradient_input_row_stride,gradient_input_channel_stride);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::resize_bilinear_gradient(grad,grad_row_stride,grad_channel_stride, gradient_input,gradient_input_row_stride,gradient_input_channel_stride);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::resize_bilinear_gradient(grad,grad_row_stride,grad_channel_stride, gradient_input,gradient_input_row_stride,gradient_input_channel_stride);
+        )
     }

// ------------------------------------------------------------------------------------
@@ -1293,11 +1438,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::reorg(add_to, dest, row_stride, col_stride, src);
-#else
-        cpu::reorg(add_to, dest, row_stride, col_stride, src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::reorg(add_to, dest, row_stride, col_stride, src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::reorg(add_to, dest, row_stride, col_stride, src);
+        )
     }

     void reorg_gradient (
@@ -1308,11 +1455,13 @@ namespace dlib { namespace tt
         const tensor& gradient_input
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::reorg_gradient(add_to, grad, row_stride, col_stride, gradient_input);
-#else
-        cpu::reorg_gradient(add_to, grad, row_stride, col_stride, gradient_input);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::reorg_gradient(add_to, grad, row_stride, col_stride, gradient_input);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::reorg_gradient(add_to, grad, row_stride, col_stride, gradient_input);
+        )
     }

// ------------------------------------------------------------------------------------
@@ -1326,11 +1475,13 @@ namespace dlib { namespace tt
         size_t count_k
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
-#else
-        cpu::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1344,11 +1495,13 @@ namespace dlib { namespace tt
         size_t k, size_t nr, size_t nc
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::copy_tensor(add_to, dest, dk, dnr, dnc , src, sk, snr, snc, k, nr, nc);
-#else
-        cpu::copy_tensor(add_to, dest, dk, dnr, dnc, src, sk, snr, snc, k, nr, nc);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::copy_tensor(add_to, dest, dk, dnr, dnc , src, sk, snr, snc, k, nr, nc);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::copy_tensor(add_to, dest, dk, dnr, dnc, src, sk, snr, snc, k, nr, nc);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1359,11 +1512,13 @@ namespace dlib { namespace tt
         resizable_tensor& out
     )
     {
-#ifdef DLIB_USE_CUDA
-        finv(m,out);
-#else
-        out = dlib::inv(mat(m));
-#endif
+        IF_DLIB_USE_CUDA(
+            finv(m,out);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            out = dlib::inv(mat(m));
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1374,11 +1529,13 @@ namespace dlib { namespace tt
         const tensor& src
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::transpose(add_to, dest, src);
-#else
-        cpu::transpose(add_to, dest, src);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::transpose(add_to, dest, src);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::transpose(add_to, dest, src);
+        )
     }

// ----------------------------------------------------------------------------------------
@@ -1389,11 +1546,13 @@ namespace dlib { namespace tt
         const tensor& embs
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::embeddings(dest, src, embs);
-#else
-        cpu::embeddings(dest, src, embs);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::embeddings(dest, src, embs);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::embeddings(dest, src, embs);
+        )
     }

     void embeddings_gradient(
@@ -1405,11 +1564,13 @@ namespace dlib { namespace tt
         bool scale
     )
     {
-#ifdef DLIB_USE_CUDA
-        cuda::embeddings_gradient(prev, gradient_input, grads, freqs, learning_rate, scale);
-#else
-        cpu::embeddings_gradient(prev, gradient_input, grads, freqs, learning_rate, scale);
-#endif
+        IF_DLIB_USE_CUDA(
+            cuda::embeddings_gradient(prev, gradient_input, grads, freqs, learning_rate, scale);
+        )
+
+        IF_DLIB_NOT_USE_CUDA(
+            cpu::embeddings_gradient(prev, gradient_input, grads, freqs, learning_rate, scale);
+        )
     }

// ----------------------------------------------------------------------------------------
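A note on the header changes that follow: objects with per-instance state (tensor_rand, and tensor_conv/pooling below) cannot be switched by the macros alone, so the single #ifdef-selected impl/rnd member is replaced by an always-present CPU member plus a CUDA member that only exists in CUDA builds. Schematically (not the literal dlib code):

    #ifdef DLIB_USE_CUDA
        cuda::tensor_conv cuda_impl;   // compiled only in CUDA builds
    #endif
        cpu::tensor_conv  cpu_impl;    // always compiled, so the runtime flag can fall back to it

The cost is that CUDA builds now carry both objects in every instance; the CPU-side state of these classes is small, which presumably makes the trade-off acceptable.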
diff --git a/dlib/cuda/tensor_tools.h b/dlib/cuda/tensor_tools.h
index 18a5564f98..e551555260 100644
--- a/dlib/cuda/tensor_tools.h
+++ b/dlib/cuda/tensor_tools.h
@@ -292,10 +292,9 @@ namespace dlib { namespace tt
     !*/

 #ifdef DLIB_USE_CUDA
-        cuda::curand_generator rnd;
-#else
-        dlib::rand rnd;
+        cuda::curand_generator cuda_impl;
 #endif
+        dlib::rand cpu_impl;
     };

// ----------------------------------------------------------------------------------------
@@ -1074,14 +1073,32 @@ namespace dlib { namespace tt
         tensor_conv() {}

         void clear(
-        ) { impl.clear(); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl.clear();
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl.clear();
+            )
+        }

         void operator() (
             const bool add_to_output,
             tensor& output,
             const tensor& data,
             const tensor& filters
-        ) { impl(add_to_output,output,data,filters); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl(add_to_output,output,data,filters);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl(add_to_output,output,data,filters);
+            )
+        }
         /*!
             requires
                 - setup() has been called.  Specifically, setup() has been called like this:
@@ -1107,7 +1124,16 @@ namespace dlib { namespace tt
             resizable_tensor& output,
             const tensor& data,
             const tensor& filters
-        ) { impl(add_to_output,output,data,filters); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl(add_to_output,output,data,filters);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl(add_to_output,output,data,filters);
+            )
+        }
         /*!
             requires
                 - setup() has been called.  Specifically, setup() has been called like this:
@@ -1135,7 +1161,16 @@ namespace dlib { namespace tt
             const tensor& filters,
             const tensor& biases,
             bool use_relu
-        ) { impl(add_to_output,output,data,filters,biases,use_relu); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl(add_to_output,output,data,filters,biases,use_relu);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl(add_to_output,output,data,filters,biases,use_relu);
+            )
+        }
         /*!
             requires
                 - setup() has been called.  Specifically, setup() has been called like this:
@@ -1167,7 +1202,16 @@ namespace dlib { namespace tt
             const tensor& filters,
             const tensor& biases,
             bool use_relu
-        ) { impl(add_to_output,output,data,filters,biases,use_relu); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl(add_to_output,output,data,filters,biases,use_relu);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl(add_to_output,output,data,filters,biases,use_relu);
+            )
+        }
         /*!
             requires
                 - setup() has been called.  Specifically, setup() has been called like this:
@@ -1195,7 +1239,16 @@ namespace dlib { namespace tt
             const tensor& gradient_input,
             const tensor& filters,
             tensor& data_gradient
-        ) { impl.get_gradient_for_data(add_to_output,gradient_input,filters,data_gradient); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl.get_gradient_for_data(add_to_output,gradient_input,filters,data_gradient);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl.get_gradient_for_data(add_to_output,gradient_input,filters,data_gradient);
+            )
+        }
         /*!
             requires
                 - One of the following must be true:
@@ -1230,7 +1283,16 @@ namespace dlib { namespace tt
             const tensor& gradient_input,
             const tensor& data,
             tensor& filters_gradient
-        ) { impl.get_gradient_for_filters(add_to_output,gradient_input,data,filters_gradient); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl.get_gradient_for_filters(add_to_output,gradient_input,data,filters_gradient);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl.get_gradient_for_filters(add_to_output,gradient_input,data,filters_gradient);
+            )
+        }
         /*!
             requires
                 - One of the following must be true:
@@ -1268,7 +1330,16 @@ namespace dlib { namespace tt
             int stride_x,
             int padding_y,
             int padding_x
-        ) {impl.setup(data,filters,stride_y,stride_x,padding_y,padding_x); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl.setup(data,filters,stride_y,stride_x,padding_y,padding_x);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl.setup(data,filters,stride_y,stride_x,padding_y,padding_x);
+            )
+        }
         /*!
             requires
                 - filters.k() == data.k()
@@ -1292,11 +1363,9 @@ namespace dlib { namespace tt

     private:
 #ifdef DLIB_USE_CUDA
-        cuda::tensor_conv impl;
-#else
-        cpu::tensor_conv impl;
+        cuda::tensor_conv cuda_impl;
 #endif
-
+        cpu::tensor_conv cpu_impl;
     };

// ----------------------------------------------------------------------------------------
@@ -1317,7 +1386,16 @@ namespace dlib { namespace tt
         ) = default;

         void clear(
-        ) { impl.clear(); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl.clear();
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl.clear();
+            )
+        }

         void setup_max_pooling(
             int window_height,
@@ -1326,7 +1404,16 @@ namespace dlib { namespace tt
             int stride_x,
             int padding_y,
             int padding_x
-        ) { impl.setup_max_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl.setup_max_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl.setup_max_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x);
+            )
+        }
         /*!
             requires
                 - window_height > 0
@@ -1347,7 +1434,16 @@ namespace dlib { namespace tt
             int stride_x,
             int padding_y,
             int padding_x
-        ) { impl.setup_avg_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl.setup_avg_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl.setup_avg_pooling(window_height, window_width, stride_y, stride_x, padding_y, padding_x);
+            )
+        }
         /*!
             requires
                 - window_height > 0
@@ -1362,12 +1458,30 @@ namespace dlib { namespace tt
         !*/

         bool does_max_pooling(
-        ) const { return impl.does_max_pooling(); }
+        ) const
+        {
+            IF_DLIB_USE_CUDA(
+                return cuda_impl.does_max_pooling();
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                return cpu_impl.does_max_pooling();
+            )
+        }

         void operator() (
             resizable_tensor& dest,
             const tensor& src
-        ) { impl(dest, src); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl(dest, src);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl(dest, src);
+            )
+        }
         /*!
             requires
                 - is_same_object(dest,src) == false
@@ -1395,7 +1509,16 @@ namespace dlib { namespace tt
             const tensor& dest,
             const tensor& src,
             tensor& grad
-        ) { impl.get_gradient(gradient_input, dest, src, grad); }
+        )
+        {
+            IF_DLIB_USE_CUDA(
+                cuda_impl.get_gradient(gradient_input, dest, src, grad);
+            )
+
+            IF_DLIB_NOT_USE_CUDA(
+                cpu_impl.get_gradient(gradient_input, dest, src, grad);
+            )
+        }
         /*!
             requires
                 - have_same_dimensions(gradient_input,dest) == true
@@ -1413,10 +1536,9 @@ namespace dlib { namespace tt

     private:
 #ifdef DLIB_USE_CUDA
-        cuda::pooling impl;
-#else
-        cpu::pooling impl;
+        cuda::pooling cuda_impl;
 #endif
+        cpu::pooling cpu_impl;
     };

// ----------------------------------------------------------------------------------------
diff --git a/dlib/dnn/core_abstract.h b/dlib/dnn/core_abstract.h
index 491183a685..9d61c716b3 100644
--- a/dlib/dnn/core_abstract.h
+++ b/dlib/dnn/core_abstract.h
@@ -196,6 +196,28 @@ namespace dlib
             - #dnn_prefer_fastest_algorithms() == false
     !*/

+    bool use_cuda(
+    );
+    /*!
+        ensures
+            - returns true if dlib should use the CUDA implementation of the deep neural
+              network tools, and false otherwise.
+            - On program startup this function returns true if DLIB_USE_CUDA is defined and
+              there is an available GPU device to use.
+            - This function always returns false if DLIB_USE_CUDA is not defined.
+            - This setting is a thread local variable.  That is, each thread has its own
+              value for use_cuda().  This means that one thread may use CUDA while another
+              thread does not, depending on each thread's setting of use_cuda().
+    !*/
+
+    void set_use_cuda(
+        bool flag
+    );
+    /*!
+        ensures
+            - #use_cuda() == flag for the calling thread.
+    !*/
+
// ----------------------------------------------------------------------------------------

     template <
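Because the flag is thread local, one process can mix backends; a sketch of the intended pattern (illustrative only, not part of the patch):

    #include <dlib/dnn.h>
    #include <thread>

    void mixed_backends()
    {
        std::thread cpu_worker([]{
            dlib::cuda::set_use_cuda(false);   // affects only this thread
            // ... run this thread's networks on the CPU code paths ...
        });

        std::thread gpu_worker([]{
            // untouched: use_cuda() here still starts out as cuda::is_available()
            // ... run this thread's networks on the CUDA code paths ...
        });

        cpu_worker.join();
        gpu_worker.join();
    }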
diff --git a/dlib/dnn/loss.h b/dlib/dnn/loss.h index 36b37a2956..6a8f257bec 100644 --- a/dlib/dnn/loss.h +++ b/dlib/dnn/loss.h @@ -2823,11 +2823,13 @@ namespace dlib } double loss; -#ifdef DLIB_USE_CUDA - cuda_compute(truth, output_tensor, grad, loss); -#else - cpu_compute(truth, output_tensor, grad, loss); -#endif + IF_DLIB_USE_CUDA( + cuda_compute(truth, output_tensor, grad, loss); + ) + + IF_DLIB_NOT_USE_CUDA( + cpu_compute(truth, output_tensor, grad, loss); + ) return loss; } @@ -2859,9 +2861,8 @@ namespace dlib #ifdef DLIB_USE_CUDA cuda::compute_loss_binary_log_per_pixel cuda_compute; -#else - cpu::compute_loss_binary_log_per_pixel cpu_compute; #endif + cpu::compute_loss_binary_log_per_pixel cpu_compute; }; template @@ -2968,11 +2969,13 @@ namespace dlib double loss; -#ifdef DLIB_USE_CUDA - cuda_compute(truth, output_tensor, grad, loss); -#else - cpu_compute(truth, output_tensor, grad, loss); -#endif + IF_DLIB_USE_CUDA( + cuda_compute(truth, output_tensor, grad, loss); + ) + + IF_DLIB_NOT_USE_CUDA( + cpu_compute(truth, output_tensor, grad, loss); + ) return loss; } @@ -3004,9 +3007,8 @@ namespace dlib #ifdef DLIB_USE_CUDA cuda::compute_loss_multiclass_log_per_pixel cuda_compute; -#else - cpu::compute_loss_multiclass_log_per_pixel cpu_compute; #endif + cpu::compute_loss_multiclass_log_per_pixel cpu_compute; }; template @@ -3068,11 +3070,13 @@ namespace dlib } double loss; -#ifdef DLIB_USE_CUDA - cuda_compute(truth, output_tensor, grad, loss); -#else - cpu_compute(truth, output_tensor, grad, loss); -#endif + IF_DLIB_USE_CUDA( + cuda_compute(truth, output_tensor, grad, loss); + ) + + IF_DLIB_NOT_USE_CUDA( + cpu_compute(truth, output_tensor, grad, loss); + ) return loss; } @@ -3104,9 +3108,8 @@ namespace dlib #ifdef DLIB_USE_CUDA cuda::compute_loss_multiclass_log_per_pixel_weighted cuda_compute; -#else - cpu::compute_loss_multiclass_log_per_pixel_weighted cpu_compute; #endif + cpu::compute_loss_multiclass_log_per_pixel_weighted cpu_compute; }; @@ -3319,11 +3322,13 @@ namespace dlib } } double loss; -#ifdef DLIB_USE_CUDA - cuda_compute(truth, output_tensor, grad, loss); -#else - cpu_compute(truth, output_tensor, grad, loss); -#endif + IF_DLIB_USE_CUDA( + cuda_compute(truth, output_tensor, grad, loss); + ) + + IF_DLIB_NOT_USE_CUDA( + cpu_compute(truth, output_tensor, grad, loss); + ) return loss; } @@ -3355,9 +3360,8 @@ namespace dlib #ifdef DLIB_USE_CUDA cuda::compute_loss_mean_squared_per_channel_and_pixel cuda_compute; -#else - cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute; #endif + cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute; }; template diff --git a/dlib/test/dnn.cpp b/dlib/test/dnn.cpp index c564e277e1..09169e6dd2 100644 --- a/dlib/test/dnn.cpp +++ b/dlib/test/dnn.cpp @@ -154,16 +154,16 @@ namespace dlog << LINFO << "src error: " << grad_error; DLIB_TEST(grad_error < 0.001); -#ifdef DLIB_USE_CUDA - resizable_tensor src1 = src; - resizable_tensor src2 = src; - resizable_tensor dest1, dest2; - dest1.copy_size(src); - dest2.copy_size(src); - cuda::softmax_all(dest1, src1); - cpu::softmax_all(dest2, src2); - DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2))) < 1e-5, max(abs(mat(dest1)-mat(dest2)))); -#endif + 
IF_DLIB_USE_CUDA( + resizable_tensor src1 = src; + resizable_tensor src2 = src; + resizable_tensor dest1, dest2; + dest1.copy_size(src); + dest2.copy_size(src); + cuda::softmax_all(dest1, src1); + cpu::softmax_all(dest2, src2); + DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2))) < 1e-5, max(abs(mat(dest1)-mat(dest2)))); + ) } void test_softmaxm() @@ -234,15 +234,16 @@ namespace cpu::softmax(output_tensor, input_tensor, operation_mode::PLANE_WISE); cpu::softmax_gradient(cpu_grad, output_tensor, gradient_input, operation_mode::PLANE_WISE); DLIB_TEST(max(abs(mat(output_tensor) - mat(expected_output))) < 1e-5); -#ifdef DLIB_USE_CUDA - resizable_tensor cuda_grad; - cuda_grad.copy_size(input_tensor); - cuda_grad = 0; - cuda::softmax(output_tensor, input_tensor, operation_mode::PLANE_WISE); - cpu::softmax_gradient(cuda_grad, output_tensor, gradient_input, operation_mode::PLANE_WISE); - DLIB_TEST(max(abs(mat(output_tensor) - mat(expected_output))) < 1e-5); - DLIB_TEST(max(abs(mat(cuda_grad) - mat(cpu_grad))) < 1e-5); -#endif + + IF_DLIB_USE_CUDA( + resizable_tensor cuda_grad; + cuda_grad.copy_size(input_tensor); + cuda_grad = 0; + cuda::softmax(output_tensor, input_tensor, operation_mode::PLANE_WISE); + cpu::softmax_gradient(cuda_grad, output_tensor, gradient_input, operation_mode::PLANE_WISE); + DLIB_TEST(max(abs(mat(output_tensor) - mat(expected_output))) < 1e-5); + DLIB_TEST(max(abs(mat(cuda_grad) - mat(cpu_grad))) < 1e-5); + ) } void test_softmax_all() @@ -284,222 +285,222 @@ namespace dlog << LINFO << "src error: " << grad_error; DLIB_TEST(grad_error < 0.001); -#ifdef DLIB_USE_CUDA - resizable_tensor src1 = src; - resizable_tensor src2 = src; - resizable_tensor dest1, dest2; - dest1.copy_size(src); - dest2.copy_size(src); - cuda::softmax_all(dest1, src1); - cpu::softmax_all(dest2, src2); - DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2))) < 1e-5, max(abs(mat(dest1)-mat(dest2)))); -#endif + IF_DLIB_USE_CUDA( + resizable_tensor src1 = src; + resizable_tensor src2 = src; + resizable_tensor dest1, dest2; + dest1.copy_size(src); + dest2.copy_size(src); + cuda::softmax_all(dest1, src1); + cpu::softmax_all(dest2, src2); + DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2))) < 1e-5, max(abs(mat(dest1)-mat(dest2)))); + ) } void test_mish() { -#ifdef DLIB_USE_CUDA - // make sure that cuda::mish and cpu::mish return the same results - using namespace dlib::tt; - print_spinner(); - const long n = 4; - const long k = 5; - const long nr = 3; - const long nc = 3; - resizable_tensor src(n,k,nr,nc); - tt::tensor_rand rnd; - rnd.fill_gaussian(src); + IF_DLIB_USE_CUDA( + // make sure that cuda::mish and cpu::mish return the same results + using namespace dlib::tt; + print_spinner(); + const long n = 4; + const long k = 5; + const long nr = 3; + const long nc = 3; + resizable_tensor src(n,k,nr,nc); + tt::tensor_rand rnd; + rnd.fill_gaussian(src); - resizable_tensor dest1, dest2; - dest1.copy_size(src); - dest2.copy_size(src); - // initialize to different values in order to make sure the output is actually changed - dest1 = 1; - dest2 = 2; - cuda::mish(dest1, src); - cpu::mish(dest2, src); - DLIB_TEST_MSG(max(abs(mat(dest1) - mat(dest2))) < 1e-6, max(abs(mat(dest1) - mat(dest2)))); -#endif // DLIB_USE_CUDA + resizable_tensor dest1, dest2; + dest1.copy_size(src); + dest2.copy_size(src); + // initialize to different values in order to make sure the output is actually changed + dest1 = 1; + dest2 = 2; + cuda::mish(dest1, src); + cpu::mish(dest2, src); + DLIB_TEST_MSG(max(abs(mat(dest1) - mat(dest2))) < 1e-6, max(abs(mat(dest1) - 
mat(dest2)))); + ) } void test_leaky_relu() { -#ifdef DLIB_USE_CUDA - using namespace dlib::tt; - print_spinner(); - const long n = 4; - const long k = 5; - const long nr = 3; - const long nc = 3; - const float alpha = 0.01; - resizable_tensor src(n, k, nr, nc); - tt::tensor_rand rnd; - rnd.fill_gaussian(src); - resizable_tensor dest_cuda, dest_cpu; - dest_cuda.copy_size(src); - dest_cpu.copy_size(src); - // initialize to different values in order to make sure the output is actually changed - dest_cuda = 1; - dest_cpu = 2; - cuda::leaky_relu(dest_cuda, src, alpha); - cpu::leaky_relu(dest_cpu, src, alpha); - - DLIB_TEST_MSG(max(abs(mat(dest_cuda) - mat(dest_cpu))) < 1e-7, max(abs(mat(dest_cuda) - mat(dest_cpu)))); -#endif // DLIB_USE_CUDA + IF_DLIB_USE_CUDA( + using namespace dlib::tt; + print_spinner(); + const long n = 4; + const long k = 5; + const long nr = 3; + const long nc = 3; + const float alpha = 0.01; + resizable_tensor src(n, k, nr, nc); + tt::tensor_rand rnd; + rnd.fill_gaussian(src); + resizable_tensor dest_cuda, dest_cpu; + dest_cuda.copy_size(src); + dest_cpu.copy_size(src); + // initialize to different values in order to make sure the output is actually changed + dest_cuda = 1; + dest_cpu = 2; + cuda::leaky_relu(dest_cuda, src, alpha); + cpu::leaky_relu(dest_cpu, src, alpha); + + DLIB_TEST_MSG(max(abs(mat(dest_cuda) - mat(dest_cpu))) < 1e-7, max(abs(mat(dest_cuda) - mat(dest_cpu)))); + ) } void test_clipped_relu() { -#ifdef DLIB_USE_CUDA - using namespace dlib::tt; - print_spinner(); - const long n = 4; - const long k = 5; - const long nr = 3; - const long nc = 3; - const float ceiling = 6.0f; - resizable_tensor src(n, k, nr, nc); - tt::tensor_rand rnd; - rnd.fill_gaussian(src, 0, 3); - resizable_tensor dest_cuda, dest_cpu; - dest_cuda.copy_size(src); - dest_cpu.copy_size(src); - // initialize to different values in order to make sure the output is actually changed - dest_cuda = 1; - dest_cpu = 2; - cuda::clipped_relu(dest_cuda, src, ceiling); - cpu::clipped_relu(dest_cpu, src, ceiling); - auto error = max(abs(mat(dest_cuda) - mat(dest_cpu))); - DLIB_TEST_MSG(error < 1e-7, "error: " << error); - - // test gradients - resizable_tensor grad_cuda, grad_cpu, grad_input; - grad_cuda.copy_size(src); - grad_cpu.copy_size(src); - grad_input.copy_size(src); - rnd.fill_uniform(grad_input); - grad_cuda = 0; - grad_cpu = 0; - cuda::clipped_relu_gradient(grad_cuda, dest_cuda, grad_input, ceiling); - cpu::clipped_relu_gradient(grad_cpu, dest_cpu, grad_input, ceiling); - error = max(abs(mat(grad_cuda) - mat(grad_cpu))); - DLIB_TEST_MSG(error < 1e-7, "error: " << error); -#endif // DLIB_USE_CUDA + IF_DLIB_USE_CUDA( + using namespace dlib::tt; + print_spinner(); + const long n = 4; + const long k = 5; + const long nr = 3; + const long nc = 3; + const float ceiling = 6.0f; + resizable_tensor src(n, k, nr, nc); + tt::tensor_rand rnd; + rnd.fill_gaussian(src, 0, 3); + resizable_tensor dest_cuda, dest_cpu; + dest_cuda.copy_size(src); + dest_cpu.copy_size(src); + // initialize to different values in order to make sure the output is actually changed + dest_cuda = 1; + dest_cpu = 2; + cuda::clipped_relu(dest_cuda, src, ceiling); + cpu::clipped_relu(dest_cpu, src, ceiling); + auto error = max(abs(mat(dest_cuda) - mat(dest_cpu))); + DLIB_TEST_MSG(error < 1e-7, "error: " << error); + + // test gradients + resizable_tensor grad_cuda, grad_cpu, grad_input; + grad_cuda.copy_size(src); + grad_cpu.copy_size(src); + grad_input.copy_size(src); + rnd.fill_uniform(grad_input); + grad_cuda = 0; + grad_cpu = 0; 
+ cuda::clipped_relu_gradient(grad_cuda, dest_cuda, grad_input, ceiling); + cpu::clipped_relu_gradient(grad_cpu, dest_cpu, grad_input, ceiling); + error = max(abs(mat(grad_cuda) - mat(grad_cpu))); + DLIB_TEST_MSG(error < 1e-7, "error: " << error); + ) } void test_elu() { -#ifdef DLIB_USE_CUDA - using namespace dlib::tt; - print_spinner(); - const long n = 4; - const long k = 5; - const long nr = 3; - const long nc = 3; - const float alpha = 1.0f; - resizable_tensor src(n, k, nr, nc); - tt::tensor_rand rnd; - rnd.fill_gaussian(src); - resizable_tensor dest_cuda, dest_cpu; - dest_cuda.copy_size(src); - dest_cpu.copy_size(src); - // initialize to different values in order to make sure the output is actually changed - dest_cuda = 1; - dest_cpu = 2; - cuda::elu(dest_cuda, src, alpha); - cpu::elu(dest_cpu, src, alpha); - auto error = max(abs(mat(dest_cuda) - mat(dest_cpu))); - DLIB_TEST_MSG(error < 1e-7, "error: " << error); - // test gradients - resizable_tensor grad_cuda, grad_cpu, grad_input; - grad_cuda.copy_size(src); - grad_cpu.copy_size(src); - grad_input.copy_size(src); - rnd.fill_gaussian(grad_input); - grad_cuda = 0; - grad_cpu = 0; - cuda::elu_gradient(grad_cuda, dest_cuda, grad_input, alpha); - cpu::elu_gradient(grad_cpu, dest_cpu, grad_input, alpha); - error = max(abs(mat(grad_cuda) - mat(grad_cpu))); - DLIB_TEST_MSG(error < 1e-6, "error: " << error); -#endif // DLIB_USE_CUDA + IF_DLIB_USE_CUDA( + using namespace dlib::tt; + print_spinner(); + const long n = 4; + const long k = 5; + const long nr = 3; + const long nc = 3; + const float alpha = 1.0f; + resizable_tensor src(n, k, nr, nc); + tt::tensor_rand rnd; + rnd.fill_gaussian(src); + resizable_tensor dest_cuda, dest_cpu; + dest_cuda.copy_size(src); + dest_cpu.copy_size(src); + // initialize to different values in order to make sure the output is actually changed + dest_cuda = 1; + dest_cpu = 2; + cuda::elu(dest_cuda, src, alpha); + cpu::elu(dest_cpu, src, alpha); + auto error = max(abs(mat(dest_cuda) - mat(dest_cpu))); + DLIB_TEST_MSG(error < 1e-7, "error: " << error); + // test gradients + resizable_tensor grad_cuda, grad_cpu, grad_input; + grad_cuda.copy_size(src); + grad_cpu.copy_size(src); + grad_input.copy_size(src); + rnd.fill_gaussian(grad_input); + grad_cuda = 0; + grad_cpu = 0; + cuda::elu_gradient(grad_cuda, dest_cuda, grad_input, alpha); + cpu::elu_gradient(grad_cpu, dest_cpu, grad_input, alpha); + error = max(abs(mat(grad_cuda) - mat(grad_cpu))); + DLIB_TEST_MSG(error < 1e-6, "error: " << error); + ) } void test_gelu() { -#ifdef DLIB_USE_CUDA - // make sure that cuda::gelu and cpu::gelu return the same results - using namespace dlib::tt; - print_spinner(); - const long n = 4; - const long k = 5; - const long nr = 3; - const long nc = 3; - resizable_tensor src(n,k,nr,nc); - tt::tensor_rand rnd; - rnd.fill_gaussian(src); + IF_DLIB_USE_CUDA( + // make sure that cuda::gelu and cpu::gelu return the same results + using namespace dlib::tt; + print_spinner(); + const long n = 4; + const long k = 5; + const long nr = 3; + const long nc = 3; + resizable_tensor src(n,k,nr,nc); + tt::tensor_rand rnd; + rnd.fill_gaussian(src); - resizable_tensor dest1, dest2; - dest1.copy_size(src); - dest2.copy_size(src); - // initialize to different values in order to make sure the output is actually changed - dest1 = 1; - dest2 = 2; - cuda::gelu(dest1, src); - cpu::gelu(dest2, src); - DLIB_TEST_MSG(max(abs(mat(dest1) - mat(dest2))) < 1e-6, max(abs(mat(dest1) - mat(dest2)))); -#endif // DLIB_USE_CUDA + resizable_tensor dest1, dest2; + 
dest1.copy_size(src); + dest2.copy_size(src); + // initialize to different values in order to make sure the output is actually changed + dest1 = 1; + dest2 = 2; + cuda::gelu(dest1, src); + cpu::gelu(dest2, src); + DLIB_TEST_MSG(max(abs(mat(dest1) - mat(dest2))) < 1e-6, max(abs(mat(dest1) - mat(dest2)))); + ) } void test_smelu() { -#ifdef DLIB_USE_CUDA - using namespace dlib::tt; - print_spinner(); - const long n = 4; - const long k = 5; - const long nr = 3; - const long nc = 3; - const float beta = 1; - resizable_tensor src(n, k, nr, nc); - tt::tensor_rand rnd; - rnd.fill_gaussian(src); - resizable_tensor dest_cuda, dest_cpu; - dest_cuda.copy_size(src); - dest_cpu.copy_size(src); - // initialize to different values in order to make sure the output is actually changed - dest_cuda = 1; - dest_cpu = 2; - cuda::smelu(dest_cuda, src, beta); - cpu::smelu(dest_cpu, src, beta); - - DLIB_TEST_MSG(max(abs(mat(dest_cuda) - mat(dest_cpu))) < 1e-7, max(abs(mat(dest_cuda) - mat(dest_cpu)))); -#endif // DLIB_USE_CUDA + IF_DLIB_USE_CUDA( + using namespace dlib::tt; + print_spinner(); + const long n = 4; + const long k = 5; + const long nr = 3; + const long nc = 3; + const float beta = 1; + resizable_tensor src(n, k, nr, nc); + tt::tensor_rand rnd; + rnd.fill_gaussian(src); + resizable_tensor dest_cuda, dest_cpu; + dest_cuda.copy_size(src); + dest_cpu.copy_size(src); + // initialize to different values in order to make sure the output is actually changed + dest_cuda = 1; + dest_cpu = 2; + cuda::smelu(dest_cuda, src, beta); + cpu::smelu(dest_cpu, src, beta); + + DLIB_TEST_MSG(max(abs(mat(dest_cuda) - mat(dest_cpu))) < 1e-7, max(abs(mat(dest_cuda) - mat(dest_cpu)))); + ) } void test_silu() { -#ifdef DLIB_USE_CUDA - using namespace dlib::tt; - print_spinner(); - const long n = 4; - const long k = 5; - const long nr = 3; - const long nc = 3; - resizable_tensor src(n, k, nr, nc); - tt::tensor_rand rnd; - rnd.fill_gaussian(src); - resizable_tensor dest_cuda, dest_cpu; - dest_cuda.copy_size(src); - dest_cpu.copy_size(src); - // initialize to different values in order to make sure the output is actually changed - dest_cuda = 1; - dest_cpu = 2; - cuda::silu(dest_cuda, src); - cpu::silu(dest_cpu, src); - - DLIB_TEST_MSG(max(abs(mat(dest_cuda) - mat(dest_cpu))) < 1e-6, max(abs(mat(dest_cuda) - mat(dest_cpu)))); -#endif // DLIB_USE_CUDA + IF_DLIB_USE_CUDA( + using namespace dlib::tt; + print_spinner(); + const long n = 4; + const long k = 5; + const long nr = 3; + const long nc = 3; + resizable_tensor src(n, k, nr, nc); + tt::tensor_rand rnd; + rnd.fill_gaussian(src); + resizable_tensor dest_cuda, dest_cpu; + dest_cuda.copy_size(src); + dest_cpu.copy_size(src); + // initialize to different values in order to make sure the output is actually changed + dest_cuda = 1; + dest_cpu = 2; + cuda::silu(dest_cuda, src); + cpu::silu(dest_cpu, src); + + DLIB_TEST_MSG(max(abs(mat(dest_cuda) - mat(dest_cpu))) < 1e-6, max(abs(mat(dest_cuda) - mat(dest_cpu)))); + ) } void test_batch_normalize() @@ -710,28 +711,28 @@ namespace DLIB_TEST(::std::abs(rs.stddev() - 1.0f) < 0.01); } // check that the CPU and the CUDA implementation are equivalent -#ifdef DLIB_USE_CUDA - resizable_tensor y_cuda(x); - resizable_tensor means_cuda(x.num_samples()), invstds_cuda(x.num_samples()); - cuda::layer_normalize(eps, y_cuda, means_cuda, invstds_cuda, x, gamma, beta); - DLIB_TEST(max(abs(mat(y_cpu) - mat(y_cuda))) < 1e-5); - DLIB_TEST(max(abs(mat(means_cpu) - mat(means_cuda))) < 1e-5); - DLIB_TEST(max(abs(mat(invstds_cpu) - mat(invstds_cuda))) < 1e-5); - 
resizable_tensor gradient_input(x); - resizable_tensor src_grad_cpu(x), gamma_grad_cpu(1, x.k(), 1, 1), beta_grad_cpu(1, x.k(), 1, 1); - resizable_tensor src_grad_cuda(x), gamma_grad_cuda(1, x.k(), 1, 1), beta_grad_cuda(1, x.k(), 1, 1); - resizable_tensor dmeans_cpu, dvars_cpu, dmeans_cuda, dvars_cuda; - rnd.fill_gaussian(gradient_input); - src_grad_cpu = 0; - src_grad_cuda = 0; - cpu::layer_normalize_gradient(eps, gradient_input, means_cpu, invstds_cpu, x, gamma, src_grad_cpu, gamma_grad_cpu, beta_grad_cpu, dmeans_cpu, dvars_cpu); - cuda::layer_normalize_gradient(eps, gradient_input, means_cuda, invstds_cuda, x, gamma, src_grad_cuda, gamma_grad_cuda, beta_grad_cuda, dmeans_cuda, dvars_cuda); - DLIB_TEST(max(abs(mat(src_grad_cpu) - mat(src_grad_cuda))) < 1e-5); - DLIB_TEST(max(abs(mat(gamma_grad_cpu) - mat(gamma_grad_cuda))) < 1e-5); - DLIB_TEST(max(abs(mat(beta_grad_cpu) - mat(beta_grad_cuda))) < 1e-5); - DLIB_TEST(max(abs(mat(dmeans_cpu) - mat(dmeans_cuda))) < 1e-4); - DLIB_TEST(max(abs(mat(dvars_cpu) - mat(dvars_cuda))) < 1e-4); -#endif + IF_DLIB_USE_CUDA( + resizable_tensor y_cuda(x); + resizable_tensor means_cuda(x.num_samples()), invstds_cuda(x.num_samples()); + cuda::layer_normalize(eps, y_cuda, means_cuda, invstds_cuda, x, gamma, beta); + DLIB_TEST(max(abs(mat(y_cpu) - mat(y_cuda))) < 1e-5); + DLIB_TEST(max(abs(mat(means_cpu) - mat(means_cuda))) < 1e-5); + DLIB_TEST(max(abs(mat(invstds_cpu) - mat(invstds_cuda))) < 1e-5); + resizable_tensor gradient_input(x); + resizable_tensor src_grad_cpu(x), gamma_grad_cpu(1, x.k(), 1, 1), beta_grad_cpu(1, x.k(), 1, 1); + resizable_tensor src_grad_cuda(x), gamma_grad_cuda(1, x.k(), 1, 1), beta_grad_cuda(1, x.k(), 1, 1); + resizable_tensor dmeans_cpu, dvars_cpu, dmeans_cuda, dvars_cuda; + rnd.fill_gaussian(gradient_input); + src_grad_cpu = 0; + src_grad_cuda = 0; + cpu::layer_normalize_gradient(eps, gradient_input, means_cpu, invstds_cpu, x, gamma, src_grad_cpu, gamma_grad_cpu, beta_grad_cpu, dmeans_cpu, dvars_cpu); + cuda::layer_normalize_gradient(eps, gradient_input, means_cuda, invstds_cuda, x, gamma, src_grad_cuda, gamma_grad_cuda, beta_grad_cuda, dmeans_cuda, dvars_cuda); + DLIB_TEST(max(abs(mat(src_grad_cpu) - mat(src_grad_cuda))) < 1e-5); + DLIB_TEST(max(abs(mat(gamma_grad_cpu) - mat(gamma_grad_cuda))) < 1e-5); + DLIB_TEST(max(abs(mat(beta_grad_cpu) - mat(beta_grad_cuda))) < 1e-5); + DLIB_TEST(max(abs(mat(dmeans_cpu) - mat(dmeans_cuda))) < 1e-4); + DLIB_TEST(max(abs(mat(dvars_cpu) - mat(dvars_cuda))) < 1e-4); + ) } // ---------------------------------------------------------------------------------------- @@ -810,21 +811,21 @@ namespace DLIB_TEST(!backward_error_found); // check that the CPU and the CUDA implementation are equivalent -#ifdef DLIB_USE_CUDA - resizable_tensor y_cuda(x); - resizable_tensor scale_cuda; - cuda::rms_normalize(eps, y_cuda, scale_cuda, x, gamma); - DLIB_TEST(max(abs(mat(y_cpu) - mat(y_cuda))) < 1e-5); - DLIB_TEST(max(abs(mat(scale_cpu) - mat(scale_cuda))) < 1e-5); - - resizable_tensor src_grad_cuda(x), gamma_grad_cuda(1, x.k()); - resizable_tensor dscale_cuda(x.num_samples()); - src_grad_cuda = 0; - cuda::rms_normalize_gradient(gradient_input, scale_cuda, x, gamma, src_grad_cuda, gamma_grad_cuda, dscale_cuda); - DLIB_TEST(max(abs(mat(src_grad_cpu) - mat(src_grad_cuda))) < 1e-5); - DLIB_TEST(max(abs(mat(gamma_grad_cpu) - mat(gamma_grad_cuda))) < 1e-5); - DLIB_TEST(max(abs(mat(dscale_cpu) - mat(dscale_cuda))) < 1e-5); -#endif + IF_DLIB_USE_CUDA( + resizable_tensor y_cuda(x); + resizable_tensor scale_cuda; + 
cuda::rms_normalize(eps, y_cuda, scale_cuda, x, gamma); + DLIB_TEST(max(abs(mat(y_cpu) - mat(y_cuda))) < 1e-5); + DLIB_TEST(max(abs(mat(scale_cpu) - mat(scale_cuda))) < 1e-5); + + resizable_tensor src_grad_cuda(x), gamma_grad_cuda(1, x.k()); + resizable_tensor dscale_cuda(x.num_samples()); + src_grad_cuda = 0; + cuda::rms_normalize_gradient(gradient_input, scale_cuda, x, gamma, src_grad_cuda, gamma_grad_cuda, dscale_cuda); + DLIB_TEST(max(abs(mat(src_grad_cpu) - mat(src_grad_cuda))) < 1e-5); + DLIB_TEST(max(abs(mat(gamma_grad_cpu) - mat(gamma_grad_cuda))) < 1e-5); + DLIB_TEST(max(abs(mat(dscale_cpu) - mat(dscale_cuda))) < 1e-5); + ) } // ---------------------------------------------------------------------------------------- @@ -847,15 +848,15 @@ namespace input *= 2; DLIB_TEST(max(abs(mat(output_cpu_b) - mat(input))) < 1e-5); -#ifdef DLIB_USE_CUDA - input /= 2; - resizable_tensor output_cuda_a, output_cuda_b(input); - output_cuda_a.copy_size(output_cpu_a); - cuda::transpose(false, output_cuda_a, input); - cuda::transpose(true, output_cuda_b, output_cuda_a); - DLIB_TEST(max(abs(mat(output_cpu_a) - mat(output_cuda_a))) < 1e-5); - DLIB_TEST(max(abs(mat(output_cpu_b) - mat(output_cuda_b))) < 1e-5); -#endif + IF_DLIB_USE_CUDA( + input /= 2; + resizable_tensor output_cuda_a, output_cuda_b(input); + output_cuda_a.copy_size(output_cpu_a); + cuda::transpose(false, output_cuda_a, input); + cuda::transpose(true, output_cuda_b, output_cuda_a); + DLIB_TEST(max(abs(mat(output_cpu_a) - mat(output_cuda_a))) < 1e-5); + DLIB_TEST(max(abs(mat(output_cpu_b) - mat(output_cuda_b))) < 1e-5); + ) } // ---------------------------------------------------------------------------------------- @@ -1093,31 +1094,31 @@ void test_embeddings() memcpy(A, truth); DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); -#ifdef DLIB_USE_CUDA - A = 4; - A.device(); - B.host(); - memcpy(A, truth); - DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); + IF_DLIB_USE_CUDA( + A = 4; + A.device(); + B.host(); + memcpy(A, truth); + DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); - A = 4; - A.device(); - B.device(); - memcpy(A, truth); - DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); + A = 4; + A.device(); + B.device(); + memcpy(A, truth); + DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); - A = 4; - A.host(); - B.device(); - memcpy(A, truth); - DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); + A = 4; + A.host(); + B.device(); + memcpy(A, truth); + DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); - A = 4; - A.host_write_only(); - B.device(); - memcpy(A, truth); - DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); -#endif + A = 4; + A.host_write_only(); + B.device(); + memcpy(A, truth); + DLIB_TEST(max(abs(mat(A)- mat(truth))) < 1e-5); + ) } { @@ -1166,69 +1167,69 @@ void test_embeddings() } -#ifdef DLIB_USE_CUDA - A = 4; - A.device(); - B.host(); - { - // non-aliasing test - auto aA = at(A,5); - auto aB = at(B,5); - memcpy(aA, aB); - truth = {4,4,4,4,4, 1,1,1,1,1, 4}; - DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); - } - { - // aliasing test - auto aA = at(A,1); - auto aB = at(A,6); - memcpy(aA, aB); - truth = {4,1,1,1,1, 4,1,1,1,1, 4}; - DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); - } + IF_DLIB_USE_CUDA( + A = 4; + A.device(); + B.host(); + { + // non-aliasing test + auto aA = at(A,5); + auto aB = at(B,5); + memcpy(aA, aB); + truth = {4,4,4,4,4, 1,1,1,1,1, 4}; + DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); + } + { + // aliasing test + auto aA = at(A,1); + auto aB = at(A,6); + memcpy(aA, aB); + truth = {4,1,1,1,1, 4,1,1,1,1, 4}; + 
DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); + } - A = 4; - A.device(); - B.device(); - { - // non-aliasing test - auto aA = at(A,5); - auto aB = at(B,5); - memcpy(aA, aB); - truth = {4,4,4,4,4, 1,1,1,1,1, 4}; - DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); - } - { - // aliasing test - auto aA = at(A,1); - auto aB = at(A,6); - memcpy(aA, aB); - truth = {4,1,1,1,1, 4,1,1,1,1, 4}; - DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); - } + A = 4; + A.device(); + B.device(); + { + // non-aliasing test + auto aA = at(A,5); + auto aB = at(B,5); + memcpy(aA, aB); + truth = {4,4,4,4,4, 1,1,1,1,1, 4}; + DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); + } + { + // aliasing test + auto aA = at(A,1); + auto aB = at(A,6); + memcpy(aA, aB); + truth = {4,1,1,1,1, 4,1,1,1,1, 4}; + DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); + } - A = 4; - A.host(); - B.device(); - { - // non-aliasing test - auto aA = at(A,5); - auto aB = at(B,5); - memcpy(aA, aB); - truth = {4,4,4,4,4, 1,1,1,1,1, 4}; - DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); - } - { - // aliasing test - auto aA = at(A,1); - auto aB = at(A,6); - memcpy(aA, aB); - truth = {4,1,1,1,1, 4,1,1,1,1, 4}; - DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); - } + A = 4; + A.host(); + B.device(); + { + // non-aliasing test + auto aA = at(A,5); + auto aB = at(B,5); + memcpy(aA, aB); + truth = {4,4,4,4,4, 1,1,1,1,1, 4}; + DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); + } + { + // aliasing test + auto aA = at(A,1); + auto aB = at(A,6); + memcpy(aA, aB); + truth = {4,1,1,1,1, 4,1,1,1,1, 4}; + DLIB_TEST(max(abs(mat(A)- truth)) < 1e-5); + } -#endif + ) } { @@ -3671,22 +3672,22 @@ void test_multm_prev() trainer.train(inputs, labels); const auto error_after = compute_error(); DLIB_TEST_MSG(error_after < error_before, "multi channel error increased after training"); -#if DLIB_USE_CUDA - cuda::compute_loss_mean_squared_per_channel_and_pixel cuda_compute; - cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute; - double cuda_loss, cpu_loss; - const tensor& output_tensor = net.subnet().get_output(); - resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor); - cuda_compute(labels.begin(), output_tensor, cuda_grad, cuda_loss); - cpu_compute(labels.begin(), output_tensor, cpu_grad, cpu_loss); - DLIB_TEST(cuda_grad.size() == cpu_grad.size()); - for (size_t i = 0; i < cuda_grad.size(); ++i) - { - DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8); - } - const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; - DLIB_TEST_MSG(err < 1e-6, "multi channel cuda and cpu losses differ"); -#endif + IF_DLIB_USE_CUDA( + cuda::compute_loss_mean_squared_per_channel_and_pixel cuda_compute; + cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute; + double cuda_loss, cpu_loss; + const tensor& output_tensor = net.subnet().get_output(); + resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor); + cuda_compute(labels.begin(), output_tensor, cuda_grad, cuda_loss); + cpu_compute(labels.begin(), output_tensor, cpu_grad, cpu_loss); + DLIB_TEST(cuda_grad.size() == cpu_grad.size()); + for (size_t i = 0; i < cuda_grad.size(); ++i) + { + DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8); + } + const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; + DLIB_TEST_MSG(err < 1e-6, "multi channel cuda and cpu losses differ"); + ) } // ---------------------------------------------------------------------------------------- @@ -3882,22 +3883,22 @@ void test_multm_prev() DLIB_TEST_MSG(num_correct >= num_correct_required, "Number of correctly 
classified elements = " << num_correct << ", required = " << num_correct_required); -#if DLIB_USE_CUDA - cuda::compute_loss_binary_log_per_pixel cuda_compute; - cpu::compute_loss_binary_log_per_pixel cpu_compute; - double cuda_loss, cpu_loss; - const tensor& output_tensor = net.subnet().get_output(); - resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor); - cuda_compute(y.begin(), output_tensor, cuda_grad, cuda_loss); - cpu_compute(y.begin(), output_tensor, cpu_grad, cpu_loss); - DLIB_TEST(cuda_grad.size() == cpu_grad.size()); - for (size_t i = 0; i < cuda_grad.size(); ++i) - { - DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8); - } - const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; - DLIB_TEST_MSG(err < 1e-6, "binary log per pixel cuda and cpu losses differ"); -#endif + IF_DLIB_USE_CUDA( + cuda::compute_loss_binary_log_per_pixel cuda_compute; + cpu::compute_loss_binary_log_per_pixel cpu_compute; + double cuda_loss, cpu_loss; + const tensor& output_tensor = net.subnet().get_output(); + resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor); + cuda_compute(y.begin(), output_tensor, cuda_grad, cuda_loss); + cpu_compute(y.begin(), output_tensor, cpu_grad, cpu_loss); + DLIB_TEST(cuda_grad.size() == cpu_grad.size()); + for (size_t i = 0; i < cuda_grad.size(); ++i) + { + DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8); + } + const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; + DLIB_TEST_MSG(err < 1e-6, "binary log per pixel cuda and cpu losses differ"); + ) } // ---------------------------------------------------------------------------------------- @@ -4233,22 +4234,22 @@ void test_multm_prev() DLIB_TEST_MSG(num_correct >= num_correct_required, "Number of correctly classified elements = " << num_correct << ", required = " << num_correct_required); -#if DLIB_USE_CUDA - cuda::compute_loss_multiclass_log_per_pixel cuda_compute; - cpu::compute_loss_multiclass_log_per_pixel cpu_compute; - double cuda_loss, cpu_loss; - const tensor& output_tensor = net.subnet().get_output(); - resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor); - cuda_compute(y.begin(), output_tensor, cuda_grad, cuda_loss); - cpu_compute(y.begin(), output_tensor, cpu_grad, cpu_loss); - DLIB_TEST(cuda_grad.size() == cpu_grad.size()); - for (size_t i = 0; i < cuda_grad.size(); ++i) - { - DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8); - } - const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; - DLIB_TEST_MSG(err < 1e-6, "multiclass log per pixel cuda and cpu losses differ"); -#endif + IF_DLIB_USE_CUDA( + cuda::compute_loss_multiclass_log_per_pixel cuda_compute; + cpu::compute_loss_multiclass_log_per_pixel cpu_compute; + double cuda_loss, cpu_loss; + const tensor& output_tensor = net.subnet().get_output(); + resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor); + cuda_compute(y.begin(), output_tensor, cuda_grad, cuda_loss); + cpu_compute(y.begin(), output_tensor, cpu_grad, cpu_loss); + DLIB_TEST(cuda_grad.size() == cpu_grad.size()); + for (size_t i = 0; i < cuda_grad.size(); ++i) + { + DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8); + } + const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; + DLIB_TEST_MSG(err < 1e-6, "multiclass log per pixel cuda and cpu losses differ"); + ) } // ---------------------------------------------------------------------------------------- @@ -4344,22 +4345,22 @@ void test_multm_prev() "The weighted class (" 
<< weighted_class << ") does not dominate: " << num_weighted_class << " <= " << num_not_weighted_class); -#if DLIB_USE_CUDA - cuda::compute_loss_multiclass_log_per_pixel_weighted cuda_compute; - cpu::compute_loss_multiclass_log_per_pixel_weighted cpu_compute; - double cuda_loss, cpu_loss; - const tensor& output_tensor = net.subnet().get_output(); - resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor); - cuda_compute(y_weighted.begin(), output_tensor, cuda_grad, cuda_loss); - cpu_compute(y_weighted.begin(), output_tensor, cpu_grad, cpu_loss); - DLIB_TEST(cuda_grad.size() == cpu_grad.size()); - for (size_t i = 0; i < cuda_grad.size(); ++i) - { - DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8); - } - const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; - DLIB_TEST_MSG(err < 1e-5, "multi class log per pixel weighted cuda and cpu losses differ: " << err); -#endif + IF_DLIB_USE_CUDA( + cuda::compute_loss_multiclass_log_per_pixel_weighted cuda_compute; + cpu::compute_loss_multiclass_log_per_pixel_weighted cpu_compute; + double cuda_loss, cpu_loss; + const tensor& output_tensor = net.subnet().get_output(); + resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor); + cuda_compute(y_weighted.begin(), output_tensor, cuda_grad, cuda_loss); + cpu_compute(y_weighted.begin(), output_tensor, cpu_grad, cpu_loss); + DLIB_TEST(cuda_grad.size() == cpu_grad.size()); + for (size_t i = 0; i < cuda_grad.size(); ++i) + { + DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8); + } + const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; + DLIB_TEST_MSG(err < 1e-5, "multi class log per pixel weighted cuda and cpu losses differ: " << err); + ) } } @@ -4526,10 +4527,10 @@ void test_multm_prev() img = 1; img.host()[idx] = 2; cpu::resize_bilinear(out, img); -#ifdef DLIB_USE_CUDA - cuda::resize_bilinear(out2, img); - DLIB_TEST(max(abs(mat(out)-mat(out2))) < 1e-5); -#endif + IF_DLIB_USE_CUDA( + cuda::resize_bilinear(out2, img); + DLIB_TEST(max(abs(mat(out)-mat(out2))) < 1e-5); + ) resizable_tensor gradient_input; gradient_input.copy_size(out); @@ -4560,12 +4561,12 @@ void test_multm_prev() dlog << LINFO << "analytic grad: "<< grad2.host()[idx]-0.1; DLIB_TEST_MSG(std::abs(numerical_grad - grad2.host()[idx]+0.1) < 1e-2, std::abs(numerical_grad - grad2.host()[idx]+0.1) << " numerical_grad: " << numerical_grad); -#ifdef DLIB_USE_CUDA - cuda::resize_bilinear_gradient(grad, gradient_input); - dlog << LINFO << "analytic grad: "<< grad.host()[idx]-0.1; - DLIB_TEST_MSG(std::abs(numerical_grad - grad.host()[idx]+0.1) < 1e-2, std::abs(numerical_grad - grad.host()[idx]+0.1) << " numerical_grad: " << numerical_grad); - DLIB_TEST(max(abs(mat(grad)-mat(grad2))) < 1e-5); -#endif + IF_DLIB_USE_CUDA( + cuda::resize_bilinear_gradient(grad, gradient_input); + dlog << LINFO << "analytic grad: "<< grad.host()[idx]-0.1; + DLIB_TEST_MSG(std::abs(numerical_grad - grad.host()[idx]+0.1) < 1e-2, std::abs(numerical_grad - grad.host()[idx]+0.1) << " numerical_grad: " << numerical_grad); + DLIB_TEST(max(abs(mat(grad)-mat(grad2))) < 1e-5); + ) } @@ -4586,11 +4587,11 @@ void test_multm_prev() auto wout = aout(out, out.nc()*1+1); auto wimg = aimg(img, img.nc()*1+1); cpu::resize_bilinear(wout,out.nc(),out.nr()*out.nc(), wimg,img.nc(),img.nr()*img.nc()); -#ifdef DLIB_USE_CUDA - auto wout2 = aout(out2, out2.nc()*1+1); - cuda::resize_bilinear(wout2,out2.nc(),out2.nr()*out2.nc(), wimg,img.nc(),img.nr()*img.nc()); - DLIB_TEST(max(abs(mat(out)-mat(out2))) < 1e-5); -#endif + 
IF_DLIB_USE_CUDA( + auto wout2 = aout(out2, out2.nc()*1+1); + cuda::resize_bilinear(wout2,out2.nc(),out2.nr()*out2.nc(), wimg,img.nc(),img.nr()*img.nc()); + DLIB_TEST(max(abs(mat(out)-mat(out2))) < 1e-5); + ) resizable_tensor gradient_input; @@ -4628,15 +4629,14 @@ void test_multm_prev() dlog << LINFO << "analytic grad: "<< grad2.host()[idx]-0.1; DLIB_TEST_MSG(std::abs(numerical_grad - grad2.host()[idx]+0.1) < 1e-2, std::abs(numerical_grad - grad2.host()[idx]+0.1) << " numerical_grad: " << numerical_grad); -#ifdef DLIB_USE_CUDA - wgrad2 = aimg(grad, grad.nc()*1+1); - wgradient_input = aout(gradient_input, gradient_input.nc()*1+1); - cuda::resize_bilinear_gradient(wgrad2,grad.nc(),grad.nr()*grad.nc(), wgradient_input,gradient_input.nc(),gradient_input.nr()*gradient_input.nc()); - dlog << LINFO << "analytic grad: "<< grad.host()[idx]-0.1; - DLIB_TEST_MSG(std::abs(numerical_grad - grad.host()[idx]+0.1) < 1e-2, std::abs(numerical_grad - grad.host()[idx]+0.1) << " numerical_grad: " << numerical_grad); - DLIB_TEST_MSG(max(abs(mat(grad)-mat(grad2))) < 1e-5, max(abs(mat(grad)-mat(grad2)))); -#endif - + IF_DLIB_USE_CUDA( + wgrad2 = aimg(grad, grad.nc()*1+1); + wgradient_input = aout(gradient_input, gradient_input.nc()*1+1); + cuda::resize_bilinear_gradient(wgrad2,grad.nc(),grad.nr()*grad.nc(), wgradient_input,gradient_input.nc(),gradient_input.nr()*gradient_input.nc()); + dlog << LINFO << "analytic grad: "<< grad.host()[idx]-0.1; + DLIB_TEST_MSG(std::abs(numerical_grad - grad.host()[idx]+0.1) < 1e-2, std::abs(numerical_grad - grad.host()[idx]+0.1) << " numerical_grad: " << numerical_grad); + DLIB_TEST_MSG(max(abs(mat(grad)-mat(grad2))) < 1e-5, max(abs(mat(grad)-mat(grad2)))); + ) } } @@ -5036,20 +5036,20 @@ void test_multm_prev() void test_reorg() { -#ifdef DLIB_USE_CUDA - print_spinner(); - resizable_tensor x(2, 4, 8, 16); - resizable_tensor out_cpu(2, 16, 4, 8), out_cuda(2, 16, 4, 8); - resizable_tensor grad_cpu(x), grad_cuda(x); - tt::tensor_rand rnd; - rnd.fill_gaussian(x); - cpu::reorg(false, out_cpu, 2, 2, x); - cuda::reorg(false, out_cuda, 2, 2, x); - DLIB_TEST(max(squared(mat(out_cuda) - mat(out_cpu))) == 0); - cpu::reorg_gradient(false, grad_cpu, 2, 2, out_cpu); - cuda::reorg_gradient(false, grad_cuda, 2, 2, out_cuda); - DLIB_TEST(max(squared(mat(out_cuda) - mat(out_cpu))) == 0); -#endif + IF_DLIB_USE_CUDA( + print_spinner(); + resizable_tensor x(2, 4, 8, 16); + resizable_tensor out_cpu(2, 16, 4, 8), out_cuda(2, 16, 4, 8); + resizable_tensor grad_cpu(x), grad_cuda(x); + tt::tensor_rand rnd; + rnd.fill_gaussian(x); + cpu::reorg(false, out_cpu, 2, 2, x); + cuda::reorg(false, out_cuda, 2, 2, x); + DLIB_TEST(max(squared(mat(out_cuda) - mat(out_cpu))) == 0); + cpu::reorg_gradient(false, grad_cpu, 2, 2, out_cpu); + cuda::reorg_gradient(false, grad_cuda, 2, 2, out_cuda); + DLIB_TEST(max(squared(mat(out_cuda) - mat(out_cpu))) == 0); + ) } void test_input_tensor() @@ -5143,27 +5143,28 @@ void test_multm_prev() srand(1234); test_tagging(); -#ifdef DLIB_USE_CUDA - test_affine_rect(); - test_conv(); - test_more_ops2(); - test_more_ops(1,1); - test_more_ops(3,4); - test_more_ops(4,3); - test_more_ops(4,1); - test_more_ops(1,4); - test_more_ops(10000,4); - compare_bn_gpu_and_cpu(); - compare_bn_conv_gpu_and_cpu(); - test_add(); - test_multiply_zero_padded(); - compare_adam(); - test_copy_tensor_gpu(); - test_copy_tensor_add_to_gpu(); - test_copy_tensor_gpu(); - test_copy_tensor_add_to_gpu(); - test_scale_channels(); -#endif + IF_DLIB_USE_CUDA( + test_affine_rect(); + test_conv(); + test_more_ops2(); + 
test_more_ops(1,1); + test_more_ops(3,4); + test_more_ops(4,3); + test_more_ops(4,1); + test_more_ops(1,4); + test_more_ops(10000,4); + compare_bn_gpu_and_cpu(); + compare_bn_conv_gpu_and_cpu(); + test_add(); + test_multiply_zero_padded(); + compare_adam(); + test_copy_tensor_gpu(); + test_copy_tensor_add_to_gpu(); + test_copy_tensor_gpu(); + test_copy_tensor_add_to_gpu(); + test_scale_channels(); + ) + test_tensor_resize_bilinear(2, 3, 6,6, 11, 11); test_tensor_resize_bilinear(2, 3, 6,6, 3, 4); test_tensor_resize_bilinear(2, 3, 5,6, 12, 21);
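Since CUDA dispatch is now a runtime decision, parity checks like the ones above no longer have to be gated purely on DLIB_USE_CUDA; a single CUDA-enabled test binary could exercise both code paths by flipping the thread-local flag. A hypothetical harness along these lines (run_on_both_paths is not part of this patch; it only uses the is_available()/set_use_cuda() API the patch introduces):

    // Hypothetical helper: run a test body on the CUDA path (when a
    // device exists) and then again on the CPU fallback path.
    template <typename TestFn>
    void run_on_both_paths(TestFn&& test)
    {
        if (dlib::cuda::is_available())
        {
            dlib::cuda::set_use_cuda(true);   // exercise the CUDA branches
            test();
        }
        dlib::cuda::set_use_cuda(false);      // then the CPU branches
        test();
    }

    // e.g. run_on_both_paths([]{ test_tensor_resize_bilinear(2, 3, 6,6, 11, 11); });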