From b56c347e7df402fe49e9bd8fa45ad9c04215ba9d Mon Sep 17 00:00:00 2001 From: wangxinjiang Date: Fri, 21 Jul 2023 11:33:35 +0800 Subject: [PATCH 1/6] Remove OpMathType dependencies --- .../bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu | 4 ++-- .../backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh | 7 +++---- .../models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu | 4 ++-- .../backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh | 7 +++---- classification/ops_dcnv3/src/cuda/dcnv3_cuda.cu | 4 ++-- classification/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh | 7 +++---- detection/ops_dcnv3/src/cuda/dcnv3_cuda.cu | 4 ++-- detection/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh | 7 +++---- segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu | 4 ++-- segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh | 7 +++---- .../modulated_deform_conv_v3/trt_deform_conv_v3_kernel.cu | 2 +- 11 files changed, 26 insertions(+), 31 deletions(-) diff --git a/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu b/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu index 52840957..a7c36ab8 100644 --- a/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu +++ b/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu @@ -68,7 +68,7 @@ at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset, // AT_DISPATCH_FLOATING_TYPES( AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.type(), "ms_deform_attn_forward_cuda", ([&] { - dcnv3_im2col_cuda( + dcnv3_im2col_cuda( at::cuda::getCurrentCUDAStream(), input.data() + n * im2col_step_ * per_input_size, offset.data() + @@ -146,7 +146,7 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset, // AT_DISPATCH_FLOATING_TYPES( AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.type(), "ms_deform_attn_backward_cuda", ([&] { - dcnv3_col2im_cuda( + dcnv3_col2im_cuda( at::cuda::getCurrentCUDAStream(), grad_output_g.data(), input.data() + n * im2col_step_ * per_input_size, diff --git a/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh b/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh index b551ba3f..516058e9 100644 --- a/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh +++ b/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh @@ -14,7 +14,6 @@ #include #include -#include #include #include @@ -27,7 +26,7 @@ inline int GET_BLOCKS(const int N, const int num_threads) { return (N + num_threads - 1) / num_threads; } -#define opmath_t at::opmath_type +#define opmath_t scalar_t template __device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data, @@ -839,7 +838,7 @@ __global__ void dcnv3_col2im_gpu_kernel_gm( } template -void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, +void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, scalar_t *data_col, const int kernel_h, const int kernel_w, const int stride_h, @@ -868,7 +867,7 @@ void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, } template -void dcnv3_col2im_cuda( +void dcnv3_col2im_cuda( cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, diff --git a/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu b/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu index 52840957..a7c36ab8 100644 --- a/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu +++ b/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu @@ -68,7 +68,7 @@ at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset, // AT_DISPATCH_FLOATING_TYPES( AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.type(), "ms_deform_attn_forward_cuda", ([&] { - dcnv3_im2col_cuda( + dcnv3_im2col_cuda( at::cuda::getCurrentCUDAStream(), input.data() + n * im2col_step_ * per_input_size, offset.data() + @@ -146,7 +146,7 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset, // AT_DISPATCH_FLOATING_TYPES( AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.type(), "ms_deform_attn_backward_cuda", ([&] { - dcnv3_col2im_cuda( + dcnv3_col2im_cuda( at::cuda::getCurrentCUDAStream(), grad_output_g.data(), input.data() + n * im2col_step_ * per_input_size, diff --git a/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh b/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh index b551ba3f..516058e9 100644 --- a/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh +++ b/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh @@ -14,7 +14,6 @@ #include #include -#include #include #include @@ -27,7 +26,7 @@ inline int GET_BLOCKS(const int N, const int num_threads) { return (N + num_threads - 1) / num_threads; } -#define opmath_t at::opmath_type +#define opmath_t scalar_t template __device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data, @@ -839,7 +838,7 @@ __global__ void dcnv3_col2im_gpu_kernel_gm( } template -void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, +void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, scalar_t *data_col, const int kernel_h, const int kernel_w, const int stride_h, @@ -868,7 +867,7 @@ void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, } template -void dcnv3_col2im_cuda( +void dcnv3_col2im_cuda( cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, diff --git a/classification/ops_dcnv3/src/cuda/dcnv3_cuda.cu b/classification/ops_dcnv3/src/cuda/dcnv3_cuda.cu index c8ee4797..3800c8ee 100644 --- a/classification/ops_dcnv3/src/cuda/dcnv3_cuda.cu +++ b/classification/ops_dcnv3/src/cuda/dcnv3_cuda.cu @@ -68,7 +68,7 @@ at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset, // AT_DISPATCH_FLOATING_TYPES( AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.type(), "ms_deform_attn_forward_cuda", ([&] { - dcnv3_im2col_cuda( + dcnv3_im2col_cuda( at::cuda::getCurrentCUDAStream(), input.data() + n * im2col_step_ * per_input_size, offset.data() + @@ -146,7 +146,7 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset, // AT_DISPATCH_FLOATING_TYPES( AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.type(), "ms_deform_attn_backward_cuda", ([&] { - dcnv3_col2im_cuda( + dcnv3_col2im_cuda( at::cuda::getCurrentCUDAStream(), grad_output_g.data(), input.data() + n * im2col_step_ * per_input_size, diff --git a/classification/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh b/classification/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh index b2bbf844..2378d59f 100644 --- a/classification/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh +++ b/classification/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh @@ -14,7 +14,6 @@ #include #include -#include #include #include @@ -27,7 +26,7 @@ inline int GET_BLOCKS(const int N, const int num_threads) { return (N + num_threads - 1) / num_threads; } -#define opmath_t at::opmath_type +#define opmath_t scalar_t template __device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data, @@ -888,7 +887,7 @@ __global__ void dcnv3_col2im_gpu_kernel_gm( } template -void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, +void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, scalar_t *data_col, const int kernel_h, const int kernel_w, const int stride_h, @@ -917,7 +916,7 @@ void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, } template -void dcnv3_col2im_cuda( +void dcnv3_col2im_cuda( cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, diff --git a/detection/ops_dcnv3/src/cuda/dcnv3_cuda.cu b/detection/ops_dcnv3/src/cuda/dcnv3_cuda.cu index 52840957..a7c36ab8 100644 --- a/detection/ops_dcnv3/src/cuda/dcnv3_cuda.cu +++ b/detection/ops_dcnv3/src/cuda/dcnv3_cuda.cu @@ -68,7 +68,7 @@ at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset, // AT_DISPATCH_FLOATING_TYPES( AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.type(), "ms_deform_attn_forward_cuda", ([&] { - dcnv3_im2col_cuda( + dcnv3_im2col_cuda( at::cuda::getCurrentCUDAStream(), input.data() + n * im2col_step_ * per_input_size, offset.data() + @@ -146,7 +146,7 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset, // AT_DISPATCH_FLOATING_TYPES( AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.type(), "ms_deform_attn_backward_cuda", ([&] { - dcnv3_col2im_cuda( + dcnv3_col2im_cuda( at::cuda::getCurrentCUDAStream(), grad_output_g.data(), input.data() + n * im2col_step_ * per_input_size, diff --git a/detection/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh b/detection/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh index b551ba3f..516058e9 100644 --- a/detection/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh +++ b/detection/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh @@ -14,7 +14,6 @@ #include #include -#include #include #include @@ -27,7 +26,7 @@ inline int GET_BLOCKS(const int N, const int num_threads) { return (N + num_threads - 1) / num_threads; } -#define opmath_t at::opmath_type +#define opmath_t scalar_t template __device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data, @@ -839,7 +838,7 @@ __global__ void dcnv3_col2im_gpu_kernel_gm( } template -void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, +void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, scalar_t *data_col, const int kernel_h, const int kernel_w, const int stride_h, @@ -868,7 +867,7 @@ void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, } template -void dcnv3_col2im_cuda( +void dcnv3_col2im_cuda( cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, diff --git a/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu b/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu index 52840957..a7c36ab8 100644 --- a/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu +++ b/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu @@ -68,7 +68,7 @@ at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset, // AT_DISPATCH_FLOATING_TYPES( AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.type(), "ms_deform_attn_forward_cuda", ([&] { - dcnv3_im2col_cuda( + dcnv3_im2col_cuda( at::cuda::getCurrentCUDAStream(), input.data() + n * im2col_step_ * per_input_size, offset.data() + @@ -146,7 +146,7 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset, // AT_DISPATCH_FLOATING_TYPES( AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.type(), "ms_deform_attn_backward_cuda", ([&] { - dcnv3_col2im_cuda( + dcnv3_col2im_cuda( at::cuda::getCurrentCUDAStream(), grad_output_g.data(), input.data() + n * im2col_step_ * per_input_size, diff --git a/segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh b/segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh index b551ba3f..516058e9 100644 --- a/segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh +++ b/segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh @@ -14,7 +14,6 @@ #include #include -#include #include #include @@ -27,7 +26,7 @@ inline int GET_BLOCKS(const int N, const int num_threads) { return (N + num_threads - 1) / num_threads; } -#define opmath_t at::opmath_type +#define opmath_t scalar_t template __device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data, @@ -839,7 +838,7 @@ __global__ void dcnv3_col2im_gpu_kernel_gm( } template -void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, +void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, scalar_t *data_col, const int kernel_h, const int kernel_w, const int stride_h, @@ -868,7 +867,7 @@ void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, } template -void dcnv3_col2im_cuda( +void dcnv3_col2im_cuda( cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, diff --git a/tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3_kernel.cu b/tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3_kernel.cu index 0eb487f1..43d1ace4 100644 --- a/tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3_kernel.cu +++ b/tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3_kernel.cu @@ -118,7 +118,7 @@ __global__ void dcnv3_im2col_gpu_kernel( } template -void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, const scalar_t *data_offset, +void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, scalar_t *data_col, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, From bd9e49b2fd125b22d43e5c67cf8764fe3fcdae24 Mon Sep 17 00:00:00 2001 From: wangxinjiang Date: Fri, 21 Jul 2023 11:41:39 +0800 Subject: [PATCH 2/6] Fix typos --- .../backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh | 4 ++-- .../models/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh | 4 ++-- classification/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh | 4 ++-- detection/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh | 4 ++-- segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh | 4 ++-- .../modulated_deform_conv_v3/trt_deform_conv_v3_kernel.cu | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh b/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh index 516058e9..d48f15e1 100644 --- a/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh +++ b/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh @@ -838,7 +838,7 @@ __global__ void dcnv3_col2im_gpu_kernel_gm( } template -void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, +void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, scalar_t *data_col, const int kernel_h, const int kernel_w, const int stride_h, @@ -867,7 +867,7 @@ void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, } template -void dcnv3_col2im_cuda( +void dcnv3_col2im_cuda( cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, diff --git a/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh b/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh index 516058e9..d48f15e1 100644 --- a/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh +++ b/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh @@ -838,7 +838,7 @@ __global__ void dcnv3_col2im_gpu_kernel_gm( } template -void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, +void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, scalar_t *data_col, const int kernel_h, const int kernel_w, const int stride_h, @@ -867,7 +867,7 @@ void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, } template -void dcnv3_col2im_cuda( +void dcnv3_col2im_cuda( cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, diff --git a/classification/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh b/classification/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh index 2378d59f..f03c11dc 100644 --- a/classification/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh +++ b/classification/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh @@ -887,7 +887,7 @@ __global__ void dcnv3_col2im_gpu_kernel_gm( } template -void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, +void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, scalar_t *data_col, const int kernel_h, const int kernel_w, const int stride_h, @@ -916,7 +916,7 @@ void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, } template -void dcnv3_col2im_cuda( +void dcnv3_col2im_cuda( cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, diff --git a/detection/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh b/detection/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh index 516058e9..d48f15e1 100644 --- a/detection/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh +++ b/detection/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh @@ -838,7 +838,7 @@ __global__ void dcnv3_col2im_gpu_kernel_gm( } template -void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, +void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, scalar_t *data_col, const int kernel_h, const int kernel_w, const int stride_h, @@ -867,7 +867,7 @@ void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, } template -void dcnv3_col2im_cuda( +void dcnv3_col2im_cuda( cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, diff --git a/segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh b/segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh index 516058e9..d48f15e1 100644 --- a/segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh +++ b/segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh @@ -838,7 +838,7 @@ __global__ void dcnv3_col2im_gpu_kernel_gm( } template -void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, +void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, scalar_t *data_col, const int kernel_h, const int kernel_w, const int stride_h, @@ -867,7 +867,7 @@ void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, } template -void dcnv3_col2im_cuda( +void dcnv3_col2im_cuda( cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, diff --git a/tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3_kernel.cu b/tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3_kernel.cu index 43d1ace4..0eb487f1 100644 --- a/tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3_kernel.cu +++ b/tensorrt/modulated_deform_conv_v3/trt_deform_conv_v3_kernel.cu @@ -118,7 +118,7 @@ __global__ void dcnv3_im2col_gpu_kernel( } template -void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, const scalar_t *data_offset, +void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, scalar_t *data_col, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, From 19b0514c4ba30ad7498cfedce390cfa90082152d Mon Sep 17 00:00:00 2001 From: wangxinjiang Date: Fri, 21 Jul 2023 11:54:48 +0800 Subject: [PATCH 3/6] Add test for half --- classification/ops_dcnv3/test.py | 95 +++++++++++++++++++++++++++++++ detection/ops_dcnv3/test.py | 96 ++++++++++++++++++++++++++++++++ 2 files changed, 191 insertions(+) diff --git a/classification/ops_dcnv3/test.py b/classification/ops_dcnv3/test.py index 5a0a4e80..9621d47c 100644 --- a/classification/ops_dcnv3/test.py +++ b/classification/ops_dcnv3/test.py @@ -61,6 +61,36 @@ def check_forward_equal_with_pytorch_double(): print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') +@torch.no_grad() +def check_forward_equal_with_pytorch_half(): + input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01 + offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10 + mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5 + mask /= mask.sum(-1, keepdim=True) + mask = mask.reshape(N, H_out, W_out, M*P) + + output_pytorch = dcnv3_core_pytorch( + input.half(), + offset.half(), + mask.half(), + Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale, remove_center).detach().cpu() + + im2col_step = 2 + output_cuda = DCNv3Function.apply( + input.half(), + offset.half(), + mask.half(), + Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale, + im2col_step, remove_center).detach().cpu() + + fwdok = torch.allclose(output_cuda, output_pytorch) + max_abs_err = (output_cuda - output_pytorch).abs().max() + max_rel_err = ((output_cuda - output_pytorch).abs() / + output_pytorch.abs()).max() + print('>>> forward half') + print(f'* {fwdok} check_forward_equal_with_pytorch_half: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') + + @torch.no_grad() def check_forward_equal_with_pytorch_float(): input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01 @@ -154,6 +184,68 @@ def check_backward_equal_with_pytorch_double(channels=4, grad_input=True, grad_o f'* {bwdok} mask_grad check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') +def check_backward_equal_with_pytorch_half(channels=4, grad_input=True, grad_offset=True, grad_mask=True): + # H_in, W_in = 4, 4 + N = 2 + M = 2 + H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1 + W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1 + + D = channels + input0 = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01 + offset0 = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10 + mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5 + mask0 /= mask0.sum(-1, keepdim=True) + mask0 = mask0.reshape(N, H_out, W_out, M*P) + input0.requires_grad = grad_input + offset0.requires_grad = grad_offset + mask0.requires_grad = grad_mask + + output_pytorch = dcnv3_core_pytorch( + input0.half(), + offset0.half(), + mask0.half(), + Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale, remove_center) + output_pytorch.sum().backward() + + input1 = input0.detach() + offset1 = offset0.detach() + mask1 = mask0.detach() + input1.requires_grad = grad_input + offset1.requires_grad = grad_offset + mask1.requires_grad = grad_mask + + im2col_step = 2 + output_cuda = DCNv3Function.apply( + input1.half(), + offset1.half(), + mask1.half(), + Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale, + im2col_step, remove_center) + output_cuda.sum().backward() + + print(f'>>> backward half: channels {D}') + bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3) + max_abs_err = (input0.grad - input1.grad).abs().max() + max_rel_err = ((input0.grad - input1.grad).abs() / + input0.grad.abs()).max() + print( + f'* {bwdok} input_grad check_backward_equal_with_pytorch_half: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') + + bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3) + max_abs_err = (offset0.grad - offset1.grad).abs().max() + max_rel_err = ((offset0.grad - offset1.grad).abs() / + offset0.grad.abs()).max() + print( + f'* {bwdok} offset_grad check_backward_equal_with_pytorch_half: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') + + bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3) + max_abs_err = (mask0.grad - mask1.grad).abs().max() + max_rel_err = ((mask0.grad - mask1.grad).abs() / + mask0.grad.abs()).max() + print( + f'* {bwdok} mask_grad check_backward_equal_with_pytorch_half: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') + def check_backward_equal_with_pytorch_float(channels=4, grad_input=True, grad_offset=True, grad_mask=True): # H_in, W_in = 4, 4 N = 2 @@ -254,9 +346,12 @@ def check_time_cost(im2col_step=128): if __name__ == '__main__': check_forward_equal_with_pytorch_double() + check_forward_equal_with_pytorch_half() check_forward_equal_with_pytorch_float() for channels in [1, 16, 30, 32, 64, 71, 1025]: check_backward_equal_with_pytorch_double(channels, True, True, True) + for channels in [1, 16, 30, 32, 64, 71, 1025]: + check_backward_equal_with_pytorch_half(channels, True, True, True) for channels in [1, 16, 30, 32, 64, 71, 1025]: check_backward_equal_with_pytorch_float(channels, True, True, True) for i in range(3): diff --git a/detection/ops_dcnv3/test.py b/detection/ops_dcnv3/test.py index 0277bef4..f36dd4ce 100644 --- a/detection/ops_dcnv3/test.py +++ b/detection/ops_dcnv3/test.py @@ -30,6 +30,36 @@ torch.manual_seed(3) +@torch.no_grad() +def check_forward_equal_with_pytorch_half(): + input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01 + offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10 + mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5 + mask /= mask.sum(-1, keepdim=True) + mask = mask.reshape(N, H_out, W_out, M*P) + + output_pytorch = dcnv3_core_pytorch( + input.half(), + offset.half(), + mask.half(), + Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale).detach().cpu() + + im2col_step = 2 + output_cuda = DCNv3Function.apply( + input.half(), + offset.half(), + mask.half(), + Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale, + im2col_step).detach().cpu() + + fwdok = torch.allclose(output_cuda, output_pytorch) + max_abs_err = (output_cuda - output_pytorch).abs().max() + max_rel_err = ((output_cuda - output_pytorch).abs() / + output_pytorch.abs()).max() + print('>>> forward half') + print(f'* {fwdok} check_forward_equal_with_pytorch_half: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') + + @torch.no_grad() def check_forward_equal_with_pytorch_double(): input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01 @@ -90,6 +120,69 @@ def check_forward_equal_with_pytorch_float(): print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') +def check_backward_equal_with_pytorch_half(channels=4, grad_input=True, grad_offset=True, grad_mask=True): + # H_in, W_in = 4, 4 + N = 2 + M = 2 + H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1 + W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1 + + D = channels + input0 = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01 + offset0 = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10 + mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5 + mask0 /= mask0.sum(-1, keepdim=True) + mask0 = mask0.reshape(N, H_out, W_out, M*P) + input0.requires_grad = grad_input + offset0.requires_grad = grad_offset + mask0.requires_grad = grad_mask + + output_pytorch = dcnv3_core_pytorch( + input0.half(), + offset0.half(), + mask0.half(), + Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale) + output_pytorch.sum().backward() + + input1 = input0.detach() + offset1 = offset0.detach() + mask1 = mask0.detach() + input1.requires_grad = grad_input + offset1.requires_grad = grad_offset + mask1.requires_grad = grad_mask + + im2col_step = 2 + output_cuda = DCNv3Function.apply( + input1.half(), + offset1.half(), + mask1.half(), + Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale, + im2col_step) + output_cuda.sum().backward() + + print(f'>>> backward half: channels {D}') + bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3) + max_abs_err = (input0.grad - input1.grad).abs().max() + max_rel_err = ((input0.grad - input1.grad).abs() / + input0.grad.abs()).max() + print( + f'* {bwdok} input_grad check_backward_equal_with_pytorch_half: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') + + bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3) + max_abs_err = (offset0.grad - offset1.grad).abs().max() + max_rel_err = ((offset0.grad - offset1.grad).abs() / + offset0.grad.abs()).max() + print( + f'* {bwdok} offset_grad check_backward_equal_with_pytorch_half: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') + + bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3) + max_abs_err = (mask0.grad - mask1.grad).abs().max() + max_rel_err = ((mask0.grad - mask1.grad).abs() / + mask0.grad.abs()).max() + print( + f'* {bwdok} mask_grad check_backward_equal_with_pytorch_half: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') + + def check_backward_equal_with_pytorch_double(channels=4, grad_input=True, grad_offset=True, grad_mask=True): # H_in, W_in = 4, 4 N = 2 @@ -252,8 +345,11 @@ def check_time_cost(im2col_step=128): if __name__ == '__main__': + check_forward_equal_with_pytorch_half() check_forward_equal_with_pytorch_double() check_forward_equal_with_pytorch_float() + for channels in [1, 16, 30, 32, 64, 71, 1025]: + check_backward_equal_with_pytorch_half(channels, True, True, True) for channels in [1, 16, 30, 32, 64, 71, 1025]: check_backward_equal_with_pytorch_double(channels, True, True, True) for channels in [1, 16, 30, 32, 64, 71, 1025]: From da2cba28f149f9b08f4a136f028ed8a987b96273 Mon Sep 17 00:00:00 2001 From: wangxinjiang Date: Tue, 25 Jul 2023 10:19:49 +0800 Subject: [PATCH 4/6] fix dcnv3_core_pytorch in fp16 --- .../bevformer/backbones/ops_dcnv3/functions/dcnv3_func.py | 2 +- .../baseline/models/backbones/ops_dcnv3/functions/dcnv3_func.py | 2 +- classification/ops_dcnv3/functions/dcnv3_func.py | 2 +- detection/ops_dcnv3/functions/dcnv3_func.py | 2 +- segmentation/ops_dcnv3/functions/dcnv3_func.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/functions/dcnv3_func.py b/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/functions/dcnv3_func.py index 4dac8fbd..47ef9bec 100644 --- a/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/functions/dcnv3_func.py +++ b/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/functions/dcnv3_func.py @@ -174,7 +174,7 @@ def dcnv3_core_pytorch( reshape(N_*group, group_channels, H_in, W_in) # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2 sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).transpose(1, 2).\ - flatten(0, 1) + flatten(0, 1).to(input_.dtype) # N_*group, group_channels, H_out*W_out, P_ sampling_input_ = F.grid_sample( input_, sampling_grid_, mode='bilinear', padding_mode='zeros', align_corners=False) diff --git a/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/functions/dcnv3_func.py b/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/functions/dcnv3_func.py index 4dac8fbd..47ef9bec 100644 --- a/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/functions/dcnv3_func.py +++ b/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/functions/dcnv3_func.py @@ -174,7 +174,7 @@ def dcnv3_core_pytorch( reshape(N_*group, group_channels, H_in, W_in) # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2 sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).transpose(1, 2).\ - flatten(0, 1) + flatten(0, 1).to(input_.dtype) # N_*group, group_channels, H_out*W_out, P_ sampling_input_ = F.grid_sample( input_, sampling_grid_, mode='bilinear', padding_mode='zeros', align_corners=False) diff --git a/classification/ops_dcnv3/functions/dcnv3_func.py b/classification/ops_dcnv3/functions/dcnv3_func.py index 2ccad588..6ab8fe09 100644 --- a/classification/ops_dcnv3/functions/dcnv3_func.py +++ b/classification/ops_dcnv3/functions/dcnv3_func.py @@ -206,7 +206,7 @@ def dcnv3_core_pytorch( reshape(N_*group, group_channels, H_in, W_in) # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2 sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).transpose(1, 2).\ - flatten(0, 1) + flatten(0, 1).to(input_.dtype) # N_*group, group_channels, H_out*W_out, P_ sampling_input_ = F.grid_sample( input_, sampling_grid_, mode='bilinear', padding_mode='zeros', align_corners=False) diff --git a/detection/ops_dcnv3/functions/dcnv3_func.py b/detection/ops_dcnv3/functions/dcnv3_func.py index 4dac8fbd..c9f63e9f 100644 --- a/detection/ops_dcnv3/functions/dcnv3_func.py +++ b/detection/ops_dcnv3/functions/dcnv3_func.py @@ -174,7 +174,7 @@ def dcnv3_core_pytorch( reshape(N_*group, group_channels, H_in, W_in) # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2 sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).transpose(1, 2).\ - flatten(0, 1) + flatten(0, 1).to(input_.dtype).to(input_.dtype) # N_*group, group_channels, H_out*W_out, P_ sampling_input_ = F.grid_sample( input_, sampling_grid_, mode='bilinear', padding_mode='zeros', align_corners=False) diff --git a/segmentation/ops_dcnv3/functions/dcnv3_func.py b/segmentation/ops_dcnv3/functions/dcnv3_func.py index 4dac8fbd..47ef9bec 100644 --- a/segmentation/ops_dcnv3/functions/dcnv3_func.py +++ b/segmentation/ops_dcnv3/functions/dcnv3_func.py @@ -174,7 +174,7 @@ def dcnv3_core_pytorch( reshape(N_*group, group_channels, H_in, W_in) # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2 sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).transpose(1, 2).\ - flatten(0, 1) + flatten(0, 1).to(input_.dtype) # N_*group, group_channels, H_out*W_out, P_ sampling_input_ = F.grid_sample( input_, sampling_grid_, mode='bilinear', padding_mode='zeros', align_corners=False) From 11641c600898aa72422dd637455bb0f4e7dfec3d Mon Sep 17 00:00:00 2001 From: wangxinjiang Date: Tue, 25 Jul 2023 10:21:39 +0800 Subject: [PATCH 5/6] fix forced float conversion --- .../bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu | 3 --- .../baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu | 3 --- classification/ops_dcnv3/src/cuda/dcnv3_cuda.cu | 3 --- detection/ops_dcnv3/src/cuda/dcnv3_cuda.cu | 3 --- segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu | 3 --- 5 files changed, 15 deletions(-) diff --git a/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu b/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu index a7c36ab8..97e77bcc 100644 --- a/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu +++ b/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu @@ -124,9 +124,6 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset, channels, group * group_channels); auto dtype = input.dtype(); - if (dtype == at::kHalf) { - dtype = at::kFloat; - } auto grad_input = at::zeros_like(input, dtype); auto grad_offset = at::zeros_like(offset, dtype); diff --git a/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu b/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu index a7c36ab8..97e77bcc 100644 --- a/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu +++ b/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu @@ -124,9 +124,6 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset, channels, group * group_channels); auto dtype = input.dtype(); - if (dtype == at::kHalf) { - dtype = at::kFloat; - } auto grad_input = at::zeros_like(input, dtype); auto grad_offset = at::zeros_like(offset, dtype); diff --git a/classification/ops_dcnv3/src/cuda/dcnv3_cuda.cu b/classification/ops_dcnv3/src/cuda/dcnv3_cuda.cu index 3800c8ee..36f4dc08 100644 --- a/classification/ops_dcnv3/src/cuda/dcnv3_cuda.cu +++ b/classification/ops_dcnv3/src/cuda/dcnv3_cuda.cu @@ -124,9 +124,6 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset, channels, group * group_channels); auto dtype = input.dtype(); - if (dtype == at::kHalf) { - dtype = at::kFloat; - } auto grad_input = at::zeros_like(input, dtype); auto grad_offset = at::zeros_like(offset, dtype); diff --git a/detection/ops_dcnv3/src/cuda/dcnv3_cuda.cu b/detection/ops_dcnv3/src/cuda/dcnv3_cuda.cu index a7c36ab8..97e77bcc 100644 --- a/detection/ops_dcnv3/src/cuda/dcnv3_cuda.cu +++ b/detection/ops_dcnv3/src/cuda/dcnv3_cuda.cu @@ -124,9 +124,6 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset, channels, group * group_channels); auto dtype = input.dtype(); - if (dtype == at::kHalf) { - dtype = at::kFloat; - } auto grad_input = at::zeros_like(input, dtype); auto grad_offset = at::zeros_like(offset, dtype); diff --git a/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu b/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu index a7c36ab8..97e77bcc 100644 --- a/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu +++ b/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu @@ -124,9 +124,6 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset, channels, group * group_channels); auto dtype = input.dtype(); - if (dtype == at::kHalf) { - dtype = at::kFloat; - } auto grad_input = at::zeros_like(input, dtype); auto grad_offset = at::zeros_like(offset, dtype); From 57a3c0c34774ab505e2e23357747c91e03d903fb Mon Sep 17 00:00:00 2001 From: wangxinjiang Date: Tue, 25 Jul 2023 10:24:04 +0800 Subject: [PATCH 6/6] remove redundant conversion --- detection/ops_dcnv3/functions/dcnv3_func.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detection/ops_dcnv3/functions/dcnv3_func.py b/detection/ops_dcnv3/functions/dcnv3_func.py index c9f63e9f..47ef9bec 100644 --- a/detection/ops_dcnv3/functions/dcnv3_func.py +++ b/detection/ops_dcnv3/functions/dcnv3_func.py @@ -174,7 +174,7 @@ def dcnv3_core_pytorch( reshape(N_*group, group_channels, H_in, W_in) # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2 sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).transpose(1, 2).\ - flatten(0, 1).to(input_.dtype).to(input_.dtype) + flatten(0, 1).to(input_.dtype) # N_*group, group_channels, H_out*W_out, P_ sampling_input_ = F.grid_sample( input_, sampling_grid_, mode='bilinear', padding_mode='zeros', align_corners=False)