diff --git a/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/functions/dcnv3_func.py b/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/functions/dcnv3_func.py
index 4dac8fbd..47ef9bec 100644
--- a/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/functions/dcnv3_func.py
+++ b/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/functions/dcnv3_func.py
@@ -174,7 +174,7 @@ def dcnv3_core_pytorch(
         reshape(N_*group, group_channels, H_in, W_in)
     # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2
     sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).transpose(1, 2).\
-        flatten(0, 1)
+        flatten(0, 1).to(input_.dtype)
     # N_*group, group_channels, H_out*W_out, P_
     sampling_input_ = F.grid_sample(
         input_, sampling_grid_, mode='bilinear', padding_mode='zeros', align_corners=False)
diff --git a/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu b/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu
index 52840957..97e77bcc 100644
--- a/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu
+++ b/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu
@@ -68,7 +68,7 @@ at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
     // AT_DISPATCH_FLOATING_TYPES(
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         input.type(), "ms_deform_attn_forward_cuda", ([&] {
-            dcnv3_im2col_cuda(
+            dcnv3_im2col_cuda(
                 at::cuda::getCurrentCUDAStream(),
                 input.data<scalar_t>() + n * im2col_step_ * per_input_size,
                 offset.data<scalar_t>() +
@@ -124,9 +124,6 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
                          channels, group * group_channels);
 
     auto dtype = input.dtype();
-    if (dtype == at::kHalf) {
-        dtype = at::kFloat;
-    }
     auto grad_input = at::zeros_like(input, dtype);
     auto grad_offset = at::zeros_like(offset, dtype);
 
@@ -146,7 +143,7 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
     // AT_DISPATCH_FLOATING_TYPES(
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         input.type(), "ms_deform_attn_backward_cuda", ([&] {
-            dcnv3_col2im_cuda(
+            dcnv3_col2im_cuda(
                 at::cuda::getCurrentCUDAStream(),
                 grad_output_g.data<scalar_t>(),
                 input.data<scalar_t>() + n * im2col_step_ * per_input_size,
diff --git a/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh b/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
index b551ba3f..d48f15e1 100644
--- a/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
+++ b/autonomous_driving/occupancy_prediction/projects/mmdet3d_plugin/bevformer/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
@@ -14,7 +14,6 @@
 #include <cstring>
 
 #include <ATen/ATen.h>
-#include <ATen/OpMathType.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <THC/THCAtomics.cuh>
 
@@ -27,7 +26,7 @@ inline int GET_BLOCKS(const int N, const int num_threads) {
     return (N + num_threads - 1) / num_threads;
 }
 
-#define opmath_t at::opmath_type<scalar_t>
+#define opmath_t scalar_t
 
 template <typename scalar_t>
 __device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data,
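Note on the Python reference path patched above: F.grid_sample requires the input and the sampling grid to share a dtype, and the grid here is built in fp32, so it must be cast once the features are fp16. That is all the added .to(input_.dtype) does. A minimal standalone sketch of the failure and the fix (assumes a CUDA device; shapes are illustrative):

    import torch
    import torch.nn.functional as F

    x = torch.rand(1, 4, 8, 8, device='cuda').half()       # fp16 feature map
    grid = torch.rand(1, 6, 6, 2, device='cuda') * 2 - 1   # grid built in fp32
    # F.grid_sample(x, grid)  # fails: input and grid dtypes must match
    out = F.grid_sample(x, grid.to(x.dtype), mode='bilinear',
                        padding_mode='zeros', align_corners=False)
    print(out.dtype)  # torch.float16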
diff --git a/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/functions/dcnv3_func.py b/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/functions/dcnv3_func.py
index 4dac8fbd..47ef9bec 100644
--- a/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/functions/dcnv3_func.py
+++ b/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/functions/dcnv3_func.py
@@ -174,7 +174,7 @@ def dcnv3_core_pytorch(
         reshape(N_*group, group_channels, H_in, W_in)
     # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2
     sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).transpose(1, 2).\
-        flatten(0, 1)
+        flatten(0, 1).to(input_.dtype)
     # N_*group, group_channels, H_out*W_out, P_
     sampling_input_ = F.grid_sample(
         input_, sampling_grid_, mode='bilinear', padding_mode='zeros', align_corners=False)
diff --git a/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu b/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu
index 52840957..97e77bcc 100644
--- a/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu
+++ b/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu
@@ -68,7 +68,7 @@ at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
     // AT_DISPATCH_FLOATING_TYPES(
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         input.type(), "ms_deform_attn_forward_cuda", ([&] {
-            dcnv3_im2col_cuda(
+            dcnv3_im2col_cuda(
                 at::cuda::getCurrentCUDAStream(),
                 input.data<scalar_t>() + n * im2col_step_ * per_input_size,
                 offset.data<scalar_t>() +
@@ -124,9 +124,6 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
                          channels, group * group_channels);
 
     auto dtype = input.dtype();
-    if (dtype == at::kHalf) {
-        dtype = at::kFloat;
-    }
     auto grad_input = at::zeros_like(input, dtype);
     auto grad_offset = at::zeros_like(offset, dtype);
 
@@ -146,7 +143,7 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
     // AT_DISPATCH_FLOATING_TYPES(
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         input.type(), "ms_deform_attn_backward_cuda", ([&] {
-            dcnv3_col2im_cuda(
+            dcnv3_col2im_cuda(
                 at::cuda::getCurrentCUDAStream(),
                 grad_output_g.data<scalar_t>(),
                 input.data<scalar_t>() + n * im2col_step_ * per_input_size,
diff --git a/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh b/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
index b551ba3f..d48f15e1 100644
--- a/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
+++ b/autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
@@ -14,7 +14,6 @@
 #include <cstring>
 
 #include <ATen/ATen.h>
-#include <ATen/OpMathType.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <THC/THCAtomics.cuh>
 
@@ -27,7 +26,7 @@ inline int GET_BLOCKS(const int N, const int num_threads) {
     return (N + num_threads - 1) / num_threads;
 }
 
-#define opmath_t at::opmath_type<scalar_t>
+#define opmath_t scalar_t
 
 template <typename scalar_t>
 __device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data,
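The kernel-side counterpart is the opmath_t change in the .cuh files: at::opmath_type<scalar_t> maps reduced-precision types such as Half to float, so the old definition made the kernels do their arithmetic in fp32 even for fp16 tensors (and needed the ATen/OpMathType.h include removed above). Redefining opmath_t as scalar_t keeps every intermediate in the tensor's own dtype. A rough Python analogue of the two policies (only an illustration; PyTorch's built-in reductions may still accumulate in higher precision internally):

    import torch

    x = torch.rand(1024, dtype=torch.float16)
    w = torch.rand(1024, dtype=torch.float16)

    pure_half = (x * w).sum()                         # arithmetic stays in fp16
    float_acc = (x.float() * w.float()).sum().half()  # fp32 math, cast back once
    print(pure_half.item(), float_acc.item())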
diff --git a/classification/ops_dcnv3/functions/dcnv3_func.py b/classification/ops_dcnv3/functions/dcnv3_func.py
index 2ccad588..6ab8fe09 100644
--- a/classification/ops_dcnv3/functions/dcnv3_func.py
+++ b/classification/ops_dcnv3/functions/dcnv3_func.py
@@ -206,7 +206,7 @@ def dcnv3_core_pytorch(
         reshape(N_*group, group_channels, H_in, W_in)
     # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2
     sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).transpose(1, 2).\
-        flatten(0, 1)
+        flatten(0, 1).to(input_.dtype)
     # N_*group, group_channels, H_out*W_out, P_
     sampling_input_ = F.grid_sample(
         input_, sampling_grid_, mode='bilinear', padding_mode='zeros', align_corners=False)
diff --git a/classification/ops_dcnv3/src/cuda/dcnv3_cuda.cu b/classification/ops_dcnv3/src/cuda/dcnv3_cuda.cu
index c8ee4797..36f4dc08 100644
--- a/classification/ops_dcnv3/src/cuda/dcnv3_cuda.cu
+++ b/classification/ops_dcnv3/src/cuda/dcnv3_cuda.cu
@@ -68,7 +68,7 @@ at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
     // AT_DISPATCH_FLOATING_TYPES(
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         input.type(), "ms_deform_attn_forward_cuda", ([&] {
-            dcnv3_im2col_cuda(
+            dcnv3_im2col_cuda(
                 at::cuda::getCurrentCUDAStream(),
                 input.data<scalar_t>() + n * im2col_step_ * per_input_size,
                 offset.data<scalar_t>() +
@@ -124,9 +124,6 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
                          channels, group * group_channels);
 
     auto dtype = input.dtype();
-    if (dtype == at::kHalf) {
-        dtype = at::kFloat;
-    }
     auto grad_input = at::zeros_like(input, dtype);
     auto grad_offset = at::zeros_like(offset, dtype);
 
@@ -146,7 +143,7 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
     // AT_DISPATCH_FLOATING_TYPES(
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         input.type(), "ms_deform_attn_backward_cuda", ([&] {
-            dcnv3_col2im_cuda(
+            dcnv3_col2im_cuda(
                 at::cuda::getCurrentCUDAStream(),
                 grad_output_g.data<scalar_t>(),
                 input.data<scalar_t>() + n * im2col_step_ * per_input_size,
diff --git a/classification/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh b/classification/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
index b2bbf844..f03c11dc 100644
--- a/classification/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
+++ b/classification/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
@@ -14,7 +14,6 @@
 #include <cstring>
 
 #include <ATen/ATen.h>
-#include <ATen/OpMathType.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <THC/THCAtomics.cuh>
 
@@ -27,7 +26,7 @@ inline int GET_BLOCKS(const int N, const int num_threads) {
     return (N + num_threads - 1) / num_threads;
 }
 
-#define opmath_t at::opmath_type<scalar_t>
+#define opmath_t scalar_t
 
 template <typename scalar_t>
 __device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data,
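On the backward path, dropping the kHalf-to-kFloat branch means grad_input, grad_offset and grad_mask are now allocated in the input's own dtype by at::zeros_like instead of being promoted to fp32. A Python-level sketch of the before/after behavior (not the extension code itself):

    import torch

    inp = torch.rand(2, 3, dtype=torch.float16)
    grad_now = torch.zeros_like(inp)                       # fp16, matches input
    grad_old = torch.zeros_like(inp, dtype=torch.float32)  # the removed promotion
    print(grad_now.dtype, grad_old.dtype)  # torch.float16 torch.float32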
diff --git a/classification/ops_dcnv3/test.py b/classification/ops_dcnv3/test.py
index 5a0a4e80..9621d47c 100644
--- a/classification/ops_dcnv3/test.py
+++ b/classification/ops_dcnv3/test.py
@@ -61,6 +61,36 @@ def check_forward_equal_with_pytorch_double():
     print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
 
 
+@torch.no_grad()
+def check_forward_equal_with_pytorch_half():
+    input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
+    offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
+    mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
+    mask /= mask.sum(-1, keepdim=True)
+    mask = mask.reshape(N, H_out, W_out, M*P)
+
+    output_pytorch = dcnv3_core_pytorch(
+        input.half(),
+        offset.half(),
+        mask.half(),
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale, remove_center).detach().cpu()
+
+    im2col_step = 2
+    output_cuda = DCNv3Function.apply(
+        input.half(),
+        offset.half(),
+        mask.half(),
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale,
+        im2col_step, remove_center).detach().cpu()
+
+    fwdok = torch.allclose(output_cuda, output_pytorch)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() /
+                   output_pytorch.abs()).max()
+    print('>>> forward half')
+    print(f'* {fwdok} check_forward_equal_with_pytorch_half: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+
 @torch.no_grad()
 def check_forward_equal_with_pytorch_float():
     input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
@@ -154,6 +184,68 @@ def check_backward_equal_with_pytorch_double(channels=4, grad_input=True, grad_offset=True, grad_mask=True):
         f'* {bwdok} mask_grad check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
 
 
+def check_backward_equal_with_pytorch_half(channels=4, grad_input=True, grad_offset=True, grad_mask=True):
+    # H_in, W_in = 4, 4
+    N = 2
+    M = 2
+    H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
+    W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
+
+    D = channels
+    input0 = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
+    offset0 = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
+    mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
+    mask0 /= mask0.sum(-1, keepdim=True)
+    mask0 = mask0.reshape(N, H_out, W_out, M*P)
+    input0.requires_grad = grad_input
+    offset0.requires_grad = grad_offset
+    mask0.requires_grad = grad_mask
+
+    output_pytorch = dcnv3_core_pytorch(
+        input0.half(),
+        offset0.half(),
+        mask0.half(),
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale, remove_center)
+    output_pytorch.sum().backward()
+
+    input1 = input0.detach()
+    offset1 = offset0.detach()
+    mask1 = mask0.detach()
+    input1.requires_grad = grad_input
+    offset1.requires_grad = grad_offset
+    mask1.requires_grad = grad_mask
+
+    im2col_step = 2
+    output_cuda = DCNv3Function.apply(
+        input1.half(),
+        offset1.half(),
+        mask1.half(),
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale,
+        im2col_step, remove_center)
+    output_cuda.sum().backward()
+
+    print(f'>>> backward half: channels {D}')
+    bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (input0.grad - input1.grad).abs().max()
+    max_rel_err = ((input0.grad - input1.grad).abs() /
+                   input0.grad.abs()).max()
+    print(
+        f'* {bwdok} input_grad check_backward_equal_with_pytorch_half: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+    bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (offset0.grad - offset1.grad).abs().max()
+    max_rel_err = ((offset0.grad - offset1.grad).abs() /
+                   offset0.grad.abs()).max()
+    print(
+        f'* {bwdok} offset_grad check_backward_equal_with_pytorch_half: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+    bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (mask0.grad - mask1.grad).abs().max()
+    max_rel_err = ((mask0.grad - mask1.grad).abs() /
+                   mask0.grad.abs()).max()
+    print(
+        f'* {bwdok} mask_grad check_backward_equal_with_pytorch_half: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
 def check_backward_equal_with_pytorch_float(channels=4, grad_input=True, grad_offset=True, grad_mask=True):
     # H_in, W_in = 4, 4
     N = 2
@@ -254,9 +346,12 @@
 
 if __name__ == '__main__':
     check_forward_equal_with_pytorch_double()
+    check_forward_equal_with_pytorch_half()
     check_forward_equal_with_pytorch_float()
     for channels in [1, 16, 30, 32, 64, 71, 1025]:
         check_backward_equal_with_pytorch_double(channels, True, True, True)
+    for channels in [1, 16, 30, 32, 64, 71, 1025]:
+        check_backward_equal_with_pytorch_half(channels, True, True, True)
     for channels in [1, 16, 30, 32, 64, 71, 1025]:
         check_backward_equal_with_pytorch_float(channels, True, True, True)
     for i in range(3):
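The half checks added above compare gradients with rtol=1e-2 and atol=1e-3, much looser than the float and double checks, because fp16 keeps only about three decimal digits and pure-half accumulation drops low-order bits outright. A small illustration:

    import torch

    print(torch.finfo(torch.float16).eps)          # 0.0009765625, ~3 decimal digits
    a = torch.tensor(2048.0, dtype=torch.float16)
    print(a + 1.0)                                 # 2048.0: the ulp here is 2.0,
                                                   # so the added 1 is rounded away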
diff --git a/detection/ops_dcnv3/functions/dcnv3_func.py b/detection/ops_dcnv3/functions/dcnv3_func.py
index 4dac8fbd..47ef9bec 100644
--- a/detection/ops_dcnv3/functions/dcnv3_func.py
+++ b/detection/ops_dcnv3/functions/dcnv3_func.py
@@ -174,7 +174,7 @@ def dcnv3_core_pytorch(
         reshape(N_*group, group_channels, H_in, W_in)
     # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2
     sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).transpose(1, 2).\
-        flatten(0, 1)
+        flatten(0, 1).to(input_.dtype)
     # N_*group, group_channels, H_out*W_out, P_
     sampling_input_ = F.grid_sample(
         input_, sampling_grid_, mode='bilinear', padding_mode='zeros', align_corners=False)
diff --git a/detection/ops_dcnv3/src/cuda/dcnv3_cuda.cu b/detection/ops_dcnv3/src/cuda/dcnv3_cuda.cu
index 52840957..97e77bcc 100644
--- a/detection/ops_dcnv3/src/cuda/dcnv3_cuda.cu
+++ b/detection/ops_dcnv3/src/cuda/dcnv3_cuda.cu
@@ -68,7 +68,7 @@ at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
     // AT_DISPATCH_FLOATING_TYPES(
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         input.type(), "ms_deform_attn_forward_cuda", ([&] {
-            dcnv3_im2col_cuda(
+            dcnv3_im2col_cuda(
                 at::cuda::getCurrentCUDAStream(),
                 input.data<scalar_t>() + n * im2col_step_ * per_input_size,
                 offset.data<scalar_t>() +
@@ -124,9 +124,6 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
                          channels, group * group_channels);
 
     auto dtype = input.dtype();
-    if (dtype == at::kHalf) {
-        dtype = at::kFloat;
-    }
     auto grad_input = at::zeros_like(input, dtype);
     auto grad_offset = at::zeros_like(offset, dtype);
 
@@ -146,7 +143,7 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
     // AT_DISPATCH_FLOATING_TYPES(
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         input.type(), "ms_deform_attn_backward_cuda", ([&] {
-            dcnv3_col2im_cuda(
+            dcnv3_col2im_cuda(
                 at::cuda::getCurrentCUDAStream(),
                 grad_output_g.data<scalar_t>(),
                 input.data<scalar_t>() + n * im2col_step_ * per_input_size,
diff --git a/detection/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh b/detection/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
index b551ba3f..d48f15e1 100644
--- a/detection/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
+++ b/detection/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
@@ -14,7 +14,6 @@
 #include <cstring>
 
 #include <ATen/ATen.h>
-#include <ATen/OpMathType.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <THC/THCAtomics.cuh>
 
@@ -27,7 +26,7 @@ inline int GET_BLOCKS(const int N, const int num_threads) {
     return (N + num_threads - 1) / num_threads;
 }
 
-#define opmath_t at::opmath_type<scalar_t>
+#define opmath_t scalar_t
 
 template <typename scalar_t>
 __device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data,
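The detection copy of the op does not take the remove_center argument that the classification version passes, so the half tests added below mirror the classification ones minus that parameter. They build the modulation masks the same way: random positive weights normalized so the weights over each sampling-point set sum to 1. That construction in isolation (shapes are illustrative; P = 9 would correspond to a 3x3 kernel):

    import torch

    N, H, W, M, P = 2, 4, 4, 2, 9               # batch, spatial, groups, points
    mask = torch.rand(N, H, W, M, P) + 1e-5     # strictly positive weights
    mask /= mask.sum(-1, keepdim=True)          # each P-point set now sums to 1
    assert mask.sum(-1).allclose(torch.ones(N, H, W, M))
    mask = mask.reshape(N, H, W, M * P)         # flattened layout the op expects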
diff --git a/detection/ops_dcnv3/test.py b/detection/ops_dcnv3/test.py
index 0277bef4..f36dd4ce 100644
--- a/detection/ops_dcnv3/test.py
+++ b/detection/ops_dcnv3/test.py
@@ -30,6 +30,36 @@
 torch.manual_seed(3)
 
 
+@torch.no_grad()
+def check_forward_equal_with_pytorch_half():
+    input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
+    offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
+    mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
+    mask /= mask.sum(-1, keepdim=True)
+    mask = mask.reshape(N, H_out, W_out, M*P)
+
+    output_pytorch = dcnv3_core_pytorch(
+        input.half(),
+        offset.half(),
+        mask.half(),
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale).detach().cpu()
+
+    im2col_step = 2
+    output_cuda = DCNv3Function.apply(
+        input.half(),
+        offset.half(),
+        mask.half(),
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale,
+        im2col_step).detach().cpu()
+
+    fwdok = torch.allclose(output_cuda, output_pytorch)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() /
+                   output_pytorch.abs()).max()
+    print('>>> forward half')
+    print(f'* {fwdok} check_forward_equal_with_pytorch_half: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+
 @torch.no_grad()
 def check_forward_equal_with_pytorch_double():
     input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
@@ -90,6 +120,69 @@ def check_forward_equal_with_pytorch_float():
     print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
 
 
+def check_backward_equal_with_pytorch_half(channels=4, grad_input=True, grad_offset=True, grad_mask=True):
+    # H_in, W_in = 4, 4
+    N = 2
+    M = 2
+    H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
+    W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
+
+    D = channels
+    input0 = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
+    offset0 = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
+    mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
+    mask0 /= mask0.sum(-1, keepdim=True)
+    mask0 = mask0.reshape(N, H_out, W_out, M*P)
+    input0.requires_grad = grad_input
+    offset0.requires_grad = grad_offset
+    mask0.requires_grad = grad_mask
+
+    output_pytorch = dcnv3_core_pytorch(
+        input0.half(),
+        offset0.half(),
+        mask0.half(),
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale)
+    output_pytorch.sum().backward()
+
+    input1 = input0.detach()
+    offset1 = offset0.detach()
+    mask1 = mask0.detach()
+    input1.requires_grad = grad_input
+    offset1.requires_grad = grad_offset
+    mask1.requires_grad = grad_mask
+
+    im2col_step = 2
+    output_cuda = DCNv3Function.apply(
+        input1.half(),
+        offset1.half(),
+        mask1.half(),
+        Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale,
+        im2col_step)
+    output_cuda.sum().backward()
+
+    print(f'>>> backward half: channels {D}')
+    bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (input0.grad - input1.grad).abs().max()
+    max_rel_err = ((input0.grad - input1.grad).abs() /
+                   input0.grad.abs()).max()
+    print(
+        f'* {bwdok} input_grad check_backward_equal_with_pytorch_half: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+    bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (offset0.grad - offset1.grad).abs().max()
+    max_rel_err = ((offset0.grad - offset1.grad).abs() /
+                   offset0.grad.abs()).max()
+    print(
+        f'* {bwdok} offset_grad check_backward_equal_with_pytorch_half: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+    bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3)
+    max_abs_err = (mask0.grad - mask1.grad).abs().max()
+    max_rel_err = ((mask0.grad - mask1.grad).abs() /
+                   mask0.grad.abs()).max()
+    print(
+        f'* {bwdok} mask_grad check_backward_equal_with_pytorch_half: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+
 def check_backward_equal_with_pytorch_double(channels=4, grad_input=True, grad_offset=True, grad_mask=True):
     # H_in, W_in = 4, 4
     N = 2
@@ -252,8 +345,11 @@ def check_time_cost(im2col_step=128):
 
 
 if __name__ == '__main__':
+    check_forward_equal_with_pytorch_half()
     check_forward_equal_with_pytorch_double()
     check_forward_equal_with_pytorch_float()
+    for channels in [1, 16, 30, 32, 64, 71, 1025]:
+        check_backward_equal_with_pytorch_half(channels, True, True, True)
     for channels in [1, 16, 30, 32, 64, 71, 1025]:
         check_backward_equal_with_pytorch_double(channels, True, True, True)
     for channels in [1, 16, 30, 32, 64, 71, 1025]:
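One caveat when reading the test output: torch.allclose under its default tolerances (rtol=1e-5, atol=1e-8) flags differences of even a couple of fp16 ulps, which is why the backward checks pass explicit bounds. For example:

    import torch

    a = torch.tensor([1.0000], dtype=torch.float16)
    b = torch.tensor([1.0020], dtype=torch.float16)    # off by about 2 fp16 ulps
    print(torch.allclose(a, b))                        # False under the defaults
    print(torch.allclose(a, b, rtol=1e-2, atol=1e-3))  # True under the relaxed bounds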
diff --git a/segmentation/ops_dcnv3/functions/dcnv3_func.py b/segmentation/ops_dcnv3/functions/dcnv3_func.py
index 4dac8fbd..47ef9bec 100644
--- a/segmentation/ops_dcnv3/functions/dcnv3_func.py
+++ b/segmentation/ops_dcnv3/functions/dcnv3_func.py
@@ -174,7 +174,7 @@ def dcnv3_core_pytorch(
         reshape(N_*group, group_channels, H_in, W_in)
     # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2
     sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).transpose(1, 2).\
-        flatten(0, 1)
+        flatten(0, 1).to(input_.dtype)
     # N_*group, group_channels, H_out*W_out, P_
     sampling_input_ = F.grid_sample(
         input_, sampling_grid_, mode='bilinear', padding_mode='zeros', align_corners=False)
diff --git a/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu b/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu
index 52840957..97e77bcc 100644
--- a/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu
+++ b/segmentation/ops_dcnv3/src/cuda/dcnv3_cuda.cu
@@ -68,7 +68,7 @@ at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
     // AT_DISPATCH_FLOATING_TYPES(
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         input.type(), "ms_deform_attn_forward_cuda", ([&] {
-            dcnv3_im2col_cuda(
+            dcnv3_im2col_cuda(
                 at::cuda::getCurrentCUDAStream(),
                 input.data<scalar_t>() + n * im2col_step_ * per_input_size,
                 offset.data<scalar_t>() +
@@ -124,9 +124,6 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
                          channels, group * group_channels);
 
     auto dtype = input.dtype();
-    if (dtype == at::kHalf) {
-        dtype = at::kFloat;
-    }
     auto grad_input = at::zeros_like(input, dtype);
     auto grad_offset = at::zeros_like(offset, dtype);
 
@@ -146,7 +143,7 @@ dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
     // AT_DISPATCH_FLOATING_TYPES(
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(
         input.type(), "ms_deform_attn_backward_cuda", ([&] {
-            dcnv3_col2im_cuda(
+            dcnv3_col2im_cuda(
                 at::cuda::getCurrentCUDAStream(),
                 grad_output_g.data<scalar_t>(),
                 input.data<scalar_t>() + n * im2col_step_ * per_input_size,
diff --git a/segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh b/segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
index b551ba3f..d48f15e1 100644
--- a/segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
+++ b/segmentation/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh
@@ -14,7 +14,6 @@
 #include <cstring>
 
 #include <ATen/ATen.h>
-#include <ATen/OpMathType.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <THC/THCAtomics.cuh>
 
@@ -27,7 +26,7 @@ inline int GET_BLOCKS(const int N, const int num_threads) {
     return (N + num_threads - 1) / num_threads;
 }
 
-#define opmath_t at::opmath_type<scalar_t>
+#define opmath_t scalar_t
 
 template <typename scalar_t>
 __device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data,
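A final practical note: each patched ops_dcnv3 directory carries its own copy of the CUDA extension, so every copy has to be rebuilt (via whatever build script that directory ships, typically a setup.py) before its test.py exercises the new half-precision paths.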