[Enhancement] Support DeformConv TensorRT fp16 (open-mmlab#468)

grimoire · web-flow · commit 1a8d7aceaf8a · 2022-05-17T19:58:33.000+08:00
* add DCN TensorRT fp16 support

* fix getOutputDimensions
diff --git a/csrc/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu b/csrc/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu
@@ -61,6 +61,8 @@ void memcpyPermute(scalar_t *dst, const scalar_t *src, int *src_size, int *permu
 
 template void memcpyPermute<float>(float *dst, const float *src, int *src_size, int *permute,
                                    int src_dim, cudaStream_t stream);
+template void memcpyPermute<half>(half *dst, const half *src, int *src_size, int *permute,
+                                  int src_dim, cudaStream_t stream);  // fp16 explicit instantiation
 
 cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype, cudnnDataType_t *cudnn_dtype) {
   switch (trt_dtype) {
diff --git a/csrc/backend_ops/tensorrt/deform_conv/trt_deform_conv.cpp b/csrc/backend_ops/tensorrt/deform_conv/trt_deform_conv.cpp
@@ -67,7 +67,8 @@ nvinfer1::DimsExprs DeformableConvPluginDynamic::getOutputDimensions(
 bool DeformableConvPluginDynamic::supportsFormatCombination(
     int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT {
   if (pos == 0) {
-    return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
+    return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT ||
+             ioDesc[pos].type == nvinfer1::DataType::kHALF) &&
             ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
   } else {
     return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
@@ -136,9 +137,14 @@ int DeformableConvPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input
                          mDilation.d[1], mGroup, mDeformableGroup, im2col_step, m_cublas_handle,
                          stream);
       break;
+    case nvinfer1::DataType::kHALF:
+      deform_conv<half>((half *)x, (half *)weight, (half *)offset, (half *)output, workSpace, batch,
+                        channels, height, width, channels_out, kernel_w, kernel_h, mStride.d[0],
+                        mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0], mDilation.d[1],
+                        mGroup, mDeformableGroup, im2col_step, m_cublas_handle, stream);
+      break;
     default:
       return 1;
-      break;
   }
 
   return 0;
diff --git a/csrc/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cu b/csrc/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cu
@@ -163,3 +163,10 @@ template void deform_conv<float>(const float* input, const float* weight, const
                                  int dW, int dH, int padW, int padH, int dilationW, int dilationH,
                                  int group, int deformable_group, int im2col_step,
                                  cublasHandle_t cublas_handle, cudaStream_t stream);
+
+template void deform_conv<__half>(const __half* input, const __half* weight, const __half* offset,
+                                  __half* output, void* workspace, int batchSize, int nInputPlane,
+                                  int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH,
+                                  int dW, int dH, int padW, int padH, int dilationW, int dilationH,
+                                  int group, int deformable_group, int im2col_step,
+                                  cublasHandle_t cublas_handle, cudaStream_t stream);  // fp16
diff --git a/csrc/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cuh b/csrc/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cuh
@@ -63,13 +63,15 @@
 // modified from
 // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
 
+#include <cuda_fp16.h>
+
 #include "common_cuda_helper.hpp"
 
 template <typename scalar_t>
 __device__ __forceinline__ scalar_t deformable_im2col_bilinear(const scalar_t* __restrict__ input,
                                                                const int height, const int width,
-                                                               scalar_t h, scalar_t w) {
-  if (h <= -1.f || height <= h || w <= -1.f || width <= w) {
+                                                               float h, float w) {
+  if (h <= -1 || height <= h || w <= -1 || width <= w) {
     return 0;
   }
 
@@ -94,6 +96,33 @@ __device__ __forceinline__ scalar_t deformable_im2col_bilinear(const scalar_t* _
   return val;
 }
 
+template <>  // __half specialization: loads fp16 values, interpolates in float
+__device__ __forceinline__ __half deformable_im2col_bilinear(const __half* __restrict__ input,
+                                                             const int height, const int width,
+                                                             float h, float w) {
+  if (h <= -1 || height <= h || w <= -1 || width <= w) {  // outside (-1, height) x (-1, width)
+    return 0;
+  }
+  // Integer row/column at or just below the fractional sample point (h, w).
+  const int h_low = floorf(h);
+  const int w_low = floorf(w);
+  // Each corner read below is guarded on its own; a corner outside the image contributes 0.
+  input += h_low * width;
+  const float v1 = (h_low >= 0 && w_low >= 0) ? __half2float(input[w_low]) : 0.0f;
+  const int w_high = w_low + 1;
+  const float v2 = (h_low >= 0 && w_high <= width - 1) ? __half2float(input[w_high]) : 0.0f;
+  const float lw = w - w_low;  // fractional distance from column w_low
+  const float v_low = fmaf(v2 - v1, lw, v1);  // row h_low: v1 + (v2 - v1) * lw
+  input += width;  // step to row h_low + 1
+  const float v3 = (h_low <= height - 2 && w_low >= 0) ? __half2float(input[w_low]) : 0.0f;
+  const float v4 =
+      (h_low <= height - 2 && w_high <= width - 1) ? __half2float(input[w_high]) : 0.0f;
+  const float v_high = fmaf(v4 - v3, lw, v3);  // row h_low + 1: v3 + (v4 - v3) * lw
+  const float lh = h - h_low;  // fractional distance from row h_low
+  const float val = fmaf(v_high - v_low, lh, v_low);  // blend the two rows along h
+  return __float2half(val);  // convert the float result back to __half
+}
+
 template <typename scalar_t>
 __global__ void deformable_im2col_gpu_kernel(
     const int n, const scalar_t* __restrict__ data_im, const scalar_t* __restrict__ data_offset,
@@ -134,8 +163,8 @@ __global__ void deformable_im2col_gpu_kernel(
         const scalar_t offset_h = data_offset_ptr[data_offset_h];
         const int data_offset_w = data_offset_h + hw_col;
         const scalar_t offset_w = data_offset_ptr[data_offset_w];
-        const scalar_t h_im = h_in + i * dilation_h + offset_h;
-        const scalar_t w_im = w_in + j * dilation_w + offset_w;
+        const scalar_t h_im = h_in + i * dilation_h + (float)offset_h;
+        const scalar_t w_im = w_in + j * dilation_w + (float)offset_w;
         const scalar_t val = deformable_im2col_bilinear(data_im_ptr, height, width, h_im, w_im);
         *data_col_ptr = val;
         data_col_ptr += data_col_step;