Commit e40c43d

chacha21 authored and cudawarped committed

cudaarithm: add more datatypes to npp transpose and update to stream context api for npp >=10.1

1 parent e247b68 · commit e40c43d

File tree

3 files changed (+99, -27 lines)

modules/cudaarithm/src/cuda/transpose.cu

Lines changed: 74 additions & 25 deletions
@@ -56,40 +56,89 @@ using namespace cv;
 using namespace cv::cuda;
 using namespace cv::cudev;
 
-void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
+namespace
 {
-    GpuMat src = getInputMat(_src, stream);
-
-    const size_t elemSize = src.elemSize();
+    template <int DEPTH> struct NppTransposeFunc
+    {
+        typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
 
-    CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 );
+#if CV_USE_NPP_STREAM_CTX
+        typedef NppStatus(*func_t)(const npp_type* pSrc, int srcStep, npp_type* pDst, int dstStep, NppiSize srcSize, NppStreamContext stream);
+#else
+        typedef NppStatus(*func_t)(const npp_type* pSrc, int srcStep, npp_type* pDst, int dstStep, NppiSize srcSize);
+#endif
+    };
 
-    GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);
+    template <int DEPTH, typename NppTransposeFunc<DEPTH>::func_t func> struct NppTranspose
+    {
+        typedef typename NppTransposeFunc<DEPTH>::npp_type npp_type;
 
-    if (elemSize == 1)
+        static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream)
     {
-        NppStreamHandler h(StreamAccessor::getStream(stream));
-
-        NppiSize sz;
-        sz.width = src.cols;
-        sz.height = src.rows;
+            NppiSize srcsz;
+            srcsz.height = src.rows;
+            srcsz.width = src.cols;
+
+#if CV_USE_NPP_STREAM_CTX
+            NppStreamContext nppStreamContext{};
+            nppSafeCall(nppGetStreamContext(&nppStreamContext));
+            nppStreamContext.hStream = stream;
+            nppSafeCall(func(src.ptr<npp_type>(), static_cast<int>(src.step), dst.ptr<npp_type>(), static_cast<int>(dst.step), srcsz, nppStreamContext));
+#else
+            cv::cuda::NppStreamHandler h(stream);
+            nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), dst.ptr<npp_type>(), static_cast<int>(dst.step), srcsz) );
+#endif
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+}
 
-        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
+void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
+{
+    GpuMat src = getInputMat(_src, stream);
+    CV_Assert(!src.empty());
+    const size_t elemSize = src.elemSize();
+    CV_Assert((elemSize == 1) || (elemSize == 2) || (elemSize == 3) || (elemSize == 4) || (elemSize == 6) || (elemSize == 8) || (elemSize == 12) || (elemSize == 16));
+    GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);
 
-        if (!stream)
-            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
-    }
-    else if (elemSize == 4)
+    if ((src.rows == 1) && (src.cols == 1))
+        src.copyTo(dst, stream);
+    else if (src.rows == 1)
+        src.reshape(0, src.cols).copyTo(dst, stream);
+    else if ((src.cols == 1) && src.isContinuous())
+        src.reshape(0, src.cols).copyTo(dst, stream);
+    else
     {
-        gridTranspose(globPtr<int>(src), globPtr<int>(dst), stream);
-    }
-    else // if (elemSize == 8)
-    {
-        gridTranspose(globPtr<double>(src), globPtr<double>(dst), stream);
+        typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream);
+        // if no direct mapping exists between DEPTH+CHANNELS and the nppiTranspose supported type, we use a nppiTranspose of a similar elemSize
+#if CV_USE_NPP_STREAM_CTX
+        static const func_t funcs[8][4] = {
+            {NppTranspose<CV_8U, nppiTranspose_8u_C1R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTranspose<CV_8U, nppiTranspose_8u_C3R_Ctx>::call, NppTranspose<CV_8U, nppiTranspose_8u_C4R_Ctx>::call},
+            {NppTranspose<CV_8U, nppiTranspose_8u_C1R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTranspose<CV_8U, nppiTranspose_8u_C3R_Ctx>::call, NppTranspose<CV_8U, nppiTranspose_8u_C4R_Ctx>::call},
+            {NppTranspose<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C3R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C4R_Ctx>::call},
+            {NppTranspose<CV_16S, nppiTranspose_16s_C1R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTranspose<CV_16S, nppiTranspose_16s_C3R_Ctx>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R_Ctx>::call},
+            {NppTranspose<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C3R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C4R_Ctx>::call},
+            {NppTranspose<CV_32F, nppiTranspose_32f_C1R_Ctx>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R_Ctx>::call, NppTranspose<CV_32F, nppiTranspose_32f_C3R_Ctx>::call, NppTranspose<CV_32F, nppiTranspose_32f_C4R_Ctx>::call},
+            {NppTranspose<CV_16S, nppiTranspose_16s_C4R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C4R_Ctx>::call, nullptr, nullptr},
+            {NppTranspose<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C3R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C4R_Ctx>::call}
+        };
+#else
+        static const func_t funcs[8][4] = {
+            {NppTranspose<CV_8U, nppiTranspose_8u_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C3R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C4R>::call},
+            {NppTranspose<CV_8U, nppiTranspose_8u_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C3R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C4R>::call},
+            {NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C3R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C4R>::call},
+            {NppTranspose<CV_16S, nppiTranspose_16s_C1R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C3R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call},
+            {NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C3R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C4R>::call},
+            {NppTranspose<CV_32F, nppiTranspose_32f_C1R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call, NppTranspose<CV_32F, nppiTranspose_32f_C3R>::call, NppTranspose<CV_32F, nppiTranspose_32f_C4R>::call},
+            {NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C4R>::call, nullptr, nullptr},
+            {NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C3R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C4R>::call}
+        };
+#endif
+        const func_t func = funcs[src.depth()][src.channels() - 1];
+        CV_Assert(func != nullptr);
+        func(src, dst, StreamAccessor::getStream(stream));
     }
-
-    syncOutput(dst, _dst, stream);
 }
 
 #endif
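For readers skimming the diff: the dispatch table above selects an NPP transpose routine by element size alone, so formats without a dedicated NPP entry point (for example CV_8SC3, whose 3-byte elements match the unsigned 8-bit 3-channel routine) are transposed by reinterpreting the bytes, which is safe because transpose only relocates whole elements. Below is a minimal usage sketch from the caller's side; it is not part of the commit, and the 640x480 size and CV_32FC3 type are arbitrary illustrative choices.

    #include <opencv2/core.hpp>
    #include <opencv2/cudaarithm.hpp>

    int main()
    {
        // Host matrix with 12-byte elements (CV_32FC3), a layout the old
        // implementation rejected (it only accepted elemSize 1, 4 or 8).
        cv::Mat host(480, 640, CV_32FC3, cv::Scalar::all(1));

        cv::cuda::GpuMat src, dst;
        cv::cuda::Stream stream;

        src.upload(host, stream);               // asynchronous copy on the user stream
        cv::cuda::transpose(src, dst, stream);  // NPP transpose issued on the same stream
        stream.waitForCompletion();             // dst is now 640x480, CV_32FC3

        return 0;
    }

When the default stream is used (no Stream argument, i.e. a null cudaStream_t), the new code still calls cudaDeviceSynchronize() before returning, matching the old behaviour.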

modules/cudaarithm/test/test_core.cpp

Lines changed: 22 additions & 2 deletions
@@ -187,6 +187,7 @@ PARAM_TEST_CASE(Transpose, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
     cv::Size size;
     int type;
     bool useRoi;
+    Stream stream;
 
     virtual void SetUp()
     {
@@ -218,7 +219,7 @@ CUDA_TEST_P(Transpose, Accuracy)
     else
     {
         cv::cuda::GpuMat dst = createMat(cv::Size(size.height, size.width), type, useRoi);
-        cv::cuda::transpose(loadMat(src, useRoi), dst);
+        cv::cuda::transpose(loadMat(src, useRoi), dst, stream);
 
         cv::Mat dst_gold;
         cv::transpose(src, dst_gold);
@@ -231,12 +232,31 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Transpose, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     testing::Values(MatType(CV_8UC1),
+                    MatType(CV_8UC2),
+                    MatType(CV_8UC3),
                     MatType(CV_8UC4),
+                    MatType(CV_8SC1),
+                    MatType(CV_8SC2),
+                    MatType(CV_8SC3),
+                    MatType(CV_8SC4),
+                    MatType(CV_16UC1),
                     MatType(CV_16UC2),
+                    MatType(CV_16UC3),
+                    MatType(CV_16UC4),
+                    MatType(CV_16SC1),
                     MatType(CV_16SC2),
+                    MatType(CV_16SC3),
+                    MatType(CV_16SC4),
                     MatType(CV_32SC1),
                     MatType(CV_32SC2),
-                    MatType(CV_64FC1)),
+                    MatType(CV_32SC3),
+                    MatType(CV_32SC4),
+                    MatType(CV_32FC1),
+                    MatType(CV_32FC2),
+                    MatType(CV_32FC3),
+                    MatType(CV_32FC4),
+                    MatType(CV_64FC1),
+                    MatType(CV_64FC2)),
     WHOLE_SUBMAT));
 
 ////////////////////////////////////////////////////////////////////////////////

modules/cudev/include/opencv2/cudev/common.hpp

Lines changed: 3 additions & 0 deletions
@@ -57,6 +57,9 @@ namespace cv { namespace cudev {
 
 using namespace cv::cuda;
 
+// CV_USE_NPP_STREAM_CTX
+#define CV_USE_NPP_STREAM_CTX (NPP_VERSION >= (10 * 1000 + 1 * 100 + 0))
+
 // CV_CUDEV_ARCH
 
 #ifndef __CUDA_ARCH__
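The new CV_USE_NPP_STREAM_CTX guard keys off NPP_VERSION, which the NPP headers conventionally encode as major * 1000 + minor * 100 + build, so (10 * 1000 + 1 * 100 + 0) is 10100, i.e. NPP 10.1.0, the release this commit targets for the *_Ctx stream-context entry points. A small illustrative restatement of the check (not part of the commit; the encoding assumption is spelled out in the comment):

    // Illustrative restatement of the guard above. NPP_VERSION comes from the
    // NPP headers shipped with the CUDA toolkit and is assumed to encode
    // major * 1000 + minor * 100 + build, so 10.1.0 becomes 10100.
    #if NPP_VERSION >= 10100   // same as (10 * 1000 + 1 * 100 + 0)
        // the *_Ctx NPP entry points taking an NppStreamContext are available
    #else
        // only the legacy entry points using the globally set NPP stream exist
    #endif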
