From 51180e9a9a0e23e236ac72b96bcef05011772a8a Mon Sep 17 00:00:00 2001 From: chacha21 Date: Wed, 9 Nov 2022 21:52:41 +0100 Subject: [PATCH 1/9] More data types supported in cv::cuda::transpose() proposal for #22782 --- modules/cudaarithm/src/cuda/transpose.cu | 135 +++++++++++++++++++++-- modules/cudaarithm/test/test_core.cpp | 21 +++- 2 files changed, 144 insertions(+), 12 deletions(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index bfe50bd34fb..7141a0f4c10 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -60,13 +60,30 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) { GpuMat src = getInputMat(_src, stream); + const int srcType = src.type(); + const int srcDepth = src.depth(); + const int srcCn = src.channels(); const size_t elemSize = src.elemSize(); + const size_t elemSize1 = src.elemSize1(); - CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 ); + //CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 ); GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream); - if (elemSize == 1) + const bool isNppiNativelySupported = + (srcType == CV_8UC1) || (srcType == CV_8UC3) || (srcType == CV_8UC4) || + (srcType == CV_16UC1) || (srcType == CV_16UC3) || (srcType == CV_16UC4) || + (srcType == CV_16SC1) || (srcType == CV_16SC3) || (srcType == CV_16SC4) || + (srcType == CV_32SC1) || (srcType == CV_32SC3) || (srcType == CV_32SC4) || + (srcType == CV_32FC1) || (srcType == CV_32FC3) || (srcType == CV_32FC4); + const bool isElemSizeSupportedByNppi = + ((elemSize != 0) && !(elemSize%1) && ((elemSize/1)<=4)) || + ((elemSize != 0) && !(elemSize%2) && ((elemSize/2)<=4)) || + ((elemSize != 0) && !(elemSize%4) && ((elemSize/4)<=4)) || + ((elemSize != 0) && !(elemSize%8) && ((elemSize/8)<=2)); + if (src.empty()) + dst.release(); + else if (isNppiNativelySupported) { NppStreamHandler h(StreamAccessor::getStream(stream)); @@ -74,20 +91,116 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) sz.width = src.cols; sz.height = src.rows; - nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), + if (srcType == CV_8UC1) + nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_8UC3) + nppSafeCall( nppiTranspose_8u_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_8UC4) + nppSafeCall( nppiTranspose_8u_C4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16UC1) + nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16UC3) + nppSafeCall( nppiTranspose_16u_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16UC4) + nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16SC1) + nppSafeCall( nppiTranspose_16s_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16SC3) + nppSafeCall( nppiTranspose_16s_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16SC4) + nppSafeCall( nppiTranspose_16s_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32SC1) + 
nppSafeCall( nppiTranspose_32s_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32SC3) + nppSafeCall( nppiTranspose_32s_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32SC4) + nppSafeCall( nppiTranspose_32s_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32FC1) + nppSafeCall( nppiTranspose_32f_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32FC3) + nppSafeCall( nppiTranspose_32f_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32FC4) + nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); if (!stream) CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); - } - else if (elemSize == 4) - { - gridTranspose(globPtr(src), globPtr(dst), stream); - } - else // if (elemSize == 8) + }//end if (isNppiNativelySupported) + else if (isElemSizeSupportedByNppi) { - gridTranspose(globPtr(src), globPtr(dst), stream); - } + NppStreamHandler h(StreamAccessor::getStream(stream)); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + if (!(elemSize%1) && ((elemSize/1)==1)) + nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%1) && ((elemSize/1)==2)) + nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%1) && ((elemSize/1)==3)) + nppSafeCall( nppiTranspose_8u_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%1) && ((elemSize/1)==4)) + nppSafeCall( nppiTranspose_8u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%2) && ((elemSize/2)==1)) + nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%2) && ((elemSize/2)==2)) + nppSafeCall( nppiTranspose_8u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%2) && ((elemSize/2)==3)) + nppSafeCall( nppiTranspose_16u_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%2) && ((elemSize/2)==4)) + nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%4) && ((elemSize/4)==1)) + nppSafeCall( nppiTranspose_32f_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%4) && ((elemSize/4)==2)) + nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%4) && ((elemSize/4)==3)) + nppSafeCall( nppiTranspose_32f_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%4) && ((elemSize/4)==4)) + nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%8) && ((elemSize/8)==1)) + nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%8) && ((elemSize/8)==2)) + nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + }//end if 
(isElemSizeSupportedByNppi) + else if (elemSize == 1) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 2) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 4) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 8) + gridTranspose(globPtr(src), globPtr(dst), stream); + else + CV_Error(Error::StsUnsupportedFormat, ""); syncOutput(dst, _dst, stream); } diff --git a/modules/cudaarithm/test/test_core.cpp b/modules/cudaarithm/test/test_core.cpp index bc8f3737e53..e5c30f703ab 100644 --- a/modules/cudaarithm/test/test_core.cpp +++ b/modules/cudaarithm/test/test_core.cpp @@ -231,12 +231,31 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Transpose, testing::Combine( ALL_DEVICES, DIFFERENT_SIZES, testing::Values(MatType(CV_8UC1), + MatType(CV_8UC2), + MatType(CV_8UC3), MatType(CV_8UC4), + MatType(CV_8SC1), + MatType(CV_8SC2), + MatType(CV_8SC3), + MatType(CV_8SC4), + MatType(CV_16UC1), MatType(CV_16UC2), + MatType(CV_16UC3), + MatType(CV_16UC4), + MatType(CV_16SC1), MatType(CV_16SC2), + MatType(CV_16SC3), + MatType(CV_16SC4), MatType(CV_32SC1), MatType(CV_32SC2), - MatType(CV_64FC1)), + MatType(CV_32SC3), + MatType(CV_32SC4), + MatType(CV_32FC1), + MatType(CV_32FC2), + MatType(CV_32FC3), + MatType(CV_32FC4), + MatType(CV_64FC1), + MatType(CV_64FC2)), WHOLE_SUBMAT)); //////////////////////////////////////////////////////////////////////////////// From be2dd8d15ec5afc96a6732a4b7acdc2a14164278 Mon Sep 17 00:00:00 2001 From: chacha21 Date: Thu, 10 Nov 2022 11:11:46 +0100 Subject: [PATCH 2/9] added fast paths Fixed a typo in gridTranspose() usage Added fast path for single cell/row/col matrix Throw error for empty matrix instead of no-op Code style --- modules/cudaarithm/src/cuda/transpose.cu | 44 +++++++++++++++--------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index 7141a0f4c10..fb6957926dc 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -77,12 +77,23 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) (srcType == CV_32SC1) || (srcType == CV_32SC3) || (srcType == CV_32SC4) || (srcType == CV_32FC1) || (srcType == CV_32FC3) || (srcType == CV_32FC4); const bool isElemSizeSupportedByNppi = - ((elemSize != 0) && !(elemSize%1) && ((elemSize/1)<=4)) || - ((elemSize != 0) && !(elemSize%2) && ((elemSize/2)<=4)) || - ((elemSize != 0) && !(elemSize%4) && ((elemSize/4)<=4)) || - ((elemSize != 0) && !(elemSize%8) && ((elemSize/8)<=2)); - if (src.empty()) - dst.release(); + (!(elemSize%1) && ((elemSize/1)<=4)) || + (!(elemSize%2) && ((elemSize/2)<=4)) || + (!(elemSize%4) && ((elemSize/4)<=4)) || + (!(elemSize%8) && ((elemSize/8)<=2)); + const bool isElemSizeSupportedByGridTranspose = + (elemSize == 1) || (elemSize == 2) || (elemSize == 4) || (elemSize == 8); + const bool isSupported = isNppiNativelySupported || isElemSizeSupportedByNppi || isElemSizeSupportedByGridTranspose; + + if (!isSupported) + CV_Error(Error::StsUnsupportedFormat, ""); + else if (src.empty()) + CV_Error(Error::StsBadArg,"image is empty"); + + if ((src.cols == 1) && (dst.cols == 1)) + src.copyTo(dst, stream); + else if (((src.cols == 1) || (src.rows == 1)) && (src.cols*src.elemSize() == src.step)) + src.reshape(0, src.cols).copyTo(dst, stream); else if (isNppiNativelySupported) { NppStreamHandler h(StreamAccessor::getStream(stream)); @@ -191,16 +202,17 @@ void cv::cuda::transpose(InputArray 
_src, OutputArray _dst, Stream& stream) nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); }//end if (isElemSizeSupportedByNppi) - else if (elemSize == 1) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if (elemSize == 2) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if (elemSize == 4) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if (elemSize == 8) - gridTranspose(globPtr(src), globPtr(dst), stream); - else - CV_Error(Error::StsUnsupportedFormat, ""); + else if (isElemSizeSupportedByGridTranspose) + { + if (elemSize == 1) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 2) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 4) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 8) + gridTranspose(globPtr(src), globPtr(dst), stream); + }//end if (isElemSizeSupportedByGridTranspose) syncOutput(dst, _dst, stream); } From 9e3eb3dec93fc39ab34223dab1d9923e0582e8b4 Mon Sep 17 00:00:00 2001 From: chacha21 Date: Thu, 10 Nov 2022 12:06:05 +0100 Subject: [PATCH 3/9] missing cudaDeviceSynchronize() --- modules/cudaarithm/src/cuda/transpose.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index fb6957926dc..7eb16af332c 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -66,8 +66,6 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) const size_t elemSize = src.elemSize(); const size_t elemSize1 = src.elemSize1(); - //CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 ); - GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream); const bool isNppiNativelySupported = @@ -201,6 +199,9 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) else if (!(elemSize%8) && ((elemSize/8)==2)) nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); + + if (!stream) + CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); }//end if (isElemSizeSupportedByNppi) else if (isElemSizeSupportedByGridTranspose) { From 00abef3b003fbceabb22be87c514a0dced99bbff Mon Sep 17 00:00:00 2001 From: chacha21 Date: Fri, 11 Nov 2022 21:16:30 +0100 Subject: [PATCH 4/9] split (rows == 1) and (cols == 1) cases if (rows == 1), the stride has no importance for the reshape() --- modules/cudaarithm/src/cuda/transpose.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index 7eb16af332c..58cb5132cbe 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -90,7 +90,9 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) if ((src.cols == 1) && (dst.cols == 1)) src.copyTo(dst, stream); - else if (((src.cols == 1) || (src.rows == 1)) && (src.cols*src.elemSize() == src.step)) + else if (src.rows == 1) + src.reshape(0, src.cols).copyTo(dst, stream); + else if ((src.cols == 1) && (src.cols*src.elemSize() == src.step)) src.reshape(0, src.cols).copyTo(dst, stream); else if (isNppiNativelySupported) { From e5b152ef9d6eb96854ddb93f49c9d988daf89d49 Mon Sep 17 00:00:00 2001 From: chacha21 Date: Fri, 11 Nov 2022 21:18:08 +0100 Subject: [PATCH 5/9] code clarity --- modules/cudaarithm/src/cuda/transpose.cu | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index 58cb5132cbe..d822a174dcf 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -88,7 +88,7 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) else if (src.empty()) CV_Error(Error::StsBadArg,"image is empty"); - if ((src.cols == 1) && (dst.cols == 1)) + if ((src.rows == 1) && (src.cols == 1)) src.copyTo(dst, stream); else if (src.rows == 1) src.reshape(0, src.cols).copyTo(dst, stream); From 4f0e47044277020264dc3afb69d9d893a0371048 Mon Sep 17 00:00:00 2001 From: chacha21 Date: Wed, 21 Dec 2022 15:32:29 +0100 Subject: [PATCH 6/9] updated according to suggestions more obvious elemSize test logic to dispatch to the right nppiTranspose variant added modern NPP _ctx stream support --- modules/cudaarithm/src/cuda/transpose.cu | 308 +++++++++++++---------- 1 file changed, 174 insertions(+), 134 deletions(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index d822a174dcf..afc6600bfc8 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -68,154 +68,194 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream); - const bool isNppiNativelySupported = - (srcType == CV_8UC1) || (srcType == CV_8UC3) || (srcType == CV_8UC4) || - (srcType == CV_16UC1) || (srcType == CV_16UC3) || (srcType == CV_16UC4) || - (srcType == CV_16SC1) || (srcType == CV_16SC3) || (srcType == CV_16SC4) || - (srcType == CV_32SC1) || (srcType == CV_32SC3) || (srcType == CV_32SC4) || - (srcType == CV_32FC1) || (srcType == CV_32FC3) || (srcType == CV_32FC4); - const bool isElemSizeSupportedByNppi = - (!(elemSize%1) && ((elemSize/1)<=4)) || - (!(elemSize%2) && ((elemSize/2)<=4)) || - (!(elemSize%4) && ((elemSize/4)<=4)) || - (!(elemSize%8) && ((elemSize/8)<=2)); - const bool isElemSizeSupportedByGridTranspose = - (elemSize == 1) || (elemSize == 2) || (elemSize == 4) || (elemSize == 8); - const bool isSupported = isNppiNativelySupported || isElemSizeSupportedByNppi || isElemSizeSupportedByGridTranspose; + const bool isSupported = + (elemSize == 1) || (elemSize == 2) || (elemSize == 3) || (elemSize == 4) || + (elemSize == 6) || (elemSize == 8) || (elemSize == 12) || (elemSize == 16); if (!isSupported) - CV_Error(Error::StsUnsupportedFormat, ""); + CV_Error(Error::StsUnsupportedFormat, ""); else if (src.empty()) - CV_Error(Error::StsBadArg,"image is empty"); + CV_Error(Error::StsBadArg,"image is empty"); if ((src.rows == 1) && (src.cols == 1)) - src.copyTo(dst, stream); + src.copyTo(dst, stream); else if (src.rows == 1) - src.reshape(0, src.cols).copyTo(dst, stream); - else if ((src.cols == 1) && (src.cols*src.elemSize() == src.step)) - src.reshape(0, src.cols).copyTo(dst, stream); - else if (isNppiNativelySupported) + src.reshape(0, src.cols).copyTo(dst, stream); + else if ((src.cols == 1) && src.isContinuous()) + src.reshape(0, src.cols).copyTo(dst, stream); + else { - NppStreamHandler h(StreamAccessor::getStream(stream)); - NppiSize sz; sz.width = src.cols; sz.height = src.rows; - if (srcType == CV_8UC1) - nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), + if (!stream) + { + //native implementation + if (srcType == CV_8UC1) + nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), 
static_cast(dst.step), sz) ); + else if (srcType == CV_8UC3) + nppSafeCall( nppiTranspose_8u_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_8UC4) + nppSafeCall( nppiTranspose_8u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16UC1) + nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16UC3) + nppSafeCall( nppiTranspose_16u_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16UC4) + nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16SC1) + nppSafeCall( nppiTranspose_16s_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16SC3) + nppSafeCall( nppiTranspose_16s_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16SC4) + nppSafeCall( nppiTranspose_16s_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32SC1) + nppSafeCall( nppiTranspose_32s_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32SC3) + nppSafeCall( nppiTranspose_32s_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32SC4) + nppSafeCall( nppiTranspose_32s_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32FC1) + nppSafeCall( nppiTranspose_32f_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32FC3) + nppSafeCall( nppiTranspose_32f_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32FC4) + nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + //reinterpretation + else if (elemSize == 1) + nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (elemSize == 2) + nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (elemSize == 3) + nppSafeCall( nppiTranspose_8u_C3R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_8UC3) - nppSafeCall( nppiTranspose_8u_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_8UC4) - nppSafeCall( nppiTranspose_8u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_16UC1) - nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_16UC3) - nppSafeCall( nppiTranspose_16u_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_16UC4) - nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_16SC1) - nppSafeCall( nppiTranspose_16s_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_16SC3) - nppSafeCall( nppiTranspose_16s_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == 
CV_16SC4) - nppSafeCall( nppiTranspose_16s_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_32SC1) - nppSafeCall( nppiTranspose_32s_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_32SC3) - nppSafeCall( nppiTranspose_32s_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_32SC4) - nppSafeCall( nppiTranspose_32s_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_32FC1) - nppSafeCall( nppiTranspose_32f_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_32FC3) - nppSafeCall( nppiTranspose_32f_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_32FC4) - nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); + else if (elemSize == 4) + nppSafeCall( nppiTranspose_32s_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (elemSize == 6) + nppSafeCall( nppiTranspose_16u_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (elemSize == 8) + nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (elemSize == 12) + nppSafeCall( nppiTranspose_32s_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (elemSize == 16) + nppSafeCall( nppiTranspose_32s_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + }//end if (!stream) + else//if (stream != 0) + { + NppStreamContext ctx; + nppSafeCall( nppGetStreamContext(&ctx) ); + ctx.hStream = StreamAccessor::getStream(stream); + + //native implementation + if (srcType == CV_8UC1) + nppSafeCall( nppiTranspose_8u_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_8UC3) + nppSafeCall( nppiTranspose_8u_C3R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_8UC4) + nppSafeCall( nppiTranspose_8u_C4R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_16UC1) + nppSafeCall( nppiTranspose_16u_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_16UC3) + nppSafeCall( nppiTranspose_16u_C3R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_16UC4) + nppSafeCall( nppiTranspose_16u_C4R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_16SC1) + nppSafeCall( nppiTranspose_16s_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_16SC3) + nppSafeCall( nppiTranspose_16s_C3R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_16SC4) + nppSafeCall( nppiTranspose_16s_C4R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_32SC1) + nppSafeCall( nppiTranspose_32s_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_32SC3) + nppSafeCall( nppiTranspose_32s_C3R_Ctx(src.ptr(), static_cast(src.step), + 
dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_32SC4) + nppSafeCall( nppiTranspose_32s_C4R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_32FC1) + nppSafeCall( nppiTranspose_32f_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_32FC3) + nppSafeCall( nppiTranspose_32f_C3R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_32FC4) + nppSafeCall( nppiTranspose_32f_C4R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + //reinterpretation + else if (elemSize == 1) + nppSafeCall( nppiTranspose_8u_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (elemSize == 2) + nppSafeCall( nppiTranspose_16u_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (elemSize == 3) + nppSafeCall( nppiTranspose_8u_C3R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (elemSize == 4) + nppSafeCall( nppiTranspose_32s_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (elemSize == 6) + nppSafeCall( nppiTranspose_16u_C3R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (elemSize == 8) + nppSafeCall( nppiTranspose_16u_C4R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (elemSize == 12) + nppSafeCall( nppiTranspose_32s_C3R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (elemSize == 16) + nppSafeCall( nppiTranspose_32s_C4R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + }//end if (stream != 0) if (!stream) - CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); - }//end if (isNppiNativelySupported) - else if (isElemSizeSupportedByNppi) - { - NppStreamHandler h(StreamAccessor::getStream(stream)); - - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - if (!(elemSize%1) && ((elemSize/1)==1)) - nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%1) && ((elemSize/1)==2)) - nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%1) && ((elemSize/1)==3)) - nppSafeCall( nppiTranspose_8u_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%1) && ((elemSize/1)==4)) - nppSafeCall( nppiTranspose_8u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%2) && ((elemSize/2)==1)) - nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%2) && ((elemSize/2)==2)) - nppSafeCall( nppiTranspose_8u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%2) && ((elemSize/2)==3)) - nppSafeCall( nppiTranspose_16u_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%2) && ((elemSize/2)==4)) - nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%4) && ((elemSize/4)==1)) - nppSafeCall( nppiTranspose_32f_C1R(src.ptr(), 
static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%4) && ((elemSize/4)==2)) - nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%4) && ((elemSize/4)==3)) - nppSafeCall( nppiTranspose_32f_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%4) && ((elemSize/4)==4)) - nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%8) && ((elemSize/8)==1)) - nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%8) && ((elemSize/8)==2)) - nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - - if (!stream) - CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); - }//end if (isElemSizeSupportedByNppi) - else if (isElemSizeSupportedByGridTranspose) - { - if (elemSize == 1) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if (elemSize == 2) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if (elemSize == 4) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if (elemSize == 8) - gridTranspose(globPtr(src), globPtr(dst), stream); - }//end if (isElemSizeSupportedByGridTranspose) + CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); + }//end if + + /* + if (elemSize == 1) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 2) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 4) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 8) + gridTranspose(globPtr(src), globPtr(dst), stream); + */ syncOutput(dst, _dst, stream); } From 43a1691ab06cee401c8de6fd63b9cbac4dcb403e Mon Sep 17 00:00:00 2001 From: chacha21 Date: Thu, 22 Dec 2022 14:20:23 +0100 Subject: [PATCH 7/9] removed useless and dead code --- modules/cudaarithm/src/cuda/transpose.cu | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index afc6600bfc8..3883e405f7a 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -61,10 +61,7 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) GpuMat src = getInputMat(_src, stream); const int srcType = src.type(); - const int srcDepth = src.depth(); - const int srcCn = src.channels(); const size_t elemSize = src.elemSize(); - const size_t elemSize1 = src.elemSize1(); GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream); @@ -75,7 +72,7 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) if (!isSupported) CV_Error(Error::StsUnsupportedFormat, ""); else if (src.empty()) - CV_Error(Error::StsBadArg,"image is empty"); + CV_Error(Error::StsBadArg, "image is empty"); if ((src.rows == 1) && (src.cols == 1)) src.copyTo(dst, stream); @@ -241,23 +238,7 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) nppSafeCall( nppiTranspose_32s_C4R_Ctx(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, ctx) ); }//end if (stream != 0) - - if (!stream) - CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); }//end if - - /* - if (elemSize == 1) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if (elemSize == 2) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if 
(elemSize == 4) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if (elemSize == 8) - gridTranspose(globPtr(src), globPtr(dst), stream); - */ - - syncOutput(dst, _dst, stream); } #endif From 29cb16a37ff15fdcb24a9f181c3668af69611859 Mon Sep 17 00:00:00 2001 From: chacha21 Date: Tue, 3 Jan 2023 10:10:56 +0100 Subject: [PATCH 8/9] support legacy npp stream handling --- modules/cudaarithm/src/cuda/transpose.cu | 26 +++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index 3883e405f7a..64cf3d838c9 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -56,6 +56,8 @@ using namespace cv; using namespace cv::cuda; using namespace cv::cudev; +#define USE_NPP_STREAM_CONTEXT (NPP_VERSION >= (10 * 1000 + 1 * 100 + 0)) + void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) { GpuMat src = getInputMat(_src, stream); @@ -86,8 +88,17 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) sz.width = src.cols; sz.height = src.rows; - if (!stream) + #if USE_NPP_STREAM_CONTEXT + constexpr const bool useLegacyStream = false; + #else + constexpr const bool useLegacyStream = true; + #endif + cudaStream_t _stream = StreamAccessor::getStream(stream); + + if (!_stream || useLegacyStream) { + NppStreamHandler h(_stream); + //native implementation if (srcType == CV_8UC1) nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), @@ -159,12 +170,16 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) else if (elemSize == 16) nppSafeCall( nppiTranspose_32s_C4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - }//end if (!stream) - else//if (stream != 0) + + if (_stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + }//end if (!_stream || useLegacyStream) + else//if ((_stream != 0) && !useLegacyStream) { + #if USE_NPP_STREAM_CONTEXT NppStreamContext ctx; nppSafeCall( nppGetStreamContext(&ctx) ); - ctx.hStream = StreamAccessor::getStream(stream); + ctx.hStream = _stream; //native implementation if (srcType == CV_8UC1) @@ -237,7 +252,8 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) else if (elemSize == 16) nppSafeCall( nppiTranspose_32s_C4R_Ctx(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, ctx) ); - }//end if (stream != 0) + #endif + }//end if ((_stream != 0) && !useLegacyStream) }//end if } From 367269411f99eba08fe66bdaf4a41b4880393faf Mon Sep 17 00:00:00 2001 From: chacha21 Date: Wed, 4 Jan 2023 09:47:35 +0100 Subject: [PATCH 9/9] Style : use tables of functions --- modules/cudaarithm/src/cuda/transpose.cu | 253 +++++++++-------------- 1 file changed, 92 insertions(+), 161 deletions(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index 64cf3d838c9..4a42df382e6 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -58,6 +58,58 @@ using namespace cv::cudev; #define USE_NPP_STREAM_CONTEXT (NPP_VERSION >= (10 * 1000 + 1 * 100 + 0)) +namespace +{ + template struct NppTransposeFunc + { + typedef typename NPPTypeTraits::npp_type npp_type; + + typedef NppStatus (*func_t)(const npp_type* pSrc, int srcStep, npp_type* pDst, int dstStep, NppiSize srcSize); + #if USE_NPP_STREAM_CONTEXT + typedef NppStatus (*func_ctx_t)(const npp_type* pSrc, int srcStep, npp_type* pDst, int dstStep, NppiSize 
srcSize, NppStreamContext stream); + #endif + }; + + template <int DEPTH, typename NppTransposeFunc<DEPTH>::func_t func> struct NppTranspose + { + typedef typename NppTransposeFunc<DEPTH>::npp_type npp_type; + + static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream) + { + NppiSize srcsz; + srcsz.height = src.rows; + srcsz.width = src.cols; + + cv::cuda::NppStreamHandler h(stream); + + nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), dst.ptr<npp_type>(), static_cast<int>(dst.step), srcsz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + #if USE_NPP_STREAM_CONTEXT + template <int DEPTH, typename NppTransposeFunc<DEPTH>::func_ctx_t func> struct NppTransposeCtx + { + typedef typename NppTransposeFunc<DEPTH>::npp_type npp_type; + + static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream) + { + NppiSize srcsz; + srcsz.height = src.rows; + srcsz.width = src.cols; + + NppStreamContext ctx; + nppSafeCall( nppGetStreamContext(&ctx) ); + ctx.hStream = stream; + + nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), dst.ptr<npp_type>(), static_cast<int>(dst.step), srcsz, ctx) ); + } + }; + #endif +} + void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) { GpuMat src = getInputMat(_src, stream); @@ -84,176 +136,55 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) src.reshape(0, src.cols).copyTo(dst, stream); else { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - #if USE_NPP_STREAM_CONTEXT - constexpr const bool useLegacyStream = false; + constexpr const bool useNppStreamCtx = true; #else - constexpr const bool useLegacyStream = true; + constexpr const bool useNppStreamCtx = false; #endif cudaStream_t _stream = StreamAccessor::getStream(stream); - if (!_stream || useLegacyStream) + if (!_stream || !useNppStreamCtx) { - NppStreamHandler h(_stream); - - //native implementation - if (srcType == CV_8UC1) - nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), - dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_8UC3) - nppSafeCall( nppiTranspose_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), - dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_8UC4) - nppSafeCall( nppiTranspose_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), - dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_16UC1) - nppSafeCall( nppiTranspose_16u_C1R(src.ptr<Npp16u>(), static_cast<int>(src.step), - dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_16UC3) - nppSafeCall( nppiTranspose_16u_C3R(src.ptr<Npp16u>(), static_cast<int>(src.step), - dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_16UC4) - nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step), - dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_16SC1) - nppSafeCall( nppiTranspose_16s_C1R(src.ptr<Npp16s>(), static_cast<int>(src.step), - dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_16SC3) - nppSafeCall( nppiTranspose_16s_C3R(src.ptr<Npp16s>(), static_cast<int>(src.step), - dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_16SC4) - nppSafeCall( nppiTranspose_16s_C4R(src.ptr<Npp16s>(), static_cast<int>(src.step), - dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_32SC1) - nppSafeCall( nppiTranspose_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), - dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_32SC3) - nppSafeCall( nppiTranspose_32s_C3R(src.ptr<Npp32s>(), static_cast<int>(src.step), - dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_32SC4) - nppSafeCall( nppiTranspose_32s_C4R(src.ptr<Npp32s>(), static_cast<int>(src.step), - dst.ptr<Npp32s>(),
static_cast(dst.step), sz) ); - else if (srcType == CV_32FC1) - nppSafeCall( nppiTranspose_32f_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_32FC3) - nppSafeCall( nppiTranspose_32f_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_32FC4) - nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - //reinterpretation - else if (elemSize == 1) - nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (elemSize == 2) - nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (elemSize == 3) - nppSafeCall( nppiTranspose_8u_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (elemSize == 4) - nppSafeCall( nppiTranspose_32s_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (elemSize == 6) - nppSafeCall( nppiTranspose_16u_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (elemSize == 8) - nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (elemSize == 12) - nppSafeCall( nppiTranspose_32s_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (elemSize == 16) - nppSafeCall( nppiTranspose_32s_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - - if (_stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - }//end if (!_stream || useLegacyStream) - else//if ((_stream != 0) && !useLegacyStream) + typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream); + //if no direct mapping exists between DEPTH+CHANNELS and the nppiTranspose supported type, we use a nppiTranspose of a similar elemSize + static const func_t funcs[8][4] = { + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, nullptr, nullptr}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call} + }; + + const func_t func = funcs[src.depth()][src.channels() - 1]; + CV_Assert(func != nullptr); + + func(src, dst, _stream); + }//end if (!_stream || !useNppStreamCtx) + else//if ((_stream != 0) && useNppStreamCtx) { #if USE_NPP_STREAM_CONTEXT - NppStreamContext ctx; - nppSafeCall( nppGetStreamContext(&ctx) ); - ctx.hStream = _stream; - - //native implementation - if (srcType == CV_8UC1) - nppSafeCall( nppiTranspose_8u_C1R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_8UC3) - nppSafeCall( nppiTranspose_8u_C3R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_8UC4) - nppSafeCall( nppiTranspose_8u_C4R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_16UC1) - 
nppSafeCall( nppiTranspose_16u_C1R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_16UC3) - nppSafeCall( nppiTranspose_16u_C3R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_16UC4) - nppSafeCall( nppiTranspose_16u_C4R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_16SC1) - nppSafeCall( nppiTranspose_16s_C1R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_16SC3) - nppSafeCall( nppiTranspose_16s_C3R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_16SC4) - nppSafeCall( nppiTranspose_16s_C4R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_32SC1) - nppSafeCall( nppiTranspose_32s_C1R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_32SC3) - nppSafeCall( nppiTranspose_32s_C3R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_32SC4) - nppSafeCall( nppiTranspose_32s_C4R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_32FC1) - nppSafeCall( nppiTranspose_32f_C1R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_32FC3) - nppSafeCall( nppiTranspose_32f_C3R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_32FC4) - nppSafeCall( nppiTranspose_32f_C4R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - //reinterpretation - else if (elemSize == 1) - nppSafeCall( nppiTranspose_8u_C1R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (elemSize == 2) - nppSafeCall( nppiTranspose_16u_C1R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (elemSize == 3) - nppSafeCall( nppiTranspose_8u_C3R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (elemSize == 4) - nppSafeCall( nppiTranspose_32s_C1R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (elemSize == 6) - nppSafeCall( nppiTranspose_16u_C3R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (elemSize == 8) - nppSafeCall( nppiTranspose_16u_C4R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (elemSize == 12) - nppSafeCall( nppiTranspose_32s_C3R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (elemSize == 16) - nppSafeCall( nppiTranspose_32s_C4R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); + typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream); + //if no direct mapping exists between DEPTH+CHANNELS and the nppiTranspose supported type, we use a nppiTranspose of a similar elemSize + static const func_t funcs[8][4] = { + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, 
NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, nullptr, nullptr}, + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call} + }; + + const func_t func = funcs[src.depth()][src.channels() - 1]; + CV_Assert(func != nullptr); + + func(src, dst, _stream); #endif - }//end if ((_stream != 0) && !useLegacyStream) + }//end if ((_stream != 0) && useNppStreamCtx) }//end if }
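
The reinterpretation idea that runs through patches 1, 6 and 9 is easy to state outside the diff: a transpose only moves whole elements, so any matrix whose elemSize() equals one of the layouts NPP transposes natively (1, 2, 3, 4, 6, 8, 12 or 16 bytes) can be handed to that NPP call with the pointers reinterpreted. Below is a minimal host-side sketch of exactly that mapping, assuming an initialized CUDA device and a pre-allocated destination of transposed shape; the checkNpp helper and the function name are invented for the example (the patch itself goes through nppSafeCall and OpenCV's stream handling):

    #include <nppi.h>
    #include <opencv2/core/cuda.hpp>
    #include <stdexcept>

    // Error check invented for this sketch; OpenCV's code uses nppSafeCall().
    static void checkNpp(NppStatus s)
    {
        if (s != NPP_SUCCESS) throw std::runtime_error("NPP call failed");
    }

    // Dispatch purely on element size: CV_8SC4, CV_32SC1 and CV_32FC1 all have
    // elemSize() == 4, so all of them can be transposed by nppiTranspose_32s_C1R.
    // These calls run on NPP's current stream and are synchronous with respect to it.
    static void transposeByElemSize(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst)
    {
        CV_Assert(dst.rows == src.cols && dst.cols == src.rows && dst.type() == src.type());

        NppiSize sz;
        sz.width  = src.cols;
        sz.height = src.rows;
        const int srcStep = static_cast<int>(src.step);
        const int dstStep = static_cast<int>(dst.step);

        switch (src.elemSize())
        {
        case 1:  checkNpp(nppiTranspose_8u_C1R (src.ptr<Npp8u>(),  srcStep, dst.ptr<Npp8u>(),  dstStep, sz)); break;
        case 2:  checkNpp(nppiTranspose_16u_C1R(src.ptr<Npp16u>(), srcStep, dst.ptr<Npp16u>(), dstStep, sz)); break;
        case 3:  checkNpp(nppiTranspose_8u_C3R (src.ptr<Npp8u>(),  srcStep, dst.ptr<Npp8u>(),  dstStep, sz)); break;
        case 4:  checkNpp(nppiTranspose_32s_C1R(src.ptr<Npp32s>(), srcStep, dst.ptr<Npp32s>(), dstStep, sz)); break;
        case 6:  checkNpp(nppiTranspose_16u_C3R(src.ptr<Npp16u>(), srcStep, dst.ptr<Npp16u>(), dstStep, sz)); break;
        case 8:  checkNpp(nppiTranspose_16u_C4R(src.ptr<Npp16u>(), srcStep, dst.ptr<Npp16u>(), dstStep, sz)); break;
        case 12: checkNpp(nppiTranspose_32s_C3R(src.ptr<Npp32s>(), srcStep, dst.ptr<Npp32s>(), dstStep, sz)); break;
        case 16: checkNpp(nppiTranspose_32s_C4R(src.ptr<Npp32s>(), srcStep, dst.ptr<Npp32s>(), dstStep, sz)); break;
        default: CV_Error(cv::Error::StsUnsupportedFormat, "unsupported element size");
        }
    }

This is the same mapping the final patch encodes per (depth, channels) pair in its function tables; once every supported element size has an NPP equivalent, the gridTranspose fallback from the earlier revisions is no longer needed.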
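Patches 6 and 8 also change the stream plumbing: instead of the legacy NppStreamHandler route, which sets a process-wide NPP stream, the _Ctx variants take an NppStreamContext per call when the linked NPP is recent enough (that is what the USE_NPP_STREAM_CONTEXT test against NPP 10.1 encodes). A small sketch of that pattern, assuming an NPP build that ships nppiTranspose_8u_C1R_Ctx; the function name here is illustrative, not the PR's:

    #include <cuda_runtime.h>
    #include <npp.h>

    // Run one NPP transpose on a caller-supplied CUDA stream without touching the
    // process-wide stream state used by the legacy nppSetStream()/NppStreamHandler path.
    static NppStatus transposeOnStream(const Npp8u* src, int srcStep,
                                       Npp8u* dst, int dstStep,
                                       NppiSize size, cudaStream_t stream)
    {
        NppStreamContext ctx;
        NppStatus status = nppGetStreamContext(&ctx); // fills in device properties for the current device
        if (status != NPP_SUCCESS)
            return status;

        ctx.hStream = stream; // only the stream is overridden, as in the patch
        return nppiTranspose_8u_C1R_Ctx(src, srcStep, dst, dstStep, size, ctx);
    }

On older toolkits the guard falls back to NppStreamHandler, and a cudaDeviceSynchronize() is issued when the caller passed the default stream, matching the behaviour of the original code.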
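Finally, the degenerate-shape fast path from patches 2 through 5 relies on a property of reshape that is easy to miss: a single-row matrix is continuous whatever its step is, while a single-column matrix can only be reshaped when it really is stored continuously. A host-side illustration of the same check with cv::Mat (the device code does the equivalent with GpuMat::reshape and copyTo):

    #include <opencv2/core.hpp>

    // Transpose of a 1xN or Nx1 matrix is just a reshape plus a copy.
    static cv::Mat transposeDegenerate(const cv::Mat& src)
    {
        CV_Assert(src.rows == 1 || src.cols == 1);

        if (src.rows == 1)
            return src.reshape(0, src.cols).clone(); // a row is always continuous, its stride does not matter

        CV_Assert(src.isContinuous());               // an Nx1 column with row padding cannot be reshaped
        return src.reshape(0, 1).clone();            // Nx1 column -> 1xN row
    }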