From 51180e9a9a0e23e236ac72b96bcef05011772a8a Mon Sep 17 00:00:00 2001 From: chacha21 Date: Wed, 9 Nov 2022 21:52:41 +0100 Subject: [PATCH 1/9] More data types supported in cv::cuda::transpose() proposal for #22782 --- modules/cudaarithm/src/cuda/transpose.cu | 135 +++++++++++++++++++++-- modules/cudaarithm/test/test_core.cpp | 21 +++- 2 files changed, 144 insertions(+), 12 deletions(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index bfe50bd34fb..7141a0f4c10 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -60,13 +60,30 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) { GpuMat src = getInputMat(_src, stream); + const int srcType = src.type(); + const int srcDepth = src.depth(); + const int srcCn = src.channels(); const size_t elemSize = src.elemSize(); + const size_t elemSize1 = src.elemSize1(); - CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 ); + //CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 ); GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream); - if (elemSize == 1) + const bool isNppiNativelySupported = + (srcType == CV_8UC1) || (srcType == CV_8UC3) || (srcType == CV_8UC4) || + (srcType == CV_16UC1) || (srcType == CV_16UC3) || (srcType == CV_16UC4) || + (srcType == CV_16SC1) || (srcType == CV_16SC3) || (srcType == CV_16SC4) || + (srcType == CV_32SC1) || (srcType == CV_32SC3) || (srcType == CV_32SC4) || + (srcType == CV_32FC1) || (srcType == CV_32FC3) || (srcType == CV_32FC4); + const bool isElemSizeSupportedByNppi = + ((elemSize != 0) && !(elemSize%1) && ((elemSize/1)<=4)) || + ((elemSize != 0) && !(elemSize%2) && ((elemSize/2)<=4)) || + ((elemSize != 0) && !(elemSize%4) && ((elemSize/4)<=4)) || + ((elemSize != 0) && !(elemSize%8) && ((elemSize/8)<=2)); + if (src.empty()) + dst.release(); + else if (isNppiNativelySupported) { NppStreamHandler h(StreamAccessor::getStream(stream)); @@ -74,20 +91,116 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) sz.width = src.cols; sz.height = src.rows; - nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), + if (srcType == CV_8UC1) + nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_8UC3) + nppSafeCall( nppiTranspose_8u_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_8UC4) + nppSafeCall( nppiTranspose_8u_C4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16UC1) + nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16UC3) + nppSafeCall( nppiTranspose_16u_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16UC4) + nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16SC1) + nppSafeCall( nppiTranspose_16s_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16SC3) + nppSafeCall( nppiTranspose_16s_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16SC4) + nppSafeCall( nppiTranspose_16s_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32SC1) + 
nppSafeCall( nppiTranspose_32s_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32SC3) + nppSafeCall( nppiTranspose_32s_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32SC4) + nppSafeCall( nppiTranspose_32s_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32FC1) + nppSafeCall( nppiTranspose_32f_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32FC3) + nppSafeCall( nppiTranspose_32f_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32FC4) + nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); if (!stream) CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); - } - else if (elemSize == 4) - { - gridTranspose(globPtr(src), globPtr(dst), stream); - } - else // if (elemSize == 8) + }//end if (isNppiNativelySupported) + else if (isElemSizeSupportedByNppi) { - gridTranspose(globPtr(src), globPtr(dst), stream); - } + NppStreamHandler h(StreamAccessor::getStream(stream)); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + if (!(elemSize%1) && ((elemSize/1)==1)) + nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%1) && ((elemSize/1)==2)) + nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%1) && ((elemSize/1)==3)) + nppSafeCall( nppiTranspose_8u_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%1) && ((elemSize/1)==4)) + nppSafeCall( nppiTranspose_8u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%2) && ((elemSize/2)==1)) + nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%2) && ((elemSize/2)==2)) + nppSafeCall( nppiTranspose_8u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%2) && ((elemSize/2)==3)) + nppSafeCall( nppiTranspose_16u_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%2) && ((elemSize/2)==4)) + nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%4) && ((elemSize/4)==1)) + nppSafeCall( nppiTranspose_32f_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%4) && ((elemSize/4)==2)) + nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%4) && ((elemSize/4)==3)) + nppSafeCall( nppiTranspose_32f_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%4) && ((elemSize/4)==4)) + nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%8) && ((elemSize/8)==1)) + nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (!(elemSize%8) && ((elemSize/8)==2)) + nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + }//end if 
(isElemSizeSupportedByNppi) + else if (elemSize == 1) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 2) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 4) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 8) + gridTranspose(globPtr(src), globPtr(dst), stream); + else + CV_Error(Error::StsUnsupportedFormat, ""); syncOutput(dst, _dst, stream); } diff --git a/modules/cudaarithm/test/test_core.cpp b/modules/cudaarithm/test/test_core.cpp index bc8f3737e53..e5c30f703ab 100644 --- a/modules/cudaarithm/test/test_core.cpp +++ b/modules/cudaarithm/test/test_core.cpp @@ -231,12 +231,31 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Transpose, testing::Combine( ALL_DEVICES, DIFFERENT_SIZES, testing::Values(MatType(CV_8UC1), + MatType(CV_8UC2), + MatType(CV_8UC3), MatType(CV_8UC4), + MatType(CV_8SC1), + MatType(CV_8SC2), + MatType(CV_8SC3), + MatType(CV_8SC4), + MatType(CV_16UC1), MatType(CV_16UC2), + MatType(CV_16UC3), + MatType(CV_16UC4), + MatType(CV_16SC1), MatType(CV_16SC2), + MatType(CV_16SC3), + MatType(CV_16SC4), MatType(CV_32SC1), MatType(CV_32SC2), - MatType(CV_64FC1)), + MatType(CV_32SC3), + MatType(CV_32SC4), + MatType(CV_32FC1), + MatType(CV_32FC2), + MatType(CV_32FC3), + MatType(CV_32FC4), + MatType(CV_64FC1), + MatType(CV_64FC2)), WHOLE_SUBMAT)); //////////////////////////////////////////////////////////////////////////////// From be2dd8d15ec5afc96a6732a4b7acdc2a14164278 Mon Sep 17 00:00:00 2001 From: chacha21 Date: Thu, 10 Nov 2022 11:11:46 +0100 Subject: [PATCH 2/9] added fast paths Fixed a typo in gridTranspose() usage Added fast path for single cell/row/col matrix Throw error for empty matrix instead of no-op Code style --- modules/cudaarithm/src/cuda/transpose.cu | 44 +++++++++++++++--------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index 7141a0f4c10..fb6957926dc 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -77,12 +77,23 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) (srcType == CV_32SC1) || (srcType == CV_32SC3) || (srcType == CV_32SC4) || (srcType == CV_32FC1) || (srcType == CV_32FC3) || (srcType == CV_32FC4); const bool isElemSizeSupportedByNppi = - ((elemSize != 0) && !(elemSize%1) && ((elemSize/1)<=4)) || - ((elemSize != 0) && !(elemSize%2) && ((elemSize/2)<=4)) || - ((elemSize != 0) && !(elemSize%4) && ((elemSize/4)<=4)) || - ((elemSize != 0) && !(elemSize%8) && ((elemSize/8)<=2)); - if (src.empty()) - dst.release(); + (!(elemSize%1) && ((elemSize/1)<=4)) || + (!(elemSize%2) && ((elemSize/2)<=4)) || + (!(elemSize%4) && ((elemSize/4)<=4)) || + (!(elemSize%8) && ((elemSize/8)<=2)); + const bool isElemSizeSupportedByGridTranspose = + (elemSize == 1) || (elemSize == 2) || (elemSize == 4) || (elemSize == 8); + const bool isSupported = isNppiNativelySupported || isElemSizeSupportedByNppi || isElemSizeSupportedByGridTranspose; + + if (!isSupported) + CV_Error(Error::StsUnsupportedFormat, ""); + else if (src.empty()) + CV_Error(Error::StsBadArg,"image is empty"); + + if ((src.cols == 1) && (dst.cols == 1)) + src.copyTo(dst, stream); + else if (((src.cols == 1) || (src.rows == 1)) && (src.cols*src.elemSize() == src.step)) + src.reshape(0, src.cols).copyTo(dst, stream); else if (isNppiNativelySupported) { NppStreamHandler h(StreamAccessor::getStream(stream)); @@ -191,16 +202,17 @@ void cv::cuda::transpose(InputArray 
_src, OutputArray _dst, Stream& stream) nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); }//end if (isElemSizeSupportedByNppi) - else if (elemSize == 1) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if (elemSize == 2) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if (elemSize == 4) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if (elemSize == 8) - gridTranspose(globPtr(src), globPtr(dst), stream); - else - CV_Error(Error::StsUnsupportedFormat, ""); + else if (isElemSizeSupportedByGridTranspose) + { + if (elemSize == 1) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 2) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 4) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 8) + gridTranspose(globPtr(src), globPtr(dst), stream); + }//end if (isElemSizeSupportedByGridTranspose) syncOutput(dst, _dst, stream); } From 9e3eb3dec93fc39ab34223dab1d9923e0582e8b4 Mon Sep 17 00:00:00 2001 From: chacha21 Date: Thu, 10 Nov 2022 12:06:05 +0100 Subject: [PATCH 3/9] missing cudaDeviceSynchronize() --- modules/cudaarithm/src/cuda/transpose.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index fb6957926dc..7eb16af332c 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -66,8 +66,6 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) const size_t elemSize = src.elemSize(); const size_t elemSize1 = src.elemSize1(); - //CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 ); - GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream); const bool isNppiNativelySupported = @@ -201,6 +199,9 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) else if (!(elemSize%8) && ((elemSize/8)==2)) nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); + + if (!stream) + CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); }//end if (isElemSizeSupportedByNppi) else if (isElemSizeSupportedByGridTranspose) { From 00abef3b003fbceabb22be87c514a0dced99bbff Mon Sep 17 00:00:00 2001 From: chacha21 Date: Fri, 11 Nov 2022 21:16:30 +0100 Subject: [PATCH 4/9] split (rows == 1) and (cols == 1) cases if (rows == 1), the stride has no importance for the reshape() --- modules/cudaarithm/src/cuda/transpose.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index 7eb16af332c..58cb5132cbe 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -90,7 +90,9 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) if ((src.cols == 1) && (dst.cols == 1)) src.copyTo(dst, stream); - else if (((src.cols == 1) || (src.rows == 1)) && (src.cols*src.elemSize() == src.step)) + else if (src.rows == 1) + src.reshape(0, src.cols).copyTo(dst, stream); + else if ((src.cols == 1) && (src.cols*src.elemSize() == src.step)) src.reshape(0, src.cols).copyTo(dst, stream); else if (isNppiNativelySupported) { From e5b152ef9d6eb96854ddb93f49c9d988daf89d49 Mon Sep 17 00:00:00 2001 From: chacha21 Date: Fri, 11 Nov 2022 21:18:08 +0100 Subject: [PATCH 5/9] code clarity --- modules/cudaarithm/src/cuda/transpose.cu | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index 58cb5132cbe..d822a174dcf 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -88,7 +88,7 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) else if (src.empty()) CV_Error(Error::StsBadArg,"image is empty"); - if ((src.cols == 1) && (dst.cols == 1)) + if ((src.rows == 1) && (src.cols == 1)) src.copyTo(dst, stream); else if (src.rows == 1) src.reshape(0, src.cols).copyTo(dst, stream); From 4f0e47044277020264dc3afb69d9d893a0371048 Mon Sep 17 00:00:00 2001 From: chacha21 Date: Wed, 21 Dec 2022 15:32:29 +0100 Subject: [PATCH 6/9] updated according to suggestions more obvious elemSize test logic to dispatch to the right nppiTranspose variant added modern NPP _ctx stream support --- modules/cudaarithm/src/cuda/transpose.cu | 308 +++++++++++++---------- 1 file changed, 174 insertions(+), 134 deletions(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index d822a174dcf..afc6600bfc8 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -68,154 +68,194 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream); - const bool isNppiNativelySupported = - (srcType == CV_8UC1) || (srcType == CV_8UC3) || (srcType == CV_8UC4) || - (srcType == CV_16UC1) || (srcType == CV_16UC3) || (srcType == CV_16UC4) || - (srcType == CV_16SC1) || (srcType == CV_16SC3) || (srcType == CV_16SC4) || - (srcType == CV_32SC1) || (srcType == CV_32SC3) || (srcType == CV_32SC4) || - (srcType == CV_32FC1) || (srcType == CV_32FC3) || (srcType == CV_32FC4); - const bool isElemSizeSupportedByNppi = - (!(elemSize%1) && ((elemSize/1)<=4)) || - (!(elemSize%2) && ((elemSize/2)<=4)) || - (!(elemSize%4) && ((elemSize/4)<=4)) || - (!(elemSize%8) && ((elemSize/8)<=2)); - const bool isElemSizeSupportedByGridTranspose = - (elemSize == 1) || (elemSize == 2) || (elemSize == 4) || (elemSize == 8); - const bool isSupported = isNppiNativelySupported || isElemSizeSupportedByNppi || isElemSizeSupportedByGridTranspose; + const bool isSupported = + (elemSize == 1) || (elemSize == 2) || (elemSize == 3) || (elemSize == 4) || + (elemSize == 6) || (elemSize == 8) || (elemSize == 12) || (elemSize == 16); if (!isSupported) - CV_Error(Error::StsUnsupportedFormat, ""); + CV_Error(Error::StsUnsupportedFormat, ""); else if (src.empty()) - CV_Error(Error::StsBadArg,"image is empty"); + CV_Error(Error::StsBadArg,"image is empty"); if ((src.rows == 1) && (src.cols == 1)) - src.copyTo(dst, stream); + src.copyTo(dst, stream); else if (src.rows == 1) - src.reshape(0, src.cols).copyTo(dst, stream); - else if ((src.cols == 1) && (src.cols*src.elemSize() == src.step)) - src.reshape(0, src.cols).copyTo(dst, stream); - else if (isNppiNativelySupported) + src.reshape(0, src.cols).copyTo(dst, stream); + else if ((src.cols == 1) && src.isContinuous()) + src.reshape(0, src.cols).copyTo(dst, stream); + else { - NppStreamHandler h(StreamAccessor::getStream(stream)); - NppiSize sz; sz.width = src.cols; sz.height = src.rows; - if (srcType == CV_8UC1) - nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), + if (!stream) + { + //native implementation + if (srcType == CV_8UC1) + nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), 
static_cast(dst.step), sz) ); + else if (srcType == CV_8UC3) + nppSafeCall( nppiTranspose_8u_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_8UC4) + nppSafeCall( nppiTranspose_8u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16UC1) + nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16UC3) + nppSafeCall( nppiTranspose_16u_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16UC4) + nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16SC1) + nppSafeCall( nppiTranspose_16s_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16SC3) + nppSafeCall( nppiTranspose_16s_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_16SC4) + nppSafeCall( nppiTranspose_16s_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32SC1) + nppSafeCall( nppiTranspose_32s_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32SC3) + nppSafeCall( nppiTranspose_32s_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32SC4) + nppSafeCall( nppiTranspose_32s_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32FC1) + nppSafeCall( nppiTranspose_32f_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32FC3) + nppSafeCall( nppiTranspose_32f_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (srcType == CV_32FC4) + nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + //reinterpretation + else if (elemSize == 1) + nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (elemSize == 2) + nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (elemSize == 3) + nppSafeCall( nppiTranspose_8u_C3R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_8UC3) - nppSafeCall( nppiTranspose_8u_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_8UC4) - nppSafeCall( nppiTranspose_8u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_16UC1) - nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_16UC3) - nppSafeCall( nppiTranspose_16u_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_16UC4) - nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_16SC1) - nppSafeCall( nppiTranspose_16s_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_16SC3) - nppSafeCall( nppiTranspose_16s_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == 
CV_16SC4) - nppSafeCall( nppiTranspose_16s_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_32SC1) - nppSafeCall( nppiTranspose_32s_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_32SC3) - nppSafeCall( nppiTranspose_32s_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_32SC4) - nppSafeCall( nppiTranspose_32s_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_32FC1) - nppSafeCall( nppiTranspose_32f_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_32FC3) - nppSafeCall( nppiTranspose_32f_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_32FC4) - nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); + else if (elemSize == 4) + nppSafeCall( nppiTranspose_32s_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (elemSize == 6) + nppSafeCall( nppiTranspose_16u_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (elemSize == 8) + nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (elemSize == 12) + nppSafeCall( nppiTranspose_32s_C3R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + else if (elemSize == 16) + nppSafeCall( nppiTranspose_32s_C4R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz) ); + }//end if (!stream) + else//if (stream != 0) + { + NppStreamContext ctx; + nppSafeCall( nppGetStreamContext(&ctx) ); + ctx.hStream = StreamAccessor::getStream(stream); + + //native implementation + if (srcType == CV_8UC1) + nppSafeCall( nppiTranspose_8u_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_8UC3) + nppSafeCall( nppiTranspose_8u_C3R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_8UC4) + nppSafeCall( nppiTranspose_8u_C4R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_16UC1) + nppSafeCall( nppiTranspose_16u_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_16UC3) + nppSafeCall( nppiTranspose_16u_C3R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_16UC4) + nppSafeCall( nppiTranspose_16u_C4R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_16SC1) + nppSafeCall( nppiTranspose_16s_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_16SC3) + nppSafeCall( nppiTranspose_16s_C3R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_16SC4) + nppSafeCall( nppiTranspose_16s_C4R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_32SC1) + nppSafeCall( nppiTranspose_32s_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_32SC3) + nppSafeCall( nppiTranspose_32s_C3R_Ctx(src.ptr(), static_cast(src.step), + 
dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_32SC4) + nppSafeCall( nppiTranspose_32s_C4R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_32FC1) + nppSafeCall( nppiTranspose_32f_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_32FC3) + nppSafeCall( nppiTranspose_32f_C3R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (srcType == CV_32FC4) + nppSafeCall( nppiTranspose_32f_C4R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + //reinterpretation + else if (elemSize == 1) + nppSafeCall( nppiTranspose_8u_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (elemSize == 2) + nppSafeCall( nppiTranspose_16u_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (elemSize == 3) + nppSafeCall( nppiTranspose_8u_C3R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (elemSize == 4) + nppSafeCall( nppiTranspose_32s_C1R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (elemSize == 6) + nppSafeCall( nppiTranspose_16u_C3R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (elemSize == 8) + nppSafeCall( nppiTranspose_16u_C4R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (elemSize == 12) + nppSafeCall( nppiTranspose_32s_C3R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + else if (elemSize == 16) + nppSafeCall( nppiTranspose_32s_C4R_Ctx(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, ctx) ); + }//end if (stream != 0) if (!stream) - CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); - }//end if (isNppiNativelySupported) - else if (isElemSizeSupportedByNppi) - { - NppStreamHandler h(StreamAccessor::getStream(stream)); - - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - if (!(elemSize%1) && ((elemSize/1)==1)) - nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%1) && ((elemSize/1)==2)) - nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%1) && ((elemSize/1)==3)) - nppSafeCall( nppiTranspose_8u_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%1) && ((elemSize/1)==4)) - nppSafeCall( nppiTranspose_8u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%2) && ((elemSize/2)==1)) - nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%2) && ((elemSize/2)==2)) - nppSafeCall( nppiTranspose_8u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%2) && ((elemSize/2)==3)) - nppSafeCall( nppiTranspose_16u_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%2) && ((elemSize/2)==4)) - nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%4) && ((elemSize/4)==1)) - nppSafeCall( nppiTranspose_32f_C1R(src.ptr(), 
static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%4) && ((elemSize/4)==2)) - nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%4) && ((elemSize/4)==3)) - nppSafeCall( nppiTranspose_32f_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%4) && ((elemSize/4)==4)) - nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%8) && ((elemSize/8)==1)) - nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (!(elemSize%8) && ((elemSize/8)==2)) - nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - - if (!stream) - CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); - }//end if (isElemSizeSupportedByNppi) - else if (isElemSizeSupportedByGridTranspose) - { - if (elemSize == 1) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if (elemSize == 2) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if (elemSize == 4) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if (elemSize == 8) - gridTranspose(globPtr(src), globPtr(dst), stream); - }//end if (isElemSizeSupportedByGridTranspose) + CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); + }//end if + + /* + if (elemSize == 1) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 2) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 4) + gridTranspose(globPtr(src), globPtr(dst), stream); + else if (elemSize == 8) + gridTranspose(globPtr(src), globPtr(dst), stream); + */ syncOutput(dst, _dst, stream); } From 43a1691ab06cee401c8de6fd63b9cbac4dcb403e Mon Sep 17 00:00:00 2001 From: chacha21 Date: Thu, 22 Dec 2022 14:20:23 +0100 Subject: [PATCH 7/9] removed useless and dead code --- modules/cudaarithm/src/cuda/transpose.cu | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index afc6600bfc8..3883e405f7a 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -61,10 +61,7 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) GpuMat src = getInputMat(_src, stream); const int srcType = src.type(); - const int srcDepth = src.depth(); - const int srcCn = src.channels(); const size_t elemSize = src.elemSize(); - const size_t elemSize1 = src.elemSize1(); GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream); @@ -75,7 +72,7 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) if (!isSupported) CV_Error(Error::StsUnsupportedFormat, ""); else if (src.empty()) - CV_Error(Error::StsBadArg,"image is empty"); + CV_Error(Error::StsBadArg, "image is empty"); if ((src.rows == 1) && (src.cols == 1)) src.copyTo(dst, stream); @@ -241,23 +238,7 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) nppSafeCall( nppiTranspose_32s_C4R_Ctx(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, ctx) ); }//end if (stream != 0) - - if (!stream) - CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); }//end if - - /* - if (elemSize == 1) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if (elemSize == 2) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if 
(elemSize == 4) - gridTranspose(globPtr(src), globPtr(dst), stream); - else if (elemSize == 8) - gridTranspose(globPtr(src), globPtr(dst), stream); - */ - - syncOutput(dst, _dst, stream); } #endif From 29cb16a37ff15fdcb24a9f181c3668af69611859 Mon Sep 17 00:00:00 2001 From: chacha21 Date: Tue, 3 Jan 2023 10:10:56 +0100 Subject: [PATCH 8/9] support legacy npp stream handling --- modules/cudaarithm/src/cuda/transpose.cu | 26 +++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index 3883e405f7a..64cf3d838c9 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -56,6 +56,8 @@ using namespace cv; using namespace cv::cuda; using namespace cv::cudev; +#define USE_NPP_STREAM_CONTEXT (NPP_VERSION >= (10 * 1000 + 1 * 100 + 0)) + void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) { GpuMat src = getInputMat(_src, stream); @@ -86,8 +88,17 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) sz.width = src.cols; sz.height = src.rows; - if (!stream) + #if USE_NPP_STREAM_CONTEXT + constexpr const bool useLegacyStream = false; + #else + constexpr const bool useLegacyStream = true; + #endif + cudaStream_t _stream = StreamAccessor::getStream(stream); + + if (!_stream || useLegacyStream) { + NppStreamHandler h(_stream); + //native implementation if (srcType == CV_8UC1) nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), @@ -159,12 +170,16 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) else if (elemSize == 16) nppSafeCall( nppiTranspose_32s_C4R(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - }//end if (!stream) - else//if (stream != 0) + + if (_stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + }//end if (!_stream || useLegacyStream) + else//if ((_stream != 0) && !useLegacyStream) { + #if USE_NPP_STREAM_CONTEXT NppStreamContext ctx; nppSafeCall( nppGetStreamContext(&ctx) ); - ctx.hStream = StreamAccessor::getStream(stream); + ctx.hStream = _stream; //native implementation if (srcType == CV_8UC1) @@ -237,7 +252,8 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) else if (elemSize == 16) nppSafeCall( nppiTranspose_32s_C4R_Ctx(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, ctx) ); - }//end if (stream != 0) + #endif + }//end if ((_stream != 0) && !useLegacyStream) }//end if } From 367269411f99eba08fe66bdaf4a41b4880393faf Mon Sep 17 00:00:00 2001 From: chacha21 Date: Wed, 4 Jan 2023 09:47:35 +0100 Subject: [PATCH 9/9] Style : use tables of functions --- modules/cudaarithm/src/cuda/transpose.cu | 253 +++++++++-------------- 1 file changed, 92 insertions(+), 161 deletions(-) diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index 64cf3d838c9..4a42df382e6 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -58,6 +58,58 @@ using namespace cv::cudev; #define USE_NPP_STREAM_CONTEXT (NPP_VERSION >= (10 * 1000 + 1 * 100 + 0)) +namespace +{ + template struct NppTransposeFunc + { + typedef typename NPPTypeTraits::npp_type npp_type; + + typedef NppStatus (*func_t)(const npp_type* pSrc, int srcStep, npp_type* pDst, int dstStep, NppiSize srcSize); + #if USE_NPP_STREAM_CONTEXT + typedef NppStatus (*func_ctx_t)(const npp_type* pSrc, int srcStep, npp_type* pDst, int dstStep, NppiSize 
srcSize, NppStreamContext stream); + #endif + }; + + template <int DEPTH, typename NppTransposeFunc<DEPTH>::func_t func> struct NppTranspose + { + typedef typename NppTransposeFunc<DEPTH>::npp_type npp_type; + + static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream) + { + NppiSize srcsz; + srcsz.height = src.rows; + srcsz.width = src.cols; + + cv::cuda::NppStreamHandler h(stream); + + nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), dst.ptr<npp_type>(), static_cast<int>(dst.step), srcsz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + #if USE_NPP_STREAM_CONTEXT + template <int DEPTH, typename NppTransposeFunc<DEPTH>::func_ctx_t func> struct NppTransposeCtx + { + typedef typename NppTransposeFunc<DEPTH>::npp_type npp_type; + + static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream) + { + NppiSize srcsz; + srcsz.height = src.rows; + srcsz.width = src.cols; + + NppStreamContext ctx; + nppSafeCall( nppGetStreamContext(&ctx) ); + ctx.hStream = stream; + + nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), dst.ptr<npp_type>(), static_cast<int>(dst.step), srcsz, ctx) ); + } + }; + #endif +} + void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) { GpuMat src = getInputMat(_src, stream); @@ -84,176 +136,55 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) src.reshape(0, src.cols).copyTo(dst, stream); else { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - #if USE_NPP_STREAM_CONTEXT - constexpr const bool useLegacyStream = false; + constexpr const bool useNppStreamCtx = true; #else - constexpr const bool useLegacyStream = true; + constexpr const bool useNppStreamCtx = false; #endif cudaStream_t _stream = StreamAccessor::getStream(stream); - if (!_stream || useLegacyStream) + if (!_stream || !useNppStreamCtx) { - NppStreamHandler h(_stream); - - //native implementation - if (srcType == CV_8UC1) - nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), - dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_8UC3) - nppSafeCall( nppiTranspose_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), - dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_8UC4) - nppSafeCall( nppiTranspose_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), - dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_16UC1) - nppSafeCall( nppiTranspose_16u_C1R(src.ptr<Npp16u>(), static_cast<int>(src.step), - dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_16UC3) - nppSafeCall( nppiTranspose_16u_C3R(src.ptr<Npp16u>(), static_cast<int>(src.step), - dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_16UC4) - nppSafeCall( nppiTranspose_16u_C4R(src.ptr<Npp16u>(), static_cast<int>(src.step), - dst.ptr<Npp16u>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_16SC1) - nppSafeCall( nppiTranspose_16s_C1R(src.ptr<Npp16s>(), static_cast<int>(src.step), - dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_16SC3) - nppSafeCall( nppiTranspose_16s_C3R(src.ptr<Npp16s>(), static_cast<int>(src.step), - dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_16SC4) - nppSafeCall( nppiTranspose_16s_C4R(src.ptr<Npp16s>(), static_cast<int>(src.step), - dst.ptr<Npp16s>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_32SC1) - nppSafeCall( nppiTranspose_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), - dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_32SC3) - nppSafeCall( nppiTranspose_32s_C3R(src.ptr<Npp32s>(), static_cast<int>(src.step), - dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) ); - else if (srcType == CV_32SC4) - nppSafeCall( nppiTranspose_32s_C4R(src.ptr<Npp32s>(), static_cast<int>(src.step), - dst.ptr<Npp32s>(),
static_cast(dst.step), sz) ); - else if (srcType == CV_32FC1) - nppSafeCall( nppiTranspose_32f_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_32FC3) - nppSafeCall( nppiTranspose_32f_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (srcType == CV_32FC4) - nppSafeCall( nppiTranspose_32f_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - //reinterpretation - else if (elemSize == 1) - nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (elemSize == 2) - nppSafeCall( nppiTranspose_16u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (elemSize == 3) - nppSafeCall( nppiTranspose_8u_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (elemSize == 4) - nppSafeCall( nppiTranspose_32s_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (elemSize == 6) - nppSafeCall( nppiTranspose_16u_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (elemSize == 8) - nppSafeCall( nppiTranspose_16u_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (elemSize == 12) - nppSafeCall( nppiTranspose_32s_C3R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - else if (elemSize == 16) - nppSafeCall( nppiTranspose_32s_C4R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); - - if (_stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - }//end if (!_stream || useLegacyStream) - else//if ((_stream != 0) && !useLegacyStream) + typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream); + //if no direct mapping exists between DEPTH+CHANNELS and the nppiTranspose supported type, we use a nppiTranspose of a similar elemSize + static const func_t funcs[8][4] = { + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, nullptr, nullptr}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call} + }; + + const func_t func = funcs[src.depth()][src.channels() - 1]; + CV_Assert(func != nullptr); + + func(src, dst, _stream); + }//end if (!_stream || !useNppStreamCtx) + else//if ((_stream != 0) && useNppStreamCtx) { #if USE_NPP_STREAM_CONTEXT - NppStreamContext ctx; - nppSafeCall( nppGetStreamContext(&ctx) ); - ctx.hStream = _stream; - - //native implementation - if (srcType == CV_8UC1) - nppSafeCall( nppiTranspose_8u_C1R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_8UC3) - nppSafeCall( nppiTranspose_8u_C3R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_8UC4) - nppSafeCall( nppiTranspose_8u_C4R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_16UC1) - 
nppSafeCall( nppiTranspose_16u_C1R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_16UC3) - nppSafeCall( nppiTranspose_16u_C3R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_16UC4) - nppSafeCall( nppiTranspose_16u_C4R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_16SC1) - nppSafeCall( nppiTranspose_16s_C1R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_16SC3) - nppSafeCall( nppiTranspose_16s_C3R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_16SC4) - nppSafeCall( nppiTranspose_16s_C4R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_32SC1) - nppSafeCall( nppiTranspose_32s_C1R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_32SC3) - nppSafeCall( nppiTranspose_32s_C3R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_32SC4) - nppSafeCall( nppiTranspose_32s_C4R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_32FC1) - nppSafeCall( nppiTranspose_32f_C1R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_32FC3) - nppSafeCall( nppiTranspose_32f_C3R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (srcType == CV_32FC4) - nppSafeCall( nppiTranspose_32f_C4R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - //reinterpretation - else if (elemSize == 1) - nppSafeCall( nppiTranspose_8u_C1R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (elemSize == 2) - nppSafeCall( nppiTranspose_16u_C1R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (elemSize == 3) - nppSafeCall( nppiTranspose_8u_C3R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (elemSize == 4) - nppSafeCall( nppiTranspose_32s_C1R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (elemSize == 6) - nppSafeCall( nppiTranspose_16u_C3R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (elemSize == 8) - nppSafeCall( nppiTranspose_16u_C4R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (elemSize == 12) - nppSafeCall( nppiTranspose_32s_C3R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); - else if (elemSize == 16) - nppSafeCall( nppiTranspose_32s_C4R_Ctx(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz, ctx) ); + typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream); + //if no direct mapping exists between DEPTH+CHANNELS and the nppiTranspose supported type, we use a nppiTranspose of a similar elemSize + static const func_t funcs[8][4] = { + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, 
NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, nullptr, nullptr}, + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call} + }; + + const func_t func = funcs[src.depth()][src.channels() - 1]; + CV_Assert(func != nullptr); + + func(src, dst, _stream); #endif - }//end if ((_stream != 0) && !useLegacyStream) + }//end if ((_stream != 0) && useNppStreamCtx) }//end if }
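
The reinterpretation idea that runs through patches 1, 6 and 9 is easy to state outside the diff: a transpose only moves whole elements, so any matrix whose elemSize() equals one of the layouts NPP transposes natively (1, 2, 3, 4, 6, 8, 12 or 16 bytes) can be handed to that NPP call with the pointers reinterpreted. Below is a minimal host-side sketch of exactly that mapping, assuming an initialized CUDA device and a pre-allocated destination of transposed shape; the checkNpp helper and the function name are invented for the example (the patch itself goes through nppSafeCall and OpenCV's stream handling):

    #include <nppi.h>
    #include <opencv2/core/cuda.hpp>
    #include <stdexcept>

    // Error check invented for this sketch; OpenCV's code uses nppSafeCall().
    static void checkNpp(NppStatus s)
    {
        if (s != NPP_SUCCESS) throw std::runtime_error("NPP call failed");
    }

    // Dispatch purely on element size: CV_8SC4, CV_32SC1 and CV_32FC1 all have
    // elemSize() == 4, so all of them can be transposed by nppiTranspose_32s_C1R.
    // These calls run on NPP's current stream and are synchronous with respect to it.
    static void transposeByElemSize(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst)
    {
        CV_Assert(dst.rows == src.cols && dst.cols == src.rows && dst.type() == src.type());

        NppiSize sz;
        sz.width  = src.cols;
        sz.height = src.rows;
        const int srcStep = static_cast<int>(src.step);
        const int dstStep = static_cast<int>(dst.step);

        switch (src.elemSize())
        {
        case 1:  checkNpp(nppiTranspose_8u_C1R (src.ptr<Npp8u>(),  srcStep, dst.ptr<Npp8u>(),  dstStep, sz)); break;
        case 2:  checkNpp(nppiTranspose_16u_C1R(src.ptr<Npp16u>(), srcStep, dst.ptr<Npp16u>(), dstStep, sz)); break;
        case 3:  checkNpp(nppiTranspose_8u_C3R (src.ptr<Npp8u>(),  srcStep, dst.ptr<Npp8u>(),  dstStep, sz)); break;
        case 4:  checkNpp(nppiTranspose_32s_C1R(src.ptr<Npp32s>(), srcStep, dst.ptr<Npp32s>(), dstStep, sz)); break;
        case 6:  checkNpp(nppiTranspose_16u_C3R(src.ptr<Npp16u>(), srcStep, dst.ptr<Npp16u>(), dstStep, sz)); break;
        case 8:  checkNpp(nppiTranspose_16u_C4R(src.ptr<Npp16u>(), srcStep, dst.ptr<Npp16u>(), dstStep, sz)); break;
        case 12: checkNpp(nppiTranspose_32s_C3R(src.ptr<Npp32s>(), srcStep, dst.ptr<Npp32s>(), dstStep, sz)); break;
        case 16: checkNpp(nppiTranspose_32s_C4R(src.ptr<Npp32s>(), srcStep, dst.ptr<Npp32s>(), dstStep, sz)); break;
        default: CV_Error(cv::Error::StsUnsupportedFormat, "unsupported element size");
        }
    }

This is the same mapping the final patch encodes per (depth, channels) pair in its function tables; once every supported element size has an NPP equivalent, the gridTranspose fallback from the earlier revisions is no longer needed.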
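Patches 6 and 8 also change the stream plumbing: instead of the legacy NppStreamHandler route, which sets a process-wide NPP stream, the _Ctx variants take an NppStreamContext per call when the linked NPP is recent enough (that is what the USE_NPP_STREAM_CONTEXT test against NPP 10.1 encodes). A small sketch of that pattern, assuming an NPP build that ships nppiTranspose_8u_C1R_Ctx; the function name here is illustrative, not the PR's:

    #include <cuda_runtime.h>
    #include <npp.h>

    // Run one NPP transpose on a caller-supplied CUDA stream without touching the
    // process-wide stream state used by the legacy nppSetStream()/NppStreamHandler path.
    static NppStatus transposeOnStream(const Npp8u* src, int srcStep,
                                       Npp8u* dst, int dstStep,
                                       NppiSize size, cudaStream_t stream)
    {
        NppStreamContext ctx;
        NppStatus status = nppGetStreamContext(&ctx); // fills in device properties for the current device
        if (status != NPP_SUCCESS)
            return status;

        ctx.hStream = stream; // only the stream is overridden, as in the patch
        return nppiTranspose_8u_C1R_Ctx(src, srcStep, dst, dstStep, size, ctx);
    }

On older toolkits the guard falls back to NppStreamHandler, and a cudaDeviceSynchronize() is issued when the caller passed the default stream, matching the behaviour of the original code.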
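Finally, the degenerate-shape fast path from patches 2 through 5 relies on a property of reshape that is easy to miss: a single-row matrix is continuous whatever its step is, while a single-column matrix can only be reshaped when it really is stored continuously. A host-side illustration of the same check with cv::Mat (the device code does the equivalent with GpuMat::reshape and copyTo):

    #include <opencv2/core.hpp>

    // Transpose of a 1xN or Nx1 matrix is just a reshape plus a copy.
    static cv::Mat transposeDegenerate(const cv::Mat& src)
    {
        CV_Assert(src.rows == 1 || src.cols == 1);

        if (src.rows == 1)
            return src.reshape(0, src.cols).clone(); // a row is always continuous, its stride does not matter

        CV_Assert(src.isContinuous());               // an Nx1 column with row padding cannot be reshaped
        return src.reshape(0, 1).clone();            // Nx1 column -> 1xN row
    }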