diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index bfe50bd34fb..4a42df382e6 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -56,40 +56,136 @@ using namespace cv; using namespace cv::cuda; using namespace cv::cudev; -void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) -{ - GpuMat src = getInputMat(_src, stream); +#define USE_NPP_STREAM_CONTEXT (NPP_VERSION >= (10 * 1000 + 1 * 100 + 0)) - const size_t elemSize = src.elemSize(); +namespace +{ + template struct NppTransposeFunc + { + typedef typename NPPTypeTraits::npp_type npp_type; - CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 ); + typedef NppStatus (*func_t)(const npp_type* pSrc, int srcStep, npp_type* pDst, int dstStep, NppiSize srcSize); + #if USE_NPP_STREAM_CONTEXT + typedef NppStatus (*func_ctx_t)(const npp_type* pSrc, int srcStep, npp_type* pDst, int dstStep, NppiSize srcSize, NppStreamContext stream); + #endif + }; - GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream); + template ::func_t func> struct NppTranspose + { + typedef typename NppTransposeFunc::npp_type npp_type; - if (elemSize == 1) + static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream) { - NppStreamHandler h(StreamAccessor::getStream(stream)); + NppiSize srcsz; + srcsz.height = src.rows; + srcsz.width = src.cols; - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; + cv::cuda::NppStreamHandler h(stream); - nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), static_cast(src.step), - dst.ptr(), static_cast(dst.step), sz) ); + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), srcsz) ); - if (!stream) - CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } - else if (elemSize == 4) - { - gridTranspose(globPtr(src), globPtr(dst), stream); - } - else // if (elemSize == 8) + }; + + #if USE_NPP_STREAM_CONTEXT + template ::func_ctx_t func> struct NppTransposeCtx + { + typedef typename NppTransposeFunc::npp_type npp_type; + + static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream) { - gridTranspose(globPtr(src), globPtr(dst), stream); + NppiSize srcsz; + srcsz.height = src.rows; + srcsz.width = src.cols; + + NppStreamContext ctx; + nppSafeCall( nppGetStreamContext(&ctx) ); + ctx.hStream = stream; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), srcsz, ctx) ); } + }; + #endif +} + +void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) +{ + GpuMat src = getInputMat(_src, stream); - syncOutput(dst, _dst, stream); + const int srcType = src.type(); + const size_t elemSize = src.elemSize(); + + GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream); + + const bool isSupported = + (elemSize == 1) || (elemSize == 2) || (elemSize == 3) || (elemSize == 4) || + (elemSize == 6) || (elemSize == 8) || (elemSize == 12) || (elemSize == 16); + + if (!isSupported) + CV_Error(Error::StsUnsupportedFormat, ""); + else if (src.empty()) + CV_Error(Error::StsBadArg, "image is empty"); + + if ((src.rows == 1) && (src.cols == 1)) + src.copyTo(dst, stream); + else if (src.rows == 1) + src.reshape(0, src.cols).copyTo(dst, stream); + else if ((src.cols == 1) && src.isContinuous()) + src.reshape(0, src.cols).copyTo(dst, stream); + else + { + #if USE_NPP_STREAM_CONTEXT + constexpr const bool useNppStreamCtx = true; + #else + constexpr const bool useNppStreamCtx = false; + #endif + cudaStream_t _stream = StreamAccessor::getStream(stream); + + if (!_stream || !useNppStreamCtx) + { + typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream); + //if no direct mapping exists between DEPTH+CHANNELS and the nppiTranspose supported type, we use a nppiTranspose of a similar elemSize + static const func_t funcs[8][4] = { + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call}, + {NppTranspose::call, NppTranspose::call, nullptr, nullptr}, + {NppTranspose::call, NppTranspose::call, NppTranspose::call, NppTranspose::call} + }; + + const func_t func = funcs[src.depth()][src.channels() - 1]; + CV_Assert(func != nullptr); + + func(src, dst, _stream); + }//end if (!_stream || !useNppStreamCtx) + else//if ((_stream != 0) && useNppStreamCtx) + { + #if USE_NPP_STREAM_CONTEXT + typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream); + //if no direct mapping exists between DEPTH+CHANNELS and the nppiTranspose supported type, we use a nppiTranspose of a similar elemSize + static const func_t funcs[8][4] = { + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call}, + {NppTransposeCtx::call, NppTransposeCtx::call, nullptr, nullptr}, + {NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call, NppTransposeCtx::call} + }; + + const func_t func = funcs[src.depth()][src.channels() - 1]; + CV_Assert(func != nullptr); + + func(src, dst, _stream); + #endif + }//end if ((_stream != 0) && useNppStreamCtx) + }//end if } #endif diff --git a/modules/cudaarithm/test/test_core.cpp b/modules/cudaarithm/test/test_core.cpp index bc8f3737e53..e5c30f703ab 100644 --- a/modules/cudaarithm/test/test_core.cpp +++ b/modules/cudaarithm/test/test_core.cpp @@ -231,12 +231,31 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Transpose, testing::Combine( ALL_DEVICES, DIFFERENT_SIZES, testing::Values(MatType(CV_8UC1), + MatType(CV_8UC2), + MatType(CV_8UC3), MatType(CV_8UC4), + MatType(CV_8SC1), + MatType(CV_8SC2), + MatType(CV_8SC3), + MatType(CV_8SC4), + MatType(CV_16UC1), MatType(CV_16UC2), + MatType(CV_16UC3), + MatType(CV_16UC4), + MatType(CV_16SC1), MatType(CV_16SC2), + MatType(CV_16SC3), + MatType(CV_16SC4), MatType(CV_32SC1), MatType(CV_32SC2), - MatType(CV_64FC1)), + MatType(CV_32SC3), + MatType(CV_32SC4), + MatType(CV_32FC1), + MatType(CV_32FC2), + MatType(CV_32FC3), + MatType(CV_32FC4), + MatType(CV_64FC1), + MatType(CV_64FC2)), WHOLE_SUBMAT)); ////////////////////////////////////////////////////////////////////////////////