Skip to content

More data types supported in cv::cuda::transpose() #3371

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: 4.x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 118 additions & 22 deletions modules/cudaarithm/src/cuda/transpose.cu
Original file line number Diff line number Diff line change
Expand Up @@ -56,40 +56,136 @@ using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;

void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
{
GpuMat src = getInputMat(_src, stream);
#define USE_NPP_STREAM_CONTEXT (NPP_VERSION >= (10 * 1000 + 1 * 100 + 0))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be better to define this globally somewhere although not sure where, maybe modules/cudev/include/opencv2/cudev/common.hpp.


const size_t elemSize = src.elemSize();
namespace
{
template <int DEPTH> struct NppTransposeFunc
{
typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;

CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 );
typedef NppStatus (*func_t)(const npp_type* pSrc, int srcStep, npp_type* pDst, int dstStep, NppiSize srcSize);
#if USE_NPP_STREAM_CONTEXT
typedef NppStatus (*func_ctx_t)(const npp_type* pSrc, int srcStep, npp_type* pDst, int dstStep, NppiSize srcSize, NppStreamContext stream);
#endif
};

GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);
template <int DEPTH, typename NppTransposeFunc<DEPTH>::func_t func> struct NppTranspose
{
typedef typename NppTransposeFunc<DEPTH>::npp_type npp_type;

if (elemSize == 1)
static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream)
{
NppStreamHandler h(StreamAccessor::getStream(stream));
NppiSize srcsz;
srcsz.height = src.rows;
srcsz.width = src.cols;

NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
cv::cuda::NppStreamHandler h(stream);

nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), dst.ptr<npp_type>(), static_cast<int>(dst.step), srcsz) );

if (!stream)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
else if (elemSize == 4)
{
gridTranspose(globPtr<int>(src), globPtr<int>(dst), stream);
}
else // if (elemSize == 8)
};

#if USE_NPP_STREAM_CONTEXT
template <int DEPTH, typename NppTransposeFunc<DEPTH>::func_ctx_t func> struct NppTransposeCtx
{
typedef typename NppTransposeFunc<DEPTH>::npp_type npp_type;

static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream)
{
gridTranspose(globPtr<double>(src), globPtr<double>(dst), stream);
NppiSize srcsz;
srcsz.height = src.rows;
srcsz.width = src.cols;

NppStreamContext ctx;
nppSafeCall( nppGetStreamContext(&ctx) );
ctx.hStream = stream;

nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), dst.ptr<npp_type>(), static_cast<int>(dst.step), srcsz, ctx) );
}
};
#endif
}

void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
{
GpuMat src = getInputMat(_src, stream);

syncOutput(dst, _dst, stream);
const int srcType = src.type();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this redundant?

const size_t elemSize = src.elemSize();

GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);

const bool isSupported =
(elemSize == 1) || (elemSize == 2) || (elemSize == 3) || (elemSize == 4) ||
(elemSize == 6) || (elemSize == 8) || (elemSize == 12) || (elemSize == 16);

if (!isSupported)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would CV_Assert() be better here?

CV_Error(Error::StsUnsupportedFormat, "");
else if (src.empty())
CV_Error(Error::StsBadArg, "image is empty");

if ((src.rows == 1) && (src.cols == 1))
src.copyTo(dst, stream);
else if (src.rows == 1)
src.reshape(0, src.cols).copyTo(dst, stream);
else if ((src.cols == 1) && src.isContinuous())
src.reshape(0, src.cols).copyTo(dst, stream);
else
{
#if USE_NPP_STREAM_CONTEXT
constexpr const bool useNppStreamCtx = true;
#else
constexpr const bool useNppStreamCtx = false;
#endif
cudaStream_t _stream = StreamAccessor::getStream(stream);

if (!_stream || !useNppStreamCtx)
Copy link
Contributor

@cudawarped cudawarped Jan 5, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be better to always use the old api when CUDA SDK < 10.1 and the newer one otherwise regardless of whether there is a stream or not? That way it should be easier to carve out the older code when it is depreciated in newer CUDA versions.

See https://github.com/cudawarped/opencv_contrib/blob/e40c43d96a22edaa3fbb880e957a3753938dc4f0/modules/cudaarithm/src/cuda/transpose.cu

{
typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream);
//if no direct mapping exists between DEPTH+CHANNELS and the nppiTranspose supported type, we use a nppiTranspose of a similar elemSize
static const func_t funcs[8][4] = {
{NppTranspose<CV_8U, nppiTranspose_8u_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C3R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C4R>::call},
{NppTranspose<CV_8U, nppiTranspose_8u_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C3R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C4R>::call},
{NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C3R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C4R>::call},
{NppTranspose<CV_16S, nppiTranspose_16s_C1R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C3R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call},
{NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C3R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C4R>::call},
{NppTranspose<CV_32F, nppiTranspose_32f_C1R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call, NppTranspose<CV_32F, nppiTranspose_32f_C3R>::call, NppTranspose<CV_32F, nppiTranspose_32f_C4R>::call},
{NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C4R>::call, nullptr, nullptr},
{NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C3R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C4R>::call}
};

const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != nullptr);

func(src, dst, _stream);
}//end if (!_stream || !useNppStreamCtx)
else//if ((_stream != 0) && useNppStreamCtx)
{
#if USE_NPP_STREAM_CONTEXT
typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream);
//if no direct mapping exists between DEPTH+CHANNELS and the nppiTranspose supported type, we use a nppiTranspose of a similar elemSize
static const func_t funcs[8][4] = {
{NppTransposeCtx<CV_8U, nppiTranspose_8u_C1R_Ctx>::call, NppTransposeCtx<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTransposeCtx<CV_8U, nppiTranspose_8u_C3R_Ctx>::call, NppTransposeCtx<CV_8U, nppiTranspose_8u_C4R_Ctx>::call},
{NppTransposeCtx<CV_8U, nppiTranspose_8u_C1R_Ctx>::call, NppTransposeCtx<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTransposeCtx<CV_8U, nppiTranspose_8u_C3R_Ctx>::call, NppTransposeCtx<CV_8U, nppiTranspose_8u_C4R_Ctx>::call},
{NppTransposeCtx<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTransposeCtx<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTransposeCtx<CV_16U, nppiTranspose_16u_C3R_Ctx>::call, NppTransposeCtx<CV_16U, nppiTranspose_16u_C4R_Ctx>::call},
{NppTransposeCtx<CV_16S, nppiTranspose_16s_C1R_Ctx>::call, NppTransposeCtx<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTransposeCtx<CV_16S, nppiTranspose_16s_C3R_Ctx>::call, NppTransposeCtx<CV_16S, nppiTranspose_16s_C4R_Ctx>::call},
{NppTransposeCtx<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTransposeCtx<CV_16S, nppiTranspose_16s_C4R_Ctx>::call, NppTransposeCtx<CV_32S, nppiTranspose_32s_C3R_Ctx>::call, NppTransposeCtx<CV_32S, nppiTranspose_32s_C4R_Ctx>::call},
{NppTransposeCtx<CV_32F, nppiTranspose_32f_C1R_Ctx>::call, NppTransposeCtx<CV_16S, nppiTranspose_16s_C4R_Ctx>::call, NppTransposeCtx<CV_32F, nppiTranspose_32f_C3R_Ctx>::call, NppTransposeCtx<CV_32F, nppiTranspose_32f_C4R_Ctx>::call},
{NppTransposeCtx<CV_16S, nppiTranspose_16s_C4R_Ctx>::call, NppTransposeCtx<CV_32S, nppiTranspose_32s_C4R_Ctx>::call, nullptr, nullptr},
{NppTransposeCtx<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTransposeCtx<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTransposeCtx<CV_16U, nppiTranspose_16u_C3R_Ctx>::call, NppTransposeCtx<CV_16U, nppiTranspose_16u_C4R_Ctx>::call}
};

const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != nullptr);

func(src, dst, _stream);
#endif
}//end if ((_stream != 0) && useNppStreamCtx)
}//end if
}

#endif
21 changes: 20 additions & 1 deletion modules/cudaarithm/test/test_core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,12 +231,31 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Transpose, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1),
MatType(CV_8UC2),
MatType(CV_8UC3),
MatType(CV_8UC4),
MatType(CV_8SC1),
MatType(CV_8SC2),
MatType(CV_8SC3),
MatType(CV_8SC4),
MatType(CV_16UC1),
MatType(CV_16UC2),
MatType(CV_16UC3),
MatType(CV_16UC4),
MatType(CV_16SC1),
MatType(CV_16SC2),
MatType(CV_16SC3),
MatType(CV_16SC4),
MatType(CV_32SC1),
MatType(CV_32SC2),
MatType(CV_64FC1)),
MatType(CV_32SC3),
MatType(CV_32SC4),
MatType(CV_32FC1),
MatType(CV_32FC2),
MatType(CV_32FC3),
MatType(CV_32FC4),
MatType(CV_64FC1),
MatType(CV_64FC2)),
WHOLE_SUBMAT));

////////////////////////////////////////////////////////////////////////////////
Expand Down