Commit e40c43d

chacha21 authored and cudawarped committed

cudaarithm: add more datatypes to npp transpose and update to stream context api for npp >=10.1

1 parent e247b68 · commit e40c43d

File tree

3 files changed (+99, -27 lines)

modules/cudaarithm/src/cuda/transpose.cu

Lines changed: 74 additions & 25 deletions
@@ -56,40 +56,89 @@ using namespace cv;
 using namespace cv::cuda;
 using namespace cv::cudev;
 
-void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
+namespace
 {
-    GpuMat src = getInputMat(_src, stream);
-
-    const size_t elemSize = src.elemSize();
+    template <int DEPTH> struct NppTransposeFunc
+    {
+        typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
 
-    CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 );
+#if CV_USE_NPP_STREAM_CTX
+        typedef NppStatus(*func_t)(const npp_type* pSrc, int srcStep, npp_type* pDst, int dstStep, NppiSize srcSize, NppStreamContext stream);
+#else
+        typedef NppStatus(*func_t)(const npp_type* pSrc, int srcStep, npp_type* pDst, int dstStep, NppiSize srcSize);
+#endif
+    };
 
-    GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);
+    template <int DEPTH, typename NppTransposeFunc<DEPTH>::func_t func> struct NppTranspose
+    {
+        typedef typename NppTransposeFunc<DEPTH>::npp_type npp_type;
 
-    if (elemSize == 1)
+        static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream)
     {
-        NppStreamHandler h(StreamAccessor::getStream(stream));
-
-        NppiSize sz;
-        sz.width = src.cols;
-        sz.height = src.rows;
+            NppiSize srcsz;
+            srcsz.height = src.rows;
+            srcsz.width = src.cols;
+
+#if CV_USE_NPP_STREAM_CTX
+            NppStreamContext nppStreamContext{};
+            nppSafeCall(nppGetStreamContext(&nppStreamContext));
+            nppStreamContext.hStream = stream;
+            nppSafeCall(func(src.ptr<npp_type>(), static_cast<int>(src.step), dst.ptr<npp_type>(), static_cast<int>(dst.step), srcsz, nppStreamContext));
+#else
+            cv::cuda::NppStreamHandler h(stream);
+            nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), dst.ptr<npp_type>(), static_cast<int>(dst.step), srcsz) );
+#endif
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+}
 
-        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
+void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
+{
+    GpuMat src = getInputMat(_src, stream);
+    CV_Assert(!src.empty());
+    const size_t elemSize = src.elemSize();
+    CV_Assert((elemSize == 1) || (elemSize == 2) || (elemSize == 3) || (elemSize == 4) || (elemSize == 6) || (elemSize == 8) || (elemSize == 12) || (elemSize == 16));
+    GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);
 
-        if (!stream)
-            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
-    }
-    else if (elemSize == 4)
+    if ((src.rows == 1) && (src.cols == 1))
+        src.copyTo(dst, stream);
+    else if (src.rows == 1)
+        src.reshape(0, src.cols).copyTo(dst, stream);
+    else if ((src.cols == 1) && src.isContinuous())
+        src.reshape(0, src.cols).copyTo(dst, stream);
+    else
     {
-        gridTranspose(globPtr<int>(src), globPtr<int>(dst), stream);
-    }
-    else // if (elemSize == 8)
-    {
-        gridTranspose(globPtr<double>(src), globPtr<double>(dst), stream);
+        typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream);
+        // if no direct mapping exists between DEPTH+CHANNELS and the nppiTranspose supported type, we use a nppiTranspose of a similar elemSize
+#if CV_USE_NPP_STREAM_CTX
+        static const func_t funcs[8][4] = {
+            {NppTranspose<CV_8U, nppiTranspose_8u_C1R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTranspose<CV_8U, nppiTranspose_8u_C3R_Ctx>::call, NppTranspose<CV_8U, nppiTranspose_8u_C4R_Ctx>::call},
+            {NppTranspose<CV_8U, nppiTranspose_8u_C1R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTranspose<CV_8U, nppiTranspose_8u_C3R_Ctx>::call, NppTranspose<CV_8U, nppiTranspose_8u_C4R_Ctx>::call},
+            {NppTranspose<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C3R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C4R_Ctx>::call},
+            {NppTranspose<CV_16S, nppiTranspose_16s_C1R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTranspose<CV_16S, nppiTranspose_16s_C3R_Ctx>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R_Ctx>::call},
+            {NppTranspose<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C3R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C4R_Ctx>::call},
+            {NppTranspose<CV_32F, nppiTranspose_32f_C1R_Ctx>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R_Ctx>::call, NppTranspose<CV_32F, nppiTranspose_32f_C3R_Ctx>::call, NppTranspose<CV_32F, nppiTranspose_32f_C4R_Ctx>::call},
+            {NppTranspose<CV_16S, nppiTranspose_16s_C4R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C4R_Ctx>::call, nullptr, nullptr},
+            {NppTranspose<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C3R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C4R_Ctx>::call}
+        };
+#else
+        static const func_t funcs[8][4] = {
+            {NppTranspose<CV_8U, nppiTranspose_8u_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C3R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C4R>::call},
+            {NppTranspose<CV_8U, nppiTranspose_8u_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C3R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C4R>::call},
+            {NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C3R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C4R>::call},
+            {NppTranspose<CV_16S, nppiTranspose_16s_C1R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C3R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call},
+            {NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C3R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C4R>::call},
+            {NppTranspose<CV_32F, nppiTranspose_32f_C1R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call, NppTranspose<CV_32F, nppiTranspose_32f_C3R>::call, NppTranspose<CV_32F, nppiTranspose_32f_C4R>::call},
+            {NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C4R>::call, nullptr, nullptr},
+            {NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C3R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C4R>::call}
+        };
+#endif
+        const func_t func = funcs[src.depth()][src.channels() - 1];
+        CV_Assert(func != nullptr);
+        func(src, dst, StreamAccessor::getStream(stream));
     }
-
-    syncOutput(dst, _dst, stream);
 }
 
 #endif
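For readers skimming the diff: the dispatch table above selects an NPP transpose routine by element size alone, so formats without a dedicated NPP entry point (for example CV_8SC3, whose 3-byte elements match the unsigned 8-bit 3-channel routine) are transposed by reinterpreting the bytes, which is safe because transpose only relocates whole elements. Below is a minimal usage sketch from the caller's side; it is not part of the commit, and the 640x480 size and CV_32FC3 type are arbitrary illustrative choices.

    #include <opencv2/core.hpp>
    #include <opencv2/cudaarithm.hpp>

    int main()
    {
        // Host matrix with 12-byte elements (CV_32FC3), a layout the old
        // implementation rejected (it only accepted elemSize 1, 4 or 8).
        cv::Mat host(480, 640, CV_32FC3, cv::Scalar::all(1));

        cv::cuda::GpuMat src, dst;
        cv::cuda::Stream stream;

        src.upload(host, stream);               // asynchronous copy on the user stream
        cv::cuda::transpose(src, dst, stream);  // NPP transpose issued on the same stream
        stream.waitForCompletion();             // dst is now 640x480, CV_32FC3

        return 0;
    }

When the default stream is used (no Stream argument, i.e. a null cudaStream_t), the new code still calls cudaDeviceSynchronize() before returning, matching the old behaviour.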

modules/cudaarithm/test/test_core.cpp

Lines changed: 22 additions & 2 deletions
@@ -187,6 +187,7 @@ PARAM_TEST_CASE(Transpose, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
     cv::Size size;
     int type;
     bool useRoi;
+    Stream stream;
 
     virtual void SetUp()
     {
@@ -218,7 +219,7 @@ CUDA_TEST_P(Transpose, Accuracy)
     else
     {
         cv::cuda::GpuMat dst = createMat(cv::Size(size.height, size.width), type, useRoi);
-        cv::cuda::transpose(loadMat(src, useRoi), dst);
+        cv::cuda::transpose(loadMat(src, useRoi), dst, stream);
 
         cv::Mat dst_gold;
         cv::transpose(src, dst_gold);
@@ -231,12 +232,31 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Transpose, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     testing::Values(MatType(CV_8UC1),
+                    MatType(CV_8UC2),
+                    MatType(CV_8UC3),
                     MatType(CV_8UC4),
+                    MatType(CV_8SC1),
+                    MatType(CV_8SC2),
+                    MatType(CV_8SC3),
+                    MatType(CV_8SC4),
+                    MatType(CV_16UC1),
                     MatType(CV_16UC2),
+                    MatType(CV_16UC3),
+                    MatType(CV_16UC4),
+                    MatType(CV_16SC1),
                     MatType(CV_16SC2),
+                    MatType(CV_16SC3),
+                    MatType(CV_16SC4),
                     MatType(CV_32SC1),
                     MatType(CV_32SC2),
-                    MatType(CV_64FC1)),
+                    MatType(CV_32SC3),
+                    MatType(CV_32SC4),
+                    MatType(CV_32FC1),
+                    MatType(CV_32FC2),
+                    MatType(CV_32FC3),
+                    MatType(CV_32FC4),
+                    MatType(CV_64FC1),
+                    MatType(CV_64FC2)),
     WHOLE_SUBMAT));
 
 ////////////////////////////////////////////////////////////////////////////////

modules/cudev/include/opencv2/cudev/common.hpp

Lines changed: 3 additions & 0 deletions
@@ -57,6 +57,9 @@ namespace cv { namespace cudev {
 
 using namespace cv::cuda;
 
+// CV_USE_NPP_STREAM_CTX
+#define CV_USE_NPP_STREAM_CTX (NPP_VERSION >= (10 * 1000 + 1 * 100 + 0))
+
 // CV_CUDEV_ARCH
 
 #ifndef __CUDA_ARCH__
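The new CV_USE_NPP_STREAM_CTX guard keys off NPP_VERSION, which the NPP headers conventionally encode as major * 1000 + minor * 100 + build, so (10 * 1000 + 1 * 100 + 0) is 10100, i.e. NPP 10.1.0, the release this commit targets for the *_Ctx stream-context entry points. A small illustrative restatement of the check (not part of the commit; the encoding assumption is spelled out in the comment):

    // Illustrative restatement of the guard above. NPP_VERSION comes from the
    // NPP headers shipped with the CUDA toolkit and is assumed to encode
    // major * 1000 + minor * 100 + build, so 10.1.0 becomes 10100.
    #if NPP_VERSION >= 10100   // same as (10 * 1000 + 1 * 100 + 0)
        // the *_Ctx NPP entry points taking an NppStreamContext are available
    #else
        // only the legacy entry points using the globally set NPP stream exist
    #endif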
