@@ -56,40 +56,89 @@ using namespace cv;
56
56
using namespace cv ::cuda;
57
57
using namespace cv ::cudev;
58
58
59
- void cv::cuda::transpose (InputArray _src, OutputArray _dst, Stream& stream)
59
namespace
{
    // Associates an OpenCV depth constant with the NPP element type reported by
    // NPPTypeTraits and with the function-pointer signature of an nppiTranspose
    // entry point working on that element type.
    template <int DEPTH> struct NppTransposeFunc
    {
        using npp_type = typename NPPTypeTraits<DEPTH>::npp_type;

    #if CV_USE_NPP_STREAM_CTX
        // Stream-context flavour: the NPP stream context is passed as the last argument.
        using func_t = NppStatus (*)(const npp_type* pSrc, int srcStep, npp_type* pDst, int dstStep, NppiSize srcSize, NppStreamContext stream);
    #else
        using func_t = NppStatus (*)(const npp_type* pSrc, int srcStep, npp_type* pDst, int dstStep, NppiSize srcSize);
    #endif
    };

    // Binds one concrete NPP transpose function (template argument `func`) to a
    // uniform GpuMat-based call signature so it can be stored in a dispatch table.
    template <int DEPTH, typename NppTransposeFunc<DEPTH>::func_t func> struct NppTranspose
    {
        using npp_type = typename NppTransposeFunc<DEPTH>::npp_type;

        // Runs `func` on `src`, writing into `dst`, on the given CUDA stream.
        // `dst` is used as-is; no allocation or size validation happens here.
        static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream)
        {
            // Size handed to NPP is the size of the source image.
            NppiSize roi;
            roi.width = src.cols;
            roi.height = src.rows;

            const npp_type* const srcData = src.ptr<npp_type>();
            npp_type* const dstData = dst.ptr<npp_type>();
            const int srcPitch = static_cast<int>(src.step);
            const int dstPitch = static_cast<int>(dst.step);

    #if CV_USE_NPP_STREAM_CTX
            // Start from the context NPP reports, then redirect it to the caller's stream.
            NppStreamContext ctx{};
            nppSafeCall(nppGetStreamContext(&ctx));
            ctx.hStream = stream;
            nppSafeCall(func(srcData, srcPitch, dstData, dstPitch, roi, ctx));
    #else
            cv::cuda::NppStreamHandler streamGuard(stream);
            nppSafeCall(func(srcData, srcPitch, dstData, dstPitch, roi));
    #endif

            // With the default (null) stream the call blocks until the device is idle.
            if (!stream)
                cudaSafeCall(cudaDeviceSynchronize());
        }
    };
}
76
96
77
- nppSafeCall ( nppiTranspose_8u_C1R (src.ptr <Npp8u>(), static_cast <int >(src.step ),
78
- dst.ptr <Npp8u>(), static_cast <int >(dst.step ), sz) );
97
// Transposes a matrix on the GPU.
//
// _src   : input matrix; must be non-empty and have an element size of
//          1, 2, 3, 4, 6, 8, 12 or 16 bytes.
// _dst   : output matrix, obtained with src.cols rows, src.rows columns and the
//          same type as the input.
// stream : stream used for the copies / NPP call.
//
// 1x1, single-row and continuous single-column inputs are handled with a plain
// copy (through reshape where needed). Every other input is dispatched to an
// nppiTranspose variant selected by depth and channel count; inputs with more
// than 4 channels are rejected on that path because the table has no entry for
// them.
void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
{
    GpuMat src = getInputMat(_src, stream);
    CV_Assert(!src.empty());
    const size_t elemSize = src.elemSize();
    CV_Assert((elemSize == 1) || (elemSize == 2) || (elemSize == 3) || (elemSize == 4) || (elemSize == 6) || (elemSize == 8) || (elemSize == 12) || (elemSize == 16));
    GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);

    if ((src.rows == 1) && (src.cols == 1))
    {
        // A single element is its own transpose.
        src.copyTo(dst, stream);
    }
    else if (src.rows == 1)
    {
        // Row vector -> column vector: re-view the data as src.cols rows and copy.
        src.reshape(0, src.cols).copyTo(dst, stream);
    }
    else if ((src.cols == 1) && src.isContinuous())
    {
        // Column vector -> row vector: here src.cols == 1, so this re-views the data as one row.
        src.reshape(0, src.cols).copyTo(dst, stream);
    }
    else
    {
        typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream);

        // Table layout: one row per depth value, one column per channel count (1..4).
        enum { DEPTH_COUNT = 8, MAX_CHANNELS = 4 };

        // if no direct mapping exists between DEPTH+CHANNELS and the nppiTranspose supported type, we use a nppiTranspose of a similar elemSize
    #if CV_USE_NPP_STREAM_CTX
        static const func_t funcs[DEPTH_COUNT][MAX_CHANNELS] = {
            {NppTranspose<CV_8U, nppiTranspose_8u_C1R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTranspose<CV_8U, nppiTranspose_8u_C3R_Ctx>::call, NppTranspose<CV_8U, nppiTranspose_8u_C4R_Ctx>::call},
            {NppTranspose<CV_8U, nppiTranspose_8u_C1R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTranspose<CV_8U, nppiTranspose_8u_C3R_Ctx>::call, NppTranspose<CV_8U, nppiTranspose_8u_C4R_Ctx>::call},
            {NppTranspose<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C3R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C4R_Ctx>::call},
            {NppTranspose<CV_16S, nppiTranspose_16s_C1R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTranspose<CV_16S, nppiTranspose_16s_C3R_Ctx>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R_Ctx>::call},
            {NppTranspose<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C3R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C4R_Ctx>::call},
            {NppTranspose<CV_32F, nppiTranspose_32f_C1R_Ctx>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R_Ctx>::call, NppTranspose<CV_32F, nppiTranspose_32f_C3R_Ctx>::call, NppTranspose<CV_32F, nppiTranspose_32f_C4R_Ctx>::call},
            {NppTranspose<CV_16S, nppiTranspose_16s_C4R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C4R_Ctx>::call, nullptr, nullptr},
            {NppTranspose<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C3R_Ctx>::call, NppTranspose<CV_16U, nppiTranspose_16u_C4R_Ctx>::call}
        };
    #else
        static const func_t funcs[DEPTH_COUNT][MAX_CHANNELS] = {
            {NppTranspose<CV_8U, nppiTranspose_8u_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C3R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C4R>::call},
            {NppTranspose<CV_8U, nppiTranspose_8u_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C3R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C4R>::call},
            {NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C3R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C4R>::call},
            {NppTranspose<CV_16S, nppiTranspose_16s_C1R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C3R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call},
            {NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C3R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C4R>::call},
            {NppTranspose<CV_32F, nppiTranspose_32f_C1R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call, NppTranspose<CV_32F, nppiTranspose_32f_C3R>::call, NppTranspose<CV_32F, nppiTranspose_32f_C4R>::call},
            {NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C4R>::call, nullptr, nullptr},
            {NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C3R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C4R>::call}
        };
    #endif

        // The elemSize assertion above also admits types with more than 4 channels
        // (e.g. 6 one-byte channels), which would index past the end of a table row.
        // Reject them explicitly instead of reading out of bounds.
        const int depth = src.depth();
        const int cn = src.channels();
        CV_Assert((depth >= 0) && (depth < DEPTH_COUNT));
        CV_Assert((cn >= 1) && (cn <= MAX_CHANNELS));

        const func_t func = funcs[depth][cn - 1];
        CV_Assert(func != nullptr);
        func(src, dst, StreamAccessor::getStream(stream));
    }

    // NOTE(review): the previous implementation finished with this call after
    // obtaining dst through getOutputMat; presumably it is what propagates the
    // result to _dst when _dst is not backed directly by dst - confirm against
    // the helper's definition.
    syncOutput(dst, _dst, stream);
}
94
143
95
144
#endif
0 commit comments