diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu
index bfe50bd34fb..4a42df382e6 100644
--- a/modules/cudaarithm/src/cuda/transpose.cu
+++ b/modules/cudaarithm/src/cuda/transpose.cu
@@ -56,40 +56,136 @@ using namespace cv;
 using namespace cv::cuda;
 using namespace cv::cudev;
 
-void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
-{
-    GpuMat src = getInputMat(_src, stream);
+#define USE_NPP_STREAM_CONTEXT (NPP_VERSION >= (10 * 1000 + 1 * 100 + 0))
 
-    const size_t elemSize = src.elemSize();
+namespace
+{
+  template <int DEPTH> struct NppTransposeFunc
+  {
+    typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
 
-    CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 );
+    typedef NppStatus (*func_t)(const npp_type* pSrc, int srcStep, npp_type* pDst, int dstStep, NppiSize srcSize);
+    #if USE_NPP_STREAM_CONTEXT
+    typedef NppStatus (*func_ctx_t)(const npp_type* pSrc, int srcStep, npp_type* pDst, int dstStep, NppiSize srcSize, NppStreamContext stream);
+    #endif
+  };
 
-    GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);
+  template <int DEPTH, typename NppTransposeFunc<DEPTH>::func_t func> struct NppTranspose
+  {
+    typedef typename NppTransposeFunc<DEPTH>::npp_type npp_type;
 
-    if (elemSize == 1)
+    static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream)
     {
-        NppStreamHandler h(StreamAccessor::getStream(stream));
+      NppiSize srcsz;
+      srcsz.height = src.rows;
+      srcsz.width = src.cols;
 
-        NppiSize sz;
-        sz.width  = src.cols;
-        sz.height = src.rows;
+      cv::cuda::NppStreamHandler h(stream);
 
-        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
+      nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), dst.ptr<npp_type>(), static_cast<int>(dst.step), srcsz) );
 
-        if (!stream)
-            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
+      if (stream == 0)
+        cudaSafeCall( cudaDeviceSynchronize() );
     }
-    else if (elemSize == 4)
-    {
-        gridTranspose(globPtr<int>(src), globPtr<int>(dst), stream);
-    }
-    else // if (elemSize == 8)
+  };
+
+  #if USE_NPP_STREAM_CONTEXT
+  template <int DEPTH, typename NppTransposeFunc<DEPTH>::func_ctx_t func> struct NppTransposeCtx
+  {
+    typedef typename NppTransposeFunc<DEPTH>::npp_type npp_type;
+
+    static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream)
     {
-        gridTranspose(globPtr<double>(src), globPtr<double>(dst), stream);
+      NppiSize srcsz;
+      srcsz.height = src.rows;
+      srcsz.width = src.cols;
+
+      NppStreamContext ctx;
+      nppSafeCall( nppGetStreamContext(&ctx) );
+      ctx.hStream = stream;
+
+      nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), dst.ptr<npp_type>(), static_cast<int>(dst.step), srcsz, ctx) );
     }
+  };
+  #endif
+}
+
+void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
+{
+    GpuMat src = getInputMat(_src, stream);
 
-    syncOutput(dst, _dst, stream);
+    const int srcType = src.type();
+    const size_t elemSize = src.elemSize();
+
+    GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);
+
+    const bool isSupported =
+      (elemSize == 1) || (elemSize == 2) || (elemSize == 3) || (elemSize == 4) ||
+      (elemSize == 6) || (elemSize == 8) || (elemSize == 12) || (elemSize == 16);
+
+    if (!isSupported)
+        CV_Error(Error::StsUnsupportedFormat, "");
+    else if (src.empty())
+        CV_Error(Error::StsBadArg, "image is empty");
+
+    if ((src.rows == 1) && (src.cols == 1))
+        src.copyTo(dst, stream);
+    else if (src.rows == 1)
+        src.reshape(0, src.cols).copyTo(dst, stream);
+    else if ((src.cols == 1) && src.isContinuous())
+        src.reshape(0, src.cols).copyTo(dst, stream);
+    else
+    {
+        #if USE_NPP_STREAM_CONTEXT
+        constexpr const bool useNppStreamCtx = true;
+        #else
+        constexpr const bool useNppStreamCtx = false;
+        #endif
+        cudaStream_t _stream = StreamAccessor::getStream(stream);
+
+        if (!_stream || !useNppStreamCtx)
+        {
+          typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream);
+          //if no direct mapping exists between DEPTH+CHANNELS and the nppiTranspose supported type, we use a nppiTranspose of a similar elemSize
+          static const func_t funcs[8][4] = {
+            {NppTranspose<CV_8U,  nppiTranspose_8u_C1R>::call,  NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C3R>::call,   NppTranspose<CV_8U, nppiTranspose_8u_C4R>::call},
+            {NppTranspose<CV_8U,  nppiTranspose_8u_C1R>::call,  NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_8U, nppiTranspose_8u_C3R>::call,   NppTranspose<CV_8U, nppiTranspose_8u_C4R>::call},
+            {NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C3R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C4R>::call},
+            {NppTranspose<CV_16S, nppiTranspose_16s_C1R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C3R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call},
+            {NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C3R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C4R>::call},
+            {NppTranspose<CV_32F, nppiTranspose_32f_C1R>::call, NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call, NppTranspose<CV_32F, nppiTranspose_32f_C3R>::call, NppTranspose<CV_32F, nppiTranspose_32f_C4R>::call},
+            {NppTranspose<CV_16S, nppiTranspose_16s_C4R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C4R>::call, nullptr, nullptr},
+            {NppTranspose<CV_16U, nppiTranspose_16u_C1R>::call, NppTranspose<CV_32S, nppiTranspose_32s_C1R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C3R>::call, NppTranspose<CV_16U, nppiTranspose_16u_C4R>::call}
+          };
+          
+          const func_t func = funcs[src.depth()][src.channels() - 1];
+          CV_Assert(func != nullptr);
+
+          func(src, dst, _stream);
+        }//end if (!_stream || !useNppStreamCtx)
+        else//if ((_stream != 0) && useNppStreamCtx)
+        {
+          #if USE_NPP_STREAM_CONTEXT
+          typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, cudaStream_t stream);
+          //if no direct mapping exists between DEPTH+CHANNELS and the nppiTranspose supported type, we use a nppiTranspose of a similar elemSize
+          static const func_t funcs[8][4] = {
+            {NppTransposeCtx<CV_8U,  nppiTranspose_8u_C1R_Ctx>::call,  NppTransposeCtx<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTransposeCtx<CV_8U,  nppiTranspose_8u_C3R_Ctx>::call,  NppTransposeCtx<CV_8U, nppiTranspose_8u_C4R_Ctx>::call},
+            {NppTransposeCtx<CV_8U,  nppiTranspose_8u_C1R_Ctx>::call,  NppTransposeCtx<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTransposeCtx<CV_8U,  nppiTranspose_8u_C3R_Ctx>::call,  NppTransposeCtx<CV_8U, nppiTranspose_8u_C4R_Ctx>::call},
+            {NppTransposeCtx<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTransposeCtx<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTransposeCtx<CV_16U, nppiTranspose_16u_C3R_Ctx>::call, NppTransposeCtx<CV_16U, nppiTranspose_16u_C4R_Ctx>::call},
+            {NppTransposeCtx<CV_16S, nppiTranspose_16s_C1R_Ctx>::call, NppTransposeCtx<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTransposeCtx<CV_16S, nppiTranspose_16s_C3R_Ctx>::call, NppTransposeCtx<CV_16S, nppiTranspose_16s_C4R_Ctx>::call},
+            {NppTransposeCtx<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTransposeCtx<CV_16S, nppiTranspose_16s_C4R_Ctx>::call, NppTransposeCtx<CV_32S, nppiTranspose_32s_C3R_Ctx>::call, NppTransposeCtx<CV_32S, nppiTranspose_32s_C4R_Ctx>::call},
+            {NppTransposeCtx<CV_32F, nppiTranspose_32f_C1R_Ctx>::call, NppTransposeCtx<CV_16S, nppiTranspose_16s_C4R_Ctx>::call, NppTransposeCtx<CV_32F, nppiTranspose_32f_C3R_Ctx>::call, NppTransposeCtx<CV_32F, nppiTranspose_32f_C4R_Ctx>::call},
+            {NppTransposeCtx<CV_16S, nppiTranspose_16s_C4R_Ctx>::call, NppTransposeCtx<CV_32S, nppiTranspose_32s_C4R_Ctx>::call, nullptr, nullptr},
+            {NppTransposeCtx<CV_16U, nppiTranspose_16u_C1R_Ctx>::call, NppTransposeCtx<CV_32S, nppiTranspose_32s_C1R_Ctx>::call, NppTransposeCtx<CV_16U, nppiTranspose_16u_C3R_Ctx>::call, NppTransposeCtx<CV_16U, nppiTranspose_16u_C4R_Ctx>::call}
+          };
+
+          const func_t func = funcs[src.depth()][src.channels() - 1];
+          CV_Assert(func != nullptr);
+
+          func(src, dst, _stream);
+          #endif
+        }//end if ((_stream != 0) && useNppStreamCtx)
+    }//end if
 }
 
 #endif
diff --git a/modules/cudaarithm/test/test_core.cpp b/modules/cudaarithm/test/test_core.cpp
index bc8f3737e53..e5c30f703ab 100644
--- a/modules/cudaarithm/test/test_core.cpp
+++ b/modules/cudaarithm/test/test_core.cpp
@@ -231,12 +231,31 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Transpose, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     testing::Values(MatType(CV_8UC1),
+                    MatType(CV_8UC2),
+                    MatType(CV_8UC3),
                     MatType(CV_8UC4),
+                    MatType(CV_8SC1),
+                    MatType(CV_8SC2),
+                    MatType(CV_8SC3),
+                    MatType(CV_8SC4),
+                    MatType(CV_16UC1),
                     MatType(CV_16UC2),
+                    MatType(CV_16UC3),
+                    MatType(CV_16UC4),
+                    MatType(CV_16SC1),
                     MatType(CV_16SC2),
+                    MatType(CV_16SC3),
+                    MatType(CV_16SC4),
                     MatType(CV_32SC1),
                     MatType(CV_32SC2),
-                    MatType(CV_64FC1)),
+                    MatType(CV_32SC3),
+                    MatType(CV_32SC4),
+                    MatType(CV_32FC1),
+                    MatType(CV_32FC2),
+                    MatType(CV_32FC3),
+                    MatType(CV_32FC4),
+                    MatType(CV_64FC1),
+                    MatType(CV_64FC2)),
     WHOLE_SUBMAT));
 
 ////////////////////////////////////////////////////////////////////////////////