From a4abeb3b1bdda5d98aaf9b34855fd132aec4a294 Mon Sep 17 00:00:00 2001
From: MaximSmolskiy <mithridatus@mail.ru>
Date: Thu, 16 Oct 2025 03:42:19 +0300
Subject: [PATCH 1/5] Fix findEllipses

---
 modules/ximgproc/src/find_ellipses.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/modules/ximgproc/src/find_ellipses.cpp b/modules/ximgproc/src/find_ellipses.cpp
index b5ab61e7930..3164aa26f43 100644
--- a/modules/ximgproc/src/find_ellipses.cpp
+++ b/modules/ximgproc/src/find_ellipses.cpp
@@ -1301,12 +1301,12 @@ void EllipseDetectorImpl::preProcessing(Mat1b &src, Mat1b &dp, Mat1b &dn) {
     // 2 - the pixel does belong to an edge
     for (int i = 0; i <= imgSize.height; i++) {
         int *tmpMag = magBuffer[(i > 0) + 1] + 1;
-        const short *tmpDx = dx.ptr<short>(i);
-        const short *tmpDy = dy.ptr<short>(i);
         uchar *tmpMap;
         int prevFlag = 0;
 
         if (i < imgSize.height) {
+            const short *tmpDx = dx.ptr<short>(i);
+            const short *tmpDy = dy.ptr<short>(i);
             tmpMag[-1] = tmpMag[imgSize.width] = 0;
             for (int j = 0; j < imgSize.width; j++)
                 tmpMag[j] = abs(tmpDx[j]) + abs(tmpDy[j]);
@@ -1322,8 +1322,8 @@ void EllipseDetectorImpl::preProcessing(Mat1b &src, Mat1b &dp, Mat1b &dn) {
         tmpMap[-1] = tmpMap[imgSize.width] = 1;
 
         tmpMag = magBuffer[1] + 1; // take the central row
-        tmpDx = (short *) (dx[i - 1]);
-        tmpDy = (short *) (dy[i - 1]);
+        const short *tmpDx = dx.ptr<short>(i - 1);
+        const short *tmpDy = dy.ptr<short>(i - 1);
 
         ptrdiff_t magStep1, magStep2;
         magStep1 = magBuffer[2] - magBuffer[1];

From 48e4b384ae2eab4d675762dc49993efc336c5d41 Mon Sep 17 00:00:00 2001
From: cudawarped <12133430+cudawarped@users.noreply.github.com>
Date: Fri, 17 Oct 2025 16:28:42 +0300
Subject: [PATCH 2/5] [cudadcodec] remove dependance on global constant memory
 for color conversion coefficients

---
 modules/cudacodec/src/cuda/ColorSpace.cu      | 447 ++++++++----------
 modules/cudacodec/src/cuda/ColorSpace.h       |   4 +
 ...idia_surface_format_to_color_converter.cpp | 223 +++++----
 3 files changed, 337 insertions(+), 337 deletions(-)

diff --git a/modules/cudacodec/src/cuda/ColorSpace.cu b/modules/cudacodec/src/cuda/ColorSpace.cu
index 137805af392..3aa0490a95e 100644
--- a/modules/cudacodec/src/cuda/ColorSpace.cu
+++ b/modules/cudacodec/src/cuda/ColorSpace.cu
@@ -8,79 +8,18 @@
 
 namespace cv { namespace cuda { namespace device {
 
-__constant__ float matYuv2Color[3][3];
-
-void inline GetConstants(int iMatrix, float& wr, float& wb, int& black, int& white, int& uvWhite, int& max, bool fullRange = false) {
-    if (fullRange) {
-        black = 0; white = 255; uvWhite = 255;
-    }
-    else {
-        black = 16; white = 235; uvWhite = 240;
-    }
-    max = 255;
-
-    switch (static_cast<cv::cudacodec::ColorSpaceStandard>(iMatrix))
-    {
-    case cv::cudacodec::ColorSpaceStandard::BT709:
-    default:
-        wr = 0.2126f; wb = 0.0722f;
-        break;
-
-    case cv::cudacodec::ColorSpaceStandard::FCC:
-        wr = 0.30f; wb = 0.11f;
-        break;
-
-    case cv::cudacodec::ColorSpaceStandard::BT470:
-    case cv::cudacodec::ColorSpaceStandard::BT601:
-        wr = 0.2990f; wb = 0.1140f;
-        break;
-
-    case cv::cudacodec::ColorSpaceStandard::SMPTE240M:
-        wr = 0.212f; wb = 0.087f;
-        break;
-
-    case cv::cudacodec::ColorSpaceStandard::BT2020:
-    case cv::cudacodec::ColorSpaceStandard::BT2020C:
-        wr = 0.2627f; wb = 0.0593f;
-        // 10-bit only
-        black = 64 << 6; white = 940 << 6;
-        max = (1 << 16) - 1;
-        break;
-    }
-}
-
-void SetMatYuv2Rgb(int iMatrix, bool fullRange = false) {
-    float wr, wb;
-    int black, white, max, uvWhite;
-    GetConstants(iMatrix, wr, wb, black, white, uvWhite, max, fullRange);
-    float mat[3][3] = {
-        1.0f, 0.0f, (1.0f - wr) / 0.5f,
-        1.0f, -wb * (1.0f - wb) / 0.5f / (1 - wb - wr), -wr * (1 - wr) / 0.5f / (1 - wb - wr),
-        1.0f, (1.0f - wb) / 0.5f, 0.0f,
-    };
-    for (int i = 0; i < 3; i++) {
-        for (int j = 0; j < 3; j++) {
-            if (j == 0)
-                mat[i][j] = (float)(1.0 * max / (white - black) * mat[i][j]);
-            else
-                mat[i][j] = (float)(1.0 * max / (uvWhite - black) * mat[i][j]);
-        }
-    }
-    cudaMemcpyToSymbol(matYuv2Color, mat, sizeof(mat));
-}
-
 template<class T>
 __device__ static T Clamp(T x, T lower, T upper) {
     return x < lower ? lower : (x > upper ? upper : x);
 }
 
 template<class Gray, class YuvUnit>
-__device__ inline Gray YToGrayForPixel(YuvUnit y, bool videoFullRangeFlag) {
+__device__ inline Gray YToGrayForPixel(YuvUnit y, float lumaCoeff, bool videoFullRangeFlag) {
     const int low = videoFullRangeFlag ? 0 : 1 << (sizeof(YuvUnit) * 8 - 4);
     float fy = (int)y - low;
     const float maxf = (1 << sizeof(YuvUnit) * 8) - 1.0f;
 
-    YuvUnit g = (YuvUnit)Clamp(matYuv2Color[0][0] * fy, 0.0f, maxf);
+    YuvUnit g = (YuvUnit)Clamp(lumaCoeff * fy, 0.0f, maxf);
     const int nShift = abs((int)sizeof(YuvUnit) - (int)sizeof(Gray)) * 8;
     Gray gray{};
     if (sizeof(YuvUnit) >= sizeof(Gray))
@@ -91,16 +30,16 @@ __device__ inline Gray YToGrayForPixel(YuvUnit y, bool videoFullRangeFlag) {
 }
 
 template<class Color, class YuvUnit>
-__device__ inline Color YuvToColorForPixel(YuvUnit y, YuvUnit u, YuvUnit v, bool videoFullRangeFlag) {
+__device__ inline Color YuvToColorForPixel(YuvUnit y, YuvUnit u, YuvUnit v, ColorMatrix matYuv2Color, bool videoFullRangeFlag) {
     const int
         low = videoFullRangeFlag ? 0 : 1 << (sizeof(YuvUnit) * 8 - 4),
         mid = 1 << (sizeof(YuvUnit) * 8 - 1);
     float fy = (int)y - low, fu = (int)u - mid, fv = (int)v - mid;
     const float maxf = (1 << sizeof(YuvUnit) * 8) - 1.0f;
     YuvUnit
-        r = (YuvUnit)Clamp(matYuv2Color[0][0] * fy + matYuv2Color[0][1] * fu + matYuv2Color[0][2] * fv, 0.0f, maxf),
-        g = (YuvUnit)Clamp(matYuv2Color[1][0] * fy + matYuv2Color[1][1] * fu + matYuv2Color[1][2] * fv, 0.0f, maxf),
-        b = (YuvUnit)Clamp(matYuv2Color[2][0] * fy + matYuv2Color[2][1] * fu + matYuv2Color[2][2] * fv, 0.0f, maxf);
+        r = (YuvUnit)Clamp(matYuv2Color.m[0][0] * fy + matYuv2Color.m[0][1] * fu + matYuv2Color.m[0][2] * fv, 0.0f, maxf),
+        g = (YuvUnit)Clamp(matYuv2Color.m[1][0] * fy + matYuv2Color.m[1][1] * fu + matYuv2Color.m[1][2] * fv, 0.0f, maxf),
+        b = (YuvUnit)Clamp(matYuv2Color.m[2][0] * fy + matYuv2Color.m[2][1] * fu + matYuv2Color.m[2][2] * fv, 0.0f, maxf);
 
     Color color{};
     const int nShift = abs((int)sizeof(YuvUnit) - (int)sizeof(color.c.r)) * 8;
@@ -118,15 +57,15 @@ __device__ inline Color YuvToColorForPixel(YuvUnit y, YuvUnit u, YuvUnit v, bool
 }
 
 template<class Color, class YuvUnit>
-__device__ inline Color YuvToColoraForPixel(YuvUnit y, YuvUnit u, YuvUnit v, bool videoFullRangeFlag) {
-    Color color = YuvToColorForPixel<Color>(y, u, v, videoFullRangeFlag);
+__device__ inline Color YuvToColoraForPixel(YuvUnit y, YuvUnit u, YuvUnit v, ColorMatrix matYuv2Color, bool videoFullRangeFlag) {
+    Color color = YuvToColorForPixel<Color>(y, u, v, matYuv2Color, videoFullRangeFlag);
     const float maxf = (1 << sizeof(color.c.r) * 8) - 1.0f;
     color.c.a = maxf;
     return color;
 }
 
 template<class Yuvx2, class Gray, class Grayx2>
-__global__ static void YToGrayKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag) {
+__global__ static void YToGrayKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pGray, int nGrayPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag) {
     int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
     int y = (threadIdx.y + blockIdx.y * blockDim.y);
     if (x + 1 >= nWidth || y >= nHeight) {
@@ -138,13 +77,13 @@ __global__ static void YToGrayKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pGra
 
     Yuvx2 l0 = *(Yuvx2*)pSrc;
     *(Grayx2*)pDst = Grayx2{
-        YToGrayForPixel<Gray>(l0.x, videoFullRangeFlag),
-        YToGrayForPixel<Gray>(l0.y, videoFullRangeFlag),
+        YToGrayForPixel<Gray>(l0.x, matYuv2Color.m[0][0], videoFullRangeFlag),
+        YToGrayForPixel<Gray>(l0.y, matYuv2Color.m[0][0], videoFullRangeFlag),
     };
 }
 
 template<class Yuvx2, class Color, class Colorx2>
-__global__ static void YuvToColorKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag) {
+__global__ static void YuvToColorKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag) {
     int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
     int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
     if (x + 1 >= nWidth || y + 1 >= nHeight) {
@@ -160,20 +99,20 @@ __global__ static void YuvToColorKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* p
 
     union ColorOutx2 {
         Colorx2 d;
-        Color Color[2];
+        Color color[2];
     };
     ColorOutx2 l1Out;
-    l1Out.Color[0] = YuvToColorForPixel<Color>(l0.x, ch.x, ch.y, videoFullRangeFlag);
-    l1Out.Color[1] = YuvToColorForPixel<Color>(l0.y, ch.x, ch.y, videoFullRangeFlag);
+    l1Out.color[0] = YuvToColorForPixel<Color>(l0.x, ch.x, ch.y, matYuv2Color, videoFullRangeFlag);
+    l1Out.color[1] = YuvToColorForPixel<Color>(l0.y, ch.x, ch.y, matYuv2Color, videoFullRangeFlag);
     *(Colorx2*)pDst = l1Out.d;
     ColorOutx2 l2Out;
-    l2Out.Color[0] = YuvToColorForPixel<Color>(l1.x, ch.x, ch.y, videoFullRangeFlag);
-    l2Out.Color[1] = YuvToColorForPixel<Color>(l1.y, ch.x, ch.y, videoFullRangeFlag);
+    l2Out.color[0] = YuvToColorForPixel<Color>(l1.x, ch.x, ch.y, matYuv2Color, videoFullRangeFlag);
+    l2Out.color[1] = YuvToColorForPixel<Color>(l1.y, ch.x, ch.y, matYuv2Color, videoFullRangeFlag);
     *(Colorx2*)(pDst + nColorPitch) = l2Out.d;
 }
 
 template<class YuvUnitx2, class Color, class ColorIntx2>
-__global__ static void YuvToColoraKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag) {
+__global__ static void YuvToColoraKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag) {
     int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
     int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
     if (x + 1 >= nWidth || y + 1 >= nHeight) {
@@ -188,17 +127,17 @@ __global__ static void YuvToColoraKernel(uint8_t* pYuv, int nYuvPitch, uint8_t*
     YuvUnitx2 ch = *(YuvUnitx2*)(pSrc + (nHeight - y / 2) * nYuvPitch);
 
     *(ColorIntx2*)pDst = ColorIntx2{
-        YuvToColoraForPixel<Color>(l0.x, ch.x, ch.y, videoFullRangeFlag).d,
-        YuvToColoraForPixel<Color>(l0.y, ch.x, ch.y, videoFullRangeFlag).d,
+        YuvToColoraForPixel<Color>(l0.x, ch.x, ch.y, matYuv2Color, videoFullRangeFlag).d,
+        YuvToColoraForPixel<Color>(l0.y, ch.x, ch.y, matYuv2Color, videoFullRangeFlag).d,
     };
     *(ColorIntx2*)(pDst + nColorPitch) = ColorIntx2{
-        YuvToColoraForPixel<Color>(l1.x, ch.x, ch.y, videoFullRangeFlag).d,
-        YuvToColoraForPixel<Color>(l1.y, ch.x, ch.y, videoFullRangeFlag).d,
+        YuvToColoraForPixel<Color>(l1.x, ch.x, ch.y, matYuv2Color, videoFullRangeFlag).d,
+        YuvToColoraForPixel<Color>(l1.y, ch.x, ch.y, matYuv2Color, videoFullRangeFlag).d,
     };
 }
 
 template<class YuvUnitx2, class Color, class Colorx2>
-__global__ static void Yuv444ToColorKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag) {
+__global__ static void Yuv444ToColorKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag) {
     int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
     int y = (threadIdx.y + blockIdx.y * blockDim.y);
     if (x + 1 >= nWidth || y >= nHeight) {
@@ -214,16 +153,16 @@ __global__ static void Yuv444ToColorKernel(uint8_t* pYuv, int nYuvPitch, uint8_t
 
     union ColorOutx2 {
         Colorx2 d;
-        Color Color[2];
+        Color color[2];
     };
     ColorOutx2 out;
-    out.Color[0] = YuvToColorForPixel<Color>(l0.x, ch1.x, ch2.x, videoFullRangeFlag);
-    out.Color[1] = YuvToColorForPixel<Color>(l0.y, ch1.y, ch2.y, videoFullRangeFlag);
+    out.color[0] = YuvToColorForPixel<Color>(l0.x, ch1.x, ch2.x, matYuv2Color, videoFullRangeFlag);
+    out.color[1] = YuvToColorForPixel<Color>(l0.y, ch1.y, ch2.y, matYuv2Color, videoFullRangeFlag);
     *(Colorx2*)pDst = out.d;
 }
 
 template<class YuvUnitx2, class Color, class ColorIntx2>
-__global__ static void Yuv444ToColoraKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag) {
+__global__ static void Yuv444ToColoraKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag) {
     int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
     int y = (threadIdx.y + blockIdx.y * blockDim.y);
     if (x + 1 >= nWidth || y >= nHeight) {
@@ -238,13 +177,13 @@ __global__ static void Yuv444ToColoraKernel(uint8_t* pYuv, int nYuvPitch, uint8_
     YuvUnitx2 ch2 = *(YuvUnitx2*)(pSrc + (2 * nHeight * nYuvPitch));
 
     *(ColorIntx2*)pDst = ColorIntx2{
-        YuvToColoraForPixel<Color>(l0.x, ch1.x, ch2.x, videoFullRangeFlag).d,
-        YuvToColoraForPixel<Color>(l0.y, ch1.y, ch2.y, videoFullRangeFlag).d,
+        YuvToColoraForPixel<Color>(l0.x, ch1.x, ch2.x, matYuv2Color, videoFullRangeFlag).d,
+        YuvToColoraForPixel<Color>(l0.y, ch1.y, ch2.y, matYuv2Color, videoFullRangeFlag).d,
     };
 }
 
 template<class YuvUnitx2, class Color, class ColorUnitx2>
-__global__ static void YuvToColorPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColorp, int nColorpPitch, int nWidth, int nHeight, bool videoFullRangeFlag) {
+__global__ static void YuvToColorPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColorp, int nColorpPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag) {
     int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
     int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
     if (x + 1 >= nWidth || y + 1 >= nHeight) {
@@ -257,10 +196,10 @@ __global__ static void YuvToColorPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint
     YuvUnitx2 l1 = *(YuvUnitx2*)(pSrc + nYuvPitch);
     YuvUnitx2 ch = *(YuvUnitx2*)(pSrc + (nHeight - y / 2) * nYuvPitch);
 
-    Color color0 = YuvToColorForPixel<Color>(l0.x, ch.x, ch.y, videoFullRangeFlag),
-        color1 = YuvToColorForPixel<Color>(l0.y, ch.x, ch.y, videoFullRangeFlag),
-        color2 = YuvToColorForPixel<Color>(l1.x, ch.x, ch.y, videoFullRangeFlag),
-        color3 = YuvToColorForPixel<Color>(l1.y, ch.x, ch.y, videoFullRangeFlag);
+    Color color0 = YuvToColorForPixel<Color>(l0.x, ch.x, ch.y, matYuv2Color, videoFullRangeFlag),
+        color1 = YuvToColorForPixel<Color>(l0.y, ch.x, ch.y, matYuv2Color, videoFullRangeFlag),
+        color2 = YuvToColorForPixel<Color>(l1.x, ch.x, ch.y, matYuv2Color, videoFullRangeFlag),
+        color3 = YuvToColorForPixel<Color>(l1.y, ch.x, ch.y, matYuv2Color, videoFullRangeFlag);
 
     uint8_t* pDst = pColorp + x * sizeof(ColorUnitx2) / 2 + y * nColorpPitch;
     *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.x, color1.v.x };
@@ -274,7 +213,7 @@ __global__ static void YuvToColorPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint
 }
 
 template<class YuvUnitx2, class Color, class ColorUnitx2>
-__global__ static void YuvToColoraPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColorp, int nColorpPitch, int nWidth, int nHeight, bool videoFullRangeFlag) {
+__global__ static void YuvToColoraPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColorp, int nColorpPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag) {
     int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
     int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2;
     if (x + 1 >= nWidth || y + 1 >= nHeight) {
@@ -287,10 +226,10 @@ __global__ static void YuvToColoraPlanarKernel(uint8_t* pYuv, int nYuvPitch, uin
     YuvUnitx2 l1 = *(YuvUnitx2*)(pSrc + nYuvPitch);
     YuvUnitx2 ch = *(YuvUnitx2*)(pSrc + (nHeight - y / 2) * nYuvPitch);
 
-    Color color0 = YuvToColoraForPixel<Color>(l0.x, ch.x, ch.y, videoFullRangeFlag),
-        color1 = YuvToColoraForPixel<Color>(l0.y, ch.x, ch.y, videoFullRangeFlag),
-        color2 = YuvToColoraForPixel<Color>(l1.x, ch.x, ch.y, videoFullRangeFlag),
-        color3 = YuvToColoraForPixel<Color>(l1.y, ch.x, ch.y, videoFullRangeFlag);
+    Color color0 = YuvToColoraForPixel<Color>(l0.x, ch.x, ch.y, matYuv2Color, videoFullRangeFlag),
+        color1 = YuvToColoraForPixel<Color>(l0.y, ch.x, ch.y, matYuv2Color, videoFullRangeFlag),
+        color2 = YuvToColoraForPixel<Color>(l1.x, ch.x, ch.y, matYuv2Color, videoFullRangeFlag),
+        color3 = YuvToColoraForPixel<Color>(l1.y, ch.x, ch.y, matYuv2Color, videoFullRangeFlag);
 
     uint8_t* pDst = pColorp + x * sizeof(ColorUnitx2) / 2 + y * nColorpPitch;
     *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.x, color1.v.x };
@@ -307,7 +246,7 @@ __global__ static void YuvToColoraPlanarKernel(uint8_t* pYuv, int nYuvPitch, uin
 }
 
 template<class YuvUnitx2, class Color, class ColorUnitx2>
-__global__ static void Yuv444ToColorPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColorp, int nColorpPitch, int nWidth, int nHeight, bool videoFullRangeFlag) {
+__global__ static void Yuv444ToColorPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColorp, int nColorpPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag) {
     int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
     int y = (threadIdx.y + blockIdx.y * blockDim.y);
     if (x + 1 >= nWidth || y >= nHeight) {
@@ -320,9 +259,8 @@ __global__ static void Yuv444ToColorPlanarKernel(uint8_t* pYuv, int nYuvPitch, u
     YuvUnitx2 ch1 = *(YuvUnitx2*)(pSrc + (nHeight * nYuvPitch));
     YuvUnitx2 ch2 = *(YuvUnitx2*)(pSrc + (2 * nHeight * nYuvPitch));
 
-    Color color0 = YuvToColorForPixel<Color>(l0.x, ch1.x, ch2.x, videoFullRangeFlag),
-        color1 = YuvToColorForPixel<Color>(l0.y, ch1.y, ch2.y, videoFullRangeFlag);
-
+    Color color0 = YuvToColorForPixel<Color>(l0.x, ch1.x, ch2.x, matYuv2Color, videoFullRangeFlag),
+        color1 = YuvToColorForPixel<Color>(l0.y, ch1.y, ch2.y, matYuv2Color, videoFullRangeFlag);
 
     uint8_t* pDst = pColorp + x * sizeof(ColorUnitx2) / 2 + y * nColorpPitch;
     *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.x, color1.v.x };
@@ -335,7 +273,7 @@ __global__ static void Yuv444ToColorPlanarKernel(uint8_t* pYuv, int nYuvPitch, u
 }
 
 template<class YuvUnitx2, class Color, class ColorUnitx2>
-__global__ static void Yuv444ToColoraPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColorp, int nColorpPitch, int nWidth, int nHeight, bool videoFullRangeFlag) {
+__global__ static void Yuv444ToColoraPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColorp, int nColorpPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag) {
     int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
     int y = (threadIdx.y + blockIdx.y * blockDim.y);
     if (x + 1 >= nWidth || y >= nHeight) {
@@ -348,9 +286,8 @@ __global__ static void Yuv444ToColoraPlanarKernel(uint8_t* pYuv, int nYuvPitch,
     YuvUnitx2 ch1 = *(YuvUnitx2*)(pSrc + (nHeight * nYuvPitch));
     YuvUnitx2 ch2 = *(YuvUnitx2*)(pSrc + (2 * nHeight * nYuvPitch));
 
-    Color color0 = YuvToColoraForPixel<Color>(l0.x, ch1.x, ch2.x, videoFullRangeFlag),
-        color1 = YuvToColoraForPixel<Color>(l0.y, ch1.y, ch2.y, videoFullRangeFlag);
-
+    Color color0 = YuvToColoraForPixel<Color>(l0.x, ch1.x, ch2.x, matYuv2Color, videoFullRangeFlag),
+        color1 = YuvToColoraForPixel<Color>(l0.y, ch1.y, ch2.y, matYuv2Color, videoFullRangeFlag);
 
     uint8_t* pDst = pColorp + x * sizeof(ColorUnitx2) / 2 + y * nColorpPitch;
     *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.x, color1.v.x };
@@ -368,395 +305,395 @@ __global__ static void Yuv444ToColoraPlanarKernel(uint8_t* pYuv, int nYuvPitch,
 #define BLOCKSIZE_X 32
 #define BLOCKSIZE_Y 8
 
-void Y8ToGray8(uint8_t* dpY8, int nY8Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void Y8ToGray8(uint8_t* dpY8, int nY8Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YToGrayKernel<uchar2, unsigned char, uchar2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpY8, nY8Pitch, dpGray, nGrayPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpY8, nY8Pitch, dpGray, nGrayPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
-void Y8ToGray16(uint8_t* dpY8, int nY8Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void Y8ToGray16(uint8_t* dpY8, int nY8Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YToGrayKernel<uchar2, unsigned short, ushort2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpY8, nY8Pitch, dpGray, nGrayPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpY8, nY8Pitch, dpGray, nGrayPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
-void Y16ToGray8(uint8_t* dpY16, int nY16Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void Y16ToGray8(uint8_t* dpY16, int nY16Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YToGrayKernel<ushort2, unsigned char, uchar2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpY16, nY16Pitch, dpGray, nGrayPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpY16, nY16Pitch, dpGray, nGrayPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
-void Y16ToGray16(uint8_t* dpY16, int nY16Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void Y16ToGray16(uint8_t* dpY16, int nY16Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YToGrayKernel<ushort2, unsigned short, ushort2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpY16, nY16Pitch, dpGray, nGrayPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpY16, nY16Pitch, dpGray, nGrayPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR24>
-void Nv12ToColor24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void Nv12ToColor24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YuvToColorKernel<uchar2, COLOR24, ushort3>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, 2* BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR32>
-void Nv12ToColor32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void Nv12ToColor32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YuvToColoraKernel<uchar2, COLOR32, uint2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, 2* BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR48>
-void Nv12ToColor48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void Nv12ToColor48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YuvToColorKernel<uchar2, COLOR48, uint3>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, 2* BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR64>
-void Nv12ToColor64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void Nv12ToColor64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YuvToColoraKernel<uchar2, COLOR64, ulonglong2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, 2* BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR24>
-void YUV444ToColor24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void YUV444ToColor24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     Yuv444ToColorKernel<uchar2, COLOR24, ushort3>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR32>
-void YUV444ToColor32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void YUV444ToColor32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     Yuv444ToColoraKernel<uchar2, COLOR32, uint2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR48>
-void YUV444ToColor48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void YUV444ToColor48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     Yuv444ToColorKernel<uchar2, COLOR48, uint3>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR64>
-void YUV444ToColor64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void YUV444ToColor64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     Yuv444ToColoraKernel<uchar2, COLOR64, ulonglong2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR24>
-void P016ToColor24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void P016ToColor24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YuvToColorKernel<ushort2, COLOR24, ushort3>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, 2 * BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR32>
-void P016ToColor32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void P016ToColor32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YuvToColoraKernel<ushort2, COLOR32, uint2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, 2 * BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR48>
-void P016ToColor48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void P016ToColor48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YuvToColorKernel<ushort2, COLOR48, uint3>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, 2 * BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR64>
-void P016ToColor64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void P016ToColor64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YuvToColoraKernel<ushort2, COLOR64, ulonglong2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, 2 * BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR24>
-void YUV444P16ToColor24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void YUV444P16ToColor24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     Yuv444ToColorKernel<ushort2, COLOR24, ushort3>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR32>
-void YUV444P16ToColor32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void YUV444P16ToColor32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     Yuv444ToColoraKernel<ushort2, COLOR32, uint2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR48>
-void YUV444P16ToColor48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void YUV444P16ToColor48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     Yuv444ToColorKernel<ushort2, COLOR48, uint3>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR64>
-void YUV444P16ToColor64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void YUV444P16ToColor64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     Yuv444ToColoraKernel<ushort2, COLOR64, ulonglong2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR24>
-void Nv12ToColorPlanar24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void Nv12ToColorPlanar24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YuvToColorPlanarKernel<uchar2, COLOR24, uchar2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, 2 * BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR32>
-void Nv12ToColorPlanar32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void Nv12ToColorPlanar32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YuvToColoraPlanarKernel<uchar2, COLOR32, uchar2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, 2 * BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR48>
-void Nv12ToColorPlanar48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void Nv12ToColorPlanar48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YuvToColorPlanarKernel<uchar2, COLOR48, ushort2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, 2 * BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR64>
-void Nv12ToColorPlanar64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void Nv12ToColorPlanar64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YuvToColoraPlanarKernel<uchar2, COLOR64, ushort2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, 2 * BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR24>
-void P016ToColorPlanar24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void P016ToColorPlanar24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YuvToColorPlanarKernel<ushort2, COLOR24, uchar2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, 2 * BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR32>
-void P016ToColorPlanar32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void P016ToColorPlanar32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YuvToColoraPlanarKernel<ushort2, COLOR32, uchar2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, 2 * BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR48>
-void P016ToColorPlanar48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void P016ToColorPlanar48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YuvToColorPlanarKernel<ushort2, COLOR48, ushort2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, 2 * BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR64>
-void P016ToColorPlanar64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void P016ToColorPlanar64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     YuvToColoraPlanarKernel<ushort2, COLOR64, ushort2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, 2 * BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR24>
-void YUV444ToColorPlanar24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void YUV444ToColorPlanar24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     Yuv444ToColorPlanarKernel<uchar2, COLOR24, uchar2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR32>
-void YUV444ToColorPlanar32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void YUV444ToColorPlanar32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     Yuv444ToColoraPlanarKernel<uchar2, COLOR32, uchar2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR48>
-void YUV444ToColorPlanar48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void YUV444ToColorPlanar48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     Yuv444ToColorPlanarKernel<uchar2, COLOR48, ushort2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR64>
-void YUV444ToColorPlanar64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void YUV444ToColorPlanar64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     Yuv444ToColoraPlanarKernel<uchar2, COLOR64, ushort2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR24>
-void YUV444P16ToColorPlanar24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void YUV444P16ToColorPlanar24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     Yuv444ToColorPlanarKernel<ushort2, COLOR24, uchar2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR32>
-void YUV444P16ToColorPlanar32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void YUV444P16ToColorPlanar32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     Yuv444ToColoraPlanarKernel<ushort2, COLOR32, uchar2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR48>
-void YUV444P16ToColorPlanar48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void YUV444P16ToColorPlanar48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     Yuv444ToColorPlanarKernel<ushort2, COLOR48, ushort2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
+        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
     if (stream == 0)
         cudaSafeCall(cudaStreamSynchronize(stream));
 }
 
 template <class COLOR64>
-void YUV444P16ToColorPlanar64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) {
+void YUV444P16ToColorPlanar64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream) {
     Yuv444ToColoraPlanarKernel<ushort2, COLOR64, ushort2>
         <<<dim3(divUp(nWidth, 2 * BLOCKSIZE_X), divUp(nHeight, BLOCKSIZE_Y)), dim3(BLOCKSIZE_X, BLOCKSIZE_Y), 0, stream>>>
-        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag);
-    if (stream == 0)
-        cudaSafeCall(cudaStreamSynchronize(stream));
-}
-
-template void Nv12ToColor24<BGR24>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void Nv12ToColor24<RGB24>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void Nv12ToColor32<BGRA32>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void Nv12ToColor32<RGBA32>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void Nv12ToColor48<BGR48>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void Nv12ToColor48<RGB48>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void Nv12ToColor64<BGRA64>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void Nv12ToColor64<RGBA64>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-
-template void Nv12ToColorPlanar24<BGR24>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void Nv12ToColorPlanar24<RGB24>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void Nv12ToColorPlanar32<BGRA32>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void Nv12ToColorPlanar32<RGBA32>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void Nv12ToColorPlanar48<BGR48>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void Nv12ToColorPlanar48<RGB48>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void Nv12ToColorPlanar64<BGRA64>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void Nv12ToColorPlanar64<RGBA64>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-
-template void P016ToColor24<BGR24>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void P016ToColor24<RGB24>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void P016ToColor32<BGRA32>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void P016ToColor32<RGBA32>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void P016ToColor48<BGR48>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void P016ToColor48<RGB48>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void P016ToColor64<BGRA64>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void P016ToColor64<RGBA64>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-
-template void P016ToColorPlanar24<BGR24>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void P016ToColorPlanar24<RGB24>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void P016ToColorPlanar32<BGRA32>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void P016ToColorPlanar32<RGBA32>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void P016ToColorPlanar48<BGR48>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void P016ToColorPlanar48<RGB48>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void P016ToColorPlanar64<BGRA64>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void P016ToColorPlanar64<RGBA64>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-
-template void YUV444ToColor24<BGR24>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444ToColor24<RGB24>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444ToColor32<BGRA32>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444ToColor32<RGBA32>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444ToColor48<BGR48>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444ToColor48<RGB48>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444ToColor64<BGRA64>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444ToColor64<RGBA64>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-
-template void YUV444ToColorPlanar24<BGR24>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444ToColorPlanar24<RGB24>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444ToColorPlanar32<BGRA32>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444ToColorPlanar32<RGBA32>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444ToColorPlanar48<BGR48>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444ToColorPlanar48<RGB48>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444ToColorPlanar64<BGRA64>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444ToColorPlanar64<RGBA64>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-
-template void YUV444P16ToColor24<BGR24>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444P16ToColor24<RGB24>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444P16ToColor32<BGRA32>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444P16ToColor32<RGBA32>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444P16ToColor48<BGR48>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444P16ToColor48<RGB48>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444P16ToColor64<BGRA64>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444P16ToColor64<RGBA64>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-
-template void YUV444P16ToColorPlanar24<BGR24>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444P16ToColorPlanar24<RGB24>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444P16ToColorPlanar32<BGRA32>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444P16ToColorPlanar32<RGBA32>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444P16ToColorPlanar48<BGR48>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444P16ToColorPlanar48<RGB48>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444P16ToColorPlanar64<BGRA64>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template void YUV444P16ToColorPlanar64<RGBA64>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
+        (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, matYuv2Color, videoFullRangeFlag);
+    if (stream == 0)
+        cudaSafeCall(cudaStreamSynchronize(stream));
+}
+
+template void Nv12ToColor24<BGR24>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void Nv12ToColor24<RGB24>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void Nv12ToColor32<BGRA32>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void Nv12ToColor32<RGBA32>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void Nv12ToColor48<BGR48>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void Nv12ToColor48<RGB48>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void Nv12ToColor64<BGRA64>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void Nv12ToColor64<RGBA64>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+
+template void Nv12ToColorPlanar24<BGR24>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void Nv12ToColorPlanar24<RGB24>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void Nv12ToColorPlanar32<BGRA32>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void Nv12ToColorPlanar32<RGBA32>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void Nv12ToColorPlanar48<BGR48>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void Nv12ToColorPlanar48<RGB48>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void Nv12ToColorPlanar64<BGRA64>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void Nv12ToColorPlanar64<RGBA64>(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+
+template void P016ToColor24<BGR24>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void P016ToColor24<RGB24>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void P016ToColor32<BGRA32>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void P016ToColor32<RGBA32>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void P016ToColor48<BGR48>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void P016ToColor48<RGB48>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void P016ToColor64<BGRA64>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void P016ToColor64<RGBA64>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+
+template void P016ToColorPlanar24<BGR24>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void P016ToColorPlanar24<RGB24>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void P016ToColorPlanar32<BGRA32>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void P016ToColorPlanar32<RGBA32>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void P016ToColorPlanar48<BGR48>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void P016ToColorPlanar48<RGB48>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void P016ToColorPlanar64<BGRA64>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void P016ToColorPlanar64<RGBA64>(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+
+template void YUV444ToColor24<BGR24>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444ToColor24<RGB24>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444ToColor32<BGRA32>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444ToColor32<RGBA32>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444ToColor48<BGR48>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444ToColor48<RGB48>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444ToColor64<BGRA64>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444ToColor64<RGBA64>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+
+template void YUV444ToColorPlanar24<BGR24>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444ToColorPlanar24<RGB24>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444ToColorPlanar32<BGRA32>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444ToColorPlanar32<RGBA32>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444ToColorPlanar48<BGR48>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444ToColorPlanar48<RGB48>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444ToColorPlanar64<BGRA64>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444ToColorPlanar64<RGBA64>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+
+template void YUV444P16ToColor24<BGR24>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444P16ToColor24<RGB24>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444P16ToColor32<BGRA32>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444P16ToColor32<RGBA32>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444P16ToColor48<BGR48>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444P16ToColor48<RGB48>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444P16ToColor64<BGRA64>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444P16ToColor64<RGBA64>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+
+template void YUV444P16ToColorPlanar24<BGR24>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444P16ToColorPlanar24<RGB24>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444P16ToColorPlanar32<BGRA32>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444P16ToColorPlanar32<RGBA32>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444P16ToColorPlanar48<BGR48>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444P16ToColorPlanar48<RGB48>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444P16ToColorPlanar64<BGRA64>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template void YUV444P16ToColorPlanar64<RGBA64>(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
 }}}
diff --git a/modules/cudacodec/src/cuda/ColorSpace.h b/modules/cudacodec/src/cuda/ColorSpace.h
index d730aa37fd1..42166f24971 100644
--- a/modules/cudacodec/src/cuda/ColorSpace.h
+++ b/modules/cudacodec/src/cuda/ColorSpace.h
@@ -7,6 +7,10 @@
 #include <cuda_runtime.h>
 
 namespace cv { namespace cuda { namespace device {
+struct ColorMatrix {
+    float m[3][3];
+};
+
 union BGR24 {
     uchar3 v;
     struct {
diff --git a/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp b/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp
index ff9aa5708c4..ae5805094f1 100644
--- a/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp
+++ b/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp
@@ -13,91 +13,149 @@ Ptr<NVSurfaceToColorConverter> cv::cudacodec::createNVSurfaceToColorConverter(co
 #else
 #include "cuda/ColorSpace.h"
 namespace cv { namespace cuda { namespace device {
-template<class BGR24> void Nv12ToColor24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGB24> void Nv12ToColor24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGRA32> void Nv12ToColor32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGBA32> void Nv12ToColor32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGR48> void Nv12ToColor48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGB48> void Nv12ToColor48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGRA64> void Nv12ToColor64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGBA64> void Nv12ToColor64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-
-template<class BGR24> void Nv12ToColorPlanar24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGB24> void Nv12ToColorPlanar24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGRA32> void Nv12ToColorPlanar32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGBA32> void Nv12ToColorPlanar32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGR48> void Nv12ToColorPlanar48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGB48> void Nv12ToColorPlanar48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGRA64> void Nv12ToColorPlanar64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGBA64> void Nv12ToColorPlanar64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-
-template<class BGR24> void P016ToColor24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGB24> void P016ToColor24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGRA32> void P016ToColor32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGBA32> void P016ToColor32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGR48> void P016ToColor48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGB48> void P016ToColor48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGRA64> void P016ToColor64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGBA64> void P016ToColor64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-
-template<class BGR24> void P016ToColorPlanar24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGB24> void P016ToColorPlanar24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGRA32> void P016ToColorPlanar32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGBA32> void P016ToColorPlanar32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag);
-template<class BGR48> void P016ToColorPlanar48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGB48> void P016ToColorPlanar48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGRA64> void P016ToColorPlanar64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGBA64> void P016ToColorPlanar64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-
-template<class BGR24> void YUV444ToColor24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGB24> void YUV444ToColor24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGRA32> void YUV444ToColor32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGBA32> void YUV444ToColor32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGR48> void YUV444ToColor48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGB48> void YUV444ToColor48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGRA64> void YUV444ToColor64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGBA64> void YUV444ToColor64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-
-template<class BGR24> void YUV444ToColorPlanar24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGB24> void YUV444ToColorPlanar24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGRA32> void YUV444ToColorPlanar32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGBA32> void YUV444ToColorPlanar32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGR48> void YUV444ToColorPlanar48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGB48> void YUV444ToColorPlanar48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGRA64> void YUV444ToColorPlanar64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGBA64> void YUV444ToColorPlanar64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-
-template<class BGR24> void YUV444P16ToColor24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGB24> void YUV444P16ToColor24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGRA32> void YUV444P16ToColor32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGBA32> void YUV444P16ToColor32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGR48> void YUV444P16ToColor48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGB48> void YUV444P16ToColor48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGRA64> void YUV444P16ToColor64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGBA64> void YUV444P16ToColor64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-
-template<class BGR24> void YUV444P16ToColorPlanar24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGB24> void YUV444P16ToColorPlanar24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGRA32> void YUV444P16ToColorPlanar32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGBA32> void YUV444P16ToColorPlanar32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGR48> void YUV444P16ToColorPlanar48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGB48> void YUV444P16ToColorPlanar48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class BGRA64> void YUV444P16ToColorPlanar64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-template<class RGBA64> void YUV444P16ToColorPlanar64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-
-void Y8ToGray8(uint8_t* dpY8, int nY8Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-void Y8ToGray16(uint8_t* dpY8, int nY8Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-void Y16ToGray8(uint8_t* dpY16, int nY16Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-void Y16ToGray16(uint8_t* dpY16, int nY16Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream);
-
-void SetMatYuv2Rgb(int iMatrix, bool);
+template<class BGR24> void Nv12ToColor24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGB24> void Nv12ToColor24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGRA32> void Nv12ToColor32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGBA32> void Nv12ToColor32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGR48> void Nv12ToColor48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGB48> void Nv12ToColor48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGRA64> void Nv12ToColor64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGBA64> void Nv12ToColor64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+
+template<class BGR24> void Nv12ToColorPlanar24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGB24> void Nv12ToColorPlanar24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGRA32> void Nv12ToColorPlanar32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGBA32> void Nv12ToColorPlanar32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGR48> void Nv12ToColorPlanar48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGB48> void Nv12ToColorPlanar48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGRA64> void Nv12ToColorPlanar64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGBA64> void Nv12ToColorPlanar64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+
+template<class BGR24> void P016ToColor24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGB24> void P016ToColor24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGRA32> void P016ToColor32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGBA32> void P016ToColor32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGR48> void P016ToColor48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGB48> void P016ToColor48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGRA64> void P016ToColor64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGBA64> void P016ToColor64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+
+template<class BGR24> void P016ToColorPlanar24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGB24> void P016ToColorPlanar24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGRA32> void P016ToColorPlanar32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGBA32> void P016ToColorPlanar32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag);
+template<class BGR48> void P016ToColorPlanar48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGB48> void P016ToColorPlanar48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGRA64> void P016ToColorPlanar64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGBA64> void P016ToColorPlanar64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+
+template<class BGR24> void YUV444ToColor24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGB24> void YUV444ToColor24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGRA32> void YUV444ToColor32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGBA32> void YUV444ToColor32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGR48> void YUV444ToColor48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGB48> void YUV444ToColor48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGRA64> void YUV444ToColor64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGBA64> void YUV444ToColor64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+
+template<class BGR24> void YUV444ToColorPlanar24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGB24> void YUV444ToColorPlanar24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGRA32> void YUV444ToColorPlanar32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGBA32> void YUV444ToColorPlanar32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGR48> void YUV444ToColorPlanar48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGB48> void YUV444ToColorPlanar48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGRA64> void YUV444ToColorPlanar64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGBA64> void YUV444ToColorPlanar64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+
+template<class BGR24> void YUV444P16ToColor24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGB24> void YUV444P16ToColor24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGRA32> void YUV444P16ToColor32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGBA32> void YUV444P16ToColor32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGR48> void YUV444P16ToColor48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGB48> void YUV444P16ToColor48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGRA64> void YUV444P16ToColor64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGBA64> void YUV444P16ToColor64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+
+template<class BGR24> void YUV444P16ToColorPlanar24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGB24> void YUV444P16ToColorPlanar24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGRA32> void YUV444P16ToColorPlanar32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGBA32> void YUV444P16ToColorPlanar32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGR48> void YUV444P16ToColorPlanar48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGB48> void YUV444P16ToColorPlanar48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class BGRA64> void YUV444P16ToColorPlanar64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+template<class RGBA64> void YUV444P16ToColorPlanar64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+
+void Y8ToGray8(uint8_t* dpY8, int nY8Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+void Y8ToGray16(uint8_t* dpY8, int nY8Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+void Y16ToGray8(uint8_t* dpY16, int nY16Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
+void Y16ToGray16(uint8_t* dpY16, int nY16Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, ColorMatrix matYuv2Color, bool videoFullRangeFlag, const cudaStream_t stream);
 }}}
 
 using namespace cuda::device;
+namespace {
+void GetConstants(int iMatrix, float& wr, float& wb, int& black, int& white, int& uvWhite, int& max, bool fullRange = false) {
+    if (fullRange) {
+        black = 0; white = 255; uvWhite = 255;
+    }
+    else {
+        black = 16; white = 235; uvWhite = 240;
+    }
+    max = 255;
+
+    switch (static_cast<cv::cudacodec::ColorSpaceStandard>(iMatrix))
+    {
+    case cv::cudacodec::ColorSpaceStandard::BT709:
+    default:
+        wr = 0.2126f; wb = 0.0722f;
+        break;
+
+    case cv::cudacodec::ColorSpaceStandard::FCC:
+        wr = 0.30f; wb = 0.11f;
+        break;
+
+    case cv::cudacodec::ColorSpaceStandard::BT470:
+    case cv::cudacodec::ColorSpaceStandard::BT601:
+        wr = 0.2990f; wb = 0.1140f;
+        break;
+
+    case cv::cudacodec::ColorSpaceStandard::SMPTE240M:
+        wr = 0.212f; wb = 0.087f;
+        break;
+
+    case cv::cudacodec::ColorSpaceStandard::BT2020:
+    case cv::cudacodec::ColorSpaceStandard::BT2020C:
+        wr = 0.2627f; wb = 0.0593f;
+        // 10-bit only
+        black = 64 << 6; white = 940 << 6;
+        max = (1 << 16) - 1;
+        break;
+    }
+}
+
+void SetMatYuv2Rgb(int iMatrix, ColorMatrix& matYuv2Color, bool fullRange = false) {
+    float wr, wb;
+    int black, white, max, uvWhite;
+    GetConstants(iMatrix, wr, wb, black, white, uvWhite, max, fullRange);
+    float mat[3][3] = {
+        1.0f, 0.0f, (1.0f - wr) / 0.5f,
+        1.0f, -wb * (1.0f - wb) / 0.5f / (1 - wb - wr), -wr * (1 - wr) / 0.5f / (1 - wb - wr),
+        1.0f, (1.0f - wb) / 0.5f, 0.0f,
+    };
+    for (int i = 0; i < 3; i++) {
+        for (int j = 0; j < 3; j++) {
+            if (j == 0)
+                matYuv2Color.m[i][j] = (float)(1.0 * max / (white - black) * mat[i][j]);
+            else
+                matYuv2Color.m[i][j] = (float)(1.0 * max / (uvWhite - black) * mat[i][j]);
+        }
+    }
+}
+}
+
 class NVSurfaceToColorConverterImpl : public NVSurfaceToColorConverter {
 public:
     NVSurfaceToColorConverterImpl(ColorSpaceStandard colorSpace, bool fullColorRange = false) {
-        SetMatYuv2Rgb(static_cast<int>(colorSpace), fullColorRange);
+        SetMatYuv2Rgb(static_cast<int>(colorSpace), matYuv2Color, fullColorRange);
     }
 
     int OutputColorFormatIdx(const cudacodec::ColorFormat format) {
@@ -142,7 +200,7 @@ class NVSurfaceToColorConverterImpl : public NVSurfaceToColorConverter {
         const bool yuv420 = surfaceFormat == SurfaceFormat::SF_NV12 || surfaceFormat == SurfaceFormat::SF_P016;
         CV_Assert(yuv.cols() % 2 == 0);
 
-        using func_t = void (*)(uint8_t* yuv, int yuvPitch, uint8_t* color, int colorPitch, int width, int height, bool videoFullRangeFlag, cudaStream_t stream);
+        using func_t = void (*)(uint8_t* yuv, int yuvPitch, uint8_t* color, int colorPitch, int width, int height, ColorMatrix matYuv2Color, bool videoFullRangeFlag, cudaStream_t stream);
 
         static const func_t funcsNV12[5][2][2] =
         {
@@ -277,11 +335,12 @@ class NVSurfaceToColorConverterImpl : public NVSurfaceToColorConverter {
             CV_Error(Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
         CV_Assert(out_.step <= static_cast<size_t>(std::numeric_limits<int>::max()));
-        func((uint8_t*)yuv_.ptr(0), static_cast<int>(yuv_.step), (uint8_t*)out_.ptr(0), static_cast<int>(out_.step), out_.cols, nRows, videoFullRangeFlag, StreamAccessor::getStream(stream));
+        func((uint8_t*)yuv_.ptr(0), static_cast<int>(yuv_.step), (uint8_t*)out_.ptr(0), static_cast<int>(out_.step), out_.cols, nRows, matYuv2Color, videoFullRangeFlag, StreamAccessor::getStream(stream));
         return true;
     }
 
-
+private:
+    ColorMatrix matYuv2Color;
 };
 
 Ptr<NVSurfaceToColorConverter> cv::cudacodec::createNVSurfaceToColorConverter(const ColorSpaceStandard colorSpace, const bool videoFullRangeFlag) {

From 61ccd6e199e0c13c953ae21366472a3bb55a33be Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@opencv.ai>
Date: Thu, 23 Oct 2025 09:05:09 +0300
Subject: [PATCH 3/5] cvv_demo.cpp build fix.

---
 modules/cvv/samples/cvv_demo.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/cvv/samples/cvv_demo.cpp b/modules/cvv/samples/cvv_demo.cpp
index c954d1819cc..8d0318034ca 100644
--- a/modules/cvv/samples/cvv_demo.cpp
+++ b/modules/cvv/samples/cvv_demo.cpp
@@ -56,8 +56,8 @@ main(int argc, char** argv)
 
   if (res_w>0 && res_h>0) {
     printf("Setting resolution to %dx%d\n", res_w, res_h);
-    capture.set(CV_CAP_PROP_FRAME_WIDTH, res_w);
-    capture.set(CV_CAP_PROP_FRAME_HEIGHT, res_h);
+    capture.set(cv::CAP_PROP_FRAME_WIDTH, res_w);
+    capture.set(cv::CAP_PROP_FRAME_HEIGHT, res_h);
   }
 
 

From f552db23d9dbf4e05e6626c691d0a35e566a5074 Mon Sep 17 00:00:00 2001
From: cudawarped <12133430+cudawarped@users.noreply.github.com>
Date: Tue, 21 Oct 2025 17:11:09 +0300
Subject: [PATCH 4/5] Expose NVSurfaceToColorConverter::convert to python and
 remove unecessary videoFullRangeFlag argument.

---
 modules/cudacodec/include/opencv2/cudacodec.hpp    |  3 +--
 .../cudacodec/misc/python/test/test_cudacodec.py   | 14 +++++++++++---
 .../nvidia_surface_format_to_color_converter.cpp   |  7 +++++--
 modules/cudacodec/src/video_reader.cpp             |  8 ++++----
 modules/cudacodec/test/test_video.cpp              |  2 +-
 5 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/modules/cudacodec/include/opencv2/cudacodec.hpp b/modules/cudacodec/include/opencv2/cudacodec.hpp
index 35f5b28cc83..1a83c57d6d5 100644
--- a/modules/cudacodec/include/opencv2/cudacodec.hpp
+++ b/modules/cudacodec/include/opencv2/cudacodec.hpp
@@ -392,10 +392,9 @@ class CV_EXPORTS_W NVSurfaceToColorConverter {
      * @param outputFormat The requested output color format.
      * @param bitDepth The requested bit depth of the output frame.
      * @param planar Request seperate planes for each color plane.
-     * @param videoFullRangeFlag Indicates if the black level, luma and chroma of the source are represented using the full or limited range (AKA TV or "analogue" range) of values as defined in Annex E of the ITU-T Specification.
      * @param stream Stream for the asynchronous version.
      */
-    virtual bool convert(InputArray yuv, OutputArray color, const SurfaceFormat surfaceFormat, const ColorFormat outputFormat, const BitDepth bitDepth = BitDepth::UNCHANGED, const bool planar = false, const bool videoFullRangeFlag = false, cuda::Stream& stream = cuda::Stream::Null()) = 0;
+    CV_WRAP virtual bool convert(InputArray yuv, OutputArray color, const SurfaceFormat surfaceFormat, const ColorFormat outputFormat, const BitDepth bitDepth = BitDepth::UNCHANGED, const bool planar = false, cuda::Stream& stream = cuda::Stream::Null()) = 0;
 };
 
 /** @brief Creates a NVSurfaceToColorConverter.
diff --git a/modules/cudacodec/misc/python/test/test_cudacodec.py b/modules/cudacodec/misc/python/test/test_cudacodec.py
index cf20daeb181..6c22b57251b 100644
--- a/modules/cudacodec/misc/python/test/test_cudacodec.py
+++ b/modules/cudacodec/misc/python/test/test_cudacodec.py
@@ -60,8 +60,8 @@ def test_reader(self):
 
             # Change color format
             ret, colour_code = reader.getVideoReaderProps(cv.cudacodec.VideoReaderProps_PROP_COLOR_FORMAT)
-            self.assertTrue(ret and colour_code == cv.cudacodec.ColorFormat_BGRA)
-            colour_code_gs = cv.cudacodec.ColorFormat_GRAY
+            self.assertTrue(ret and colour_code == cv.cudacodec.BGRA)
+            colour_code_gs = cv.cudacodec.GRAY
             reader.set(colour_code_gs)
             ret, colour_code = reader.getVideoReaderProps(cv.cudacodec.VideoReaderProps_PROP_COLOR_FORMAT)
             self.assertTrue(ret and colour_code == colour_code_gs)
@@ -91,6 +91,14 @@ def test_reader(self):
             else:
                 self.skipTest(e.err)
 
+    def test_NVSurfaceToColorConverter(self):
+        converter = cv.cudacodec.createNVSurfaceToColorConverter(cv.cudacodec.ColorSpaceStandard_BT601,False)
+        bgr_sz = (1920,1080)
+        nv12_sz = (1920, int(1.5*1080))
+        blank_nv12_frame = cv.cuda.GpuMat(nv12_sz,cv.CV_8U)
+        ret, bgr = converter.convert(blank_nv12_frame, cv.cudacodec.SF_NV12, cv.cudacodec.BGR)
+        self.assertTrue(ret == True and bgr.size() == bgr_sz)
+
     def test_map_histogram(self):
         hist = cv.cuda_GpuMat((1,256), cv.CV_8UC1)
         hist.setTo(1)
@@ -107,7 +115,7 @@ def test_writer(self):
             encoder_params_in.gopLength = 10
             stream = cv.cuda.Stream()
             sz = (1920,1080)
-            writer = cv.cudacodec.createVideoWriter(fname, sz, cv.cudacodec.H264, 30, cv.cudacodec.ColorFormat_BGR,
+            writer = cv.cudacodec.createVideoWriter(fname, sz, cv.cudacodec.H264, 30, cv.cudacodec.BGR,
                                                     encoder_params_in, stream=stream)
             blankFrameIn = cv.cuda.GpuMat(sz,cv.CV_8UC3)
             writer.write(blankFrameIn)
diff --git a/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp b/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp
index ae5805094f1..c04b22b8a7e 100644
--- a/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp
+++ b/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp
@@ -154,7 +154,9 @@ void SetMatYuv2Rgb(int iMatrix, ColorMatrix& matYuv2Color, bool fullRange = fals
 
 class NVSurfaceToColorConverterImpl : public NVSurfaceToColorConverter {
 public:
-    NVSurfaceToColorConverterImpl(ColorSpaceStandard colorSpace, bool fullColorRange = false) {
+    NVSurfaceToColorConverterImpl(ColorSpaceStandard colorSpace, bool fullColorRange = false) :
+        videoFullRangeFlag(fullColorRange)
+    {
         SetMatYuv2Rgb(static_cast<int>(colorSpace), matYuv2Color, fullColorRange);
     }
 
@@ -194,7 +196,7 @@ class NVSurfaceToColorConverterImpl : public NVSurfaceToColorConverter {
         }
     }
 
-    bool convert(const InputArray yuv, const OutputArray out, const SurfaceFormat surfaceFormat, const ColorFormat outputFormat, const BitDepth bitDepth, const bool planar, const bool videoFullRangeFlag, cuda::Stream& stream) {
+    bool convert(const InputArray yuv, const OutputArray out, const SurfaceFormat surfaceFormat, const ColorFormat outputFormat, const BitDepth bitDepth, const bool planar, cuda::Stream& stream) {
         CV_Assert(outputFormat == ColorFormat::BGR || outputFormat == ColorFormat::BGRA || outputFormat == ColorFormat::RGB || outputFormat == ColorFormat::RGBA || outputFormat == ColorFormat::GRAY);
         CV_Assert(yuv.depth() == CV_8U || yuv.depth() == CV_16U);
         const bool yuv420 = surfaceFormat == SurfaceFormat::SF_NV12 || surfaceFormat == SurfaceFormat::SF_P016;
@@ -341,6 +343,7 @@ class NVSurfaceToColorConverterImpl : public NVSurfaceToColorConverter {
 
 private:
     ColorMatrix matYuv2Color;
+    bool videoFullRangeFlag;
 };
 
 Ptr<NVSurfaceToColorConverter> cv::cudacodec::createNVSurfaceToColorConverter(const ColorSpaceStandard colorSpace, const bool videoFullRangeFlag) {
diff --git a/modules/cudacodec/src/video_reader.cpp b/modules/cudacodec/src/video_reader.cpp
index 28bbc113163..d4465eb49a7 100644
--- a/modules/cudacodec/src/video_reader.cpp
+++ b/modules/cudacodec/src/video_reader.cpp
@@ -92,7 +92,7 @@ namespace
         void releaseFrameInfo(const std::pair<CUVIDPARSERDISPINFO, CUVIDPROCPARAMS>& frameInfo);
         bool internalGrab(GpuMat & frame, GpuMat & histogram, Stream & stream);
         void waitForDecoderInit();
-        void cvtFromYuv(const GpuMat& decodedFrame, GpuMat& outFrame, const SurfaceFormat surfaceFormat, const bool videoFullRangeFlag, Stream& stream);
+        void cvtFromYuv(const GpuMat& decodedFrame, GpuMat& outFrame, const SurfaceFormat surfaceFormat, Stream& stream);
 
         Ptr<VideoSource> videoSource_;
 
@@ -260,7 +260,7 @@ namespace
                 cuSafeCall(cuMemcpyDtoDAsync((CUdeviceptr)(histogram.data), cuHistogramPtr, histogramSz, StreamAccessor::getStream(stream)));
             }
 
-            cvtFromYuv(decodedFrame, frame, videoDecoder_->format().surfaceFormat, videoDecoder_->format().videoFullRangeFlag, stream);
+            cvtFromYuv(decodedFrame, frame, videoDecoder_->format().surfaceFormat, stream);
             // unmap video frame
             // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)
             videoDecoder_->unmapFrame(decodedFrame);
@@ -414,13 +414,13 @@ namespace
         return true;
     }
 
-    void VideoReaderImpl::cvtFromYuv(const GpuMat& decodedFrame, GpuMat& outFrame, const SurfaceFormat surfaceFormat, const bool videoFullRangeFlag, Stream& stream)
+    void VideoReaderImpl::cvtFromYuv(const GpuMat& decodedFrame, GpuMat& outFrame, const SurfaceFormat surfaceFormat, Stream& stream)
     {
         if (colorFormat == ColorFormat::NV_YUV_SURFACE_FORMAT) {
             decodedFrame.copyTo(outFrame, stream);
             return;
         }
-        yuvConverter->convert(decodedFrame, outFrame, surfaceFormat, colorFormat, bitDepth, planar, videoFullRangeFlag, stream);
+        yuvConverter->convert(decodedFrame, outFrame, surfaceFormat, colorFormat, bitDepth, planar, stream);
     }
 }
 
diff --git a/modules/cudacodec/test/test_video.cpp b/modules/cudacodec/test/test_video.cpp
index 791be89d7ae..aba3526db2f 100644
--- a/modules/cudacodec/test/test_video.cpp
+++ b/modules/cudacodec/test/test_video.cpp
@@ -998,7 +998,7 @@ CUDA_TEST_P(YuvConverter, Reader)
     Mat nv12Interleaved, bgrFromYuv;
     generateTestImages(bgr, nv12Interleaved, bgrFromYuv, surfaceFormat, outputFormat, bitDepth, planar, fullRange);
     GpuMat nv12Device(nv12Interleaved), bgrDevice(bgrFromYuv.size(), bgrFromYuv.type());
-    yuvConverter->convert(nv12Device, bgrDevice, surfaceFormat, outputFormat, bitDepth, planar, fullRange);
+    yuvConverter->convert(nv12Device, bgrDevice, surfaceFormat, outputFormat, bitDepth, planar);
     bgrDevice.download(bgrHost);
     EXPECT_MAT_NEAR(bgrFromYuv, bgrHost, bitDepth == cudacodec::BitDepth::EIGHT ? 2 :512);
 }

From d8222d37287f406ba4cd67e578c88c5293ca635c Mon Sep 17 00:00:00 2001
From: cudawarped <12133430+cudawarped@users.noreply.github.com>
Date: Tue, 28 Oct 2025 17:09:49 +0200
Subject: [PATCH 5/5] [cuda] Add cudart to all modules which use it

---
 modules/cudaarithm/CMakeLists.txt            | 23 +++++++++++---------
 modules/cudabgsegm/CMakeLists.txt            |  6 +++--
 modules/cudacodec/CMakeLists.txt             |  2 +-
 modules/cudafeatures2d/CMakeLists.txt        | 14 ++++++++++--
 modules/cudafilters/CMakeLists.txt           | 19 +++++++++++++---
 modules/cudaimgproc/CMakeLists.txt           |  7 +++---
 modules/cudalegacy/CMakeLists.txt            | 15 ++++++++++---
 modules/cudaobjdetect/CMakeLists.txt         |  4 ++++
 modules/cudaoptflow/CMakeLists.txt           |  4 ++++
 modules/cudastereo/CMakeLists.txt            |  6 +++--
 modules/cudawarping/CMakeLists.txt           | 20 ++++++++++++++---
 modules/superres/CMakeLists.txt              |  4 ++++
 modules/superres/src/precomp.hpp             |  7 ++++--
 modules/videostab/CMakeLists.txt             |  4 ++++
 modules/videostab/src/global_motion.cpp      |  6 +++++
 modules/videostab/src/wobble_suppression.cpp |  9 +++++---
 modules/xfeatures2d/CMakeLists.txt           |  4 ++++
 modules/xfeatures2d/src/precomp.hpp          |  4 ++++
 18 files changed, 124 insertions(+), 34 deletions(-)

diff --git a/modules/cudaarithm/CMakeLists.txt b/modules/cudaarithm/CMakeLists.txt
index fa9c9194163..34863ce2b07 100644
--- a/modules/cudaarithm/CMakeLists.txt
+++ b/modules/cudaarithm/CMakeLists.txt
@@ -7,38 +7,41 @@ set(the_description "CUDA-accelerated Operations on Matrices")
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
 
 set(extra_dependencies "")
-set(optional_dependencies "")
 if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
 	if(UNIX AND NOT BUILD_SHARED_LIBS AND CUDA_VERSION_STRING VERSION_GREATER_EQUAL 9.2 AND CUDA_VERSION_STRING VERSION_LESS 13.0 AND CMAKE_VERSION VERSION_GREATER_EQUAL 3.23)
     set(CUDA_FFT_LIB_EXT "_static_nocallback")
   endif()
-  list(APPEND extra_dependencies CUDA::cudart_static CUDA::nppial${CUDA_LIB_EXT} CUDA::nppc${CUDA_LIB_EXT} CUDA::nppitc${CUDA_LIB_EXT} CUDA::nppig${CUDA_LIB_EXT} CUDA::nppist${CUDA_LIB_EXT} CUDA::nppidei${CUDA_LIB_EXT})
+  list(APPEND extra_dependencies CUDA::cudart${CUDA_LIB_EXT} CUDA::nppial${CUDA_LIB_EXT} CUDA::nppc${CUDA_LIB_EXT} CUDA::nppitc${CUDA_LIB_EXT} CUDA::nppig${CUDA_LIB_EXT} CUDA::nppist${CUDA_LIB_EXT} CUDA::nppidei${CUDA_LIB_EXT})
   if(HAVE_CUBLAS)
-    list(APPEND optional_dependencies CUDA::cublas${CUDA_LIB_EXT})
+    list(APPEND extra_dependencies CUDA::cublas${CUDA_LIB_EXT})
     if(NOT CUDA_VERSION VERSION_LESS 10.1)
-      list(APPEND optional_dependencies CUDA::cublasLt${CUDA_LIB_EXT})
+      list(APPEND extra_dependencies CUDA::cublasLt${CUDA_LIB_EXT})
     endif()
   endif()
   if(HAVE_CUFFT)
     # static version requires seperable compilation which is incompatible with opencv's current library structure
     # the cufft_static_nocallback variant does not requires seperable compilation. callbacks are currently not used.
-    list(APPEND optional_dependencies CUDA::cufft${CUDA_FFT_LIB_EXT})
+    list(APPEND extra_dependencies CUDA::cufft${CUDA_FFT_LIB_EXT})
   endif()
 else()
   if(HAVE_CUBLAS)
-    list(APPEND optional_dependencies ${CUDA_cublas_LIBRARY})
+    list(APPEND extra_dependencies ${CUDA_cublas_LIBRARY})
   endif()
   if(HAVE_CUFFT)
-    list(APPEND optional_dependencies ${CUDA_cufft_LIBRARY})
+    list(APPEND extra_dependencies ${CUDA_cufft_LIBRARY})
   endif()
 endif()
 
-ocv_add_module(cudaarithm opencv_core ${extra_dependencies} OPTIONAL opencv_cudev ${optional_dependencies} WRAP python)
+ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudev WRAP python)
 
 ocv_module_include_directories()
 ocv_glob_module_sources()
 
-ocv_create_module()
+ocv_create_module(${extra_dependencies})
 
-ocv_add_accuracy_tests(DEPENDS_ON opencv_imgproc)
+set(test_libs "")
+if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
+  list(APPEND test_libs CUDA::cudart${CUDA_LIB_EXT})
+endif()
+ocv_add_accuracy_tests(${test_libs} DEPENDS_ON opencv_imgproc)
 ocv_add_perf_tests(DEPENDS_ON opencv_imgproc)
diff --git a/modules/cudabgsegm/CMakeLists.txt b/modules/cudabgsegm/CMakeLists.txt
index 1d2ef64d154..d833241d29f 100644
--- a/modules/cudabgsegm/CMakeLists.txt
+++ b/modules/cudabgsegm/CMakeLists.txt
@@ -5,7 +5,9 @@ endif()
 set(the_description "CUDA-accelerated Background Segmentation")
 
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
+
+ocv_define_module(cudabgsegm opencv_video WRAP python)
+
 if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
-  ocv_module_include_directories(${CUDAToolkit_INCLUDE_DIRS})
+  ocv_target_link_libraries(${the_module} PRIVATE CUDA::cudart${CUDA_LIB_EXT})
 endif()
-ocv_define_module(cudabgsegm opencv_video WRAP python)
diff --git a/modules/cudacodec/CMakeLists.txt b/modules/cudacodec/CMakeLists.txt
index a2dd450423f..eeda57149f8 100644
--- a/modules/cudacodec/CMakeLists.txt
+++ b/modules/cudacodec/CMakeLists.txt
@@ -32,7 +32,7 @@ endif()
 
 if(HAVE_NVCUVID OR HAVE_NVCUVENC)
   if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
-      list(APPEND extra_libs CUDA::cuda_driver)
+      list(APPEND extra_libs CUDA::cuda_driver CUDA::cudart${CUDA_LIB_EXT})
   else()
     list(APPEND extra_libs ${CUDA_CUDA_LIBRARY})
   endif()
diff --git a/modules/cudafeatures2d/CMakeLists.txt b/modules/cudafeatures2d/CMakeLists.txt
index 2b6023c0b66..68f4e2d61a8 100644
--- a/modules/cudafeatures2d/CMakeLists.txt
+++ b/modules/cudafeatures2d/CMakeLists.txt
@@ -5,7 +5,17 @@ endif()
 set(the_description "CUDA-accelerated Feature Detection and Description")
 
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4100 /wd4324 /wd4512 /wd4515 -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter -Wshadow)
+
+ocv_add_module(cudafeatures2d opencv_features2d opencv_cudafilters opencv_cudawarping WRAP python)
+
+ocv_module_include_directories()
+ocv_glob_module_sources()
+
+set(extra_libs "")
 if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
-  ocv_module_include_directories(${CUDAToolkit_INCLUDE_DIRS})
+  list(APPEND extra_libs CUDA::cudart${CUDA_LIB_EXT})
 endif()
-ocv_define_module(cudafeatures2d opencv_features2d opencv_cudafilters opencv_cudawarping WRAP python)
+ocv_create_module(${extra_libs})
+
+ocv_add_accuracy_tests(${extra_libs})
+ocv_add_perf_tests(${extra_libs})
diff --git a/modules/cudafilters/CMakeLists.txt b/modules/cudafilters/CMakeLists.txt
index 75ff3b26718..c0eeb920e39 100644
--- a/modules/cudafilters/CMakeLists.txt
+++ b/modules/cudafilters/CMakeLists.txt
@@ -5,8 +5,21 @@ endif()
 set(the_description "CUDA-accelerated Image Filtering")
 
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
-set(extra_libs "")
+
+ocv_add_module(cudafilters opencv_imgproc opencv_cudaarithm WRAP python)
+
+ocv_module_include_directories()
+ocv_glob_module_sources()
+
+set(build_libs "")
+if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
+  list(APPEND build_libs CUDA::cudart${CUDA_LIB_EXT} CUDA::nppif${CUDA_LIB_EXT} CUDA::nppim${CUDA_LIB_EXT})
+endif()
+ocv_create_module(${build_libs})
+
+set(test_libs "")
 if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
-  set(extra_libs CUDA::nppif${CUDA_LIB_EXT} CUDA::nppim${CUDA_LIB_EXT})
+  list(APPEND test_libs CUDA::cudart${CUDA_LIB_EXT})
 endif()
-ocv_define_module(cudafilters opencv_imgproc opencv_cudaarithm ${extra_libs} WRAP python)
+ocv_add_accuracy_tests(${test_libs})
+ocv_add_perf_tests()
diff --git a/modules/cudaimgproc/CMakeLists.txt b/modules/cudaimgproc/CMakeLists.txt
index de818f6a8b3..e70abca13e4 100644
--- a/modules/cudaimgproc/CMakeLists.txt
+++ b/modules/cudaimgproc/CMakeLists.txt
@@ -5,8 +5,9 @@ endif()
 set(the_description "CUDA-accelerated Image Processing")
 
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4100 /wd4324 /wd4512 /wd4515 -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter)
-set(extra_libs "")
+
+ocv_define_module(cudaimgproc opencv_imgproc OPTIONAL opencv_cudev opencv_cudaarithm opencv_cudafilters WRAP python)
+
 if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
-  set(extra_libs CUDA::nppial${CUDA_LIB_EXT} CUDA::nppist${CUDA_LIB_EXT} CUDA::nppicc${CUDA_LIB_EXT} CUDA::nppidei${CUDA_LIB_EXT})
+  ocv_target_link_libraries(${the_module} PRIVATE CUDA::cudart${CUDA_LIB_EXT} CUDA::nppial${CUDA_LIB_EXT} CUDA::nppist${CUDA_LIB_EXT} CUDA::nppicc${CUDA_LIB_EXT} CUDA::nppidei${CUDA_LIB_EXT})
 endif()
-ocv_define_module(cudaimgproc opencv_imgproc ${extra_libs} OPTIONAL opencv_cudev opencv_cudaarithm opencv_cudafilters WRAP python)
diff --git a/modules/cudalegacy/CMakeLists.txt b/modules/cudalegacy/CMakeLists.txt
index f3ebfeafde7..ba3f73510dc 100644
--- a/modules/cudalegacy/CMakeLists.txt
+++ b/modules/cudalegacy/CMakeLists.txt
@@ -5,8 +5,17 @@ endif()
 set(the_description "CUDA-accelerated Computer Vision (legacy)")
 
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4130 /wd4324 /wd4512 /wd4310 -Wundef -Wmissing-declarations -Wuninitialized -Wshadow -Wdeprecated-declarations -Wstrict-aliasing -Wtautological-compare)
+
+ocv_add_module(cudalegacy opencv_core opencv_video OPTIONAL opencv_objdetect opencv_imgproc opencv_calib3d opencv_cudaarithm opencv_cudafilters opencv_cudaimgproc)
+
+ocv_module_include_directories()
+ocv_glob_module_sources()
+
+set(extra_libs "")
 if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
-  ocv_module_include_directories(${CUDAToolkit_INCLUDE_DIRS})
+  list(APPEND extra_libs CUDA::cudart${CUDA_LIB_EXT})
 endif()
-ocv_define_module(cudalegacy opencv_core opencv_video
-  OPTIONAL opencv_objdetect opencv_imgproc opencv_calib3d opencv_cudaarithm opencv_cudafilters opencv_cudaimgproc)
+ocv_create_module(${extra_libs})
+
+ocv_add_accuracy_tests(${extra_libs})
+ocv_add_perf_tests(${extra_libs})
diff --git a/modules/cudaobjdetect/CMakeLists.txt b/modules/cudaobjdetect/CMakeLists.txt
index a3f565ff0ab..8329f84a5a5 100644
--- a/modules/cudaobjdetect/CMakeLists.txt
+++ b/modules/cudaobjdetect/CMakeLists.txt
@@ -7,3 +7,7 @@ set(the_description "CUDA-accelerated Object Detection")
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow -Wstrict-aliasing)
 
 ocv_define_module(cudaobjdetect opencv_objdetect opencv_cudaarithm opencv_cudawarping OPTIONAL opencv_cudalegacy WRAP python)
+
+if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
+  ocv_target_link_libraries(${the_module} PRIVATE CUDA::cudart${CUDA_LIB_EXT})
+endif()
diff --git a/modules/cudaoptflow/CMakeLists.txt b/modules/cudaoptflow/CMakeLists.txt
index d6ae902637d..b3af2a0bc8b 100644
--- a/modules/cudaoptflow/CMakeLists.txt
+++ b/modules/cudaoptflow/CMakeLists.txt
@@ -8,6 +8,10 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-d
 
 ocv_define_module(cudaoptflow opencv_video opencv_optflow opencv_cudaarithm opencv_cudawarping opencv_cudaimgproc OPTIONAL opencv_cudalegacy WRAP python)
 
+if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
+  ocv_target_link_libraries(${the_module} PRIVATE CUDA::cudart${CUDA_LIB_EXT})
+endif()
+
 if(NOT CUDA_VERSION VERSION_LESS "10.0")
   set(NVIDIA_OPTICAL_FLOW_2_0_HEADERS_COMMIT "edb50da3cf849840d680249aa6dbef248ebce2ca")
   set(NVIDIA_OPTICAL_FLOW_2_0_HEADERS_MD5 "a73cd48b18dcc0cc8933b30796074191")
diff --git a/modules/cudastereo/CMakeLists.txt b/modules/cudastereo/CMakeLists.txt
index d129536e903..631e70aad19 100644
--- a/modules/cudastereo/CMakeLists.txt
+++ b/modules/cudastereo/CMakeLists.txt
@@ -5,7 +5,9 @@ endif()
 set(the_description "CUDA-accelerated Stereo Correspondence")
 
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
+
+ocv_define_module(cudastereo opencv_calib3d OPTIONAL opencv_cudev WRAP python)
+
 if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
-  ocv_module_include_directories(${CUDAToolkit_INCLUDE_DIRS})
+  ocv_target_link_libraries(${the_module} PRIVATE CUDA::cudart${CUDA_LIB_EXT})
 endif()
-ocv_define_module(cudastereo opencv_calib3d OPTIONAL opencv_cudev WRAP python)
diff --git a/modules/cudawarping/CMakeLists.txt b/modules/cudawarping/CMakeLists.txt
index 67b4a1d6a62..403121f659e 100644
--- a/modules/cudawarping/CMakeLists.txt
+++ b/modules/cudawarping/CMakeLists.txt
@@ -5,8 +5,22 @@ endif()
 set(the_description "CUDA-accelerated Image Warping")
 
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
-set(extra_libs "")
+
+set(build_libs "")
+if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
+  list(APPEND build_libs CUDA::cudart${CUDA_LIB_EXT} CUDA::nppial${CUDA_LIB_EXT} CUDA::nppig${CUDA_LIB_EXT})
+endif()
+
+ocv_add_module(cudawarping opencv_core opencv_imgproc OPTIONAL opencv_cudev WRAP python)
+
+ocv_module_include_directories()
+ocv_glob_module_sources()
+
+ocv_create_module(${build_libs})
+
+set(test_libs "")
 if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
-  set(extra_libs CUDA::nppial${CUDA_LIB_EXT} CUDA::nppig${CUDA_LIB_EXT})
+  list(APPEND test_libs CUDA::cudart${CUDA_LIB_EXT})
 endif()
-ocv_define_module(cudawarping opencv_core opencv_imgproc ${extra_libs} OPTIONAL opencv_cudev WRAP python)
+ocv_add_accuracy_tests(${test_libs})
+ocv_add_perf_tests()
diff --git a/modules/superres/CMakeLists.txt b/modules/superres/CMakeLists.txt
index 2979ef0fb87..e5f455a3b83 100644
--- a/modules/superres/CMakeLists.txt
+++ b/modules/superres/CMakeLists.txt
@@ -9,3 +9,7 @@ endif()
 ocv_define_module(superres opencv_imgproc opencv_video opencv_optflow
                   OPTIONAL opencv_videoio opencv_cudaarithm opencv_cudafilters opencv_cudawarping opencv_cudaimgproc opencv_cudaoptflow opencv_cudacodec
                   WRAP python)
+
+if(HAVE_CUDA AND ENABLE_CUDA_FIRST_CLASS_LANGUAGE AND HAVE_opencv_cudaarithm AND HAVE_opencv_cudawarping AND HAVE_opencv_cudafilters)
+  ocv_target_link_libraries(${the_module} PRIVATE CUDA::cudart${CUDA_LIB_EXT})
+endif()
diff --git a/modules/superres/src/precomp.hpp b/modules/superres/src/precomp.hpp
index 9f12c248d6c..92370068799 100644
--- a/modules/superres/src/precomp.hpp
+++ b/modules/superres/src/precomp.hpp
@@ -54,9 +54,12 @@
 #include "opencv2/imgproc.hpp"
 #include "opencv2/video/tracking.hpp"
 #include "opencv2/core/private.hpp"
-
-#include "opencv2/core/private.cuda.hpp"
 #include "opencv2/core/ocl.hpp"
+#if !defined(HAVE_CUDA) || !defined(HAVE_OPENCV_CUDAARITHM) || !defined(HAVE_OPENCV_CUDAWARPING) || !defined(HAVE_OPENCV_CUDAFILTERS)
+#include "opencv2/core/private/cuda_stubs.hpp"
+#else
+#include "opencv2/core/private.cuda.hpp"
+#endif
 
 #ifdef HAVE_OPENCV_CUDAARITHM
 #  include "opencv2/cudaarithm.hpp"
diff --git a/modules/videostab/CMakeLists.txt b/modules/videostab/CMakeLists.txt
index c31fcfceb5f..def66625d0f 100644
--- a/modules/videostab/CMakeLists.txt
+++ b/modules/videostab/CMakeLists.txt
@@ -10,3 +10,7 @@ endif()
 
 ocv_define_module(videostab opencv_imgproc opencv_features2d opencv_video opencv_photo opencv_calib3d
 OPTIONAL opencv_cudawarping opencv_cudaoptflow opencv_videoio WRAP python)
+
+if(HAVE_CUDA AND ENABLE_CUDA_FIRST_CLASS_LANGUAGE AND ((HAVE_opencv_cudaimgproc AND HAVE_opencv_cudaoptflow) OR HAVE_opencv_cudawarping))
+  ocv_target_link_libraries(${the_module} PRIVATE CUDA::cudart${CUDA_LIB_EXT})
+endif()
diff --git a/modules/videostab/src/global_motion.cpp b/modules/videostab/src/global_motion.cpp
index eb42d6954d6..58363c911b7 100644
--- a/modules/videostab/src/global_motion.cpp
+++ b/modules/videostab/src/global_motion.cpp
@@ -47,7 +47,13 @@
 #include "opencv2/opencv_modules.hpp"
 #include "clp.hpp"
 
+#if defined(HAVE_OPENCV_CUDAIMGPROC) && defined(HAVE_OPENCV_CUDAOPTFLOW)
+#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+#include "opencv2/core/private/cuda_stubs.hpp"
+#else
 #include "opencv2/core/private.cuda.hpp"
+#endif
+#endif
 
 #if defined(HAVE_OPENCV_CUDAIMGPROC) && defined(HAVE_OPENCV_CUDAOPTFLOW)
     #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
diff --git a/modules/videostab/src/wobble_suppression.cpp b/modules/videostab/src/wobble_suppression.cpp
index 079c511b5d8..961ca5f4a29 100644
--- a/modules/videostab/src/wobble_suppression.cpp
+++ b/modules/videostab/src/wobble_suppression.cpp
@@ -44,10 +44,13 @@
 #include "opencv2/videostab/wobble_suppression.hpp"
 #include "opencv2/videostab/ring_buffer.hpp"
 
-#include "opencv2/core/private.cuda.hpp"
-
 #ifdef HAVE_OPENCV_CUDAWARPING
-#  include "opencv2/cudawarping.hpp"
+#include "opencv2/cudawarping.hpp"
+#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+#include "opencv2/core/private/cuda_stubs.hpp"
+#else
+#include "opencv2/core/private.cuda.hpp"
+#endif
 #endif
 
 #if defined(HAVE_OPENCV_CUDAWARPING)
diff --git a/modules/xfeatures2d/CMakeLists.txt b/modules/xfeatures2d/CMakeLists.txt
index f30d2ed8c21..bafd4c59bac 100644
--- a/modules/xfeatures2d/CMakeLists.txt
+++ b/modules/xfeatures2d/CMakeLists.txt
@@ -5,6 +5,10 @@ if(HAVE_CUDA)
 endif()
 ocv_define_module(xfeatures2d opencv_core opencv_imgproc opencv_features2d opencv_calib3d OPTIONAL opencv_shape opencv_ml opencv_cudaarithm WRAP python java objc)
 
+if(HAVE_CUDA AND ENABLE_CUDA_FIRST_CLASS_LANGUAGE AND HAVE_opencv_cudaarithm)
+  ocv_target_link_libraries(${the_module} PRIVATE CUDA::cudart${CUDA_LIB_EXT})
+endif()
+
 if(NOT OPENCV_SKIP_FEATURES2D_DOWNLOADING)
   include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/download_vgg.cmake)
   include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/download_boostdesc.cmake)
diff --git a/modules/xfeatures2d/src/precomp.hpp b/modules/xfeatures2d/src/precomp.hpp
index 376999600bc..88c295eaa98 100644
--- a/modules/xfeatures2d/src/precomp.hpp
+++ b/modules/xfeatures2d/src/precomp.hpp
@@ -50,7 +50,11 @@
 
 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/private.hpp"
+#if !defined(HAVE_CUDA) || !defined(HAVE_OPENCV_CUDAARITHM)
+#include "opencv2/core/private/cuda_stubs.hpp"
+#else
 #include "opencv2/core/private.cuda.hpp"
+#endif
 #include "opencv2/core/ocl.hpp"
 
 #include "opencv2/opencv_modules.hpp"