From 9a9b173cd178e7c07a98896a009c2a2021a6b247 Mon Sep 17 00:00:00 2001
From: cudawarped <12133430+cudawarped@users.noreply.github.com>
Date: Thu, 7 Aug 2025 21:55:10 +0300
Subject: [PATCH 1/3] cuda: update videostab for cuda 13.0

---
 modules/videostab/src/cuda/global_motion.cu | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/modules/videostab/src/cuda/global_motion.cu b/modules/videostab/src/cuda/global_motion.cu
index 7eca6ff76b7..c20ccfc2ed4 100644
--- a/modules/videostab/src/cuda/global_motion.cu
+++ b/modules/videostab/src/cuda/global_motion.cu
@@ -52,6 +52,11 @@ namespace cv { namespace cuda { namespace device { namespace globmotion {
 __constant__ float cml[9];
 __constant__ float cmr[9];
 
+struct is_zero // Stencil predicate handed to thrust::remove_if below: true for a zero mask byte.
+{
+    __host__ __device__ bool operator()(uchar maskByte) const { return !maskByte; }
+};
+
 int compactPoints(int N, float *points0, float *points1, const uchar *mask)
 {
     thrust::device_ptr<float2> dpoints0((float2*)points0);
@@ -60,7 +65,7 @@ int compactPoints(int N, float *points0, float *points1, const uchar *mask)
 
     return (int)(thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(dpoints0, dpoints1)),
                              thrust::make_zip_iterator(thrust::make_tuple(dpoints0 + N, dpoints1 + N)),
-                             dmask, thrust::not1(thrust::identity<uchar>()))
+                             dmask, is_zero())
            - thrust::make_zip_iterator(make_tuple(dpoints0, dpoints1)));
 }
 

From 700a1725ddaab843afd8be7033551d46867aff18 Mon Sep 17 00:00:00 2001
From: cudawarped <12133430+cudawarped@users.noreply.github.com>
Date: Sat, 9 Aug 2025 08:03:36 +0300
Subject: [PATCH 2/3] cudafilters: Add comment for use with CUDA streams

---
 .../include/opencv2/cudafilters.hpp           | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/modules/cudafilters/include/opencv2/cudafilters.hpp b/modules/cudafilters/include/opencv2/cudafilters.hpp
index 2aa9c846462..d92bdde2caa 100644
--- a/modules/cudafilters/include/opencv2/cudafilters.hpp
+++ b/modules/cudafilters/include/opencv2/cudafilters.hpp
@@ -97,6 +97,9 @@ center.
 @param borderMode Pixel extrapolation method. For details, see borderInterpolate .
 @param borderVal Default border value.
 
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
+
 @sa boxFilter
  */
 CV_EXPORTS_W Ptr<Filter> createBoxFilter(int srcType, int dstType, Size ksize, Point anchor = Point(-1, -1),
@@ -115,6 +118,9 @@ center.
 @param borderMode Pixel extrapolation method. For details, see borderInterpolate .
 @param borderVal Default border value.
 
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
+
 @sa filter2D
  */
 CV_EXPORTS_W Ptr<Filter> createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1, -1),
@@ -134,6 +140,9 @@ applied (see getDerivKernels ).
 @param borderMode Pixel extrapolation method. For details, see borderInterpolate .
 @param borderVal Default border value.
 
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
+
 @sa Laplacian
  */
 CV_EXPORTS_W Ptr<Filter> createLaplacianFilter(int srcType, int dstType, int ksize = 1, double scale = 1,
@@ -156,6 +165,9 @@ the aperture center.
 borderInterpolate.
 @param columnBorderMode Pixel extrapolation method in the horizontal direction.
 
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
+
 @sa sepFilter2D
  */
 CV_EXPORTS_W Ptr<Filter> createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel,
@@ -178,6 +190,9 @@ applied. For details, see getDerivKernels .
 @param rowBorderMode Pixel extrapolation method in the vertical direction. For details, see
 borderInterpolate.
 @param columnBorderMode Pixel extrapolation method in the horizontal direction.
+
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
  */
 CV_EXPORTS_W Ptr<Filter> createDerivFilter(int srcType, int dstType, int dx, int dy,
                                          int ksize, bool normalize = false, double scale = 1,
@@ -196,6 +211,9 @@ applied. For details, see getDerivKernels .
 borderInterpolate.
 @param columnBorderMode Pixel extrapolation method in the horizontal direction.
 
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
+
 @sa Sobel
  */
 CV_EXPORTS_W Ptr<Filter> createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize = 3,
@@ -213,6 +231,9 @@ applied. See getDerivKernels for details.
 borderInterpolate.
 @param columnBorderMode Pixel extrapolation method in the horizontal direction.
 
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
+
 @sa Scharr
  */
 CV_EXPORTS_W Ptr<Filter> createScharrFilter(int srcType, int dstType, int dx, int dy,
@@ -233,6 +254,9 @@ CV_EXPORTS_W Ptr<Filter> createScharrFilter(int srcType, int dstType, int dx, in
 borderInterpolate.
 @param columnBorderMode Pixel extrapolation method in the horizontal direction.
 
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
+
 @sa GaussianBlur
  */
 CV_EXPORTS_W Ptr<Filter> createGaussianFilter(int srcType, int dstType, Size ksize,
@@ -258,6 +282,9 @@ CV_EXPORTS_W Ptr<Filter> createGaussianFilter(int srcType, int dstType, Size ksi
 is at the center.
 @param iterations Number of times erosion and dilation to be applied.
 
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
+
 @sa morphologyEx
  */
 CV_EXPORTS_W Ptr<Filter> createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1);
@@ -272,6 +299,9 @@ CV_EXPORTS_W Ptr<Filter> createMorphologyFilter(int op, int srcType, InputArray
 @param anchor Anchor point. The default value (-1) means that the anchor is at the kernel center.
 @param borderMode Pixel extrapolation method. For details, see borderInterpolate .
 @param borderVal Default border value.
+
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
  */
 CV_EXPORTS_W Ptr<Filter> createBoxMaxFilter(int srcType, Size ksize,
                                           Point anchor = Point(-1, -1),
@@ -284,6 +314,9 @@ CV_EXPORTS_W Ptr<Filter> createBoxMaxFilter(int srcType, Size ksize,
 @param anchor Anchor point. The default value (-1) means that the anchor is at the kernel center.
 @param borderMode Pixel extrapolation method. For details, see borderInterpolate .
 @param borderVal Default border value.
+
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
  */
 CV_EXPORTS_W Ptr<Filter> createBoxMinFilter(int srcType, Size ksize,
                                           Point anchor = Point(-1, -1),
@@ -300,6 +333,9 @@ CV_EXPORTS_W Ptr<Filter> createBoxMinFilter(int srcType, Size ksize,
 @param anchor Anchor point. The default value (-1) means that the anchor is at the kernel center.
 @param borderMode Pixel extrapolation method. For details, see borderInterpolate .
 @param borderVal Default border value.
+
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
  */
 CV_EXPORTS_W Ptr<Filter> createRowSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
@@ -311,6 +347,9 @@ CV_EXPORTS_W Ptr<Filter> createRowSumFilter(int srcType, int dstType, int ksize,
 @param anchor Anchor point. The default value (-1) means that the anchor is at the kernel center.
 @param borderMode Pixel extrapolation method. For details, see borderInterpolate .
 @param borderVal Default border value.
+
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
  */
 CV_EXPORTS_W Ptr<Filter> createColumnSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
@@ -329,6 +368,9 @@ Outputs an image that has been filtered using a median-filtering formulation.
 Details on this algorithm can be found in:
 Green, O., 2017. "Efficient scalable median filtering using histogram-based operations",
                   IEEE Transactions on Image Processing, 27(5), pp.2217-2228.
+
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
  */
 CV_EXPORTS_W Ptr<Filter> createMedianFilter(int srcType, int windowSize, int partition = 128);
 

From 7d168ffe029140c0537c391f80060f173be8ba90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5vard=20Haug?= <haavardhaug@gmail.com>
Date: Mon, 11 Aug 2025 12:07:46 +0200
Subject: [PATCH 3/3] Merge pull request #3983 from
 mitresthen:warn-warpaffine-overlap

Add assert to ensure using non-overlapping memory regions #3983

This PR addresses https://github.com/opencv/opencv/issues/27429,
where the user did not realize that cv::cuda::warpAffine requires non-overlapping src and dst memory regions.
The code now asserts that src and dst do not share the same data pointer (src.data != dst.data), which rejects in-place calls. There is also a test for this functionality.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
---
 .../include/opencv2/cudawarping.hpp           |  2 ++
 modules/cudawarping/src/warp.cpp              |  2 ++
 modules/cudawarping/test/test_warp_affine.cpp | 30 +++++++++++++++++++
 3 files changed, 34 insertions(+)

diff --git a/modules/cudawarping/include/opencv2/cudawarping.hpp b/modules/cudawarping/include/opencv2/cudawarping.hpp
index b9ca957358e..6c920fa8c24 100644
--- a/modules/cudawarping/include/opencv2/cudawarping.hpp
+++ b/modules/cudawarping/include/opencv2/cudawarping.hpp
@@ -118,6 +118,7 @@ CV_EXPORTS_W void resize(InputArray src, OutputArray dst, Size dsize, double fx=
 @param src Source image. CV_8U , CV_16U , CV_32S , or CV_32F depth and 1, 3, or 4 channels are
 supported.
 @param dst Destination image with the same type as src . The size is dsize .
+    **In-place operation (src == dst) is not supported and will result in an error.**
 @param M *2x3* Mat or UMat transformation matrix.
 @param dsize Size of the destination image.
 @param flags Combination of interpolation methods (see resize) and the optional flag
@@ -127,6 +128,7 @@ INTER_NEAREST , INTER_LINEAR , and INTER_CUBIC interpolation methods are support
 @param borderValue
 @param stream Stream for the asynchronous version.
 
+@note In-place operation is not supported. If src and dst refer to the same data, an exception is thrown.
 @sa warpAffine
  */
 CV_EXPORTS void warpAffine(InputArray src, OutputArray dst, InputArray M, Size dsize, int flags = INTER_LINEAR,
diff --git a/modules/cudawarping/src/warp.cpp b/modules/cudawarping/src/warp.cpp
index 8690f54085d..839b786ce45 100644
--- a/modules/cudawarping/src/warp.cpp
+++ b/modules/cudawarping/src/warp.cpp
@@ -208,6 +208,8 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size
     _dst.create(dsize, src.type());
     GpuMat dst = _dst.getGpuMat();
 
+    CV_Assert( src.data != dst.data && "In-place operation not supported for cv::cuda::warpAffine" );
+
     Size wholeSize;
     Point ofs;
     src.locateROI(wholeSize, ofs);
diff --git a/modules/cudawarping/test/test_warp_affine.cpp b/modules/cudawarping/test/test_warp_affine.cpp
index d26a5fdeb7c..ebbba914ced 100644
--- a/modules/cudawarping/test/test_warp_affine.cpp
+++ b/modules/cudawarping/test/test_warp_affine.cpp
@@ -222,6 +222,36 @@ CUDA_TEST_P(WarpAffine, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-1 : 1.0);
 }
 
+// warpAffine must throw when dst is a view of src's data and must succeed when dst is a separate GpuMat.
+CUDA_TEST_P(WarpAffine, OverlapDetection)
+{
+    const cv::Mat hostSrc = randomMat(size, type);
+    ASSERT_FALSE(hostSrc.empty());
+    cv::cuda::GpuMat deviceSrc;
+    deviceSrc.upload(hostSrc);
+
+    const cv::Mat identity = cv::Mat::eye(2, 3, CV_64FC1);
+    const int warpFlags = inverse ? (interpolation | cv::WARP_INVERSE_MAP) : interpolation;
+    const cv::Rect fullFrame(0, 0, size.width, size.height);
+
+    // Case 1: dst is a full-frame view of src anchored at (0, 0); the call is expected to throw.
+    {
+        cv::cuda::GpuMat aliasedDst(deviceSrc, fullFrame);
+        EXPECT_THROW(
+            cv::cuda::warpAffine(deviceSrc, aliasedDst, identity, size, warpFlags, borderType, cv::Scalar::all(0)),
+            cv::Exception);
+    }
+
+    // Case 2: dst is constructed on its own (pointers checked to differ); the same call must not throw.
+    {
+        cv::cuda::GpuMat separateDst(size, deviceSrc.type());
+        ASSERT_NE(deviceSrc.data, separateDst.data);
+        EXPECT_NO_THROW({
+            cv::cuda::warpAffine(deviceSrc, separateDst, identity, size, warpFlags, borderType, cv::Scalar::all(0));
+        });
+    }
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_Warping, WarpAffine, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,