@@ -52,38 +52,41 @@ using namespace cv::cuda::device;
 
 namespace hist
 {
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
+    template<bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();
 
-        if (y < rows)
-        {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
+        if (y < rows) {
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
+            }
 
-                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
                 Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
                 Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    Emulation::smem::atomicAdd(&shist[data], 1);
-                }
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
             }
         }
@@ -94,61 +97,70 @@ namespace hist
         ::atomicAdd(hist + tid, histVal);
     }
 
-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
-
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
+        if (offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
 
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
+    template<bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();
 
         if (y < rows)
         {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
-            const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
+            const uchar* rowPtr = &src[y * srcStep];
+            const uchar* maskRowPtr = &mask[y * maskStep];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[rowPtr[x]], 1);
+                }
+            }
 
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
-                unsigned int m = maskRowPtr[x];
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * srcStep] : &src[alignedOffset + y * srcStep]);
+            const unsigned int* maskRowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &mask[y * maskStep] : &mask[alignedOffset + y * maskStep]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                const unsigned int m = maskRowPtrIntAligned[x];
 
-                if ((m >> 0) & 0xFFU)
-                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                if ((m >> 0) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
 
-                if ((m >> 8) & 0xFFU)
-                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+                if ((m >> 8) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
 
-                if ((m >> 16) & 0xFFU)
+                if ((m >> 16) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
 
-                if ((m >> 24) & 0xFFU)
+                if ((m >> 24) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    unsigned int m = ((const uchar*)maskRowPtr)[x];
-
-                    if (m)
-                        Emulation::smem::atomicAdd(&shist[data], 1);
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
                 }
             }
         }
@@ -160,12 +172,15 @@ namespace hist
         ::atomicAdd(hist + tid, histVal);
     }
 
-    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
 
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
+        if (offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
@@ -186,42 +201,44 @@ namespace hist
         }
     }
 
-    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
-                               int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
+    template<bool fourByteAligned>
+    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols, int* hist, const int binCount, const int binSize,
+                               const int lowerLevel, const int upperLevel, const int offsetX)
     {
         extern __shared__ int shist[];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         if (tid < binCount)
             shist[tid] = 0;
-
         __syncthreads();
 
         if (y < rows)
         {
-            const uchar* rowPtr = src + y * step;
-            const uint* rowPtr4 = (uint*) rowPtr;
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                const uint data = rowPtr4[x];
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
+            }
 
-                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
-                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
                 histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
                 histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    const uchar data = rowPtr[x];
-                    histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
-                }
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
             }
         }
 
@@ -236,7 +253,7 @@ namespace hist
         }
     }
 
-    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
@@ -245,7 +262,10 @@ namespace hist
 
         const size_t smem_size = binCount * sizeof(int);
 
-        histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
+        if (offsetX)
+            histEven8u<false><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
+        else
+            histEven8u<true><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
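
A note on the new parameter: offsetX is the byte misalignment of the first pixel of a row within a 4-byte word, and offsetX == 0 dispatches the fourByteAligned specialization. The sketch below is a hypothetical host-side caller (not part of this patch) showing one way that value could be derived from the data pointer; the wrapper name callHistogram256 and the pointer-based derivation are assumptions for illustration only.

// Hypothetical caller sketch, assuming offsetX = (row start address) mod 4.
#include <cstdint>
#include <cuda_runtime.h>
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda_types.hpp>

namespace hist { void histogram256(cv::cuda::PtrStepSzb src, int* hist, int offsetX, cudaStream_t stream); }

static void callHistogram256(const cv::cuda::GpuMat& src, int* d_hist, cudaStream_t stream)
{
    CV_Assert(src.type() == CV_8UC1);
    // Byte misalignment of the first pixel; non-zero for ROIs whose left edge
    // does not start on a 4-byte boundary.
    const int offsetX = static_cast<int>(reinterpret_cast<std::uintptr_t>(src.data) & 3);
    hist::histogram256(src, d_hist, offsetX, stream);
}

This relies on every row sharing the same misalignment, i.e. the row pitch being a multiple of 4 bytes, which is normally the case for GpuMat allocations since their step comes from cudaMallocPitch.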