@@ -229,25 +229,25 @@ class TrackerInvoker : public cv::ParallelLoopBody
 v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
 v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
 v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
-v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
+v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
 
 v_int32x4 t0, t1;
 v_int16x8 t00, t01, t10, t11;
 v_zip(v00, v01, t00, t01);
 v_zip(v10, v11, t10, t11);
 
 // subpixel interpolation
-t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
-t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
-t0 = t0 >> (W_BITS - 5);
-t1 = t1 >> (W_BITS - 5);
+t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
+t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
+t0 = v_shr(t0, W_BITS - 5);
+t1 = v_shr(t1, W_BITS - 5);
 
 // diff = J - I
-diff0 = v_pack(t0, t1) - vI;
+diff0 = v_sub(v_pack(t0, t1), vI);
 // I*gain.x + gain.x
 v_mul_expand(vI, vgain_value, t0, t1);
-diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value;
-diff0 = diff0 & vmask;
+diff0 = v_add(v_add(diff0, v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value);
+diff0 = v_and(diff0, vmask);
 v_zip(diff0, diff0, diff2, diff1);
 
 v_int32x4 diff0_0;
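
For readers who find the packed arithmetic in this hunk hard to follow, here is a hedged scalar sketch of what the block computes per pixel: a rounded fixed-point bilinear sample of J at the subpixel position, the difference against I, and the gain/offset (illumination) correction, gated by the window mask. The helper name and the parameters `iw00..iw11`, `W_BITS`, `igain`, `bitShift` and `iconst` are illustrative assumptions modelled on the surrounding pyramidal-LK code, not identifiers taken from this diff.

```cpp
#include <cstdint>

// Hedged scalar sketch (not the shipped implementation) of the vectorized block above.
static inline int16_t gainCompensatedDiff(const int16_t* Jrow0, const int16_t* Jrow1,
                                          const int16_t* Irow, const uint8_t* mask,
                                          int x, int cn,
                                          int iw00, int iw01, int iw10, int iw11,
                                          int W_BITS, int igain, int bitShift, int iconst)
{
    // Rounded fixed-point bilinear interpolation of J, keeping 5 extra fractional bits
    // (mirrors v_dotprod(..., vdelta) followed by the shift by W_BITS - 5).
    int ival = (Jrow0[x] * iw00 + Jrow0[x + cn] * iw01 +
                Jrow1[x] * iw10 + Jrow1[x + cn] * iw11 +
                (1 << (W_BITS - 6))) >> (W_BITS - 5);

    int diff = ival - Irow[x];                            // diff = J - I
    diff += ((Irow[x] * igain) >> bitShift) + iconst;     // gain/offset illumination model
    return mask[x] ? (int16_t)diff : (int16_t)0;          // masked-out pixels contribute nothing
}
```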
@@ -259,16 +259,16 @@ class TrackerInvoker : public cv::ParallelLoopBody
 v_zip(vIxy_0, vIxy_1, v10, v11);
 v_zip(diff2, diff1, v00, v01);
 
-vqb0 += v_cvt_f32(v_dotprod(v00, v10));
-vqb1 += v_cvt_f32(v_dotprod(v01, v11));
+vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10)));
+vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11)));
 
 v_int32x4 vI0, vI1;
 v_expand(vI, vI0, vI1);
-vqb2 += v_cvt_f32(diff0_0 * vI0);
-vqb2 += v_cvt_f32(diff0_1 * vI1);
+vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0)));
+vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1)));
 
-vqb3 += v_cvt_f32(diff0_0);
-vqb3 += v_cvt_f32(diff0_1);
+vqb3 = v_add(vqb3, v_cvt_f32(diff0_0));
+vqb3 = v_add(vqb3, v_cvt_f32(diff0_1));
 
 if (j == 0)
 {
@@ -285,17 +285,17 @@ class TrackerInvoker : public cv::ParallelLoopBody
 vAxx = v_muladd(fx, fx, vAxx);
 
 // sumIx and sumIy
-vsumIx += fx;
-vsumIy += fy;
+vsumIx = v_add(vsumIx, fx);
+vsumIy = v_add(vsumIy, fy);
 
-vsumW1 += vI_ps * fx;
-vsumW2 += vI_ps * fy;
+vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx));
+vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy));
 
 // sumI
-vsumI += vI_ps;
+vsumI = v_add(vsumI, vI_ps);
 
 // sumDI
-vsumDI += vI_ps * vI_ps;
+vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps));
 
 v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
 v_expand(v01, t1, t0);
@@ -309,17 +309,17 @@ class TrackerInvoker : public cv::ParallelLoopBody
 vAxx = v_muladd(fx, fx, vAxx);
 
 // sumIx and sumIy
-vsumIx += fx;
-vsumIy += fy;
+vsumIx = v_add(vsumIx, fx);
+vsumIy = v_add(vsumIy, fy);
 
-vsumW1 += vI_ps * fx;
-vsumW2 += vI_ps * fy;
+vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx));
+vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy));
 
 // sumI
-vsumI += vI_ps;
+vsumI = v_add(vsumI, vI_ps);
 
 // sumDI
-vsumDI += vI_ps * vI_ps;
+vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps));
 }
 }
 #else
@@ -388,7 +388,7 @@ class TrackerInvoker : public cv::ParallelLoopBody
 
 #if CV_SIMD128
 float CV_DECL_ALIGNED(16) bbuf[4];
-v_store_aligned(bbuf, vqb0 + vqb1);
+v_store_aligned(bbuf, v_add(vqb0, vqb1));
 b1 = bbuf[0] + bbuf[2];
 b2 = bbuf[1] + bbuf[3];
 b3 = v_reduce_sum(vqb2);
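
A note on the reduction that this hunk leaves unchanged: after the earlier v_zip/v_dotprod steps, the even lanes of vqb0 and vqb1 accumulate diff*Ix products and the odd lanes accumulate diff*Iy, which is why bbuf[0] + bbuf[2] yields b1 and bbuf[1] + bbuf[3] yields b2. The plain-C++ model below mirrors that lane bookkeeping; it is a sketch of the idea, with Ix, Iy and diff standing in for the gradient and residual values, not a reproduction of the actual SIMD lane order.

```cpp
#include <cstddef>

// Scalar model of the four accumulator lanes of v_add(vqb0, vqb1):
// products with the x-gradient land in even lanes, products with the y-gradient in odd lanes.
static void reduce_b1_b2(const float* Ix, const float* Iy, const float* diff,
                         std::size_t n, float& b1, float& b2)
{
    float lanes[4] = {0.f, 0.f, 0.f, 0.f};          // models bbuf[] after v_store_aligned
    for (std::size_t i = 0; i < n; ++i)
    {
        lanes[2 * (i % 2)]     += diff[i] * Ix[i];  // even lanes: sum of diff * Ix
        lanes[2 * (i % 2) + 1] += diff[i] * Iy[i];  // odd lanes:  sum of diff * Iy
    }
    b1 = lanes[0] + lanes[2];                       // matches b1 = bbuf[0] + bbuf[2]
    b2 = lanes[1] + lanes[3];                       // matches b2 = bbuf[1] + bbuf[3]
}
```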
@@ -696,19 +696,19 @@ class TrackerInvoker : public cv::ParallelLoopBody
 v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
 v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
 v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
-v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
+v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
 
 v_int32x4 t0, t1;
 v_int16x8 t00, t01, t10, t11;
 v_zip(v00, v01, t00, t01);
 v_zip(v10, v11, t10, t11);
 
-t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
-t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
-t0 = t0 >> (W_BITS - 5);
-t1 = t1 >> (W_BITS - 5);
-diff0 = v_pack(t0, t1) - diff0;
-diff0 = diff0 & vmask;
+t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
+t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
+t0 = v_shr(t0, W_BITS - 5);
+t1 = v_shr(t1, W_BITS - 5);
+diff0 = v_sub(v_pack(t0, t1), diff0);
+diff0 = v_and(diff0, vmask);
 
 v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
 
@@ -717,8 +717,8 @@ class TrackerInvoker : public cv::ParallelLoopBody
 v_zip(vIxy_0, vIxy_1, v10, v11);
 v_zip(diff2, diff1, v00, v01);
 
-vqb0 += v_cvt_f32(v_dotprod(v00, v10));
-vqb1 += v_cvt_f32(v_dotprod(v01, v11));
+vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10)));
+vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11)));
 }
 #else
 for ( ; x < winSize.width*cn; x++, dIptr += 2)
@@ -737,7 +737,7 @@ class TrackerInvoker : public cv::ParallelLoopBody
 
 #if CV_SIMD128
 float CV_DECL_ALIGNED(16) bbuf[4];
-v_store_aligned(bbuf, vqb0 + vqb1);
+v_store_aligned(bbuf, v_add(vqb0, vqb1));
 b1 = bbuf[0] + bbuf[2];
 b2 = bbuf[1] + bbuf[3];
 #endif
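
Taken together, the hunks above apply one mechanical change: expressions written with the overloaded operators of OpenCV's universal intrinsics (+, -, *, >>, &, +=) are rewritten as the equivalent function-style calls (v_add, v_sub, v_mul, v_shr, v_and). The self-contained sketch below illustrates the pattern on a toy accumulation, assuming an OpenCV build whose intrin.hpp provides the function-style API; it is an illustration of the style, not code from the tracker.

```cpp
#include <opencv2/core/hal/intrin.hpp>

int main()
{
#if CV_SIMD128
    using namespace cv;
    v_float32x4 acc = v_setzero_f32();
    v_float32x4 a(1.f, 2.f, 3.f, 4.f);
    v_float32x4 b(5.f, 6.f, 7.f, 8.f);

    // old operator style:  acc += a * b;
    // function style used throughout this diff:
    acc = v_add(acc, v_mul(a, b));

    float CV_DECL_ALIGNED(16) buf[4];
    v_store_aligned(buf, acc);   // buf = {5, 12, 21, 32}
#endif
    return 0;
}
```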