@@ -111,18 +111,17 @@ static void binary_job_f32_per_thread(struct htp_ops_context * octx,
111111
112112 uint8_t * restrict spad_data_th = spad_data + (ith * src0_row_size );
113113
114- const uint32_t nr0 = ne00 / ne10 ;
115-
116114 const uint8_t * restrict src0_ptr = (const uint8_t * ) src0 -> data + (src0_start_row * src0_row_size );
117115 uint8_t * restrict dst_ptr = (uint8_t * ) dst -> data + (src0_start_row * dst_row_size );
118116
119117 const uint8_t * restrict data_src1 = (const uint8_t * ) src1 -> data ;
120118
121- const uint32_t ne0201 = ne02 * ne01 ;
119+ const uint32_t ne02_ne01 = ne02 * ne01 ;
120+
122121 for (uint32_t ir = src0_start_row ; ir < src0_end_row ; ir ++ ) {
123122 const uint32_t i03 = fastdiv (ir , & octx -> src0_div21 );
124- const uint32_t i02 = fastdiv (ir - i03 * ne0201 , & octx -> src0_div1 );
125- const uint32_t i01 = (ir - i03 * ne0201 - i02 * ne01 );
123+ const uint32_t i02 = fastdiv (ir - i03 * ne02_ne01 , & octx -> src0_div1 );
124+ const uint32_t i01 = (ir - i03 * ne02_ne01 - i02 * ne01 );
126125
127126 const uint32_t i13 = fastmodulo (i03 , ne13 , & octx -> src1_div3 );
128127 const uint32_t i12 = fastmodulo (i02 , ne12 , & octx -> src1_div2 );
@@ -137,6 +136,7 @@ static void binary_job_f32_per_thread(struct htp_ops_context * octx,
137136 }
138137 }
139138
139+ const uint32_t nr0 = ne00 / ne10 ;
140140 if (nr0 > 1 ) {
141141 if ((1 == is_aligned ) && (nr0 == ne00 )) {
142142 hvx_bcast_fp32_a (spad_data_th , * (float * ) src1_ptr , nr0 );
@@ -172,7 +172,6 @@ static void binary_add_id_job_f32_per_thread(struct htp_ops_context * octx,
172172 const size_t src1_row_size = nb11 ;
173173 const size_t dst_row_size = nb1 ;
174174
175- const uint32_t ne02_ne01 = ne02 * ne01 ;
176175 const uint32_t src0_nrows = ne01 * ne02 * ne03 ; // src0 rows
177176
178177 const uint32_t src0_start_row = src0_nrows_per_thread * ith ;
@@ -195,10 +194,11 @@ static void binary_add_id_job_f32_per_thread(struct htp_ops_context * octx,
195194 const uint8_t * restrict data_src1 = (const uint8_t * ) src1 -> data ;
196195 uint8_t * restrict data_dst = (uint8_t * ) dst -> data ;
197196
197+ const uint32_t ne02_ne01 = ne02 * ne01 ;
198198 for (uint32_t ir = src0_start_row ; ir < src0_end_row ; ir ++ ) {
199199 // src0 indices
200- const uint32_t i03 = ir / ne02_ne01 ;
201- const uint32_t i02 = (ir - i03 * ne02_ne01 ) / ne01 ;
200+ const uint32_t i03 = fastdiv ( ir , & octx -> src0_div21 ) ;
201+ const uint32_t i02 = fastdiv (ir - i03 * ne02_ne01 , & octx -> src0_div1 ) ;
202202 const uint32_t i01 = (ir - i03 * ne02_ne01 - i02 * ne01 );
203203
204204 // src1 indices
0 commit comments