Skip to content

Commit aee43d8

Browse files
hexagon: use fastdiv in ADD_ID
1 parent: 21c1cb8 · commit: aee43d8

File tree

1 file changed

+8
-8
lines changed

1 file changed

+8
-8
lines changed

ggml/src/ggml-hexagon/htp/binary-ops.c

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -111,18 +111,17 @@ static void binary_job_f32_per_thread(struct htp_ops_context * octx,
111111

112112
uint8_t * restrict spad_data_th = spad_data + (ith * src0_row_size);
113113

114-
const uint32_t nr0 = ne00 / ne10;
115-
116114
const uint8_t * restrict src0_ptr = (const uint8_t *) src0->data + (src0_start_row * src0_row_size);
117115
uint8_t * restrict dst_ptr = (uint8_t *) dst->data + (src0_start_row * dst_row_size);
118116

119117
const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
120118

121-
const uint32_t ne0201 = ne02 * ne01;
119+
const uint32_t ne02_ne01 = ne02 * ne01;
120+
122121
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
123122
const uint32_t i03 = fastdiv(ir, &octx->src0_div21);
124-
const uint32_t i02 = fastdiv(ir - i03 * ne0201, &octx->src0_div1);
125-
const uint32_t i01 = (ir - i03 * ne0201 - i02 * ne01);
123+
const uint32_t i02 = fastdiv(ir - i03 * ne02_ne01, &octx->src0_div1);
124+
const uint32_t i01 = (ir - i03 * ne02_ne01 - i02 * ne01);
126125

127126
const uint32_t i13 = fastmodulo(i03, ne13, &octx->src1_div3);
128127
const uint32_t i12 = fastmodulo(i02, ne12, &octx->src1_div2);
@@ -137,6 +136,7 @@ static void binary_job_f32_per_thread(struct htp_ops_context * octx,
137136
}
138137
}
139138

139+
const uint32_t nr0 = ne00 / ne10;
140140
if (nr0 > 1) {
141141
if ((1 == is_aligned) && (nr0 == ne00)) {
142142
hvx_bcast_fp32_a(spad_data_th, *(float *) src1_ptr, nr0);
@@ -172,7 +172,6 @@ static void binary_add_id_job_f32_per_thread(struct htp_ops_context * octx,
172172
const size_t src1_row_size = nb11;
173173
const size_t dst_row_size = nb1;
174174

175-
const uint32_t ne02_ne01 = ne02 * ne01;
176175
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
177176

178177
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
@@ -195,10 +194,11 @@ static void binary_add_id_job_f32_per_thread(struct htp_ops_context * octx,
195194
const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
196195
uint8_t * restrict data_dst = (uint8_t *) dst->data;
197196

197+
const uint32_t ne02_ne01 = ne02 * ne01;
198198
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
199199
// src0 indices
200-
const uint32_t i03 = ir / ne02_ne01;
201-
const uint32_t i02 = (ir - i03 * ne02_ne01) / ne01;
200+
const uint32_t i03 = fastdiv(ir, &octx->src0_div21);
201+
const uint32_t i02 = fastdiv(ir - i03 * ne02_ne01, &octx->src0_div1);
202202
const uint32_t i01 = (ir - i03 * ne02_ne01 - i02 * ne01);
203203

204204
// src1 indices

0 commit comments

Comments
 (0)