Skip to content

Commit 7c8f101

Browse files
committed
refactor: replace manual timing with profiling macros in matmul operations
1 parent b567413 commit 7c8f101

File tree

1 file changed: 28 additions (+28) and 45 deletions (-45).

ggml/src/ggml-hexagon/htp/matmul-ops.c

Lines changed: 28 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -1092,8 +1092,7 @@ static void matmul(struct htp_matmul_type * mt,
10921092
uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith;
10931093
uint8_t * restrict src1_data = src1_spad->data;
10941094

1095-
volatile uint64_t t1, t2;
1096-
t1 = HAP_perf_get_qtimer_count();
1095+
PROFILER_START(matmul);
10971096

10981097
const uint8_t * restrict src0_row = (const uint8_t *) src0->data;
10991098

@@ -1144,12 +1143,9 @@ static void matmul(struct htp_matmul_type * mt,
11441143
}
11451144
}
11461145

1147-
t2 = HAP_perf_get_qtimer_count();
1148-
1149-
FARF(HIGH, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth,
1150-
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1],
1151-
src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
1152-
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
1146+
PROFILER_END(matmul, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith,
1147+
nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0],
1148+
src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
11531149
}
11541150

11551151
// q8x4x2 src1 tensor is already in VTCM spad
@@ -1190,8 +1186,7 @@ static void matvec(struct htp_matmul_type * mt,
11901186
uint8_t * spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith;
11911187
uint8_t * src1_data = src1_spad->data;
11921188

1193-
uint64_t t1, t2;
1194-
t1 = HAP_perf_get_qtimer_count();
1189+
PROFILER_START(matvec);
11951190

11961191
float * tmp = (float *) spad_dst;
11971192

@@ -1236,12 +1231,9 @@ static void matvec(struct htp_matmul_type * mt,
12361231

12371232
hvx_copy_fp32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row);
12381233

1239-
t2 = HAP_perf_get_qtimer_count();
1240-
1241-
FARF(HIGH, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth,
1242-
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1],
1243-
src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
1244-
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
1234+
PROFILER_END(matvec, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith,
1235+
nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0],
1236+
src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
12451237
}
12461238

12471239
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ids->ne[0] * ids->ne[1] + (i1)]
@@ -1267,8 +1259,7 @@ static void matmul_id(struct htp_matmul_type * mt,
12671259
dma_queue * dma_queue) {
12681260
htp_matmul_preamble;
12691261

1270-
uint64_t t1, t2;
1271-
t1 = HAP_perf_get_qtimer_count();
1262+
PROFILER_START(matmul_id);
12721263

12731264
const uint32_t src0_nrows = ne01; // src0 rows per expert
12741265
const uint32_t src1_nrows = ne11;
@@ -1373,12 +1364,11 @@ static void matmul_id(struct htp_matmul_type * mt,
13731364
}
13741365
}
13751366

1376-
t2 = HAP_perf_get_qtimer_count();
1377-
1378-
FARF(HIGH, "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type,
1379-
ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0],
1380-
src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], dst->ne[0], dst->ne[1],
1381-
dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
1367+
PROFILER_END(matmul_id,
1368+
"matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n",
1369+
mt->type, ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row,
1370+
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3],
1371+
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
13821372
}
13831373

13841374
// q8x4 src1 tensor is already in VTCM spad
@@ -1397,8 +1387,7 @@ static void matvec_id(struct htp_matmul_type * mt,
13971387
dma_queue * dma_queue) {
13981388
htp_matmul_preamble;
13991389

1400-
uint64_t t1, t2;
1401-
t1 = HAP_perf_get_qtimer_count();
1390+
PROFILER_START(matvec_id);
14021391

14031392
const uint32_t src0_nrows = ne01; // src0 rows per expert
14041393

@@ -1473,12 +1462,11 @@ static void matvec_id(struct htp_matmul_type * mt,
14731462
}
14741463
}
14751464

1476-
t2 = HAP_perf_get_qtimer_count();
1477-
1478-
FARF(HIGH, "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type,
1479-
ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0],
1480-
src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0],
1481-
dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
1465+
PROFILER_END(matvec_id,
1466+
"matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n",
1467+
mt->type, ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row,
1468+
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3],
1469+
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
14821470
}
14831471

14841472
// *** matmul in fp16
@@ -1495,8 +1483,7 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
14951483
dma_queue * dma_queue) {
14961484
htp_matmul_preamble;
14971485

1498-
uint64_t t1, t2;
1499-
t1 = HAP_perf_get_qtimer_count();
1486+
PROFILER_START(matmul_f16_f32);
15001487

15011488
const size_t src0_row_size = sizeof(__fp16) * ne00;
15021489
const size_t src1_row_size = sizeof(float) * ne10;
@@ -1575,12 +1562,10 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
15751562
}
15761563
}
15771564

1578-
t2 = HAP_perf_get_qtimer_count();
1579-
1580-
FARF(HIGH, "matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
1581-
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end, src1->ne[0],
1582-
src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
1583-
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
1565+
PROFILER_END(matmul_f16_f32,
1566+
"matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
1567+
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end,
1568+
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
15841569
}
15851570

15861571
// *** dynamic quant
@@ -1662,7 +1647,7 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src,
16621647
uint32_t nth,
16631648
uint32_t ith,
16641649
uint32_t nrows_per_thread) {
1665-
uint64_t t1 = HAP_perf_get_qtimer_count();
1650+
PROFILER_START(quantize_fp32_q8x4);
16661651

16671652
const uint32_t ne0 = src->ne[0];
16681653
const uint32_t ne1 = src->ne[1];
@@ -1694,10 +1679,8 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src,
16941679
src_data += src_row_size;
16951680
}
16961681

1697-
uint64_t t2 = HAP_perf_get_qtimer_count();
1698-
1699-
FARF(HIGH, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first,
1700-
ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
1682+
PROFILER_END(quantize_fp32_q8x4, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith,
1683+
nth, nrows, ir_first, ir_last, src_row_size, dst_row_size);
17011684
}
17021685

17031686
static void htp_quantize_fp32_q8x4x2(unsigned int n, unsigned int i, void * data) {

0 commit comments

Comments
 (0)