@@ -1092,8 +1092,7 @@ static void matmul(struct htp_matmul_type * mt,
10921092 uint8_t * restrict spad_src0 = src0_spad -> data + src0_spad -> size_per_thread * ith ;
10931093 uint8_t * restrict src1_data = src1_spad -> data ;
10941094
1095- volatile uint64_t t1 , t2 ;
1096- t1 = HAP_perf_get_qtimer_count ();
1095+ PROFILER_START (matmul );
10971096
10981097 const uint8_t * restrict src0_row = (const uint8_t * ) src0 -> data ;
10991098
@@ -1144,12 +1143,9 @@ static void matmul(struct htp_matmul_type * mt,
11441143 }
11451144 }
11461145
1147- t2 = HAP_perf_get_qtimer_count ();
1148-
1149- FARF (HIGH , "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n" , mt -> type , ith , nth ,
1150- src0 -> ne [0 ], src0 -> ne [1 ], src0 -> ne [2 ], src0 -> ne [3 ], src0_start_row , src0_end_row , src1 -> ne [0 ], src1 -> ne [1 ],
1151- src1 -> ne [2 ], src1 -> ne [3 ], dst -> ne [0 ], dst -> ne [1 ], dst -> ne [2 ], dst -> ne [3 ],
1152- (unsigned ) HAP_perf_qtimer_count_to_us (t2 - t1 ));
1146+ PROFILER_END (matmul , "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n" , mt -> type , ith ,
1147+ nth , src0 -> ne [0 ], src0 -> ne [1 ], src0 -> ne [2 ], src0 -> ne [3 ], src0_start_row , src0_end_row , src1 -> ne [0 ],
1148+ src1 -> ne [1 ], src1 -> ne [2 ], src1 -> ne [3 ], dst -> ne [0 ], dst -> ne [1 ], dst -> ne [2 ], dst -> ne [3 ]);
11531149}
11541150
11551151// q8x4x2 src1 tensor is already in VTCM spad
@@ -1190,8 +1186,7 @@ static void matvec(struct htp_matmul_type * mt,
11901186 uint8_t * spad_src0 = src0_spad -> data + src0_spad -> size_per_thread * ith ;
11911187 uint8_t * src1_data = src1_spad -> data ;
11921188
1193- uint64_t t1 , t2 ;
1194- t1 = HAP_perf_get_qtimer_count ();
1189+ PROFILER_START (matvec );
11951190
11961191 float * tmp = (float * ) spad_dst ;
11971192
@@ -1236,12 +1231,9 @@ static void matvec(struct htp_matmul_type * mt,
12361231
12371232 hvx_copy_fp32_ua ((uint8_t * ) & dst_col [src0_start_row ], (uint8_t * ) tmp , src0_end_row - src0_start_row );
12381233
1239- t2 = HAP_perf_get_qtimer_count ();
1240-
1241- FARF (HIGH , "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n" , mt -> type , ith , nth ,
1242- src0 -> ne [0 ], src0 -> ne [1 ], src0 -> ne [2 ], src0 -> ne [3 ], src0_start_row , src0_end_row , src1 -> ne [0 ], src1 -> ne [1 ],
1243- src1 -> ne [2 ], src1 -> ne [3 ], dst -> ne [0 ], dst -> ne [1 ], dst -> ne [2 ], dst -> ne [3 ],
1244- (unsigned ) HAP_perf_qtimer_count_to_us (t2 - t1 ));
1234+ PROFILER_END (matvec , "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n" , mt -> type , ith ,
1235+ nth , src0 -> ne [0 ], src0 -> ne [1 ], src0 -> ne [2 ], src0 -> ne [3 ], src0_start_row , src0_end_row , src1 -> ne [0 ],
1236+ src1 -> ne [1 ], src1 -> ne [2 ], src1 -> ne [3 ], dst -> ne [0 ], dst -> ne [1 ], dst -> ne [2 ], dst -> ne [3 ]);
12451237}
12461238
12471239#define MMID_MATRIX_ROW (row_id , i1 ) matrix_rows[(row_id) * ids->ne[0] * ids->ne[1] + (i1)]
@@ -1267,8 +1259,7 @@ static void matmul_id(struct htp_matmul_type * mt,
12671259 dma_queue * dma_queue ) {
12681260 htp_matmul_preamble ;
12691261
1270- uint64_t t1 , t2 ;
1271- t1 = HAP_perf_get_qtimer_count ();
1262+ PROFILER_START (matmul_id );
12721263
12731264 const uint32_t src0_nrows = ne01 ; // src0 rows per expert
12741265 const uint32_t src1_nrows = ne11 ;
@@ -1373,12 +1364,11 @@ static void matmul_id(struct htp_matmul_type * mt,
13731364 }
13741365 }
13751366
1376- t2 = HAP_perf_get_qtimer_count ();
1377-
1378- FARF (HIGH , "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n" , mt -> type ,
1379- ith , nth , src0 -> ne [0 ], src0 -> ne [1 ], src0 -> ne [2 ], src0 -> ne [3 ], src0_start_row , src0_end_row , src1 -> ne [0 ],
1380- src1 -> ne [1 ], src1 -> ne [2 ], src1 -> ne [3 ], ids -> ne [0 ], ids -> ne [1 ], ids -> ne [2 ], ids -> ne [3 ], dst -> ne [0 ], dst -> ne [1 ],
1381- dst -> ne [2 ], dst -> ne [3 ], (unsigned ) HAP_perf_qtimer_count_to_us (t2 - t1 ));
1367+ PROFILER_END (matmul_id ,
1368+ "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n" ,
1369+ mt -> type , ith , nth , src0 -> ne [0 ], src0 -> ne [1 ], src0 -> ne [2 ], src0 -> ne [3 ], src0_start_row , src0_end_row ,
1370+ src1 -> ne [0 ], src1 -> ne [1 ], src1 -> ne [2 ], src1 -> ne [3 ], ids -> ne [0 ], ids -> ne [1 ], ids -> ne [2 ], ids -> ne [3 ],
1371+ dst -> ne [0 ], dst -> ne [1 ], dst -> ne [2 ], dst -> ne [3 ]);
13821372}
13831373
13841374// q8x4 src1 tensor is already in VTCM spad
@@ -1397,8 +1387,7 @@ static void matvec_id(struct htp_matmul_type * mt,
13971387 dma_queue * dma_queue ) {
13981388 htp_matmul_preamble ;
13991389
1400- uint64_t t1 , t2 ;
1401- t1 = HAP_perf_get_qtimer_count ();
1390+ PROFILER_START (matvec_id );
14021391
14031392 const uint32_t src0_nrows = ne01 ; // src0 rows per expert
14041393
@@ -1473,12 +1462,11 @@ static void matvec_id(struct htp_matmul_type * mt,
14731462 }
14741463 }
14751464
1476- t2 = HAP_perf_get_qtimer_count ();
1477-
1478- FARF (HIGH , "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n" , mt -> type ,
1479- ith , nth , src0 -> ne [0 ], src0 -> ne [1 ], src0 -> ne [2 ], src0 -> ne [3 ], src0_start_row , src0_end_row , src1 -> ne [0 ],
1480- src1 -> ne [1 ], src1 -> ne [2 ], src1 -> ne [3 ], src2 -> ne [0 ], src2 -> ne [1 ], src2 -> ne [2 ], src2 -> ne [3 ], dst -> ne [0 ],
1481- dst -> ne [1 ], dst -> ne [2 ], dst -> ne [3 ], (unsigned ) HAP_perf_qtimer_count_to_us (t2 - t1 ));
1465+ PROFILER_END (matvec_id ,
1466+ "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n" ,
1467+ mt -> type , ith , nth , src0 -> ne [0 ], src0 -> ne [1 ], src0 -> ne [2 ], src0 -> ne [3 ], src0_start_row , src0_end_row ,
1468+ src1 -> ne [0 ], src1 -> ne [1 ], src1 -> ne [2 ], src1 -> ne [3 ], src2 -> ne [0 ], src2 -> ne [1 ], src2 -> ne [2 ], src2 -> ne [3 ],
1469+ dst -> ne [0 ], dst -> ne [1 ], dst -> ne [2 ], dst -> ne [3 ]);
14821470}
14831471
14841472// *** matmul in fp16
@@ -1495,8 +1483,7 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
14951483 dma_queue * dma_queue ) {
14961484 htp_matmul_preamble ;
14971485
1498- uint64_t t1 , t2 ;
1499- t1 = HAP_perf_get_qtimer_count ();
1486+ PROFILER_START (matmul_f16_f32 );
15001487
15011488 const size_t src0_row_size = sizeof (__fp16 ) * ne00 ;
15021489 const size_t src1_row_size = sizeof (float ) * ne10 ;
@@ -1575,12 +1562,10 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
15751562 }
15761563 }
15771564
1578- t2 = HAP_perf_get_qtimer_count ();
1579-
1580- FARF (HIGH , "matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n" , ith , nth ,
1581- src0 -> ne [0 ], src0 -> ne [1 ], src0 -> ne [2 ], src0 -> ne [3 ], ir0_start , ir0_end , ir1_start , ir1_end , src1 -> ne [0 ],
1582- src1 -> ne [1 ], src1 -> ne [2 ], src1 -> ne [3 ], dst -> ne [0 ], dst -> ne [1 ], dst -> ne [2 ], dst -> ne [3 ],
1583- (unsigned ) HAP_perf_qtimer_count_to_us (t2 - t1 ));
1565+ PROFILER_END (matmul_f16_f32 ,
1566+ "matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n" , ith , nth ,
1567+ src0 -> ne [0 ], src0 -> ne [1 ], src0 -> ne [2 ], src0 -> ne [3 ], ir0_start , ir0_end , ir1_start , ir1_end ,
1568+ src1 -> ne [0 ], src1 -> ne [1 ], src1 -> ne [2 ], src1 -> ne [3 ], dst -> ne [0 ], dst -> ne [1 ], dst -> ne [2 ], dst -> ne [3 ]);
15841569}
15851570
15861571// *** dynamic quant
@@ -1662,7 +1647,7 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src,
16621647 uint32_t nth ,
16631648 uint32_t ith ,
16641649 uint32_t nrows_per_thread ) {
1665- uint64_t t1 = HAP_perf_get_qtimer_count ( );
1650+ PROFILER_START ( quantize_fp32_q8x4 );
16661651
16671652 const uint32_t ne0 = src -> ne [0 ];
16681653 const uint32_t ne1 = src -> ne [1 ];
@@ -1694,10 +1679,8 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src,
16941679 src_data += src_row_size ;
16951680 }
16961681
1697- uint64_t t2 = HAP_perf_get_qtimer_count ();
1698-
1699- FARF (HIGH , "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n" , ith , nth , nrows , ir_first ,
1700- ir_last , src_row_size , dst_row_size , (unsigned ) HAP_perf_qtimer_count_to_us (t2 - t1 ));
1682+ PROFILER_END (quantize_fp32_q8x4 , "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n" , ith ,
1683+ nth , nrows , ir_first , ir_last , src_row_size , dst_row_size );
17011684}
17021685
17031686static void htp_quantize_fp32_q8x4x2 (unsigned int n , unsigned int i , void * data ) {
0 commit comments