From d12247a60d59eb5dcf30b24db1085b2692bece1f Mon Sep 17 00:00:00 2001
From: chraac
Date: Mon, 24 Nov 2025 12:35:45 +0800
Subject: [PATCH 01/12] refactor: replace ggml_hexagon_mul_mat with
 template-based binary operation for improved flexibility

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 81 +++-----------------------
 1 file changed, 7 insertions(+), 74 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 72a82a89116..8b7641efa01 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2322,76 +2322,6 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer
         (unsigned int) d->size);
 }
 
-static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) {
-    const struct ggml_tensor * src0 = op->src[0];
-    const struct ggml_tensor * src1 = op->src[1];
-    const struct ggml_tensor * dst  = op;
-
-    uint64_t t1, t2;
-    t1 = ggml_time_us();
-
-    // Construct HTP message
-    htp_general_req req;
-    req.op    = HTP_OP_MUL_MAT;
-    req.flags = flags;
-
-    init_htp_tensor(&req.src0, src0);
-    init_htp_tensor(&req.src1, src1);
-    init_htp_tensor(&req.dst, dst);
-
-    // Use opmask to override flags
-    if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
-    }
-    if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
-    }
-
-    dspqueue_buffer bufs[3];
-
-    // First buffer Weights.
-    // The content is static, there is no need to do any cache management
-    dspqueue_buffers_init(bufs, src0, false, false);
-
-    // Second buffer Input Activations. This is a buffer that the CPU
-    // writes and the DSP reads, so we'll need to flush CPU caches and
-    // invalidate DSP ones. On platforms with I/O coherency support the
-    // framework will automatically skip cache operations where possible.
-    dspqueue_buffers_init(&bufs[1], src1, true, true);
-
-    // Third buffer Output Activations. We'll handle DSP
-    // cache maintenance in the response message but need to flush
-    // CPU caches to ensure any previously written dirty lines are
-    // written out before writes from the DSP start.
-    dspqueue_buffers_init(&bufs[2], dst, true, false);
-
-    auto * sess = get_session_from_tensor(src0);
-
-    if (opt_verbose) {
-        hex_print_op_info(op, sess, req.flags);
-        if (opt_verbose > 1) {
-            hex_dump_dspbuf(src0, &bufs[0]);
-            hex_dump_dspbuf(src1, &bufs[1]);
-            hex_dump_dspbuf(dst, &bufs[2]);
-        }
-    }
-
-    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
-        sess->enqueue(req, bufs, 3, opt_opsync);
-    }
-
-    t2 = ggml_time_us();
-
-    HEX_PROFILE(
-        "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
-        "call-usec %llu\n",
-        sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-        (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-        (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
-        (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-        (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
-}
-
 static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * src0 = op->src[0];
     const struct ggml_tensor * src1 = op->src[1];
@@ -2471,7 +2401,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
     (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
 }
 
-static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
+template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * node = op;
     const struct ggml_tensor * src0 = node->src[0];
     const struct ggml_tensor * src1 = node->src[1];
@@ -2495,6 +2425,9 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     }
 
     switch (node->op) {
+        case GGML_OP_MUL_MAT:
+            req.op = HTP_OP_MUL_MAT;
+            break;
         case GGML_OP_MUL:
            req.op = HTP_OP_MUL;
            break;
@@ -2518,7 +2451,7 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    dspqueue_buffers_init(bufs, src0, true, true);
+    dspqueue_buffers_init(bufs, src0, !_IsSrc0Constant, !_IsSrc0Constant);
 
     // Second buffer = Second Operand of Binary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
@@ -2938,7 +2871,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
 
         switch (node->op) {
             case GGML_OP_MUL_MAT:
-                ggml_hexagon_mul_mat(node, flags);
+                ggml_hexagon_binary<true>(node, flags);
                 prev_quant_op = node;
                 break;
             case GGML_OP_MUL_MAT_ID:
@@ -2948,7 +2881,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
             case GGML_OP_MUL:
             case GGML_OP_ADD:
             case GGML_OP_SUB:
-                ggml_hexagon_binary(node, flags);
+                ggml_hexagon_binary<false>(node, flags);
                 break;
             case GGML_OP_ADD_ID:
                 ggml_hexagon_add_id(node, flags);
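
A note on the pattern above: the only behavioral difference between the removed ggml_hexagon_mul_mat and the generic binary path is whether src0 is constant weights, and this patch turns that difference into a compile-time template parameter. A minimal, self-contained sketch of the idiom — illustrative names only, not code from this series:

    #include <cstdio>

    // Stand-in for the flush-CPU / invalidate-DSP booleans that the real
    // code passes to dspqueue_buffers_init().
    struct cache_ops {
        bool flush_host;
        bool invalidate_dsp;
    };

    // IsSrc0Constant is resolved at compile time, so each instantiation
    // hard-codes its cache policy with no runtime branch.
    template <bool IsSrc0Constant>
    void submit_binary_op(const char * name) {
        cache_ops src0_ops = { !IsSrc0Constant, !IsSrc0Constant };
        std::printf("%s: flush_host=%d invalidate_dsp=%d\n", name, src0_ops.flush_host, src0_ops.invalidate_dsp);
    }

    int main() {
        submit_binary_op<true>("mul_mat");  // constant weights: no cache maintenance on src0
        submit_binary_op<false>("add");     // mutable operand: flush + invalidate
        return 0;
    }
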
From 020f6bf3f2eecf7404554e8329e2a68180774a43 Mon Sep 17 00:00:00 2001
From: chraac
Date: Mon, 24 Nov 2025 12:57:42 +0800
Subject: [PATCH 02/12] refactor: replace ggml_hexagon_mul_mat_id with
 template-based binary operation for improved flexibility

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 90 ++------------------------
 1 file changed, 7 insertions(+), 83 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 8b7641efa01..843ff1b8a2b 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2322,85 +2322,6 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer
         (unsigned int) d->size);
 }
 
-static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flags) {
-    const struct ggml_tensor * src0 = op->src[0];
-    const struct ggml_tensor * src1 = op->src[1];
-    const struct ggml_tensor * src2 = op->src[2];
-    const struct ggml_tensor * dst  = op;
-
-    uint64_t t1, t2;
-    t1 = ggml_time_us();
-
-    // Construct HTP message
-    htp_general_req req;
-    req.op    = HTP_OP_MUL_MAT_ID;
-    req.flags = flags;
-
-    init_htp_tensor(&req.src0, src0);
-    init_htp_tensor(&req.src1, src1);
-    init_htp_tensor(&req.src2, src2);
-    init_htp_tensor(&req.dst, dst);
-
-    // Use opmask to override flags
-    if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
-    }
-    if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
-    }
-
-    dspqueue_buffer bufs[4];
-
-    // First buffer Weights.
-    // The content is static, there is no need to do any cache management
-    dspqueue_buffers_init(bufs, src0, false, false);
-
-    // Second buffer Input Activations. This is a buffer that the CPU
-    // writes and the DSP reads, so we'll need to flush CPU caches and
-    // invalidate DSP ones. On platforms with I/O coherency support the
-    // framework will automatically skip cache operations where possible.
-    dspqueue_buffers_init(&bufs[1], src1, true, true);
-
-    // Third buffer expert IDs. This is a buffer that the CPU
-    // writes and the DSP reads, so we'll need to flush CPU caches and
-    // invalidate DSP ones. On platforms with I/O coherency support the
-    // framework will automatically skip cache operations where possible.
-    dspqueue_buffers_init(&bufs[2], src2, true, true);
-
-    // Forth buffer Output Activations. We'll handle DSP
-    // cache maintenance in the response message but need to flush
-    // CPU caches to ensure any previously written dirty lines are
-    // written out before writes from the DSP start.
-    dspqueue_buffers_init(&bufs[3], dst, true, false);
-
-    auto * sess = get_session_from_tensor(src0);
-
-    if (opt_verbose) {
-        hex_print_op_info(op, sess, req.flags);
-        if (opt_verbose > 1) {
-            hex_dump_dspbuf(src0, &bufs[0]);
-            hex_dump_dspbuf(src1, &bufs[1]);
-            hex_dump_dspbuf(src2, &bufs[2]);
-            hex_dump_dspbuf(dst, &bufs[3]);
-        }
-    }
-
-    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
-        sess->enqueue(req, bufs, 4, opt_opsync);
-    }
-
-    t2 = ggml_time_us();
-
-    HEX_PROFILE(
-        "ggml-hex: %s matmul-id %s %u:%u:%u:%u x %s %u:%u:%u:%u (%s %u:%u:%u:%u) -> %s %u:%u:%u:%u : op-usec %u "
-        "op-cycles %u op-pkts %u (%f) call-usec %llu\n",
-        sess->name.c_str(), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2],
-        (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2],
-        (uint32_t) src1->ne[3], src2->name, (uint32_t) src2->ne[0], (uint32_t) src2->ne[1], (uint32_t) src2->ne[2],
-        (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
-        (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-        (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
-}
-
 template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * node = op;
     const struct ggml_tensor * src0 = node->src[0];
@@ -2493,7 +2414,7 @@ template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggm
     (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
 }
 
-static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
+template <bool _IsSrc0Constant> static void ggml_hexagon_binary_id(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * node = op;
     const struct ggml_tensor * src0 = node->src[0];
     const struct ggml_tensor * src1 = node->src[1];
@@ -2518,6 +2439,9 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
     }
 
     switch (node->op) {
+        case GGML_OP_MUL_MAT_ID:
+            req.op = HTP_OP_MUL_MAT_ID;
+            break;
         case GGML_OP_ADD_ID:
             req.op = HTP_OP_ADD_ID;
             break;
@@ -2532,7 +2456,7 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
     dspqueue_buffer bufs[4];
 
     // First buffer = input activations
-    dspqueue_buffers_init(bufs, src0, true, true);
+    dspqueue_buffers_init(bufs, src0, !_IsSrc0Constant, !_IsSrc0Constant);
     // Second buffer = experts bias
     dspqueue_buffers_init(&bufs[1], src1, true, true);
     // Third buffer = activated experts
@@ -2875,7 +2799,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
             prev_quant_op = node;
             break;
         case GGML_OP_MUL_MAT_ID:
-            ggml_hexagon_mul_mat_id(node, flags);
+            ggml_hexagon_binary_id<true>(node, flags);
             prev_quant_op = node;
             break;
         case GGML_OP_MUL:
@@ -2884,7 +2808,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
             ggml_hexagon_binary<false>(node, flags);
             break;
         case GGML_OP_ADD_ID:
-            ggml_hexagon_add_id(node, flags);
+            ggml_hexagon_binary_id<false>(node, flags);
             break;
         case GGML_OP_RMS_NORM:
             ggml_hexagon_unary(node, flags);
From 8424d62d7fe141b279cffe217c7a92544934787c Mon Sep 17 00:00:00 2001
From: chraac
Date: Mon, 24 Nov 2025 17:05:08 +0800
Subject: [PATCH 03/12] refactor: initialize buffer types and streamline
 dspqueue_buffers_init calls for clarity

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 73 ++++++++++++++++----------
 1 file changed, 44 insertions(+), 29 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 843ff1b8a2b..54f99d634de 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -221,8 +221,8 @@ struct ggml_hexagon_session {
     void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false);
     void flush();
 
-    ggml_backend_buffer_type buffer_type;
-    ggml_backend_buffer_type repack_buffer_type;
+    ggml_backend_buffer_type buffer_type        = {};
+    ggml_backend_buffer_type repack_buffer_type = {};
 
     std::string name;
     remote_handle64 handle;
@@ -1838,11 +1838,8 @@ void ggml_hexagon_session::release() noexcept(true) {
 }
 
 ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false) {
-    buffer_type.context = nullptr;
-    repack_buffer_type.context = nullptr;
-
-    buffer_type.device = dev;
-    repack_buffer_type.device = dev;
+    buffer_type.device        = dev;
+    repack_buffer_type.device = dev;
 
     try {
         allocate(dev_id);
@@ -2293,19 +2290,38 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
     h->nb[3] = t->nb[3];
 }
 
-static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t, bool flush_host, bool flush_htp) {
+enum dsp_buffer_type {
+    DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ = 0,
+    DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ,
+    DSP_BUFFER_TYPE_CONSTANT,
+};
+
+static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t, dsp_buffer_type buff_type) {
     if (!t) {
         return 0;
     }
 
     memset(buf, 0, sizeof(*buf));
 
     auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
-    buf->fd = tensor_buf->fd;
-    buf->ptr = t->data;
-    buf->offset = (uint8_t *) t->data - tensor_buf->base;
-    buf->size = ggml_nbytes(t);
-    buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0);          // Flush CPU
-    buf->flags |= (flush_htp ? DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0);  // Invalidate DSP
+    buf->fd     = tensor_buf->fd;
+    buf->ptr    = t->data;
+    buf->offset = (uint8_t *) t->data - tensor_buf->base;
+    buf->size   = ggml_nbytes(t);
+
+    switch (buff_type) {
+        case DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ:
+            // Flush CPU
+            buf->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER;
+            break;
+        case DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ:
+            // Flush CPU, Invalidate DSP
+            buf->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
+            break;
+        default:
+            // Constant buffer, no cache maintenance
+            buf->flags = 0;
+            break;
+    }
 
     return 1;
 }
@@ -2372,21 +2388,20 @@ template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggm
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    dspqueue_buffers_init(bufs, src0, !_IsSrc0Constant, !_IsSrc0Constant);
+    dspqueue_buffers_init(bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
     // Second buffer = Second Operand of Binary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    dspqueue_buffers_init(&bufs[1], src1, true, true);
+    dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
     // Third buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    dspqueue_buffers_init(&bufs[2], dst, true, false);
-
+    dspqueue_buffers_init(&bufs[2], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
     auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
@@ -2456,13 +2471,13 @@ template <bool _IsSrc0Constant> static void ggml_hexagon_binary_id(const struct
     dspqueue_buffer bufs[4];
 
     // First buffer = input activations
-    dspqueue_buffers_init(bufs, src0, !_IsSrc0Constant, !_IsSrc0Constant);
+    dspqueue_buffers_init(bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
     // Second buffer = experts bias
-    dspqueue_buffers_init(&bufs[1], src1, true, true);
+    dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
     // Third buffer = activated experts
-    dspqueue_buffers_init(&bufs[2], src2, true, true);
+    dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
     // Forth buffer = output activations
-    dspqueue_buffers_init(&bufs[3], dst, true, true);
+    dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
 
     auto * sess = get_session_from_tensor(src0);
@@ -2567,21 +2582,21 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
+    size_t n_bufs = dspqueue_buffers_init(bufs, src0, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
     // Second buffer(nullable) = Second Operand of Binary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
     // Second or third buffer = Output Activations. We'll handle DSP
     // Second buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
 
     // Primary DSP session from the src0 tensor
     auto * sess = get_session_from_tensor(src0);
@@ -2666,28 +2681,28 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
+    size_t n_bufs = dspqueue_buffers_init(bufs, src0, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
     // Second buffer
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
     // Third buffer(nullable)
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src2, true, true);
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
     // Final buffer = Output Activations. We'll handle DSP
     // Second buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
 
     // Primary DSP session from the src0 tensor
     auto * sess = get_session_from_tensor(src0);
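
A note on the enum above: dsp_buffer_type names the data-flow role of each buffer instead of passing two raw booleans. The mapping it encodes, as a compact standalone sketch — the FLAG_* values here are stand-ins for the Hexagon SDK's DSPQUEUE_BUFFER_FLAG_* constants, not their real values:

    #include <cstdint>

    // Assumed placeholder values; the real constants come from dspqueue.h.
    constexpr uint32_t FLAG_FLUSH_SENDER         = 1u << 0;
    constexpr uint32_t FLAG_INVALIDATE_RECIPIENT = 1u << 1;

    enum dsp_buffer_type {
        DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ = 0, // outputs: flush CPU caches only
        DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ,     // inputs: flush CPU + invalidate DSP
        DSP_BUFFER_TYPE_CONSTANT,               // static weights: no maintenance
    };

    // Illustrative helper: buffer role -> cache-maintenance flags.
    static uint32_t cache_flags_for(dsp_buffer_type type) {
        switch (type) {
            case DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ:
                return FLAG_FLUSH_SENDER;
            case DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ:
                return FLAG_FLUSH_SENDER | FLAG_INVALIDATE_RECIPIENT;
            default:
                return 0;
        }
    }
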
From f1fa387c1ad9676a9bca8a696271b8c1635b9e25 Mon Sep 17 00:00:00 2001
From: chraac
Date: Tue, 25 Nov 2025 00:41:07 +0800
Subject: [PATCH 04/12] add comment

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 44 +++++++++++++++----------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 54f99d634de..5c7837a478e 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2383,24 +2383,21 @@ template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggm
     init_htp_tensor(&req.dst, dst);
 
     dspqueue_buffer bufs[3];
-    // First buffer = First Operand of Binary op
-    // This is a buffer that the CPU writes and the DSP reads, so we'll
-    // need to flush CPU caches and invalidate DSP ones. On platforms
-    // with I/O coherency support the framework will automatically skip
-    // cache operations where possible.
+
+    // Buffer 0 (src0): Weights (mulmat) or First Operand (binary op).
+    // If constant (e.g. weights), no cache management is needed.
+    // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
+    // Note: On platforms with I/O coherency, the framework skips cache ops automatically.
     dspqueue_buffers_init(bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-    // Second buffer = Second Operand of Binary op
-    // This is a buffer that the CPU writes and the DSP reads, so we'll
-    // need to flush CPU caches and invalidate DSP ones. On platforms
-    // with I/O coherency support the framework will automatically skip
-    // cache operations where possible.
+    // Buffer 1 (src1): Input Activations (mulmat) or Second Operand (binary op).
+    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
     dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-    // Third buffer = Output Activations. We'll handle DSP
-    // cache maintenance in the response message but need to flush
-    // CPU caches to ensure any previously written dirty lines are
-    // written out before writes from the DSP start.
+    // Buffer 2 (dst): Output Activations.
+    // DSP writes, CPU reads.
+    // We flush CPU caches to ensure consistency before DSP writes.
+    // DSP cache maintenance is handled in the response message.
     dspqueue_buffers_init(&bufs[2], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
     auto * sess = get_session_from_tensor(src0);
@@ -2470,13 +2467,24 @@ template <bool _IsSrc0Constant> static void ggml_hexagon_binary_id(const struct
     init_htp_tensor(&req.dst, dst);
 
     dspqueue_buffer bufs[4];
-    // First buffer = input activations
+
+    // Buffer 0 (src0): Weights (mulmat) or Input Activations (other op).
+    // If constant, no cache management is needed.
+    // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
     dspqueue_buffers_init(bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
-    // Second buffer = experts bias
+
+    // Buffer 1 (src1): Input Activations (mulmat) or Experts Bias (other op).
+    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
     dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
-    // Third buffer = activated experts
+
+    // Buffer 2 (src2): Expert IDs (mulmat) or Activated Experts (other op).
+    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
     dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
-    // Forth buffer = output activations
+
+    // Buffer 3 (dst): Output Activations.
+    // DSP writes, CPU reads.
+    // We flush CPU caches to ensure consistency before DSP writes.
+    // DSP cache maintenance is handled in the response message.
     dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
 
     auto * sess = get_session_from_tensor(src0);
From 4c33de3e5061baa38ce2f5cfed3597787d3ce5ec Mon Sep 17 00:00:00 2001
From: chraac
Date: Tue, 25 Nov 2025 19:24:59 +0800
Subject: [PATCH 05/12] refactor: remove redundant buffer checks in hexagon
 supported operations

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 53 +++++---------------------
 1 file changed, 10 insertions(+), 43 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 5c7837a478e..f7dba6ef311 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -8,8 +8,8 @@
 #include 
 #include 
 #include 
-#include 
 #include 
+#include 
 
 #ifdef _WIN32
 # include 
@@ -1982,11 +1982,6 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
         return false;
     }
 
-    // src0 & src1 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0, src1, dst)) {
-        return false;
-    }
-
     return true;
 }
@@ -2029,12 +2024,6 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
         return false;
     }
 
-    // src0 (weights) must be repacked and mapped to the same session
-    // src1 & sr2 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
-        return false;
-    }
-
     return true;
 }
@@ -2064,11 +2053,6 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
         return false;
     }
 
-    // src0, src1 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0, src1, dst)) {
-        return false;
-    }
-
     return true;
 }
@@ -2096,11 +2080,6 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
         return false;
     }
 
-    // src0, src1 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
-        return false;
-    }
-
     return true;
 }
@@ -2123,11 +2102,6 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
         return false;
     }
 
-    // src0 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0, dst)) {
-        return false;
-    }
-
     return true;
 }
@@ -2160,11 +2134,6 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
         }
     }
 
-    // src0, src1 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0, src1, dst)) {
-        return false;
-    }
-
     return true;
 }
@@ -2213,11 +2182,6 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
         }
     }
 
-    // src0, src1 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0, src1, dst)) {
-        return false;
-    }
-
     return true;
 }
@@ -2268,11 +2232,6 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
         }
     }
 
-    // src0, src1, src2 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
-        return false;
-    }
-
     return true;
 }
@@ -3136,8 +3095,16 @@ static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_repack_buffer_
 static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
     auto sess = static_cast<ggml_hexagon_session *>(dev->context);
 
-    bool supp = false;
+    // src0, src1, src2 & dst must be mapped to the same session
+    if (!hex_supported_buffer(sess, op->src[0], op->src[1], op->src[2], op)) {
+        if (opt_verbose) {
+            HEX_VERBOSE("ggml-hex: %s device-unsupports-op %s : unsupported buffer types\n", sess->name.c_str(),
+                        ggml_op_name(op->op));
+        }
+        return false;
+    };
 
+    bool supp = false;
     switch (op->op) {
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
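
A note on the hoisted check above: the buffer/session test now runs once, uniformly, before any per-op logic, instead of being duplicated in eight per-op predicates. A simplified, self-contained sketch of the resulting control flow — stub types standing in for the real ggml/hexagon structures, with the stub check playing the role of hex_supported_buffer:

    // Stub types; the real code uses ggml_tensor / ggml_hexagon_session.
    struct tensor { int op; const tensor * src[3]; const void * buffer; };
    struct session { const void * owned_buffer; };

    // Every non-null operand must live in a buffer owned by this session.
    static bool buffer_on_session(const session * sess, const tensor * t) {
        return t == nullptr || t->buffer == sess->owned_buffer;
    }

    static bool device_supports_op(const session * sess, const tensor * op) {
        // One early, uniform check replaces the eight per-op copies.
        if (!buffer_on_session(sess, op->src[0]) || !buffer_on_session(sess, op->src[1]) ||
            !buffer_on_session(sess, op->src[2]) || !buffer_on_session(sess, op)) {
            return false;
        }
        // Per-op checks now cover only shapes and types.
        return true;
    }
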
From 48552b192a6704e0a96fccc95dbc130bdd56b1eb Mon Sep 17 00:00:00 2001
From: chraac
Date: Wed, 26 Nov 2025 00:41:03 +0800
Subject: [PATCH 06/12] wip

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index f7dba6ef311..f298c6694e6 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2059,7 +2059,6 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
 static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
     const struct ggml_tensor * src0 = op->src[0];
     const struct ggml_tensor * src1 = op->src[1];
-    const struct ggml_tensor * src2 = op->src[2];
     const struct ggml_tensor * dst  = op;
 
     if (!hex_supported_src0_type(src0->type)) {

From 46ce567afad91e88b1073dca431680607e975122 Mon Sep 17 00:00:00 2001
From: chraac
Date: Wed, 26 Nov 2025 00:56:23 +0800
Subject: [PATCH 07/12] add missing include to fix weak symbol warning

---
 ggml/src/ggml-hexagon/htp-utils.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-hexagon/htp-utils.h b/ggml/src/ggml-hexagon/htp-utils.h
index 1a48f5dcbdf..7bbae3a0b73 100644
--- a/ggml/src/ggml-hexagon/htp-utils.h
+++ b/ggml/src/ggml-hexagon/htp-utils.h
@@ -8,6 +8,7 @@ extern "C" {
 #include 
 #include 
 #include 
+#include 
 #include 
 
 /* Offset to differentiate HLOS and Hexagon error codes.
From 5f9dfe64cff7eb721a8c158e8ab58b2afe063656 Mon Sep 17 00:00:00 2001
From: chraac
Date: Wed, 26 Nov 2025 13:59:30 +0800
Subject: [PATCH 08/12] add ggml_hexagon_op_generic

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 69 ++++++++++++++++++++++++--
 1 file changed, 64 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index f298c6694e6..8cf57f84bfe 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -1598,7 +1598,7 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
     try {
         ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/);
         return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
-    } catch (std::exception const &exc) {
+    } catch (const std::exception & exc) {
         GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
         return nullptr;
     }
@@ -1610,7 +1610,7 @@ static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffe
     try {
         ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/);
         return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
-    } catch (std::exception const &exc) {
+    } catch (const std::exception & exc) {
         GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
         return nullptr;
     }
@@ -1849,7 +1849,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
         repack_buffer_type.iface   = ggml_backend_hexagon_repack_buffer_type_interface;
         repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this);
 
-    } catch (std::exception const &exc) {
+    } catch (const std::exception & exc) {
         release();
         throw;
     }
@@ -2296,6 +2296,65 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer
         (unsigned int) d->size);
 }
 
+typedef size_t (*init_dsp_req_and_buffer_t)(htp_general_req * req,
+                                            dspqueue_buffer (&bufs)[4],
+                                            const struct ggml_tensor * op);
+
+template <bool _IsSrc0Constant, init_dsp_req_and_buffer_t init_req>
+static void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) {
+    const struct ggml_tensor * node = op;
+    const struct ggml_tensor * src0 = node->src[0];
+    const struct ggml_tensor * src1 = node->src[1];
+    const struct ggml_tensor * src2 = node->src[2];
+    const struct ggml_tensor * dst  = node;
+
+    uint64_t t1 = 0;
+    uint64_t t2 = 0;
+
+    t1 = ggml_time_us();
+
+    // Construct HTP message
+    htp_general_req req;
+    req.flags = flags;
+
+    // Use opmask to override flags
+    if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
+        req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
+    }
+    if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
+        req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
+    }
+
+    dspqueue_buffer bufs[4];
+    init_req(&req, buf, op);
+
+    auto * sess = get_session_from_tensor(src0);
+    if (opt_verbose) {
+        hex_print_op_info(op, sess, req.flags);
+        if (opt_verbose > 1) {
+            hex_dump_dspbuf(src0, &bufs[0]);
+            hex_dump_dspbuf(src1, &bufs[1]);
+            hex_dump_dspbuf(src2, &bufs[2]);
+            hex_dump_dspbuf(dst, &bufs[3]);
+        }
+    }
+
+    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
+        sess->enqueue(req, bufs, 4, opt_opsync);
+    }
+
+    t2 = ggml_time_us();
+
+    HEX_PROFILE(
+        "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
+        "call-usec %llu\n",
+        sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
+        (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
+        (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
+        (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
+        (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
+}
+
 template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * node = op;
     const struct ggml_tensor * src0 = node->src[0];
@@ -3247,7 +3306,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
         }
     }
 
-    if(opt_arch < 75) {
+    if (opt_arch < 75) {
         opt_ndev = 1;
         GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
     }
@@ -3260,7 +3319,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
         devices[i].reg = reg;
         try {
             devices[i].context = new ggml_hexagon_session(i, &devices[i]);
-        } catch (std::exception const &exc) {
+        } catch (const std::exception & exc) {
             GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
             devices[i].context = nullptr;
         }
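
A note on the dispatcher above: it takes the request-initializer as a non-type template parameter, so each instantiation bakes in its callback and the compiler can inline it — there is no indirect function-pointer call at runtime. Reduced to a standalone sketch with illustrative names:

    #include <cstdio>

    struct req { int op; };

    // Callback type used as a non-type template parameter.
    typedef size_t (*init_fn)(req *);

    static size_t init_add(req * r) { r->op = 1; return 3; }
    static size_t init_mul(req * r) { r->op = 2; return 3; }

    // Init is a compile-time constant of the instantiation, so the call
    // below can be inlined; no runtime dispatch is involved.
    template <init_fn Init>
    void run_op() {
        req r{};
        size_t n_bufs = Init(&r);
        std::printf("op=%d n_bufs=%zu\n", r.op, n_bufs);
    }

    int main() {
        run_op<init_add>();
        run_op<init_mul>();
        return 0;
    }
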
From 5e27b7f402d448fd4de2a1721b806027c7ebf1ed Mon Sep 17 00:00:00 2001
From: chraac
Date: Wed, 26 Nov 2025 15:31:07 +0800
Subject: [PATCH 09/12] refactor: simplify tensor operation initialization and
 buffer management in hexagon implementation

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 282 ++++++++++---------------
 1 file changed, 111 insertions(+), 171 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 8cf57f84bfe..d481598f28a 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -7,6 +7,7 @@
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -2296,9 +2297,7 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer
         (unsigned int) d->size);
 }
 
-typedef size_t (*init_dsp_req_and_buffer_t)(htp_general_req * req,
-                                            dspqueue_buffer (&bufs)[4],
-                                            const struct ggml_tensor * op);
+typedef size_t (*init_dsp_req_and_buffer_t)(htp_general_req * req, dspqueue_buffer (&bufs)[4], const ggml_tensor * op);
 
 template <bool _IsSrc0Constant, init_dsp_req_and_buffer_t init_req>
 static void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) {
@@ -2326,210 +2325,151 @@ static void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flag
     }
 
     dspqueue_buffer bufs[4];
-    init_req(&req, buf, op);
+    const size_t n_bufs = init_req(&req, bufs, op);
 
     auto * sess = get_session_from_tensor(src0);
     if (opt_verbose) {
         hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
-            hex_dump_dspbuf(src1, &bufs[1]);
-            hex_dump_dspbuf(src2, &bufs[2]);
+            if (src1) {
+                hex_dump_dspbuf(src1, &bufs[1]);
+            }
+            if (src2) {
+                hex_dump_dspbuf(src2, &bufs[2]);
+            }
             hex_dump_dspbuf(dst, &bufs[3]);
         }
     }
 
     if ((opt_opmask & HTP_OPMASK_QUEUE)) {
-        sess->enqueue(req, bufs, 4, opt_opsync);
+        sess->enqueue(req, bufs, n_bufs, opt_opsync);
     }
 
     t2 = ggml_time_us();
 
-    HEX_PROFILE(
-        "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
-        "call-usec %llu\n",
-        sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-        (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-        (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
-        (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-        (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
+    if (src1) {
+        HEX_PROFILE(
+            "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u "
+            "(%f) call-usec %llu\n",
+            sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
+            (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
+            (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
+            (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
+            (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
+    } else {
+        HEX_PROFILE(
+            "ggml-hex: %s %s %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) call-usec "
+            "%llu\n",
+            sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
+            (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
+            (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
+            (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
+    }
 }
 
-template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
-    const struct ggml_tensor * node = op;
-    const struct ggml_tensor * src0 = node->src[0];
-    const struct ggml_tensor * src1 = node->src[1];
-    const struct ggml_tensor * dst  = node;
-
-    uint64_t t1 = 0;
-    uint64_t t2 = 0;
-
-    t1 = ggml_time_us();
-
-    // Construct HTP message
-    htp_general_req req;
-    req.flags = flags;
-
-    // Use opmask to override flags
-    if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
-    }
-    if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
-    }
-
-    switch (node->op) {
-        case GGML_OP_MUL_MAT:
-            req.op = HTP_OP_MUL_MAT;
-            break;
-        case GGML_OP_MUL:
-            req.op = HTP_OP_MUL;
-            break;
-        case GGML_OP_ADD:
-            req.op = HTP_OP_ADD;
-            break;
-        case GGML_OP_SUB:
-            req.op = HTP_OP_SUB;
-            break;
-        default:
-            GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
-    }
-
-    init_htp_tensor(&req.src0, src0);
-    init_htp_tensor(&req.src1, src1);
-    init_htp_tensor(&req.dst, dst);
-
-    dspqueue_buffer bufs[3];
-
-    // Buffer 0 (src0): Weights (mulmat) or First Operand (binary op).
-    // If constant (e.g. weights), no cache management is needed.
-    // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
-    // Note: On platforms with I/O coherency, the framework skips cache ops automatically.
-    dspqueue_buffers_init(bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
-
-    // Buffer 1 (src1): Input Activations (mulmat) or Second Operand (binary op).
-    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
-    dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
-
-    // Buffer 2 (dst): Output Activations.
-    // DSP writes, CPU reads.
-    // We flush CPU caches to ensure consistency before DSP writes.
-    // DSP cache maintenance is handled in the response message.
-    dspqueue_buffers_init(&bufs[2], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
-    auto * sess = get_session_from_tensor(src0);
-
-    if (opt_verbose) {
-        hex_print_op_info(op, sess, req.flags);
-        if (opt_verbose > 1) {
-            hex_dump_dspbuf(src0, &bufs[0]);
-            hex_dump_dspbuf(src1, &bufs[1]);
-            hex_dump_dspbuf(dst, &bufs[2]);
-        }
-    }
-
-    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
-        sess->enqueue(req, bufs, 3, opt_opsync);
-    }
-
-    t2 = ggml_time_us();
-
-    HEX_PROFILE(
-        "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
-        "call-usec %llu\n",
-        sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-        (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-        (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
-        (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-        (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
+template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
+    constexpr const auto init_func = [](htp_general_req * req, dspqueue_buffer (&bufs)[4],
+                                        const ggml_tensor * op) -> size_t {
+        const struct ggml_tensor * node = op;
+        const struct ggml_tensor * src0 = node->src[0];
+        const struct ggml_tensor * src1 = node->src[1];
+        const struct ggml_tensor * dst  = node;
+
+        switch (node->op) {
+            case GGML_OP_MUL_MAT:
+                req->op = HTP_OP_MUL_MAT;
+                break;
+            case GGML_OP_MUL:
+                req->op = HTP_OP_MUL;
+                break;
+            case GGML_OP_ADD:
+                req->op = HTP_OP_ADD;
+                break;
+            case GGML_OP_SUB:
+                req->op = HTP_OP_SUB;
+                break;
+            default:
+                GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
+                break;
+        }
+
+        init_htp_tensor(&req->src0, src0);
+        init_htp_tensor(&req->src1, src1);
+        init_htp_tensor(&req->dst, dst);
+
+        // Buffer 0 (src0): Weights (mulmat) or First Operand (binary op).
+        // If constant (e.g. weights), no cache management is needed.
+        // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
+        // Note: On platforms with I/O coherency, the framework skips cache ops automatically.
+        size_t n_bufs = dspqueue_buffers_init(
+            bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+
+        // Buffer 1 (src1): Input Activations (mulmat) or Second Operand (binary op).
+        // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
+        n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+
+        // Buffer 2 (dst): Output Activations.
+        // DSP writes, CPU reads.
+        // We flush CPU caches to ensure consistency before DSP writes.
+        // DSP cache maintenance is handled in the response message.
+        n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
+
+        return n_bufs;
+    };
+
+    ggml_hexagon_op_generic<_IsSrc0Constant, init_func>(op, flags);
 }
 
-template <bool _IsSrc0Constant> static void ggml_hexagon_binary_id(const struct ggml_tensor * op, uint32_t flags) {
-    const struct ggml_tensor * node = op;
-    const struct ggml_tensor * src0 = node->src[0];
-    const struct ggml_tensor * src1 = node->src[1];
-    const struct ggml_tensor * src2 = node->src[2];
-    const struct ggml_tensor * dst  = node;
-
-    uint64_t t1 = 0;
-    uint64_t t2 = 0;
-
-    t1 = ggml_time_us();
-
-    // Construct HTP message
-    htp_general_req req;
-    req.flags = flags;
-
-    // Use opmask to override flags
-    if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
-    }
-    if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
-    }
-
-    switch (node->op) {
-        case GGML_OP_MUL_MAT_ID:
-            req.op = HTP_OP_MUL_MAT_ID;
-            break;
-        case GGML_OP_ADD_ID:
-            req.op = HTP_OP_ADD_ID;
-            break;
-        default:
-            GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
-    }
-
-    init_htp_tensor(&req.src0, src0);
-    init_htp_tensor(&req.src1, src1);
-    init_htp_tensor(&req.src2, src2);
-    init_htp_tensor(&req.dst, dst);
-
-    dspqueue_buffer bufs[4];
-
-    // Buffer 0 (src0): Weights (mulmat) or Input Activations (other op).
-    // If constant, no cache management is needed.
-    // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
-    dspqueue_buffers_init(bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
-
-    // Buffer 1 (src1): Input Activations (mulmat) or Experts Bias (other op).
-    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
-    dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
-
-    // Buffer 2 (src2): Expert IDs (mulmat) or Activated Experts (other op).
-    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
-    dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
-
-    // Buffer 3 (dst): Output Activations.
-    // DSP writes, CPU reads.
-    // We flush CPU caches to ensure consistency before DSP writes.
-    // DSP cache maintenance is handled in the response message.
-    dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
-
-    auto * sess = get_session_from_tensor(src0);
-
-    if (opt_verbose) {
-        hex_print_op_info(op, sess, req.flags);
-        if (opt_verbose > 1) {
-            hex_dump_dspbuf(src0, &bufs[0]);
-            hex_dump_dspbuf(src1, &bufs[1]);
-            hex_dump_dspbuf(src2, &bufs[2]);
-            hex_dump_dspbuf(dst, &bufs[3]);
-        }
-    }
-
-    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
-        sess->enqueue(req, bufs, 4, opt_opsync);
-    }
-
-    t2 = ggml_time_us();
-
-    HEX_PROFILE(
-        "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
-        "call-usec %llu\n",
-        sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-        (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-        (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
-        (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-        (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
+template <bool _IsSrc0Constant> static void ggml_hexagon_binary_id(const struct ggml_tensor * op, uint32_t flags) {
+    constexpr const auto init_func = [](htp_general_req * req, dspqueue_buffer (&bufs)[4],
+                                        const ggml_tensor * op) -> size_t {
+        const struct ggml_tensor * node = op;
+        const struct ggml_tensor * src0 = node->src[0];
+        const struct ggml_tensor * src1 = node->src[1];
+        const struct ggml_tensor * src2 = node->src[2];
+        const struct ggml_tensor * dst  = node;
+
+        switch (node->op) {
+            case GGML_OP_MUL_MAT_ID:
+                req->op = HTP_OP_MUL_MAT_ID;
+                break;
+            case GGML_OP_ADD_ID:
+                req->op = HTP_OP_ADD_ID;
+                break;
+            default:
+                GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
+        }
+
+        init_htp_tensor(&req->src0, src0);
+        init_htp_tensor(&req->src1, src1);
+        init_htp_tensor(&req->src2, src2);
+        init_htp_tensor(&req->dst, dst);
+
+        // Buffer 0 (src0): Weights (mulmat) or Input Activations (other op).
+        // If constant, no cache management is needed.
+        // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
+        size_t n_bufs = dspqueue_buffers_init(
+            bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+
+        // Buffer 1 (src1): Input Activations (mulmat) or Experts Bias (other op).
+        // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
+        n_bufs += dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+
+        // Buffer 2 (src2): Expert IDs (mulmat) or Activated Experts (other op).
+        // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
+        n_bufs += dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+
+        // Buffer 3 (dst): Output Activations.
+        // DSP writes, CPU reads.
+        // We flush CPU caches to ensure consistency before DSP writes.
+        // DSP cache maintenance is handled in the response message.
+        n_bufs += dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
+
+        return n_bufs;
+    };
+
+    ggml_hexagon_op_generic<_IsSrc0Constant, init_func>(op, flags);
 }
 
 static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
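
A note on the lambdas above: each wrapper now hands a capture-less lambda to the dispatcher as a template argument. A capture-less lambda converts to a plain function pointer, and since C++17 that conversion is usable in constant expressions; the portable C++17 spelling forces it with unary '+', as in this illustrative sketch (not code from the series):

    #include <cstdio>

    typedef int (*init_fn)(int);

    template <init_fn Init>
    int run(int x) {
        return Init(x);
    }

    int main() {
        // A capture-less lambda converts to a function pointer; the unary '+'
        // forces that conversion so the result can serve as a non-type
        // template argument under C++17 rules.
        constexpr auto twice = [](int x) { return 2 * x; };
        std::printf("%d\n", run<+twice>(21)); // prints 42
        return 0;
    }

The next patch replaces these lambdas with named free functions, which avoids leaning on this conversion in template arguments altogether.
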
From 186053c9a8e1c683af17f87b137cb9654d07c9c9 Mon Sep 17 00:00:00 2001
From: chraac
Date: Wed, 26 Nov 2025 15:48:20 +0800
Subject: [PATCH 10/12] refactor: streamline hexagon operation initialization
 and buffer management

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 182 ++++++++++++------------
 1 file changed, 89 insertions(+), 93 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index d481598f28a..1277e698ad8 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2299,8 +2299,8 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer
 
 typedef size_t (*init_dsp_req_and_buffer_t)(htp_general_req * req, dspqueue_buffer (&bufs)[4], const ggml_tensor * op);
 
-template <bool _IsSrc0Constant, init_dsp_req_and_buffer_t init_req>
-static void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) {
+template <bool _IsSrc0Constant, init_dsp_req_and_buffer_t init_req>
+static inline void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * node = op;
     const struct ggml_tensor * src0 = node->src[0];
     const struct ggml_tensor * src1 = node->src[1];
@@ -2368,104 +2368,100 @@ static inline void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32
     }
 }
 
-template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
-    constexpr const auto init_func = [](htp_general_req * req, dspqueue_buffer (&bufs)[4],
-                                        const ggml_tensor * op) -> size_t {
-        const struct ggml_tensor * node = op;
-        const struct ggml_tensor * src0 = node->src[0];
-        const struct ggml_tensor * src1 = node->src[1];
-        const struct ggml_tensor * dst  = node;
-
-        switch (node->op) {
-            case GGML_OP_MUL_MAT:
-                req->op = HTP_OP_MUL_MAT;
-                break;
-            case GGML_OP_MUL:
-                req->op = HTP_OP_MUL;
-                break;
-            case GGML_OP_ADD:
-                req->op = HTP_OP_ADD;
-                break;
-            case GGML_OP_SUB:
-                req->op = HTP_OP_SUB;
-                break;
-            default:
-                GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
-                break;
-        }
+template <bool _IsSrc0Constant>
+static inline size_t init_binary_req_and_bufs(htp_general_req * req,
+                                              dspqueue_buffer (&bufs)[4],
+                                              const ggml_tensor * op) {
+    const struct ggml_tensor * node = op;
+    const struct ggml_tensor * src0 = node->src[0];
+    const struct ggml_tensor * src1 = node->src[1];
+    const struct ggml_tensor * dst  = node;
 
-        init_htp_tensor(&req->src0, src0);
-        init_htp_tensor(&req->src1, src1);
-        init_htp_tensor(&req->dst, dst);
+    switch (node->op) {
+        case GGML_OP_MUL_MAT:
+            req->op = HTP_OP_MUL_MAT;
+            break;
+        case GGML_OP_MUL:
+            req->op = HTP_OP_MUL;
+            break;
+        case GGML_OP_ADD:
+            req->op = HTP_OP_ADD;
+            break;
+        case GGML_OP_SUB:
+            req->op = HTP_OP_SUB;
+            break;
+        default:
+            GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
+            break;
+    }
 
-        // Buffer 0 (src0): Weights (mulmat) or First Operand (binary op).
-        // If constant (e.g. weights), no cache management is needed.
-        // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
-        // Note: On platforms with I/O coherency, the framework skips cache ops automatically.
-        size_t n_bufs = dspqueue_buffers_init(
-            bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+    init_htp_tensor(&req->src0, src0);
+    init_htp_tensor(&req->src1, src1);
+    init_htp_tensor(&req->dst, dst);
 
-        // Buffer 1 (src1): Input Activations (mulmat) or Second Operand (binary op).
-        // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
-        n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+    // Buffer 0 (src0): Weights (mulmat) or First Operand (binary op).
+    // If constant (e.g. weights), no cache management is needed.
+    // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
+    // Note: On platforms with I/O coherency, the framework skips cache ops automatically.
+    size_t n_bufs = dspqueue_buffers_init(
+        bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-        // Buffer 2 (dst): Output Activations.
-        // DSP writes, CPU reads.
-        // We flush CPU caches to ensure consistency before DSP writes.
-        // DSP cache maintenance is handled in the response message.
-        n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
+    // Buffer 1 (src1): Input Activations (mulmat) or Second Operand (binary op).
+    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-        return n_bufs;
-    };
+    // Buffer 2 (dst): Output Activations.
+    // DSP writes, CPU reads.
+    // We flush CPU caches to ensure consistency before DSP writes.
+    // DSP cache maintenance is handled in the response message.
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
 
-    ggml_hexagon_op_generic<_IsSrc0Constant, init_func>(op, flags);
+    return n_bufs;
 }
 
-template <bool _IsSrc0Constant> static void ggml_hexagon_binary_id(const struct ggml_tensor * op, uint32_t flags) {
-    constexpr const auto init_func = [](htp_general_req * req, dspqueue_buffer (&bufs)[4],
-                                        const ggml_tensor * op) -> size_t {
-        const struct ggml_tensor * node = op;
-        const struct ggml_tensor * src0 = node->src[0];
-        const struct ggml_tensor * src1 = node->src[1];
-        const struct ggml_tensor * src2 = node->src[2];
-        const struct ggml_tensor * dst  = node;
-
-        switch (node->op) {
-            case GGML_OP_MUL_MAT_ID:
-                req->op = HTP_OP_MUL_MAT_ID;
-                break;
-            case GGML_OP_ADD_ID:
-                req->op = HTP_OP_ADD_ID;
-                break;
-            default:
-                GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
-        }
+template <bool _IsSrc0Constant>
+static inline size_t init_binary_id_req_and_bufs(htp_general_req * req,
+                                                 dspqueue_buffer (&bufs)[4],
+                                                 const ggml_tensor * op) {
+    const struct ggml_tensor * node = op;
+    const struct ggml_tensor * src0 = node->src[0];
+    const struct ggml_tensor * src1 = node->src[1];
+    const struct ggml_tensor * src2 = node->src[2];
+    const struct ggml_tensor * dst  = node;
 
-        init_htp_tensor(&req->src0, src0);
-        init_htp_tensor(&req->src1, src1);
-        init_htp_tensor(&req->src2, src2);
-        init_htp_tensor(&req->dst, dst);
+    switch (node->op) {
+        case GGML_OP_MUL_MAT_ID:
+            req->op = HTP_OP_MUL_MAT_ID;
+            break;
+        case GGML_OP_ADD_ID:
+            req->op = HTP_OP_ADD_ID;
+            break;
+        default:
+            GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
+    }
 
-        // Buffer 0 (src0): Weights (mulmat) or Input Activations (other op).
-        // If constant, no cache management is needed.
-        // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
-        size_t n_bufs = dspqueue_buffers_init(
-            bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+    init_htp_tensor(&req->src0, src0);
+    init_htp_tensor(&req->src1, src1);
+    init_htp_tensor(&req->src2, src2);
+    init_htp_tensor(&req->dst, dst);
 
-        // Buffer 1 (src1): Input Activations (mulmat) or Experts Bias (other op).
-        // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
-        n_bufs += dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+    // Buffer 0 (src0): Weights (mulmat) or Input Activations (other op).
+    // If constant, no cache management is needed.
+    // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
+    size_t n_bufs = dspqueue_buffers_init(
+        bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-        // Buffer 2 (src2): Expert IDs (mulmat) or Activated Experts (other op).
-        // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
-        n_bufs += dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+    // Buffer 1 (src1): Input Activations (mulmat) or Experts Bias (other op).
+    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
+    n_bufs += dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-        // Buffer 3 (dst): Output Activations.
-        // DSP writes, CPU reads.
-        // We flush CPU caches to ensure consistency before DSP writes.
-        // DSP cache maintenance is handled in the response message.
-        n_bufs += dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
+    // Buffer 2 (src2): Expert IDs (mulmat) or Activated Experts (other op).
+    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
+    n_bufs += dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-        return n_bufs;
-    };
+    // Buffer 3 (dst): Output Activations.
+    // DSP writes, CPU reads.
+    // We flush CPU caches to ensure consistency before DSP writes.
+    // DSP cache maintenance is handled in the response message.
+    n_bufs += dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
 
-    ggml_hexagon_op_generic<_IsSrc0Constant, init_func>(op, flags);
+    return n_bufs;
 }
@@ -2771,20 +2771,20 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
 
         switch (node->op) {
             case GGML_OP_MUL_MAT:
-                ggml_hexagon_binary<true>(node, flags);
+                ggml_hexagon_op_generic<true, init_binary_req_and_bufs<true>>(node, flags);
                 prev_quant_op = node;
                 break;
             case GGML_OP_MUL_MAT_ID:
-                ggml_hexagon_binary_id<true>(node, flags);
+                ggml_hexagon_op_generic<true, init_binary_id_req_and_bufs<true>>(node, flags);
                 prev_quant_op = node;
                 break;
            case GGML_OP_MUL:
            case GGML_OP_ADD:
            case GGML_OP_SUB:
-                ggml_hexagon_binary<false>(node, flags);
+                ggml_hexagon_op_generic<false, init_binary_req_and_bufs<false>>(node, flags);
                 break;
             case GGML_OP_ADD_ID:
-                ggml_hexagon_binary_id<false>(node, flags);
+                ggml_hexagon_op_generic<false, init_binary_id_req_and_bufs<false>>(node, flags);
                 break;
             case GGML_OP_RMS_NORM:
                 ggml_hexagon_unary(node, flags);
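
A note on the dispatch above: because a function template instantiation is itself an ordinary function, an initializer such as init_binary_req_and_bufs<true> can be passed directly as the dispatcher's template argument — no lambda or wrapper object is needed. A tiny standalone illustration of that property:

    #include <cstdio>

    typedef int (*op_fn)(int);

    template <bool Doubled>
    int scale(int x) {
        return Doubled ? 2 * x : x;
    }

    // F is a compile-time constant, so the call inlines cleanly.
    template <op_fn F>
    int run(int x) {
        return F(x);
    }

    int main() {
        // scale<true> / scale<false> are plain functions and convert to
        // op_fn via the usual function-to-pointer conversion.
        std::printf("%d %d\n", run<scale<true>>(21), run<scale<false>>(21)); // 42 21
        return 0;
    }
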
From 01c06dc7eb15eaff91bb7ffd603c5fc557fd21ab Mon Sep 17 00:00:00 2001
From: chraac
Date: Wed, 26 Nov 2025 18:13:18 +0800
Subject: [PATCH 11/12] refactor: update function signatures and streamline
 request handling in hexagon operations

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 179 +++++--------------------
 1 file changed, 32 insertions(+), 147 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 1277e698ad8..48a1d6e6bef 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2297,9 +2297,11 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer
         (unsigned int) d->size);
 }
 
-typedef size_t (*init_dsp_req_and_buffer_t)(htp_general_req * req, dspqueue_buffer (&bufs)[4], const ggml_tensor * op);
+typedef size_t (*init_dsp_req_and_buffer_func_t)(htp_general_req * req,
+                                                 dspqueue_buffer (&bufs)[4],
+                                                 const ggml_tensor * op);
 
-template <bool _IsSrc0Constant, init_dsp_req_and_buffer_t init_req>
+template <bool _IsSrc0Constant, init_dsp_req_and_buffer_func_t _init_req_func>
 static inline void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * node = op;
     const struct ggml_tensor * src0 = node->src[0];
@@ -2314,6 +2316,7 @@ static inline void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32
 
     // Construct HTP message
     htp_general_req req;
+    memset(&req, 0, sizeof(req));
     req.flags = flags;
 
     // Use opmask to override flags
@@ -2328,7 +2331,7 @@ static inline void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32
     }
 
     dspqueue_buffer bufs[4];
-    const size_t n_bufs = init_req(&req, bufs, op);
+    const size_t n_bufs = _init_req_func(&req, bufs, op);
 
     auto * sess = get_session_from_tensor(src0);
     if (opt_verbose) {
ggml_tensor * src1 = op->src[1]; const struct ggml_tensor * dst = op; - uint64_t t1 = 0; - uint64_t t2 = 0; - - t1 = ggml_time_us(); - - // Construct HTP message - htp_general_req req; - - memset(&req, 0, sizeof(htp_general_req)); - memcpy(&req.op_params, &op->op_params, sizeof(op->op_params)); - req.flags = flags; + memcpy(&req->op_params, &op->op_params, sizeof(op->op_params)); bool supported = false; switch (op->op) { case GGML_OP_RMS_NORM: - req.op = HTP_OP_RMS_NORM; + req->op = HTP_OP_RMS_NORM; supported = true; break; case GGML_OP_UNARY: if (ggml_get_unary_op(dst) == GGML_UNARY_OP_SILU) { - req.op = HTP_OP_UNARY_SILU; + req->op = HTP_OP_UNARY_SILU; supported = true; } break; case GGML_OP_GLU: if (ggml_get_glu_op(dst) == GGML_GLU_OP_SWIGLU) { - req.op = HTP_OP_GLU_SWIGLU; + req->op = HTP_OP_GLU_SWIGLU; supported = true; } else if (ggml_get_glu_op(dst) == GGML_GLU_OP_SWIGLU_OAI) { - req.op = HTP_OP_GLU_SWIGLU_OAI; + req->op = HTP_OP_GLU_SWIGLU_OAI; supported = true; } break; case GGML_OP_SOFT_MAX: - req.op = HTP_OP_SOFTMAX; + req->op = HTP_OP_SOFTMAX; supported = true; default: @@ -2522,22 +2517,12 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { GGML_ABORT("ggml-hex: unary : unsupported op:%d\n", op->op); } - init_htp_tensor(&req.dst, dst); - init_htp_tensor(&req.src0, src0); + init_htp_tensor(&req->dst, dst); + init_htp_tensor(&req->src0, src0); if (src1) { - init_htp_tensor(&req.src1, src1); + init_htp_tensor(&req->src1, src1); } - // Use opmask to override flags - if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { - req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; - } - if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { - req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; - } - - dspqueue_buffer bufs[3]; - // First buffer = Only Operand of Unary op // This is a buffer that the CPU writes and the DSP reads, so we'll // need to flush CPU caches and invalidate DSP ones. On platforms @@ -2559,84 +2544,25 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { // written out before writes from the DSP start. 
 n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);

- // Primary DSP session from the src0 tensor
- auto * sess = get_session_from_tensor(src0);
-
- if (opt_verbose) {
- hex_print_op_info(op, sess, req.flags);
- if (opt_verbose > 1) {
- hex_dump_dspbuf(src0, &bufs[0]);
- if (src1) {
- hex_dump_dspbuf(src1, &bufs[1]);
- hex_dump_dspbuf(dst, &bufs[2]);
- } else {
- hex_dump_dspbuf(dst, &bufs[1]);
- }
- }
- }
-
- if ((opt_opmask & HTP_OPMASK_QUEUE)) {
- sess->enqueue(req, bufs, n_bufs, opt_opsync);
- }
-
- t2 = ggml_time_us();
-
- if (src1) {
- HEX_PROFILE(
- "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u "
- "(%f) call-usec %llu\n",
- sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
- (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
- (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
- (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
- (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
- } else {
- HEX_PROFILE(
- "ggml-hex: %s %s %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) call-usec "
- "%llu\n",
- sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
- (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
- (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
- (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
- }
+ return n_bufs;
 }

-static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
+static inline size_t init_rope_req_and_bufs(htp_general_req * req, dspqueue_buffer (&bufs)[4], const ggml_tensor * op) {
 const struct ggml_tensor * src0 = op->src[0];
 const struct ggml_tensor * src1 = op->src[1];
 const struct ggml_tensor * src2 = op->src[2];
 const struct ggml_tensor * dst = op;

- uint64_t t1 = 0;
- uint64_t t2 = 0;
-
- t1 = ggml_time_us();
+ memcpy(&req->op_params, &op->op_params, sizeof(op->op_params));
+ req->op = HTP_OP_ROPE;

- // Construct HTP message
- htp_general_req req;
-
- memset(&req, 0, sizeof(htp_general_req));
- memcpy(&req.op_params, &op->op_params, sizeof(op->op_params));
- req.flags = flags;
- req.op = HTP_OP_ROPE;
-
- init_htp_tensor(&req.dst, dst);
- init_htp_tensor(&req.src0, src0);
- init_htp_tensor(&req.src1, src1);
+ init_htp_tensor(&req->dst, dst);
+ init_htp_tensor(&req->src0, src0);
+ init_htp_tensor(&req->src1, src1);
 if (src2) {
- init_htp_tensor(&req.src2, src2);
+ init_htp_tensor(&req->src2, src2);
 }

- // Use opmask to override flags
- if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
- req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
- }
- if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
- req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
- }
-
- dspqueue_buffer bufs[4];
-
 // First buffer
 // This is a buffer that the CPU writes and the DSP reads, so we'll
 // need to flush CPU caches and invalidate DSP ones. On platforms
@@ -2665,48 +2591,7 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
 // written out before writes from the DSP start.
 n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);

- // Primary DSP session from the src0 tensor
- auto * sess = get_session_from_tensor(src0);
-
- if (opt_verbose) {
- hex_print_op_info(op, sess, req.flags);
- if (opt_verbose > 1) {
- hex_dump_dspbuf(src0, &bufs[0]);
- if (src1) {
- hex_dump_dspbuf(src1, &bufs[1]);
- hex_dump_dspbuf(dst, &bufs[2]);
- } else {
- hex_dump_dspbuf(dst, &bufs[1]);
- }
- }
- }
-
- if ((opt_opmask & HTP_OPMASK_QUEUE)) {
- sess->enqueue(req, bufs, n_bufs, opt_opsync);
- }
-
- t2 = ggml_time_us();
-
- if (src2) {
- HEX_PROFILE(
- "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles "
- "%u op-pkts %u (%f) call-usec %llu\n",
- sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
- (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
- (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], src2->name, (uint32_t) src2->ne[0], (uint32_t) src2->ne[1],
- (uint32_t) src2->ne[2], (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
- (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
- (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
- } else {
- HEX_PROFILE(
- "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u "
- "(%f) call-usec %llu\n",
- sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
- (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
- (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
- (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
- (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
- }
+ return n_bufs;
 }

 static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
@@ -2787,25 +2672,25 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
 ggml_hexagon_op_generic<init_binary_id_req_and_bufs<false>>(node, flags);
 break;
 case GGML_OP_RMS_NORM:
- ggml_hexagon_unary(node, flags);
+ ggml_hexagon_op_generic<init_unary_req_and_bufs>(node, flags);
 break;
 case GGML_OP_UNARY:
 if (ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) {
- ggml_hexagon_unary(node, flags);
+ ggml_hexagon_op_generic<init_unary_req_and_bufs>(node, flags);
 }
 break;
 case GGML_OP_GLU:
 if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
 (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI)) {
- ggml_hexagon_unary(node, flags);
+ ggml_hexagon_op_generic<init_unary_req_and_bufs>(node, flags);
 }
 break;
 case GGML_OP_SOFT_MAX:
- ggml_hexagon_unary(node, flags);
+ ggml_hexagon_op_generic<init_unary_req_and_bufs>(node, flags);
 break;
 case GGML_OP_ROPE:
- ggml_hexagon_rope(node, flags);
+ ggml_hexagon_op_generic<init_rope_req_and_bufs>(node, flags);
 break;
 default:

From 97dd2c776ac64723f3addc26bdc7eba9e6adbd6a Mon Sep 17 00:00:00 2001
From: chraac
Date: Wed, 26 Nov 2025 18:17:38 +0800
Subject: [PATCH 12/12] refactor: rename ggml_hexagon_op_generic to ggml_hexagon_dispatch_op

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 48a1d6e6bef..dd3d559df71 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2302,7 +2302,7 @@ typedef size_t (*init_dsp_req_and_buffer_func_t)(htp_general_req * req,
 const ggml_tensor * op);

 template <init_dsp_req_and_buffer_func_t _init_req_func>
-static inline void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) {
+static inline void ggml_hexagon_dispatch_op(const struct ggml_tensor * op, uint32_t flags) {
 const struct ggml_tensor * node = op;
 const struct ggml_tensor * src0 = node->src[0];
 const struct ggml_tensor * src1 = node->src[1];
@@ -2656,41 +2656,41 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg

 switch (node->op) {
 case GGML_OP_MUL_MAT:
- ggml_hexagon_op_generic<init_binary_req_and_bufs<true>>(node, flags);
+ ggml_hexagon_dispatch_op<init_binary_req_and_bufs<true>>(node, flags);
 prev_quant_op = node;
 break;
 case GGML_OP_MUL_MAT_ID:
- ggml_hexagon_op_generic<init_binary_id_req_and_bufs<true>>(node, flags);
+ ggml_hexagon_dispatch_op<init_binary_id_req_and_bufs<true>>(node, flags);
 prev_quant_op = node;
 break;
 case GGML_OP_MUL:
 case GGML_OP_ADD:
 case GGML_OP_SUB:
- ggml_hexagon_op_generic<init_binary_req_and_bufs<false>>(node, flags);
+ ggml_hexagon_dispatch_op<init_binary_req_and_bufs<false>>(node, flags);
 break;
 case GGML_OP_ADD_ID:
- ggml_hexagon_op_generic<init_binary_id_req_and_bufs<false>>(node, flags);
+ ggml_hexagon_dispatch_op<init_binary_id_req_and_bufs<false>>(node, flags);
 break;
 case GGML_OP_RMS_NORM:
- ggml_hexagon_op_generic<init_unary_req_and_bufs>(node, flags);
+ ggml_hexagon_dispatch_op<init_unary_req_and_bufs>(node, flags);
 break;
 case GGML_OP_UNARY:
 if (ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) {
- ggml_hexagon_op_generic<init_unary_req_and_bufs>(node, flags);
+ ggml_hexagon_dispatch_op<init_unary_req_and_bufs>(node, flags);
 }
 break;
 case GGML_OP_GLU:
 if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
 (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI)) {
- ggml_hexagon_op_generic<init_unary_req_and_bufs>(node, flags);
+ ggml_hexagon_dispatch_op<init_unary_req_and_bufs>(node, flags);
 }
 break;
 case GGML_OP_SOFT_MAX:
- ggml_hexagon_op_generic<init_unary_req_and_bufs>(node, flags);
+ ggml_hexagon_dispatch_op<init_unary_req_and_bufs>(node, flags);
 break;
 case GGML_OP_ROPE:
- ggml_hexagon_op_generic<init_rope_req_and_bufs>(node, flags);
+ ggml_hexagon_dispatch_op<init_rope_req_and_bufs>(node, flags);
 break;
 default:
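
The end state of the series is a single dispatch routine, ggml_hexagon_dispatch_op, whose per-op request/buffer setup is supplied as a non-type template parameter of function-pointer type (init_dsp_req_and_buffer_func_t). Below is a minimal standalone C++ sketch of that pattern; the struct layouts, op ids, and function names are illustrative placeholders, not the real ggml-hexagon/HTP types:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    struct req_t { int op; uint32_t flags; };
    struct buf_t { void * ptr; };

    // Counterpart of init_dsp_req_and_buffer_func_t: fill in the request,
    // describe the buffers, and return how many buffers are used.
    typedef size_t (*init_req_func_t)(req_t * req, buf_t (&bufs)[4]);

    size_t init_add_req_and_bufs(req_t * req, buf_t (&bufs)[4]) {
        req->op = 1;           // hypothetical op id
        bufs[0].ptr = nullptr; // input buffer (placeholder)
        bufs[1].ptr = nullptr; // output buffer (placeholder)
        return 2;
    }

    // Counterpart of ggml_hexagon_dispatch_op: the shared prologue/epilogue
    // is written once; op-specific setup is resolved at compile time.
    template <init_req_func_t _init_req_func>
    void dispatch_op(uint32_t flags) {
        req_t req;
        memset(&req, 0, sizeof(req)); // zero-init, as PATCH 11 adds
        req.flags = flags;
        buf_t bufs[4];
        const size_t n_bufs = _init_req_func(&req, bufs);
        printf("op=%d flags=%u n_bufs=%zu\n", req.op, req.flags, n_bufs);
    }

    int main() {
        dispatch_op<init_add_req_and_bufs>(0); // static dispatch, no indirect call
        return 0;
    }

Because the init function is a compile-time template argument rather than a runtime pointer, each instantiation can inline its setup code, which is what lets the per-op wrappers (ggml_hexagon_unary, ggml_hexagon_rope, and friends) collapse into plain init functions without adding an indirect call on the hot path.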
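A second recurring theme in these hunks is the cache policy encoded by the DSP_BUFFER_TYPE_* arguments: a buffer the CPU writes and the DSP reads needs a CPU-cache flush plus a DSP-cache invalidate, while a buffer the DSP writes and the CPU reads only needs the up-front CPU flush, with DSP-side maintenance deferred to the response message. The sketch below restates that mapping as code; the enum and struct names are invented for illustration and are not the dspqueue API:

    #include <cstdio>

    // Invented names; the real flags are the DSP_BUFFER_TYPE_* constants above.
    enum buffer_type { CPU_WRITE_DSP_READ, DSP_WRITE_CPU_READ };

    struct cache_ops {
        bool flush_cpu;      // write out dirty CPU cache lines before the DSP touches the buffer
        bool invalidate_dsp; // drop stale DSP cache lines before the DSP reads
    };

    cache_ops cache_policy(buffer_type t) {
        switch (t) {
            case CPU_WRITE_DSP_READ: return { true, true };  // input activations
            case DSP_WRITE_CPU_READ: return { true, false }; // outputs: DSP side rides the response
        }
        return { false, false }; // unreachable with the two types above
    }

    int main() {
        const cache_ops in  = cache_policy(CPU_WRITE_DSP_READ);
        const cache_ops out = cache_policy(DSP_WRITE_CPU_READ);
        printf("in:  flush_cpu=%d invalidate_dsp=%d\n", in.flush_cpu, in.invalidate_dsp);
        printf("out: flush_cpu=%d invalidate_dsp=%d\n", out.flush_cpu, out.invalidate_dsp);
        return 0;
    }

As the comments in the patches note, on platforms with I/O coherency the framework skips these cache operations where possible, so the policy describes the worst case rather than a guaranteed sequence of maintenance operations.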