From d12247a60d59eb5dcf30b24db1085b2692bece1f Mon Sep 17 00:00:00 2001
From: chraac
Date: Mon, 24 Nov 2025 12:35:45 +0800
Subject: [PATCH 01/12] refactor: replace ggml_hexagon_mul_mat with
 template-based binary operation for improved flexibility

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 81 +++-----------------------
 1 file changed, 7 insertions(+), 74 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 72a82a89116..8b7641efa01 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2322,76 +2322,6 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer
         (unsigned int) d->size);
 }
 
-static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) {
-    const struct ggml_tensor * src0 = op->src[0];
-    const struct ggml_tensor * src1 = op->src[1];
-    const struct ggml_tensor * dst  = op;
-
-    uint64_t t1, t2;
-    t1 = ggml_time_us();
-
-    // Construct HTP message
-    htp_general_req req;
-    req.op    = HTP_OP_MUL_MAT;
-    req.flags = flags;
-
-    init_htp_tensor(&req.src0, src0);
-    init_htp_tensor(&req.src1, src1);
-    init_htp_tensor(&req.dst, dst);
-
-    // Use opmask to override flags
-    if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
-    }
-    if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
-    }
-
-    dspqueue_buffer bufs[3];
-
-    // First buffer Weights.
-    // The content is static, there is no need to do any cache management
-    dspqueue_buffers_init(bufs, src0, false, false);
-
-    // Second buffer Input Activations. This is a buffer that the CPU
-    // writes and the DSP reads, so we'll need to flush CPU caches and
-    // invalidate DSP ones. On platforms with I/O coherency support the
-    // framework will automatically skip cache operations where possible.
-    dspqueue_buffers_init(&bufs[1], src1, true, true);
-
-    // Third buffer Output Activations. We'll handle DSP
-    // cache maintenance in the response message but need to flush
-    // CPU caches to ensure any previously written dirty lines are
-    // written out before writes from the DSP start.
-    dspqueue_buffers_init(&bufs[2], dst, true, false);
-
-    auto * sess = get_session_from_tensor(src0);
-
-    if (opt_verbose) {
-        hex_print_op_info(op, sess, req.flags);
-        if (opt_verbose > 1) {
-            hex_dump_dspbuf(src0, &bufs[0]);
-            hex_dump_dspbuf(src1, &bufs[1]);
-            hex_dump_dspbuf(dst, &bufs[2]);
-        }
-    }
-
-    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
-        sess->enqueue(req, bufs, 3, opt_opsync);
-    }
-
-    t2 = ggml_time_us();
-
-    HEX_PROFILE(
-        "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
-        "call-usec %llu\n",
-        sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-        (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-        (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
-        (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-        (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
-}
-
 static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * src0 = op->src[0];
     const struct ggml_tensor * src1 = op->src[1];
@@ -2471,7 +2401,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
     (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
 }
 
-static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
+template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * node = op;
     const struct ggml_tensor * src0 = node->src[0];
     const struct ggml_tensor * src1 = node->src[1];
@@ -2495,6 +2425,9 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     }
 
     switch (node->op) {
+        case GGML_OP_MUL_MAT:
+            req.op = HTP_OP_MUL_MAT;
+            break;
         case GGML_OP_MUL:
            req.op = HTP_OP_MUL;
            break;
@@ -2518,7 +2451,7 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    dspqueue_buffers_init(bufs, src0, true, true);
+    dspqueue_buffers_init(bufs, src0, !_IsSrc0Constant, !_IsSrc0Constant);
 
     // Second buffer = Second Operand of Binary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
@@ -2938,7 +2871,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
 
         switch (node->op) {
             case GGML_OP_MUL_MAT:
-                ggml_hexagon_mul_mat(node, flags);
+                ggml_hexagon_binary<true>(node, flags);
                 prev_quant_op = node;
                 break;
             case GGML_OP_MUL_MAT_ID:
@@ -2948,7 +2881,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
             case GGML_OP_MUL:
             case GGML_OP_ADD:
             case GGML_OP_SUB:
-                ggml_hexagon_binary(node, flags);
+                ggml_hexagon_binary<false>(node, flags);
                 break;
             case GGML_OP_ADD_ID:
                 ggml_hexagon_add_id(node, flags);
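
A note on the pattern above: the only behavioral difference between the removed ggml_hexagon_mul_mat and the generic binary path is whether src0 is constant weights, and this patch turns that difference into a compile-time template parameter. A minimal, self-contained sketch of the idiom — illustrative names only, not code from this series:

    #include <cstdio>

    // Stand-in for the flush-CPU / invalidate-DSP booleans that the real
    // code passes to dspqueue_buffers_init().
    struct cache_ops {
        bool flush_host;
        bool invalidate_dsp;
    };

    // IsSrc0Constant is resolved at compile time, so each instantiation
    // hard-codes its cache policy with no runtime branch.
    template <bool IsSrc0Constant>
    void submit_binary_op(const char * name) {
        cache_ops src0_ops = { !IsSrc0Constant, !IsSrc0Constant };
        std::printf("%s: flush_host=%d invalidate_dsp=%d\n", name, src0_ops.flush_host, src0_ops.invalidate_dsp);
    }

    int main() {
        submit_binary_op<true>("mul_mat");  // constant weights: no cache maintenance on src0
        submit_binary_op<false>("add");     // mutable operand: flush + invalidate
        return 0;
    }
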
From 020f6bf3f2eecf7404554e8329e2a68180774a43 Mon Sep 17 00:00:00 2001
From: chraac
Date: Mon, 24 Nov 2025 12:57:42 +0800
Subject: [PATCH 02/12] refactor: replace ggml_hexagon_mul_mat_id with
 template-based binary operation for improved flexibility

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 90 ++------------------------
 1 file changed, 7 insertions(+), 83 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 8b7641efa01..843ff1b8a2b 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2322,85 +2322,6 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer
         (unsigned int) d->size);
 }
 
-static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flags) {
-    const struct ggml_tensor * src0 = op->src[0];
-    const struct ggml_tensor * src1 = op->src[1];
-    const struct ggml_tensor * src2 = op->src[2];
-    const struct ggml_tensor * dst  = op;
-
-    uint64_t t1, t2;
-    t1 = ggml_time_us();
-
-    // Construct HTP message
-    htp_general_req req;
-    req.op    = HTP_OP_MUL_MAT_ID;
-    req.flags = flags;
-
-    init_htp_tensor(&req.src0, src0);
-    init_htp_tensor(&req.src1, src1);
-    init_htp_tensor(&req.src2, src2);
-    init_htp_tensor(&req.dst, dst);
-
-    // Use opmask to override flags
-    if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
-    }
-    if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
-    }
-
-    dspqueue_buffer bufs[4];
-
-    // First buffer Weights.
-    // The content is static, there is no need to do any cache management
-    dspqueue_buffers_init(bufs, src0, false, false);
-
-    // Second buffer Input Activations. This is a buffer that the CPU
-    // writes and the DSP reads, so we'll need to flush CPU caches and
-    // invalidate DSP ones. On platforms with I/O coherency support the
-    // framework will automatically skip cache operations where possible.
-    dspqueue_buffers_init(&bufs[1], src1, true, true);
-
-    // Third buffer expert IDs. This is a buffer that the CPU
-    // writes and the DSP reads, so we'll need to flush CPU caches and
-    // invalidate DSP ones. On platforms with I/O coherency support the
-    // framework will automatically skip cache operations where possible.
-    dspqueue_buffers_init(&bufs[2], src2, true, true);
-
-    // Forth buffer Output Activations. We'll handle DSP
-    // cache maintenance in the response message but need to flush
-    // CPU caches to ensure any previously written dirty lines are
-    // written out before writes from the DSP start.
-    dspqueue_buffers_init(&bufs[3], dst, true, false);
-
-    auto * sess = get_session_from_tensor(src0);
-
-    if (opt_verbose) {
-        hex_print_op_info(op, sess, req.flags);
-        if (opt_verbose > 1) {
-            hex_dump_dspbuf(src0, &bufs[0]);
-            hex_dump_dspbuf(src1, &bufs[1]);
-            hex_dump_dspbuf(src2, &bufs[2]);
-            hex_dump_dspbuf(dst, &bufs[3]);
-        }
-    }
-
-    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
-        sess->enqueue(req, bufs, 4, opt_opsync);
-    }
-
-    t2 = ggml_time_us();
-
-    HEX_PROFILE(
-        "ggml-hex: %s matmul-id %s %u:%u:%u:%u x %s %u:%u:%u:%u (%s %u:%u:%u:%u) -> %s %u:%u:%u:%u : op-usec %u "
-        "op-cycles %u op-pkts %u (%f) call-usec %llu\n",
-        sess->name.c_str(), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2],
-        (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2],
-        (uint32_t) src1->ne[3], src2->name, (uint32_t) src2->ne[0], (uint32_t) src2->ne[1], (uint32_t) src2->ne[2],
-        (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
-        (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-        (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
-}
-
 template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * node = op;
     const struct ggml_tensor * src0 = node->src[0];
@@ -2493,7 +2414,7 @@ template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggm
     (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
 }
 
-static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
+template <bool _IsSrc0Constant> static void ggml_hexagon_binary_id(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * node = op;
     const struct ggml_tensor * src0 = node->src[0];
     const struct ggml_tensor * src1 = node->src[1];
@@ -2518,6 +2439,9 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
     }
 
     switch (node->op) {
+        case GGML_OP_MUL_MAT_ID:
+            req.op = HTP_OP_MUL_MAT_ID;
+            break;
         case GGML_OP_ADD_ID:
             req.op = HTP_OP_ADD_ID;
             break;
@@ -2532,7 +2456,7 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
     dspqueue_buffer bufs[4];
 
     // First buffer = input activations
-    dspqueue_buffers_init(bufs, src0, true, true);
+    dspqueue_buffers_init(bufs, src0, !_IsSrc0Constant, !_IsSrc0Constant);
     // Second buffer = experts bias
     dspqueue_buffers_init(&bufs[1], src1, true, true);
     // Third buffer = activated experts
@@ -2875,7 +2799,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
             prev_quant_op = node;
             break;
         case GGML_OP_MUL_MAT_ID:
-            ggml_hexagon_mul_mat_id(node, flags);
+            ggml_hexagon_binary_id<true>(node, flags);
             prev_quant_op = node;
             break;
         case GGML_OP_MUL:
@@ -2884,7 +2808,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
             ggml_hexagon_binary<false>(node, flags);
             break;
         case GGML_OP_ADD_ID:
-            ggml_hexagon_add_id(node, flags);
+            ggml_hexagon_binary_id<false>(node, flags);
             break;
         case GGML_OP_RMS_NORM:
             ggml_hexagon_unary(node, flags);
From 8424d62d7fe141b279cffe217c7a92544934787c Mon Sep 17 00:00:00 2001
From: chraac
Date: Mon, 24 Nov 2025 17:05:08 +0800
Subject: [PATCH 03/12] refactor: initialize buffer types and streamline
 dspqueue_buffers_init calls for clarity

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 73 ++++++++++++++++----------
 1 file changed, 44 insertions(+), 29 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 843ff1b8a2b..54f99d634de 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -221,8 +221,8 @@ struct ggml_hexagon_session {
     void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false);
     void flush();
 
-    ggml_backend_buffer_type buffer_type;
-    ggml_backend_buffer_type repack_buffer_type;
+    ggml_backend_buffer_type buffer_type        = {};
+    ggml_backend_buffer_type repack_buffer_type = {};
 
     std::string name;
     remote_handle64 handle;
@@ -1838,11 +1838,8 @@ void ggml_hexagon_session::release() noexcept(true) {
 }
 
 ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false) {
-    buffer_type.context = nullptr;
-    repack_buffer_type.context = nullptr;
-
-    buffer_type.device = dev;
-    repack_buffer_type.device = dev;
+    buffer_type.device        = dev;
+    repack_buffer_type.device = dev;
 
     try {
         allocate(dev_id);
@@ -2293,19 +2290,38 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
     h->nb[3] = t->nb[3];
 }
 
-static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t, bool flush_host, bool flush_htp) {
+enum dsp_buffer_type {
+    DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ = 0,
+    DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ,
+    DSP_BUFFER_TYPE_CONSTANT,
+};
+
+static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t, dsp_buffer_type buff_type) {
     if (!t) {
         return 0;
     }
 
     memset(buf, 0, sizeof(*buf));
 
     auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
-    buf->fd = tensor_buf->fd;
-    buf->ptr = t->data;
-    buf->offset = (uint8_t *) t->data - tensor_buf->base;
-    buf->size = ggml_nbytes(t);
-    buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0);          // Flush CPU
-    buf->flags |= (flush_htp ? DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0);  // Invalidate DSP
+    buf->fd     = tensor_buf->fd;
+    buf->ptr    = t->data;
+    buf->offset = (uint8_t *) t->data - tensor_buf->base;
+    buf->size   = ggml_nbytes(t);
+
+    switch (buff_type) {
+        case DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ:
+            // Flush CPU
+            buf->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER;
+            break;
+        case DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ:
+            // Flush CPU, Invalidate DSP
+            buf->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
+            break;
+        default:
+            // Constant buffer, no cache maintenance
+            buf->flags = 0;
+            break;
+    }
 
     return 1;
 }
@@ -2372,21 +2388,20 @@ template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggm
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    dspqueue_buffers_init(bufs, src0, !_IsSrc0Constant, !_IsSrc0Constant);
+    dspqueue_buffers_init(bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
     // Second buffer = Second Operand of Binary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    dspqueue_buffers_init(&bufs[1], src1, true, true);
+    dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
     // Third buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    dspqueue_buffers_init(&bufs[2], dst, true, false);
-
+    dspqueue_buffers_init(&bufs[2], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
     auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
@@ -2456,13 +2471,13 @@ template <bool _IsSrc0Constant> static void ggml_hexagon_binary_id(const struct
     dspqueue_buffer bufs[4];
 
     // First buffer = input activations
-    dspqueue_buffers_init(bufs, src0, !_IsSrc0Constant, !_IsSrc0Constant);
+    dspqueue_buffers_init(bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
     // Second buffer = experts bias
-    dspqueue_buffers_init(&bufs[1], src1, true, true);
+    dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
     // Third buffer = activated experts
-    dspqueue_buffers_init(&bufs[2], src2, true, true);
+    dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
     // Forth buffer = output activations
-    dspqueue_buffers_init(&bufs[3], dst, true, true);
+    dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
 
     auto * sess = get_session_from_tensor(src0);
@@ -2567,21 +2582,21 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
+    size_t n_bufs = dspqueue_buffers_init(bufs, src0, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
     // Second buffer(nullable) = Second Operand of Binary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
     // Second or third buffer = Output Activations. We'll handle DSP
     // Second buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
 
     // Primary DSP session from the src0 tensor
     auto * sess = get_session_from_tensor(src0);
@@ -2666,28 +2681,28 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
+    size_t n_bufs = dspqueue_buffers_init(bufs, src0, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
     // Second buffer
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
     // Third buffer(nullable)
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src2, true, true);
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
     // Final buffer = Output Activations. We'll handle DSP
     // Second buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
 
     // Primary DSP session from the src0 tensor
     auto * sess = get_session_from_tensor(src0);
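
A note on the enum above: dsp_buffer_type names the data-flow role of each buffer instead of passing two raw booleans. The mapping it encodes, as a compact standalone sketch — the FLAG_* values here are stand-ins for the Hexagon SDK's DSPQUEUE_BUFFER_FLAG_* constants, not their real values:

    #include <cstdint>

    // Assumed placeholder values; the real constants come from dspqueue.h.
    constexpr uint32_t FLAG_FLUSH_SENDER         = 1u << 0;
    constexpr uint32_t FLAG_INVALIDATE_RECIPIENT = 1u << 1;

    enum dsp_buffer_type {
        DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ = 0, // outputs: flush CPU caches only
        DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ,     // inputs: flush CPU + invalidate DSP
        DSP_BUFFER_TYPE_CONSTANT,               // static weights: no maintenance
    };

    // Illustrative helper: buffer role -> cache-maintenance flags.
    static uint32_t cache_flags_for(dsp_buffer_type type) {
        switch (type) {
            case DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ:
                return FLAG_FLUSH_SENDER;
            case DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ:
                return FLAG_FLUSH_SENDER | FLAG_INVALIDATE_RECIPIENT;
            default:
                return 0;
        }
    }
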
From f1fa387c1ad9676a9bca8a696271b8c1635b9e25 Mon Sep 17 00:00:00 2001
From: chraac
Date: Tue, 25 Nov 2025 00:41:07 +0800
Subject: [PATCH 04/12] add comment

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 44 +++++++++++++++----------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 54f99d634de..5c7837a478e 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2383,24 +2383,21 @@ template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggm
     init_htp_tensor(&req.dst, dst);
 
     dspqueue_buffer bufs[3];
-    // First buffer = First Operand of Binary op
-    // This is a buffer that the CPU writes and the DSP reads, so we'll
-    // need to flush CPU caches and invalidate DSP ones. On platforms
-    // with I/O coherency support the framework will automatically skip
-    // cache operations where possible.
+
+    // Buffer 0 (src0): Weights (mulmat) or First Operand (binary op).
+    // If constant (e.g. weights), no cache management is needed.
+    // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
+    // Note: On platforms with I/O coherency, the framework skips cache ops automatically.
     dspqueue_buffers_init(bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-    // Second buffer = Second Operand of Binary op
-    // This is a buffer that the CPU writes and the DSP reads, so we'll
-    // need to flush CPU caches and invalidate DSP ones. On platforms
-    // with I/O coherency support the framework will automatically skip
-    // cache operations where possible.
+    // Buffer 1 (src1): Input Activations (mulmat) or Second Operand (binary op).
+    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
     dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-    // Third buffer = Output Activations. We'll handle DSP
-    // cache maintenance in the response message but need to flush
-    // CPU caches to ensure any previously written dirty lines are
-    // written out before writes from the DSP start.
+    // Buffer 2 (dst): Output Activations.
+    // DSP writes, CPU reads.
+    // We flush CPU caches to ensure consistency before DSP writes.
+    // DSP cache maintenance is handled in the response message.
     dspqueue_buffers_init(&bufs[2], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
     auto * sess = get_session_from_tensor(src0);
@@ -2470,13 +2467,24 @@ template <bool _IsSrc0Constant> static void ggml_hexagon_binary_id(const struct
     init_htp_tensor(&req.dst, dst);
 
     dspqueue_buffer bufs[4];
-    // First buffer = input activations
+
+    // Buffer 0 (src0): Weights (mulmat) or Input Activations (other op).
+    // If constant, no cache management is needed.
+    // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
     dspqueue_buffers_init(bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
-    // Second buffer = experts bias
+
+    // Buffer 1 (src1): Input Activations (mulmat) or Experts Bias (other op).
+    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
     dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
-    // Third buffer = activated experts
+
+    // Buffer 2 (src2): Expert IDs (mulmat) or Activated Experts (other op).
+    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
     dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
-    // Forth buffer = output activations
+
+    // Buffer 3 (dst): Output Activations.
+    // DSP writes, CPU reads.
+    // We flush CPU caches to ensure consistency before DSP writes.
+    // DSP cache maintenance is handled in the response message.
     dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
 
     auto * sess = get_session_from_tensor(src0);
From 4c33de3e5061baa38ce2f5cfed3597787d3ce5ec Mon Sep 17 00:00:00 2001
From: chraac
Date: Tue, 25 Nov 2025 19:24:59 +0800
Subject: [PATCH 05/12] refactor: remove redundant buffer checks in hexagon
 supported operations

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 53 +++++---------------------
 1 file changed, 10 insertions(+), 43 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 5c7837a478e..f7dba6ef311 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -8,8 +8,8 @@
 #include 
 #include 
 #include 
-#include 
 #include 
+#include 
 
 #ifdef _WIN32
 # include 
@@ -1982,11 +1982,6 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
         return false;
     }
 
-    // src0 & src1 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0, src1, dst)) {
-        return false;
-    }
-
     return true;
 }
@@ -2029,12 +2024,6 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
         return false;
     }
 
-    // src0 (weights) must be repacked and mapped to the same session
-    // src1 & sr2 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
-        return false;
-    }
-
     return true;
 }
@@ -2064,11 +2053,6 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
         return false;
     }
 
-    // src0, src1 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0, src1, dst)) {
-        return false;
-    }
-
     return true;
 }
@@ -2096,11 +2080,6 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
         return false;
     }
 
-    // src0, src1 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
-        return false;
-    }
-
     return true;
 }
@@ -2123,11 +2102,6 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
         return false;
     }
 
-    // src0 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0, dst)) {
-        return false;
-    }
-
     return true;
 }
@@ -2160,11 +2134,6 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
         }
     }
 
-    // src0, src1 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0, src1, dst)) {
-        return false;
-    }
-
     return true;
 }
@@ -2213,11 +2182,6 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
         }
     }
 
-    // src0, src1 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0, src1, dst)) {
-        return false;
-    }
-
     return true;
 }
@@ -2268,11 +2232,6 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
         }
     }
 
-    // src0, src1, src2 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
-        return false;
-    }
-
     return true;
 }
@@ -3136,8 +3095,16 @@ static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_repack_buffer_
 static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
     auto sess = static_cast<ggml_hexagon_session *>(dev->context);
 
-    bool supp = false;
+    // src0, src1, src2 & dst must be mapped to the same session
+    if (!hex_supported_buffer(sess, op->src[0], op->src[1], op->src[2], op)) {
+        if (opt_verbose) {
+            HEX_VERBOSE("ggml-hex: %s device-unsupports-op %s : unsupported buffer types\n", sess->name.c_str(),
+                        ggml_op_name(op->op));
+        }
+        return false;
+    };
 
+    bool supp = false;
     switch (op->op) {
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
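
A note on the hoisted check above: the buffer/session test now runs once, uniformly, before any per-op logic, instead of being duplicated in eight per-op predicates. A simplified, self-contained sketch of the resulting control flow — stub types standing in for the real ggml/hexagon structures, with the stub check playing the role of hex_supported_buffer:

    // Stub types; the real code uses ggml_tensor / ggml_hexagon_session.
    struct tensor { int op; const tensor * src[3]; const void * buffer; };
    struct session { const void * owned_buffer; };

    // Every non-null operand must live in a buffer owned by this session.
    static bool buffer_on_session(const session * sess, const tensor * t) {
        return t == nullptr || t->buffer == sess->owned_buffer;
    }

    static bool device_supports_op(const session * sess, const tensor * op) {
        // One early, uniform check replaces the eight per-op copies.
        if (!buffer_on_session(sess, op->src[0]) || !buffer_on_session(sess, op->src[1]) ||
            !buffer_on_session(sess, op->src[2]) || !buffer_on_session(sess, op)) {
            return false;
        }
        // Per-op checks now cover only shapes and types.
        return true;
    }
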
From 48552b192a6704e0a96fccc95dbc130bdd56b1eb Mon Sep 17 00:00:00 2001
From: chraac
Date: Wed, 26 Nov 2025 00:41:03 +0800
Subject: [PATCH 06/12] wip

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index f7dba6ef311..f298c6694e6 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2059,7 +2059,6 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
 static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
     const struct ggml_tensor * src0 = op->src[0];
     const struct ggml_tensor * src1 = op->src[1];
-    const struct ggml_tensor * src2 = op->src[2];
     const struct ggml_tensor * dst  = op;
 
     if (!hex_supported_src0_type(src0->type)) {

From 46ce567afad91e88b1073dca431680607e975122 Mon Sep 17 00:00:00 2001
From: chraac
Date: Wed, 26 Nov 2025 00:56:23 +0800
Subject: [PATCH 07/12] add missing include to fix weak symbol warning

---
 ggml/src/ggml-hexagon/htp-utils.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-hexagon/htp-utils.h b/ggml/src/ggml-hexagon/htp-utils.h
index 1a48f5dcbdf..7bbae3a0b73 100644
--- a/ggml/src/ggml-hexagon/htp-utils.h
+++ b/ggml/src/ggml-hexagon/htp-utils.h
@@ -8,6 +8,7 @@ extern "C" {
 #include 
 #include 
 #include 
+#include 
 #include 
 
 /* Offset to differentiate HLOS and Hexagon error codes.
From 5f9dfe64cff7eb721a8c158e8ab58b2afe063656 Mon Sep 17 00:00:00 2001
From: chraac
Date: Wed, 26 Nov 2025 13:59:30 +0800
Subject: [PATCH 08/12] add ggml_hexagon_op_generic

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 69 ++++++++++++++++++++++++--
 1 file changed, 64 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index f298c6694e6..8cf57f84bfe 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -1598,7 +1598,7 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
     try {
         ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/);
         return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
-    } catch (std::exception const &exc) {
+    } catch (const std::exception & exc) {
         GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
         return nullptr;
     }
@@ -1610,7 +1610,7 @@ static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffe
     try {
         ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/);
         return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
-    } catch (std::exception const &exc) {
+    } catch (const std::exception & exc) {
         GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
         return nullptr;
     }
@@ -1849,7 +1849,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
         repack_buffer_type.iface   = ggml_backend_hexagon_repack_buffer_type_interface;
         repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this);
 
-    } catch (std::exception const &exc) {
+    } catch (const std::exception & exc) {
         release();
         throw;
     }
@@ -2296,6 +2296,65 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer
         (unsigned int) d->size);
 }
 
+typedef size_t (*init_dsp_req_and_buffer_t)(htp_general_req * req,
+                                            dspqueue_buffer (&bufs)[4],
+                                            const struct ggml_tensor * op);
+
+template <bool _IsSrc0Constant, init_dsp_req_and_buffer_t init_req>
+static void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) {
+    const struct ggml_tensor * node = op;
+    const struct ggml_tensor * src0 = node->src[0];
+    const struct ggml_tensor * src1 = node->src[1];
+    const struct ggml_tensor * src2 = node->src[2];
+    const struct ggml_tensor * dst  = node;
+
+    uint64_t t1 = 0;
+    uint64_t t2 = 0;
+
+    t1 = ggml_time_us();
+
+    // Construct HTP message
+    htp_general_req req;
+    req.flags = flags;
+
+    // Use opmask to override flags
+    if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
+        req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
+    }
+    if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
+        req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
+    }
+
+    dspqueue_buffer bufs[4];
+    init_req(&req, buf, op);
+
+    auto * sess = get_session_from_tensor(src0);
+    if (opt_verbose) {
+        hex_print_op_info(op, sess, req.flags);
+        if (opt_verbose > 1) {
+            hex_dump_dspbuf(src0, &bufs[0]);
+            hex_dump_dspbuf(src1, &bufs[1]);
+            hex_dump_dspbuf(src2, &bufs[2]);
+            hex_dump_dspbuf(dst, &bufs[3]);
+        }
+    }
+
+    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
+        sess->enqueue(req, bufs, 4, opt_opsync);
+    }
+
+    t2 = ggml_time_us();
+
+    HEX_PROFILE(
+        "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
+        "call-usec %llu\n",
+        sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
+        (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
+        (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
+        (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
+        (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
+}
+
 template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * node = op;
     const struct ggml_tensor * src0 = node->src[0];
@@ -3247,7 +3306,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
         }
     }
 
-    if(opt_arch < 75) {
+    if (opt_arch < 75) {
         opt_ndev = 1;
         GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
     }
@@ -3260,7 +3319,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
         devices[i].reg = reg;
         try {
             devices[i].context = new ggml_hexagon_session(i, &devices[i]);
-        } catch (std::exception const &exc) {
+        } catch (const std::exception & exc) {
             GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
             devices[i].context = nullptr;
         }
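
A note on the dispatcher above: it takes the request-initializer as a non-type template parameter, so each instantiation bakes in its callback and the compiler can inline it — there is no indirect function-pointer call at runtime. Reduced to a standalone sketch with illustrative names:

    #include <cstdio>

    struct req { int op; };

    // Callback type used as a non-type template parameter.
    typedef size_t (*init_fn)(req *);

    static size_t init_add(req * r) { r->op = 1; return 3; }
    static size_t init_mul(req * r) { r->op = 2; return 3; }

    // Init is a compile-time constant of the instantiation, so the call
    // below can be inlined; no runtime dispatch is involved.
    template <init_fn Init>
    void run_op() {
        req r{};
        size_t n_bufs = Init(&r);
        std::printf("op=%d n_bufs=%zu\n", r.op, n_bufs);
    }

    int main() {
        run_op<init_add>();
        run_op<init_mul>();
        return 0;
    }
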
From 5e27b7f402d448fd4de2a1721b806027c7ebf1ed Mon Sep 17 00:00:00 2001
From: chraac
Date: Wed, 26 Nov 2025 15:31:07 +0800
Subject: [PATCH 09/12] refactor: simplify tensor operation initialization and
 buffer management in hexagon implementation

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 282 ++++++++++---------------
 1 file changed, 111 insertions(+), 171 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 8cf57f84bfe..d481598f28a 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -7,6 +7,7 @@
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -2296,9 +2297,7 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer
         (unsigned int) d->size);
 }
 
-typedef size_t (*init_dsp_req_and_buffer_t)(htp_general_req * req,
-                                            dspqueue_buffer (&bufs)[4],
-                                            const struct ggml_tensor * op);
+typedef size_t (*init_dsp_req_and_buffer_t)(htp_general_req * req, dspqueue_buffer (&bufs)[4], const ggml_tensor * op);
 
 template <bool _IsSrc0Constant, init_dsp_req_and_buffer_t init_req>
 static void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) {
@@ -2326,210 +2325,151 @@ static void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flag
     }
 
     dspqueue_buffer bufs[4];
-    init_req(&req, buf, op);
+    const size_t n_bufs = init_req(&req, bufs, op);
 
     auto * sess = get_session_from_tensor(src0);
     if (opt_verbose) {
         hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
-            hex_dump_dspbuf(src1, &bufs[1]);
-            hex_dump_dspbuf(src2, &bufs[2]);
+            if (src1) {
+                hex_dump_dspbuf(src1, &bufs[1]);
+            }
+            if (src2) {
+                hex_dump_dspbuf(src2, &bufs[2]);
+            }
             hex_dump_dspbuf(dst, &bufs[3]);
         }
     }
 
     if ((opt_opmask & HTP_OPMASK_QUEUE)) {
-        sess->enqueue(req, bufs, 4, opt_opsync);
+        sess->enqueue(req, bufs, n_bufs, opt_opsync);
     }
 
     t2 = ggml_time_us();
 
-    HEX_PROFILE(
-        "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
-        "call-usec %llu\n",
-        sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-        (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-        (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
-        (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-        (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
+    if (src1) {
+        HEX_PROFILE(
+            "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u "
+            "(%f) call-usec %llu\n",
+            sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
+            (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
+            (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
+            (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
+            (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
+    } else {
+        HEX_PROFILE(
+            "ggml-hex: %s %s %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) call-usec "
+            "%llu\n",
+            sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
+            (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
+            (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
+            (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
+    }
 }
 
-template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
-    const struct ggml_tensor * node = op;
-    const struct ggml_tensor * src0 = node->src[0];
-    const struct ggml_tensor * src1 = node->src[1];
-    const struct ggml_tensor * dst  = node;
-
-    uint64_t t1 = 0;
-    uint64_t t2 = 0;
-
-    t1 = ggml_time_us();
-
-    // Construct HTP message
-    htp_general_req req;
-    req.flags = flags;
-
-    // Use opmask to override flags
-    if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
-    }
-    if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
-    }
-
-    switch (node->op) {
-        case GGML_OP_MUL_MAT:
-            req.op = HTP_OP_MUL_MAT;
-            break;
-        case GGML_OP_MUL:
-            req.op = HTP_OP_MUL;
-            break;
-        case GGML_OP_ADD:
-            req.op = HTP_OP_ADD;
-            break;
-        case GGML_OP_SUB:
-            req.op = HTP_OP_SUB;
-            break;
-        default:
-            GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
-    }
-
-    init_htp_tensor(&req.src0, src0);
-    init_htp_tensor(&req.src1, src1);
-    init_htp_tensor(&req.dst, dst);
-
-    dspqueue_buffer bufs[3];
-
-    // Buffer 0 (src0): Weights (mulmat) or First Operand (binary op).
-    // If constant (e.g. weights), no cache management is needed.
-    // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
-    // Note: On platforms with I/O coherency, the framework skips cache ops automatically.
-    dspqueue_buffers_init(bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
-
-    // Buffer 1 (src1): Input Activations (mulmat) or Second Operand (binary op).
-    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
-    dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
-
-    // Buffer 2 (dst): Output Activations.
-    // DSP writes, CPU reads.
-    // We flush CPU caches to ensure consistency before DSP writes.
-    // DSP cache maintenance is handled in the response message.
-    dspqueue_buffers_init(&bufs[2], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
-    auto * sess = get_session_from_tensor(src0);
-
-    if (opt_verbose) {
-        hex_print_op_info(op, sess, req.flags);
-        if (opt_verbose > 1) {
-            hex_dump_dspbuf(src0, &bufs[0]);
-            hex_dump_dspbuf(src1, &bufs[1]);
-            hex_dump_dspbuf(dst, &bufs[2]);
-        }
-    }
-
-    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
-        sess->enqueue(req, bufs, 3, opt_opsync);
-    }
-
-    t2 = ggml_time_us();
-
-    HEX_PROFILE(
-        "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
-        "call-usec %llu\n",
-        sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-        (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-        (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
-        (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-        (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
+template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
+    constexpr const auto init_func = [](htp_general_req * req, dspqueue_buffer (&bufs)[4],
+                                        const ggml_tensor * op) -> size_t {
+        const struct ggml_tensor * node = op;
+        const struct ggml_tensor * src0 = node->src[0];
+        const struct ggml_tensor * src1 = node->src[1];
+        const struct ggml_tensor * dst  = node;
+
+        switch (node->op) {
+            case GGML_OP_MUL_MAT:
+                req->op = HTP_OP_MUL_MAT;
+                break;
+            case GGML_OP_MUL:
+                req->op = HTP_OP_MUL;
+                break;
+            case GGML_OP_ADD:
+                req->op = HTP_OP_ADD;
+                break;
+            case GGML_OP_SUB:
+                req->op = HTP_OP_SUB;
+                break;
+            default:
+                GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
+                break;
+        }
+
+        init_htp_tensor(&req->src0, src0);
+        init_htp_tensor(&req->src1, src1);
+        init_htp_tensor(&req->dst, dst);
+
+        // Buffer 0 (src0): Weights (mulmat) or First Operand (binary op).
+        // If constant (e.g. weights), no cache management is needed.
+        // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
+        // Note: On platforms with I/O coherency, the framework skips cache ops automatically.
+        size_t n_bufs = dspqueue_buffers_init(
+            bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+
+        // Buffer 1 (src1): Input Activations (mulmat) or Second Operand (binary op).
+        // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
+        n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+
+        // Buffer 2 (dst): Output Activations.
+        // DSP writes, CPU reads.
+        // We flush CPU caches to ensure consistency before DSP writes.
+        // DSP cache maintenance is handled in the response message.
+        n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
+
+        return n_bufs;
+    };
+
+    ggml_hexagon_op_generic<_IsSrc0Constant, init_func>(op, flags);
 }
 
-template <bool _IsSrc0Constant> static void ggml_hexagon_binary_id(const struct ggml_tensor * op, uint32_t flags) {
-    const struct ggml_tensor * node = op;
-    const struct ggml_tensor * src0 = node->src[0];
-    const struct ggml_tensor * src1 = node->src[1];
-    const struct ggml_tensor * src2 = node->src[2];
-    const struct ggml_tensor * dst  = node;
-
-    uint64_t t1 = 0;
-    uint64_t t2 = 0;
-
-    t1 = ggml_time_us();
-
-    // Construct HTP message
-    htp_general_req req;
-    req.flags = flags;
-
-    // Use opmask to override flags
-    if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
-    }
-    if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
-        req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
-    }
-
-    switch (node->op) {
-        case GGML_OP_MUL_MAT_ID:
-            req.op = HTP_OP_MUL_MAT_ID;
-            break;
-        case GGML_OP_ADD_ID:
-            req.op = HTP_OP_ADD_ID;
-            break;
-        default:
-            GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
-    }
-
-    init_htp_tensor(&req.src0, src0);
-    init_htp_tensor(&req.src1, src1);
-    init_htp_tensor(&req.src2, src2);
-    init_htp_tensor(&req.dst, dst);
-
-    dspqueue_buffer bufs[4];
-
-    // Buffer 0 (src0): Weights (mulmat) or Input Activations (other op).
-    // If constant, no cache management is needed.
-    // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
-    dspqueue_buffers_init(bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
-
-    // Buffer 1 (src1): Input Activations (mulmat) or Experts Bias (other op).
-    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
-    dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
-
-    // Buffer 2 (src2): Expert IDs (mulmat) or Activated Experts (other op).
-    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
-    dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
-
-    // Buffer 3 (dst): Output Activations.
-    // DSP writes, CPU reads.
-    // We flush CPU caches to ensure consistency before DSP writes.
-    // DSP cache maintenance is handled in the response message.
-    dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
-
-    auto * sess = get_session_from_tensor(src0);
-
-    if (opt_verbose) {
-        hex_print_op_info(op, sess, req.flags);
-        if (opt_verbose > 1) {
-            hex_dump_dspbuf(src0, &bufs[0]);
-            hex_dump_dspbuf(src1, &bufs[1]);
-            hex_dump_dspbuf(src2, &bufs[2]);
-            hex_dump_dspbuf(dst, &bufs[3]);
-        }
-    }
-
-    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
-        sess->enqueue(req, bufs, 4, opt_opsync);
-    }
-
-    t2 = ggml_time_us();
-
-    HEX_PROFILE(
-        "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
-        "call-usec %llu\n",
-        sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-        (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-        (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
-        (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
-        (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
+template <bool _IsSrc0Constant> static void ggml_hexagon_binary_id(const struct ggml_tensor * op, uint32_t flags) {
+    constexpr const auto init_func = [](htp_general_req * req, dspqueue_buffer (&bufs)[4],
+                                        const ggml_tensor * op) -> size_t {
+        const struct ggml_tensor * node = op;
+        const struct ggml_tensor * src0 = node->src[0];
+        const struct ggml_tensor * src1 = node->src[1];
+        const struct ggml_tensor * src2 = node->src[2];
+        const struct ggml_tensor * dst  = node;
+
+        switch (node->op) {
+            case GGML_OP_MUL_MAT_ID:
+                req->op = HTP_OP_MUL_MAT_ID;
+                break;
+            case GGML_OP_ADD_ID:
+                req->op = HTP_OP_ADD_ID;
+                break;
+            default:
+                GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
+        }
+
+        init_htp_tensor(&req->src0, src0);
+        init_htp_tensor(&req->src1, src1);
+        init_htp_tensor(&req->src2, src2);
+        init_htp_tensor(&req->dst, dst);
+
+        // Buffer 0 (src0): Weights (mulmat) or Input Activations (other op).
+        // If constant, no cache management is needed.
+        // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
+        size_t n_bufs = dspqueue_buffers_init(
+            bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+
+        // Buffer 1 (src1): Input Activations (mulmat) or Experts Bias (other op).
+        // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
+        n_bufs += dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+
+        // Buffer 2 (src2): Expert IDs (mulmat) or Activated Experts (other op).
+        // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
+        n_bufs += dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+
+        // Buffer 3 (dst): Output Activations.
+        // DSP writes, CPU reads.
+        // We flush CPU caches to ensure consistency before DSP writes.
+        // DSP cache maintenance is handled in the response message.
+        n_bufs += dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
+
+        return n_bufs;
+    };
+
+    ggml_hexagon_op_generic<_IsSrc0Constant, init_func>(op, flags);
 }
 
 static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
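
A note on the lambdas above: each wrapper now hands a capture-less lambda to the dispatcher as a template argument. A capture-less lambda converts to a plain function pointer, and since C++17 that conversion is usable in constant expressions; the portable C++17 spelling forces it with unary '+', as in this illustrative sketch (not code from the series):

    #include <cstdio>

    typedef int (*init_fn)(int);

    template <init_fn Init>
    int run(int x) {
        return Init(x);
    }

    int main() {
        // A capture-less lambda converts to a function pointer; the unary '+'
        // forces that conversion so the result can serve as a non-type
        // template argument under C++17 rules.
        constexpr auto twice = [](int x) { return 2 * x; };
        std::printf("%d\n", run<+twice>(21)); // prints 42
        return 0;
    }

The next patch replaces these lambdas with named free functions, which avoids leaning on this conversion in template arguments altogether.
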
From 186053c9a8e1c683af17f87b137cb9654d07c9c9 Mon Sep 17 00:00:00 2001
From: chraac
Date: Wed, 26 Nov 2025 15:48:20 +0800
Subject: [PATCH 10/12] refactor: streamline hexagon operation initialization
 and buffer management

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 182 ++++++++++++------------
 1 file changed, 89 insertions(+), 93 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index d481598f28a..1277e698ad8 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2299,8 +2299,8 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer
 
 typedef size_t (*init_dsp_req_and_buffer_t)(htp_general_req * req, dspqueue_buffer (&bufs)[4], const ggml_tensor * op);
 
-template <bool _IsSrc0Constant, init_dsp_req_and_buffer_t init_req>
-static void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) {
+template <bool _IsSrc0Constant, init_dsp_req_and_buffer_t init_req>
+static inline void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * node = op;
     const struct ggml_tensor * src0 = node->src[0];
     const struct ggml_tensor * src1 = node->src[1];
@@ -2368,104 +2368,100 @@ static inline void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32
     }
 }
 
-template <bool _IsSrc0Constant> static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
-    constexpr const auto init_func = [](htp_general_req * req, dspqueue_buffer (&bufs)[4],
-                                        const ggml_tensor * op) -> size_t {
-        const struct ggml_tensor * node = op;
-        const struct ggml_tensor * src0 = node->src[0];
-        const struct ggml_tensor * src1 = node->src[1];
-        const struct ggml_tensor * dst  = node;
-
-        switch (node->op) {
-            case GGML_OP_MUL_MAT:
-                req->op = HTP_OP_MUL_MAT;
-                break;
-            case GGML_OP_MUL:
-                req->op = HTP_OP_MUL;
-                break;
-            case GGML_OP_ADD:
-                req->op = HTP_OP_ADD;
-                break;
-            case GGML_OP_SUB:
-                req->op = HTP_OP_SUB;
-                break;
-            default:
-                GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
-                break;
-        }
+template <bool _IsSrc0Constant>
+static inline size_t init_binary_req_and_bufs(htp_general_req * req,
+                                              dspqueue_buffer (&bufs)[4],
+                                              const ggml_tensor * op) {
+    const struct ggml_tensor * node = op;
+    const struct ggml_tensor * src0 = node->src[0];
+    const struct ggml_tensor * src1 = node->src[1];
+    const struct ggml_tensor * dst  = node;
 
-        init_htp_tensor(&req->src0, src0);
-        init_htp_tensor(&req->src1, src1);
-        init_htp_tensor(&req->dst, dst);
+    switch (node->op) {
+        case GGML_OP_MUL_MAT:
+            req->op = HTP_OP_MUL_MAT;
+            break;
+        case GGML_OP_MUL:
+            req->op = HTP_OP_MUL;
+            break;
+        case GGML_OP_ADD:
+            req->op = HTP_OP_ADD;
+            break;
+        case GGML_OP_SUB:
+            req->op = HTP_OP_SUB;
+            break;
+        default:
+            GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
+            break;
+    }
 
-        // Buffer 0 (src0): Weights (mulmat) or First Operand (binary op).
-        // If constant (e.g. weights), no cache management is needed.
-        // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
-        // Note: On platforms with I/O coherency, the framework skips cache ops automatically.
-        size_t n_bufs = dspqueue_buffers_init(
-            bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+    init_htp_tensor(&req->src0, src0);
+    init_htp_tensor(&req->src1, src1);
+    init_htp_tensor(&req->dst, dst);
 
-        // Buffer 1 (src1): Input Activations (mulmat) or Second Operand (binary op).
-        // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
-        n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+    // Buffer 0 (src0): Weights (mulmat) or First Operand (binary op).
+    // If constant (e.g. weights), no cache management is needed.
+    // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
+    // Note: On platforms with I/O coherency, the framework skips cache ops automatically.
+    size_t n_bufs = dspqueue_buffers_init(
+        bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-        // Buffer 2 (dst): Output Activations.
-        // DSP writes, CPU reads.
-        // We flush CPU caches to ensure consistency before DSP writes.
-        // DSP cache maintenance is handled in the response message.
-        n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
+    // Buffer 1 (src1): Input Activations (mulmat) or Second Operand (binary op).
+    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-        return n_bufs;
-    };
+    // Buffer 2 (dst): Output Activations.
+    // DSP writes, CPU reads.
+    // We flush CPU caches to ensure consistency before DSP writes.
+    // DSP cache maintenance is handled in the response message.
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
 
-    ggml_hexagon_op_generic<_IsSrc0Constant, init_func>(op, flags);
+    return n_bufs;
 }
 
-template <bool _IsSrc0Constant> static void ggml_hexagon_binary_id(const struct ggml_tensor * op, uint32_t flags) {
-    constexpr const auto init_func = [](htp_general_req * req, dspqueue_buffer (&bufs)[4],
-                                        const ggml_tensor * op) -> size_t {
-        const struct ggml_tensor * node = op;
-        const struct ggml_tensor * src0 = node->src[0];
-        const struct ggml_tensor * src1 = node->src[1];
-        const struct ggml_tensor * src2 = node->src[2];
-        const struct ggml_tensor * dst  = node;
-
-        switch (node->op) {
-            case GGML_OP_MUL_MAT_ID:
-                req->op = HTP_OP_MUL_MAT_ID;
-                break;
-            case GGML_OP_ADD_ID:
-                req->op = HTP_OP_ADD_ID;
-                break;
-            default:
-                GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
-        }
+template <bool _IsSrc0Constant>
+static inline size_t init_binary_id_req_and_bufs(htp_general_req * req,
+                                                 dspqueue_buffer (&bufs)[4],
+                                                 const ggml_tensor * op) {
+    const struct ggml_tensor * node = op;
+    const struct ggml_tensor * src0 = node->src[0];
+    const struct ggml_tensor * src1 = node->src[1];
+    const struct ggml_tensor * src2 = node->src[2];
+    const struct ggml_tensor * dst  = node;
 
-        init_htp_tensor(&req->src0, src0);
-        init_htp_tensor(&req->src1, src1);
-        init_htp_tensor(&req->src2, src2);
-        init_htp_tensor(&req->dst, dst);
+    switch (node->op) {
+        case GGML_OP_MUL_MAT_ID:
+            req->op = HTP_OP_MUL_MAT_ID;
+            break;
+        case GGML_OP_ADD_ID:
+            req->op = HTP_OP_ADD_ID;
+            break;
+        default:
+            GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
+    }
 
-        // Buffer 0 (src0): Weights (mulmat) or Input Activations (other op).
-        // If constant, no cache management is needed.
-        // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
-        size_t n_bufs = dspqueue_buffers_init(
-            bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+    init_htp_tensor(&req->src0, src0);
+    init_htp_tensor(&req->src1, src1);
+    init_htp_tensor(&req->src2, src2);
+    init_htp_tensor(&req->dst, dst);
 
-        // Buffer 1 (src1): Input Activations (mulmat) or Experts Bias (other op).
-        // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
-        n_bufs += dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+    // Buffer 0 (src0): Weights (mulmat) or Input Activations (other op).
+    // If constant, no cache management is needed.
+    // Otherwise (CPU writes, DSP reads), we flush CPU caches and invalidate DSP caches.
+    size_t n_bufs = dspqueue_buffers_init(
+        bufs, src0, _IsSrc0Constant ? DSP_BUFFER_TYPE_CONSTANT : DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-        // Buffer 2 (src2): Expert IDs (mulmat) or Activated Experts (other op).
-        // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
-        n_bufs += dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
+    // Buffer 1 (src1): Input Activations (mulmat) or Experts Bias (other op).
+    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
+    n_bufs += dspqueue_buffers_init(&bufs[1], src1, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-        // Buffer 3 (dst): Output Activations.
-        // DSP writes, CPU reads.
-        // We flush CPU caches to ensure consistency before DSP writes.
-        // DSP cache maintenance is handled in the response message.
-        n_bufs += dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
+    // Buffer 2 (src2): Expert IDs (mulmat) or Activated Experts (other op).
+    // CPU writes, DSP reads: flush CPU caches and invalidate DSP caches.
+    n_bufs += dspqueue_buffers_init(&bufs[2], src2, DSP_BUFFER_TYPE_CPU_WRITE_DSP_READ);
 
-        return n_bufs;
-    };
+    // Buffer 3 (dst): Output Activations.
+    // DSP writes, CPU reads.
+    // We flush CPU caches to ensure consistency before DSP writes.
+    // DSP cache maintenance is handled in the response message.
+    n_bufs += dspqueue_buffers_init(&bufs[3], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);
 
-    ggml_hexagon_op_generic<_IsSrc0Constant, init_func>(op, flags);
+    return n_bufs;
 }
@@ -2771,20 +2771,20 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
 
         switch (node->op) {
             case GGML_OP_MUL_MAT:
-                ggml_hexagon_binary<true>(node, flags);
+                ggml_hexagon_op_generic<true, init_binary_req_and_bufs<true>>(node, flags);
                 prev_quant_op = node;
                 break;
             case GGML_OP_MUL_MAT_ID:
-                ggml_hexagon_binary_id<true>(node, flags);
+                ggml_hexagon_op_generic<true, init_binary_id_req_and_bufs<true>>(node, flags);
                 prev_quant_op = node;
                 break;
            case GGML_OP_MUL:
            case GGML_OP_ADD:
            case GGML_OP_SUB:
-                ggml_hexagon_binary<false>(node, flags);
+                ggml_hexagon_op_generic<false, init_binary_req_and_bufs<false>>(node, flags);
                 break;
             case GGML_OP_ADD_ID:
-                ggml_hexagon_binary_id<false>(node, flags);
+                ggml_hexagon_op_generic<false, init_binary_id_req_and_bufs<false>>(node, flags);
                 break;
             case GGML_OP_RMS_NORM:
                 ggml_hexagon_unary(node, flags);
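
A note on the dispatch above: because a function template instantiation is itself an ordinary function, an initializer such as init_binary_req_and_bufs<true> can be passed directly as the dispatcher's template argument — no lambda or wrapper object is needed. A tiny standalone illustration of that property:

    #include <cstdio>

    typedef int (*op_fn)(int);

    template <bool Doubled>
    int scale(int x) {
        return Doubled ? 2 * x : x;
    }

    // F is a compile-time constant, so the call inlines cleanly.
    template <op_fn F>
    int run(int x) {
        return F(x);
    }

    int main() {
        // scale<true> / scale<false> are plain functions and convert to
        // op_fn via the usual function-to-pointer conversion.
        std::printf("%d %d\n", run<scale<true>>(21), run<scale<false>>(21)); // 42 21
        return 0;
    }
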
From 01c06dc7eb15eaff91bb7ffd603c5fc557fd21ab Mon Sep 17 00:00:00 2001
From: chraac
Date: Wed, 26 Nov 2025 18:13:18 +0800
Subject: [PATCH 11/12] refactor: update function signatures and streamline
 request handling in hexagon operations

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 179 +++++--------------------
 1 file changed, 32 insertions(+), 147 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 1277e698ad8..48a1d6e6bef 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2297,9 +2297,11 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer
         (unsigned int) d->size);
 }
 
-typedef size_t (*init_dsp_req_and_buffer_t)(htp_general_req * req, dspqueue_buffer (&bufs)[4], const ggml_tensor * op);
+typedef size_t (*init_dsp_req_and_buffer_func_t)(htp_general_req * req,
+                                                 dspqueue_buffer (&bufs)[4],
+                                                 const ggml_tensor * op);
 
-template <bool _IsSrc0Constant, init_dsp_req_and_buffer_t init_req>
+template <bool _IsSrc0Constant, init_dsp_req_and_buffer_func_t _init_req_func>
 static inline void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * node = op;
     const struct ggml_tensor * src0 = node->src[0];
@@ -2314,6 +2316,7 @@ static inline void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32
 
     // Construct HTP message
     htp_general_req req;
+    memset(&req, 0, sizeof(req));
     req.flags = flags;
 
     // Use opmask to override flags
@@ -2328,7 +2331,7 @@ static inline void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32
     }
 
     dspqueue_buffer bufs[4];
-    const size_t n_bufs = init_req(&req, bufs, op);
+    const size_t n_bufs = _init_req_func(&req, bufs, op);
 
     auto * sess = get_session_from_tensor(src0);
     if (opt_verbose) {
ggml_tensor * src1 = op->src[1]; const struct ggml_tensor * dst = op; - uint64_t t1 = 0; - uint64_t t2 = 0; - - t1 = ggml_time_us(); - - // Construct HTP message - htp_general_req req; - - memset(&req, 0, sizeof(htp_general_req)); - memcpy(&req.op_params, &op->op_params, sizeof(op->op_params)); - req.flags = flags; + memcpy(&req->op_params, &op->op_params, sizeof(op->op_params)); bool supported = false; switch (op->op) { case GGML_OP_RMS_NORM: - req.op = HTP_OP_RMS_NORM; + req->op = HTP_OP_RMS_NORM; supported = true; break; case GGML_OP_UNARY: if (ggml_get_unary_op(dst) == GGML_UNARY_OP_SILU) { - req.op = HTP_OP_UNARY_SILU; + req->op = HTP_OP_UNARY_SILU; supported = true; } break; case GGML_OP_GLU: if (ggml_get_glu_op(dst) == GGML_GLU_OP_SWIGLU) { - req.op = HTP_OP_GLU_SWIGLU; + req->op = HTP_OP_GLU_SWIGLU; supported = true; } else if (ggml_get_glu_op(dst) == GGML_GLU_OP_SWIGLU_OAI) { - req.op = HTP_OP_GLU_SWIGLU_OAI; + req->op = HTP_OP_GLU_SWIGLU_OAI; supported = true; } break; case GGML_OP_SOFT_MAX: - req.op = HTP_OP_SOFTMAX; + req->op = HTP_OP_SOFTMAX; supported = true; default: @@ -2522,22 +2517,12 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { GGML_ABORT("ggml-hex: unary : unsupported op:%d\n", op->op); } - init_htp_tensor(&req.dst, dst); - init_htp_tensor(&req.src0, src0); + init_htp_tensor(&req->dst, dst); + init_htp_tensor(&req->src0, src0); if (src1) { - init_htp_tensor(&req.src1, src1); + init_htp_tensor(&req->src1, src1); } - // Use opmask to override flags - if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { - req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; - } - if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { - req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; - } - - dspqueue_buffer bufs[3]; - // First buffer = Only Operand of Unary op // This is a buffer that the CPU writes and the DSP reads, so we'll // need to flush CPU caches and invalidate DSP ones. On platforms @@ -2559,84 +2544,25 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { // written out before writes from the DSP start. 
 n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);

- // Primary DSP session from the src0 tensor
- auto * sess = get_session_from_tensor(src0);
-
- if (opt_verbose) {
- hex_print_op_info(op, sess, req.flags);
- if (opt_verbose > 1) {
- hex_dump_dspbuf(src0, &bufs[0]);
- if (src1) {
- hex_dump_dspbuf(src1, &bufs[1]);
- hex_dump_dspbuf(dst, &bufs[2]);
- } else {
- hex_dump_dspbuf(dst, &bufs[1]);
- }
- }
- }
-
- if ((opt_opmask & HTP_OPMASK_QUEUE)) {
- sess->enqueue(req, bufs, n_bufs, opt_opsync);
- }
-
- t2 = ggml_time_us();
-
- if (src1) {
- HEX_PROFILE(
- "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u "
- "(%f) call-usec %llu\n",
- sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
- (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
- (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
- (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
- (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
- } else {
- HEX_PROFILE(
- "ggml-hex: %s %s %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) call-usec "
- "%llu\n",
- sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
- (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
- (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
- (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
- }
+ return n_bufs;
 }

-static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
+static inline size_t init_rope_req_and_bufs(htp_general_req * req, dspqueue_buffer (&bufs)[4], const ggml_tensor * op) {
 const struct ggml_tensor * src0 = op->src[0];
 const struct ggml_tensor * src1 = op->src[1];
 const struct ggml_tensor * src2 = op->src[2];
 const struct ggml_tensor * dst = op;

- uint64_t t1 = 0;
- uint64_t t2 = 0;
-
- t1 = ggml_time_us();
+ memcpy(&req->op_params, &op->op_params, sizeof(op->op_params));
+ req->op = HTP_OP_ROPE;

- // Construct HTP message
- htp_general_req req;
-
- memset(&req, 0, sizeof(htp_general_req));
- memcpy(&req.op_params, &op->op_params, sizeof(op->op_params));
- req.flags = flags;
- req.op = HTP_OP_ROPE;
-
- init_htp_tensor(&req.dst, dst);
- init_htp_tensor(&req.src0, src0);
- init_htp_tensor(&req.src1, src1);
+ init_htp_tensor(&req->dst, dst);
+ init_htp_tensor(&req->src0, src0);
+ init_htp_tensor(&req->src1, src1);
 if (src2) {
- init_htp_tensor(&req.src2, src2);
+ init_htp_tensor(&req->src2, src2);
 }

- // Use opmask to override flags
- if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
- req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
- }
- if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
- req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
- }
-
- dspqueue_buffer bufs[4];
-
 // First buffer
 // This is a buffer that the CPU writes and the DSP reads, so we'll
 // need to flush CPU caches and invalidate DSP ones. On platforms
@@ -2665,48 +2591,7 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
 // written out before writes from the DSP start.
 n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, DSP_BUFFER_TYPE_DSP_WRITE_CPU_READ);

- // Primary DSP session from the src0 tensor
- auto * sess = get_session_from_tensor(src0);
-
- if (opt_verbose) {
- hex_print_op_info(op, sess, req.flags);
- if (opt_verbose > 1) {
- hex_dump_dspbuf(src0, &bufs[0]);
- if (src1) {
- hex_dump_dspbuf(src1, &bufs[1]);
- hex_dump_dspbuf(dst, &bufs[2]);
- } else {
- hex_dump_dspbuf(dst, &bufs[1]);
- }
- }
- }
-
- if ((opt_opmask & HTP_OPMASK_QUEUE)) {
- sess->enqueue(req, bufs, n_bufs, opt_opsync);
- }
-
- t2 = ggml_time_us();
-
- if (src2) {
- HEX_PROFILE(
- "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles "
- "%u op-pkts %u (%f) call-usec %llu\n",
- sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
- (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
- (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], src2->name, (uint32_t) src2->ne[0], (uint32_t) src2->ne[1],
- (uint32_t) src2->ne[2], (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
- (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
- (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
- } else {
- HEX_PROFILE(
- "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u "
- "(%f) call-usec %llu\n",
- sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
- (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
- (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
- (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
- (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
- }
+ return n_bufs;
 }

 static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
@@ -2787,25 +2672,25 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
 ggml_hexagon_op_generic<init_binary_id_req_and_bufs<false>>(node, flags);
 break;
 case GGML_OP_RMS_NORM:
- ggml_hexagon_unary(node, flags);
+ ggml_hexagon_op_generic<init_unary_req_and_bufs>(node, flags);
 break;
 case GGML_OP_UNARY:
 if (ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) {
- ggml_hexagon_unary(node, flags);
+ ggml_hexagon_op_generic<init_unary_req_and_bufs>(node, flags);
 }
 break;
 case GGML_OP_GLU:
 if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
 (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI)) {
- ggml_hexagon_unary(node, flags);
+ ggml_hexagon_op_generic<init_unary_req_and_bufs>(node, flags);
 }
 break;
 case GGML_OP_SOFT_MAX:
- ggml_hexagon_unary(node, flags);
+ ggml_hexagon_op_generic<init_unary_req_and_bufs>(node, flags);
 break;
 case GGML_OP_ROPE:
- ggml_hexagon_rope(node, flags);
+ ggml_hexagon_op_generic<init_rope_req_and_bufs>(node, flags);
 break;
 default:

From 97dd2c776ac64723f3addc26bdc7eba9e6adbd6a Mon Sep 17 00:00:00 2001
From: chraac
Date: Wed, 26 Nov 2025 18:17:38 +0800
Subject: [PATCH 12/12] refactor: rename ggml_hexagon_op_generic to ggml_hexagon_dispatch_op

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 48a1d6e6bef..dd3d559df71 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2302,7 +2302,7 @@ typedef size_t (*init_dsp_req_and_buffer_func_t)(htp_general_req * req,
 const ggml_tensor * op);

 template <init_dsp_req_and_buffer_func_t _init_req_func>
-static inline void ggml_hexagon_op_generic(const struct ggml_tensor * op, uint32_t flags) {
+static inline void ggml_hexagon_dispatch_op(const struct ggml_tensor * op, uint32_t flags) {
 const struct ggml_tensor * node = op;
 const struct ggml_tensor * src0 = node->src[0];
 const struct ggml_tensor * src1 = node->src[1];
@@ -2656,41 +2656,41 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg

 switch (node->op) {
 case GGML_OP_MUL_MAT:
- ggml_hexagon_op_generic<init_binary_req_and_bufs<true>>(node, flags);
+ ggml_hexagon_dispatch_op<init_binary_req_and_bufs<true>>(node, flags);
 prev_quant_op = node;
 break;
 case GGML_OP_MUL_MAT_ID:
- ggml_hexagon_op_generic<init_binary_id_req_and_bufs<true>>(node, flags);
+ ggml_hexagon_dispatch_op<init_binary_id_req_and_bufs<true>>(node, flags);
 prev_quant_op = node;
 break;
 case GGML_OP_MUL:
 case GGML_OP_ADD:
 case GGML_OP_SUB:
- ggml_hexagon_op_generic<init_binary_req_and_bufs<false>>(node, flags);
+ ggml_hexagon_dispatch_op<init_binary_req_and_bufs<false>>(node, flags);
 break;
 case GGML_OP_ADD_ID:
- ggml_hexagon_op_generic<init_binary_id_req_and_bufs<false>>(node, flags);
+ ggml_hexagon_dispatch_op<init_binary_id_req_and_bufs<false>>(node, flags);
 break;
 case GGML_OP_RMS_NORM:
- ggml_hexagon_op_generic<init_unary_req_and_bufs>(node, flags);
+ ggml_hexagon_dispatch_op<init_unary_req_and_bufs>(node, flags);
 break;
 case GGML_OP_UNARY:
 if (ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) {
- ggml_hexagon_op_generic<init_unary_req_and_bufs>(node, flags);
+ ggml_hexagon_dispatch_op<init_unary_req_and_bufs>(node, flags);
 }
 break;
 case GGML_OP_GLU:
 if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
 (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI)) {
- ggml_hexagon_op_generic<init_unary_req_and_bufs>(node, flags);
+ ggml_hexagon_dispatch_op<init_unary_req_and_bufs>(node, flags);
 }
 break;
 case GGML_OP_SOFT_MAX:
- ggml_hexagon_op_generic<init_unary_req_and_bufs>(node, flags);
+ ggml_hexagon_dispatch_op<init_unary_req_and_bufs>(node, flags);
 break;
 case GGML_OP_ROPE:
- ggml_hexagon_op_generic<init_rope_req_and_bufs>(node, flags);
+ ggml_hexagon_dispatch_op<init_rope_req_and_bufs>(node, flags);
 break;
 default:
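
The end state of the series is a single dispatch routine, ggml_hexagon_dispatch_op, whose per-op request/buffer setup is supplied as a non-type template parameter of function-pointer type (init_dsp_req_and_buffer_func_t). Below is a minimal standalone C++ sketch of that pattern; the struct layouts, op ids, and function names are illustrative placeholders, not the real ggml-hexagon/HTP types:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    struct req_t { int op; uint32_t flags; };
    struct buf_t { void * ptr; };

    // Counterpart of init_dsp_req_and_buffer_func_t: fill in the request,
    // describe the buffers, and return how many buffers are used.
    typedef size_t (*init_req_func_t)(req_t * req, buf_t (&bufs)[4]);

    size_t init_add_req_and_bufs(req_t * req, buf_t (&bufs)[4]) {
        req->op = 1;           // hypothetical op id
        bufs[0].ptr = nullptr; // input buffer (placeholder)
        bufs[1].ptr = nullptr; // output buffer (placeholder)
        return 2;
    }

    // Counterpart of ggml_hexagon_dispatch_op: the shared prologue/epilogue
    // is written once; op-specific setup is resolved at compile time.
    template <init_req_func_t _init_req_func>
    void dispatch_op(uint32_t flags) {
        req_t req;
        memset(&req, 0, sizeof(req)); // zero-init, as PATCH 11 adds
        req.flags = flags;
        buf_t bufs[4];
        const size_t n_bufs = _init_req_func(&req, bufs);
        printf("op=%d flags=%u n_bufs=%zu\n", req.op, req.flags, n_bufs);
    }

    int main() {
        dispatch_op<init_add_req_and_bufs>(0); // static dispatch, no indirect call
        return 0;
    }

Because the init function is a compile-time template argument rather than a runtime pointer, each instantiation can inline its setup code, which is what lets the per-op wrappers (ggml_hexagon_unary, ggml_hexagon_rope, and friends) collapse into plain init functions without adding an indirect call on the hot path.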
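A second recurring theme in these hunks is the cache policy encoded by the DSP_BUFFER_TYPE_* arguments: a buffer the CPU writes and the DSP reads needs a CPU-cache flush plus a DSP-cache invalidate, while a buffer the DSP writes and the CPU reads only needs the up-front CPU flush, with DSP-side maintenance deferred to the response message. The sketch below restates that mapping as code; the enum and struct names are invented for illustration and are not the dspqueue API:

    #include <cstdio>

    // Invented names; the real flags are the DSP_BUFFER_TYPE_* constants above.
    enum buffer_type { CPU_WRITE_DSP_READ, DSP_WRITE_CPU_READ };

    struct cache_ops {
        bool flush_cpu;      // write out dirty CPU cache lines before the DSP touches the buffer
        bool invalidate_dsp; // drop stale DSP cache lines before the DSP reads
    };

    cache_ops cache_policy(buffer_type t) {
        switch (t) {
            case CPU_WRITE_DSP_READ: return { true, true };  // input activations
            case DSP_WRITE_CPU_READ: return { true, false }; // outputs: DSP side rides the response
        }
        return { false, false }; // unreachable with the two types above
    }

    int main() {
        const cache_ops in  = cache_policy(CPU_WRITE_DSP_READ);
        const cache_ops out = cache_policy(DSP_WRITE_CPU_READ);
        printf("in:  flush_cpu=%d invalidate_dsp=%d\n", in.flush_cpu, in.invalidate_dsp);
        printf("out: flush_cpu=%d invalidate_dsp=%d\n", out.flush_cpu, out.invalidate_dsp);
        return 0;
    }

As the comments in the patches note, on platforms with I/O coherency the framework skips these cache operations where possible, so the policy describes the worst case rather than a guaranteed sequence of maintenance operations.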