From 06253824b9ab9938c80a00dd0dd69f27f3b78b12 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 15 Jul 2025 22:42:01 -0700 Subject: [PATCH 01/37] model: add support for extra bufs for all devices --- src/llama-model.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e4609963300..7420a3176d9 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -404,6 +404,19 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s // add the device default buffer type buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev)); + // add the device extra buffer type (if any) + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t) + ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts"); + + if (ggml_backend_dev_get_extra_bufts_fn) { + ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev); + while (extra_bufts && *extra_bufts) { + buft_list.emplace_back(dev, *extra_bufts); + ++extra_bufts; + } + } + return buft_list; } From 80dc8e80edcc862aeade7519e8e6de7ea43986dc Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Mon, 23 Jun 2025 14:11:12 -0700 Subject: [PATCH 02/37] hexagon: add experimental ggml-hexagon backend for the Hexagon NPU This commit introduces a new experimental backend `ggml-hexagon` with support for the Hexagon NPU. Highlights: - Supports Hexagon versions: v73, v75, v79, and v81 - Targets Android devices based on Snapdragon SoCs: Gen3, 8-Elite, and 8-Elite Gen5 - Supports Q4_0, Q8_0, MXFP4, and FP32 data types - Implements core LLM ops: MUL_MAT/MUL_MAT_ID, ADD/SUB/MUL/ADD_ID, RMS_NORM, ROPE, GLU/SWIGLU, SOFTMAX **Note:** This backend is experimental and may exhibit instability or limited performance across supported devices. It is intended for early testing and feedback from llama.cpp/ggml developer and user community. 
Co-Authored-By: Rajdeep Ganguly Co-Authored-By: Todor Boinovski --- CODEOWNERS | 1 + docs/backend/hexagon/CMakeUserPresets.json | 47 + docs/backend/hexagon/README.md | 222 + docs/backend/hexagon/developer.md | 105 + docs/backend/hexagon/llama-cli.farf | 1 + docs/backend/hexagon/run-bench.sh | 36 + docs/backend/hexagon/run-cli.sh | 52 + docs/backend/hexagon/run-tool.sh | 51 + ggml/CMakeLists.txt | 2 + ggml/include/ggml-hexagon.h | 19 + ggml/src/CMakeLists.txt | 2 + ggml/src/ggml-backend-reg.cpp | 8 + ggml/src/ggml-hexagon/CMakeLists.txt | 67 + ggml/src/ggml-hexagon/ggml-hexagon.cpp | 3581 +++++++++++++++++ ggml/src/ggml-hexagon/htp-utils.c | 442 ++ ggml/src/ggml-hexagon/htp-utils.h | 219 + ggml/src/ggml-hexagon/htp/CMakeLists.txt | 40 + ggml/src/ggml-hexagon/htp/act-ops.c | 448 +++ ggml/src/ggml-hexagon/htp/binary-ops.c | 344 ++ .../ggml-hexagon/htp/cmake-toolchain.cmake | 157 + ggml/src/ggml-hexagon/htp/htp-ctx.h | 40 + ggml/src/ggml-hexagon/htp/htp-dma.c | 69 + ggml/src/ggml-hexagon/htp/htp-dma.h | 119 + ggml/src/ggml-hexagon/htp/htp-msg.h | 156 + ggml/src/ggml-hexagon/htp/htp-ops.h | 53 + ggml/src/ggml-hexagon/htp/htp_iface.idl | 16 + ggml/src/ggml-hexagon/htp/hvx-exp.c | 80 + ggml/src/ggml-hexagon/htp/hvx-inverse.c | 60 + ggml/src/ggml-hexagon/htp/hvx-sigmoid.c | 49 + ggml/src/ggml-hexagon/htp/hvx-utils.c | 944 +++++ ggml/src/ggml-hexagon/htp/hvx-utils.h | 981 +++++ ggml/src/ggml-hexagon/htp/main.c | 945 +++++ ggml/src/ggml-hexagon/htp/matmul-ops.c | 2215 ++++++++++ ggml/src/ggml-hexagon/htp/ops-utils.h | 116 + ggml/src/ggml-hexagon/htp/rope-ops.c | 418 ++ ggml/src/ggml-hexagon/htp/softmax-ops.c | 402 ++ ggml/src/ggml-hexagon/htp/unary-ops.c | 255 ++ ggml/src/ggml-hexagon/htp/worker-pool.c | 297 ++ ggml/src/ggml-hexagon/htp/worker-pool.h | 57 + 39 files changed, 13116 insertions(+) create mode 100644 docs/backend/hexagon/CMakeUserPresets.json create mode 100644 docs/backend/hexagon/README.md create mode 100644 docs/backend/hexagon/developer.md create mode 100644 docs/backend/hexagon/llama-cli.farf create mode 100755 docs/backend/hexagon/run-bench.sh create mode 100755 docs/backend/hexagon/run-cli.sh create mode 100755 docs/backend/hexagon/run-tool.sh create mode 100644 ggml/include/ggml-hexagon.h create mode 100644 ggml/src/ggml-hexagon/CMakeLists.txt create mode 100644 ggml/src/ggml-hexagon/ggml-hexagon.cpp create mode 100644 ggml/src/ggml-hexagon/htp-utils.c create mode 100644 ggml/src/ggml-hexagon/htp-utils.h create mode 100644 ggml/src/ggml-hexagon/htp/CMakeLists.txt create mode 100644 ggml/src/ggml-hexagon/htp/act-ops.c create mode 100644 ggml/src/ggml-hexagon/htp/binary-ops.c create mode 100644 ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake create mode 100644 ggml/src/ggml-hexagon/htp/htp-ctx.h create mode 100644 ggml/src/ggml-hexagon/htp/htp-dma.c create mode 100644 ggml/src/ggml-hexagon/htp/htp-dma.h create mode 100644 ggml/src/ggml-hexagon/htp/htp-msg.h create mode 100644 ggml/src/ggml-hexagon/htp/htp-ops.h create mode 100644 ggml/src/ggml-hexagon/htp/htp_iface.idl create mode 100644 ggml/src/ggml-hexagon/htp/hvx-exp.c create mode 100644 ggml/src/ggml-hexagon/htp/hvx-inverse.c create mode 100644 ggml/src/ggml-hexagon/htp/hvx-sigmoid.c create mode 100644 ggml/src/ggml-hexagon/htp/hvx-utils.c create mode 100644 ggml/src/ggml-hexagon/htp/hvx-utils.h create mode 100644 ggml/src/ggml-hexagon/htp/main.c create mode 100644 ggml/src/ggml-hexagon/htp/matmul-ops.c create mode 100644 ggml/src/ggml-hexagon/htp/ops-utils.h create mode 100644 ggml/src/ggml-hexagon/htp/rope-ops.c create mode 
100644 ggml/src/ggml-hexagon/htp/softmax-ops.c create mode 100644 ggml/src/ggml-hexagon/htp/unary-ops.c create mode 100644 ggml/src/ggml-hexagon/htp/worker-pool.c create mode 100644 ggml/src/ggml-hexagon/htp/worker-pool.h diff --git a/CODEOWNERS b/CODEOWNERS index f833fb7cf48..53d2e1e7ed4 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -65,6 +65,7 @@ /ggml/src/ggml-impl.h @ggerganov @slaren /ggml/src/ggml-metal/ @ggerganov /ggml/src/ggml-opencl/ @lhez @max-krasnyansky +/ggml/src/ggml-hexagon/ @max-krasnyansky /ggml/src/ggml-opt.cpp @JohannesGaessler /ggml/src/ggml-quants.* @ggerganov /ggml/src/ggml-rpc/ @rgerganov diff --git a/docs/backend/hexagon/CMakeUserPresets.json b/docs/backend/hexagon/CMakeUserPresets.json new file mode 100644 index 00000000000..d413ec46fca --- /dev/null +++ b/docs/backend/hexagon/CMakeUserPresets.json @@ -0,0 +1,47 @@ +{ + "version": 4, + "configurePresets": [ + { + "name": "arm64-android-snapdragon", + "hidden": true, + "architecture": { "value": "arm64", "strategy": "external" }, + "toolset": { "value": "host=x86_64", "strategy": "external" }, + "cacheVariables": { + "ANDROID_ABI": "arm64-v8a", + "ANDROID_PLATFORM": "android-31", + "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK}/build/cmake/android.toolchain.cmake", + "CMAKE_C_FLAGS": "-march=armv8.7a -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_CXX_FLAGS": "-march=armv8.7a -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", + "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", + "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", + "CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", + "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}", + "PREBUILT_LIB_DIR": "android_aarch64", + "GGML_OPENMP": "OFF", + "GGML_OPENCL": "ON", + "GGML_HEXAGON": "ON", + "LLAMA_CURL": "OFF" + } + }, + + { + "name": "arm64-windows-snapdragon", + "inherits": [ "base", "arm64-windows-llvm" ], + "cacheVariables": { + "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}", + "PREBUILT_LIB_DIR": "windows_aarch64", + "GGML_OPENMP": "OFF", + "GGML_OPENCL": "ON", + "GGML_HEXAGON": "ON", + "LLAMA_CURL": "OFF" + } + }, + + { "name": "arm64-android-snapdragon-debug" , "inherits": [ "base", "arm64-android-snapdragon", "debug" ] }, + { "name": "arm64-android-snapdragon-release", "inherits": [ "base", "arm64-android-snapdragon", "release" ] }, + + { "name": "arm64-windows-snapdragon-debug" , "inherits": [ "base", "arm64-windows-snapdragon", "debug" ] }, + { "name": "arm64-windows-snapdragon-release", "inherits": [ "base", "arm64-windows-snapdragon", "release" ] } + ] +} diff --git a/docs/backend/hexagon/README.md b/docs/backend/hexagon/README.md new file mode 100644 index 00000000000..db22d47bf0f --- /dev/null +++ b/docs/backend/hexagon/README.md @@ -0,0 +1,222 @@ +# Snapdragon-based Android devices + +## How to Build + +The easiest way to build llama.cpp for a Snapdragon-based Android device is using the toolchain Docker image (see github.com/snapdragon-toolchain). +This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc. + +This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop. + + ~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.1 + [d]/> cd /workspace + +The rest of the Android build process assumes that you're running inside the toolchain container. 
+Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets: + + [d]/workspace> cp docs/backend/hexagon/CMakeUserPresets.json . + + [d]/workspace> cmake --preset arm64-android-snapdragon-release -B build-snapdragon + Preset CMake variables: + ANDROID_ABI="arm64-v8a" + ... + CMAKE_TOOLCHAIN_FILE="/opt/android-ndk-r28b/build/cmake/android.toolchain.cmake" + GGML_HEXAGON="ON" + GGML_OPENCL="ON" + GGML_OPENMP="OFF" + HEXAGON_SDK_ROOT="/opt/hexagon/6.3.0.0" + ... + -- Including OpenCL backend + -- Including Hexagon backend + ... + -- Build files have been written to: /workspace/build-snapdragon + + [d]/workspace> cmake --build build-snapdragon + ... + [144/356] Performing build step for 'htp-v73' + [1/16] Generating htp_iface_skel.c, htp_iface_stub.c, htp_iface.h + [2/16] Building C object CMakeFiles/ggml-htp-v73.dir/hvx-sigmoid.c.obj + [3/16] Building C object CMakeFiles/ggml-htp-v73.dir/htp-dma.c.obj + [4/16] Building C object CMakeFiles/ggml-htp-v73.dir/worker-pool.c.obj + ... + -- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v73.so + -- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v75.so + ... + +To generate an installable "package" simply use cmake --install: + + [d]/workspace> cmake --install build-snapdragon --prefix pkg-snapdragon + -- Install configuration: "Release" + -- Installing: /workspace/pkg-snapdragon/lib/libggml-cpu.so + -- Installing: /workspace/pkg-snapdragon/lib/libggml-opencl.so + -- Installing: /workspace/pkg-snapdragon/lib/libggml-hexagon.so + -- Installing: /workspace/pkg-snapdragon/lib/libggml-htp-v73.so + -- Installing: /workspace/pkg-snapdragon/lib/libggml-htp-v75.so + -- Installing: /workspace/pkg-snapdragon/lib/libggml-htp-v79.so + -- Installing: /workspace/pkg-snapdragon/lib/libggml-htp-v81.so + -- Installing: /workspace/pkg-snapdragon/lib/libggml.so + ... + -- Installing: /workspace/pkg-snapdragon/bin/llama-bench + -- Installing: /workspace/pkg-snapdragon/bin/llama-cli + ... + +## How to Install + +For this step, your device needs to be configured for on-device development. +Please see https://developer.android.com/studio/debug/dev-options for details. + +Once ADB is enabled, use `adb push` to install `pkg-snapdragon` on the device. +**Note that the toolchain Docker image doesn't have ADB and doesn't set up the ADB bridge. Please use native ADB on the host.** + + ~/src/llama.cpp$ adb push pkg-snapdragon/* /data/local/tmp/llama.cpp + pkg-snapdragon/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s) + pkg-snapdragon/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s) + pkg-snapdragon/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s) + 102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s) + +At this point, you should also install some models: + + ~/src/llama.cpp$ wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf + ... + 2025-10-11 12:04:52 (10.7 MB/s) - ‘Llama-3.2-1B-Instruct-Q4_0.gguf’ saved [773025920/773025920] + + ~/src/llama.cpp$ adb push Llama-3.2-1B-Instruct-Q4_0.gguf /data/local/tmp/gguf + Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920 bytes in 19.250s) + +## How to Run + +The easiest way to run llama.cpp cli tools is using provided wrapper scripts that properly set up all required environment variables. 
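+
+For reference, the wrapper scripts roughly expand to a plain `adb shell` invocation along the lines of the sketch below (the model path and `HTP0` device are illustrative, taken from the examples in this guide; see `run-cli.sh` for the complete set of options and environment variables the wrappers actually set):
+
+    adb shell " \
+        cd /data/local/tmp/llama.cpp; \
+        LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib \
+        ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib \
+        ./bin/llama-cli -m /data/local/tmp/gguf/Llama-3.2-1B-Instruct-Q4_0.gguf \
+            -ngl 99 --device HTP0 \
+    "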
+ +llama.cpp supports three backends on Snapdragon-based devices: CPU, Adreno GPU (GPUOpenCL), and Hexagon NPU (HTP0-4). +You can select which backend to run the model on using the `D=` variable, which maps to the `--device` option. + +Hexagon NPU behaves as a "GPU" device when it comes to `-ngl` and other offload-related options. + +Here are some examples of running various llama.cpp tools via ADB. + +Simple question for Llama-3.2-1B + + ~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./docs/backend/snapdragon/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?" + ... + ggml-hex: HTP arch version v79 + ggml-hex: allocating new session: HTP0 + ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb4000072c7955e50 + ... + load_tensors: offloading output layer to GPU + load_tensors: offloaded 17/17 layers to GPU + load_tensors: CPU model buffer size = 225.49 MiB + load_tensors: HTP0 model buffer size = 0.26 MiB + load_tensors: HTP0-REPACK model buffer size = 504.00 MiB + ... + I hope this helps you understand the world's most popular cookies! [end of text] + ... + llama_perf_sampler_print: sampling time = 30.08 ms / 487 runs ( 0.06 ms per token, 16191.77 tokens per second) + llama_perf_context_print: load time = 617.94 ms + llama_perf_context_print: prompt eval time = 80.76 ms / 11 tokens ( 7.34 ms per token, 136.21 tokens per second) + llama_perf_context_print: eval time = 9210.59 ms / 475 runs ( 19.39 ms per token, 51.57 tokens per second) + llama_perf_context_print: total time = 9454.92 ms / 486 tokens + llama_perf_context_print: graphs reused = 473 + llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | + llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | + llama_memory_breakdown_print: | - Host | 439 = 225 + 136 + 77 | + llama_memory_breakdown_print: | - HTP0-REPACK | 504 = 504 + 0 + 0 | + +Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices + + M=../gguf/OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 docs/backend/hexagon/run-cli.sh -f surfing.txt -no-cnv + ... + ggml-hex: allocating new registry : ndev 2 + ggml-hex: HTP arch version v81 + ggml-hex: allocating new session: HTP0 + ggml-hex: allocating new session: HTP1 + ... + load_tensors: offloading output layer to GPU + load_tensors: offloaded 17/17 layers to GPU + load_tensors: CPU model buffer size = 143.86 MiB + load_tensors: HTP1 model buffer size = 0.23 MiB + load_tensors: HTP1-REPACK model buffer size = 1575.00 MiB + load_tensors: HTP0 model buffer size = 0.28 MiB + load_tensors: HTP0-REPACK model buffer size = 2025.00 MiB + ... + llama_context: CPU output buffer size = 0.19 MiB + llama_kv_cache: HTP1 KV buffer size = 238.00 MiB + llama_kv_cache: HTP0 KV buffer size = 306.00 MiB + llama_kv_cache: size = 544.00 MiB ( 8192 cells, 16 layers, 1/1 seqs), K (q8_0): 272.00 MiB, V (q8_0): 272.00 MiB + llama_context: HTP0 compute buffer size = 15.00 MiB + llama_context: HTP1 compute buffer size = 15.00 MiB + llama_context: CPU compute buffer size = 24.56 MiB + ... 
+ llama_perf_context_print: prompt eval time = 1730.57 ms / 212 tokens ( 8.16 ms per token, 122.50 tokens per second) + llama_perf_context_print: eval time = 5624.75 ms / 257 runs ( 21.89 ms per token, 45.69 tokens per second) + llama_perf_context_print: total time = 7377.33 ms / 469 tokens + llama_perf_context_print: graphs reused = 255 + llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | + llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | + llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | + llama_memory_breakdown_print: | - Host | 742 = 144 + 544 + 54 | + llama_memory_breakdown_print: | - HTP1-REPACK | 1575 = 1575 + 0 + 0 | + llama_memory_breakdown_print: | - HTP0-REPACK | 2025 = 2025 + 0 + 0 | + +Op test for MUL_MAT + + ~/src/llama.cpp$ HB=0 docs/backend/snapdragon/run-tool.sh test-backend-ops -b HTP0 -o MUL_MAT + ... + Backend 2/3: HTP0 + Device description: Hexagon + Device memory: 2048 MB (2048 MB free) + MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK + MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK + MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK + + ~/src/llama.cpp-hexagon$ M=Llama-3.2-1B-Instruct-Q4_0.gguf docs/backend/hexagon/run-bench.sh -p 128 -n 64 + ... + ggml-hex: allocating new registry : ndev 1 + ggml-hex: HTP arch version v79 + ggml-hex: allocating new session: HTP0 + ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb400007d4b231090 + | model | size | params | backend | ngl | threads | n_batch | mmap | test | t/s | + | ---------------| ---------: | -----: | ---------- | --: | ------: | ------: | ---: | ----: | ------------: | + | llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | pp128 | 169.42 ± 1.75 | + | llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | tg64 | 51.54 ± 1.13 | + + build: 6a8cf8914 (6733) + +## Environment variables + +- `GGML_HEXAGON_NDEV=1` + Controls the number of devices/sessions to allocate. The default is 1. + Most quantized models under 4B fit into a single session; an 8B model needs two, and a 20B model needs four. + +- `GGML_HEXAGON_NHVX=0` + Controls the number of HVX hardware threads to use. The default is all (actual number varies depending on the hardware version). + +- `GGML_HEXAGON_HOSTBUF=1` + Controls whether the Hexagon backend allocates host buffers. By default, all buffers except for REPACK are host buffers. + This option is required for testing Ops that require REPACK buffers (MUL_MAT and MUL_MAT_ID). + +- `GGML_HEXAGON_VERBOSE=1` + Enables verbose logging of Ops from the backend. 
Example output: + + ``` + ggml-hex: HTP0 graph-compute n_nodes 2 + ggml-hex: HTP0 matmul : blk.27.ffn_up.weight x ffn_norm-27 -> ffn_up-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x1 + ggml-hex: HTP0 matmul : blk.27.ffn_gate.weight x ffn_norm-27 -> ffn_gate-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x3 + ggml-hex: HTP0 graph-compute n_nodes 1 + ggml-hex: HTP0 matmul : blk.27.ffn_down.weight x ffn_gate_par-27 -> ffn_out-27 : 8192:3072 x 8192:1 -> 3072:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x0 + ggml-hex: HTP0 get-tensor result_output : data 0x7592487000 offset 0 size 513024 + ``` + +- `GGML_HEXAGON_PROFILE=1` + Generates a host-side profile for the ggml-hexagon Ops. + +- `GGML_HEXAGON_OPMASK=0x0` + Allows enabling specific stages of the processing pipeline: + + - `0x1` Enable Op Queue (i.e., queuing Ops into the NPU) + - `0x2` Enable Dynamic Quantizer (if needed for the Op) + - `0x4` Enable Op Compute (MUL_MAT, etc.) + + Examples: + + `GGML_HEXAGON_OPMASK=0x1 llama-cli ...` - Ops are enqueued but NPU-side processing is stubbed out + `GGML_HEXAGON_OPMASK=0x3 llama-cli ...` - NPU performs dynamic quantization and skips the rest + `GGML_HEXAGON_OPMASK=0x7 llama-cli ...` - Full queuing and processing of Ops (default) diff --git a/docs/backend/hexagon/developer.md b/docs/backend/hexagon/developer.md new file mode 100644 index 00000000000..55164f92acb --- /dev/null +++ b/docs/backend/hexagon/developer.md @@ -0,0 +1,105 @@ +# Hexagon backend developer details + +## Backend libraries + +The Hexagon backend consists of two parts: + + - `libggml-hexagon` + This is the regular CPU-side GGML backend library, either shared or statically linked. + + - `libggml-htp-vNN` + This is the NPU-side shared library (HTP stands for Hexagon Tensor Processor) that contains the Op dispatcher and kernels. + The correct library is selected automatically at runtime based on the HW version. + +Here is an example of the build artifacts: + + ~/src/llama.cpp$ ls -l pkg-snapdragon/lib/libggml* + pkg-snapdragon/lib/libggml-base.so + pkg-snapdragon/lib/libggml-cpu.so + pkg-snapdragon/lib/libggml-hexagon.so <<< CPU library + pkg-snapdragon/lib/libggml-htp-v73.so <<< HTP op/kernels for Hexagon v73 + pkg-snapdragon/lib/libggml-htp-v75.so + pkg-snapdragon/lib/libggml-htp-v79.so + pkg-snapdragon/lib/libggml-htp-v81.so + +## Memory buffers + +The Hexagon NPU backend takes advantage of Snapdragon's unified memory model, where all buffers are fully accessible by the CPU and the NPU. +The NPU does have a dedicated tightly-coupled memory called VTCM, but that memory is used only for intermediate data (e.g. dynamically +quantized tensors) or temporary data (chunks of the weight tensors fetched via DMA). + +Please note that currently the Hexagon backend does not implement the SET/GET_ROWS Ops because there is no advantage in offloading those +to the NPU at this point. + +The backend does allocate non-host buffers for tensors with data types that require repacking: Q4_0, Q8_0, MXFP4. +From the MMU perspective these buffers are still regular buffers (normal access by the CPU); they are marked as non-host simply to force +the repacking. + +## Large model handling + +A Hexagon NPU session (aka a Process Domain (PD) in the Hexagon docs) is limited to a memory mapping of around 3.5GB. +In llama.cpp/GGML each Hexagon session is mapped to a single GGML backend device (HTP0, HTP1, etc.).
+ +In order to map models larger than 3.5GB we need to allocate multiple devices and split the model. +For this we're taking advantage of the llama.cpp/GGML multi-GPU layer-splitting support. +Each Hexagon device behaves like a GPU from the offload and model splitting perspective. + +Here is an example of running GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR. + + M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt docs/backend/hexagon/run-cli.sh -no-cnv -f surfing.txt -n 32 + ... + LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib + ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib + GGML_HEXAGON_NDEV=4 ./bin/llama-cli --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf + -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on -ngl 99 --device HTP0,HTP1,HTP2,HTP3 -no-cnv -f surfing.txt + ... + llama_model_loader: - type f32: 289 tensors + llama_model_loader: - type q4_0: 96 tensors + llama_model_loader: - type q8_0: 2 tensors + llama_model_loader: - type mxfp4: 72 tensors + ... + load_tensors: offloaded 25/25 layers to GPU + load_tensors: CPU model buffer size = 1182.09 MiB + load_tensors: HTP1 model buffer size = 6.64 MiB + load_tensors: HTP1-REPACK model buffer size = 2505.94 MiB + load_tensors: HTP3 model buffer size = 5.55 MiB + load_tensors: HTP3-REPACK model buffer size = 2088.28 MiB + load_tensors: HTP0 model buffer size = 7.75 MiB + load_tensors: HTP0-REPACK model buffer size = 2923.59 MiB + load_tensors: HTP2 model buffer size = 6.64 MiB + load_tensors: HTP2-REPACK model buffer size = 2505.94 MiB + ... + llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized + llama_context: CPU output buffer size = 0.77 MiB + llama_kv_cache_iswa: creating non-SWA KV cache, size = 8192 cells + llama_kv_cache: HTP1 KV buffer size = 25.50 MiB + llama_kv_cache: HTP3 KV buffer size = 25.50 MiB + llama_kv_cache: HTP0 KV buffer size = 25.50 MiB + llama_kv_cache: HTP2 KV buffer size = 25.50 MiB + llama_kv_cache: size = 102.00 MiB ( 8192 cells, 12 layers, 1/1 seqs), K (q8_0): 51.00 MiB, V (q8_0): 51.00 MiB + llama_kv_cache_iswa: creating SWA KV cache, size = 256 cells + llama_kv_cache: HTP1 KV buffer size = 0.80 MiB + llama_kv_cache: HTP3 KV buffer size = 0.53 MiB + llama_kv_cache: HTP0 KV buffer size = 1.06 MiB + llama_kv_cache: HTP2 KV buffer size = 0.80 MiB + llama_kv_cache: size = 3.19 MiB ( 256 cells, 12 layers, 1/1 seqs), K (q8_0): 1.59 MiB, V (q8_0): 1.59 MiB + llama_context: HTP0 compute buffer size = 16.06 MiB + llama_context: HTP1 compute buffer size = 16.06 MiB + llama_context: HTP2 compute buffer size = 16.06 MiB + llama_context: HTP3 compute buffer size = 16.06 MiB + llama_context: CPU compute buffer size = 98.19 MiB + ... 
+ llama_perf_context_print: prompt eval time = 3843.67 ms / 197 tokens ( 19.51 ms per token, 51.25 tokens per second) + llama_perf_context_print: eval time = 1686.13 ms / 31 runs ( 54.39 ms per token, 18.39 tokens per second) + llama_perf_context_print: total time = 6266.30 ms / 228 tokens + llama_perf_context_print: graphs reused = 30 + llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | + llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | + llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | + llama_memory_breakdown_print: | - HTP2 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | + llama_memory_breakdown_print: | - HTP3 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | + llama_memory_breakdown_print: | - Host | 1476 = 1208 + 105 + 162 | + llama_memory_breakdown_print: | - HTP1-REPACK | 2505 = 2505 + 0 + 0 | + llama_memory_breakdown_print: | - HTP3-REPACK | 2088 = 2088 + 0 + 0 | + llama_memory_breakdown_print: | - HTP0-REPACK | 2923 = 2923 + 0 + 0 | + llama_memory_breakdown_print: | - HTP2-REPACK | 2505 = 2505 + 0 + 0 | diff --git a/docs/backend/hexagon/llama-cli.farf b/docs/backend/hexagon/llama-cli.farf new file mode 100644 index 00000000000..de84fe89adb --- /dev/null +++ b/docs/backend/hexagon/llama-cli.farf @@ -0,0 +1 @@ +0xffff diff --git a/docs/backend/hexagon/run-bench.sh b/docs/backend/hexagon/run-bench.sh new file mode 100755 index 00000000000..1e8a33abc0c --- /dev/null +++ b/docs/backend/hexagon/run-bench.sh @@ -0,0 +1,36 @@ +#!/bin/sh +# + +# Basedir on device +basedir=/data/local/tmp/llama.cpp + +branch=. +[ "$B" != "" ] && branch=$B + +adbserial= +[ "$S" != "" ] && adbserial="-s $S" + +model="Llama-3.2-3B-Instruct-Q4_0.gguf" +[ "$M" != "" ] && model="$M" + +device="HTP0" +[ "$D" != "" ] && device="$D" + +verbose="" +[ "$V" != "" ] && verbose="$V" + +nhvx= +[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" + +ndev= +[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV" + +set -x + +adb $adbserial shell " \ + cd $basedir; \ + LD_LIBRARY_PATH=$basedir/$branch/lib \ + ADSP_LIBRARY_PATH=$basedir/$branch/lib \ + $ndev $nhvx ./$branch/bin/llama-bench --mmap 0 -m $basedir/../gguf/$model \ + -t 4 --batch-size 128 -ngl 99 $@ \ +" diff --git a/docs/backend/hexagon/run-cli.sh b/docs/backend/hexagon/run-cli.sh new file mode 100755 index 00000000000..a236bcd43b1 --- /dev/null +++ b/docs/backend/hexagon/run-cli.sh @@ -0,0 +1,52 @@ +#!/bin/sh +# + +# Basedir on device +basedir=/data/local/tmp/llama.cpp + +cli_opts= + +branch=. 
+[ "$B" != "" ] && branch=$B + +adbserial= +[ "$S" != "" ] && adbserial="-s $S" + +model="Llama-3.2-3B-Instruct-Q4_0.gguf" +[ "$M" != "" ] && model="$M" + +device="HTP0" +[ "$D" != "" ] && device="$D" + +verbose= +[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" + +experimental= +[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E" + +sched= +[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" + +profile= +[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" + +opmask= +[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" + +nhvx= +[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" + +ndev= +[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV" + +set -x + +adb $adbserial shell " \ + cd $basedir; ulimit -c unlimited; \ + LD_LIBRARY_PATH=$basedir/$branch/lib \ + ADSP_LIBRARY_PATH=$basedir/$branch/lib \ + $verbose $experimental $sched $opmask $profile $nhvx $ndev \ + ./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \ + -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \ + -ngl 99 --device $device $cli_opts $@ \ +" diff --git a/docs/backend/hexagon/run-tool.sh b/docs/backend/hexagon/run-tool.sh new file mode 100755 index 00000000000..71ac1dc98c3 --- /dev/null +++ b/docs/backend/hexagon/run-tool.sh @@ -0,0 +1,51 @@ +#!/bin/sh +# + +# Basedir on device +basedir=/data/local/tmp/llama.cpp + +cli_opts= + +branch=. +[ "$B" != "" ] && branch=$B + +adbserial= +[ "$S" != "" ] && adbserial="-s $S" + +device="HTP0" +[ "$D" != "" ] && device="$D" + +verbose= +[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" + +experimental= +[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$V" + +sched= +[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" + +profile= +[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" + +opmask= +[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" + +nhvx= +[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" + +ndev= +[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV" + +hb= +[ "$HB" != "" ] && hb="GGML_HEXAGON_HOSTBUF=$HB" + +set -x + +tool=$1; shift + +adb $adbserial shell " \ + cd $basedir; ulimit -c unlimited; \ + LD_LIBRARY_PATH=$basedir/$branch/lib \ + ADSP_LIBRARY_PATH=$basedir/$branch/lib \ + $verbose $experimental $sched $opmask $profile $nhvx $ndev $hb ./$branch/bin/$tool $@ \ +" diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 73032be68e1..181f179ed17 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -251,6 +251,8 @@ option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adr set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING "gmml: OpenCL API version to target") +option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF) + # toolchain for vulkan-shaders-gen set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen") diff --git a/ggml/include/ggml-hexagon.h b/ggml/include/ggml-hexagon.h new file mode 100644 index 00000000000..bdb432be949 --- /dev/null +++ b/ggml/include/ggml-hexagon.h @@ -0,0 +1,19 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// backend API +GGML_API ggml_backend_t ggml_backend_hexagon_init(void); + +GGML_API bool ggml_backend_is_hexagon(ggml_backend_t backend); + +GGML_API ggml_backend_reg_t ggml_backend_hexagon_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 3356ef550de..8f60fa62f06 100644 
--- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1,5 +1,6 @@ include(CheckCXXCompilerFlag) include("../cmake/common.cmake") +include(ExternalProject) add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES}) @@ -402,6 +403,7 @@ ggml_add_backend(Vulkan) ggml_add_backend(WebGPU) ggml_add_backend(zDNN) ggml_add_backend(OpenCL) +ggml_add_backend(Hexagon) foreach (target ggml-base ggml) target_include_directories(${target} PUBLIC $ $) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 136afec748d..e96b5c403dd 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -57,6 +57,10 @@ #include "ggml-opencl.h" #endif +#ifdef GGML_USE_HEXAGON +#include "ggml-hexagon.h" +#endif + #ifdef GGML_USE_BLAS #include "ggml-blas.h" #endif @@ -199,6 +203,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_OPENCL register_backend(ggml_backend_opencl_reg()); #endif +#ifdef GGML_USE_HEXAGON + register_backend(ggml_backend_hexagon_reg()); +#endif #ifdef GGML_USE_CANN register_backend(ggml_backend_cann_reg()); #endif @@ -598,6 +605,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("sycl", silent, dir_path); ggml_backend_load_best("vulkan", silent, dir_path); ggml_backend_load_best("opencl", silent, dir_path); + ggml_backend_load_best("hexagon", silent, dir_path); ggml_backend_load_best("musa", silent, dir_path); ggml_backend_load_best("cpu", silent, dir_path); // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt new file mode 100644 index 00000000000..c8a48dc9034 --- /dev/null +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -0,0 +1,67 @@ +include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake) + +option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF) + +add_library(htp_iface OBJECT + ${CMAKE_CURRENT_BINARY_DIR}/htp_iface_stub.c + ${CMAKE_CURRENT_SOURCE_DIR}/htp-utils.c) + +set_target_properties(htp_iface PROPERTIES POSITION_INDEPENDENT_CODE ON) +target_include_directories(htp_iface PUBLIC + ${HEXAGON_SDK_ROOT}/incs + ${HEXAGON_SDK_ROOT}/incs/stddef + ${HEXAGON_SDK_ROOT}/utils/examples + ${CMAKE_CURRENT_SOURCE_DIR}/htp + ${CMAKE_CURRENT_BINARY_DIR}) + +build_idl(htp/htp_iface.idl htp_iface) + +if (CMAKE_SYSTEM_NAME MATCHES Android) + target_link_options(htp_iface PUBLIC -llog -ldl) +elseif (CMAKE_SYSTEM_NAME MATCHES Windows) + target_precompile_headers(htp_iface PUBLIC ) +else() + target_link_options(htp_iface PUBLIC -ldl) +endif() + +link_custom_library(htp_iface cdsprpc) +link_custom_library(htp_iface rpcmem) + +set(TARGET_NAME ggml-hexagon) +ggml_add_backend_library(${TARGET_NAME} ggml-hexagon.cpp ../../include/ggml-hexagon.h) + +target_link_libraries(${TARGET_NAME} PRIVATE htp_iface) +target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/htp ${CMAKE_CURRENT_BINARY_DIR}) + +# Build HTP bits +set(HTP_CMAKE_ARGS + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/htp/cmake-toolchain.cmake + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR} + -DHEXAGON_SDK_ROOT=$ENV{HEXAGON_SDK_ROOT} + -DHEXAGON_TOOLS_ROOT=$ENV{HEXAGON_TOOLS_ROOT} + -DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}) + +ExternalProject_Add(htp-v73 + SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON + CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v73 -DPREBUILT_LIB_DIR="toolv19_v73") + +ExternalProject_Add(htp-v75 + SOURCE_DIR 
${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON + CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v75 -DPREBUILT_LIB_DIR="toolv19_v75") + +ExternalProject_Add(htp-v79 + SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON + CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v79 -DPREBUILT_LIB_DIR="toolv19_v79") + +ExternalProject_Add(htp-v81 + SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON + CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v81 -DPREBUILT_LIB_DIR="toolv19_v81") + +# Install Hexagon skels required at runtime +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v73.so + ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v75.so + ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v79.so + ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v81.so + TYPE LIB) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp new file mode 100644 index 00000000000..e2f05cb2ad5 --- /dev/null +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -0,0 +1,3581 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef _WIN32 +# include +# ifndef _WINDOWS +# define _WINDOWS +# endif +#else +# include +# include +#endif + +#pragma clang diagnostic ignored "-Wnested-anon-types" +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" + +#include "htp-utils.h" + +#include +#include +#include + +#define GGML_COMMON_IMPL_CPP +#include "ggml-backend-impl.h" +#include "ggml-common.h" +#include "ggml-hexagon.h" +#include "ggml-impl.h" +#include "ggml-quants.h" +#include "htp-msg.h" +#include "htp_iface.h" + +static size_t opt_ndev = 1; +static size_t opt_nhvx = 0; // use all +static int opt_arch = 0; // autodetect +static int opt_etm = 0; +static int opt_verbose = 0; +static int opt_profile = 0; +static int opt_hostbuf = 1; +static int opt_experimental = 0; + +// Enable all stages by default +static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMPUTE; +static int opt_opsync = 0; // synchronous ops + +#define HEX_VERBOSE(...) \ + if (opt_verbose) { \ + fprintf(stderr, __VA_ARGS__); \ + fflush(stderr); \ + } +#define HEX_PROFILE(...) 
\ + if (opt_profile) \ + fprintf(stderr, __VA_ARGS__) + +static inline uint64_t hex_is_aligned(void * addr, uint32_t align) { + return ((size_t) addr & (align - 1)) == 0; +} + +static inline size_t hex_round_up(size_t n, size_t m) { + return m * ((n + m - 1) / m); +} + +static const char * status_to_str(uint32_t status) { + switch (status) { + case HTP_STATUS_OK: + return "OK"; + case HTP_STATUS_NO_SUPPORT: + return "NO-SUPPORT"; + case HTP_STATUS_INVAL_PARAMS: + return "INVAL-PARAMS"; + case HTP_STATUS_VTCM_TOO_SMALL: + return "VTCM-TOO-SMALL"; + case HTP_STATUS_INTERNAL_ERR: + return "INTERNAL-ERROR"; + default: + return "UNKNOWN"; + } +} + +static uint64_t time_usec(void) { + auto now = std::chrono::high_resolution_clock::now().time_since_epoch(); + return std::chrono::duration_cast(now).count(); +} + +// ** debug helpers + +static inline int hex_format_tensor_dims(char * str, const struct ggml_tensor * t) { + if (t->ne[2] == 1 && t->ne[3] == 1) { + return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]); + } else { + return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); + } +} + +static inline void hex_format_op_dims(char * str, const struct ggml_tensor * t) { + char * p = str; + + // append src0 and src1 (if any) + if (t->src[0]) { + p += hex_format_tensor_dims(p, t->src[0]); + + for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { + p += sprintf(p, " x "); + p += hex_format_tensor_dims(p, t->src[i]); + } + + p += sprintf(p, " -> "); + } + + // format self dims separately for better visual alignment + char self[64]; + hex_format_tensor_dims(self, t); + + p += sprintf(p, "%s", self); +} + +static inline int hex_format_tensor_strides(char * str, const struct ggml_tensor * t) { + const char * c = ggml_is_contiguous(t) ? 
"" : "!"; + + if (t->ne[2] == 1 && t->ne[3] == 1) { + return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c); + } else { + return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], + (size_t) t->nb[3], c); + } +} + +static inline void hex_format_op_strides(char * str, const struct ggml_tensor * t) { + char * p = str; + + // append src0 and src1 (if any) + if (t->src[0]) { + p += hex_format_tensor_strides(p, t->src[0]); + + for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { + p += sprintf(p, " x "); + p += hex_format_tensor_strides(p, t->src[i]); + } + + p += sprintf(p, " -> "); + } + + // format self dims separately for better visual alignment + char self[64]; + hex_format_tensor_strides(self, t); + + p += sprintf(p, "%s", self); +} + +static inline void hex_format_op_types(char * str, const struct ggml_tensor * t) { + char * p = str; + + // append src0 and src1 (if any) + if (t->src[0]) { + p += sprintf(p, "%s", ggml_type_name(t->src[0]->type)); + + for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { + p += sprintf(p, " x "); + p += sprintf(p, "%s", ggml_type_name(t->src[i]->type)); + } + + p += sprintf(p, " -> "); + } + + p += sprintf(p, "%s", ggml_type_name(t->type)); +} + +static inline const char * hex_tensor_buff_name(const struct ggml_tensor * t) { + if (t->buffer) { + return ggml_backend_buffer_name(t->buffer); + } + return "NONE"; +} + +static inline void hex_format_op_buffs(char * str, const struct ggml_tensor * t) { + char * p = str; + + // append src0 and src1 (if any) + if (t->src[0]) { + p += sprintf(p, "%s", hex_tensor_buff_name(t->src[0])); + + for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { + p += sprintf(p, " x "); + p += sprintf(p, "%s", hex_tensor_buff_name(t->src[i])); + } + + p += sprintf(p, " -> "); + } + + p += sprintf(p, "%s", hex_tensor_buff_name(t)); +} + +static inline void hex_format_op_names(char * str, const struct ggml_tensor * t) { + char * p = str; + + // append src0 and src1 (if any) + if (t->src[0]) { + p += sprintf(p, "%s", t->src[0]->name); + + for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { + p += sprintf(p, " x "); + p += sprintf(p, "%s", t->src[i]->name); + } + + p += sprintf(p, " -> "); + } + + p += sprintf(p, "%s", t->name); +} + +// ** backend sessions + +struct ggml_hexagon_session { + ggml_hexagon_session(int dev_id); + ~ggml_hexagon_session(); + + ggml_backend_buffer_type buffer_type; + ggml_backend_buffer_type repack_buffer_type; + + std::string name; + remote_handle64 handle; + dspqueue_t queue; + uint32_t session_id; + uint32_t domain_id; + uint64_t queue_id; + int dev_id; + + std::atomic op_pending; + uint32_t prof_usecs; + uint32_t prof_cycles; + uint32_t prof_pkts; +}; + +// Packet callback +static void htp_packet_callback(dspqueue_t queue, AEEResult error, void * context) { + auto sess = static_cast(context); + + // Repeatedly read packets from the queue until it's empty. We don't + // necessarily get a separate callback for each packet, and new packets + // may arrive while we're processing the previous one. 
+ + while (1) { + struct htp_general_rsp rsp; + uint32_t rsp_size; + uint32_t flags; + + struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS]; + uint32_t n_bufs; + + // Read packet from queue + int err = dspqueue_read_noblock(queue, &flags, + HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references + &n_bufs, // Number of buffer references + bufs, // Buffer references + sizeof(rsp), // Max message length + &rsp_size, // Message length + (uint8_t *) &rsp); + + if (err == AEE_EWOULDBLOCK) { + // Consumed all packets available for now + return; + } + + if (err != 0) { + fprintf(stderr, "ggml-hex: dspqueue_read_noblock failed: 0x%08x\n", (unsigned) err); + exit(1); + } + + // Basic sanity checks + if (rsp_size != sizeof(rsp)) { + fprintf(stderr, "ggml-hex: dspcall : bad response (size)\n"); + exit(1); + } + + if (rsp.status != HTP_STATUS_OK) { + fprintf(stderr, "ggml-hex: dspcall : dsp-rsp: %s\n", status_to_str(rsp.status)); + // TODO: handle errors + } + + // FIXME: update profiling implementation + sess->prof_usecs = rsp.prof_usecs; + sess->prof_cycles = rsp.prof_cycles; + sess->prof_pkts = rsp.prof_pkts; + + sess->op_pending--; // atomic dec + } +} + +// Error callback - simply terminates with an error. Used where we don't +// expect errors. +[[noreturn]] static void htp_error_callback(dspqueue_t queue, AEEResult error, void * context) { + fprintf(stderr, "ggml-hex: dspcall general error 0x%x: for queue %p\n", error, (void *) queue); + exit(1); +} + +// ** backend buffers + +struct ggml_backend_hexagon_buffer_type_context { + ggml_backend_hexagon_buffer_type_context(const std::string & name, ggml_hexagon_session * sess) { + this->sess = sess; + this->name = name; + } + + ggml_hexagon_session * sess; + std::string name; +}; + +struct ggml_backend_hexagon_buffer_context { + bool mmap_to(ggml_hexagon_session * s) { + HEX_VERBOSE("ggml-hex: %s mmaping buffer: base %p domain-id %d session-id %d size %zu fd %d repack %d\n", + s->name.c_str(), (void *) this->base, s->domain_id, s->session_id, this->size, this->fd, + (int) this->repack); + + int err = fastrpc_mmap(s->domain_id, this->fd, (void *) this->base, 0, this->size, FASTRPC_MAP_FD); + if (err != 0) { + fprintf(stderr, "ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", + s->domain_id, this->size, this->fd, (unsigned) err); + return false; + } + + return true; + } + + bool mmap() { + if (this->mapped) { + return true; + } + if (!mmap_to(this->sess)) { + return false; + } + this->mapped = true; + return true; + } + + void munmap() { + if (!this->mapped) { + return; + } + + fastrpc_munmap(this->sess->domain_id, this->fd, this->base, this->size); + this->mapped = false; + } + + ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) { + size += 4 * 1024; // extra page for padding + + this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); + if (!this->base) { + fprintf(stderr, "ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size); + return; + } + + this->fd = rpcmem_to_fd(this->base); + if (this->fd < 0) { + fprintf(stderr, "ggml-hex: %s failed to get FD for buffer %p\n", sess->name.c_str(), (void *) this->base); + rpcmem_free(this->base); + this->base = NULL; + return; + } + + HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d repack %d\n", sess->name.c_str(), + (void *) this->base, size, this->fd, (int) repack); + + this->sess = sess; + this->size = size; + this->mapped = false; + 
this->repack = repack; + } + + ~ggml_backend_hexagon_buffer_context() { + munmap(); + if (this->base) { + rpcmem_free(this->base); + this->base = NULL; + } + } + + ggml_hexagon_session * sess; // primary session + uint8_t * base; + size_t size; + int fd; + bool mapped; // mmap is done + bool repack; // repacked buffer +}; + +static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_buffer_t buffer) { + return static_cast(buffer->buft->context)->sess; +} + +static void ggml_backend_hexagon_buffer_free_buffer(ggml_backend_buffer_t buffer) { + auto ctx = static_cast(buffer->context); + delete ctx; +} + +static void * ggml_backend_hexagon_buffer_get_base(ggml_backend_buffer_t buffer) { + auto ctx = static_cast(buffer->context); + return ctx->base; +} + +static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + auto ctx = static_cast(buffer->context); + auto sess = ctx->sess; + + HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d repack %d\n", sess->name.c_str(), + tensor->name, (void *) ctx->base, tensor->data, ggml_nbytes(tensor), (int) buffer->usage, + (int) ctx->repack); + + if (tensor->view_src != NULL && tensor->view_offs == 0) { + // Just a view, reuse the same extra info + tensor->flags = tensor->view_src->flags; + tensor->extra = tensor->view_src->extra; + } else { + if (!ctx->mapped) { + ctx->mmap(); + } + } + return GGML_STATUS_SUCCESS; +} + +// ======== Q4x4x2 ==================== +struct x2_q4 { + int v[2]; +}; + +static x2_q4 unpack_q4(uint8_t v) { + x2_q4 x = { (int) (v & 0x0f) - 8, (int) (v >> 4) - 8 }; + return x; +} + +static void dump_block_q4_0(const block_q4_0 * b, int i) { + HEX_VERBOSE("ggml-hex: repack q4_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_q4(b->qs[0]).v[0], + unpack_q4(b->qs[1]).v[0], unpack_q4(b->qs[2]).v[0], unpack_q4(b->qs[3]).v[0], unpack_q4(b->qs[12]).v[1], + unpack_q4(b->qs[13]).v[1], unpack_q4(b->qs[14]).v[1], unpack_q4(b->qs[15]).v[1], + GGML_FP16_TO_FP32(b->d)); +} + +static void dump_packed_block_q4x4x2(const uint8_t * v, unsigned int i, size_t k) { + static const int qk = QK_Q4_0x4x2; + const int dblk_size = 8 * 2; // 8x __fp16 + const int qblk_size = qk / 2; // int4 + const int qrow_size = k / 2; // int4 (not padded) + + const uint8_t * v_q = v + 0; // quants first + const uint8_t * v_d = v + qrow_size; // then scales + + const uint8_t * q = v_q + i * qblk_size; + const ggml_half * d = (const ggml_half *) (v_d + i * dblk_size); + + HEX_VERBOSE("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i, + unpack_q4(q[0]).v[0], unpack_q4(q[1]).v[0], unpack_q4(q[2]).v[0], unpack_q4(q[3]).v[0], + unpack_q4(q[60]).v[0], unpack_q4(q[61]).v[0], unpack_q4(q[62]).v[0], unpack_q4(q[63]).v[0], + unpack_q4(q[124]).v[0], unpack_q4(q[125]).v[0], unpack_q4(q[126]).v[0], unpack_q4(q[127]).v[0], + GGML_FP16_TO_FP32(d[0]), GGML_FP16_TO_FP32(d[1]), GGML_FP16_TO_FP32(d[2]), GGML_FP16_TO_FP32(d[3])); + + HEX_VERBOSE("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... 
%d %d %d %d : %.6f %.6f %.6f %.6f\n", + i + 1, unpack_q4(q[0]).v[1], unpack_q4(q[1]).v[1], unpack_q4(q[2]).v[1], unpack_q4(q[3]).v[1], + unpack_q4(q[60]).v[1], unpack_q4(q[61]).v[1], unpack_q4(q[62]).v[1], unpack_q4(q[63]).v[1], + unpack_q4(q[124]).v[1], unpack_q4(q[125]).v[1], unpack_q4(q[126]).v[1], unpack_q4(q[127]).v[1], + GGML_FP16_TO_FP32(d[4]), GGML_FP16_TO_FP32(d[5]), GGML_FP16_TO_FP32(d[6]), GGML_FP16_TO_FP32(d[7])); +} + +static void unpack_q4_0_quants(uint8_t * qs, const block_q4_0 * x, unsigned int bi) { + static const int qk = QK4_0; + + for (unsigned int i = 0; i < qk / 2; ++i) { + const int x0 = (x->qs[i] & 0x0F); + const int x1 = (x->qs[i] >> 4); + qs[bi * qk + i + 0] = x0; + qs[bi * qk + i + qk / 2] = x1; + } +} + +static void pack_q4_0_quants(block_q4_0 * x, const uint8_t * qs, unsigned int bi) { + static const int qk = QK4_0; + + for (unsigned int i = 0; i < qk / 2; ++i) { + const uint8_t x0 = qs[bi * qk + i + 0]; + const uint8_t x1 = qs[bi * qk + i + qk / 2]; + x->qs[i] = x0 | (x1 << 4); + } +} + +static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) { + static const int qk = QK_Q4_0x4x2; + const int nb = (k + qk - 1) / qk; // number of blocks (padded) + + const int dblk_size = 8 * 2; // 8x __fp16 + const int qblk_size = qk / 2; // int4 + const int qrow_size = k / 2; // int4 (not padded to blocks) + + uint8_t * y_q = y + 0; // quants first + uint8_t * y_d = y + qrow_size; // then scales + + if (opt_verbose > 2) { + for (int i = 0; i < nb; i++) { + dump_block_q4_0(&x[i * 8 + 0], 0); + dump_block_q4_0(&x[i * 8 + 1], 1); + dump_block_q4_0(&x[i * 8 + 2], 2); + dump_block_q4_0(&x[i * 8 + 3], 3); + dump_block_q4_0(&x[i * 8 + 4], 4); + dump_block_q4_0(&x[i * 8 + 5], 5); + dump_block_q4_0(&x[i * 8 + 6], 6); + dump_block_q4_0(&x[i * 8 + 7], 7); + } + } + + // Repack the quants + for (int i = 0; i < nb; i++) { + uint8_t qs[QK_Q4_0x4x2]; // unpacked quants + unpack_q4_0_quants(qs, &x[i * 8 + 0], 0); + unpack_q4_0_quants(qs, &x[i * 8 + 1], 1); + unpack_q4_0_quants(qs, &x[i * 8 + 2], 2); + unpack_q4_0_quants(qs, &x[i * 8 + 3], 3); + unpack_q4_0_quants(qs, &x[i * 8 + 4], 4); + unpack_q4_0_quants(qs, &x[i * 8 + 5], 5); + unpack_q4_0_quants(qs, &x[i * 8 + 6], 6); + unpack_q4_0_quants(qs, &x[i * 8 + 7], 7); + + uint8_t * q = y_q + (i * qblk_size); + for (int j = 0; j < qk / 2; j++) { + q[j] = (qs[j + 128] << 4) | qs[j]; + } + } + + // Repack the scales + // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2) + // the last block is truncated and overriden by the scales. 
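+    // Layout note: the packed row stores all quants first (qrow_size bytes) and appends the
+    // fp16 scales at offset qrow_size; for rows that are not a multiple of QK_Q4_0x4x2 the
+    // padded tail of the last quant block overlaps the scales region, so the scales must be
+    // written in this separate second pass after all quants have been packed.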
+ for (int i = 0; i < nb; i++) { + // Repack the scales + ggml_half * d = (ggml_half *) (y_d + i * dblk_size); + d[0] = x[i * 8 + 0].d; + d[1] = x[i * 8 + 1].d; + d[2] = x[i * 8 + 2].d; + d[3] = x[i * 8 + 3].d; + d[4] = x[i * 8 + 4].d; + d[5] = x[i * 8 + 5].d; + d[6] = x[i * 8 + 6].d; + d[7] = x[i * 8 + 7].d; + } + + if (opt_verbose > 1) { + for (int i = 0; i < nb; i++) { + dump_packed_block_q4x4x2(y, i, k); + } + } +} + +static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) { + static const int qk = QK_Q4_0x4x2; + const int nb = (k + qk - 1) / qk; // number of blocks (padded) + + const int dblk_size = 8 * 2; // 8x __fp16 + const int qblk_size = qk / 2; // int4 + const int qrow_size = k / 2; // int4 (not padded to blocks) + + const uint8_t * y_q = y + 0; // quants first + const uint8_t * y_d = y + qrow_size; // then scales + + if (opt_verbose > 1) { + for (int i = 0; i < nb; i++) { + dump_packed_block_q4x4x2(y, i, k); + } + } + + // Unpack the quants + for (int i = 0; i < nb; i++) { + uint8_t qs[QK_Q4_0x4x2]; // unpacked quants + + const uint8_t * q = y_q + (i * qblk_size); + for (int j = 0; j < qk / 2; j++) { + qs[j] = q[j] & 0xf; + qs[j + 128] = q[j] >> 4; + } + + pack_q4_0_quants(&x[i * 8 + 0], qs, 0); + pack_q4_0_quants(&x[i * 8 + 1], qs, 1); + pack_q4_0_quants(&x[i * 8 + 2], qs, 2); + pack_q4_0_quants(&x[i * 8 + 3], qs, 3); + pack_q4_0_quants(&x[i * 8 + 4], qs, 4); + pack_q4_0_quants(&x[i * 8 + 5], qs, 5); + pack_q4_0_quants(&x[i * 8 + 6], qs, 6); + pack_q4_0_quants(&x[i * 8 + 7], qs, 7); + } + + // Repack the scales + // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2) + // the last block is truncated and overriden by the scales. + for (int i = 0; i < nb; i++) { + // Unpack the scales + const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size); + x[i * 8 + 0].d = d[0]; + x[i * 8 + 1].d = d[1]; + x[i * 8 + 2].d = d[2]; + x[i * 8 + 3].d = d[3]; + x[i * 8 + 4].d = d[4]; + x[i * 8 + 5].d = d[5]; + x[i * 8 + 6].d = d[6]; + x[i * 8 + 7].d = d[7]; + } + + if (opt_verbose > 2) { + for (int i = 0; i < nb; i++) { + dump_block_q4_0(&x[i * 8 + 0], 0); + dump_block_q4_0(&x[i * 8 + 1], 1); + dump_block_q4_0(&x[i * 8 + 2], 2); + dump_block_q4_0(&x[i * 8 + 3], 3); + dump_block_q4_0(&x[i * 8 + 4], 4); + dump_block_q4_0(&x[i * 8 + 5], 5); + dump_block_q4_0(&x[i * 8 + 6], 6); + dump_block_q4_0(&x[i * 8 + 7], 7); + } + } +} + +static void init_row_q4x4x2(block_q4_0 * x, int64_t k) { + static const int qk = QK_Q4_0x4x2; + const int nb = (k + qk - 1) / qk; // number of blocks (padded) + + // Init the quants such that they unpack into zeros + uint8_t qs[QK_Q4_0x4x2]; // unpacked quants + memset(qs, 8, sizeof(qs)); + + for (int i = 0; i < nb; i++) { + pack_q4_0_quants(&x[i * 8 + 0], qs, 0); + pack_q4_0_quants(&x[i * 8 + 1], qs, 1); + pack_q4_0_quants(&x[i * 8 + 2], qs, 2); + pack_q4_0_quants(&x[i * 8 + 3], qs, 3); + pack_q4_0_quants(&x[i * 8 + 4], qs, 4); + pack_q4_0_quants(&x[i * 8 + 5], qs, 5); + pack_q4_0_quants(&x[i * 8 + 6], qs, 6); + pack_q4_0_quants(&x[i * 8 + 7], qs, 7); + } + + // Init the scales + // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2) + // the last block is truncated and overriden by the scales. 
+ for (int i = 0; i < nb; i++) { + // Unpack the scales + x[i * 8 + 0].d = 0; + x[i * 8 + 1].d = 0; + x[i * 8 + 2].d = 0; + x[i * 8 + 3].d = 0; + x[i * 8 + 4].d = 0; + x[i * 8 + 5].d = 0; + x[i * 8 + 6].d = 0; + x[i * 8 + 7].d = 0; + } +} + +// repack q4_0 data into q4x4x2 tensor +static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) { + int64_t nrows = ggml_nrows(t); + + size_t row_size = ggml_row_size(t->type, t->ne[0]); + size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad + size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + + void * buf_pd = ggml_aligned_malloc(row_size_pd); + GGML_ASSERT(buf_pd != NULL); + + void * buf_rp = ggml_aligned_malloc(row_size_rp); + GGML_ASSERT(buf_rp != NULL); + + HEX_VERBOSE("ggml-hex: repack-q4_0-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, + t->ne[0], nrows, row_size); + + init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros + + for (int64_t i = 0; i < nrows; i++) { + const uint8_t * src = (const uint8_t *) data + (i * row_size); + uint8_t * dst = (uint8_t *) t->data + (i * row_size); + + memcpy(buf_pd, src, row_size); + repack_row_q4x4x2((uint8_t *) buf_rp, (const block_q4_0 *) buf_pd, t->ne[0]); + memcpy(dst, buf_rp, row_size); + } + + ggml_aligned_free(buf_pd, row_size_pd); + ggml_aligned_free(buf_rp, row_size_rp); +} + +// repack q4x4x2 tensor into q4_0 data +static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) { + int64_t nrows = ggml_nrows(t); + + size_t row_size = ggml_row_size(t->type, t->ne[0]); + size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad + size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + + void * buf_pd = ggml_aligned_malloc(row_size_pd); + GGML_ASSERT(buf_pd != NULL); + + void * buf_rp = ggml_aligned_malloc(row_size_rp); + GGML_ASSERT(buf_rp != NULL); + + HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, + t->ne[0], nrows, row_size); + + memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros + + for (int64_t i = 0; i < nrows; i++) { + const uint8_t * src = (const uint8_t *) t->data + (i * row_size); + uint8_t * dst = (uint8_t *) data + (i * row_size); + + memcpy(buf_pd, src, row_size); + unpack_row_q4x4x2((block_q4_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]); + memcpy(dst, buf_rp, row_size); + } + + ggml_aligned_free(buf_pd, row_size_pd); + ggml_aligned_free(buf_rp, row_size_rp); +} + +// ======== Q8x4x2 ==================== +static void dump_block_q8_0(const block_q8_0 * b, int i) { + HEX_VERBOSE("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2], + b->qs[3], b->qs[28], b->qs[29], b->qs[30], b->qs[31], GGML_FP16_TO_FP32(b->d)); +} + +static void dump_packed_block_q8x4x2(const uint8_t * v, unsigned int i, size_t k) { + static const int qk = QK_Q8_0x4x2; + const int dblk_size = 8 * 2; // 8x __fp16 + const int qblk_size = qk; // int8 + const int qrow_size = k; // int8 (not padded) + + const uint8_t * v_q = v + 0; // quants first + const uint8_t * v_d = v + qrow_size; // then scales + + const uint8_t * q = v_q + i * qblk_size; + const ggml_half * d = (const ggml_half *) (v_d + i * dblk_size); + + HEX_VERBOSE("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... 
%d %d %d %d : %.6f %.6f %.6f %.6f\n", i,
+                q[0], q[1], q[2], q[3], q[60], q[61], q[62], q[63], q[124], q[125], q[126], q[127],
+                GGML_FP16_TO_FP32(d[0]), GGML_FP16_TO_FP32(d[1]), GGML_FP16_TO_FP32(d[2]), GGML_FP16_TO_FP32(d[3]));
+
+    HEX_VERBOSE("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n",
+                i + 1, q[128], q[129], q[130], q[131], q[192], q[193], q[194], q[195], q[252], q[253], q[254], q[255],
+                GGML_FP16_TO_FP32(d[4]), GGML_FP16_TO_FP32(d[5]), GGML_FP16_TO_FP32(d[6]), GGML_FP16_TO_FP32(d[7]));
+}
+
+static void unpack_q8_0_quants(uint8_t * qs, const block_q8_0 * x, unsigned int bi) {
+    static const int qk = QK8_0;
+
+    for (unsigned int i = 0; i < qk; ++i) {
+        qs[bi * qk + i] = x->qs[i];
+    }
+}
+
+static void pack_q8_0_quants(block_q8_0 * x, const uint8_t * qs, unsigned int bi) {
+    static const int qk = QK8_0;
+
+    for (unsigned int i = 0; i < qk; ++i) {
+        x->qs[i] = qs[bi * qk + i];
+    }
+}
+
+static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
+    static const int qk = QK_Q8_0x4x2;
+    const int nb = (k + qk - 1) / qk; // number of blocks (padded)
+
+    const int dblk_size = 8 * 2; // 8x __fp16
+    const int qblk_size = qk;    // int8
+    const int qrow_size = k;     // int8 (not padded to blocks)
+
+    uint8_t * y_q = y + 0;         // quants first
+    uint8_t * y_d = y + qrow_size; // then scales
+
+    if (opt_verbose > 2) {
+        for (int i = 0; i < nb; i++) {
+            dump_block_q8_0(&x[i * 8 + 0], 0);
+            dump_block_q8_0(&x[i * 8 + 1], 1);
+            dump_block_q8_0(&x[i * 8 + 2], 2);
+            dump_block_q8_0(&x[i * 8 + 3], 3);
+            dump_block_q8_0(&x[i * 8 + 4], 4);
+            dump_block_q8_0(&x[i * 8 + 5], 5);
+            dump_block_q8_0(&x[i * 8 + 6], 6);
+            dump_block_q8_0(&x[i * 8 + 7], 7);
+        }
+    }
+
+    // Repack the quants
+    for (int i = 0; i < nb; i++) {
+        uint8_t qs[QK_Q8_0x4x2]; // unpacked quants
+
+        unpack_q8_0_quants(qs, &x[i * 8 + 0], 0);
+        unpack_q8_0_quants(qs, &x[i * 8 + 1], 1);
+        unpack_q8_0_quants(qs, &x[i * 8 + 2], 2);
+        unpack_q8_0_quants(qs, &x[i * 8 + 3], 3);
+        unpack_q8_0_quants(qs, &x[i * 8 + 4], 4);
+        unpack_q8_0_quants(qs, &x[i * 8 + 5], 5);
+        unpack_q8_0_quants(qs, &x[i * 8 + 6], 6);
+        unpack_q8_0_quants(qs, &x[i * 8 + 7], 7);
+
+        uint8_t * q = y_q + (i * qblk_size);
+        for (int j = 0; j < qk; j++) {
+            q[j] = qs[j];
+        }
+    }
+
+    // Repack the scales
+    // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2)
+    // the last block is truncated and overridden by the scales.
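+    // Packed row layout: all int8 quants for the row first (k bytes), then the
+    // fp16 scales, 8 per 256-element super-block (16 bytes each). For example,
+    // k = 4096 gives 4096 bytes of quants followed by 16 * 16 = 256 bytes of
+    // scales, matching ggml_row_size(GGML_TYPE_Q8_0, 4096) = 4352 bytes.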
+    for (int i = 0; i < nb; i++) {
+        // Repack the scales
+        ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
+        d[0] = x[i * 8 + 0].d;
+        d[1] = x[i * 8 + 1].d;
+        d[2] = x[i * 8 + 2].d;
+        d[3] = x[i * 8 + 3].d;
+        d[4] = x[i * 8 + 4].d;
+        d[5] = x[i * 8 + 5].d;
+        d[6] = x[i * 8 + 6].d;
+        d[7] = x[i * 8 + 7].d;
+    }
+
+    if (opt_verbose > 1) {
+        for (int i = 0; i < nb; i++) {
+            dump_packed_block_q8x4x2(y, i, k);
+        }
+    }
+}
+
+static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
+    static const int qk = QK_Q8_0x4x2;
+    const int nb = (k + qk - 1) / qk; // number of blocks (padded)
+
+    const int dblk_size = 8 * 2; // 8x __fp16
+    const int qblk_size = qk;    // int8
+    const int qrow_size = k;     // int8 (not padded to blocks)
+
+    const uint8_t * y_q = y + 0;         // quants first
+    const uint8_t * y_d = y + qrow_size; // then scales
+
+    if (opt_verbose > 1) {
+        for (int i = 0; i < nb; i++) {
+            dump_packed_block_q8x4x2(y, i, k);
+        }
+    }
+
+    // Unpack the quants
+    for (int i = 0; i < nb; i++) {
+        uint8_t qs[QK_Q8_0x4x2]; // unpacked quants
+
+        const uint8_t * q = y_q + (i * qblk_size);
+        for (int j = 0; j < qk; j++) {
+            qs[j] = q[j];
+        }
+
+        pack_q8_0_quants(&x[i * 8 + 0], qs, 0);
+        pack_q8_0_quants(&x[i * 8 + 1], qs, 1);
+        pack_q8_0_quants(&x[i * 8 + 2], qs, 2);
+        pack_q8_0_quants(&x[i * 8 + 3], qs, 3);
+        pack_q8_0_quants(&x[i * 8 + 4], qs, 4);
+        pack_q8_0_quants(&x[i * 8 + 5], qs, 5);
+        pack_q8_0_quants(&x[i * 8 + 6], qs, 6);
+        pack_q8_0_quants(&x[i * 8 + 7], qs, 7);
+    }
+
+    // Repack the scales
+    // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2)
+    // the last block is truncated and overridden by the scales.
+    for (int i = 0; i < nb; i++) {
+        // Unpack the scales
+        const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
+        x[i * 8 + 0].d = d[0];
+        x[i * 8 + 1].d = d[1];
+        x[i * 8 + 2].d = d[2];
+        x[i * 8 + 3].d = d[3];
+        x[i * 8 + 4].d = d[4];
+        x[i * 8 + 5].d = d[5];
+        x[i * 8 + 6].d = d[6];
+        x[i * 8 + 7].d = d[7];
+    }
+
+    if (opt_verbose > 2) {
+        for (int i = 0; i < nb; i++) {
+            dump_block_q8_0(&x[i * 8 + 0], 0);
+            dump_block_q8_0(&x[i * 8 + 1], 1);
+            dump_block_q8_0(&x[i * 8 + 2], 2);
+            dump_block_q8_0(&x[i * 8 + 3], 3);
+            dump_block_q8_0(&x[i * 8 + 4], 4);
+            dump_block_q8_0(&x[i * 8 + 5], 5);
+            dump_block_q8_0(&x[i * 8 + 6], 6);
+            dump_block_q8_0(&x[i * 8 + 7], 7);
+        }
+    }
+}
+
+static void init_row_q8x4x2(block_q8_0 * x, int64_t k) {
+    static const int qk = QK_Q8_0x4x2;
+    const int nb = (k + qk - 1) / qk; // number of blocks (padded)
+
+    // Init the quants such that they unpack into zeros
+    uint8_t qs[QK_Q8_0x4x2]; // unpacked quants
+    memset(qs, 0, sizeof(qs));
+
+    for (int i = 0; i < nb; i++) {
+        pack_q8_0_quants(&x[i * 8 + 0], qs, 0);
+        pack_q8_0_quants(&x[i * 8 + 1], qs, 1);
+        pack_q8_0_quants(&x[i * 8 + 2], qs, 2);
+        pack_q8_0_quants(&x[i * 8 + 3], qs, 3);
+        pack_q8_0_quants(&x[i * 8 + 4], qs, 4);
+        pack_q8_0_quants(&x[i * 8 + 5], qs, 5);
+        pack_q8_0_quants(&x[i * 8 + 6], qs, 6);
+        pack_q8_0_quants(&x[i * 8 + 7], qs, 7);
+    }
+
+    // Init the scales
+    // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2)
+    // the last block is truncated and overridden by the scales.
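+    // Like init_row_q4x4x2() this runs on the per-row staging buffer before the
+    // real data is copied in, so any pad columns past ne[0] end up with zero
+    // quants and zero scales and dequantize to 0.0f on the DSP side.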
+ for (int i = 0; i < nb; i++) { + // Unpack the scales + x[i * 8 + 0].d = 0; + x[i * 8 + 1].d = 0; + x[i * 8 + 2].d = 0; + x[i * 8 + 3].d = 0; + x[i * 8 + 4].d = 0; + x[i * 8 + 5].d = 0; + x[i * 8 + 6].d = 0; + x[i * 8 + 7].d = 0; + } +} + +// repack q8_0 data into q8x4x2 tensor +static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) { + int64_t nrows = ggml_nrows(t); + + size_t row_size = ggml_row_size(t->type, t->ne[0]); + size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad + size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + + void * buf_pd = ggml_aligned_malloc(row_size_pd); + GGML_ASSERT(buf_pd != NULL); + + void * buf_rp = ggml_aligned_malloc(row_size_rp); + GGML_ASSERT(buf_rp != NULL); + + HEX_VERBOSE("ggml-hex: repack-q8_0-q8x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, + t->ne[0], nrows, row_size); + + init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros + + for (int64_t i = 0; i < nrows; i++) { + const uint8_t * src = (const uint8_t *) data + (i * row_size); + uint8_t * dst = (uint8_t *) t->data + (i * row_size); + + memcpy(buf_pd, src, row_size); + repack_row_q8x4x2((uint8_t *) buf_rp, (const block_q8_0 *) buf_pd, t->ne[0]); + memcpy(dst, buf_rp, row_size); + } + + ggml_aligned_free(buf_pd, row_size_pd); + ggml_aligned_free(buf_rp, row_size_rp); +} + +// repack q8x4x2 tensor into q8_0 data +static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) { + int64_t nrows = ggml_nrows(t); + + size_t row_size = ggml_row_size(t->type, t->ne[0]); + size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad + size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + + void * buf_pd = ggml_aligned_malloc(row_size_pd); + GGML_ASSERT(buf_pd != NULL); + + void * buf_rp = ggml_aligned_malloc(row_size_rp); + GGML_ASSERT(buf_rp != NULL); + + HEX_VERBOSE("ggml-hex: repack-q8x4x2-q8_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, + t->ne[0], nrows, row_size); + + memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros + + for (int64_t i = 0; i < nrows; i++) { + const uint8_t * src = (const uint8_t *) t->data + (i * row_size); + uint8_t * dst = (uint8_t *) data + (i * row_size); + + memcpy(buf_pd, src, row_size); + unpack_row_q8x4x2((block_q8_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]); + memcpy(dst, buf_rp, row_size); + } + + ggml_aligned_free(buf_pd, row_size_pd); + ggml_aligned_free(buf_rp, row_size_rp); +} + +// ======== MXFP4x4x2 ==================== +struct x2_mxfp4 { + int v[2]; +}; + +static x2_mxfp4 unpack_mxfp4(uint8_t v) { + x2_mxfp4 x; + x.v[0] = kvalues_mxfp4[(v & 0x0f)]; + x.v[1] = kvalues_mxfp4[(v >> 4)]; + return x; +} + +static void dump_block_mxfp4(const block_mxfp4 * b, int i) { + HEX_VERBOSE("ggml-hex: repack mxfp4 %d: %d %d %d %d ... 
%d %d %d %d : %.6f\n", i, unpack_mxfp4(b->qs[0]).v[0], + unpack_mxfp4(b->qs[1]).v[0], unpack_mxfp4(b->qs[2]).v[0], unpack_mxfp4(b->qs[3]).v[0], + unpack_mxfp4(b->qs[12]).v[1], unpack_mxfp4(b->qs[13]).v[1], unpack_mxfp4(b->qs[14]).v[1], + unpack_mxfp4(b->qs[15]).v[1], GGML_E8M0_TO_FP32_HALF(b->e)); +} + +static void dump_packed_block_mxfp4x4x2(const uint8_t * v, unsigned int i, size_t k) { + static const int qk = QK_MXFP4x4x2; + const int eblk_size = 8 * 1; // 8x E8M0 + const int qblk_size = qk / 2; // int4 + const int qrow_size = k / 2; // int4 (not padded) + + const uint8_t * v_q = v + 0; // quants first + const uint8_t * v_e = v + qrow_size; // then scales + + const uint8_t * q = v_q + i * qblk_size; + const uint8_t * e = (const uint8_t *) (v_e + i * eblk_size); + + HEX_VERBOSE("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i, + unpack_mxfp4(q[0]).v[0], unpack_mxfp4(q[1]).v[0], unpack_mxfp4(q[2]).v[0], unpack_mxfp4(q[3]).v[0], + unpack_mxfp4(q[60]).v[0], unpack_mxfp4(q[61]).v[0], unpack_mxfp4(q[62]).v[0], unpack_mxfp4(q[63]).v[0], + unpack_mxfp4(q[124]).v[0], unpack_mxfp4(q[125]).v[0], unpack_mxfp4(q[126]).v[0], + unpack_mxfp4(q[127]).v[0], GGML_E8M0_TO_FP32_HALF(e[0]), GGML_E8M0_TO_FP32_HALF(e[1]), + GGML_E8M0_TO_FP32_HALF(e[2]), GGML_E8M0_TO_FP32_HALF(e[3])); + + HEX_VERBOSE("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", + i + 1, unpack_mxfp4(q[0]).v[1], unpack_mxfp4(q[1]).v[1], unpack_mxfp4(q[2]).v[1], + unpack_mxfp4(q[3]).v[1], unpack_mxfp4(q[60]).v[1], unpack_mxfp4(q[61]).v[1], unpack_mxfp4(q[62]).v[1], + unpack_mxfp4(q[63]).v[1], unpack_mxfp4(q[124]).v[1], unpack_mxfp4(q[125]).v[1], + unpack_mxfp4(q[126]).v[1], unpack_mxfp4(q[127]).v[1], GGML_E8M0_TO_FP32_HALF(e[4]), + GGML_E8M0_TO_FP32_HALF(e[5]), GGML_E8M0_TO_FP32_HALF(e[6]), GGML_E8M0_TO_FP32_HALF(e[7])); +} + +static void unpack_mxfp4_quants(uint8_t * qs, const block_mxfp4 * x, unsigned int bi) { + static const int qk = QK_MXFP4; + + for (unsigned int i = 0; i < qk / 2; ++i) { + const uint8_t x0 = (x->qs[i] & 0x0F); + const uint8_t x1 = (x->qs[i] >> 4); + qs[bi * qk + i + 0] = x0; + qs[bi * qk + i + qk / 2] = x1; + } +} + +static void pack_mxfp4_quants(block_mxfp4 * x, const uint8_t * qs, unsigned int bi) { + static const int qk = QK4_0; + + for (unsigned int i = 0; i < qk / 2; ++i) { + const uint8_t x0 = qs[bi * qk + i + 0]; + const uint8_t x1 = qs[bi * qk + i + qk / 2]; + x->qs[i] = x0 | (x1 << 4); + } +} + +static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) { + static const int qk = QK_MXFP4x4x2; + const int nb = (k + qk - 1) / qk; // number of blocks (padded) + + const int eblk_size = 8 * 1; // 8x E8M0 + const int qblk_size = qk / 2; // int4 + const int qrow_size = k / 2; // int4 (not padded to blocks) + + uint8_t * y_q = y + 0; // quants first + uint8_t * y_e = y + qrow_size; // then scales + + if (opt_verbose > 2) { + for (int i = 0; i < nb; i++) { + dump_block_mxfp4(&x[i * 8 + 0], 0); + dump_block_mxfp4(&x[i * 8 + 1], 1); + dump_block_mxfp4(&x[i * 8 + 2], 2); + dump_block_mxfp4(&x[i * 8 + 3], 3); + dump_block_mxfp4(&x[i * 8 + 4], 4); + dump_block_mxfp4(&x[i * 8 + 5], 5); + dump_block_mxfp4(&x[i * 8 + 6], 6); + dump_block_mxfp4(&x[i * 8 + 7], 7); + } + } + + // Repack the quants + for (int i = 0; i < nb; i++) { + uint8_t qs[QK_MXFP4x4x2]; // unpacked quants + + unpack_mxfp4_quants(qs, &x[i * 8 + 0], 0); + unpack_mxfp4_quants(qs, &x[i * 8 + 1], 1); + unpack_mxfp4_quants(qs, &x[i * 8 + 
2], 2); + unpack_mxfp4_quants(qs, &x[i * 8 + 3], 3); + unpack_mxfp4_quants(qs, &x[i * 8 + 4], 4); + unpack_mxfp4_quants(qs, &x[i * 8 + 5], 5); + unpack_mxfp4_quants(qs, &x[i * 8 + 6], 6); + unpack_mxfp4_quants(qs, &x[i * 8 + 7], 7); + + uint8_t * q = y_q + (i * qblk_size); + for (int j = 0; j < qk / 2; j++) { + q[j] = (qs[j + 128] << 4) | qs[j]; + } + } + + // Repack the scales + // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2) + // the last block is truncated and overriden by the scales. + for (int i = 0; i < nb; i++) { + // Repack the scales + uint8_t * e = (uint8_t *) (y_e + i * eblk_size); + e[0] = x[i * 8 + 0].e; + e[1] = x[i * 8 + 1].e; + e[2] = x[i * 8 + 2].e; + e[3] = x[i * 8 + 3].e; + e[4] = x[i * 8 + 4].e; + e[5] = x[i * 8 + 5].e; + e[6] = x[i * 8 + 6].e; + e[7] = x[i * 8 + 7].e; + } + + if (opt_verbose > 1) { + for (int i = 0; i < nb; i++) { + dump_packed_block_mxfp4x4x2(y, i, k); + } + } +} + +static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k) { + static const int qk = QK_MXFP4x4x2; + const int nb = (k + qk - 1) / qk; // number of blocks (padded) + + const int eblk_size = 8 * 1; // 8x E8M0 + const int qblk_size = qk / 2; // int4 + const int qrow_size = k / 2; // int4 (not padded to blocks) + + const uint8_t * y_q = y + 0; // quants first + const uint8_t * y_e = y + qrow_size; // then scales + + if (opt_verbose > 1) { + for (int i = 0; i < nb; i++) { + dump_packed_block_mxfp4x4x2(y, i, k); + } + } + + // Unpack the quants + for (int i = 0; i < nb; i++) { + uint8_t qs[QK_MXFP4x4x2]; // unpacked quants + + const uint8_t * q = y_q + (i * qblk_size); + for (int j = 0; j < qk / 2; j++) { + qs[j] = q[j] & 0xf; + qs[j + 128] = q[j] >> 4; + } + + pack_mxfp4_quants(&x[i * 8 + 0], qs, 0); + pack_mxfp4_quants(&x[i * 8 + 1], qs, 1); + pack_mxfp4_quants(&x[i * 8 + 2], qs, 2); + pack_mxfp4_quants(&x[i * 8 + 3], qs, 3); + pack_mxfp4_quants(&x[i * 8 + 4], qs, 4); + pack_mxfp4_quants(&x[i * 8 + 5], qs, 5); + pack_mxfp4_quants(&x[i * 8 + 6], qs, 6); + pack_mxfp4_quants(&x[i * 8 + 7], qs, 7); + } + + // Repack the scales + // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4_0x4x2) + // the last block is truncated and overriden by the scales. 
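+    // Within each 256-element super-block the 128 packed bytes hold two 4-bit
+    // values: the low nibble carries blocks 0..3 (elements 0..127) and the high
+    // nibble carries blocks 4..7 (elements 128..255); the 8 E8M0 scale bytes
+    // follow all of the row's quants, as laid out by repack_row_mxfp4x4x2().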
+ for (int i = 0; i < nb; i++) { + // Unpack the scales + const uint8_t * e = (const uint8_t *) (y_e + i * eblk_size); + x[i * 8 + 0].e = e[0]; + x[i * 8 + 1].e = e[1]; + x[i * 8 + 2].e = e[2]; + x[i * 8 + 3].e = e[3]; + x[i * 8 + 4].e = e[4]; + x[i * 8 + 5].e = e[5]; + x[i * 8 + 6].e = e[6]; + x[i * 8 + 7].e = e[7]; + } + + if (opt_verbose > 2) { + for (int i = 0; i < nb; i++) { + dump_block_mxfp4(&x[i * 8 + 0], 0); + dump_block_mxfp4(&x[i * 8 + 1], 1); + dump_block_mxfp4(&x[i * 8 + 2], 2); + dump_block_mxfp4(&x[i * 8 + 3], 3); + dump_block_mxfp4(&x[i * 8 + 4], 4); + dump_block_mxfp4(&x[i * 8 + 5], 5); + dump_block_mxfp4(&x[i * 8 + 6], 6); + dump_block_mxfp4(&x[i * 8 + 7], 7); + } + } +} + +static void init_row_mxfp4x4x2(block_mxfp4 * x, int64_t k) { + static const int qk = QK_MXFP4x4x2; + const int nb = (k + qk - 1) / qk; // number of blocks (padded) + + // Init the quants such that they unpack into zeros + uint8_t qs[QK_MXFP4x4x2]; // unpacked quants + memset(qs, 0, sizeof(qs)); + + for (int i = 0; i < nb; i++) { + pack_mxfp4_quants(&x[i * 8 + 0], qs, 0); + pack_mxfp4_quants(&x[i * 8 + 1], qs, 1); + pack_mxfp4_quants(&x[i * 8 + 2], qs, 2); + pack_mxfp4_quants(&x[i * 8 + 3], qs, 3); + pack_mxfp4_quants(&x[i * 8 + 4], qs, 4); + pack_mxfp4_quants(&x[i * 8 + 5], qs, 5); + pack_mxfp4_quants(&x[i * 8 + 6], qs, 6); + pack_mxfp4_quants(&x[i * 8 + 7], qs, 7); + } + + // Init the scales + // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2) + // the last block is truncated and overriden by the scales. + for (int i = 0; i < nb; i++) { + // Unpack the scales + x[i * 8 + 0].e = 0; + x[i * 8 + 1].e = 0; + x[i * 8 + 2].e = 0; + x[i * 8 + 3].e = 0; + x[i * 8 + 4].e = 0; + x[i * 8 + 5].e = 0; + x[i * 8 + 6].e = 0; + x[i * 8 + 7].e = 0; + } +} + +// repack mxfp4 data into mxfp4x4x2 tensor +static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t size) { + int64_t nrows = ggml_nrows(t); + + size_t row_size = ggml_row_size(t->type, t->ne[0]); + size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad + size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + + void * buf_pd = ggml_aligned_malloc(row_size_pd); + GGML_ASSERT(buf_pd != NULL); + + void * buf_rp = ggml_aligned_malloc(row_size_rp); + GGML_ASSERT(buf_rp != NULL); + + HEX_VERBOSE("ggml-hex: repack-mxfp4-mxfp4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, + size, t->ne[0], nrows, row_size); + + init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros + + for (int64_t i = 0; i < nrows; i++) { + const uint8_t * src = (const uint8_t *) data + (i * row_size); + uint8_t * dst = (uint8_t *) t->data + (i * row_size); + + memcpy(buf_pd, src, row_size); + repack_row_mxfp4x4x2((uint8_t *) buf_rp, (const block_mxfp4 *) buf_pd, t->ne[0]); + memcpy(dst, buf_rp, row_size); + } + + ggml_aligned_free(buf_pd, row_size_pd); + ggml_aligned_free(buf_rp, row_size_rp); +} + +// repack mxfp4x4x2 tensor into mxfp4 data +static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t size) { + int64_t nrows = ggml_nrows(t); + + size_t row_size = ggml_row_size(t->type, t->ne[0]); + size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad + size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + + void * buf_pd = ggml_aligned_malloc(row_size_pd); + GGML_ASSERT(buf_pd != NULL); 
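+    // buf_pd stages one packed row (padded up to a whole QK_MXFP4x4x2 super-block),
+    // buf_rp receives the unpacked q-blocks; it is over-allocated (2x row_size) so a
+    // trailing partial super-block can spill past the real row before the final
+    // memcpy copies only row_size bytes back out.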
+ + void * buf_rp = ggml_aligned_malloc(row_size_rp); + GGML_ASSERT(buf_rp != NULL); + + HEX_VERBOSE("ggml-hex: repack-mxfp4x4x2-mxfp4 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, + size, t->ne[0], nrows, row_size); + + memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros + + for (int64_t i = 0; i < nrows; i++) { + const uint8_t * src = (const uint8_t *) t->data + (i * row_size); + uint8_t * dst = (uint8_t *) data + (i * row_size); + + memcpy(buf_pd, src, row_size); + unpack_row_mxfp4x4x2((block_mxfp4 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]); + memcpy(dst, buf_rp, row_size); + } + + ggml_aligned_free(buf_pd, row_size_pd); + ggml_aligned_free(buf_rp, row_size_rp); +} + +static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, + const void * data, + size_t offset, + size_t size) { + auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; + auto sess = ctx->sess; + + HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data, + offset, size); + + switch (tensor->type) { + case GGML_TYPE_Q4_0: + GGML_ASSERT(offset == 0); + repack_q4_0_q4x4x2(tensor, data, size); + break; + + case GGML_TYPE_Q8_0: + GGML_ASSERT(offset == 0); + repack_q8_0_q8x4x2(tensor, data, size); + break; + + case GGML_TYPE_MXFP4: + GGML_ASSERT(offset == 0); + repack_mxfp4_mxfp4x4x2(tensor, data, size); + break; + + default: + memcpy((char *) tensor->data + offset, data, size); + break; + } +} + +static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, + void * data, + size_t offset, + size_t size) { + auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; + auto sess = ctx->sess; + + HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data, + offset, size); + + switch (tensor->type) { + case GGML_TYPE_Q4_0: + GGML_ASSERT(offset == 0); + repack_q4x4x2_q4_0(data, tensor, size); + break; + + case GGML_TYPE_Q8_0: + GGML_ASSERT(offset == 0); + repack_q8x4x2_q8_0(data, tensor, size); + break; + + case GGML_TYPE_MXFP4: + GGML_ASSERT(offset == 0); + repack_mxfp4x4x2_mxfp4(data, tensor, size); + break; + + default: + memcpy(data, (const char *) tensor->data + offset, size); + break; + } +} + +static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor * src, + struct ggml_tensor * dst) { + auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; + auto sess = ctx->sess; + + HEX_VERBOSE("ggml-hex: %s copy-tensor %s -> %s size %zu\n", sess->name.c_str(), src->name, dst->name, + ggml_nbytes(src)); + + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; + auto sess = ctx->sess; + HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->name.c_str(), (void *) ctx->base, ctx->size); + memset(ctx->base, value, ctx->size); +} + +static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = { + /* .free_buffer = */ ggml_backend_hexagon_buffer_free_buffer, + /* .get_base = */ ggml_backend_hexagon_buffer_get_base, + /* .init_tensor = */ ggml_backend_hexagon_buffer_init_tensor, + /* .memset_tensor = */ NULL, + /* .set_tensor = */ ggml_backend_hexagon_buffer_set_tensor, + 
/* .get_tensor      = */ ggml_backend_hexagon_buffer_get_tensor,
+    /* .cpy_tensor      = */ ggml_backend_hexagon_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_hexagon_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// ** backend buffer type
+
+static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_type_t buffer_type) {
+    return static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->name.c_str();
+}
+
+static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type,
+                                                                           size_t size) {
+    auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
+    ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/);
+    return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
+}
+
+static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffer(
+    ggml_backend_buffer_type_t buffer_type,
+    size_t size) {
+    auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
+    ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/);
+    return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
+    return 128; // HVX alignment
+    GGML_UNUSED(buffer_type);
+}
+
+static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
+                                                              const struct ggml_tensor * t) {
+    return ggml_nbytes(t);
+}
+
+static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
+    return 1 * 1024 * 1024 * 1024; // 1GB per buffer
+    GGML_UNUSED(buffer_type);
+}
+
+static bool ggml_backend_hexagon_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return opt_hostbuf;
+    GGML_UNUSED(buft);
+}
+
+static bool ggml_backend_hexagon_repack_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return false;
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i ggml_backend_hexagon_buffer_type_interface = {
+    /* .get_name       = */ ggml_backend_hexagon_buffer_type_name,
+    /* .alloc_buffer   = */ ggml_backend_hexagon_buffer_type_alloc_buffer,
+    /* .get_alignment  = */ ggml_backend_hexagon_buffer_type_get_alignment,
+    /* .get_max_size   = */ ggml_backend_hexagon_buffer_type_get_max_size,
+    /* .get_alloc_size = */ ggml_backend_hexagon_buffer_type_get_alloc_size,
+    /* .is_host        = */ ggml_backend_hexagon_buffer_type_is_host,
+};
+
+static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interface = {
+    /* .get_name       = */ ggml_backend_hexagon_buffer_type_name,
+    /* .alloc_buffer   = */ ggml_backend_hexagon_repack_buffer_type_alloc_buffer,
+    /* .get_alignment  = */ ggml_backend_hexagon_buffer_type_get_alignment,
+    /* .get_max_size   = */ ggml_backend_hexagon_buffer_type_get_max_size,
+    /* .get_alloc_size = */ ggml_backend_hexagon_buffer_type_get_alloc_size,
+    /* .is_host        = */ ggml_backend_hexagon_repack_buffer_type_is_host,
+};
+
+ggml_hexagon_session::ggml_hexagon_session(int dev_id) {
+    this->domain_id = 3;  // Default for CDSP, updated after the session is created
+    this->session_id = 0; // Default for CDSP, updated after the session is created
+    this->dev_id = dev_id;
+    this->name = std::string("HTP") + std::to_string(dev_id);
+
+    this->op_pending = 0;
+    this->prof_usecs = 0;
+    this->prof_cycles = 0;
+    this->prof_pkts = 0;
+
+    fprintf(stderr, "ggml-hex: allocating new session: %s\n", this->name.c_str());
+
+    domain * my_domain =
get_domain(this->domain_id); + if (my_domain == NULL) { + fprintf(stderr, "ggml-hex: unable to get domain struct for CDSP\n"); + exit(1); + } + + // Create new session + if (dev_id != 0) { + struct remote_rpc_reserve_new_session n; + n.domain_name_len = strlen(CDSP_DOMAIN_NAME); + n.domain_name = const_cast(CDSP_DOMAIN_NAME); + n.session_name = const_cast(this->name.c_str()); + n.session_name_len = this->name.size(); + + int err = remote_session_control(FASTRPC_RESERVE_NEW_SESSION, (void *) &n, sizeof(n)); + if (err != AEE_SUCCESS) { + fprintf(stderr, "ggml-hex: remote_session_control failed to reserve new session %d : error 0x%x\n", dev_id, + err); + exit(1); + } + + // Save the IDs + this->session_id = n.session_id; + this->domain_id = n.effective_domain_id; + } + + // Get session URI + char htp_uri[256]; + sprintf(htp_uri, "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch); + + char session_uri[256]; + { + struct remote_rpc_get_uri u; + u.session_id = this->session_id; + u.domain_name = const_cast(CDSP_DOMAIN_NAME); + u.domain_name_len = strlen(CDSP_DOMAIN_NAME); + u.module_uri = const_cast(htp_uri); + u.module_uri_len = strlen(htp_uri); + u.uri = session_uri; + u.uri_len = sizeof(session_uri); + + int err = remote_session_control(FASTRPC_GET_URI, (void *) &u, sizeof(u)); + if (err != AEE_SUCCESS) { + fprintf(stderr, "ggml-hex: remote_session_control failed to get URI for session %d : error 0x%x\n", dev_id, + err); + exit(1); + } + } + + // Enable Unsigned PD + { + struct remote_rpc_control_unsigned_module u; + u.domain = this->domain_id; + u.enable = 1; + int err = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, (void *) &u, sizeof(u)); + if (err != AEE_SUCCESS) { + fprintf(stderr, + "ggml-hex: remote_session_control failed to enable unsigned PD for session %d : error 0x%x\n", + dev_id, err); + exit(1); + } + } + + // Open session + int err = htp_iface_open(session_uri, &this->handle); + if (err != AEE_SUCCESS) { + fprintf(stderr, "ggml-hex: failed to open session %d : error 0x%x\n", dev_id, err); + exit(1); + } + + fprintf(stderr, "ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(), + this->session_id, this->domain_id, session_uri, (unsigned long) this->handle); + + // Enable FastRPC QoS mode + { + struct remote_rpc_control_latency l; + l.enable = 1; + + int err = remote_handle64_control(this->handle, DSPRPC_CONTROL_LATENCY, (void *) &l, sizeof(l)); + if (err != 0) { + fprintf(stderr, "ggml-hex: failed to enable fastrpc QOS mode: 0x%08x\n", (unsigned) err); + exit(1); + } + } + + // Now let's setup the DSP queue + err = dspqueue_create(this->domain_id, + 0, // Flags + 128 * 1024, // Request queue size (in bytes) + 64 * 1024, // Response queue size (in bytes) + htp_packet_callback, htp_error_callback, + (void *) this, // Callback context + &queue); + + if (err != 0) { + fprintf(stderr, "ggml-hex: dspqueue_create failed: 0x%08x\n", (unsigned) err); + exit(1); + } + + // Export queue for use on the DSP + err = dspqueue_export(queue, &this->queue_id); + if (err != 0) { + fprintf(stderr, "ggml-hex: dspqueue_export failed: 0x%08x\n", (unsigned) err); + exit(1); + } + + if (opt_etm) { + err = htp_iface_enable_etm(this->handle); + if (err != 0) { + fprintf(stderr, "ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err); + exit(1); + } + } + + // Start the DSP-side service. 
We need to pass the queue ID to the + // DSP in a FastRPC call; the DSP side will import the queue and start + // listening for packets in a callback. + err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx); + if (err != 0) { + fprintf(stderr, "ggml-hex: htp_iface_start failed: 0x%08x\n", (unsigned) err); + exit(1); + } + + buffer_type.iface = ggml_backend_hexagon_buffer_type_interface; + buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name, this); + + repack_buffer_type.iface = ggml_backend_hexagon_repack_buffer_type_interface; + repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this); +} + +ggml_hexagon_session::~ggml_hexagon_session() { + fprintf(stderr, "ggml-hex: releasing session: %s\n", this->name.c_str()); + + // Stop the DSP-side service and close the queue + int err = htp_iface_stop(this->handle); + if (err != 0) { + fprintf(stderr, "ggml-hex: htp_iface_stop failed: 0x%08x\n", (unsigned) err); + exit(1); + } + + if (opt_etm) { + err = htp_iface_disable_etm(this->handle); + if (err != 0) { + fprintf(stderr, "ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err); + } + } + + err = dspqueue_close(queue); + if (err != 0) { + fprintf(stderr, "ggml-hex: dspqueue_close failed: 0x%08x\n", (unsigned) err); + exit(1); + } + + htp_iface_close(this->handle); +} + +// ** backend interface + +static bool ggml_backend_buffer_is_hexagon(const struct ggml_backend_buffer * b) { + return b->buft->iface.get_alignment == ggml_backend_hexagon_buffer_type_get_alignment; +} + +static inline bool ggml_backend_buffer_is_hexagon_repack(const struct ggml_backend_buffer * b) { + return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer; +} + +static bool hex_supported_dims2(const struct ggml_tensor * x, const struct ggml_tensor * y) { + if (x->ne[0] != y->ne[0]) { + return false; + } + if (x->ne[1] != y->ne[1]) { + return false; + } + if (x->ne[2] != y->ne[2]) { + return false; + } + if (x->ne[3] != y->ne[3]) { + return false; + } + + return true; +} + +static bool hex_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + if ((t1->ne[0] % t0->ne[0]) != 0) { + return false; + } + if ((t1->ne[1] % t0->ne[1]) != 0) { + return false; + } + if ((t1->ne[2] % t0->ne[2]) != 0) { + return false; + } + if ((t1->ne[3] % t0->ne[3]) != 0) { + return false; + } + + return true; +} + +static bool hex_supported_src0_type(ggml_type t) { + return t == GGML_TYPE_F32; +} + +static bool hex_supported_src1_type(ggml_type t) { + return t == GGML_TYPE_F32; +} + +static bool hex_supported_src2_type(ggml_type t) { + return t == GGML_TYPE_F32; +} + +static bool hex_supported_src1_type2(ggml_type t) { + return t == GGML_TYPE_F16; +} + +static bool hex_supported_src1_type3(ggml_type t) { + return t == GGML_TYPE_I32; +} + +static bool hex_supported_dst_type(ggml_type t) { + return t == GGML_TYPE_F32; +} + +static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_tensor * y) { + // TODO: support broadcast for ne[2 and 3] + if (x->ne[0] != y->ne[0]) { + return false; + } + if (x->ne[2] != y->ne[2]) { + return false; + } + if (x->ne[3] != y->ne[3]) { + return false; + } + return true; +} + +static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + if (src1->type != GGML_TYPE_F32 || dst->type != 
GGML_TYPE_F32) { + return false; + } + + // TODO: add support for non-cont tensors + if (!ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) { + return false; + } + + switch (src0->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: + case GGML_TYPE_MXFP4: + if (src0->ne[0] % 32) { + return false; + } + + if (src0->ne[1] > 16 * 1024) { + return false; // typically the lm-head which would be too large for VTCM + } + + // if ((src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3])) return false; + if ((src1->ne[2] != 1 || src1->ne[3] != 1)) { + return false; + } + + // src0 (weights) must be repacked + if (src0->buffer && !ggml_backend_buffer_is_hexagon_repack(src0->buffer)) { + return false; + } + break; + + case GGML_TYPE_F16: + if (!opt_experimental) { + return false; + } + break; + + default: + return false; + } + + // src0 & src1 & dst must be mapped to the same session + if (src0->buffer && + (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) { + return false; + } + if (src1->buffer && + (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) { + return false; + } + if (dst->buffer && + (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) { + return false; + } + + return true; +} + +static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + const struct ggml_tensor * src2 = op->src[2]; + const struct ggml_tensor * dst = op; + + if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32 || src2->type != GGML_TYPE_I32) { + return false; + } + + switch (src0->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: + case GGML_TYPE_MXFP4: + if ((src0->ne[0] % 32)) { + return false; + } + + // src0 (weights) must be repacked + if (src0->buffer && !ggml_backend_buffer_is_hexagon_repack(src0->buffer)) { + return false; + } + break; + + case GGML_TYPE_F16: + if (!opt_experimental) { + return false; + } + break; + + default: + return false; + } + + // TODO: add support for non-cont tensors + if (!ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) { + return false; + } + + // src0 (weights) must be repacked and mapped to the same session + // src1 & sr2 & dst must be mapped to the same session + if (src0->buffer && + (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) { + return false; + } + if (src1->buffer && + (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) { + return false; + } + if (src2->buffer && + (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) { + return false; + } + if (dst->buffer && + (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) { + return false; + } + + return true; +} + +static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + const struct ggml_tensor * dst = op; + + if (!hex_supported_src0_type(src0->type)) { + return false; + } + if (!hex_supported_src1_type(src1->type)) { + return false; + } + if (!hex_supported_dst_type(dst->type)) { + return 
false; + } + if (!hex_supported_dims2(src0, dst)) { + return false; + } + if (!hex_can_repeat(src1, src0)) { + return false; + } + + // TODO: add support for non-contigiuos tensors + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) { + return false; + } + + // src0, src1 & dst must be mapped to the same session + if (src0->buffer && + (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) { + return false; + } + if (src1->buffer && + (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) { + return false; + } + if (dst->buffer && + (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) { + return false; + } + + return true; +} + +static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + const struct ggml_tensor * src2 = op->src[2]; + const struct ggml_tensor * dst = op; + + if (!hex_supported_src0_type(src0->type)) { + return false; + } + if (!hex_supported_src1_type(src1->type)) { + return false; + } + if (!hex_supported_dst_type(dst->type)) { + return false; + } + if (!hex_supported_dims2(src0, dst)) { + return false; + } + + // REVISIT: add support for non-contigiuos tensors + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) { + return false; + } + + // src0, src1 & dst must be mapped to the same session + if (src0->buffer && + (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) { + return false; + } + if (src1->buffer && + (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) { + return false; + } + if (src2->buffer && + (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) { + return false; + } + if (dst->buffer && + (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) { + return false; + } + + return true; +} + +static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * dst = op; + + if (!hex_supported_src0_type(src0->type)) { + return false; + } + if (!hex_supported_dst_type(dst->type)) { + return false; + } + if (!hex_supported_dims2(src0, dst)) { + return false; + } + + // TODO: add support for non-contigiuos tensors + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) { + return false; + } + + // src0 & dst must be mapped to the same session + if (src0->buffer && + (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) { + return false; + } + if (dst->buffer && + (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) { + return false; + } + + return true; +} + +static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session * sess, + const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + const struct ggml_tensor * dst = op; + + if (!hex_supported_src0_type(src0->type)) { + return false; + } + if 
(!hex_supported_dst_type(dst->type)) { + return false; + } + + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) { + return false; + } + + if (src1) { + if (!hex_supported_src1_type(src1->type)) { + return false; + } + if (!hex_supported_dims2(src0, src1)) { + return false; + } + if (!ggml_is_contiguous(src1)) { + return false; + } + } + + // src0, src1 & dst must be mapped to the same session + if (src0->buffer && + (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) { + return false; + } + if (src1 && src1->buffer && + (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) { + return false; + } + if (dst->buffer && + (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) { + return false; + } + + return true; +} + +static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + const struct ggml_tensor * src2 = op->src[2]; + const struct ggml_tensor * dst = op; + + if (src2) { + return false; // FIXME: add support for sinks + } + + if (!hex_supported_src0_type(src0->type)) { + return false; + } + if (!hex_supported_dst_type(dst->type)) { + return false; + } + + if (src1) { + if (!hex_supported_src1_type(src1->type) && !hex_supported_src1_type2(src1->type)) { + return false; + } + if (src0->ne[0] != src1->ne[0]) { + return false; + } + if (src1->ne[1] < src0->ne[1]) { + return false; + } + if (src0->ne[2] % src1->ne[2] != 0) { + return false; + } + if (src0->ne[3] % src1->ne[3] != 0) { + return false; + } + } + + if (src1) { + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) { + return false; + } + } else { + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) { + return false; + } + } + + // src0, src1 & dst must be mapped to the same session + if (src0->buffer && + (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) { + return false; + } + if (src1 && src1->buffer && + (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) { + return false; + } + if (dst->buffer && + (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) { + return false; + } + + return true; +} + +static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { + const int32_t * op_params = &op->op_params[0]; + + int mode = op_params[2]; + + if ((mode & GGML_ROPE_TYPE_NEOX) || (mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) { + return false; + } + if (mode & 1) { + return false; + } + + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + const struct ggml_tensor * src2 = op->src[2]; + const struct ggml_tensor * dst = op; + + if (!hex_supported_src0_type(src0->type)) { + return false; // FIXME: add support for GGML_TYPE_F16 for src0 + } + if (!hex_supported_dst_type(dst->type)) { + return false; + } + if (!hex_supported_src1_type3(src1->type)) { + return false; + } + if (src2) { + if (!hex_supported_src2_type(src2->type)) { + return false; + } + int n_dims = op_params[1]; + if (src2->ne[0] < (n_dims / 2)) { + return false; + } + } + + if (src2) { + if 
(!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(src2) || + !ggml_is_contiguous(dst)) { + return false; + } + } else { + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) { + return false; + } + } + + // src0, src1, src2 & dst must be mapped to the same session + if (src0->buffer && + (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) { + return false; + } + if (src1->buffer && + (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) { + return false; + } + if (src2 && src2->buffer && + (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) { + return false; + } + if (dst->buffer && + (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) { + return false; + } + + return true; +} + +// Init hexagon tensor from GGML tensor and Hexagon buffer +static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) { + h->data = 0; // updated by the receiver + h->type = t->type; + h->ne[0] = t->ne[0]; + h->ne[1] = t->ne[1]; + h->ne[2] = t->ne[2]; + h->ne[3] = t->ne[3]; + h->nb[0] = t->nb[0]; + h->nb[1] = t->nb[1]; + h->nb[2] = t->nb[2]; + h->nb[3] = t->nb[3]; +} + +static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) { + auto buf = static_cast(t->buffer->context); + auto sess = buf->sess; + + HEX_VERBOSE("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(), + t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset, + (unsigned int) d->size); +} + +static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + const struct ggml_tensor * dst = op; + + auto src0_buf = static_cast(src0->buffer->context); + auto src1_buf = static_cast(src1->buffer->context); + auto dst_buf = static_cast(dst->buffer->context); + + uint64_t t1, t2; + t1 = ggml_time_us(); + + // Construct HTP message + htp_general_req req; + req.op = HTP_OP_MUL_MAT; + req.flags = flags; + + init_htp_tensor(&req.src0, src0); + init_htp_tensor(&req.src1, src1); + init_htp_tensor(&req.dst, dst); + + // Use opmask to override flags + if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { + req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; + } + if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { + req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; + } + + dspqueue_buffer bufs[3]; + memset(bufs, 0, sizeof(bufs)); + + // First buffer Weights. + // The content is static, there is no need to do any cache management + bufs[0].fd = src0_buf->fd; + bufs[0].ptr = src0->data; + bufs[0].offset = (uint8_t *) src0->data - src0_buf->base; + bufs[0].size = ggml_nbytes(src0); + bufs[0].flags = DSPQUEUE_BUFFER_FLAG_REF; + + // Second buffer Input Activations. This is a buffer that the CPU + // writes and the DSP reads, so we'll need to flush CPU caches and + // invalidate DSP ones. On platforms with I/O coherency support the + // framework will automatically skip cache operations where possible. 
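+    // For reference: DSPQUEUE_BUFFER_FLAG_REF takes a reference on the mapped
+    // buffer, FLUSH_SENDER flushes the CPU-side caches and INVALIDATE_RECIPIENT
+    // invalidates the DSP-side caches; the same flag combinations are reused by
+    // the other op dispatchers below.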
+ bufs[1].fd = src1_buf->fd; + bufs[1].ptr = src1->data; + bufs[1].offset = (uint8_t *) src1->data - src1_buf->base; + bufs[1].size = ggml_nbytes(src1); + bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP + + // Third buffer Output Activations. We'll handle DSP + // cache maintenance in the response message but need to flush + // CPU caches to ensure any previously written dirty lines are + // written out before writes from the DSP start. + bufs[2].fd = dst_buf->fd; + bufs[2].ptr = dst->data; + bufs[2].offset = (uint8_t *) dst->data - dst_buf->base; + bufs[2].size = ggml_nbytes(dst); + bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER); + + // Primary DSP session from the src0 (normally weight) tensor + auto sess = src0_buf->sess; + + if (opt_verbose) { + char dims[64 * GGML_MAX_SRC]; + char strides[64 * GGML_MAX_SRC]; + char types[16 * GGML_MAX_SRC]; + char buffs[64 * GGML_MAX_SRC]; + char names[64 * GGML_MAX_SRC]; + + hex_format_op_dims(dims, op); + hex_format_op_strides(strides, op); + hex_format_op_types(types, op); + hex_format_op_buffs(buffs, op); + hex_format_op_names(names, op); + + HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), + names, dims, types, strides, buffs, req.flags); + if (opt_verbose > 1) { + hex_dump_dspbuf(src0, &bufs[0]); + hex_dump_dspbuf(src1, &bufs[1]); + hex_dump_dspbuf(dst, &bufs[2]); + } + } + + if ((opt_opmask & HTP_OPMASK_QUEUE)) { + // Bump pending flag (cleared in the callback once we get the responce) + sess->op_pending++; // atomic inc + + int err = dspqueue_write(sess->queue, + 0, // flags - the framework will autoset this + 3, // number of buffers + bufs, // buffer references + sizeof(req), + (const uint8_t *) &req, // Message + 1000000 // Timeout + ); + + if (err != 0) { + fprintf(stderr, "ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); + exit(1); + } + } + + if (opt_opsync) { + while (sess->op_pending) { + ; + } + } + + t2 = ggml_time_us(); + + HEX_PROFILE( + "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) " + "call-usec %llu\n", + sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, + (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); +} + +static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flags) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + const struct ggml_tensor * src2 = op->src[2]; + const struct ggml_tensor * dst = op; + + auto src0_buf = static_cast(src0->buffer->context); + auto src1_buf = static_cast(src1->buffer->context); + auto src2_buf = static_cast(src2->buffer->context); + auto dst_buf = static_cast(dst->buffer->context); + + uint64_t t1, t2; + t1 = ggml_time_us(); + + // Construct HTP message + htp_general_req req; + req.op = HTP_OP_MUL_MAT_ID; + req.flags = flags; + + init_htp_tensor(&req.src0, src0); + init_htp_tensor(&req.src1, src1); + 
init_htp_tensor(&req.src2, src2);
+    init_htp_tensor(&req.dst, dst);
+
+    // Use opmask to override flags
+    if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
+        req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
+    }
+    if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
+        req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
+    }
+
+    dspqueue_buffer bufs[4];
+    memset(bufs, 0, sizeof(bufs));
+
+    // First buffer Weights.
+    // The content is static, there is no need to do any cache management
+    bufs[0].fd = src0_buf->fd;
+    bufs[0].ptr = src0->data;
+    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
+    bufs[0].size = ggml_nbytes(src0);
+    bufs[0].flags = DSPQUEUE_BUFFER_FLAG_REF;
+
+    // Second buffer Input Activations. This is a buffer that the CPU
+    // writes and the DSP reads, so we'll need to flush CPU caches and
+    // invalidate DSP ones. On platforms with I/O coherency support the
+    // framework will automatically skip cache operations where possible.
+    bufs[1].fd = src1_buf->fd;
+    bufs[1].ptr = src1->data;
+    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
+    bufs[1].size = ggml_nbytes(src1);
+    bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_REF |                  // Take a reference
+                     DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
+                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
+
+    // Third buffer expert IDs. This is a buffer that the CPU
+    // writes and the DSP reads, so we'll need to flush CPU caches and
+    // invalidate DSP ones. On platforms with I/O coherency support the
+    // framework will automatically skip cache operations where possible.
+    bufs[2].fd = src2_buf->fd;
+    bufs[2].ptr = src2->data;
+    bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
+    bufs[2].size = ggml_nbytes(src2);
+    bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_REF |                  // Take a reference
+                     DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
+                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
+
+    // Fourth buffer Output Activations. We'll handle DSP
+    // cache maintenance in the response message but need to flush
+    // CPU caches to ensure any previously written dirty lines are
+    // written out before writes from the DSP start.
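+    // Note: the buffer order here (weights, activations, expert IDs, output)
+    // mirrors req.src0/src1/src2/dst; the DSP side is expected to patch the
+    // htp_tensor data pointers (sent as 0, "updated by the receiver") from
+    // these descriptors in the same order.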
+ bufs[3].fd = dst_buf->fd; + bufs[3].ptr = dst->data; + bufs[3].offset = (uint8_t *) dst->data - dst_buf->base; + bufs[3].size = ggml_nbytes(dst); + bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER); + + // Primary DSP session from the src0 (normally weight) tensor + auto sess = src0_buf->sess; + + if (opt_verbose) { + char dims[64 * GGML_MAX_SRC]; + char strides[64 * GGML_MAX_SRC]; + char types[16 * GGML_MAX_SRC]; + char buffs[64 * GGML_MAX_SRC]; + char names[64 * GGML_MAX_SRC]; + + hex_format_op_dims(dims, op); + hex_format_op_types(types, op); + hex_format_op_buffs(buffs, op); + hex_format_op_names(names, op); + + HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), + names, dims, types, strides, buffs, req.flags); + + if (opt_verbose > 1) { + hex_dump_dspbuf(src0, &bufs[0]); + hex_dump_dspbuf(src1, &bufs[1]); + hex_dump_dspbuf(src2, &bufs[2]); + hex_dump_dspbuf(dst, &bufs[3]); + } + } + + if ((opt_opmask & HTP_OPMASK_QUEUE)) { + // Bump pending flag (cleared in the callback once we get the responce) + sess->op_pending++; // atomic inc + + int err = dspqueue_write(sess->queue, + 0, // flags - the framework will autoset this + 4, // number of buffers + bufs, // buffer references + sizeof(req), + (const uint8_t *) &req, // Message + 1000000 // Timeout + ); + + if (err != 0) { + fprintf(stderr, "ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); + exit(1); + } + } + + if (opt_opsync) { + while (sess->op_pending) { + ; + } + } + + t2 = ggml_time_us(); + + HEX_VERBOSE( + "ggml-hex: %s matmul-id %s %u:%u:%u:%u x %s %u:%u:%u:%u (%s %u:%u:%u:%u) -> %s %u:%u:%u:%u : op-usec %u " + "op-cycles %u op-pkts %u (%f) call-usec %llu\n", + sess->name.c_str(), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], + (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], + (uint32_t) src1->ne[3], src2->name, (uint32_t) src2->ne[0], (uint32_t) src2->ne[1], (uint32_t) src2->ne[2], + (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, + (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); +} + +static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { + const struct ggml_tensor * node = op; + const struct ggml_tensor * src0 = node->src[0]; + const struct ggml_tensor * src1 = node->src[1]; + const struct ggml_tensor * dst = node; + + auto src0_buf = static_cast(src0->buffer->context); + auto src1_buf = static_cast(src1->buffer->context); + auto dst_buf = static_cast(dst->buffer->context); + + uint64_t t1 = 0; + uint64_t t2 = 0; + + t1 = ggml_time_us(); + + // Construct HTP message + htp_general_req req; + req.flags = flags; + + // Use opmask to override flags + if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { + req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; + } + if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { + req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; + } + + switch (node->op) { + case GGML_OP_MUL: + req.op = HTP_OP_MUL; + break; + case GGML_OP_ADD: + req.op = HTP_OP_ADD; + break; + case GGML_OP_SUB: + req.op = HTP_OP_SUB; + break; + default: + fprintf(stderr, "ggml-hex: unsupported op:%d\n", node->op); + exit(1); + } + + init_htp_tensor(&req.src0, src0); + init_htp_tensor(&req.src1, src1); + init_htp_tensor(&req.dst, dst); + + dspqueue_buffer bufs[3]; + 
memset(bufs, 0, sizeof(bufs)); + + // First buffer = First Operand of Binary op + // This is a buffer that the CPU writes and the DSP reads, so we'll + // need to flush CPU caches and invalidate DSP ones. On platforms + // with I/O coherency support the framework will automatically skip + // cache operations where possible. + bufs[0].fd = src0_buf->fd; + bufs[0].ptr = src0->data; + bufs[0].offset = (uint8_t *) src0->data - src0_buf->base; + bufs[0].size = ggml_nbytes(src0); + bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP; + + // Second buffer = Second Operand of Binary op + // This is a buffer that the CPU writes and the DSP reads, so we'll + // need to flush CPU caches and invalidate DSP ones. On platforms + // with I/O coherency support the framework will automatically skip + // cache operations where possible. + bufs[1].fd = src1_buf->fd; + bufs[1].ptr = src1->data; + bufs[1].offset = (uint8_t *) src1->data - src1_buf->base; + bufs[1].size = ggml_nbytes(src1); + bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP + + // Third buffer = Output Activations. We'll handle DSP + // cache maintenance in the response message but need to flush + // CPU caches to ensure any previously written dirty lines are + // written out before writes from the DSP start. + bufs[2].fd = dst_buf->fd; + bufs[2].ptr = dst->data; + bufs[2].offset = (uint8_t *) dst->data - dst_buf->base; + bufs[2].size = ggml_nbytes(dst); + bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER); + + // Primary DSP session from the src0 tensor + ggml_hexagon_session * sess = src0_buf->sess; + + if (opt_verbose) { + char dims[64 * GGML_MAX_SRC]; + char strides[16 * GGML_MAX_SRC]; + char types[16 * GGML_MAX_SRC]; + char buffs[64 * GGML_MAX_SRC]; + char names[64 * GGML_MAX_SRC]; + + hex_format_op_dims(dims, op); + hex_format_op_strides(strides, op); + hex_format_op_types(types, op); + hex_format_op_buffs(buffs, op); + hex_format_op_names(names, op); + + HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), + ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags); + if (opt_verbose > 1) { + hex_dump_dspbuf(src0, &bufs[0]); + hex_dump_dspbuf(src1, &bufs[1]); + hex_dump_dspbuf(dst, &bufs[2]); + } + } + + if ((opt_opmask & HTP_OPMASK_QUEUE)) { + // Bump pending flag (cleared in the callback once we get the responce) + sess->op_pending++; // atomic inc + + int err = dspqueue_write(sess->queue, + 0, // flags - the framework will autoset this + 3, // number of buffers + bufs, // buffer references + sizeof(req), + (const uint8_t *) &req, // Message + 1000000); // Timeout + + if (0 != err) { + fprintf(stderr, "ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); + exit(1); + } + } + + if (opt_opsync) { + while (sess->op_pending) { + ; + } + } + + t2 = ggml_time_us(); + + HEX_PROFILE( + "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) " + "call-usec %llu\n", + sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], 
dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, + (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); +} + +static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) { + const struct ggml_tensor * node = op; + const struct ggml_tensor * src0 = node->src[0]; + const struct ggml_tensor * src1 = node->src[1]; + const struct ggml_tensor * src2 = node->src[2]; + const struct ggml_tensor * dst = node; + + auto src0_buf = static_cast(src0->buffer->context); + auto src1_buf = static_cast(src1->buffer->context); + auto src2_buf = static_cast(src2->buffer->context); + auto dst_buf = static_cast(dst->buffer->context); + + uint64_t t1 = 0; + uint64_t t2 = 0; + + t1 = ggml_time_us(); + + // Construct HTP message + htp_general_req req; + req.flags = flags; + + // Use opmask to override flags + if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { + req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; + } + if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { + req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; + } + + switch (node->op) { + case GGML_OP_ADD_ID: + req.op = HTP_OP_ADD_ID; + break; + default: + fprintf(stderr, "ggml-hex: unsupported op:%d\n", node->op); + exit(1); + } + + init_htp_tensor(&req.src0, src0); + init_htp_tensor(&req.src1, src1); + init_htp_tensor(&req.src2, src2); + init_htp_tensor(&req.dst, dst); + + dspqueue_buffer bufs[4]; + memset(bufs, 0, sizeof(bufs)); + + // First buffer = input activations + bufs[0].fd = src0_buf->fd; + bufs[0].ptr = src0->data; + bufs[0].offset = (uint8_t *) src0->data - src0_buf->base; + bufs[0].size = ggml_nbytes(src0); + bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP; + + // Second buffer = experts bias + bufs[1].fd = src1_buf->fd; + bufs[1].ptr = src1->data; + bufs[1].offset = (uint8_t *) src1->data - src1_buf->base; + bufs[1].size = ggml_nbytes(src1); + bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP + + // Third buffer = activated experts + bufs[2].fd = src2_buf->fd; + bufs[2].ptr = src2->data; + bufs[2].offset = (uint8_t *) src2->data - src2_buf->base; + bufs[2].size = ggml_nbytes(src2); + bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP + + // Forth buffer = output activations + bufs[3].fd = dst_buf->fd; + bufs[3].ptr = dst->data; + bufs[3].offset = (uint8_t *) dst->data - dst_buf->base; + bufs[3].size = ggml_nbytes(dst); + bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER); + + // Primary DSP session from the src0 tensor + ggml_hexagon_session * sess = src0_buf->sess; + + if (opt_verbose) { + char dims[64 * GGML_MAX_SRC]; + char strides[16 * GGML_MAX_SRC]; + char types[16 * GGML_MAX_SRC]; + char buffs[64 * GGML_MAX_SRC]; + char names[64 * GGML_MAX_SRC]; + + hex_format_op_dims(dims, op); + hex_format_op_strides(strides, op); + hex_format_op_types(types, op); + hex_format_op_buffs(buffs, op); + hex_format_op_names(names, op); + + HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), + ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags); + + if (opt_verbose > 1) { + 
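// The four descriptors dumped below correspond to: src0 = input activations,
// src1 = per-expert bias rows, src2 = activated expert ids, dst = output.
// Conceptually (a rough sketch, not the DSP kernel itself) ADD_ID computes, for
// every input row r, dst[r] = src0[r] + src1[src2[r]], i.e. an MoE bias add where
// the bias row is selected by the expert id in src2.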
hex_dump_dspbuf(src0, &bufs[0]); + hex_dump_dspbuf(src1, &bufs[1]); + hex_dump_dspbuf(src2, &bufs[2]); + hex_dump_dspbuf(dst, &bufs[3]); + } + } + + if ((opt_opmask & HTP_OPMASK_QUEUE)) { + // Bump pending flag (cleared in the callback once we get the responce) + sess->op_pending++; // atomic inc + + int err = dspqueue_write(sess->queue, + 0, // flags - the framework will autoset this + 4, // number of buffers + bufs, // buffer references + sizeof(req), + (const uint8_t *) &req, // Message + 1000000); // Timeout + + if (0 != err) { + fprintf(stderr, "ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); + exit(1); + } + } + + if (opt_opsync) { + while (sess->op_pending) { + ; + } + } + + t2 = ggml_time_us(); + + HEX_PROFILE( + "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) " + "call-usec %llu\n", + sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, + (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); +} + +static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + const struct ggml_tensor * dst = op; + + uint64_t t1 = 0; + uint64_t t2 = 0; + + t1 = ggml_time_us(); + + // Construct HTP message + htp_general_req req; + + memset(&req, 0, sizeof(htp_general_req)); + memcpy(&req.op_params, &op->op_params, sizeof(op->op_params)); + req.flags = flags; + + bool supported = false; + + switch (op->op) { + case GGML_OP_RMS_NORM: + req.op = HTP_OP_RMS_NORM; + supported = true; + break; + + case GGML_OP_UNARY: + if (ggml_get_unary_op(dst) == GGML_UNARY_OP_SILU) { + req.op = HTP_OP_UNARY_SILU; + supported = true; + } + break; + + case GGML_OP_GLU: + if (ggml_get_glu_op(dst) == GGML_GLU_OP_SWIGLU) { + req.op = HTP_OP_GLU_SWIGLU; + supported = true; + } else if (ggml_get_glu_op(dst) == GGML_GLU_OP_SWIGLU_OAI) { + req.op = HTP_OP_GLU_SWIGLU_OAI; + supported = true; + } + break; + + case GGML_OP_SOFT_MAX: + req.op = HTP_OP_SOFTMAX; + supported = true; + + default: + break; + } + + if (!supported) { + fprintf(stderr, "ggml-hex: unsupported op:%d\n", op->op); + exit(1); + } + + init_htp_tensor(&req.dst, dst); + init_htp_tensor(&req.src0, src0); + if (src1) { + init_htp_tensor(&req.src1, src1); + } + + // Use opmask to override flags + if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { + req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; + } + if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { + req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; + } + + dspqueue_buffer bufs[3]; + int n_bufs = 0; + + memset(bufs, 0, sizeof(bufs)); + + // First buffer = Only Operand of Unary op + // This is a buffer that the CPU writes and the DSP reads, so we'll + // need to flush CPU caches and invalidate DSP ones. On platforms + // with I/O coherency support the framework will automatically skip + // cache operations where possible. 
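// Unlike the fixed-size binary/add-id paths, this handler packs a variable number
// of descriptors: bufs[0] = src0, an optional src1 (for example the softmax mask or
// the second operand of SWIGLU), and the destination always last. n_bufs tracks the
// count and is what gets passed to dspqueue_write below, so RMS_NORM and SILU send
// two buffers while the src1-carrying ops send three.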
+ auto src0_buf = static_cast(src0->buffer->context); + bufs[n_bufs].fd = src0_buf->fd; + bufs[n_bufs].ptr = src0->data; + bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base; + bufs[n_bufs].size = ggml_nbytes(src0); + bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP; + ++n_bufs; + + if (src1) { + // Second buffer = Second Operand of Binary op + // This is a buffer that the CPU writes and the DSP reads, so we'll + // need to flush CPU caches and invalidate DSP ones. On platforms + // with I/O coherency support the framework will automatically skip + // cache operations where possible. + auto src1_buf = static_cast(src1->buffer->context); + bufs[n_bufs].fd = src1_buf->fd; + bufs[n_bufs].ptr = src1->data; + bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base; + bufs[n_bufs].size = ggml_nbytes(src1); + bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP + ++n_bufs; + } + + // Second or third buffer = Output Activations. We'll handle DSP + // Second buffer = Output Activations. We'll handle DSP + // cache maintenance in the response message but need to flush + // CPU caches to ensure any previously written dirty lines are + // written out before writes from the DSP start. + auto dst_buf = static_cast(dst->buffer->context); + bufs[n_bufs].fd = dst_buf->fd; + bufs[n_bufs].ptr = dst->data; + bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base; + bufs[n_bufs].size = ggml_nbytes(dst); + bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER); + ++n_bufs; + + // Primary DSP session from the src0 tensor + ggml_hexagon_session * sess = src0_buf->sess; + + if (opt_verbose) { + char dims[64 * GGML_MAX_SRC]; + char strides[64 * GGML_MAX_SRC]; + char types[16 * GGML_MAX_SRC]; + char buffs[64 * GGML_MAX_SRC]; + char names[64 * GGML_MAX_SRC]; + + hex_format_op_dims(dims, op); + hex_format_op_strides(strides, op); + hex_format_op_types(types, op); + hex_format_op_buffs(buffs, op); + hex_format_op_names(names, op); + + HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), + names, dims, types, strides, buffs, req.flags); + if (opt_verbose > 1) { + hex_dump_dspbuf(src0, &bufs[0]); + if (src1) { + hex_dump_dspbuf(src1, &bufs[1]); + hex_dump_dspbuf(dst, &bufs[2]); + } else { + hex_dump_dspbuf(dst, &bufs[1]); + } + } + } + + if ((opt_opmask & HTP_OPMASK_QUEUE)) { + // Bump pending flag (cleared in the callback once we get the responce) + sess->op_pending++; // atomic inc + + int err = dspqueue_write(sess->queue, + 0, // flags - the framework will autoset this + n_bufs, // number of buffers + bufs, // buffer references + sizeof(req), + (const uint8_t *) &req, // Message + 1000000); // Timeout + + if (0 != err) { + fprintf(stderr, "ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); + exit(1); + } + } + + if (opt_opsync) { + while (sess->op_pending) { + ; + } + } + + t2 = ggml_time_us(); + + if (src1) { + HEX_PROFILE( + "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u " + "(%f) call-usec %llu\n", + sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], 
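// As in the other handlers, the opt_opsync spin above turns every queued op into a
// blocking call, which is mainly useful when correlating these host-side profile
// lines with DSP traces. In the default asynchronous mode op_pending is simply
// bumped per dspqueue_write and cleared again by the response callback once the DSP
// replies, and the backend only spins on it at the end of graph_compute or in
// synchronize().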
src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, + (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); + } else { + HEX_PROFILE( + "ggml-hex: %s %s %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) call-usec " + "%llu\n", + sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, + (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); + } +} + +static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + const struct ggml_tensor * src2 = op->src[2]; + const struct ggml_tensor * dst = op; + + uint64_t t1 = 0; + uint64_t t2 = 0; + + t1 = ggml_time_us(); + + // Construct HTP message + htp_general_req req; + + memset(&req, 0, sizeof(htp_general_req)); + memcpy(&req.op_params, &op->op_params, sizeof(op->op_params)); + req.flags = flags; + req.op = HTP_OP_ROPE; + + init_htp_tensor(&req.dst, dst); + init_htp_tensor(&req.src0, src0); + init_htp_tensor(&req.src1, src1); + if (src2) { + init_htp_tensor(&req.src2, src2); + } + + // Use opmask to override flags + if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { + req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE; + } + if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { + req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; + } + + dspqueue_buffer bufs[4]; + int n_bufs = 0; + + memset(bufs, 0, sizeof(bufs)); + + // First buffer + // This is a buffer that the CPU writes and the DSP reads, so we'll + // need to flush CPU caches and invalidate DSP ones. On platforms + // with I/O coherency support the framework will automatically skip + // cache operations where possible. + auto src0_buf = static_cast(src0->buffer->context); + bufs[n_bufs].fd = src0_buf->fd; + bufs[n_bufs].ptr = src0->data; + bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base; + bufs[n_bufs].size = ggml_nbytes(src0); + bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP; + ++n_bufs; + + // Second buffer + // This is a buffer that the CPU writes and the DSP reads, so we'll + // need to flush CPU caches and invalidate DSP ones. On platforms + // with I/O coherency support the framework will automatically skip + // cache operations where possible. + auto src1_buf = static_cast(src1->buffer->context); + bufs[n_bufs].fd = src1_buf->fd; + bufs[n_bufs].ptr = src1->data; + bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base; + bufs[n_bufs].size = ggml_nbytes(src1); + bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP + ++n_bufs; + + if (src2) { + // Third buffer + // This is a buffer that the CPU writes and the DSP reads, so we'll + // need to flush CPU caches and invalidate DSP ones. On platforms + // with I/O coherency support the framework will automatically skip + // cache operations where possible. 
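// For ROPE the operand roles are: src0 = activations to rotate, src1 = position
// indices, and this optional src2 = per-dimension frequency factors. The rotation
// parameters themselves (dimension count, mode, frequency base/scale and so on) are
// not sent as buffers at all; they travel in req.op_params, copied verbatim from
// op->op_params above.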
+ auto src2_buf = static_cast(src2->buffer->context); + bufs[n_bufs].fd = src2_buf->fd; + bufs[n_bufs].ptr = src2->data; + bufs[n_bufs].offset = (uint8_t *) src2->data - src2_buf->base; + bufs[n_bufs].size = ggml_nbytes(src2); + bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP + ++n_bufs; + } + + // Final buffer = Output Activations. We'll handle DSP + // Second buffer = Output Activations. We'll handle DSP + // cache maintenance in the response message but need to flush + // CPU caches to ensure any previously written dirty lines are + // written out before writes from the DSP start. + auto dst_buf = static_cast(dst->buffer->context); + bufs[n_bufs].fd = dst_buf->fd; + bufs[n_bufs].ptr = dst->data; + bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base; + bufs[n_bufs].size = ggml_nbytes(dst); + bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER); + ++n_bufs; + + // Primary DSP session from the src0 tensor + ggml_hexagon_session * sess = src0_buf->sess; + + if (opt_verbose) { + char dims[64 * GGML_MAX_SRC]; + char strides[64 * GGML_MAX_SRC]; + char types[16 * GGML_MAX_SRC]; + char buffs[64 * GGML_MAX_SRC]; + char names[64 * GGML_MAX_SRC]; + + hex_format_op_dims(dims, op); + hex_format_op_strides(strides, op); + hex_format_op_types(types, op); + hex_format_op_buffs(buffs, op); + hex_format_op_names(names, op); + + HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), + names, dims, types, strides, buffs, req.flags); + if (opt_verbose > 1) { + hex_dump_dspbuf(src0, &bufs[0]); + if (src1) { + hex_dump_dspbuf(src1, &bufs[1]); + hex_dump_dspbuf(dst, &bufs[2]); + } else { + hex_dump_dspbuf(dst, &bufs[1]); + } + } + } + + if ((opt_opmask & HTP_OPMASK_QUEUE)) { + // Bump pending flag (cleared in the callback once we get the responce) + sess->op_pending++; // atomic inc + + int err = dspqueue_write(sess->queue, + 0, // flags - the framework will autoset this + n_bufs, // number of buffers + bufs, // buffer references + sizeof(req), + (const uint8_t *) &req, // Message + 1000000); // Timeout + + if (0 != err) { + fprintf(stderr, "ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); + exit(1); + } + } + + if (opt_opsync) { + while (sess->op_pending) { + ; + } + } + + t2 = ggml_time_us(); + + if (src2) { + HEX_PROFILE( + "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles " + "%u op-pkts %u (%f) call-usec %llu\n", + sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], src2->name, (uint32_t) src2->ne[0], (uint32_t) src2->ne[1], + (uint32_t) src2->ne[2], (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, + (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); + } else { + HEX_PROFILE( + "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u " + "(%f) call-usec %llu\n", + sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) 
src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, + (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1); + } +} + +static const char * ggml_backend_hexagon_name(ggml_backend_t backend) { + auto sess = static_cast(backend->context); + return sess->name.c_str(); +} + +static void ggml_backend_hexagon_free(ggml_backend_t backend) { + auto sess = static_cast(backend->context); + GGML_UNUSED(sess); +} + +static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) { + return (op0 && op0->src[1] == op1->src[1]); +} + +// scan the graph and figure out last compute op index +static inline int last_compute_op(ggml_cgraph * graph) { + int last; + for (int i = 0; i < graph->n_nodes; ++i) { + ggml_tensor * node = graph->nodes[i]; + + switch (node->op) { + case GGML_OP_MUL_MAT: + case GGML_OP_MUL_MAT_ID: + case GGML_OP_MUL: + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_RMS_NORM: + case GGML_OP_GLU: + case GGML_OP_ADD_ID: + last = i; + break; + + default: + break; + } + } + + return last; +} + +static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { + auto sess = static_cast(backend->context); + + HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->name.c_str(), graph->n_nodes); + + const int last = last_compute_op(graph); + + const struct ggml_tensor * prev_quant_op = nullptr; // prev executed op with quantizer + + for (int i = 0; i < graph->n_nodes; ++i) { + ggml_tensor * node = graph->nodes[i]; + + uint32_t flags = 0; + + // skip quantizer if src1 is reused + if (op_reuse_src1(node, prev_quant_op)) { + flags |= HTP_OPFLAGS_SKIP_QUANTIZE; + } + + // ask for early notification for the last Op + if (i == last) { + flags |= HTP_OPFLAGS_EARLY_WAKEUP; + } + + switch (node->op) { + case GGML_OP_MUL_MAT: + ggml_hexagon_mul_mat(node, flags); + prev_quant_op = node; + break; + case GGML_OP_MUL_MAT_ID: + ggml_hexagon_mul_mat_id(node, flags); + prev_quant_op = node; + break; + case GGML_OP_MUL: + case GGML_OP_ADD: + case GGML_OP_SUB: + ggml_hexagon_binary(node, flags); + break; + case GGML_OP_ADD_ID: + ggml_hexagon_add_id(node, flags); + break; + case GGML_OP_RMS_NORM: + ggml_hexagon_unary(node, flags); + break; + case GGML_OP_UNARY: + if (ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) { + ggml_hexagon_unary(node, flags); + } + break; + case GGML_OP_GLU: + if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) || + (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI)) { + ggml_hexagon_unary(node, flags); + } + break; + case GGML_OP_SOFT_MAX: + ggml_hexagon_unary(node, flags); + break; + + case GGML_OP_ROPE: + ggml_hexagon_rope(node, flags); + break; + + // non-compute ops + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + + default: + fprintf(stderr, "\nggml-hex: %s not supported\n", ggml_op_desc(node)); + fflush(stderr); + GGML_ASSERT(false); + } + } + + // Wait until all pending ops complete + while (sess->op_pending) { + ; + } + + return GGML_STATUS_SUCCESS; +} + +static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) { + auto sess = static_cast(backend->context); + + HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->name.c_str()); + + // Wait until all pending ops complete 
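// This is the same completion mechanism used at the end of graph_compute above:
// op_pending is the only completion signal the session exposes, so synchronize()
// simply spins until the response callback has cleared every outstanding op. This
// is also what backs the async capability advertised in get_props further down.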
+ while (sess->op_pending) { + ; + } +} + +static struct ggml_backend_i hexagon_backend_i = { + /* .get_name = */ ggml_backend_hexagon_name, + /* .free = */ ggml_backend_hexagon_free, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ ggml_backend_hexagon_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_hexagon_graph_compute, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* .graph_optimize = */ NULL, +}; + +static ggml_guid_t ggml_backend_hexagon_guid() { + static ggml_guid guid = { 0x7b, 0x57, 0xdc, 0xaf, 0xde, 0x12, 0x1d, 0x49, + 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11 }; + return &guid; +} + +bool ggml_backend_is_hexagon(ggml_backend_t backend) { + return backend && backend->iface.get_name == ggml_backend_hexagon_name; +} + +// device interface + +static ggml_backend_t ggml_backend_hexagon_device_init(ggml_backend_dev_t dev, const char * params) { + auto sess = static_cast(dev->context); + + return new ggml_backend{ + /* .guid = */ ggml_backend_hexagon_guid(), + /* .interface = */ hexagon_backend_i, + /* .device = */ dev, + /* .context = */ sess, + }; + + GGML_UNUSED(params); +} + +static const char * ggml_backend_hexagon_device_get_name(ggml_backend_dev_t dev) { + auto sess = static_cast(dev->context); + return sess->name.c_str(); + + GGML_UNUSED(dev); +} + +static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev_t dev) { + return "Hexagon"; + GGML_UNUSED(dev); +} + +static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + // ~2GB per session for now + *free = 2ULL * 1024 * 1024 * 1024; + *total = *free; + + GGML_UNUSED(dev); +} + +static enum ggml_backend_dev_type ggml_backend_hexagon_device_get_type(ggml_backend_dev_t dev) { + return GGML_BACKEND_DEVICE_TYPE_GPU; + + GGML_UNUSED(dev); +} + +static void ggml_backend_hexagon_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_hexagon_device_get_name(dev); + props->description = ggml_backend_hexagon_device_get_description(dev); + props->type = ggml_backend_hexagon_device_get_type(dev); + ggml_backend_hexagon_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ true, + /* .host_buffer = */ (bool) opt_hostbuf, + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, + }; +} + +static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_buffer_type(ggml_backend_dev_t dev) { + auto sess = static_cast(dev->context); + return &sess->buffer_type; +} + +static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_repack_buffer_type(ggml_backend_dev_t dev) { + auto sess = static_cast(dev->context); + return &sess->repack_buffer_type; +} + +static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + auto sess = static_cast(dev->context); + + bool supp = false; + + switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + supp = true; + break; + + case GGML_OP_MUL_MAT: + supp = ggml_hexagon_supported_mul_mat(sess, op); + break; + + case GGML_OP_MUL_MAT_ID: + supp = ggml_hexagon_supported_mul_mat_id(sess, op); + break; + + case GGML_OP_MUL: + case GGML_OP_ADD: + case GGML_OP_SUB: + supp = 
ggml_hexagon_supported_binary(sess, op); + break; + + case GGML_OP_ADD_ID: + supp = ggml_hexagon_supported_add_id(sess, op); + break; + + case GGML_OP_RMS_NORM: + supp = ggml_hexagon_supported_unary(sess, op); + break; + + case GGML_OP_SOFT_MAX: + supp = ggml_hexagon_supported_softmax(sess, op); + break; + + case GGML_OP_UNARY: + if (ggml_get_unary_op(op) == GGML_UNARY_OP_SILU) { + supp = ggml_hexagon_supported_activations(sess, op); + } + break; + + case GGML_OP_GLU: + if ((ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU) /* || (ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU_OAI) */) { + supp = ggml_hexagon_supported_activations(sess, op); + } + break; + + case GGML_OP_ROPE: + supp = ggml_hexagon_supported_rope(sess, op); + break; + + default: + break; + } + + if (opt_verbose) { + char dims[64 * GGML_MAX_SRC]; + char strides[64 * GGML_MAX_SRC]; + char types[16 * GGML_MAX_SRC]; + char buffs[64 * GGML_MAX_SRC]; + char names[64 * GGML_MAX_SRC]; + + hex_format_op_dims(dims, op); + hex_format_op_strides(strides, op); + hex_format_op_types(types, op); + hex_format_op_buffs(buffs, op); + hex_format_op_names(names, op); + + HEX_VERBOSE("ggml-hex: %s device-supports-op %s : %s : %s : %s : %s : %s : (%d)\n", sess->name.c_str(), + ggml_op_name(op->op), names, dims, types, strides, buffs, (int) supp); + } + + return supp; + + GGML_UNUSED(dev); +} + +static bool ggml_backend_hexagon_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + if (buft->iface.get_alignment != ggml_backend_hexagon_buffer_type_get_alignment) { + return false; + } + + auto s0 = static_cast(dev->context); + auto s1 = static_cast(buft->context)->sess; + + // Need session/domain-id for buffers to be compatible + bool supp = (s0->session_id == s1->session_id); + + HEX_VERBOSE("ggml-hex: %s device-supports-buft %s (%d)\n", s0->name.c_str(), s1->name.c_str(), (int) supp); + + return supp; +} + +static ggml_backend_buffer_type_t * ggml_backend_hexagon_device_get_extra_buffers_type(ggml_backend_dev_t dev) { + auto s0 = static_cast(dev->context); + HEX_VERBOSE("ggml-hex: device-get-extra-buft : %s \n", s0->name.c_str()); + + static ggml_backend_buffer_type_t bufts[2]; + bufts[0] = ggml_backend_hexagon_device_get_repack_buffer_type(dev); + bufts[1] = NULL; + return bufts; +} + +static const struct ggml_backend_device_i ggml_backend_hexagon_device_i = { + /* .get_name = */ ggml_backend_hexagon_device_get_name, + /* .get_description = */ ggml_backend_hexagon_device_get_description, + /* .get_memory = */ ggml_backend_hexagon_device_get_memory, + /* .get_type = */ ggml_backend_hexagon_device_get_type, + /* .get_props = */ ggml_backend_hexagon_device_get_props, + /* .init_backend = */ ggml_backend_hexagon_device_init, + /* .get_buffer_type = */ ggml_backend_hexagon_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, // ggml_backend_hexagon_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ NULL, // ggml_backend_hexagon_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_hexagon_device_supports_op, + /* .supports_buft = */ ggml_backend_hexagon_device_supports_buft, + /* .offload_op = */ NULL, // ggml_backend_hexagon_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +//** backend registry + +#define GGML_HEXAGON_MAX_SESSIONS 16 + +struct ggml_hexagon_registry { + ggml_hexagon_registry(ggml_backend_reg_t reg); + ~ggml_hexagon_registry(); + + ggml_backend_device devices[GGML_HEXAGON_MAX_SESSIONS]; +}; + 
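// For orientation, a minimal usage sketch of how this registry is reached through
// the generic ggml-backend API (assuming the standard registry helpers; error
// handling omitted, and the loop body is illustrative only):
//
//   ggml_backend_reg_t reg = ggml_backend_reg_by_name("HTP");
//   if (reg != NULL) {
//       for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
//           ggml_backend_dev_t dev     = ggml_backend_reg_dev_get(reg, i);
//           ggml_backend_t     backend = ggml_backend_dev_init(dev, NULL);
//           // ... build a graph and run it with ggml_backend_graph_compute(backend, graph)
//       }
//   }
//
// Each device index maps onto one ggml_hexagon_session created in the constructor
// below, and the device count is controlled by the GGML_HEXAGON_NDEV environment
// variable handled in ggml_hexagon_init().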
+ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { + fprintf(stderr, "ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev %zu\n", opt_ndev); + + if (!opt_arch) { + int err = get_hex_arch_ver(CDSP_DOMAIN_ID, &opt_arch); + if (err != 0) { + fprintf(stderr, "ggml-hex: failed to query HTP version (err %d) defaulting to v73\n", err); + opt_arch = 73; + } + } + + fprintf(stderr, "ggml-hex: Hexagon Arch version v%d\n", opt_arch); + + // Create devices / sessions + for (size_t i = 0; i < opt_ndev; i++) { + devices[i].iface = ggml_backend_hexagon_device_i; + devices[i].reg = reg; + devices[i].context = new ggml_hexagon_session(i); + } +} + +ggml_hexagon_registry::~ggml_hexagon_registry() { + fprintf(stderr, "ggml-hex: releasing registry\n"); + + // Release devices / sessions + for (size_t i = 0; i < opt_ndev; i++) { + auto sess = static_cast(devices[i].context); + delete sess; + } +} + +static const char * ggml_backend_hexagon_reg_get_name(ggml_backend_reg_t reg) { + return "HTP"; + GGML_UNUSED(reg); +} + +static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg) { + return opt_ndev; + GGML_UNUSED(reg); +} + +static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t reg, size_t index) { + auto hreg = static_cast(reg->context); + + if (index >= opt_ndev) { + return NULL; + } + + return &hreg->devices[index]; +} + +static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, const char * name) { + if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) { + ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_hexagon_device_get_extra_buffers_type; + return (void *) fct; + } + + return NULL; +} + +static void ggml_hexagon_init(ggml_backend_reg * reg) { + // Basic sanity checks to make sure definitions match + static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0, + "please update hexagon_type to match ggml_type"); + static_assert((unsigned int) HTP_TYPE_Q8_0 == (unsigned int) GGML_TYPE_Q8_0, + "please update hexagon_type to match ggml_type"); + static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4, + "please update hexagon_type to match ggml_type"); + + const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE"); + const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF"); + + opt_verbose = str_verbose ? atoi(str_verbose) : 0; + opt_profile = getenv("GGML_HEXAGON_PROFILE") != nullptr; + opt_etm = getenv("GGML_HEXAGON_ETM") != nullptr; + opt_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL") != nullptr; + + const char * str_opmask = getenv("GGML_HEXAGON_OPMASK"); + if (str_opmask != nullptr) { + opt_opmask = strtoul(str_opmask, NULL, 0); + } + opt_opsync = getenv("GGML_HEXAGON_OPSYNC") != nullptr; + + const char * str_ndev = getenv("GGML_HEXAGON_NDEV"); + if (str_ndev) { + opt_ndev = strtoul(str_ndev, NULL, 0); + if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) { + opt_ndev = GGML_HEXAGON_MAX_SESSIONS; + } + } + + const char * str_nhvx = getenv("GGML_HEXAGON_NHVX"); + if (str_nhvx) { + opt_nhvx = strtoul(str_nhvx, NULL, 0); + } + + const char * str_arch = getenv("GGML_HEXAGON_ARCH"); + if (str_arch) { + if (str_arch[0] == 'v') { + str_arch++; + } + opt_arch = strtoul(str_arch, NULL, 0); + } + + opt_hostbuf = str_hostbuf ? 
atoi(str_hostbuf) : 1; + + reg->context = new ggml_hexagon_registry(reg); + + HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req), + sizeof(struct htp_general_rsp)); +} + +static const struct ggml_backend_reg_i ggml_backend_hexagon_reg_i = { + /* .get_name = */ ggml_backend_hexagon_reg_get_name, + /* .get_device_count = */ ggml_backend_hexagon_reg_get_device_count, + /* .get_device = */ ggml_backend_hexagon_reg_get_device, + /* .get_proc_address = */ ggml_backend_hexagon_get_proc_address, +}; + +ggml_backend_reg_t ggml_backend_hexagon_reg(void) { + static bool initialized = false; + + static ggml_backend_reg reg = { /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_hexagon_reg_i, + /* .context = */ NULL }; + + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { + ggml_hexagon_init(®); + } + + initialized = true; + } + + return ® +} + +GGML_BACKEND_DL_IMPL(ggml_backend_hexagon_reg) diff --git a/ggml/src/ggml-hexagon/htp-utils.c b/ggml/src/ggml-hexagon/htp-utils.c new file mode 100644 index 00000000000..0b31438bbb5 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp-utils.c @@ -0,0 +1,442 @@ + +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wsign-compare" + +#include "htp-utils.h" + +#include +#include +#include +#include +#include +#include +#include + +domain * get_domain(int domain_id) { + int i = 0; + int size = sizeof(supported_domains) / sizeof(domain); + + for (i = 0; i < size; i++) { + if (supported_domains[i].id == domain_id) { + return &supported_domains[i]; + } + } + + return NULL; +} + +bool is_valid_domain_id(int domain_id, int compute_only) { + int i = 0; + int size = sizeof(supported_domains) / sizeof(domain); + + if (compute_only) { + return is_CDSP(domain_id); + } + + for (i = 0; i < size; i++) { + if (supported_domains[i].id == domain_id) { + return true; + } + } + + return false; +} + +int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info) { + int nErr = AEE_SUCCESS; + int ss_info = 0; + if (domain_type != NULL) { + if (strcmp(domain_type, "LPASS") == 0) { + ss_info = FASTRPC_LPASS; + } else if (strcmp(domain_type, "HPASS") == 0) { + ss_info = FASTRPC_HPASS; + } else { + ss_info = FASTRPC_NSP; + } + } + system_req_payload req = { 0 }; + req.id = FASTRPC_GET_DOMAINS; + req.sys.domains = NULL; + fastrpc_domain * domain = NULL; + if (ss_info != 0) { + req.sys.flags = DOMAINS_LIST_FLAGS_SET_TYPE(req.sys.flags, ss_info); + } else { + req.sys.flags = 0; + } +#ifdef _WIN32 + nErr = AEE_EUNSUPPORTED; + goto bail; +#endif + if (remote_system_request) { + nErr = remote_system_request(&req); + if (nErr != AEE_SUCCESS) { + printf("Failure in remote_system_request call: %d.\n", nErr); + goto bail; + } + // Allocate memory for domain-info array + req.sys.max_domains = req.sys.num_domains; + if ((req.sys.domains = calloc(req.sys.num_domains, sizeof(fastrpc_domain))) == NULL) { + nErr = AEE_ENOMEMORY; + printf("Unable to allocate memory for req.sys.domains"); + goto bail; + } + + nErr = remote_system_request(&req); + if (nErr != AEE_SUCCESS) { + printf("Failure in remote_system_request call: %d.\n", nErr); + goto bail; + } + + for (int i = 0; i < req.sys.num_domains; i++) { + // Verify that only requested type domains were returned + domain = &req.sys.domains[i]; + if (domain->type != ss_info && domain_type != NULL) { + nErr = -1; + 
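// Note: this type check only applies when a specific subsystem (LPASS/HPASS/NSP)
// was requested; with domain_type == NULL the flags stay 0 and every discovered
// domain passes. The list itself comes from the two-step remote_system_request
// pattern above: the first call reports num_domains, then the array is allocated
// and the second call fills it in.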
printf("Incorrect data received from remote_system_request.\n"); + goto bail; + } + } + *domains_info = req.sys.domains; + *num_domains = req.sys.num_domains; + } else { + nErr = AEE_EUNSUPPORTED; + goto bail; + } +bail: + if (nErr && !req.sys.domains) { + free(req.sys.domains); + } + return nErr; +} + +int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id) { + int err = 0; + remote_rpc_effective_domain_id_t sess = { 0 }; + + sess.domain_name = domain_name; + sess.domain_name_len = strlen(domain_name); + sess.session_id = session_id; + + err = remote_session_control(FASTRPC_GET_EFFECTIVE_DOMAIN_ID, &sess, sizeof(sess)); + if (err) { + printf("Error 0x%x: failed to get effective domain id for %s, session id %d\n", err, sess.domain_name, + session_id); + return err; + } + + *effec_domain_id = sess.effective_domain_id; + return err; +} + +int get_dsp_support(int * domain) { + int nErr = AEE_SUCCESS; + *domain = CDSP_DOMAIN_ID; // DSP domain default value is CDSP_DOMAIN_ID + + if (remote_handle_control) { + struct remote_dsp_capability dsp_capability_domain = { CDSP_DOMAIN_ID, DOMAIN_SUPPORT, 0 }; + nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); + if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + printf("\nFastRPC Capability API is not supported on this device\n"); + goto bail; + } + + if (dsp_capability_domain.capability == 0) { + dsp_capability_domain.domain = ADSP_DOMAIN_ID; // Check for ADSP support. + dsp_capability_domain.attribute_ID = DOMAIN_SUPPORT; + dsp_capability_domain.capability = 0; + nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, + sizeof(struct remote_dsp_capability)); + if (dsp_capability_domain.capability) { + *domain = ADSP_DOMAIN_ID; // For targets like Agatti (not having cDSP), domain is ADSP_DOMAIN_ID + } + } + + if (nErr != AEE_SUCCESS) { + printf("\nget_dsp_support failed with Error 0x%x\n", nErr); + goto bail; + } + } else { + nErr = AEE_EUNSUPPORTEDAPI; + printf("remote_dsp_capability interface is not supported on this device\n"); + } + +bail: + return nErr; +} + +int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr) { + int nErr = AEE_SUCCESS; + *capability = 0; + + if (attr == VTCM_PAGE || attr == VTCM_COUNT) { + } else { + nErr = AEE_EBADPARM; + printf("Unsupported attr. 
Only VTCM_PAGE and VTCM_COUNT supported\n"); + goto bail; + } + if (remote_handle_control) { + if (domain == ADSP_DOMAIN_ID || domain == CDSP_DOMAIN_ID) { + /* + * Query the DSP for VTCM information + * Since the ADSP does not have a dedicated VTCM, we expect the output to be 0 + */ + struct remote_dsp_capability dsp_capability_vtcm_dsp; + dsp_capability_vtcm_dsp.domain = (uint32_t) domain; + dsp_capability_vtcm_dsp.attribute_ID = attr; + dsp_capability_vtcm_dsp.capability = (uint32_t) 0; + nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp, + sizeof(struct remote_dsp_capability)); + if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + printf("\nFastRPC Capability API is not supported on this device\n"); + printf("Running the usecase without checking the capability\n"); + nErr = AEE_SUCCESS; + goto bail; + } else if (nErr == AEE_SUCCESS) { + *capability = dsp_capability_vtcm_dsp.capability; + } else { + printf("\nget_vtcm_info failed with Error 0x%x\n", nErr); + goto bail; + } + } else { + nErr = AEE_EUNSUPPORTED; + printf("Unsupported domain %d\n", domain); + goto bail; + } + } else { + nErr = AEE_EUNSUPPORTEDAPI; + printf("remote_dsp_capability interface is not supported on this device\n"); + } + +bail: + return nErr; +} + +bool is_unsignedpd_supported(int domain_id) { + int nErr = AEE_SUCCESS; + if (remote_handle_control) { + struct remote_dsp_capability dsp_capability_domain = { domain_id, UNSIGNED_PD_SUPPORT, 0 }; + nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); + if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + printf("\nFastRPC Capability API is not supported on this device. Falling back to signed pd.\n"); + return false; + } + if (nErr) { + printf("\nERROR 0x%x: FastRPC Capability API failed. Falling back to signed pd.", nErr); + return false; + } + if (dsp_capability_domain.capability == 1) { + return true; + } + } else { + nErr = AEE_EUNSUPPORTEDAPI; + printf("remote_dsp_capability interface is not supported on this device. 
Falling back to signed pd.\n"); + return false; + } + return false; +} + +bool get_unsignedpd_support(void) { + return is_unsignedpd_supported(CDSP_DOMAIN_ID); +} + +bool is_async_fastrpc_supported(int domain) { + int nErr = AEE_SUCCESS; + if (remote_handle_control) { + if (domain == CDSP_DOMAIN_ID) { + /* + * Query the DSP for ASYNC_FASTRPC_SUPPORT information + * Async fastrpc is supported only on CDSP + */ + struct remote_dsp_capability dsp_capability_async_support; + dsp_capability_async_support.domain = (uint32_t) domain; + dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT; + dsp_capability_async_support.capability = (uint32_t) 0; + nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support, + sizeof(struct remote_dsp_capability)); + if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + printf("\nFastRPC Capability API is not supported on this device\n"); + printf("Running the usecase without checking the capability\n"); + nErr = AEE_SUCCESS; + goto bail; + } else if (dsp_capability_async_support.capability == 1) { + return true; + } + if (nErr != AEE_SUCCESS) { + printf("\nis_async_fastrpc_supported failed with Error 0x%x\n", nErr); + goto bail; + } + } else { + nErr = AEE_EUNSUPPORTED; + printf("Async fastrpc is not supported on domain %d\n", domain); + goto bail; + } + } else { + nErr = AEE_EUNSUPPORTEDAPI; + printf("remote_dsp_capability interface is not supported on this device\n"); + } + +bail: + return false; +} + +bool is_status_notification_supported(int domain) { + int nErr = AEE_SUCCESS; + + if (remote_handle_control) { + /* + * Query the DSP for STATUS_NOTIFICATION_SUPPORT information + * DSP User PD status notification Support + */ + struct remote_dsp_capability dsp_capability_status_notification_support; + dsp_capability_status_notification_support.domain = (uint32_t) domain; + dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT; + dsp_capability_status_notification_support.capability = (uint32_t) 0; + nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support, + sizeof(struct remote_dsp_capability)); + if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + printf("\nFastRPC Capability API is not supported on this device\n"); + printf("Running the usecase without checking the capability\n"); + nErr = AEE_SUCCESS; + goto bail; + } else if (dsp_capability_status_notification_support.capability == 1) { + return true; + } + if (nErr != AEE_SUCCESS) { + printf("\nis_status_notification_supported failed with Error 0x%x\n", nErr); + goto bail; + } + } else { + nErr = AEE_EUNSUPPORTEDAPI; + printf("remote_dsp_capability interface is not supported on this device\n"); + } + +bail: + return false; +} + +int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) { + int nErr = AEE_SUCCESS; + *capability = 0; + + if (attr != HMX_SUPPORT_SPATIAL && attr != HMX_SUPPORT_DEPTH) { + nErr = AEE_EBADPARM; + printf("Unsupported attr. 
Only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported\n"); + goto bail; + } + if (remote_handle_control) { + if (domain == CDSP_DOMAIN_ID) { + /* + * Query the DSP for HMX SUPPORT information + * HMX is supported on CDSP only + */ + struct remote_dsp_capability dsp_capability_hmx_dsp; + dsp_capability_hmx_dsp.domain = (uint32_t) domain; + dsp_capability_hmx_dsp.attribute_ID = attr; + dsp_capability_hmx_dsp.capability = (uint32_t) 0; + nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp, + sizeof(struct remote_dsp_capability)); + if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + printf("\nFastRPC Capability API is not supported on this device\n"); + printf("Running the usecase without checking the capability\n"); + nErr = AEE_SUCCESS; + goto bail; + } else if (nErr == AEE_SUCCESS) { + *capability = dsp_capability_hmx_dsp.capability; + } else { + printf("\nget_hmx_support_info failed with Error 0x%x\n", nErr); + goto bail; + } + } else { + nErr = AEE_EUNSUPPORTED; + printf("HMX support is not there for domain %d\n", domain); + goto bail; + } + } else { + nErr = AEE_EUNSUPPORTEDAPI; + printf("remote_dsp_capability interface is not supported on this device\n"); + } + +bail: + return nErr; +} + +int get_hex_arch_ver(int domain, int * arch) { + if (!remote_handle_control) { + fprintf(stderr, "ggml-hex: remote_handle_control is not supported on this device\n"); + return AEE_EUNSUPPORTEDAPI; + } + + struct remote_dsp_capability arch_ver; + arch_ver.domain = (uint32_t) domain; + arch_ver.attribute_ID = ARCH_VER; + arch_ver.capability = (uint32_t) 0; + + int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &arch_ver, sizeof(arch_ver)); + if ((err & 0xff) == (AEE_EUNSUPPORTEDAPI & 0xff)) { + fprintf(stderr, "ggml-hex: FastRPC capability API is not supported on this device\n"); + return AEE_EUNSUPPORTEDAPI; + } + + if (err != AEE_SUCCESS) { + fprintf(stderr, "ggml-hex: FastRPC capability query failed (err %d)\n", err); + return err; + } + + switch (arch_ver.capability & 0xff) { + case 0x73: + *arch = 73; + return 0; + case 0x75: + *arch = 75; + return 0; + case 0x79: + *arch = 79; + return 0; + case 0x81: + *arch = 81; + return 0; + } + return -1; +} + +int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr) { + int nErr = AEE_SUCCESS; + *capability = 0; + + if (remote_handle_control) { + if (domain == CDSP_DOMAIN_ID) { + /* + * Query the DSP for HVX SUPPORT information + * HVX is supported on CDSP only + */ + struct remote_dsp_capability dsp_capability_hvx_dsp; + dsp_capability_hvx_dsp.domain = (uint32_t) domain; + dsp_capability_hvx_dsp.attribute_ID = attr; + dsp_capability_hvx_dsp.capability = (uint32_t) 0; + nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp, + sizeof(struct remote_dsp_capability)); + if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + printf("\nFastRPC Capability API is not supported on this device\n"); + printf("Running the usecase without checking the capability\n"); + nErr = AEE_SUCCESS; + goto bail; + } else if (nErr == AEE_SUCCESS) { + *capability = dsp_capability_hvx_dsp.capability; + } else { + printf("\nget_hvx_support_info failed with Error 0x%x\n", nErr); + goto bail; + } + } else { + nErr = AEE_EUNSUPPORTED; + printf("HVX support is not available on domain %d\n", domain); + goto bail; + } + } else { + nErr = AEE_EUNSUPPORTEDAPI; + printf("remote_dsp_capability interface is not supported on this device\n"); + } + +bail: + return nErr; +} diff --git a/ggml/src/ggml-hexagon/htp-utils.h 
b/ggml/src/ggml-hexagon/htp-utils.h new file mode 100644 index 00000000000..caba3e174a2 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp-utils.h @@ -0,0 +1,219 @@ +#ifndef HTP_UTILS_H +#define HTP_UTILS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include + +/* Offset to differentiate HLOS and Hexagon error codes. + Stores the value of AEE_EOFFSET for Hexagon. */ +#ifndef DSP_OFFSET +# define DSP_OFFSET 0x80000400 +#endif + +/* Errno for connection reset by peer. */ +#ifndef ECONNRESET +# ifdef __hexagon__ +# define ECONNRESET 104 +# endif +#endif + +/* Abstraction of different OS specific sleep APIs. + SLEEP accepts input in seconds. */ +#ifndef SLEEP +# ifdef __hexagon__ +# define SLEEP(x) \ + { /* Do nothing for simulator. */ \ + } +# else +# ifdef _WINDOWS +# define SLEEP(x) Sleep(1000 * x) /* Sleep accepts input in milliseconds. */ +# else +# define SLEEP(x) sleep(x) /* sleep accepts input in seconds. */ +# endif +# endif +#endif + +/* Include windows specific header files. */ +#ifdef _WINDOWS +# include +# include +# define _CRT_SECURE_NO_WARNINGS 1 +# define _WINSOCK_DEPRECATED_NO_WARNINGS 1 +/* Including this file for custom implementation of getopt function. */ +# include "getopt_custom.h" +#endif + +/* Includes and defines for all HLOS except windows */ +#if !defined(__hexagon__) && !defined(_WINDOWS) +# include "unistd.h" + +# include +#endif + +/* Includes and defines for Hexagon and all HLOS except Windows. */ +#if !defined(_WINDOWS) +/* Weak reference to remote symbol for compilation. */ +# pragma weak remote_session_control +# pragma weak remote_handle_control +# pragma weak remote_handle64_control +# pragma weak fastrpc_mmap +# pragma weak fastrpc_munmap +#endif + +#if !defined(_WINDOWS) +# pragma weak remote_system_request +#endif +/** + * Wrapper for FastRPC Capability API: query DSP support. + * + * @param[out] domain pointer to supported domain. + * @return 0 if query is successful. + * non-zero if error, return value points to the error. + */ +int get_dsp_support(int * domain); + +/** + * Wrapper for FastRPC Capability API: query VTCM information. + * + * @param[in] domain value of domain in the queried. + * @param[out] capability capability value of the attribute queried. + * @param[in] attr value of the attribute to the queried. + * @return 0 if query is successful. + * non-zero if error, return value points to the error. + */ +int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr); + +/** + * Wrapper for FastRPC Capability API: query unsigned pd support on CDSP domain. + * + * @return true if unsigned pd is supported. + * false if unsigned pd is not supported, capability query failed. + */ + +bool get_unsignedpd_support(void); + +/** + * Wrapper for FastRPC Capability API: query unsigned pd support. + * + * @param[in] domain value of domain in the queried. + * @return true if unsigned pd is supported. + * false if unsigned pd is not supported, capability query failed. + */ + +bool is_unsignedpd_supported(int domain_id); + +/** + * is_valid_domain_id API: query a domain id is valid. + * + * @param[in] domain value of domain in the queried. + * @param[in] compute_only value of domain is only compared with CDSP domains supported by the target when enabled. + * @return true if value of domain is valid. + * false if value of domain is not valid. + */ + +bool is_valid_domain_id(int domain_id, int compute_only); + +/** + * get_domain API: get domain struct from domain value. 
+ * + * @param[in] domain value of a domain + * @return Returns domain struct of the domain if it is supported or else + * returns NULL. + * + */ + +domain * get_domain(int domain_id); + +/** + * get_domains_info API: get information for all the domains available on the device + * + * @param[in] domain_type pointer to domain type + * @param[in] num_domains pointer to number of domains + * @param[in] domains_info pointer to save discovered domains information. + * @return 0 if query is successful. + * non-zero if error, return value points to the error. + * + * It is user's responsibility to free the memory used to store the domains info whose address is present in domains_info before closing the application. + * + */ + +int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info); + +/** + * get_effective_domain_id API: get effective domain id for given session id + * + * @param[in] domain_name pointer to domain name + * @param[in] session_id + * @param[in] effec_domain_id pointer to save obtained effective domain id. + * @return 0 if query is successful. + * non-zero if error, return value points to the error. + * + */ + +int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id); + +/** + * is_async_fastrpc_supported API: query a domain id has async fastrpc supported or not + * + * @param[in] domain_id value of a domain + * @return Returns true or false stating support of Async FastRPC + * + */ + +bool is_async_fastrpc_supported(int domain_id); + +/** + * is_status_notification_supported API: query the DSP for STATUS_NOTIFICATION_SUPPORT information + * + * @param[in] domain_id value of a domain + * @return Returns true or false stating status notification support information + * + */ +bool is_status_notification_supported(int domain_id); + +/** + * get_hmx_support_info API: query the DSP for HMX SUPPORT information + * + * @param[in] domain_id value of a domain + * @param[out] capability capability value of the attribute queried. + * @param[in] attr value of the attribute to the queried. + * @return 0 if query is successful. + * non-zero if error, return value points to the error. + * + */ +int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr); + +/** + * get_hex_arch_ver API: query the Hexagon processor architecture version information + * + * @param[in] domain_id value of a domain + * @param[out] Arch version (73, 75, ...) + * @return 0 if query is successful. + * non-zero if error, return value points to the error. + * + */ +int get_hex_arch_ver(int domain, int * arch); + +/** + * get_hvx_support_info API: query the DSP for HVX SUPPORT information + * + * @param[in] domain_id value of a domain + * @param[out] capability capability value of the attribute queried. + * @param[in] attr value of the attribute to the queried. + * @return 0 if query is successful. + * non-zero if error, return value points to the error. 
+ * + */ +int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr); + +#ifdef __cplusplus +} +#endif + +#endif //DSP_CAPABILITIES_UTILS_H diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt new file mode 100644 index 00000000000..9be5df55083 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt @@ -0,0 +1,40 @@ +cmake_minimum_required(VERSION 3.22.2) +project(ggml-htp C CXX ASM) + +include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake) + +include_directories( + ${HEXAGON_SDK_ROOT}/incs + ${HEXAGON_SDK_ROOT}/incs/stddef + ${CMAKE_CURRENT_SOURCE_DIR}/../.. + ${CMAKE_CURRENT_SOURCE_DIR}/.. + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR}) + +set(HTP_LIB ggml-htp-${DSP_VERSION}) + +add_library(${HTP_LIB} SHARED + main.c + htp_iface_skel.c + worker-pool.c + htp-dma.c + hvx-sigmoid.c + hvx-inverse.c + hvx-exp.c + hvx-utils.c + matmul-ops.c + binary-ops.c + unary-ops.c + softmax-ops.c + act-ops.c + rope-ops.c +) + +target_compile_definitions(${HTP_LIB} PRIVATE + $,HTP_DEBUG=1,NDEBUG=1>) + +build_idl(htp_iface.idl ${HTP_LIB}) + +set_target_properties(${HTP_LIB} PROPERTIES EXPORT_COMPILE_COMMANDS ON) + +install(TARGETS ${HTP_LIB}) diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c new file mode 100644 index 00000000000..16044975d92 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -0,0 +1,448 @@ +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" + +#ifdef HTP_DEBUG +# define FARF_HIGH 1 +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "htp-ctx.h" +#include "htp-dma.h" +#include "htp-msg.h" +#include "htp-ops.h" +#include "hvx-utils.h" +#include "ops-utils.h" + +#define htp_act_preamble3 \ + const uint32_t ne00 = src0->ne[0]; \ + const uint32_t ne01 = src0->ne[1]; \ + const uint32_t ne02 = src0->ne[2]; \ + const uint32_t ne03 = src0->ne[3]; \ + \ + const uint32_t ne10 = src1->ne[0]; \ + const uint32_t ne11 = src1->ne[1]; \ + const uint32_t ne12 = src1->ne[2]; \ + const uint32_t ne13 = src1->ne[3]; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb00 = src0->nb[0]; \ + const uint32_t nb01 = src0->nb[1]; \ + const uint32_t nb02 = src0->nb[2]; \ + const uint32_t nb03 = src0->nb[3]; \ + \ + const uint32_t nb10 = src1->nb[0]; \ + const uint32_t nb11 = src1->nb[1]; \ + const uint32_t nb12 = src1->nb[2]; \ + const uint32_t nb13 = src1->nb[3]; \ + \ + const uint32_t nb0 = dst->nb[0]; \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ + const uint32_t nb3 = dst->nb[3]; + +#define htp_act_preamble2 \ + const uint32_t ne00 = src0->ne[0]; \ + const uint32_t ne01 = src0->ne[1]; \ + const uint32_t ne02 = src0->ne[2]; \ + const uint32_t ne03 = src0->ne[3]; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb00 = src0->nb[0]; \ + const uint32_t nb01 = src0->nb[1]; \ + const uint32_t nb02 = src0->nb[2]; \ + const uint32_t nb03 = src0->nb[3]; \ + \ + const uint32_t nb0 = dst->nb[0]; \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ + const uint32_t nb3 = 
dst->nb[3]; + +static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, + const struct htp_tensor * src1, + struct htp_tensor * dst, + const int32_t * op_params, + struct htp_spad * src0_spad, + struct htp_spad * src1_spad, + struct htp_spad * dst_spad, + uint32_t nth, + uint32_t ith, + uint32_t src0_nrows_per_thread) { + htp_act_preamble3; + + size_t src0_row_size = nb01; + size_t src1_row_size = nb11; + size_t dst_row_size = nb1; + + const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + + // no work for this thread + if (src0_start_row >= src0_end_row) { + return; + } + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + int is_aligned = 1; + int opt_path = 0; + if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) { + is_aligned = 0; + FARF(HIGH, "swiglu-f32: unaligned addresses in elementwise op, possibly slower execution\n"); + } + if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { + opt_path = 1; + } + + const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; + const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; + uint8_t * restrict data_dst = (uint8_t *) dst->data; + + bool src1_valid = src1->ne[0]; + if (!src1_valid) { + data_src1 = data_src0; + src1_row_size = src0_row_size; + } + + uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size); + uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size); + uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size); + + const int32_t swapped = op_params[1]; + + const int nc = (src1_valid) ? ne0 : ne0 / 2; + + for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { + const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size)); + const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size)); + float * restrict dst = (float *) (data_dst + (ir * dst_row_size)); + + if (ir + 1 < src0_end_row) { + htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size); + } + + if (!src1_valid) { + src0 += swapped ? nc : 0; + src1 += swapped ? 
0 : nc; + } + + if (1 == opt_path) { + hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, nc); + hvx_mul_mul_f32_opt((const uint8_t *) src0, (const uint8_t *) src0_spad_data, (const uint8_t *) src1, + (uint8_t *) dst, nc); + } else { + hvx_exp_f32((const uint8_t *) src0, src0_spad_data, nc, true); + hvx_add_scalar_f32(src0_spad_data, 1.0, src1_spad_data, nc); + hvx_inverse_f32(src1_spad_data, src0_spad_data, nc); + + hvx_mul_f32((const uint8_t *) src0, src0_spad_data, dst_spad_data, nc); + hvx_mul_f32(dst_spad_data, (const uint8_t *) src1, (uint8_t *) dst, nc); + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "swiglu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, + ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, + const struct htp_tensor * src1, + struct htp_tensor * dst, + const int32_t * op_params, + struct htp_spad * src0_spad, + struct htp_spad * src1_spad, + struct htp_spad * dst_spad, + uint32_t nth, + uint32_t ith, + uint32_t src0_nrows_per_thread) { + htp_act_preamble3; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + const size_t src0_row_size = nb01; + const size_t src1_row_size = nb11; + const size_t dst_row_size = nb1; + + const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + + // no work for this thread + if (src0_start_row >= src0_end_row) { + return; + } + + if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) { + FARF(HIGH, "act-f32: unaligned addresses in activations op, possibly slower execution\n"); + } + + const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; + const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; + uint8_t * restrict data_dst = (uint8_t *) dst->data; + + bool src1_valid = src1->ne[0]; + if (!src1_valid) { + data_src1 = data_src0; + } + + uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size); + uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size); + uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size); + + const int32_t swapped = op_params[1]; + const float alpha = ((const float *) (op_params))[2]; + const float limit = ((const float *) (op_params))[3]; + + const int nc = (src1_valid) ? ne0 : ne0 / 2; + + for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { + const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size)); + const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size)); + float * restrict dst = (float *) (data_dst + (ir * dst_row_size)); + + if (ir + 1 < src0_end_row) { + htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size); + } + + if (!src1) { + src0 += swapped ? nc : 0; + src1 += swapped ? 
0 : nc; + } + + // x (src0_spad_data) = std::min(src0_p[k], limit); + hvx_min_scalar_f32((const uint8_t *) src0, limit, src0_spad_data, nc); + // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit); + hvx_clamp_scalar_f32((const uint8_t *) src1, limit, limit, src1_spad_data, nc); + // y (src1_spad_data) = y1 + 1.f + hvx_add_scalar_f32(src1_spad_data, 1.0, src1_spad_data, nc); + // x1 (dst_spad_data) = alpha * (x) + hvx_mul_scalar_f32(src0_spad_data, alpha, dst_spad_data, nc); + // x2 (dst_spad_data) = expf(-x1) + hvx_exp_f32(dst_spad_data, dst_spad_data, nc, true); + // x3 (dst_spad_data) = x2 + 1.f + hvx_add_scalar_f32(dst_spad_data, 1.0, dst_spad_data, nc); + // x4 (dst_spad_data) = 1 / x3 + hvx_inverse_f32(dst_spad_data, dst_spad_data, nc); + // out_glu(dst_spad_data) = x * x4 + hvx_mul_f32(src0_spad_data, dst_spad_data, dst_spad_data, nc); + // out = out_glu * (y + 1.f); + hvx_mul_f32(dst_spad_data, src1_spad_data, (uint8_t *) dst, nc); + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "swiglu-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, src0->ne[0], + src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], src1->ne[2], + src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +static void unary_silu_fp32_per_thread(const struct htp_tensor * src0, + struct htp_tensor * dst, + const int32_t * op_params, + struct htp_spad * src0_spad, + struct htp_spad * dst_spad, + uint32_t nth, + uint32_t ith, + uint32_t src0_nrows_per_thread) { + htp_act_preamble2; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + const size_t src0_row_size = nb01; + const size_t dst_row_size = nb1; + + const uint32_t src0_nrows = ne01 * ne02 * ne03; + + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + + // no work for this thread + if (src0_start_row >= src0_end_row) { + return; + } + + int is_aligned = 1; + int opt_path = 0; + if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) { + is_aligned = 0; + FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n"); + } + if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { + opt_path = 1; + } + + const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; + uint8_t * restrict data_dst = (uint8_t *) dst->data; + + uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size); + uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size); + + for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { + const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size)); + float * restrict dst = (float *) (data_dst + (ir * dst_row_size)); + + if (ir + 1 < src0_end_row) { + htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size); + } + + if (1 == opt_path) { + hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, ne0); + hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); + } else { + hvx_exp_f32((const uint8_t *) src0, src0_spad_data, ne0, true); + hvx_add_scalar_f32(src0_spad_data, 1.0, dst_spad_data, ne0); + hvx_inverse_f32(dst_spad_data, src0_spad_data, ne0); + + hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "silu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> 
%ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02, + ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +static void unary_silu_fp32(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = (struct htp_ops_context *) data; + unary_silu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, + octx->src0_nrows_per_thread); +} + +static void glu_swiglu_fp32(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = (struct htp_ops_context *) data; + glu_swiglu_fp32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad, + &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread); +} + +static void glu_swiglu_oai_fp32(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = (struct htp_ops_context *) data; + glu_swiglu_oai_fp32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad, + &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread); +} + +static int execute_op_activations_fp32(struct htp_ops_context * octx) { + int err = HTP_STATUS_OK; + + const struct htp_tensor * src0 = &octx->src0; + const struct htp_tensor * src1 = &octx->src1; + struct htp_tensor * dst = &octx->dst; + + if (((src0->ne[0] * SIZEOF_FP32) != src0->nb[1]) || ((dst->ne[0] * SIZEOF_FP32) != dst->nb[1])) { + FARF(ERROR, "Non-contiguous tensors are not supported at this time \n"); + return HTP_STATUS_NO_SUPPORT; + } + + worker_callback_t act_op_func; + const char * op_type = NULL; + + switch (octx->op) { + case HTP_OP_UNARY_SILU: + act_op_func = unary_silu_fp32; + op_type = "silu-f32"; + break; + + case HTP_OP_GLU_SWIGLU: + act_op_func = glu_swiglu_fp32; + op_type = "swiglu-f32"; + break; + + case HTP_OP_GLU_SWIGLU_OAI: + act_op_func = glu_swiglu_oai_fp32; + op_type = "swiglu-oai-f32"; + break; + + default: + FARF(ERROR, "Unsupported activations Op %u\n", octx->op); + return HTP_STATUS_NO_SUPPORT; + } + + const uint32_t n_threads = octx->n_threads; + const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; + + const size_t src0_row_size = src0->nb[1]; + const size_t src1_row_size = src1->ne[0] ? 
src1->nb[1] : src0->nb[1]; + const size_t dst_row_size = dst->nb[1]; + + // VTCM scratchpads for all tensors + // N rows per thread, padded to HVX vector size + octx->dst_spad.size = htp_round_up(dst_row_size, 128) * octx->n_threads; + octx->src0_spad.size = htp_round_up(src0_row_size, 128) * octx->n_threads; + octx->src1_spad.size = htp_round_up(src1_row_size, 128) * octx->n_threads; + + size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; + + if (src1->ne[0]) { + FARF(HIGH, + "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", + op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], + src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, + octx->dst_spad.size); + } else { + FARF(HIGH, "%s: %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size); + } + + // Make sure the reserved vtcm size is sufficient + if (octx->ctx->vtcm_size < spad_size) { + FARF(ERROR, "act-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, + spad_size); + return HTP_STATUS_VTCM_TOO_SMALL; + } + + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; + octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; + + if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { + uint32_t n_jobs = MIN(n_threads, src0_nrows); + + octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; + worker_pool_run_func(octx->ctx->worker_pool, act_op_func, octx, n_jobs); + } + + return err; +} + +int op_activations(struct htp_ops_context * octx) { + int err = HTP_STATUS_OK; + + switch (octx->src0.type) { + case HTP_TYPE_F32: + err = execute_op_activations_fp32(octx); + break; + + default: + err = HTP_STATUS_NO_SUPPORT; + break; + } + + return err; +} diff --git a/ggml/src/ggml-hexagon/htp/binary-ops.c b/ggml/src/ggml-hexagon/htp/binary-ops.c new file mode 100644 index 00000000000..92c0109d287 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/binary-ops.c @@ -0,0 +1,344 @@ +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" + +#ifdef HTP_DEBUG +# define FARF_HIGH 1 +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "htp-ctx.h" +#include "htp-dma.h" +#include "htp-msg.h" +#include "htp-ops.h" +#include "hvx-utils.h" +#include "ops-utils.h" + +typedef void (*hvx_elemwise_f32_func)(const uint8_t * src0, + const uint8_t * src1, + uint8_t * data_dst, + const int num_elems); + +static hvx_elemwise_f32_func func_table_HVX[] = { hvx_mul_f32, hvx_add_f32, hvx_sub_f32 }; +static hvx_elemwise_f32_func func_table_HVX_opt[] = { hvx_mul_f32_opt, hvx_add_f32_opt, hvx_sub_f32_opt }; + +#define htp_binary_preamble \ + const uint32_t ne00 = src0->ne[0]; \ + const uint32_t ne01 = src0->ne[1]; \ + const uint32_t ne02 = src0->ne[2]; \ + const uint32_t ne03 = src0->ne[3]; \ + \ + const uint32_t ne10 = src1->ne[0]; \ + const uint32_t ne11 = src1->ne[1]; \ + const uint32_t ne12 = src1->ne[2]; \ + const uint32_t ne13 = src1->ne[3]; \ + \ + 
const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb00 = src0->nb[0]; \ + const uint32_t nb01 = src0->nb[1]; \ + const uint32_t nb02 = src0->nb[2]; \ + const uint32_t nb03 = src0->nb[3]; \ + \ + const uint32_t nb10 = src1->nb[0]; \ + const uint32_t nb11 = src1->nb[1]; \ + const uint32_t nb12 = src1->nb[2]; \ + const uint32_t nb13 = src1->nb[3]; \ + \ + const uint32_t nb0 = dst->nb[0]; \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ + const uint32_t nb3 = dst->nb[3]; + +static void binary_job_f32_per_thread(const struct htp_tensor * src0, + const struct htp_tensor * src1, + struct htp_tensor * dst, + uint8_t * spad_data, + uint32_t nth, + uint32_t ith, + uint32_t src0_nrows_per_thread, + enum htp_op op) { + htp_binary_preamble; + + const size_t src0_row_size = nb01; + const size_t src1_row_size = nb11; + const size_t dst_row_size = nb1; + + const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + const uint32_t src1_nrows = ne11 * ne12 * ne13; // src1 rows + + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + + // no work for this thread + if (src0_start_row >= src0_end_row) { + return; + } + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + int is_aligned = 1; + int opt_path = 0; + if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) || + (0 == htp_is_aligned((void *) dst->data, VLEN))) { + FARF(HIGH, "binary-f32: unaligned addresses in elementwise op, possibly slower execution\n"); + is_aligned = 0; + } + if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { + opt_path = 1; + } + + hvx_elemwise_f32_func func_HVX = (1 == opt_path) ? 
func_table_HVX_opt[op] : func_table_HVX[op]; + + uint8_t * restrict spad_data_th = spad_data + (ith * src0_row_size); + + const uint32_t nr0 = ne00 / ne10; + + const uint8_t * restrict src0_ptr = (const uint8_t *) src0->data + (src0_start_row * src0_row_size); + uint8_t * restrict dst_ptr = (uint8_t *) dst->data + (src0_start_row * dst_row_size); + + const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; + const uint8_t * restrict src1_ptr = NULL; + + for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { + src1_ptr = data_src1 + (ir % src1_nrows) * src1_row_size; + + if (ir + 1 < src0_end_row) { + htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size); + if (src1_row_size == src0_row_size) { + htp_l2fetch(src1_ptr, 1, src1_row_size, src1_row_size); + } + } + + if (nr0 > 1) { + if ((1 == is_aligned) && (nr0 == ne00)) { + hvx_bcast_fp32_a(spad_data_th, *(float *) src1_ptr, nr0); + } else { + for (uint32_t r = 0; r < nr0; r++) { + memcpy(spad_data_th + r * nb11, (const uint8_t *) src1_ptr, nb11); + } + } + func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) spad_data_th, (uint8_t *) dst_ptr, ne00); + } else { + func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, (uint8_t *) dst_ptr, ne00); + } + + src0_ptr += src0_row_size; + dst_ptr += dst_row_size; + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "binary-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, + ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +static void binary_add_id_job_f32_per_thread(const struct htp_tensor * src0, + const struct htp_tensor * src1, + const struct htp_tensor * src2, + struct htp_tensor * dst, + uint8_t * spad_data, + uint32_t nth, + uint32_t ith, + uint32_t src0_nrows_per_thread, + hvx_elemwise_f32_func func_HVX) { + htp_binary_preamble; + + const size_t src0_row_size = nb01; + const size_t src1_row_size = nb11; + const size_t dst_row_size = nb1; + + const uint32_t ne02_ne01 = ne02 * ne01; + const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + + // no work for this thread + if (src0_start_row >= src0_end_row) { + return; + } + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) || + (0 == htp_is_aligned((void *) dst->data, VLEN))) { + FARF(HIGH, "add-id-f32: unaligned addresses, possibly slower execution\n"); + } + + const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; + const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; + uint8_t * restrict data_dst = (uint8_t *) dst->data; + + for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { + // src0 indices + const uint32_t i03 = ir / ne02_ne01; + const uint32_t i02 = (ir - i03 * ne02_ne01) / ne01; + const uint32_t i01 = (ir - i03 * ne02_ne01 - i02 * ne01); + + // src1 indices + const int i11 = *(int32_t *) ((char *) src2->data + i01 * src2->nb[0] + i02 * src2->nb[1]); + assert(i11 >= 0 && i11 < ne11); + + float * restrict dst_ptr = (float *) (data_dst + i03 * nb3 + i02 * nb2 + i01 * nb1); + const float * restrict src0_ptr = (const float *) (data_src0 + i03 * nb03 + i02 * nb02 + i01 * nb01); + const float * restrict src1_ptr = (const float *) (data_src1 + 0 + 0 
+ i11 * nb11); + + if (ir + 1 < src0_end_row) { + htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size); + if (src1_row_size == src0_row_size) { + htp_l2fetch(src1_ptr + ne10, 1, src1_row_size, src1_row_size); + } + } + + const uint32_t nr0 = ne00 / ne10; + if (nr0 > 1) { + for (uint32_t r = 0; r < nr0; r++) { + memcpy(spad_data + r * nb10, (const uint8_t *) src1_ptr, nb10); + } + func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) spad_data, (uint8_t *) dst_ptr, ne00); + } else { + func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, (uint8_t *) dst_ptr, ne00); + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "add-id-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", ith, nth, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], + src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], dst->ne[1], + dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +static void binary_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = (struct htp_ops_context *) data; + + switch (octx->op) { + case HTP_OP_MUL: + case HTP_OP_ADD: + case HTP_OP_SUB: + binary_job_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->src1_spad.data, n, i, + octx->src0_nrows_per_thread, octx->op); + break; + + case HTP_OP_ADD_ID: + binary_add_id_job_f32_per_thread(&octx->src0, &octx->src1, &octx->src2, &octx->dst, octx->src0_spad.data, n, + i, octx->src0_nrows_per_thread, hvx_add_f32); + break; + + default: + FARF(ERROR, "Unknown Binary Op %u", octx->op); + break; + } +} + +static int execute_op_binary_f32(struct htp_ops_context * octx) { + int err = HTP_STATUS_OK; + + const struct htp_tensor * src0 = &octx->src0; + const struct htp_tensor * src1 = &octx->src1; + struct htp_tensor * dst = &octx->dst; + + worker_callback_t binary_op_func; + const char * op_type = NULL; + + switch (octx->op) { + case HTP_OP_MUL: + binary_op_func = binary_job_dispatcher_f32; + op_type = "mul-f32"; + break; + + case HTP_OP_ADD: + binary_op_func = binary_job_dispatcher_f32; + op_type = "add-f32"; + break; + + case HTP_OP_SUB: + binary_op_func = binary_job_dispatcher_f32; + op_type = "sub-f32"; + break; + + case HTP_OP_ADD_ID: + binary_op_func = binary_job_dispatcher_f32; + op_type = "add-id-f32"; + break; + + default: + FARF(ERROR, "Unsupported binary-Op %u\n", octx->op); + return HTP_STATUS_NO_SUPPORT; + } + + const int n_threads = octx->n_threads; + const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; + + const size_t src0_row_size = src0->nb[1]; + const size_t src1_row_size = src1->nb[1]; + const size_t dst_row_size = dst->nb[1]; + + // VTCM scratchpads for all tensors + octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads; + octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads; + octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads; + + size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; + + FARF(HIGH, + "%s: (%ux%ux%ux%u) * (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", + op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], + src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, + octx->dst_spad.size); + + // Make sure the reserved vtcm size is sufficient + if 
(octx->ctx->vtcm_size < spad_size) { + FARF(ERROR, "binary-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, + octx->ctx->vtcm_size, spad_size); + return HTP_STATUS_VTCM_TOO_SMALL; + } + + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; + octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; + + if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { + uint32_t n_jobs = MIN(n_threads, src0_nrows); + + octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; + + worker_pool_run_func(octx->ctx->worker_pool, binary_op_func, octx, n_jobs); + } + + return err; +} + +int op_binary(struct htp_ops_context * octx) { + int err = HTP_STATUS_OK; + + switch (octx->src0.type) { + case HTP_TYPE_F32: + err = execute_op_binary_f32(octx); + break; + + default: + err = HTP_STATUS_NO_SUPPORT; + break; + } + + return err; +} diff --git a/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake b/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake new file mode 100644 index 00000000000..7fa236e328f --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake @@ -0,0 +1,157 @@ +if (HEXAGON_TOOLCHAIN_INCLUDED) + return() +endif() +set(HEXAGON_TOOLCHAIN_INCLUDED true) + +#Cross Compiling for Hexagon +set(HEXAGON TRUE) +set(CMAKE_SYSTEM_NAME QURT) +set(CMAKE_SYSTEM_PROCESSOR Hexagon) +set(CMAKE_SYSTEM_VERSION "1") #${HEXAGON_PLATFORM_LEVEL}) +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) +set(CUSTOM_RUNELF_PATH "") + +#To fix backward compatibility with EAI addon. +if (NOT HEXAGON_SDK_ROOT) + set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT}) +endif() + +if (NOT HEXAGON_TOOLS_ROOT) + if (DEFINED ENV{HEXAGON_TOOLS_ROOT}) + set(HEXAGON_TOOLS_ROOT $ENV{HEXAGON_TOOLS_ROOT}) + endif() + if(NOT HEXAGON_TOOLS_ROOT) + set(HEXAGON_TOOLS_ROOT $ENV{DEFAULT_HEXAGON_TOOLS_ROOT}) + endif() +endif() + +file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT) +file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT) + +#Get the Binary extension of the Hexagon Toolchain +if(CMAKE_HOST_SYSTEM_NAME STREQUAL Windows) + set(HEXAGON_TOOLCHAIN_SUFFIX .exe) +endif() +message(DEBUG "CMAKE_HOST_SYSTEM_NAME:${CMAKE_HOST_SYSTEM_NAME}") + +include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_arch.cmake) + +set(HEXAGON_TOOLCHAIN ${HEXAGON_TOOLS_ROOT}) +set(HEXAGON_LIB_DIR "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib") +set(HEXAGON_ISS_DIR ${HEXAGON_TOOLCHAIN}/Tools/lib/iss) + +set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES + HEXAGON_SDK_ROOT + HEXAGON_TOOLS_ROOT +) + +#QURT Related includes and linker flags +set(V_ARCH ${HEXAGON_ARCH}) +set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/ADSP${V_ARCH}MP${V_ARCH_EXTN}") +set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/compute${V_ARCH}${V_ARCH_EXTN}") + +if( ${TREE} MATCHES PAKMAN ) + set(_QURT_INSTALL_DIR "${QURT_IMAGE_DIR}/compute${V_ARCH}${V_ARCH_EXTN}") +endif() +message(DEBUG "_QURT_INSTALL_DIR:${_QURT_INSTALL_DIR}") +set(RTOS_DIR ${_QURT_INSTALL_DIR}) +set(QCC_DIR "${HEXAGON_QCC_DIR}/${V_ARCH}/G0") +set(TARGET_DIR "${HEXAGON_LIB_DIR}/${V_ARCH}/G0") + +include_directories( + ${_QURT_INSTALL_DIR}/include + ${_QURT_INSTALL_DIR}/include/qurt + ${_QURT_INSTALL_DIR}/include/posix + ) + +set(QURT_START_LINK_LIBS) +set(QURT_START_LINK_LIBS + "${TARGET_DIR}/init.o" + "${RTOS_DIR}/lib/crt1.o" + "${RTOS_DIR}/lib/debugmon.o" + "${RTOS_DIR}/lib/libqurt.a" + 
"${TARGET_DIR}/libc.a" + "${TARGET_DIR}/libqcc.a" + "${TARGET_DIR}/libhexagon.a" + "${RTOS_DIR}/lib/libqurtcfs.a" + "${RTOS_DIR}/lib/libtimer_island.a" + "${RTOS_DIR}/lib/libtimer_main.a" + "${RTOS_DIR}/lib/libposix.a" + ) +STRING(REPLACE ";" " " QURT_START_LINK_LIBS "${QURT_START_LINK_LIBS}") + +set(QURT_END_LINK_LIBS + ${TARGET_DIR}/fini.o + ) + +#Non QURT related includes and linker flags + +set(TARGET_DIR_NOOS "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/${HEXAGON_ARCH}") + +if (NOT NO_WRAP_MEM_API) + set(WRAP_MALLOC -Wl,--wrap=malloc) + set(WRAP_CALLOC -Wl,--wrap=calloc) + set(WRAP_FREE -Wl,--wrap=free) + set(WRAP_REALLOC -Wl,--wrap=realloc) + set(WRAP_MEMALIGN -Wl,--wrap=memalign) +endif() + +set(PIC_SHARED_LD_FLAGS + -mcpu=${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} + -G0 + -fpic + -Wl,-Bsymbolic + -Wl,-L${TARGET_DIR_NOOS}/G0/pic + -Wl,-L${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/ + -Wl,--no-threads ${WRAP_MALLOC} ${WRAP_CALLOC} ${WRAP_FREE} ${WRAP_REALLOC} ${WRAP_MEMALIGN} + -shared + "-o " + "" + -Wl,--start-group + "" + "" + -Wl,--end-group + -lc + ) +STRING(REPLACE ";" " " PIC_SHARED_LD_FLAGS "${PIC_SHARED_LD_FLAGS}") + +set(HEXAGON_PIC_SHARED_LINK_OPTIONS "${PIC_SHARED_LD_FLAGS}") + +#System include paths +include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs) +include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/stddef) +include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/ipc/fastrpc/incs) + +#LLVM toolchain setup +#Compiler paths, options and architecture +set(CMAKE_C_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang${HEXAGON_TOOLCHAIN_SUFFIX}) +set(CMAKE_CXX_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX}) +set(CMAKE_AR ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-ar${HEXAGON_TOOLCHAIN_SUFFIX}) +set(CMAKE_ASM_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX}) +set(HEXAGON_LINKER ${CMAKE_C_COMPILER}) +set(CMAKE_PREFIX_PATH ${HEXAGON_TOOLCHAIN}/Tools/target/hexagon) + +set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-Wl,-soname,") +set(CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Wl,-soname,") + +#Compiler Options +set(COMMON_FLAGS "-mcpu=hexagon${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -fvectorize -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}") + +set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -D_DEBUG -g") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g") +set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3") + +set(CMAKE_C_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -D_DEBUG -g") +set(CMAKE_C_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g") +set(CMAKE_C_FLAGS_RELEASE "${COMMON_FLAGS} -O3") + +set(CMAKE_ASM_FLAGS_DEBUG "${COMMON_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}") +set(CMAKE_ASM_FLAGS_RELEASE "${COMMON_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}") +set(CMAKE_ASM_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}" ) + +#Linker Options +set(CMAKE_C_CREATE_SHARED_LIBRARY "${HEXAGON_LINKER} ${HEXAGON_PIC_SHARED_LINK_OPTIONS}") +set(CMAKE_CXX_CREATE_SHARED_LIBRARY "${HEXAGON_LINKER} ${HEXAGON_PIC_SHARED_LINK_OPTIONS}") diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h new file mode 100644 index 00000000000..5c3d217f1cc --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -0,0 +1,40 @@ +#ifndef HTP_CTX_H +#define HTP_CTX_H + +#include "htp-dma.h" +#include "worker-pool.h" + +#include +#include +#include +#include + +#define HTP_MAX_NTHREADS 10 + +// FIXME: move these into matmul-ops +#define HTP_SPAD_SRC0_NROWS 16 +#define HTP_SPAD_SRC1_NROWS 16 +#define 
HTP_SPAD_DST_NROWS 2 + +// Main context for htp DSP backend +struct htp_context { + dspqueue_t queue; + dma_queue * dma[HTP_MAX_NTHREADS]; + worker_pool_context_t worker_pool; + uint32_t n_threads; + + int thread_id; + int thread_prio; + + uint8_t * vtcm_base; + size_t vtcm_size; + uint32_t vtcm_rctx; + + atomic_bool vtcm_valid; + atomic_bool vtcm_inuse; + atomic_bool vtcm_needs_release; + + uint32_t opmask; +}; + +#endif /* HTP_CTX_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-dma.c b/ggml/src/ggml-hexagon/htp/htp-dma.c new file mode 100644 index 00000000000..10c54b45ee2 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/htp-dma.c @@ -0,0 +1,69 @@ +#include "htp-dma.h" + +#include +#include +#include + +#pragma clang diagnostic ignored "-Wunused-function" + +static inline uint32_t pow2_ceil(uint32_t x) { + if (x <= 1) { + return 1; + } + int p = 2; + x--; + while (x >>= 1) { + p <<= 1; + } + return p; +} + +dma_queue * dma_queue_create(size_t capacity) { + dma_queue * q = (dma_queue *) memalign(32, sizeof(dma_queue)); + if (q == NULL) { + FARF(ERROR, "%s: failed to allocate DMA queue\n", __FUNCTION__); + return NULL; + } + + capacity = pow2_ceil(capacity); + + memset(q, 0, sizeof(dma_queue)); + q->capacity = capacity; + q->idx_mask = capacity - 1; + + q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t)); + memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t)); + + q->dst = (void **) memalign(4, capacity * sizeof(void *)); + memset(q->dst, 0, capacity * sizeof(void *)); + + q->tail = &q->desc[capacity - 1]; + + if (!q->desc && !q->dst) { + FARF(ERROR, "%s: failed to allocate DMA queue items\n", __FUNCTION__); + return NULL; + } + + FARF(HIGH, "dma-queue: capacity %u\n", capacity); + + return q; +} + +void dma_queue_delete(dma_queue * q) { + if (!q) { + return; + } + free(q->desc); + free(q->dst); + free(q); +} + +void dma_queue_flush(dma_queue * q) { + while (1) { + uint32_t s = dmwait() & 0x3; + if (s == HEXAGON_UDMA_DM0_STATUS_IDLE) { + break; + } + } + q->tail = NULL; +} diff --git a/ggml/src/ggml-hexagon/htp/htp-dma.h b/ggml/src/ggml-hexagon/htp/htp-dma.h new file mode 100644 index 00000000000..4d0d54ce859 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/htp-dma.h @@ -0,0 +1,119 @@ +#ifndef HTP_DMA_H +#define HTP_DMA_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + hexagon_udma_descriptor_type1_t * desc; // descriptor pointers + hexagon_udma_descriptor_type1_t * tail; // tail pointer + void ** dst; // dst pointers + uint32_t push_idx; + uint32_t pop_idx; + uint32_t capacity; + uint32_t idx_mask; +} dma_queue; + +dma_queue * dma_queue_create(size_t capacity); +void dma_queue_delete(dma_queue * q); +void dma_queue_flush(dma_queue * q); + +// TODO: technically we don't need these and could use Q6_dmstart/wait/etc instead +// but those do not seem to always compiler properly. 
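+//
+// Usage sketch for the queue API above (illustrative only, not part of this patch):
+//
+//   dma_queue * q = dma_queue_create(16);                                     // capacity is rounded up to a power of two
+//   dma_queue_push(q, vtcm_dst, ddr_src, dst_row_size, src_row_size, nrows);  // enqueue a 2D row transfer into VTCM
+//   uint8_t * ready = dma_queue_pop(q);                                       // spins on dmpoll() until the descriptor completes, FIFO order
+//   dma_queue_flush(q);                                                       // wait for the DMA engine to go idle
+//   dma_queue_delete(q);
+//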
+static inline void dmstart(void * next) { + asm volatile(" release(%0):at" : : "r"(next)); + asm volatile(" dmstart(%0)" : : "r"(next)); +} + +static inline void dmlink(void * cur, void * next) { + asm volatile(" release(%0):at" : : "r"(next)); + asm volatile(" dmlink(%0, %1)" : : "r"(cur), "r"(next)); +} + +static inline unsigned int dmpoll(void) { + unsigned int ret = 0; + asm volatile(" %0 = dmpoll" : "=r"(ret) : : "memory"); + return ret; +} + +static inline unsigned int dmwait(void) { + unsigned int ret = 0; + asm volatile(" %0 = dmwait" : "=r"(ret) : : "memory"); + return ret; +} + +static inline bool dma_queue_push(dma_queue * q, + void * dst, + const void * src, + size_t dst_row_size, + size_t src_row_size, + size_t nrows) { + if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) { + return false; + } + + hexagon_udma_descriptor_type1_t * desc = &q->desc[q->push_idx]; + + desc->next = NULL; + desc->length = 0; + desc->desctype = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1; + desc->dstbypass = 1; + desc->srcbypass = 1; + desc->order = 0; + desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE; + desc->src = (void *) src; + desc->dst = (void *) dst; + desc->allocation = 0; + desc->padding = 0; + desc->roiwidth = src_row_size; + desc->roiheight = nrows; + desc->srcstride = src_row_size; + desc->dststride = dst_row_size; + desc->srcwidthoffset = 0; + desc->dstwidthoffset = 0; + + q->dst[q->push_idx] = dst; + + dmlink(q->tail, desc); + q->tail = desc; + + // FARF(ERROR, "dma-push: i %u len %u dst %p src %p\n", q->push_idx, len, dst, src); + q->push_idx = (q->push_idx + 1) & q->idx_mask; + return true; +} + +static inline uint8_t * dma_queue_pop(dma_queue * q) { + if (q->push_idx == q->pop_idx) { + return NULL; + } + + hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx]; + + // Wait for desc to complete + while (1) { + dmpoll(); + if (desc->dstate == HEXAGON_UDMA_DESC_DSTATE_COMPLETE) { + break; + } + // FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx); + } + + uint8_t * dst = (uint8_t *) q->dst[q->pop_idx]; + + // FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst); + q->pop_idx = (q->pop_idx + 1) & q->idx_mask; + return dst; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif /* HTP_DMA_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-msg.h b/ggml/src/ggml-hexagon/htp/htp-msg.h new file mode 100644 index 00000000000..f23d5788068 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/htp-msg.h @@ -0,0 +1,156 @@ +#ifndef HTP_MSG_H +#define HTP_MSG_H + +#include + +// ggml-common.h must be included prio to this header + +// Mask to enable various stages of the Ops. +// Used for debugging and profiling. +enum { + HTP_OPMASK_QUEUE = (1 << 0), // Enable Queueing (ie calls into the DSP) + HTP_OPMASK_QUANTIZE = (1 << 1), // Enable Quantize + HTP_OPMASK_COMPUTE = (1 << 2), // Enable Compute +}; + +// Op flags +enum { + HTP_OPFLAGS_SKIP_QUANTIZE = (1 << 0), // Skip dynamic quantization (reuse quantized tensors) + HTP_OPFLAGS_SKIP_COMPUTE = (1 << 1), // Skip actual computation (used for profiling) + HTP_OPFLAGS_EARLY_WAKEUP = (1 << 2) // Send early wakeup notification +}; + +enum htp_status { + HTP_STATUS_OK = 1, + HTP_STATUS_INTERNAL_ERR = 2, + HTP_STATUS_NO_SUPPORT = 3, + HTP_STATUS_INVAL_PARAMS = 4, + HTP_STATUS_VTCM_TOO_SMALL = 5, +}; + +// The values must match the ggml_type. +// Duplicated here because we can't include full ggml.h in the htp build. +// We have some static_asserts in the cpp code to ensure things are in sync. 
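+// For example, a check of this shape (illustrative only; the actual asserts live in
+// ggml-hexagon.cpp, where ggml.h is visible) keeps the two enums from drifting apart:
+//
+//   static_assert((int) HTP_TYPE_Q4_0 == (int) GGML_TYPE_Q4_0, "htp_data_type out of sync with ggml_type");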
+enum htp_data_type { + HTP_TYPE_F32 = 0, + HTP_TYPE_F16 = 1, + HTP_TYPE_Q4_0 = 2, + HTP_TYPE_Q8_0 = 8, + HTP_TYPE_MXFP4 = 39, + HTP_TYPE_COUNT +}; + +// These values are manually translated over to HTP +// !!!! DO NOT ALTER THE ORDER OF THE FIRST FOUR ENUMS !!!! +enum htp_op { + HTP_OP_MUL = 0, + HTP_OP_ADD = 1, + HTP_OP_SUB = 2, + HTP_OP_DIV = 3, + HTP_OP_MUL_MAT = 4, + HTP_OP_MUL_MAT_ID = 5, + HTP_OP_RMS_NORM = 6, + HTP_OP_UNARY_SILU = 7, + HTP_OP_GLU_SWIGLU = 8, + HTP_OP_GLU_SWIGLU_OAI = 9, + HTP_OP_SOFTMAX = 10, + HTP_OP_ADD_ID = 11, + HTP_OP_ROPE = 12, + INVALID +}; + +static inline size_t htp_type_block_size(uint32_t t) { + switch (t) { + case HTP_TYPE_F32: + return 1; + case HTP_TYPE_F16: + return 1; + case HTP_TYPE_Q4_0: + return QK4_0; + case HTP_TYPE_Q8_0: + return QK8_0; + case HTP_TYPE_MXFP4: + return QK_MXFP4; + default: + assert(0 && "unsupported HTP data type"); + } + return 0; +} + +static inline size_t htp_type_nbytes(uint32_t t) { + switch (t) { + case HTP_TYPE_F32: + return 4; + case HTP_TYPE_F16: + return 2; + case HTP_TYPE_Q4_0: + return sizeof(block_q4_0); + case HTP_TYPE_Q8_0: + return sizeof(block_q8_0); + case HTP_TYPE_MXFP4: + return sizeof(block_mxfp4); + default: + assert(0 && "unsupported HTP data type"); + } + return 0; +} + +static const char * htp_type_name(uint32_t t) { + switch (t) { + case HTP_TYPE_F32: + return "fp32"; + case HTP_TYPE_F16: + return "fp16"; + case HTP_TYPE_Q4_0: + return "q4_0"; + case HTP_TYPE_Q8_0: + return "q8_0"; + case HTP_TYPE_MXFP4: + return "mxfp4"; + } + return 0; +} + +// Internal types +#define QK_Q4_0x4x2 256 // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128) +#define QK_Q8_0x4x2 256 // 4x Q8_0 blocks concat with next 4x Q8_0 blocks +#define QK_MXFP4x4x2 256 // 4x MXFP4 blocks concat with next 4x MXFP4 blocks + +#define HTP_MAX_DIMS 4 + +struct htp_tensor { + uint32_t data; // Buffer offset in the messages, and data pointer on the NSP + uint32_t type; // Data type + uint32_t ne[HTP_MAX_DIMS]; // Number of elements + uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor) +}; + +#define HTP_MAX_OP_PARAMS 64 + +struct htp_general_req { + uint32_t op; // GGML/HTP Op + int32_t op_params[HTP_MAX_OP_PARAMS / sizeof(int32_t)]; + // Params for the op, e.g. epsilon of RMS norm + uint32_t flags; // Request flags + + struct htp_tensor src0; // Input0 tensor + struct htp_tensor src1; // Input1 tensor + struct htp_tensor src2; // Input2 tensor + struct htp_tensor dst; // Output tensor + + // should be multiple of 64 bytes (cacheline) +}; + +struct htp_general_rsp { + uint32_t op; // GGML/HTP Op + uint32_t status; // HTP_STATUS_... 
+ uint32_t prof_usecs; // Number of usec per request + uint32_t prof_cycles; // Number of cycles per request + uint32_t prof_pkts; // Number of instruction packets per request + uint8_t unused[44]; // Pad to 64 bytes +}; + +#define HTP_MAX_MESSAGE_SIZE sizeof(struct htp_general_req) +#define HTP_MAX_PACKET_BUFFERS 4 + +#endif /* HTP_MSG_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h new file mode 100644 index 00000000000..45723196791 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -0,0 +1,53 @@ +#ifndef HTP_OPS_H +#define HTP_OPS_H + +#include "htp-ctx.h" +#include "htp-msg.h" +#include "worker-pool.h" + +#include +#include + +// ggml-common.h must be included prior to this header + +struct htp_spad { + uint8_t * data; + size_t size; + size_t size_per_thread; +}; + +struct htp_ops_context { + struct htp_context * ctx; + + enum htp_op op; + int32_t op_params[HTP_MAX_OP_PARAMS / sizeof(int32_t)]; + + struct htp_tensor src0; + struct htp_tensor src1; + struct htp_tensor src2; + struct htp_tensor dst; + + struct htp_spad src0_spad; + struct htp_spad src1_spad; + struct htp_spad src2_spad; + struct htp_spad dst_spad; + + worker_pool_context_t * wpool; // worker pool + uint32_t n_threads; // num threads + + uint32_t src0_nrows_per_thread; + uint32_t src1_nrows_per_thread; + + uint32_t flags; +}; + +int op_matmul(struct htp_ops_context * octx); +int op_matmul_id(struct htp_ops_context * octx); +int op_binary(struct htp_ops_context * octx); +int op_unary(struct htp_ops_context * octx); +int op_activations(struct htp_ops_context * octx); +int op_softmax(struct htp_ops_context * octx); +int op_add_id(struct htp_ops_context * octx); +int op_rope(struct htp_ops_context * octx); + +#endif /* HTP_OPS_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp_iface.idl b/ggml/src/ggml-hexagon/htp/htp_iface.idl new file mode 100644 index 00000000000..9ebd937e46d --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/htp_iface.idl @@ -0,0 +1,16 @@ +// FastRPC IDL interface for GGML HTP + +#ifndef HTP_IDL +#define HTP_IDL + +#include "AEEStdDef.idl" +#include "remote.idl" + +interface htp_iface : remote_handle64 { + AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx); + AEEResult stop(); + AEEResult enable_etm(); + AEEResult disable_etm(); +}; + +#endif /* HTP_IDL */ diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c new file mode 100644 index 00000000000..d2964a04271 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c @@ -0,0 +1,80 @@ +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" + +#include +#include +#include +#include + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "htp-ctx.h" +#include "htp-dma.h" +#include "htp-msg.h" +#include "htp-ops.h" +#include "hvx-utils.h" +#include "ops-utils.h" + +void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) { + int left_over = num_elems & (VLEN_FP32 - 1); + int num_elems_whole = num_elems - left_over; + + int unaligned_addr = 0; + int unaligned_loop = 0; + if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { + FARF(HIGH, "hvx_exp_f32: unaligned address in hvx op, possibly slower execution\n"); + unaligned_addr = 1; + } + // assert((0 == unaligned_addr) || (0 == num_elems_whole)); + if ((1 == unaligned_addr) && (num_elems_whole != 0)) { + 
unaligned_loop = 1; + FARF(HIGH, "hvx_exp_f32: unaligned loop in hvx op, possibly slower execution\n"); + } + + HVX_Vector vec_out = Q6_V_vzero(); + + if (0 == unaligned_loop) { + HVX_Vector * p_vec_in1 = (HVX_Vector *) src; + HVX_Vector * p_vec_out = (HVX_Vector *) dst; + +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + if (true == negate) { + HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++); + *p_vec_out++ = hvx_vec_exp_fp32(neg_vec_in); + } else { + *p_vec_out++ = hvx_vec_exp_fp32(*p_vec_in1++); + } + } + } else { +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); + + if (true == negate) { + HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(neg_vec_in); + } else { + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(in); + } + } + } + + if (left_over > 0) { + const float * srcf = (float *) src + num_elems_whole; + float * dstf = (float *) dst + num_elems_whole; + + HVX_Vector in = *(HVX_UVector *) srcf; + + if (true == negate) { + HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); + + vec_out = hvx_vec_exp_fp32(neg_vec_in); + } else { + vec_out = hvx_vec_exp_fp32(in); + } + + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out); + } +} diff --git a/ggml/src/ggml-hexagon/htp/hvx-inverse.c b/ggml/src/ggml-hexagon/htp/hvx-inverse.c new file mode 100644 index 00000000000..4b4043dcbde --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.c @@ -0,0 +1,60 @@ +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" + +#include +#include +#include +#include + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "htp-ctx.h" +#include "htp-dma.h" +#include "htp-msg.h" +#include "htp-ops.h" +#include "hvx-utils.h" +#include "ops-utils.h" + +void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { + int left_over = num_elems & (VLEN_FP32 - 1); + int num_elems_whole = num_elems - left_over; + + int unaligned_addr = 0; + int unaligned_loop = 0; + if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { + FARF(HIGH, "hvx_inverse_f32: unaligned address in hvx op, possibly slower execution\n"); + unaligned_addr = 1; + } + // assert((0 == unaligned_addr) || (0 == num_elems_whole)); + if ((1 == unaligned_addr) && (num_elems_whole != 0)) { + unaligned_loop = 1; + FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n"); + } + + if (0 == unaligned_loop) { + HVX_Vector * p_vec_in = (HVX_Vector *) src; + HVX_Vector * p_vec_out = (HVX_Vector *) dst; + +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + *p_vec_out++ = hvx_vec_inverse_fp32(*p_vec_in++); + } + } else { +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32(in); + } + } + + if (left_over > 0) { + const float * srcf = (float *) src + num_elems_whole; + float * dstf = (float *) dst + num_elems_whole; + + HVX_Vector in = *(HVX_UVector *) srcf; + HVX_Vector out = hvx_vec_inverse_fp32(in); + + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); + } +} diff --git a/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c b/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c new file 
mode 100644 index 00000000000..15ac64697c7 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c @@ -0,0 +1,49 @@ +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" + +#include +#include +#include +#include + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "htp-ctx.h" +#include "htp-dma.h" +#include "htp-msg.h" +#include "htp-ops.h" +#include "hvx-utils.h" +#include "ops-utils.h" + +#if 0 +// Reference algo used in hvx-utils +static void fast_sigmoid_f32(const float* restrict src, float* restrict dst, const int num_elems) +{ + const float c1 = 0.03138777; + const float c2 = 0.276281267; + const float c_log2f = 1.442695022; + + int32_t store_ints[32]; + float store_floats[3][32]; + + for (int i = 0; i < num_elems; i++) + { + float v = src0[i]; + + v *= c_log2f*0.5; + int intPart = (int)v; + float x = (v - intPart); + float xx = x * x; + float v1 = c_log2f + c2 * xx; + float v2 = x + xx * c1 * x; + float v3 = (v2 + v1); + *((int*)&v3) += intPart << 24; + float v4 = v2 - v1; + float v5 = v3 - v4; + float res = v3 / v5; + + dst[i] = res; + } +} +#endif diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c new file mode 100644 index 00000000000..d59f8a303fc --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -0,0 +1,944 @@ +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" + +#ifdef HTP_DEBUG +# define FARF_HIGH 1 +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "hvx-utils.h" + +#define htp_binary_ops_preamble \ + int step_of_4 = num_elems >> 7; \ + int step_of_2 = (num_elems - step_of_4 * VLEN_FP32 * 4) >> 6; \ + int step_of_1 = (num_elems - step_of_4 * VLEN_FP32 * 4 - step_of_2 * VLEN_FP32 * 2) >> 5; \ + int remaining = num_elems - step_of_4 * VLEN_FP32 * 4 - step_of_2 * VLEN_FP32 * 2 - step_of_1 * VLEN_FP32; \ + \ + const uint8_t * restrict src0_curr = src0; \ + const uint8_t * restrict src1_curr = src1; \ + uint8_t * restrict dst_curr = dst; + +void hvx_mul_f32(const uint8_t * restrict src0, + const uint8_t * restrict src1, + uint8_t * restrict dst, + const int num_elems) { + int left_over = num_elems & (VLEN_FP32 - 1); + int num_elems_whole = num_elems - left_over; + + int unaligned_addr = 0; + int unaligned_loop = 0; + if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || + (0 == htp_is_aligned((void *) dst, VLEN))) { + FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n"); + unaligned_addr = 1; + } + + if ((1 == unaligned_addr) && (num_elems_whole != 0)) { + unaligned_loop = 1; + FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n"); + } + + if (0 == unaligned_loop) { + HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; + HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; + HVX_Vector * restrict vec_out = (HVX_Vector *) dst; + +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, *vec_in2++); + *vec_out++ = Q6_Vsf_equals_Vqf32(v); + } + } else { +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); + HVX_Vector in2 = 
*(HVX_UVector *) (src1 + i * SIZEOF_FP32); + + HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); + + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + } + } + + if (left_over > 0) { + const float * src0f = (const float *) src0 + num_elems_whole; + const float * src1f = (const float *) src1 + num_elems_whole; + float * dstf = (float *) dst + num_elems_whole; + + HVX_Vector in1 = *(HVX_UVector *) src0f; + HVX_Vector in2 = *(HVX_UVector *) src1f; + + HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2); + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + } +} + +void hvx_mul_f32_opt(const uint8_t * restrict src0, + const uint8_t * restrict src1, + uint8_t * restrict dst, + const int num_elems) { + htp_binary_ops_preamble + + for (int i = 0; i < step_of_4; i++) { + HVX_Vector v1a = *(HVX_Vector *) src0_curr; + + HVX_Vector v1b = *(HVX_Vector *) src1_curr; + + HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); + + HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b); + + HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); + + HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN); + + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b); + + *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); + + HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN); + + HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN); + + src0_curr += 4 * VLEN; + + HVX_Vector v3 = Q6_Vqf32_vmpy_VsfVsf(v3a, v3b); + + *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); + + HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN); + + *(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3); + + HVX_Vector v4 = Q6_Vqf32_vmpy_VsfVsf(v4a, v4b); + + src1_curr += 4 * VLEN; + + *(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4); + + dst_curr += 4 * VLEN; + } + for (int i = 0; i < step_of_2; i++) { + HVX_Vector v1a = *(HVX_Vector *) src0_curr; + + HVX_Vector v1b = *(HVX_Vector *) src1_curr; + + HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); + + HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b); + + HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); + + *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); + + src0_curr += 2 * VLEN; + + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b); + + src1_curr += 2 * VLEN; + + *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); + + dst_curr += 2 * VLEN; + } + for (int i = 0; i < step_of_1; i++) { + HVX_Vector va = *(HVX_Vector *) src0_curr; + + src0_curr += VLEN; + + HVX_Vector vb = *(HVX_Vector *) src1_curr; + + src1_curr += VLEN; + + HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(va, vb); + + *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v); + + dst_curr += VLEN; + } + if (remaining > 0) { + HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr); + hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v)); + } +} + +void hvx_mul_mul_f32_opt(const uint8_t * restrict src0, + const uint8_t * restrict src1, + const uint8_t * restrict src2, + uint8_t * restrict dst, + const int num_elems) { + const uint8_t * restrict src0_curr = src0; + const uint8_t * restrict src1_curr = src1; + const uint8_t * restrict src2_curr = src2; + uint8_t * restrict dst_curr = dst; + + int step_of_2 = num_elems >> 6; + int step_of_1 = (num_elems - step_of_2 * VLEN_FP32 * 2) >> 5; + int remaining = num_elems - step_of_2 * VLEN_FP32 * 2 - step_of_1 * VLEN_FP32; + + for (int i = 0; i < step_of_2; i++) { + HVX_Vector v1a = *(HVX_Vector *) src0_curr; + HVX_Vector v1b = *(HVX_Vector *) 
src1_curr; + HVX_Vector v1c = *(HVX_Vector *) src2_curr; + + HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); + + HVX_Vector v1_ = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b); + HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1_), v1c); + + HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); + + *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); + + HVX_Vector v2c = *(HVX_Vector *) (src2_curr + VLEN); + + src0_curr += 2 * VLEN; + + HVX_Vector v2_ = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b); + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v2_), v2c); + + src1_curr += 2 * VLEN; + src2_curr += 2 * VLEN; + + *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); + + dst_curr += 2 * VLEN; + } + for (int i = 0; i < step_of_1; i++) { + HVX_Vector va = *(HVX_Vector *) src0_curr; + src0_curr += VLEN; + + HVX_Vector vb = *(HVX_Vector *) src1_curr; + src1_curr += VLEN; + + HVX_Vector vc = *(HVX_Vector *) src2_curr; + src2_curr += VLEN; + + HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(va, vb); + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1), vc); + + *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v2); + dst_curr += VLEN; + } + if (remaining > 0) { + HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr); + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1), *(HVX_Vector *) src2_curr); + hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v2)); + } +} + +void hvx_add_f32(const uint8_t * restrict src0, + const uint8_t * restrict src1, + uint8_t * restrict dst, + const int num_elems) { + int left_over = num_elems & (VLEN_FP32 - 1); + int num_elems_whole = num_elems - left_over; + + int unaligned_addr = 0; + int unaligned_loop = 0; + if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || + (0 == htp_is_aligned((void *) dst, VLEN))) { + FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n"); + unaligned_addr = 1; + } + + if ((1 == unaligned_addr) && (num_elems_whole != 0)) { + unaligned_loop = 1; + FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n"); + } + + if (0 == unaligned_loop) { + HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; + HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; + HVX_Vector * restrict vec_out = (HVX_Vector *) dst; + +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, *vec_in2++); + *vec_out++ = Q6_Vsf_equals_Vqf32(v); + } + } else { +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); + HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); + + HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in1, in2); + + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + } + } + + if (left_over > 0) { + const float * src0f = (const float *) src0 + num_elems_whole; + const float * src1f = (const float *) src1 + num_elems_whole; + float * dstf = (float *) dst + num_elems_whole; + + HVX_Vector in1 = *(HVX_UVector *) src0f; + HVX_Vector in2 = *(HVX_UVector *) src1f; + + HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in1, in2); + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + } +} + +void hvx_add_f32_opt(const uint8_t * restrict src0, + const uint8_t * restrict src1, + uint8_t * restrict dst, + const int num_elems) { + htp_binary_ops_preamble + + for (int i = 0; i < step_of_4; i++) { + HVX_Vector v1a = 
*(HVX_Vector *) src0_curr; + + HVX_Vector v1b = *(HVX_Vector *) src1_curr; + + HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); + + HVX_Vector v1 = Q6_Vqf32_vadd_VsfVsf(v1a, v1b); + + HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); + + HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN); + + HVX_Vector v2 = Q6_Vqf32_vadd_VsfVsf(v2a, v2b); + + *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); + + HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN); + + HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN); + + src0_curr += 4 * VLEN; + + HVX_Vector v3 = Q6_Vqf32_vadd_VsfVsf(v3a, v3b); + + *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); + + HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN); + + *(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3); + + HVX_Vector v4 = Q6_Vqf32_vadd_VsfVsf(v4a, v4b); + + src1_curr += 4 * VLEN; + + *(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4); + + dst_curr += 4 * VLEN; + } + for (int i = 0; i < step_of_2; i++) { + HVX_Vector v1a = *(HVX_Vector *) src0_curr; + + HVX_Vector v1b = *(HVX_Vector *) src1_curr; + + HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); + + HVX_Vector v1 = Q6_Vqf32_vadd_VsfVsf(v1a, v1b); + + HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); + + *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); + + src0_curr += 2 * VLEN; + + HVX_Vector v2 = Q6_Vqf32_vadd_VsfVsf(v2a, v2b); + + src1_curr += 2 * VLEN; + + *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); + + dst_curr += 2 * VLEN; + } + for (int i = 0; i < step_of_1; i++) { + HVX_Vector va = *(HVX_Vector *) src0_curr; + + src0_curr += VLEN; + + HVX_Vector vb = *(HVX_Vector *) src1_curr; + + src1_curr += VLEN; + + HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(va, vb); + + *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v); + + dst_curr += VLEN; + } + if (remaining > 0) { + HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr); + hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v)); + } +} + +void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) { + size_t left_over = num_elems & (VLEN_FP32 - 1); + size_t num_elems_whole = num_elems - left_over; + + int unaligned_addr = 0; + int unaligned_loop = 0; + if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { + FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + unaligned_addr = 1; + } + + if ((1 == unaligned_addr) && (num_elems_whole != 0)) { + unaligned_loop = 1; + FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + } + + HVX_Vector val_vec = hvx_vec_splat_fp32(val); + + if (0 == unaligned_loop) { + HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; + HVX_Vector * restrict vec_out = (HVX_Vector *) dst; + +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, val_vec); + *vec_out++ = Q6_Vsf_equals_Vqf32(v); + } + } else { +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); + + HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + } + } + + if (left_over > 0) { + const float * srcf = (const float *) src + num_elems_whole; + float * dstf = (float *) dst + num_elems_whole; + + HVX_Vector in = *(HVX_UVector *) 
srcf; + + HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + } +} + +void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) { + size_t left_over = num_elems & (VLEN_FP32 - 1); + size_t num_elems_whole = num_elems - left_over; + + int unaligned_addr = 0; + int unaligned_loop = 0; + if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { + FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + unaligned_addr = 1; + } + + if ((1 == unaligned_addr) && (num_elems_whole != 0)) { + unaligned_loop = 1; + FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + } + + HVX_Vector val_vec = hvx_vec_splat_fp32(val); + + if (0 == unaligned_loop) { + HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; + HVX_Vector * restrict vec_out = (HVX_Vector *) dst; + +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, val_vec); + *vec_out++ = Q6_Vsf_equals_Vqf32(v); + } + } else { +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); + + HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); + + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + } + } + + if (left_over > 0) { + const float * srcf = (const float *) src + num_elems_whole; + float * dstf = (float *) dst + num_elems_whole; + + HVX_Vector in = *(HVX_UVector *) srcf; + + HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec); + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + } +} + +void hvx_sub_f32(const uint8_t * restrict src0, + const uint8_t * restrict src1, + uint8_t * restrict dst, + const int num_elems) { + size_t left_over = num_elems & (VLEN_FP32 - 1); + size_t num_elems_whole = num_elems - left_over; + + int unaligned_addr = 0; + int unaligned_loop = 0; + if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) || + (0 == htp_is_aligned((void *) dst, VLEN))) { + FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n"); + unaligned_addr = 1; + } + + if ((1 == unaligned_addr) && (num_elems_whole != 0)) { + unaligned_loop = 1; + FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n"); + } + + if (0 == unaligned_loop) { + HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0; + HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; + HVX_Vector * restrict vec_out = (HVX_Vector *) dst; + +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, *vec_in2++); + *vec_out++ = Q6_Vsf_equals_Vqf32(v); + } + } else { +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); + HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); + + HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in1, in2); + + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + } + } + + if (left_over > 0) { + const float * src0f = (const float *) src0 + num_elems_whole; + const float * src1f = (const float *) src1 + num_elems_whole; + float * dstf = (float *) dst + num_elems_whole; + + HVX_Vector in1 = *(HVX_UVector *) src0f; + HVX_Vector in2 = *(HVX_UVector *) src1f; + + HVX_Vector out = 
Q6_Vqf32_vsub_VsfVsf(in1, in2); + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + } +} + +void hvx_sub_f32_opt(const uint8_t * restrict src0, + const uint8_t * restrict src1, + uint8_t * restrict dst, + const int num_elems) { + htp_binary_ops_preamble + + for (int i = 0; i < step_of_4; i++) { + HVX_Vector v1a = *(HVX_Vector *) src0_curr; + + HVX_Vector v1b = *(HVX_Vector *) src1_curr; + + HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); + + HVX_Vector v1 = Q6_Vqf32_vsub_VsfVsf(v1a, v1b); + + HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); + + HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN); + + HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v2a, v2b); + + *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); + + HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN); + + HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN); + + src0_curr += 4 * VLEN; + + HVX_Vector v3 = Q6_Vqf32_vsub_VsfVsf(v3a, v3b); + + *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); + + HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN); + + *(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3); + + HVX_Vector v4 = Q6_Vqf32_vsub_VsfVsf(v4a, v4b); + + src1_curr += 4 * VLEN; + + *(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4); + + dst_curr += 4 * VLEN; + } + for (int i = 0; i < step_of_2; i++) { + HVX_Vector v1a = *(HVX_Vector *) src0_curr; + + HVX_Vector v1b = *(HVX_Vector *) src1_curr; + + HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN); + + HVX_Vector v1 = Q6_Vqf32_vsub_VsfVsf(v1a, v1b); + + HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN); + + *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1); + + src0_curr += 2 * VLEN; + + HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v2a, v2b); + + src1_curr += 2 * VLEN; + + *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2); + + dst_curr += 2 * VLEN; + } + for (int i = 0; i < step_of_1; i++) { + HVX_Vector va = *(HVX_Vector *) src0_curr; + + src0_curr += VLEN; + + HVX_Vector vb = *(HVX_Vector *) src1_curr; + + src1_curr += VLEN; + + HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(va, vb); + + *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v); + + dst_curr += VLEN; + } + if (remaining > 0) { + HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr); + hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v)); + } +} + +void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) { + size_t left_over = num_elems & (VLEN_FP32 - 1); + size_t num_elems_whole = num_elems - left_over; + + int unaligned_addr = 0; + int unaligned_loop = 0; + if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { + FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + unaligned_addr = 1; + } + + if ((1 == unaligned_addr) && (num_elems_whole != 0)) { + unaligned_loop = 1; + FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); + } + + HVX_Vector val_vec = hvx_vec_splat_fp32(val); + + if (0 == unaligned_loop) { + HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; + HVX_Vector * restrict vec_out = (HVX_Vector *) dst; + +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, val_vec); + *vec_out++ = Q6_Vsf_equals_Vqf32(v); + } + } else { +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector in = 
*(HVX_UVector *) (src + i * SIZEOF_FP32); + + HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in, val_vec); + + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + } + } + + if (left_over > 0) { + const float * srcf = (const float *) src + num_elems_whole; + float * dstf = (float *) dst + num_elems_whole; + + HVX_Vector in = *(HVX_UVector *) srcf; + + HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in, val_vec); + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + } +} + +float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) { + int left_over = num_elems & (VLEN_FP32 - 1); + int num_elems_whole = num_elems - left_over; + + if (0 == htp_is_aligned((void *) src, VLEN)) { + FARF(HIGH, "hvx_sum_of_squares_f32: unaligned address in hvx op, possibly slower execution\n"); + } + + assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); + + HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; + + HVX_Vector sum_vec_acc = Q6_V_vsplat_R(0x00000000); + HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000); + +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1, *vec_in1); + sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, v); + vec_in1++; + } + + if (left_over > 0) { + const float * srcf = (const float *) src + num_elems_whole; + + HVX_Vector vec_left = *(HVX_UVector *) srcf; + + HVX_Vector vec_left_sq = Q6_Vqf32_vmpy_VsfVsf(vec_left, vec_left); + HVX_Vector vec_tmp = Q6_V_valign_VVR(vec_left_sq, zero_vec, left_over * SIZEOF_FP32); + + sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, vec_tmp); + } + + HVX_Vector v = hvx_vec_qf32_reduce_sum(sum_vec_acc); + return hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(v)); +} + +float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { + int left_over = num_elems & (VLEN_FP32 - 1); + int num_elems_whole = num_elems - left_over; + + int unaligned_addr = 0; + int unaligned_loop = 0; + if (0 == htp_is_aligned((void *) src, VLEN)) { + FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n"); + unaligned_addr = 1; + } + + if ((1 == unaligned_addr) && (num_elems_whole != 0)) { + unaligned_loop = 1; + FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n"); + } + + HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000); + HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000); + + if (0 == unaligned_loop) { + HVX_Vector * vec_in = (HVX_Vector *) src; + +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, *vec_in++); + sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), *vec_in++); + } + } else { +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); + + sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), in); + } + } + + if (left_over > 0) { + const float * srcf = (const float *) src + num_elems_whole; + + HVX_Vector vec_left = *(HVX_UVector *) srcf; + HVX_Vector vec_tmp = Q6_V_valign_VVR(vec_left, zero_vec, left_over * SIZEOF_FP32); + // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, vec_tmp); + sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), vec_tmp); + } + + HVX_Vector v = hvx_vec_qf32_reduce_sum(sum_vec); + return hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(v)); +} + +void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, const float scale) { + int left_over 
= num_elems & (VLEN_FP32 - 1); + int num_elems_whole = num_elems - left_over; + + int unaligned_addr = 0; + int unaligned_loop = 0; + if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { + FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n"); + unaligned_addr = 1; + } + + if ((1 == unaligned_addr) && (num_elems_whole != 0)) { + unaligned_loop = 1; + FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n"); + } + + HVX_Vector scale_vec = hvx_vec_splat_fp32(scale); + + if (0 == unaligned_loop) { + HVX_Vector * vec_in1 = (HVX_Vector *) src; + HVX_Vector * vec_out = (HVX_Vector *) dst; + +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, scale_vec); + *vec_out++ = Q6_Vsf_equals_Vqf32(v); + } + } else { +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); + + HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, scale_vec); + + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + } + } + + if (left_over > 0) { + const float * srcf = (const float *) src + num_elems_whole; + float * dstf = (float *) dst + num_elems_whole; + + HVX_Vector in = *(HVX_UVector *) srcf; + + HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, scale_vec); + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + } +} + +float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { + int left_over = num_elems & (VLEN_FP32 - 1); + int num_elems_whole = num_elems - left_over; + + int unaligned_addr = 0; + int unaligned_loop = 0; + if (0 == htp_is_aligned((void *) src, VLEN)) { + FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n"); + unaligned_addr = 1; + } + + if ((1 == unaligned_addr) && (num_elems_whole != 0)) { + unaligned_loop = 1; + FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n"); + } + + HVX_Vector vec_max = hvx_vec_splat_fp32(((const float *) src)[0]); + HVX_Vector vec_first = hvx_vec_splat_fp32(((const float *) src)[0]); + + if (0 == unaligned_loop) { + HVX_Vector * restrict vec_in = (HVX_Vector *) src; + +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, *vec_in++); + } + } else { +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); + + vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, in); + } + } + + if (left_over > 0) { + const float * srcf = (const float *) src + num_elems_whole; + + HVX_Vector in = *(HVX_UVector *) srcf; + + HVX_Vector temp = Q6_V_valign_VVR(in, vec_first, left_over * SIZEOF_FP32); + vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, temp); + } + + HVX_Vector v = hvx_vec_reduce_max_fp32(vec_max); + return hvx_vec_get_fp32(v); +} + +void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) { + size_t left_over = num_elems & (VLEN_FP32 - 1); + size_t num_elems_whole = num_elems - left_over; + + if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { + FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + } + + assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); + + const float * src_f = (const float *) src; + + HVX_Vector vec_min = Q6_V_vsplat_R(val); 
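+    // NOTE: unlike the other *_scalar_f32 helpers, the limit is splatted with
+    // Q6_V_vsplat_R(), which takes an int32 (so `val` is converted, not bit-cast),
+    // and the loop below carries vec_min across iterations rather than taking a
+    // plain per-element min against the input.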
+ + HVX_Vector * restrict vec_in = (HVX_Vector *) src; + HVX_Vector * restrict vec_out = (HVX_Vector *) dst; + +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++); + *vec_out++ = Q6_Vsf_equals_Vqf32(vec_min); + } + + if (left_over > 0) { + const float * srcf = (const float *) src + num_elems_whole; + float * dstf = (float *) dst + num_elems_whole; + + HVX_Vector in = *(HVX_UVector *) srcf; + + vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, in); + + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(vec_min)); + } +} + +void hvx_clamp_scalar_f32(const uint8_t * restrict src, + const float limit_left, + const float limit_right, + uint8_t * restrict dst, + const int num_elems) { + size_t left_over = num_elems & (VLEN_FP32 - 1); + size_t num_elems_whole = num_elems - left_over; + + if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) { + FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n"); + } + + assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole)); + + HVX_Vector * restrict vec_in = (HVX_Vector *) src; + HVX_Vector * restrict vec_out = (HVX_Vector *) dst; + + HVX_Vector range_left = hvx_vec_splat_fp32(limit_left); + HVX_Vector range_right = hvx_vec_splat_fp32(limit_right); + +#pragma unroll(4) + for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { + HVX_Vector in_vec = *vec_in++; + HVX_Vector temp_v = in_vec; + + HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right); + HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec); + + in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v); + in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, temp_v); + + *vec_out++ = Q6_Vsf_equals_Vqf32(in_vec); + } + + if (left_over > 0) { + const float * srcf = (const float *) src + num_elems_whole; + float * dstf = (float *) dst + num_elems_whole; + + HVX_Vector in = *(HVX_UVector *) srcf; + + HVX_Vector temp_v = in; + + HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in, range_right); + HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in); + + in = Q6_V_vmux_QVV(pred_cap_right, range_right, temp_v); + in = Q6_V_vmux_QVV(pred_cap_left, range_left, temp_v); + + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(in)); + } +} diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h new file mode 100644 index 00000000000..9a88b46306e --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -0,0 +1,981 @@ +#ifndef HVX_UTILS_H +#define HVX_UTILS_H + +#include "ops-utils.h" + +#include +#include + +#define SIZEOF_FP32 (4) +#define SIZEOF_FP16 (2) +#define VLEN (128) +#define VLEN_FP32 (VLEN / SIZEOF_FP32) +#define VLEN_FP16 (VLEN / SIZEOF_FP16) + +static inline HVX_Vector hvx_vec_splat_fp32(float i) { + union { + float f; + int32_t i; + } fp32 = { .f = i }; + + return Q6_V_vsplat_R(fp32.i); +} + +static inline void hvx_vec_store_u(void * addr, uint32_t n, HVX_Vector v) { + // Rotate as needed. 
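+    // Store only the first n bytes of v to a possibly unaligned address: rotate v
+    // so its bytes line up with the destination offset, then use one or two
+    // predicated stores that touch just the bytes in [addr, addr + n).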
+ v = Q6_V_vlalign_VVR(v, v, (size_t) addr); + + uint32_t left_off = (size_t) addr & 127; + uint32_t right_off = left_off + n; + + HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) addr); + HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off); + + if (right_off > 128) { + Q6_vmem_QRIV(qr, (HVX_Vector *) addr + 1, v); + // all 1's + qr = Q6_Q_vcmp_eq_VbVb(v, v); + } + + ql_not = Q6_Q_or_QQn(ql_not, qr); + Q6_vmem_QnRIV(ql_not, (HVX_Vector *) addr, v); +} + +static inline void hvx_vec_store_a(void * ptr, size_t n, HVX_Vector v) { + assert((unsigned long) ptr % 128 == 0); + + HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) ptr); + HVX_VectorPred qr = Q6_Q_vsetq2_R(n); + ql_not = Q6_Q_or_QQn(ql_not, qr); + Q6_vmem_QnRIV(ql_not, (HVX_Vector *) ptr, v); +} + +static inline HVX_Vector hvx_vec_repl4(HVX_Vector v) { + // vdelta control to replicate first 4 bytes across all elements + static const uint8_t __attribute__((aligned(128))) repl[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + }; + + HVX_Vector ctrl = *(HVX_Vector *) repl; + return Q6_V_vdelta_VV(v, ctrl); +} + +// copy n fp16 elements : source and destination are aligned to HVX Vector (128) +static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + HVX_Vector * restrict vdst = (HVX_Vector *) dst; + HVX_Vector * restrict vsrc = (HVX_Vector *) src; + + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + + uint32_t nvec = n / 64; + uint32_t nloe = n % 64; + + uint32_t i = 0; + +#pragma unroll(4) + for (; i < nvec; i++) { + HVX_Vector v = vsrc[i]; + vdst[i] = v; + } + + if (nloe) { + HVX_Vector v = vsrc[i]; + hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v); + } +} + +// copy n fp16 elements : source is aligned, destination is potentially unaligned +static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + HVX_UVector * restrict vdst = (HVX_UVector *) dst; + HVX_Vector * restrict vsrc = (HVX_Vector *) src; + + assert((unsigned long) src % 128 == 0); + + uint32_t nvec = n / 64; + uint32_t nloe = n % 64; + + uint32_t i = 0; + +#pragma unroll(4) + for (; i < nvec; i++) { + HVX_Vector v = vsrc[i]; + vdst[i] = v; + } + + if (nloe) { + HVX_Vector v = vsrc[i]; + hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v); + } +} + +// copy n fp16 elements : source is aligned, destination is potentially unaligned +static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + HVX_Vector * restrict vdst = (HVX_Vector *) dst; + HVX_UVector * restrict vsrc = (HVX_UVector *) src; + + assert((unsigned long) dst % 128 == 0); + + uint32_t nvec = n / 64; + uint32_t nloe = n % 64; + + uint32_t i = 0; + +#pragma unroll(4) + for (; i < nvec; i++) { + HVX_Vector 
v = vsrc[i]; + vdst[i] = v; + } + + if (nloe) { + HVX_Vector v = vsrc[i]; + hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v); + } +} + +// copy n fp32 elements : source and destination are aligned to HVX Vector (128) +static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + HVX_Vector * restrict vdst = (HVX_Vector *) dst; + HVX_Vector * restrict vsrc = (HVX_Vector *) src; + + assert((unsigned long) dst % 128 == 0); + assert((unsigned long) src % 128 == 0); + + uint32_t nvec = n / 32; + uint32_t nloe = n % 32; + + uint32_t i = 0; + +#pragma unroll(4) + for (; i < nvec; i++) { + HVX_Vector v = vsrc[i]; + vdst[i] = v; + } + + if (nloe) { + HVX_Vector v = vsrc[i]; + hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v); + } +} + +// copy n fp32 elements : source is aligned, destination is unaligned +static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + HVX_UVector * restrict vdst = (HVX_UVector *) dst; + HVX_Vector * restrict vsrc = (HVX_Vector *) src; + + assert((unsigned long) src % 128 == 0); + + uint32_t nvec = n / 32; + uint32_t nloe = n % 32; + + uint32_t i = 0; + +#pragma unroll(4) + for (; i < nvec; i++) { + HVX_Vector v = vsrc[i]; + vdst[i] = v; + } + + if (nloe) { + HVX_Vector v = vsrc[i]; + hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v); + } +} + +// copy n fp32 elements : source is unaligned, destination is aligned +static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { + HVX_Vector * restrict vdst = (HVX_Vector *) dst; + HVX_UVector * restrict vsrc = (HVX_UVector *) src; + + assert((unsigned long) dst % 128 == 0); + + uint32_t nvec = n / 32; + uint32_t nloe = n % 32; + + uint32_t i = 0; + +#pragma unroll(4) + for (; i < nvec; i++) { + HVX_Vector v = vsrc[i]; + vdst[i] = v; + } + + if (nloe) { + HVX_Vector v = vsrc[i]; + hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v); + } +} + +// bcast 1 fp32 element from source to n fp32 elements in destination : destination is aligned +static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t n) { + HVX_Vector * restrict vdst = (HVX_Vector *) dst; + + HVX_Vector velem = hvx_vec_splat_fp32(elem); + + assert((unsigned long) dst % 128 == 0); + + uint32_t nvec = n / 32; + uint32_t nloe = n % 32; + + uint32_t i = 0; + +#pragma unroll(4) + for (; i < nvec; i++) { + vdst[i] = velem; + } + + if (nloe) { + hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), velem); + } +} + +static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { + uint32_t left_off = (size_t) addr & (chunk_size - 1); + uint32_t right_off = left_off + n; + return right_off <= chunk_size; +} + +static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) { + union { + HVX_Vector v; + __fp16 d[64]; + } u = { .v = v }; + + const uint32_t n0 = n / 16; + const uint32_t n1 = n % 16; + int i = 0; + for (; i < n0; i++) { + htp_dump_fp16_line(pref, u.d + (16 * i), 16); + } + if (n1) { + htp_dump_fp16_line(pref, u.d + (16 * i), n1); + } +} + +static void hvx_vec_dump_fp16(char * pref, HVX_Vector v) { + hvx_vec_dump_fp16_n(pref, v, 64); +} + +static void hvx_vec_dump_fp32_n(char * pref, HVX_Vector v, uint32_t n) { + union { + HVX_Vector v; + float d[32]; + } u = { .v = v }; + + const uint32_t n0 = n / 16; + const uint32_t n1 = n % 16; + int i = 0; + for (; i < n0; i++) { + htp_dump_fp32_line(pref, u.d + (16 * i), 16); 
+ } + if (n1) { + htp_dump_fp32_line(pref, u.d + (16 * i), n1); + } +} + +static void hvx_vec_dump_fp32_hmt(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + float d[32]; + } u = { .v = v }; + + FARF(HIGH, "%s: %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f\n", pref, u.d[0], u.d[1], + u.d[2], u.d[3], u.d[12], u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]); +} + +static void hvx_vec_dump_fp32(char * pref, HVX_Vector v) { + hvx_vec_dump_fp32_n(pref, v, 32); +} + +static void hvx_vec_dump_int32(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + int32_t d[32]; + } u = { .v = v }; + + for (int i = 0; i < 32 / 16; i++) { + htp_dump_int32_line(pref, u.d + (16 * i), 16); + } +} + +static void hvx_vec_dump_int32_hmt(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + int32_t d[32]; + } u = { .v = v }; + + FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... %d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[12], + u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]); +} + +static void hvx_vec_dump_int8_hmt(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + int8_t d[128]; + } u = { .v = v }; + + FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... %d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[60], + u.d[61], u.d[62], u.d[63], u.d[124], u.d[125], u.d[126], u.d[127]); +} + +static void hvx_vec_dump_int8(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + int8_t d[128]; + } u = { .v = v }; + + for (int i = 0; i < 128 / 16; i++) { + htp_dump_int8_line(pref, u.d + (16 * i), 16); + } +} + +static void hvx_vec_dump_uint8(char * pref, HVX_Vector v) { + union { + HVX_Vector v; + uint8_t d[128]; + } u = { .v = v }; + + for (int i = 0; i < 128 / 16; i++) { + htp_dump_uint8_line(pref, u.d + (16 * i), 16); + } +} + +static bool hvx_vec_eq(HVX_Vector v0, HVX_Vector v1, size_t n) { + typedef union { + HVX_Vector v; + int8_t d[128]; + } U; + + U u0 = { .v = v0 }; + U u1 = { .v = v1 }; + + for (int i = 0; i < n; i++) { + if (u0.d[i] != u1.d[i]) { + return false; + } + } + + return true; +} + +static inline float hvx_vec_get_fp32(HVX_Vector v) { + float __attribute__((aligned(128))) x; + hvx_vec_store_a(&x, 4, v); + return x; +} + +static inline HVX_Vector hvx_vec_int32_reduce_sum_n(HVX_Vector in, unsigned int n) { + unsigned int total = n * 4; // total vec nbytes + unsigned int width = 4; // int32 + + HVX_Vector sum = in, sum_t; + while (width < total) { + sum_t = Q6_V_vror_VR(sum, width); // rotate right + sum = Q6_Vw_vadd_VwVw(sum_t, sum); // elementwise sum + width = width << 1; + } + return sum; +} + +static inline HVX_Vector hvx_vec_int32_reduce_sum(HVX_Vector in) { + return hvx_vec_int32_reduce_sum_n(in, 32); +} + +static inline HVX_Vector hvx_vec_qf32_reduce_sum_n(HVX_Vector in, unsigned int n) { + unsigned int total = n * 4; // total vec nbytes + unsigned int width = 4; // fp32 nbytes + + HVX_Vector sum = in, sum_t; + while (width < total) { + sum_t = Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum), width); // rotate right + sum = Q6_Vqf32_vadd_Vqf32Vsf(sum, sum_t); // elementwise sum + width = width << 1; + } + return sum; +} + +static inline HVX_Vector hvx_vec_qf32_reduce_sum(HVX_Vector in) { + return hvx_vec_qf32_reduce_sum_n(in, 32); +} + +static inline HVX_Vector hvx_vec_reduce_max_fp16(HVX_Vector in) { + unsigned total = 128; // total vec nbytes + unsigned width = 2; // fp16 nbytes + + HVX_Vector _max = in, _max_t; + while (width < total) { + _max_t = Q6_V_vror_VR(_max, width); // rotate right + _max = Q6_Vhf_vmax_VhfVhf(_max_t, 
_max); // elementwise max + width = width << 1; + } + + return _max; +} + +static inline HVX_Vector hvx_vec_reduce_max2_fp16(HVX_Vector in, HVX_Vector _max) { + unsigned total = 128; // total vec nbytes + unsigned width = 2; // fp32 nbytes + + HVX_Vector _max_t; + + _max = Q6_Vhf_vmax_VhfVhf(in, _max); + while (width < total) { + _max_t = Q6_V_vror_VR(_max, width); // rotate right + _max = Q6_Vhf_vmax_VhfVhf(_max_t, _max); // elementwise max + width = width << 1; + } + + return _max; +} + +static inline HVX_Vector hvx_vec_reduce_max_fp32(HVX_Vector in) { + unsigned total = 128; // total vec nbytes + unsigned width = 4; // fp32 nbytes + + HVX_Vector _max = in, _max_t; + while (width < total) { + _max_t = Q6_V_vror_VR(_max, width); // rotate right + _max = Q6_Vsf_vmax_VsfVsf(_max_t, _max); // elementwise max + width = width << 1; + } + + return _max; +} + +static inline HVX_Vector hvx_vec_reduce_max2_fp32(HVX_Vector in, HVX_Vector _max) { + unsigned total = 128; // total vec nbytes + unsigned width = 4; // fp32 nbytes + + HVX_Vector _max_t; + + _max = Q6_Vsf_vmax_VsfVsf(in, _max); + while (width < total) { + _max_t = Q6_V_vror_VR(_max, width); // rotate right + _max = Q6_Vsf_vmax_VsfVsf(_max_t, _max); // elementwise max + width = width << 1; + } + + return _max; +} + +static inline HVX_Vector hvx_vec_abs_fp16(HVX_Vector v) { + // abs by clearing the fp16 sign bit + HVX_Vector mask = Q6_Vh_vsplat_R(0x7fff); + return Q6_V_vand_VV(v, mask); +} + +static inline HVX_Vector hvx_vec_neg_fp16(HVX_Vector v) { + // neg by setting the fp16 sign bit + HVX_Vector mask = Q6_Vh_vsplat_R(0x8000); + return Q6_V_vor_VV(v, mask); +} + +static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) { + // abs by clearing the fp32 sign bit + HVX_Vector mask = Q6_V_vsplat_R(0x7fffffff); + return Q6_V_vand_VV(v, mask); +} + +static inline HVX_Vector hvx_vec_neg_fp32(HVX_Vector v) { +#if __HTP_ARCH__ > 75 + return Q6_Vsf_vfneg_Vsf(v); +#else + // neg by setting the fp32 sign bit + HVX_Vector mask = Q6_V_vsplat_R(0x80000000); + return Q6_V_vor_VV(v, mask); +#endif // __HTP_ARCH__ > 75 +} + +// ==================================================== +// FUNCTION: 1/(x+1) y(0) = 1, y(0.5) = 0.6667, y(1) = 0.5 +// Order:3; continuity: True; Ends forced: True +// Mode: unsigned; Result fractional bits: 14 +// Peak Error: 1.1295e-04 Rms Error: 2.8410e-05 Mean Error: 1.1370e-05 +// 32769 -32706 31252 -10589 +// 32590 -30635 22793 -4493 +// 32066 -27505 16481 -2348 +// 31205 -24054 11849 -1306 + +static inline HVX_Vector hvx_vec_recip_xp1_O3_unsigned(HVX_Vector vx) { + // input is 0..0xffff representing 0.0 .. 1.0 + HVX_Vector p; + p = Q6_Vh_vlut4_VuhPh(vx, 0xFAE6F6D4EE73D6A3ull); + p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x2E49406159097A14ull); + p = Q6_Vh_vmps_VhVhVuhPuh_sat(p, vx, 0x5DF66B7177AB7FC2ull); + p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x79E57D427F4E8001ull); + return p; // signed result, 14 fractional bits +} + +// Find reciprocal of fp16. +// (1) first, convert to fp32, multiplying by 1.0; this is done to +// handle denormals. Ignoring sign and zero, result should be at +// least 5.9604645e-08 (32-bit code 0x33800000) and at most 131008 (0x47ffe000) +// (exponent in range [103,143]) +// (2) extract the mantissa into 16-bit unsigned; find reciprocal using a fitted poly +// (3) put this, along with '253-exp' (exp from (1)) together to make an qf32 +// (4) convert that to fp16 +// (5) put sign back in. Also, if the original value (w/o sign) was <0x81, replace +// the result with the max value. 
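+// The reciprocal of the mantissa in step (2) comes from the fitted polynomial
+// above (quoted peak error ~1.1e-04), so the result is a close approximation
+// rather than an IEEE-exact 1/x.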
+static inline HVX_Vector hvx_vec_inverse_fp16(HVX_Vector vals) { + HVX_Vector em_mask = Q6_Vh_vsplat_R(0x7FFF); + HVX_Vector avals = Q6_V_vand_VV(vals, em_mask); + HVX_VectorPred is_neg = Q6_Q_vcmp_gt_VhVh(avals, vals); + // is too small to 1/x ? for 'standard' fp16, this would be 0x101 + HVX_VectorPred is_small = Q6_Q_vcmp_gt_VhVh(Q6_Vh_vsplat_R(0x101), avals); + + HVX_VectorPair to_qf32 = Q6_Wqf32_vmpy_VhfVhf(avals, Q6_Vh_vsplat_R(0x3C00)); // *1.0 + HVX_Vector to_f32_0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(to_qf32)); + HVX_Vector to_f32_1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(to_qf32)); + + // bits 22..13 contain the mantissa now (w/o hidden bit); move to bit 14..5 of a 16-bit vector + HVX_Vector mant_u16 = Q6_Vh_vshuffo_VhVh(Q6_Vw_vasl_VwR(to_f32_1, 9), Q6_Vw_vasl_VwR(to_f32_0, 9)); + // likewise extract the upper 16 from each, containing the exponents in range 103..142 + HVX_Vector exp_u16 = Q6_Vh_vshuffo_VhVh(to_f32_1, to_f32_0); + //Get exponent in IEEE 32-bit representation + exp_u16 = Q6_Vuh_vlsr_VuhR(exp_u16, 7); + + // so, mant_u16 contains an unbiased mantissa in upper 10 bits of each u16 lane + // We can consider it to be x-1.0, with 16 fractional bits, where 'x' is in range [1.0,2.0) + // Use poly to transform to 1/x, with 14 fractional bits + // + HVX_Vector rm = hvx_vec_recip_xp1_O3_unsigned(mant_u16); + + HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm); //count leading zeros + + // Get mantissa for 16-bit represenation + HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF)); + + //Compute Reciprocal Exponent + HVX_Vector exp_recip = + Q6_Vh_vsub_VhVh(Q6_Vh_vsub_VhVh(Q6_Vh_vsplat_R(254), exp_u16), Q6_Vh_vsub_VhVh(vcl0, Q6_Vh_vsplat_R(1))); + //Convert it for 16-bit representation + exp_recip = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vsub_VhVh(exp_recip, Q6_Vh_vsplat_R(127)), Q6_Vh_vsplat_R(15)); + exp_recip = Q6_Vh_vasl_VhR(exp_recip, 10); + + //Merge exponent and mantissa for reciprocal + HVX_Vector recip = Q6_V_vor_VV(exp_recip, mant_recip); + // map 'small' inputs to standard largest value 0x7bff + recip = Q6_V_vmux_QVV(is_small, Q6_Vh_vsplat_R(0x7bff), recip); + // add sign back + recip = Q6_V_vandor_VQR(recip, is_neg, 0x80008000); + return recip; +} + +#define IEEE_VSF_EXPLEN (8) +#define IEEE_VSF_EXPBIAS (127) +#define IEEE_VSF_EXPMASK (0xFF) +#define IEEE_VSF_MANTLEN (23) +#define IEEE_VSF_MANTMASK (0x7FFFFF) +#define IEEE_VSF_MIMPMASK (0x800000) + +static inline HVX_Vector hvx_vec_truncate_fp32(HVX_Vector in_vec) { + HVX_Vector mask_mant_v = Q6_V_vsplat_R(IEEE_VSF_MANTMASK); + HVX_Vector mask_impl_v = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK); + HVX_Vector const_zero_v = Q6_V_vzero(); + + HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec); + + HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN; + expval_v &= IEEE_VSF_EXPMASK; + expval_v -= IEEE_VSF_EXPBIAS; + + // negative exp == fractional value + HVX_VectorPred q_negexp = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v); + + HVX_Vector rshift_v = IEEE_VSF_MANTLEN - expval_v; // fractional bits - exp shift + + HVX_Vector mant_v = in_vec & mask_mant_v; // obtain mantissa + HVX_Vector vout = Q6_Vw_vadd_VwVw(mant_v, mask_impl_v); // add implicit 1.0 + + vout = Q6_Vw_vasr_VwVw(vout, rshift_v); // shift to obtain truncated integer + vout = Q6_V_vmux_QVV(q_negexp, const_zero_v, vout); // expval<0 -> 0 + + HVX_Vector neg_vout = -vout; + + vout = Q6_V_vmux_QVV(q_negative, neg_vout, vout); // handle negatives + + return (vout); +} + +static inline HVX_Vector hvx_vec_floor_fp32(HVX_Vector in_vec) { + 
HVX_Vector mask_mant_v = Q6_V_vsplat_R(IEEE_VSF_MANTMASK); + HVX_Vector mask_impl_v = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK); + HVX_Vector const_mnlen_v = Q6_V_vsplat_R(IEEE_VSF_MANTLEN); + HVX_Vector const_zero_v = Q6_V_vzero(); + HVX_Vector const_negone_v = Q6_V_vsplat_R(0xbf800000); // -1 IEEE vsf + + HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec); + + HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN; + expval_v &= IEEE_VSF_EXPMASK; + expval_v -= IEEE_VSF_EXPBIAS; + + HVX_VectorPred q_negexp = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v); + HVX_VectorPred q_expltmn = Q6_Q_vcmp_gt_VwVw(const_mnlen_v, expval_v); + HVX_VectorPred q_negexp_pos = Q6_Q_vcmp_gtand_QVwVw(q_negexp, in_vec, const_zero_v); + HVX_VectorPred q_negexp_neg = Q6_Q_vcmp_gtand_QVwVw(q_negexp, const_zero_v, in_vec); + + // if expval < 0 (q_negexp) // <0, floor is 0 + // if vin > 0 + // floor = 0 + // if vin < 0 + // floor = -1 + // if expval < mant_len (q_expltmn) // >0, but fraction may exist + // get sign (q_negative) + // mask >> expval // fraction bits to mask off + // vout = ~(mask) // apply mask to remove fraction + // if (qneg) // negative floor is one less (more, sign bit for neg) + // vout += ((impl_mask) >> expval) + // if (mask && vin) + // vout = vin + // else // already an integer + // ; // no change + + // compute floor + mask_mant_v >>= expval_v; + HVX_Vector neg_addin_v = mask_impl_v >> expval_v; + HVX_Vector vout_neg_addin = Q6_Vw_vadd_VwVw(in_vec, neg_addin_v); + HVX_Vector vout = Q6_V_vmux_QVV(q_negative, vout_neg_addin, in_vec); + + HVX_Vector mask_chk_v = Q6_V_vand_VV(in_vec, mask_mant_v); // chk if bits set + HVX_VectorPred q_integral = Q6_Q_vcmp_eq_VwVw(const_zero_v, mask_chk_v); + + HVX_Vector not_mask_v = Q6_V_vnot_V(mask_mant_v); // frac bits to clear + HVX_Vector vfrfloor_v = Q6_V_vand_VV(vout, not_mask_v); // clear frac bits + + vout = in_vec; + vout = Q6_V_vmux_QVV(q_expltmn, vfrfloor_v, vout); // expval0 -> 0 + vout = Q6_V_vmux_QVV(q_negexp_neg, const_negone_v, vout); // expval<0 x<0 -> -1 + + return vout; +} + +static inline HVX_Vector hvx_vec_i16_from_hf_rnd_sat(HVX_Vector vin) { + // This looks complicated. + // Ideally should just be Q6_Vh_equals_Vhf(vin) + // but that instruction does not do proper rounding. + + // convert to qf32, multiplying by 1.0 in the process. + HVX_VectorPair v32 = Q6_Wqf32_vmpy_VhfVhf(vin, Q6_Vh_vsplat_R(0x3C00)); + + // 'in-range' values are +/32752. + // add 192K to it, convert to sf + HVX_Vector v192K = Q6_V_vsplat_R(0x48400000); + HVX_Vector vsf_0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_lo_W(v32), v192K)); + HVX_Vector vsf_1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_hi_W(v32), v192K)); + + // for in-range cases, result is {163858... 229360} so the exponent is always 144. + // if we extract bits 21..0 as a signed quantity, and round 6 bits off, that will be the answer. + // Start by <<10 to get the final 'sign' bit in bit 15... 
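+    // (Q6_Vh_vround_VwVw_sat below keeps the upper halfword of each word with
+    // round-to-nearest on the discarded bits, i.e. the "round 6 bits off" step.)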
+ vsf_0 = Q6_Vw_vasl_VwR(vsf_0, 10); + vsf_1 = Q6_Vw_vasl_VwR(vsf_1, 10); + + // now round down to 16 + return Q6_Vh_vround_VwVw_sat(vsf_1, vsf_0); +} + +static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) { + HVX_Vector inv_aprox_sf = Q6_V_vsplat_R(0x7EEEEBB3); + HVX_Vector two_sf = hvx_vec_splat_fp32(2.0); + + // First approximation + HVX_Vector i_sf = Q6_Vw_vsub_VwVw(inv_aprox_sf, v_sf); + + HVX_Vector r_qf; + + // Refine + r_qf = Q6_Vqf32_vmpy_VsfVsf( + i_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(i_sf, v_sf))))); + r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32( + r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf)))); + r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32( + r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf)))); + + return Q6_Vsf_equals_Vqf32(r_qf); +} + +#define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 +#define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777 +#define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267 +#define FAST_SIGMOID_C3 (0x3f000000) // 0.5 + +static inline HVX_Vector hvx_vec_fast_sigmoid_fp32(HVX_Vector v) { + v = Q6_Vqf32_vmpy_VsfVsf(v, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F)); + v = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v), Q6_V_vsplat_R(FAST_SIGMOID_C3)); + + HVX_Vector in_int = hvx_vec_truncate_fp32(Q6_Vsf_equals_Vqf32(v)); + HVX_Vector x = Q6_Vqf32_vsub_Vqf32Vsf(v, Q6_Vsf_equals_Vw(in_int)); + HVX_Vector xx = Q6_Vqf32_vmpy_Vqf32Vqf32(x, x); + + HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(xx), Q6_V_vsplat_R(FAST_SIGMOID_C2)); + v1 = Q6_Vqf32_vadd_Vqf32Vsf(v1, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F)); + + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(x), Q6_V_vsplat_R(FAST_SIGMOID_C1)); + v2 = Q6_Vqf32_vmpy_Vqf32Vqf32(v2, xx); + v2 = Q6_Vqf32_vadd_Vqf32Vqf32(v2, x); + + HVX_Vector v3 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vqf32(v2, v1)); + HVX_Vector v3_exponent = Q6_Vw_vasl_VwR(v3, 1); + v3_exponent = Q6_Vuw_vlsr_VuwR(v3_exponent, 24); + v3_exponent = Q6_Vw_vadd_VwVw(in_int, v3_exponent); + v3 = Q6_Vw_vaslacc_VwVwR(v3, in_int, 24); + + HVX_Vector v4 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_Vqf32Vqf32(v2, v1)); + HVX_Vector v5 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(v3, v4)); + + HVX_Vector res = hvx_vec_inverse_fp32(v5); + res = Q6_Vqf32_vmpy_VsfVsf(v3, res); + + return Q6_Vsf_equals_Vqf32(res); +} + +#define EXP_COEFF_5 (0x39506967) // 0.000198757 = 1/(7!) +#define EXP_COEFF_4 (0x3AB743CE) // 0.0013982 = 1/(6!) +#define EXP_COEFF_3 (0x3C088908) // 0.00833345 = 1/(5!) +#define EXP_COEFF_2 (0x3D2AA9C1) // 0.416658 = 1/(4!) +#define EXP_COEFF_1 (0x3E2AAAAA) // 0.16666667 = 1/(3!) +#define EXP_COEFF_0 (0x3F000000) // 0.5 = 1/(2!) 
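+// range-reduction constants: exp(x) = 2^f * exp(eps), f = floor(x * log2(e))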
+#define EXP_LOGN2 (0x3F317218) // ln(2) = 0.6931471805 +#define EXP_LOG2E (0x3FB8AA3B) // log2(e) = 1/ln(2) = 1.4426950408 +#define EXP_ONE (0x3f800000) // 1.0 +#define EXP_RANGE_R (0x41a00000) // 20.0 +#define EXP_RANGE_L (0xc1a00000) // -20.0 + +static inline HVX_Vector hvx_vec_exp_fp32(HVX_Vector in_vec) { + HVX_Vector z_qf32_v; + HVX_Vector x_v; + HVX_Vector x_qf32_v; + HVX_Vector y_v; + HVX_Vector k_v; + HVX_Vector f_v; + HVX_Vector epsilon_v; + HVX_Vector log2e = Q6_V_vsplat_R(EXP_LOG2E); + HVX_Vector logn2 = Q6_V_vsplat_R(EXP_LOGN2); + HVX_Vector E_const; + HVX_Vector zero_v = Q6_V_vzero(); + + // exp(x) is approximated as follows: + // f = floor(x/ln(2)) = floor(x*log2(e)) + // epsilon = x - f*ln(2) + // exp(x) = exp(epsilon+f*ln(2)) + // = exp(epsilon)*exp(f*ln(2)) + // = exp(epsilon)*2^f + // + // Since epsilon is close to zero, it can be approximated with its Taylor series: + // exp(x) ~= 1+x+x^2/2!+x^3/3!+...+x^n/n!+... + // Preserving the first eight elements, we get: + // exp(x) ~= 1+x+e0*x^2+e1*x^3+e2*x^4+e3*x^5+e4*x^6+e5*x^7 + // = 1+x+(E0+(E1+(E2+(E3+(E4+E5*x)*x)*x)*x)*x)*x^2 + + HVX_Vector temp_v = in_vec; + + // Clamp inputs to (-20.0, 20.0) + HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, Q6_V_vsplat_R(EXP_RANGE_R)); + HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(Q6_V_vsplat_R(EXP_RANGE_L), in_vec); + + in_vec = Q6_V_vmux_QVV(pred_cap_right, Q6_V_vsplat_R(EXP_RANGE_R), temp_v); + in_vec = Q6_V_vmux_QVV(pred_cap_left, Q6_V_vsplat_R(EXP_RANGE_L), temp_v); + + epsilon_v = Q6_Vqf32_vmpy_VsfVsf(log2e, in_vec); + epsilon_v = Q6_Vsf_equals_Vqf32(epsilon_v); + + // f_v is the floating point result and k_v is the integer result + f_v = hvx_vec_floor_fp32(epsilon_v); + k_v = hvx_vec_truncate_fp32(f_v); + + x_qf32_v = Q6_Vqf32_vadd_VsfVsf(in_vec, zero_v); + + // x = x - f_v * logn2; + epsilon_v = Q6_Vqf32_vmpy_VsfVsf(f_v, logn2); + x_qf32_v = Q6_Vqf32_vsub_Vqf32Vqf32(x_qf32_v, epsilon_v); + // normalize before every QFloat's vmpy + x_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(x_qf32_v, zero_v); + + // z = x * x; + z_qf32_v = Q6_Vqf32_vmpy_Vqf32Vqf32(x_qf32_v, x_qf32_v); + z_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(z_qf32_v, zero_v); + + x_v = Q6_Vsf_equals_Vqf32(x_qf32_v); + + // y = E4 + E5 * x; + E_const = Q6_V_vsplat_R(EXP_COEFF_5); + y_v = Q6_Vqf32_vmpy_VsfVsf(E_const, x_v); + E_const = Q6_V_vsplat_R(EXP_COEFF_4); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = E3 + y * x; + E_const = Q6_V_vsplat_R(EXP_COEFF_3); + y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = E2 + y * x; + E_const = Q6_V_vsplat_R(EXP_COEFF_2); + y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = E1 + y * x; + E_const = Q6_V_vsplat_R(EXP_COEFF_1); + y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = E0 + y * x; + E_const = Q6_V_vsplat_R(EXP_COEFF_0); + y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = x + y * z; + y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, z_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vqf32(y_v, x_qf32_v); + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v); + + // y = y + 1.0; + y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, Q6_V_vsplat_R(EXP_ONE)); + + // insert exponents + // y = 
ldexpf(y, k); + // y_v += k_v; // qf32 + // modify exponent + + y_v = Q6_Vsf_equals_Vqf32(y_v); + + // add k_v to the exponent of y_v + HVX_Vector y_v_exponent = Q6_Vw_vasl_VwR(y_v, 1); + + y_v_exponent = Q6_Vuw_vlsr_VuwR(y_v_exponent, IEEE_VSF_MANTLEN + 1); + y_v_exponent = Q6_Vw_vadd_VwVw(k_v, y_v_exponent); + + // exponent cannot be negative; if overflow is detected, result is set to zero + HVX_VectorPred qy_v_negative_exponent = Q6_Q_vcmp_gt_VwVw(zero_v, y_v_exponent); + + y_v = Q6_Vw_vaslacc_VwVwR(y_v, k_v, IEEE_VSF_MANTLEN); + + y_v = Q6_V_vmux_QVV(qy_v_negative_exponent, zero_v, y_v); + + return y_v; +} + +#define RSQRT_CONST 0x5f3759df // Constant for fast inverse square root calculation +#define RSQRT_ONE_HALF 0x3f000000 // 0.5 +#define RSQRT_THREE_HALVES 0x3fc00000 // 1.5 + +static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) { + //Algorithm : + // x2 = input*0.5 + // y = * (long *) &input + // y = 0x5f3759df - (y>>2) + // y = y*(threehalfs - x2*y*y) + + HVX_Vector rsqrtconst = Q6_V_vsplat_R(RSQRT_CONST); + HVX_Vector onehalf = Q6_V_vsplat_R(RSQRT_ONE_HALF); + HVX_Vector threehalfs = Q6_V_vsplat_R(RSQRT_THREE_HALVES); + + HVX_Vector x2, y, ypower2, temp; + + x2 = Q6_Vqf32_vmpy_VsfVsf(in_vec, onehalf); + x2 = Q6_Vqf32_vadd_Vqf32Vsf(x2, Q6_V_vzero()); + + y = Q6_Vw_vasr_VwR(in_vec, 1); + y = Q6_Vw_vsub_VwVw(rsqrtconst, y); + + // 1st iteration + ypower2 = Q6_Vqf32_vmpy_VsfVsf(y, y); + ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero()); + temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2); + temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp)); + temp = Q6_Vqf32_vmpy_VsfVsf(y, Q6_Vsf_equals_Vqf32(temp)); + + // 2nd iteration + y = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero()); + ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y); + ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero()); + temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2); + temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp)); + temp = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp); + + // 3rd iteration + y = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero()); + ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y); + ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero()); + temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2); + temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp)); + temp = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp); + + return Q6_Vsf_equals_Vqf32(temp); +} + +static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { + int step_of_1 = num_elems >> 5; + int remaining = num_elems - step_of_1 * VLEN_FP32; + + assert(remaining == 0); + + const HVX_Vector * restrict v_src = (HVX_Vector *) src; + HVX_Vector * restrict v_dst = (HVX_Vector *) dst; + +#pragma unroll(4) + for (int i = 0; i < step_of_1; i++) { + v_dst[i] = hvx_vec_fast_sigmoid_fp32(v_src[i]); + } +} + +float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems); +void hvx_mul_f32(const uint8_t * restrict src0, + const uint8_t * restrict src1, + uint8_t * restrict dst, + const int num_elems); +void hvx_mul_f32_opt(const uint8_t * restrict src0, + const uint8_t * restrict src1, + uint8_t * restrict dst, + const int num_elems); +void hvx_mul_mul_f32_opt(const uint8_t * restrict src0, + const uint8_t * restrict src1, + const uint8_t * restrict src2, + uint8_t * restrict dst, + const int num_elems); +void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems); +void hvx_add_f32(const uint8_t * restrict src0, + const uint8_t * 
restrict src1, + uint8_t * restrict dst, + const int num_elems); +void hvx_add_f32_opt(const uint8_t * restrict src0, + const uint8_t * restrict src1, + uint8_t * restrict dst, + const int num_elems); +void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems); +void hvx_sub_f32(const uint8_t * restrict src0, + const uint8_t * restrict src1, + uint8_t * restrict dst, + const int num_elems); +void hvx_sub_f32_opt(const uint8_t * restrict src0, + const uint8_t * restrict src1, + uint8_t * restrict dst, + const int num_elems); +void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems); +void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, const float scale); +void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems); +void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems); +void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate); +float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems); +float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems); +void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems); +void hvx_clamp_scalar_f32(const uint8_t * restrict src, + const float limit_left, + const float limit_right, + uint8_t * restrict dst, + const int num_elems); + +#endif /* HVX_UTILS_H */ diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c new file mode 100644 index 00000000000..e35ea3b0211 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -0,0 +1,945 @@ +#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" +#pragma clang diagnostic ignored "-Wunused-function" + +#define FARF_ERROR 1 +#define FARF_HIGH 1 +#define FARF_MEDIUM 0 +#define FARF_LOW 0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "htp-ctx.h" +#include "htp-dma.h" +#include "htp-msg.h" +#include "htp-ops.h" +#include "ops-utils.h" +#include "worker-pool.h" + +AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) { + struct htp_context * ctx; + int err = 0; + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + return AEE_ENOMEMORY; + } + + // Use the context structure as a handle + *handle = (remote_handle64) ctx; + + // Enable FARF logs + HAP_setFARFRuntimeLoggingParams(0xffff, NULL, 0); + + // Set client class + { + HAP_power_request_t request; + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_apptype; + request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS; + + if ((err = HAP_power_set((void *) ctx, &request)) != 0) { + return err; + } + } + + { + HAP_power_request_t request; + memset(&request, 0, sizeof(request)); + + request.type = HAP_power_set_DCVS_v3; + request.dcvs_v3.set_dcvs_enable = TRUE; + request.dcvs_v3.dcvs_enable = TRUE; + request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; + request.dcvs_v3.set_bus_params = TRUE; + request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.set_core_params = TRUE; + request.dcvs_v3.core_params.min_corner = 
HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX; + request.dcvs_v3.set_sleep_disable = TRUE; + request.dcvs_v3.sleep_disable = TRUE; + if ((err = HAP_power_set((void *) ctx, &request)) != 0) { + return err; + } + + memset(&request, 0, sizeof(request)); + request.type = HAP_power_set_HVX; + request.hvx.power_up = TRUE; + if ((err = HAP_power_set((void *) ctx, &request)) != 0) { + return err; + } + } + + { + // Power on HMX + HAP_power_request_t request; + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_HMX; + request.hmx.power_up = TRUE; + FARF(ALWAYS, "Powering HMX on\n"); + err = HAP_power_set((void *) &ctx, &request); + if (err != AEE_SUCCESS) { + FARF(ERROR, "Error powering on HMX."); + return err; + } + } + + return AEE_SUCCESS; +} + +AEEResult htp_iface_close(remote_handle64 handle) { + struct htp_context * ctx = (struct htp_context *) handle; + + if (!ctx) { + return AEE_EBADPARM; + } + + if (ctx->queue) { + FARF(ERROR, "Closing handle with queue still open"); + return AEE_EITEMBUSY; + } + + free(ctx); + return AEE_SUCCESS; +} + +AEEResult htp_iface_enable_etm(remote_handle64 handle) { + int err = HAP_user_etm_enable(); + if (err) { + if (err == AEE_EVERSIONNOTSUPPORT) { + FARF(ERROR, "API HAP_user_etm_enable is not supported\n"); + } else { + FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err); + } + } + return err; +} + +AEEResult htp_iface_disable_etm(remote_handle64 handle) { + int err = HAP_user_etm_disable(); + if (err) { + if (err == AEE_EVERSIONNOTSUPPORT) { + FARF(ERROR, "API HAP_user_etm_disable is not supported\n"); + } else { + FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err); + } + } + return err; +} + +static int vtcm_acquire(struct htp_context * ctx) { + if (!ctx->vtcm_valid) { + // Temporarily bump thread priority to make sure it's higher than other sessions. + // This way the resource manager will notify the other thread to release VTCM. + // Note that we need to reaquire VTCM at normal priority for this to work next time. + qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10); + HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000); + HAP_compute_res_release_cached(ctx->vtcm_rctx); + qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio); + + HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000); + ctx->vtcm_valid = true; + } + + ctx->vtcm_inuse = true; + return 0; +} + +static int vtcm_release(struct htp_context * ctx) { + ctx->vtcm_inuse = false; + + if (ctx->vtcm_valid && ctx->vtcm_needs_release) { + ctx->vtcm_valid = false; + ctx->vtcm_needs_release = false; + HAP_compute_res_release_cached(ctx->vtcm_rctx); + } + + return 0; +} + +static int vtcm_release_callback(unsigned int rctx, void * state) { + struct htp_context * ctx = (struct htp_context *) state; + + if (!ctx || ctx->vtcm_rctx != rctx) { + return AEE_EBADPARM; + } + + // If VTCM is not inuse (not processing Ops) release it right here + // otherwise we'll release it once we're done with the current Op. 
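+    // This callback is invoked when the compute resource manager wants the cached
+    // VTCM back for another client; see vtcm_acquire()/vtcm_release() for the
+    // cooperative hand-off.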
+ + if (ctx->vtcm_inuse) { + ctx->vtcm_needs_release = false; + return 0; + } + + ctx->vtcm_valid = false; + HAP_compute_res_release_cached(ctx->vtcm_rctx); + + return 0; +} + +static int vtcm_alloc(struct htp_context * ctx) { + unsigned int vtcm_size = 8 * 1024 * 1024; // 8MB default + HAP_compute_res_query_VTCM(0, &vtcm_size, NULL, NULL, NULL); + + compute_res_attr_t attr; + HAP_compute_res_attr_init(&attr); + HAP_compute_res_attr_set_serialize(&attr, 0); + HAP_compute_res_attr_set_cache_mode(&attr, 1); + HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, vtcm_size, vtcm_size); + HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx); + HAP_compute_res_attr_set_hmx_param(&attr, 1); + + // Allocate VTCM for scratch pads + uint32_t rctx = HAP_compute_res_acquire(&attr, 1000000 /* timeout */); + if (!rctx) { + FARF(ERROR, "failed to allocate %zu bytes VTCM\n", ctx->vtcm_size); + return AEE_ENOMEMORY; + } + + void * vtcm_ptr; + if (HAP_compute_res_attr_get_vtcm_ptr_v2(&attr, &vtcm_ptr, &vtcm_size) != 0) { + HAP_compute_res_release(rctx); + FARF(ERROR, "failed to allocate %zu bytes VTCM (new)\n", ctx->vtcm_size); + return AEE_ENOMEMORY; + } + + ctx->vtcm_base = (uint8_t *) vtcm_ptr; + ctx->vtcm_size = vtcm_size; + ctx->vtcm_rctx = rctx; + ctx->vtcm_valid = false; + ctx->vtcm_inuse = false; + ctx->vtcm_needs_release = false; + + return 0; +} + +static void vtcm_free(struct htp_context * ctx) { + if (ctx->vtcm_rctx) { + HAP_compute_res_release(ctx->vtcm_rctx); + ctx->vtcm_base = 0; + ctx->vtcm_rctx = 0; + } +} + +static void htp_packet_callback(dspqueue_t queue, int error, void * context); +static void htp_error_callback(dspqueue_t queue, int error, void * context); + +AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx) { + struct htp_context * ctx = (struct htp_context *) handle; + + if (!ctx) { + return AEE_EBADPARM; + } + + if (ctx->queue) { + FARF(ERROR, "Queue already open"); + return AEE_EITEMBUSY; + } + + // Import queue created on the CPU + int err = dspqueue_import(dsp_queue_id, // Queue ID from dspqueue_export + htp_packet_callback, // Packet callback + htp_error_callback, // Error callback; no errors expected on the DSP + (void *) ctx, // Callback context + &ctx->queue); + + if (err) { + FARF(ERROR, "Queue import failed with 0x%08x", (unsigned) err); + return err; + } + + ctx->thread_id = qurt_thread_get_id(); + ctx->thread_prio = qurt_thread_get_priority(ctx->thread_id); + + // allocate VTCM + err = vtcm_alloc(ctx); + if (err != AEE_SUCCESS) { + FARF(ERROR, "Unable to allocate VTCM"); + return AEE_ENOMEMORY; + } + + qurt_sysenv_max_hthreads_t hw_threads; + qurt_sysenv_get_max_hw_threads(&hw_threads); + uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF; + + if (n_hvx == 0) { + n_hvx = hw_nhvx; + } + if (n_hvx > hw_threads.max_hthreads) { + n_hvx = hw_threads.max_hthreads; + } + if (n_hvx > HTP_MAX_NTHREADS) { + n_hvx = HTP_MAX_NTHREADS; + } + + ctx->n_threads = n_hvx; + for (int i = 0; i < ctx->n_threads; i++) { + ctx->dma[i] = dma_queue_create(HTP_SPAD_SRC0_NROWS * 2); + } + + // init worker pool + err = worker_pool_init(&ctx->worker_pool, n_hvx); + if (err != AEE_SUCCESS) { + FARF(ERROR, "Unable to create worker pool"); + return err; + } + + FARF(HIGH, "session %u started: n-hvx %u vtcm-size %zu vtcm-rctx %u n-threads %u thread-id %d thread-prio %d \n", + sess_id, hw_nhvx, ctx->vtcm_size, ctx->vtcm_rctx, ctx->n_threads, ctx->thread_id, ctx->thread_prio); + + return AEE_SUCCESS; +} + +AEEResult 
htp_iface_stop(remote_handle64 handle) { + struct htp_context * ctx = (struct htp_context *) handle; + if (!ctx) { + return AEE_EBADPARM; + } + + if (!ctx->queue) { + FARF(ERROR, "Queue not open"); + return AEE_EBADSTATE; + } + + // Close queue. dspqueue_close() will also wait for callbacks to finish. + int err = dspqueue_close(ctx->queue); + ctx->queue = NULL; + if (err != 0) { + FARF(ERROR, "Queue close failed with 0x%08x", (unsigned) err); + return err; + } + + if (ctx->worker_pool) { + // Release worker pool + worker_pool_release(&ctx->worker_pool); + } + + for (int i = 0; i < ctx->n_threads; i++) { + dma_queue_delete(ctx->dma[i]); + } + + vtcm_free(ctx); + + return AEE_SUCCESS; +} + +static void htp_error_callback(dspqueue_t queue, int error, void * context) { + // No errors expected on the DSP. + FARF(ERROR, "Error callback: 0x%08x", (unsigned) error); +} + +struct profile_data { + uint64_t usecs; + uint64_t cycles; + uint64_t pkts; +}; + +static inline void profile_start(struct profile_data * d) { + d->usecs = HAP_perf_get_qtimer_count(); + d->cycles = htp_get_cycles(); + d->pkts = htp_get_pktcnt(); +} + +static inline void profile_stop(struct profile_data * d) { + d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs); + d->cycles = htp_get_cycles() - d->cycles; + d->pkts = htp_get_pktcnt() - d->pkts; +} + +static int send_htp_rsp(struct htp_context * c, + uint32_t op, + uint32_t status, + struct dspqueue_buffer * bufs, + size_t n_bufs, + struct profile_data * prof) { + // Prep response struct + struct htp_general_rsp rsp; + rsp.op = op; + rsp.status = status; + rsp.prof_usecs = prof->usecs; + rsp.prof_cycles = prof->cycles; + rsp.prof_pkts = prof->pkts; + + int err = dspqueue_write(c->queue, + 0, // Flags + n_bufs, + bufs, // Buffer references + sizeof(rsp), + (const uint8_t *) &rsp, // Message + DSPQUEUE_TIMEOUT_NONE); + + if (err != 0) { + FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err); + } + + return err; +} + +static void proc_matmul_req(struct htp_context * ctx, + struct htp_general_req * req, + struct dspqueue_buffer * bufs, + size_t n_bufs) { + // Prep response buffer structs (needed for error responses, etc) + struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS]; + memset(rsp_bufs, 0, sizeof(rsp_bufs)); + rsp_bufs[0].fd = bufs[0].fd; + rsp_bufs[0].ptr = bufs[0].ptr; + rsp_bufs[0].size = bufs[0].size; + rsp_bufs[0].offset = bufs[0].offset; + rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference + + rsp_bufs[1].fd = bufs[1].fd; + rsp_bufs[1].ptr = bufs[1].ptr; + rsp_bufs[1].size = bufs[1].size; + rsp_bufs[1].offset = bufs[1].offset; + rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference + + // We had written to the output buffer, we'd also need to flush it + rsp_bufs[2].fd = bufs[2].fd; + rsp_bufs[2].ptr = bufs[2].ptr; + rsp_bufs[2].size = bufs[2].size; + rsp_bufs[2].offset = bufs[2].offset; + rsp_bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU + + // Setup Op context + struct htp_ops_context octx = { 0 }; + octx.ctx = ctx; + octx.src0 = req->src0; + octx.src1 = req->src1; + octx.dst = req->dst; + octx.flags = req->flags; + octx.op = req->op; + + // Update data pointers + octx.src0.data = (uint32_t) bufs[0].ptr; + octx.src1.data = (uint32_t) bufs[1].ptr; + octx.dst.data = (uint32_t) bufs[2].ptr; + octx.n_threads = ctx->n_threads; + + struct profile_data prof; + 
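+    // Handler pattern (sketch of the code that follows; the same shape is used by every
+    // proc_*_req below): start profiling, grab the cached VTCM context, run the op,
+    // release VTCM, then report the status and profile counters back to the CPU:
+    //
+    //     profile_start(&prof);
+    //     if (vtcm_acquire(ctx) == AEE_SUCCESS) {
+    //         rsp_status = op_matmul(&octx);    // op_binary / op_unary / op_rope / ...
+    //         vtcm_release(ctx);
+    //     }
+    //     profile_stop(&prof);
+    //     send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, n_rsp_bufs, &prof);
+    //
+    // A failed vtcm_acquire() skips the op and reports HTP_STATUS_INTERNAL_ERR.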
profile_start(&prof); + + uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; + if (vtcm_acquire(ctx) == AEE_SUCCESS) { + rsp_status = op_matmul(&octx); + vtcm_release(ctx); + } + + profile_stop(&prof); + send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 3, &prof); +} + +static void proc_matmul_id_req(struct htp_context * ctx, + struct htp_general_req * req, + struct dspqueue_buffer * bufs, + size_t n_bufs) { + // Prep response buffer structs (needed for error responses, etc) + struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS]; + memset(rsp_bufs, 0, sizeof(rsp_bufs)); + rsp_bufs[0].fd = bufs[0].fd; + rsp_bufs[0].ptr = bufs[0].ptr; + rsp_bufs[0].size = bufs[0].size; + rsp_bufs[0].offset = bufs[0].offset; + rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference + + rsp_bufs[1].fd = bufs[1].fd; + rsp_bufs[1].ptr = bufs[1].ptr; + rsp_bufs[1].size = bufs[1].size; + rsp_bufs[1].offset = bufs[1].offset; + rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference + + rsp_bufs[2].fd = bufs[2].fd; + rsp_bufs[2].ptr = bufs[2].ptr; + rsp_bufs[2].size = bufs[2].size; + rsp_bufs[2].offset = bufs[2].offset; + rsp_bufs[2].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference + + // We had written to the output buffer, we'd also need to flush it + rsp_bufs[3].fd = bufs[3].fd; + rsp_bufs[3].ptr = bufs[3].ptr; + rsp_bufs[3].size = bufs[3].size; + rsp_bufs[3].offset = bufs[3].offset; + rsp_bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU + + // Setup Op context + struct htp_ops_context octx = { 0 }; + octx.ctx = ctx; + octx.src0 = req->src0; + octx.src1 = req->src1; + octx.src2 = req->src2; + octx.dst = req->dst; + octx.flags = req->flags; + octx.op = req->op; + + // Update data pointers + octx.src0.data = (uint32_t) bufs[0].ptr; + octx.src1.data = (uint32_t) bufs[1].ptr; + octx.src2.data = (uint32_t) bufs[2].ptr; + octx.dst.data = (uint32_t) bufs[3].ptr; + octx.n_threads = ctx->n_threads; + + struct profile_data prof; + profile_start(&prof); + + uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; + if (vtcm_acquire(ctx) == AEE_SUCCESS) { + rsp_status = op_matmul_id(&octx); + vtcm_release(ctx); + } + + profile_stop(&prof); + send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 4, &prof); +} + +static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { + struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS]; + memset(rsp_bufs, 0, sizeof(rsp_bufs)); + + rsp_bufs[0].fd = bufs[0].fd; + rsp_bufs[0].ptr = bufs[0].ptr; + rsp_bufs[0].offset = bufs[0].offset; + rsp_bufs[0].size = bufs[0].size; + rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference + + rsp_bufs[1].fd = bufs[1].fd; + rsp_bufs[1].ptr = bufs[1].ptr; + rsp_bufs[1].offset = bufs[1].offset; + rsp_bufs[1].size = bufs[1].size; + rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference + + // We had written to the output buffer, we'd also need to flush it + rsp_bufs[2].fd = bufs[2].fd; + rsp_bufs[2].ptr = bufs[2].ptr; + rsp_bufs[2].offset = bufs[2].offset; + rsp_bufs[2].size = bufs[2].size; + rsp_bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU + + // Setup Op context + struct htp_ops_context octx = { 0 }; + octx.ctx = ctx; + octx.src0 = req->src0; + octx.src1 = req->src1; + octx.dst = req->dst; 
+ octx.flags = req->flags; + octx.op = req->op; + + // Update data pointers + octx.src0.data = (uint32_t) bufs[0].ptr; + octx.src1.data = (uint32_t) bufs[1].ptr; + octx.dst.data = (uint32_t) bufs[2].ptr; + octx.n_threads = ctx->n_threads; + + struct profile_data prof; + profile_start(&prof); + + uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; + if (vtcm_acquire(ctx) == AEE_SUCCESS) { + rsp_status = op_binary(&octx); + vtcm_release(ctx); + } + + profile_stop(&prof); + send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 3, &prof); +} + +static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { + struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS]; + memset(rsp_bufs, 0, sizeof(rsp_bufs)); + + rsp_bufs[0].fd = bufs[0].fd; + rsp_bufs[0].ptr = bufs[0].ptr; + rsp_bufs[0].offset = bufs[0].offset; + rsp_bufs[0].size = bufs[0].size; + rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference + + rsp_bufs[1].fd = bufs[1].fd; + rsp_bufs[1].ptr = bufs[1].ptr; + rsp_bufs[1].offset = bufs[1].offset; + rsp_bufs[1].size = bufs[1].size; + rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference + + rsp_bufs[2].fd = bufs[2].fd; + rsp_bufs[2].ptr = bufs[2].ptr; + rsp_bufs[2].offset = bufs[2].offset; + rsp_bufs[2].size = bufs[2].size; + rsp_bufs[2].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference + + // We had written to the output buffer, we'd also need to flush it + rsp_bufs[3].fd = bufs[3].fd; + rsp_bufs[3].ptr = bufs[3].ptr; + rsp_bufs[3].offset = bufs[3].offset; + rsp_bufs[3].size = bufs[3].size; + rsp_bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU + + // Setup Op context + struct htp_ops_context octx = { 0 }; + octx.ctx = ctx; + octx.src0 = req->src0; + octx.src1 = req->src1; + octx.src2 = req->src2; + octx.dst = req->dst; + octx.flags = req->flags; + octx.op = req->op; + + // Update data pointers + octx.src0.data = (uint32_t) bufs[0].ptr; + octx.src1.data = (uint32_t) bufs[1].ptr; + octx.src2.data = (uint32_t) bufs[2].ptr; + octx.dst.data = (uint32_t) bufs[3].ptr; + octx.n_threads = ctx->n_threads; + + struct profile_data prof; + profile_start(&prof); + + uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; + if (vtcm_acquire(ctx) == AEE_SUCCESS) { + rsp_status = op_binary(&octx); + vtcm_release(ctx); + } + + profile_stop(&prof); + send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 4, &prof); +} + +static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) { + struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS]; + memset(rsp_bufs, 0, sizeof(rsp_bufs)); + + rsp_bufs[0].fd = bufs[0].fd; + rsp_bufs[0].ptr = bufs[0].ptr; + rsp_bufs[0].offset = bufs[0].offset; + rsp_bufs[0].size = bufs[0].size; + rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference + + // We had written to the output buffer, we'd also need to flush it + rsp_bufs[1].fd = bufs[1].fd; + rsp_bufs[1].ptr = bufs[1].ptr; + rsp_bufs[1].offset = bufs[1].offset; + rsp_bufs[1].size = bufs[1].size; + rsp_bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU + + // Setup Op context + struct htp_ops_context octx = { 0 }; + octx.ctx = ctx; + octx.src0 = req->src0; + octx.dst = req->dst; + octx.flags = req->flags; + octx.op = req->op; + + 
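+    // op_params is forwarded as an opaque blob; only the op implementation interprets it
+    // (e.g. RMS_NORM presumably takes its epsilon from here, mirroring ggml's
+    // tensor->op_params layout).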
memcpy(octx.op_params, req->op_params, sizeof(octx.op_params)); + + // Update data pointers + octx.src0.data = (uint32_t) bufs[0].ptr; + octx.dst.data = (uint32_t) bufs[1].ptr; + octx.n_threads = ctx->n_threads; + + struct profile_data prof; + profile_start(&prof); + + uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; + if (vtcm_acquire(ctx) == AEE_SUCCESS) { + rsp_status = op_unary(&octx); + vtcm_release(ctx); + } + + profile_stop(&prof); + send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 2, &prof); +} + +static void proc_activations_req(struct htp_context * ctx, + struct htp_general_req * req, + struct dspqueue_buffer * bufs, + uint32_t n_bufs) { + struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS]; + memset(rsp_bufs, 0, sizeof(rsp_bufs)); + + rsp_bufs[0].fd = bufs[0].fd; + rsp_bufs[0].ptr = bufs[0].ptr; + rsp_bufs[0].offset = bufs[0].offset; + rsp_bufs[0].size = bufs[0].size; + rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference + + int write_idx = 1; + if (3 == n_bufs) { + rsp_bufs[1].fd = bufs[1].fd; + rsp_bufs[1].ptr = bufs[1].ptr; + rsp_bufs[1].offset = bufs[1].offset; + rsp_bufs[1].size = bufs[1].size; + rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference + + write_idx = 2; + } + + // We had written to the output buffer, we'd also need to flush it + rsp_bufs[write_idx].fd = bufs[write_idx].fd; + rsp_bufs[write_idx].ptr = bufs[write_idx].ptr; + rsp_bufs[write_idx].offset = bufs[write_idx].offset; + rsp_bufs[write_idx].size = bufs[write_idx].size; + rsp_bufs[write_idx].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU + + // Setup Op context + struct htp_ops_context octx = { 0 }; + octx.ctx = ctx; + octx.src0 = req->src0; + if (3 == n_bufs) { + octx.src1 = req->src1; + } + octx.dst = req->dst; + octx.flags = req->flags; + octx.op = req->op; + + memcpy(octx.op_params, req->op_params, sizeof(octx.op_params)); + + // Update data pointers + octx.src0.data = (uint32_t) bufs[0].ptr; + if (3 == n_bufs) { + octx.src1.data = (uint32_t) bufs[1].ptr; + octx.dst.data = (uint32_t) bufs[2].ptr; + } else { + octx.dst.data = (uint32_t) bufs[1].ptr; + } + octx.n_threads = ctx->n_threads; + + struct profile_data prof; + profile_start(&prof); + + uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; + if (vtcm_acquire(ctx) == AEE_SUCCESS) { + if (octx.op == HTP_OP_SOFTMAX) { + rsp_status = op_softmax(&octx); + } else { + rsp_status = op_activations(&octx); + } + vtcm_release(ctx); + } + + profile_stop(&prof); + send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, n_bufs, &prof); +} + +static void proc_rope_req(struct htp_context * ctx, + struct htp_general_req * req, + struct dspqueue_buffer * bufs, + uint32_t n_bufs) { + struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS]; + memset(rsp_bufs, 0, sizeof(rsp_bufs)); + + rsp_bufs[0].fd = bufs[0].fd; + rsp_bufs[0].ptr = bufs[0].ptr; + rsp_bufs[0].offset = bufs[0].offset; + rsp_bufs[0].size = bufs[0].size; + rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference + + rsp_bufs[1].fd = bufs[1].fd; + rsp_bufs[1].ptr = bufs[1].ptr; + rsp_bufs[1].offset = bufs[1].offset; + rsp_bufs[1].size = bufs[1].size; + rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference + + int write_idx = 2; + if (4 == n_bufs) { + rsp_bufs[write_idx].fd = bufs[write_idx].fd; + rsp_bufs[write_idx].ptr = bufs[write_idx].ptr; + rsp_bufs[write_idx].offset = bufs[write_idx].offset; + rsp_bufs[write_idx].size = 
bufs[write_idx].size; + rsp_bufs[write_idx].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference + + write_idx++; + } + + // We had written to the output buffer, we'd also need to flush it + rsp_bufs[write_idx].fd = bufs[write_idx].fd; + rsp_bufs[write_idx].ptr = bufs[write_idx].ptr; + rsp_bufs[write_idx].offset = bufs[write_idx].offset; + rsp_bufs[write_idx].size = bufs[write_idx].size; + rsp_bufs[write_idx].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU + + // Setup Op context + struct htp_ops_context octx = { 0 }; + octx.ctx = ctx; + octx.src0 = req->src0; + octx.src1 = req->src1; + if (4 == n_bufs) { + octx.src2 = req->src2; + } + octx.dst = req->dst; + octx.flags = req->flags; + octx.op = req->op; + + memcpy(octx.op_params, req->op_params, sizeof(octx.op_params)); + + // Update data pointers + octx.src0.data = (uint32_t) bufs[0].ptr; + octx.src1.data = (uint32_t) bufs[1].ptr; + if (4 == n_bufs) { + octx.src2.data = (uint32_t) bufs[2].ptr; + octx.dst.data = (uint32_t) bufs[3].ptr; + } else { + octx.dst.data = (uint32_t) bufs[2].ptr; + } + octx.n_threads = ctx->n_threads; + + struct profile_data prof; + profile_start(&prof); + + uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR; + if (vtcm_acquire(ctx) == AEE_SUCCESS) { + rsp_status = op_rope(&octx); + vtcm_release(ctx); + } + + profile_stop(&prof); + send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, n_bufs, &prof); +} + +static void htp_packet_callback(dspqueue_t queue, int error, void * context) { + struct htp_context * ctx = (struct htp_context *) context; + + // Repeatedly read packets from the queue until it's empty. We don't + // necessarily get a separate callback for each packet, and new packets + // may arrive while we're processing the previous one. This ensures we + // keep the DSP busy as much as possible and avoid waiting for the CPU. 
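+    //
+    // Drain pattern (sketch):
+    //
+    //     while (1) {
+    //         err = dspqueue_read_noblock(queue, ...);
+    //         if (err == AEE_EWOULDBLOCK) return;    // queue fully drained
+    //         if (err != 0)               return;    // hard error
+    //         switch (req.op) { ... proc_*_req(...); ... }
+    //     }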
+ + while (1) { + struct htp_general_req req; + uint32_t req_size; + + struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS]; + uint32_t n_bufs; + uint32_t flags; + + // Read packet from queue + int err = dspqueue_read_noblock(queue, &flags, + HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references + &n_bufs, // Number of buffer references + bufs, // Buffer references + sizeof(req), // Max message length + &req_size, // Message length + (uint8_t *) &req); // Message + + if (err == AEE_EWOULDBLOCK) { + // Consumed all packets available for now + return; + } + + if (err != 0) { + FARF(ERROR, "dspqueue_read_noblock failed: 0x%08x", (unsigned) err); + return; + } + + if (req_size != sizeof(req)) { + FARF(ERROR, "Invalid request size"); + continue; + } + + if (req.flags & HTP_OPFLAGS_EARLY_WAKEUP) { + // Host wants early notification + dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0); + } + + // Process packet based on its message type + switch (req.op) { + case HTP_OP_MUL_MAT: + if (n_bufs != 3) { + FARF(ERROR, "Bad matmul-req buffer list"); + continue; + } + proc_matmul_req(ctx, &req, bufs, n_bufs); + break; + + case HTP_OP_MUL_MAT_ID: + if (n_bufs != 4) { + FARF(ERROR, "Bad matmul-id-req buffer list"); + continue; + } + proc_matmul_id_req(ctx, &req, bufs, n_bufs); + break; + + case HTP_OP_MUL: + case HTP_OP_ADD: + case HTP_OP_SUB: + if (n_bufs != 3) { + FARF(ERROR, "Bad binary-req buffer list"); + continue; + } + proc_binary_req(ctx, &req, bufs); + break; + + case HTP_OP_RMS_NORM: + if (n_bufs != 2) { + FARF(ERROR, "Bad unary-req buffer list"); + continue; + } + + proc_unary_req(ctx, &req, bufs); + break; + + case HTP_OP_UNARY_SILU: + if (n_bufs != 2) { + FARF(ERROR, "Bad act-req buffer list"); + continue; + } + proc_activations_req(ctx, &req, bufs, n_bufs); + break; + + case HTP_OP_GLU_SWIGLU: + case HTP_OP_SOFTMAX: + if ((n_bufs != 2) && (n_bufs != 3)) { + FARF(ERROR, "Bad act-req buffer list"); + continue; + } + proc_activations_req(ctx, &req, bufs, n_bufs); + break; + + case HTP_OP_ADD_ID: + if (n_bufs != 4) { + FARF(ERROR, "Bad add-id-req buffer list"); + continue; + } + proc_add_id_req(ctx, &req, bufs); + break; + + case HTP_OP_ROPE: + if ((n_bufs != 3) && (n_bufs != 4)) { + FARF(ERROR, "Bad rope-req buffer list"); + continue; + } + proc_rope_req(ctx, &req, bufs, n_bufs); + break; + + default: + FARF(ERROR, "Unknown Op %u", req.op); + break; + } + } +} diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c new file mode 100644 index 00000000000..10d9ebbd597 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -0,0 +1,2215 @@ +#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" + +#ifdef HTP_DEBUG +# define FARF_HIGH 1 +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "htp-ctx.h" +#include "htp-dma.h" +#include "htp-msg.h" +#include "htp-ops.h" +#include "hvx-utils.h" +#include "ops-utils.h" + +struct htp_matmul_type { + const char * type; + void (*vec_dot)(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); + void (*vec_dot_rx2)(const int n, + float * restrict s, + const void * restrict vx, + uint32_t vx_row_size, + const void * restrict vy); +}; + +typedef struct { + HVX_Vector v[2]; +} 
HVX_Vector_x2; + +typedef struct { + HVX_Vector v[4]; +} HVX_Vector_x4; + +typedef struct { + HVX_Vector v[8]; +} HVX_Vector_x8; + +// vdelta control to replicate first 4x fp32 values across lanes +static const uint8_t __attribute__((aligned(128))) repl_4x_fp32[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, + 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, + 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, + 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x40, 0x40, 0x40, 0x40, + 0x44, 0x44, 0x44, 0x44, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, + 0x04, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, + 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, +}; + +// vdelta control to replicate and interleave first 8x fp32 values across lanes +static const uint8_t __attribute__((aligned(128))) repl_interleave_8x_fp32[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, + 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, + 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x40, 0x40, 0x40, 0x40, + 0x44, 0x44, 0x44, 0x44, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x40, 0x40, 0x40, 0x40, 0x44, 0x44, 0x44, + 0x44, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, + 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, +}; + +// vdelta control to replicate first fp32 value across all elements +static const uint8_t __attribute__((aligned(128))) repl_1x_fp32[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, + 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, + 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, + 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, + 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, + 0x04, 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, 0x10, 0x10, + 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, +}; + +// vdelta control to replicate first fp16 value across all elements +static const uint8_t __attribute__((aligned(128))) repl_1x_fp16[128] = { + 0x00, 0x00, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x10, 0x10, 0x02, + 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x20, 0x20, 0x02, 0x02, 0x04, 0x04, + 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, + 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x40, 0x40, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, + 
0x04, 0x04, 0x02, 0x02, 0x10, 0x10, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, + 0x02, 0x20, 0x20, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x10, 0x10, + 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, 0x08, 0x08, 0x02, 0x02, 0x04, 0x04, 0x02, 0x02, +}; + +// vdelta control to expand first 32 e8m0 values into 32 uint32 elements +static const uint8_t __attribute__((aligned(128))) expand_x32_e8m0[128] = { + 0x00, 0x00, 0x00, 0x00, 0x01, 0x04, 0x00, 0x00, 0x02, 0x00, 0x08, 0x08, 0x01, 0x02, 0x00, 0x04, 0x04, 0x00, 0x00, + 0x00, 0x11, 0x10, 0x10, 0x10, 0x02, 0x00, 0x04, 0x00, 0x01, 0x02, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00, 0x01, 0x04, + 0x00, 0x00, 0x22, 0x20, 0x20, 0x20, 0x21, 0x22, 0x20, 0x24, 0x04, 0x00, 0x00, 0x00, 0x09, 0x08, 0x00, 0x00, 0x02, + 0x00, 0x04, 0x00, 0x11, 0x12, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x01, 0x04, 0x00, 0x00, 0x02, 0x00, 0x08, 0x08, + 0x01, 0x02, 0x00, 0x04, 0x44, 0x40, 0x40, 0x40, 0x41, 0x40, 0x40, 0x40, 0x42, 0x40, 0x44, 0x40, 0x41, 0x42, 0x48, + 0x48, 0x08, 0x08, 0x00, 0x00, 0x01, 0x04, 0x00, 0x00, 0x12, 0x10, 0x10, 0x10, 0x01, 0x02, 0x00, 0x04, 0x04, 0x00, + 0x00, 0x00, 0x09, 0x08, 0x00, 0x00, 0x22, 0x20, 0x24, 0x20, 0x21, 0x22, 0x20, 0x20, +}; + +static const uint8_t __attribute__((aligned(VLEN))) kvalues_mxfp4_lut[] = { + 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 6, 0, 8, 0, 12, 0, 0, 0, 0xff, 0, 0xfe, 0, 0xfd, 0, 0xfc, 0, + 0xfa, 0, 0xf8, 0, 0xf4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +// q4x4x2 and q8x4x2 are the flat q4/8_0 formats where all quants are stored first followed by all scales + +static inline size_t q8x4x2_row_size(uint32_t ne) { + // ensures perfect alignment of quants and full row + const uint32_t qk = QK_Q8_0x4x2; + const uint32_t nb = (ne + qk - 1) / qk; + return htp_round_up(ne + nb * 8 * sizeof(__fp16), 128); +} + +static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; + + HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes) + HVX_Vector v2_3 = vptr[1]; // ... + HVX_Vector v4_5 = vptr[2]; // ... + HVX_Vector v6_7 = vptr[3]; // ... + + const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + + HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F + HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4 + HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F + HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4 + HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F + HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4 + HVX_Vector v6 = Q6_V_vand_VV(v6_7, mask_h4); // & 0x0F + HVX_Vector v7 = Q6_Vub_vlsr_VubR(v6_7, 4); // >> 4 + + // Convert uint4 to int4 (i.e. 
x - 8) + const HVX_Vector i8 = Q6_Vb_vsplat_R(8); + v0 = Q6_Vb_vsub_VbVb(v0, i8); + v1 = Q6_Vb_vsub_VbVb(v1, i8); + v2 = Q6_Vb_vsub_VbVb(v2, i8); + v3 = Q6_Vb_vsub_VbVb(v3, i8); + v4 = Q6_Vb_vsub_VbVb(v4, i8); + v5 = Q6_Vb_vsub_VbVb(v5, i8); + v6 = Q6_Vb_vsub_VbVb(v6, i8); + v7 = Q6_Vb_vsub_VbVb(v7, i8); + + HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 }; + return r; +} + +static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8(const uint8_t * restrict ptr) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; + + HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes) + HVX_Vector v2_3 = vptr[1]; // ... + HVX_Vector v4_5 = vptr[2]; // ... + HVX_Vector v6_7 = vptr[3]; // ... + + const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + + HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F + HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4 + HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F + HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4 + HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F + HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4 + HVX_Vector v6 = Q6_V_vand_VV(v6_7, mask_h4); // & 0x0F + HVX_Vector v7 = Q6_Vub_vlsr_VubR(v6_7, 4); // >> 4 + + HVX_Vector lut = *(const HVX_Vector *) kvalues_mxfp4_lut; + v0 = Q6_Vb_vlut32_VbVbI(v0, lut, 0); + v1 = Q6_Vb_vlut32_VbVbI(v1, lut, 0); + v2 = Q6_Vb_vlut32_VbVbI(v2, lut, 0); + v3 = Q6_Vb_vlut32_VbVbI(v3, lut, 0); + v4 = Q6_Vb_vlut32_VbVbI(v4, lut, 0); + v5 = Q6_Vb_vlut32_VbVbI(v5, lut, 0); + v6 = Q6_Vb_vlut32_VbVbI(v6, lut, 0); + v7 = Q6_Vb_vlut32_VbVbI(v7, lut, 0); + + HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 }; + return r; +} + +static inline HVX_Vector_x8 hvx_vec_load_q8x4x8(const uint8_t * restrict ptr) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; + + HVX_Vector v0 = vptr[0]; // first 128 vals + HVX_Vector v1 = vptr[1]; // ... + HVX_Vector v2 = vptr[2]; // ... + HVX_Vector v3 = vptr[3]; // ... + HVX_Vector v4 = vptr[4]; // ... + HVX_Vector v5 = vptr[5]; // ... + HVX_Vector v6 = vptr[6]; // ... + HVX_Vector v7 = vptr[7]; // ... 
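+    // One call pulls in 8 full HVX vectors (1024 int8 quants); callers step through the
+    // row in blocks of exactly this size, with the per-sub-block fp16 scales stored after
+    // all of the quants (the "flat" q8x4x2 layout described above).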
+ + HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 }; + return r; +} + +static inline HVX_Vector_x4 hvx_vec_load_x4_f16(const uint8_t * restrict ptr) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; + + HVX_Vector v0 = vptr[0]; // first 64 vals + HVX_Vector v1 = vptr[1]; // second 64 vals + HVX_Vector v2 = vptr[2]; // third 64 vals + HVX_Vector v3 = vptr[3]; // forth 64 vals + + HVX_Vector_x4 r = { v0, v1, v2, v3 }; + return r; +} + +static inline HVX_Vector_x4 hvx_vec_load_x4_f32_as_f16(const uint8_t * restrict ptr) { + const HVX_VectorPair * restrict vptr = (const HVX_VectorPair *) ptr; + + HVX_VectorPair v0 = vptr[0]; // first 64 vals + HVX_VectorPair v1 = vptr[1]; // second 64 vals + HVX_VectorPair v2 = vptr[2]; // third 64 vals + HVX_VectorPair v3 = vptr[3]; // forth 64 vals + + HVX_Vector vq0_lo = Q6_Vqf32_vsub_VsfVsf(Q6_V_lo_W(v0), Q6_V_vzero()); + HVX_Vector vq0_hi = Q6_Vqf32_vsub_VsfVsf(Q6_V_hi_W(v0), Q6_V_vzero()); + HVX_Vector vq1_lo = Q6_Vqf32_vsub_VsfVsf(Q6_V_lo_W(v1), Q6_V_vzero()); + HVX_Vector vq1_hi = Q6_Vqf32_vsub_VsfVsf(Q6_V_hi_W(v1), Q6_V_vzero()); + HVX_Vector vq2_lo = Q6_Vqf32_vsub_VsfVsf(Q6_V_lo_W(v2), Q6_V_vzero()); + HVX_Vector vq2_hi = Q6_Vqf32_vsub_VsfVsf(Q6_V_hi_W(v2), Q6_V_vzero()); + HVX_Vector vq3_lo = Q6_Vqf32_vsub_VsfVsf(Q6_V_lo_W(v3), Q6_V_vzero()); + HVX_Vector vq3_hi = Q6_Vqf32_vsub_VsfVsf(Q6_V_hi_W(v3), Q6_V_vzero()); + + HVX_Vector vh0 = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vq0_hi, vq0_lo)); + HVX_Vector vh1 = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vq1_hi, vq1_lo)); + HVX_Vector vh2 = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vq2_hi, vq2_lo)); + HVX_Vector vh3 = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vq3_hi, vq3_lo)); + + // vcombine does a shuffle, use vdeal to undo + + HVX_Vector_x4 r = { Q6_Vh_vdeal_Vh(vh0), Q6_Vh_vdeal_Vh(vh1), Q6_Vh_vdeal_Vh(vh2), Q6_Vh_vdeal_Vh(vh3) }; + return r; +} + +static inline HVX_Vector hvx_vec_rmpy_x8(HVX_Vector_x8 x, HVX_Vector_x8 y) { + HVX_Vector r0 = Q6_Vw_vrmpy_VbVb(x.v[0], y.v[0]); + HVX_Vector r1 = Q6_Vw_vrmpy_VbVb(x.v[1], y.v[1]); + HVX_Vector r2 = Q6_Vw_vrmpy_VbVb(x.v[2], y.v[2]); + HVX_Vector r3 = Q6_Vw_vrmpy_VbVb(x.v[3], y.v[3]); + HVX_Vector r4 = Q6_Vw_vrmpy_VbVb(x.v[4], y.v[4]); + HVX_Vector r5 = Q6_Vw_vrmpy_VbVb(x.v[5], y.v[5]); + HVX_Vector r6 = Q6_Vw_vrmpy_VbVb(x.v[6], y.v[6]); + HVX_Vector r7 = Q6_Vw_vrmpy_VbVb(x.v[7], y.v[7]); + + HVX_VectorPair p0 = Q6_W_vshuff_VVR(r4, r0, 16); + HVX_VectorPair p1 = Q6_W_vshuff_VVR(r5, r1, 16); + HVX_VectorPair p2 = Q6_W_vshuff_VVR(r6, r2, 16); + HVX_VectorPair p3 = Q6_W_vshuff_VVR(r7, r3, 16); + + r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); + r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1)); + r2 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p2), Q6_V_hi_W(p2)); + r3 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p3), Q6_V_hi_W(p3)); + + p0 = Q6_W_vshuff_VVR(r2, r0, 8); + p1 = Q6_W_vshuff_VVR(r3, r1, 8); + r0 = Q6_Vw_vadd_VwVw(Q6_V_hi_W(p0), Q6_V_lo_W(p0)); + r1 = Q6_Vw_vadd_VwVw(Q6_V_hi_W(p1), Q6_V_lo_W(p1)); + + p0 = Q6_W_vshuff_VVR(r1, r0, 4); + r0 = Q6_Vw_vadd_VwVw(Q6_V_hi_W(p0), Q6_V_lo_W(p0)); + + static const uint8_t vrd[128] __attribute__((aligned(128))) = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, + 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, + 0x28, 0x28, 0x28, 0x28, 0x28, 
0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, + 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, + 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, + 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, + }; + + static const uint8_t vd[128] __attribute__((aligned(128))) = { + 0x00, 0x00, 0x00, 0x00, 0x10, 0x10, 0x10, 0x10, 0x20, 0x20, 0x20, 0x20, 0x30, 0x30, 0x30, 0x30, + 0x24, 0x24, 0x24, 0x24, 0x34, 0x34, 0x34, 0x34, 0x04, 0x04, 0x04, 0x04, 0x14, 0x14, 0x14, 0x14, + 0x08, 0x08, 0x08, 0x08, 0x18, 0x18, 0x18, 0x18, 0x28, 0x28, 0x28, 0x28, 0x38, 0x38, 0x38, 0x38, + 0x2C, 0x2C, 0x2C, 0x2C, 0x3C, 0x3C, 0x3C, 0x3C, 0x0C, 0x0C, 0x0C, 0x0C, 0x1C, 0x1C, 0x1C, 0x1C, + 0x30, 0x30, 0x30, 0x30, 0x20, 0x20, 0x20, 0x20, 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00, + 0x14, 0x14, 0x14, 0x14, 0x04, 0x04, 0x04, 0x04, 0x34, 0x34, 0x34, 0x34, 0x24, 0x24, 0x24, 0x24, + 0x38, 0x38, 0x38, 0x38, 0x28, 0x28, 0x28, 0x28, 0x18, 0x18, 0x18, 0x18, 0x08, 0x08, 0x08, 0x08, + 0x1C, 0x1C, 0x1C, 0x1C, 0x0C, 0x0C, 0x0C, 0x0C, 0x3C, 0x3C, 0x3C, 0x3C, 0x2C, 0x2C, 0x2C, 0x2C, + }; + + r0 = Q6_V_vrdelta_VV(r0, *(HVX_Vector *) vrd); + r0 = Q6_V_vdelta_VV(r0, *(HVX_Vector *) vd); + + return r0; +} + +static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % 32 == 0); // min sub-block size + assert((unsigned long) vx % 128 == 0); + assert((unsigned long) vy % 128 == 0); + + const uint32_t qk = QK_Q4_0x4x2 * 4; + + const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t x_qblk_size = qk / 2; // int4 + const uint32_t x_qrow_size = n / 2; // int4 (not padded) + + const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_qblk_size = qk; // int8 + const uint32_t y_qrow_size = n; // int8 (not padded) + + const uint8_t * restrict r0_x_q = ((const uint8_t *) vx + 0); // quants first + const uint8_t * restrict r0_x_d = ((const uint8_t *) vx + x_qrow_size); // then scales + + const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first + const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales + + // Row sum (qf32) + HVX_Vector r0_sum = Q6_V_vsplat_R(0); + + // Multiply and accumulate into int32. + // Compute combined scale (fp32). + // Apply scale to acc and accumulate into the row sum (qf32). 
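+    //
+    // Block math (sketch): each outer block spans 4 * QK_Q4_0x4x2 elements -- the loaders
+    // above consume 8 HVX vectors of int8 (1024 values) per block plus 32 fp16 sub-block
+    // scales.  Q6_Vw_vrmpy_VbVb folds 4 int8*int8 products into each 32-bit lane, and
+    // hvx_vec_rmpy_x8 reduces those partials to one int32 per 32-element sub-block so the
+    // 32 combined fp32 scales line up lane-for-lane.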
+ + const uint32_t nb = n / qk; // num full blocks + const uint32_t nloe = n % qk; // num leftover elemements + + uint32_t i = 0; + for (; i < nb; i++) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + + r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + } + + // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks + if (nloe) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + + // Zero out unused scales + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); + r0_dd = Q6_V_vand_QV(bmask, r0_dd); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + + r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + } + + // Reduce and convert into fp32 + r0_sum = hvx_vec_qf32_reduce_sum(r0_sum); + + hvx_vec_store_u(&s[0], 4, Q6_Vsf_equals_Vqf32(r0_sum)); +} + +static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, + float * restrict s, + const void * restrict vx, + uint32_t vx_row_size, + const void * restrict vy) { + assert(n % 32 == 0); // min sub-block size + assert((unsigned long) vx % 128 == 0); + assert((unsigned long) vy % 128 == 0); + + const uint32_t qk = QK_Q4_0x4x2 * 4; + + const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t x_qblk_size = qk / 2; // int4 + const uint32_t x_qrow_size = n / 2; // int4 (not padded) + + const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_qblk_size = qk; // int8 + const uint32_t y_qrow_size = n; // int8 (not padded) + + const uint8_t * restrict r0_x_q = ((const uint8_t *) (vx + (0 * vx_row_size)) + 0); // quants first + const uint8_t * restrict r0_x_d = ((const uint8_t *) (vx + (0 * vx_row_size)) + x_qrow_size); // then scales + + const uint8_t * restrict r1_x_q = ((const uint8_t *) (vx + (1 * vx_row_size)) + 0); // quants first + const uint8_t * restrict r1_x_d = ((const uint8_t *) (vx + (1 * vx_row_size)) + x_qrow_size); // then scales + + const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first + const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales + + // Row sum (qf32) + HVX_Vector r0_sum = Q6_V_vsplat_R(0); + HVX_Vector r1_sum = Q6_V_vsplat_R(0); + + // Multiply and accumulate into int32. + // Compute combined scale (fp32). + // Apply scale to acc and accumulate into the row sum (qf32). 
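+    //
+    // The _rx2 variant runs two src0 rows against the same src1 column: vy_q and the
+    // combined vy_d scales are loaded and converted once per block and reused for both
+    // rows, which roughly halves the src1 loads compared to calling the single-row kernel
+    // twice.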
+ + const uint32_t nb = n / qk; // num full blocks + const uint32_t nloe = n % qk; // num leftover elemements + + uint32_t i = 0; + for (; i < nb; i++) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r1_q, vy_q)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + + r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); + } + + // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks + if (nloe) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r1_q, vy_q)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + + // Zero out unused scales + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); + r0_dd = Q6_V_vand_QV(bmask, r0_dd); + r1_dd = Q6_V_vand_QV(bmask, r1_dd); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + + r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); + } + + // Reduce and convert into fp32 + r0_sum = hvx_vec_qf32_reduce_sum(r0_sum); + r1_sum = hvx_vec_qf32_reduce_sum(r1_sum); + + hvx_vec_store_u(&s[0], 4, Q6_Vsf_equals_Vqf32(r0_sum)); + hvx_vec_store_u(&s[1], 4, Q6_Vsf_equals_Vqf32(r1_sum)); +} + +static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % 32 == 0); // min sub-block size + assert((unsigned long) vx % 128 == 0); + assert((unsigned long) vy % 128 == 0); + + const uint32_t qk = QK_Q4_0x4x2 * 4; + + const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t x_qblk_size = qk; // int8 + const uint32_t x_qrow_size = n; // int8 (not padded) + + const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_qblk_size = qk; // int8 + const uint32_t y_qrow_size = n; // int8 (not padded) + + const uint8_t * restrict r0_x_q = ((const uint8_t *) vx + 0); // quants first + const uint8_t * restrict r0_x_d = ((const uint8_t *) vx + x_qrow_size); // then scales + + const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first + const uint8_t * 
restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales + + // Row sum (qf32) + HVX_Vector r0_sum = Q6_V_vsplat_R(0); + + // Multiply and accumulate into int32. + // Compute combined scale (fp32). + // Apply scale to acc and accumulate into the row sum (qf32). + + const uint32_t nb = n / qk; // num full blocks + int32_t nloe = n % qk; // num leftover elemements (must be signed) + + uint32_t i = 0; + for (; i < nb; i++) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + + r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + } + + // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks + if (nloe) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + + // Zero out unused scales + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); + r0_dd = Q6_V_vand_QV(bmask, r0_dd); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + + r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + } + + // Reduce and convert into fp32 + r0_sum = hvx_vec_qf32_reduce_sum(r0_sum); + + hvx_vec_store_u(&s[0], 4, Q6_Vsf_equals_Vqf32(r0_sum)); +} + +static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, + float * restrict s, + const void * restrict vx, + uint32_t vx_row_size, + const void * restrict vy) { + assert(n % 32 == 0); // min sub-block size + assert((unsigned long) vx % 128 == 0); + assert((unsigned long) vy % 128 == 0); + + const uint32_t qk = QK_Q4_0x4x2 * 4; + + const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t x_qblk_size = qk; // int8 + const uint32_t x_qrow_size = n; // int8 (not padded) + + const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_qblk_size = qk; // int8 + const uint32_t y_qrow_size = n; // int8 (not padded) + + const uint8_t * restrict r0_x_q = ((const uint8_t *) (vx + (0 * vx_row_size)) + 0); // quants first + const uint8_t * restrict r0_x_d = ((const uint8_t *) (vx + (0 * vx_row_size)) + x_qrow_size); // then scales + + const uint8_t * restrict r1_x_q = ((const uint8_t *) (vx + (1 * vx_row_size)) + 0); // quants first + const uint8_t * restrict r1_x_d = ((const uint8_t *) (vx + (1 * vx_row_size)) + x_qrow_size); // then scales + + const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first + const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales + + // Row sum (qf32) + HVX_Vector r0_sum = Q6_V_vsplat_R(0); + HVX_Vector r1_sum = Q6_V_vsplat_R(0); + + // Multiply and accumulate into int32. + // Compute combined scale (fp32). + // Apply scale to acc and accumulate into the row sum (qf32). 
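+    //
+    // Leftover handling (same in all of these kernels): a partial trailing block is still
+    // loaded in full, and Q6_Q_vsetq_R(nloe / 8) builds a byte predicate over the first
+    // nloe/8 bytes -- i.e. the first nloe/32 fp32 combined scales -- so sub-blocks past the
+    // end of the row are multiplied by zero rather than by stale scales.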
+ + const uint32_t nb = n / qk; // num full blocks + int32_t nloe = n % qk; // num leftover elemements (must be signed) + + uint32_t i = 0; + for (; i < nb; i++) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r1_q, vy_q)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + + r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); + } + + // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks + if (nloe) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r1_q, vy_q)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + + // Zero out unused scales + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); + r0_dd = Q6_V_vand_QV(bmask, r0_dd); + r1_dd = Q6_V_vand_QV(bmask, r1_dd); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + + r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); + } + + // Reduce and convert into fp32 + r0_sum = hvx_vec_qf32_reduce_sum(r0_sum); + r1_sum = hvx_vec_qf32_reduce_sum(r1_sum); + + hvx_vec_store_u(&s[0], 4, Q6_Vsf_equals_Vqf32(r0_sum)); + hvx_vec_store_u(&s[1], 4, Q6_Vsf_equals_Vqf32(r1_sum)); +} + +static void vec_dot_mxfp4x4x2_q8x4x2(const int n, + float * restrict s, + const void * restrict vx, + const void * restrict vy) { + assert(n % 32 == 0); // min sub-block size + assert((unsigned long) vx % 128 == 0); + assert((unsigned long) vy % 128 == 0); + + const uint32_t qk = QK_MXFP4x4x2 * 4; + + const uint32_t x_dblk_size = 8 * 4 * 1; // 32x e8m0 + const uint32_t x_qblk_size = qk / 2; // fp4 + const uint32_t x_qrow_size = n / 2; // fp4 (not padded) + + const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_qblk_size = qk; // int8 + const uint32_t y_qrow_size = n; // int8 (not padded) + + const uint8_t * restrict r0_x_q = ((const uint8_t *) vx + 0); // quants first + const uint8_t * restrict r0_x_d = ((const uint8_t *) vx + x_qrow_size); // then scales + + const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants 
first + const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales + + // Row sum (qf32) + HVX_Vector r0_sum = Q6_V_vsplat_R(0); + + // Multiply and accumulate into int32. + // Compute combined scale (fp32). + // Apply scale to acc and accumulate into the row sum (qf32). + + const uint32_t nb = n / qk; // num full blocks + int32_t nloe = n % qk; // num leftover elemements (must be signed) + + uint32_t i = 0; + for (; i < nb; i++) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + + HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + + // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving + HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 + vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half)); + vy_d = Q6_Vsf_equals_Vqf32(vy_d); + + // Convert rX_d scales from e8m0 to fp32 + // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... + // Left shift with zero fill to create FP32 + // FIXME: might need to handle zero as a special case (see ggml-cpu code) + HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; + HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); + r0_d = Q6_V_vdelta_VV(r0_d, expand); + r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); + r0_d = Q6_Vw_vasl_VwR(r0_d, 23); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + + r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + } + + // Process leftovers + if (nloe) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + + HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + + // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving + HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 + vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half)); + vy_d = Q6_Vsf_equals_Vqf32(vy_d); + + // Convert rX_d scales from e8m0 to fp32 + // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... 
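+        // (an e8m0 byte is just a biased fp32 exponent, so parking it in bits 30..23 of an
+        // otherwise-zero word yields the scale 2^(e - 127) directly)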
+ // Left shift with zero fill to create FP32 + // FIXME: might need to handle zero as a special case (see ggml-cpu code) + HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; + HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); + r0_d = Q6_V_vdelta_VV(r0_d, expand); + r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); + r0_d = Q6_Vw_vasl_VwR(r0_d, 23); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); + + // Zero-out unused scales + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); + r0_dd = Q6_V_vand_QV(bmask, r0_dd); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + + r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + } + + // Reduce and convert into fp32 + r0_sum = hvx_vec_qf32_reduce_sum(r0_sum); + + hvx_vec_store_u(&s[0], 4, Q6_Vsf_equals_Vqf32(r0_sum)); +} + +static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n, + float * restrict s, + const void * restrict vx, + uint32_t vx_row_size, + const void * restrict vy) { + assert(n % 32 == 0); // min sub-block size + assert((unsigned long) vx % 128 == 0); + assert((unsigned long) vy % 128 == 0); + + const uint32_t qk = QK_MXFP4x4x2 * 4; + + const uint32_t x_dblk_size = 8 * 4 * 1; // 32x e8m0 + const uint32_t x_qblk_size = qk / 2; // fp4 + const uint32_t x_qrow_size = n / 2; // fp4 (not padded) + + const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_qblk_size = qk; // int8 + const uint32_t y_qrow_size = n; // int8 (not padded) + + const uint8_t * restrict r0_x_q = ((const uint8_t *) (vx + (0 * vx_row_size)) + 0); // quants first + const uint8_t * restrict r0_x_d = ((const uint8_t *) (vx + (0 * vx_row_size)) + x_qrow_size); // then scales + + const uint8_t * restrict r1_x_q = ((const uint8_t *) (vx + (1 * vx_row_size)) + 0); // quants first + const uint8_t * restrict r1_x_d = ((const uint8_t *) (vx + (1 * vx_row_size)) + x_qrow_size); // then scales + + const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first + const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales + + // Row sum (qf32) + HVX_Vector r0_sum = Q6_V_vsplat_R(0); + HVX_Vector r1_sum = Q6_V_vsplat_R(0); + + // Multiply and accumulate into int32. + // Compute combined scale (fp32). + // Apply scale to acc and accumulate into the row sum (qf32). + + const uint32_t nb = n / qk; // num full blocks + int32_t nloe = n % qk; // num leftover elemements (must be signed) + + uint32_t i = 0; + for (; i < nb; i++) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r1_q, vy_q)); + + HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); + + // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving + HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 + vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half)); + vy_d = Q6_Vsf_equals_Vqf32(vy_d); + + // Convert rX_d scales from e8m0 to fp32 + // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... 
+ // Left shift with zero fill to create FP32 + // FIXME: might need to handle zero as a special case (see ggml-cpu code) + HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; + HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); + r0_d = Q6_V_vdelta_VV(r0_d, expand); + r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); + r0_d = Q6_Vw_vasl_VwR(r0_d, 23); + r1_d = Q6_V_vdelta_VV(r1_d, expand); + r1_d = Q6_V_vand_VV(r1_d, e8m0_mask); + r1_d = Q6_Vw_vasl_VwR(r1_d, 23); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d)); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + + r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); + } + + // Process leftovers + if (nloe) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r1_q, vy_q)); + + HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); + + // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving + HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 + vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half)); + vy_d = Q6_Vsf_equals_Vqf32(vy_d); + + // Convert rX_d scales from e8m0 to fp32 + // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... 
+ // Left shift with zero fill to create FP32 + // FIXME: might need to handle zero as a special case (see ggml-cpu code) + HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; + HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); + r0_d = Q6_V_vdelta_VV(r0_d, expand); + r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); + r0_d = Q6_Vw_vasl_VwR(r0_d, 23); + r1_d = Q6_V_vdelta_VV(r1_d, expand); + r1_d = Q6_V_vand_VV(r1_d, e8m0_mask); + r1_d = Q6_Vw_vasl_VwR(r1_d, 23); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d)); + + // Zero-out unused scales + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); + r0_dd = Q6_V_vand_QV(bmask, r0_dd); + r1_dd = Q6_V_vand_QV(bmask, r1_dd); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + + r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); + } + + // Reduce and convert into fp32 + r0_sum = hvx_vec_qf32_reduce_sum(r0_sum); + r1_sum = hvx_vec_qf32_reduce_sum(r1_sum); + + hvx_vec_store_u(&s[0], 4, Q6_Vsf_equals_Vqf32(r0_sum)); + hvx_vec_store_u(&s[1], 4, Q6_Vsf_equals_Vqf32(r1_sum)); +} + +#if 1 +static void vec_dot_f16_f32(const int n, float * restrict s, const void * restrict x, const void * restrict y) { + if (0) { + float rsum = 0; + const __fp16 * restrict vx = (const __fp16 * restrict) x; + const float * restrict vy = (const float * restrict) y; + + for (uint32_t i = 0; i < n; i++) { + rsum += vx[i] * (__fp16) vy[i]; + } + *s = rsum; + return; + } + + const HVX_UVector * restrict vx = (const HVX_UVector * restrict) x; + const HVX_UVectorPair * restrict vy = (const HVX_UVectorPair * restrict) y; + + uint32_t nv0 = n / 64; // num full fp16 hvx vectors + uint32_t nv1 = n % 64; // leftover elements + + // for some reason we need volatile here so that the compiler doesn't try anything funky + volatile HVX_Vector rsum = Q6_V_vsplat_R(0); + + uint32_t i = 0; + + for (i = 0; i < nv0; i++) { + HVX_VectorPair yp = vy[i]; + + HVX_Vector x = vx[i]; + HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0 + + HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp)); + HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp)); + + HVX_Vector sum = Q6_Vqf32_vadd_Vqf32Vqf32(hi, lo); + rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum); + } + + if (nv1) { + HVX_VectorPair yp = vy[i]; + + HVX_Vector x = vx[i]; + HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0 + + if (nv1 >= 32) { + HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp)); + rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi); + nv1 -= 32; + } + + rsum = hvx_vec_qf32_reduce_sum(rsum); + + if (nv1) { + HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp)); + HVX_Vector sum = hvx_vec_qf32_reduce_sum_n(lo, nv1); + rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum); + } + + // hvx_vec_dump_fp16("X", x); + // hvx_vec_dump_fp16("Y", y); + // hvx_vec_dump_fp32("SUM", Q6_Vsf_equals_Vqf32(sum)); + // hvx_vec_dump_fp32("RSUM", Q6_Vsf_equals_Vqf32(rsum)); + } else { + rsum = hvx_vec_qf32_reduce_sum(rsum); + } + + *s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum)); + +# ifdef HTP_DEBUG + { + float rsum = 0; + const __fp16 * restrict vx = (const __fp16 * restrict) x; + const float * restrict vy = (const float * restrict) 
y; + + for (uint32_t i = 0; i < n; i++) { + rsum += vx[i] * vy[i]; + } + + float diff = fabs(*s - rsum); + if (diff > 0.001) { + FARF(HIGH, "vec-dot-f16-missmatch: %u (%u:%u) expected %.6f got %.6f\n", n, nv0, nv1, rsum, *s); + // htp_dump_f16("x", vx, n); + // htp_dump_f32("y", vy, n); + } + } +# endif +} +#else +static void vec_dot_f16_f32(const int n, float * restrict s, const void * restrict x, const void * restrict y) { + const uint32_t fk = 64; + const uint32_t nb = n / fk; + + assert(n % fk == 0); + assert(nb % 4 == 0); + + const uint32_t x_blk_size = 2 * fk; // fp16 + const uint32_t y_blk_size = 4 * fk; // fp32 + + // Row sum (qf32) + HVX_Vector rsum0 = Q6_V_vsplat_R(0); + HVX_Vector rsum1 = Q6_V_vsplat_R(0); + HVX_Vector rsum2 = Q6_V_vsplat_R(0); + HVX_Vector rsum3 = Q6_V_vsplat_R(0); + + for (uint32_t i = 0; i < nb; i += 4) { + HVX_Vector_x4 vx = hvx_vec_load_x4_f16(x + (i * x_blk_size)); + HVX_Vector_x4 vy = hvx_vec_load_x4_f32_as_f16(y + (i * y_blk_size)); + + HVX_VectorPair fa0 = Q6_Wqf32_vmpy_VhfVhf(vx.v[0], vy.v[0]); + HVX_VectorPair fa1 = Q6_Wqf32_vmpy_VhfVhf(vx.v[1], vy.v[1]); + HVX_VectorPair fa2 = Q6_Wqf32_vmpy_VhfVhf(vx.v[2], vy.v[2]); + HVX_VectorPair fa3 = Q6_Wqf32_vmpy_VhfVhf(vx.v[3], vy.v[3]); + + rsum0 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum0, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(fa0), Q6_V_hi_W(fa0))); + rsum1 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum1, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(fa1), Q6_V_hi_W(fa1))); + rsum2 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum2, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(fa2), Q6_V_hi_W(fa2))); + rsum3 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum3, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(fa3), Q6_V_hi_W(fa3))); + } + + // Reduce and convert into fp32 + rsum0 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum0, rsum1); + rsum2 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum2, rsum3); + HVX_Vector rsum = hvx_vec_qf32_reduce_sum(Q6_Vqf32_vadd_Vqf32Vqf32(rsum0, rsum2)); + hvx_vec_store_u(s, 4, Q6_Vsf_equals_Vqf32(rsum)); +} +#endif + +#define htp_matmul_preamble \ + const uint32_t ne00 = src0->ne[0]; \ + const uint32_t ne01 = src0->ne[1]; \ + const uint32_t ne02 = src0->ne[2]; \ + const uint32_t ne03 = src0->ne[3]; \ + \ + const uint32_t ne10 = src1->ne[0]; \ + const uint32_t ne11 = src1->ne[1]; \ + const uint32_t ne12 = src1->ne[2]; \ + const uint32_t ne13 = src1->ne[3]; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb00 = src0->nb[0]; \ + const uint32_t nb01 = src0->nb[1]; \ + const uint32_t nb02 = src0->nb[2]; \ + const uint32_t nb03 = src0->nb[3]; \ + \ + const uint32_t nb10 = src1->nb[0]; \ + const uint32_t nb11 = src1->nb[1]; \ + const uint32_t nb12 = src1->nb[2]; \ + const uint32_t nb13 = src1->nb[3]; \ + \ + const uint32_t nb0 = dst->nb[0]; \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ + const uint32_t nb3 = dst->nb[3]; + +// q8x4 src1 tensor is already in VTCM spad +static void matmul(struct htp_matmul_type * mt, + struct htp_tensor * restrict src0, + struct htp_tensor * restrict src1, + struct htp_tensor * restrict dst, + struct htp_spad * restrict src0_spad, + struct htp_spad * restrict src1_spad, + struct htp_spad * restrict dst_spad, + uint32_t nth, + uint32_t ith, + uint32_t src0_nrows_per_thread, + dma_queue * dma_queue) { + htp_matmul_preamble; + + const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + const uint32_t src1_nrows = ne11 * ne12 * ne13; // src1 rows + + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t 
src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U); + + // no work for this thread + if (src0_start_row >= src0_end_row) { + return; + } + + const size_t dst_row_size = nb1; + const size_t src0_row_size = nb01; + const size_t src1_row_size = q8x4x2_row_size(ne10); + + const size_t src0_row_size_padded = htp_round_up(src0_row_size, 128); + + // Per-thread VTCM scratchpads for all tensors + // Note that the entire src1 tensor is already in VTCM + // For other tensors we allocate N rows per thread, padded to HVX vector size + uint8_t * restrict spad_dst = dst_spad->data + dst_spad->size_per_thread * ith; + uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith; + uint8_t * restrict src1_data = src1_spad->data; + + volatile uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + const uint8_t * restrict src0_row = (const uint8_t *) src0->data; + +// Prefill spad with src0 rows +#pragma unroll(4) + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { + const int is0 = (ir0 - src0_start_row); + if (is0 >= HTP_SPAD_SRC0_NROWS) { + break; + } + dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + src0_row_size_padded, src0_row_size, 2); + } + + // Process src0 rows + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { + const uint8_t * ss0 = dma_queue_pop(dma_queue); + +#pragma unroll(2) + for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) { + const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_row_size); + float * restrict dst_row = (float *) (dst->data + (ir1 * dst_row_size)); + mt->vec_dot_rx2(ne00, &dst_row[ir0], ss0, src0_row_size_padded, src1_col); + } + + // Prefetch next (n + spad_nrows) row + const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS); + const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS; + if (pr0 < src0_end_row_x2) { + dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, + src0_row_size_padded, src0_row_size, 2); + } + } + + // Process the last row (if any) + if (src0_end_row != src0_end_row_x2) { + uint32_t ir0 = src0_end_row_x2; + const int is0 = (ir0 - src0_start_row); + dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + src0_row_size_padded, src0_row_size, 1); + const uint8_t * ss0 = dma_queue_pop(dma_queue); + +#pragma unroll(2) + for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) { + const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_row_size); + float * restrict dst_row = (float *) (dst->data + (ir1 * dst_row_size)); + mt->vec_dot(ne00, &dst_row[ir0], ss0, src1_col); + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], + src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// q8x4x2 src1 tensor is already in VTCM spad +static void matvec(struct htp_matmul_type * mt, + struct htp_tensor * restrict src0, + struct htp_tensor * restrict src1, + struct htp_tensor * restrict dst, + struct htp_spad * restrict src0_spad, + struct htp_spad * restrict src1_spad, + struct htp_spad * restrict dst_spad, + uint32_t nth, + uint32_t 
ith, + uint32_t src0_nrows_per_thread, + dma_queue * dma_queue) { + htp_matmul_preamble; + + const uint32_t src0_nrows = ne01; + + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U); + + // no work for this thread + if (src0_start_row >= src0_end_row) { + return; + } + + const size_t dst_row_size = nb1; + const size_t src0_row_size = nb01; + const size_t src1_row_size = q8x4x2_row_size(ne10); + + const size_t src0_row_size_padded = htp_round_up(src0_row_size, 128); + + // Per-thread VTCM scratchpads for all tensors + // Note that the entire src1 tensor is already in VTCM + // For other tensors we allocate N rows per thread, padded to HVX vector size + uint8_t * spad_dst = dst_spad->data + dst_spad->size_per_thread * ith; + uint8_t * spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith; + uint8_t * src1_data = src1_spad->data; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + float * tmp = (float *) spad_dst; + + const uint8_t * restrict src0_row = (const uint8_t *) src0->data; + const uint8_t * restrict src1_col = (const uint8_t *) src1_data; + float * restrict dst_col = (float *) dst->data; + +// Prefill spad with 2x src0 rows +#pragma unroll(2) + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { + const uint32_t is0 = (ir0 - src0_start_row); + if (is0 >= HTP_SPAD_SRC0_NROWS) { + break; + } + dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + src0_row_size_padded, src0_row_size, 2); + } + + // Process src0 rows + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { + const uint8_t * ss0 = dma_queue_pop(dma_queue); + mt->vec_dot_rx2(ne00, &tmp[ir0 - src0_start_row], ss0, src0_row_size_padded, src1_col); + + // Prefetch next (n + spad_nrows) row + const uint32_t pr0 = (ir0 + HTP_SPAD_SRC0_NROWS); + const uint32_t is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS; + if (pr0 < src0_end_row_x2) { + dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, + src0_row_size_padded, src0_row_size, 2); + } + } + + // Process the last row (if any) + if (src0_end_row != src0_end_row_x2) { + const uint32_t ir0 = src0_end_row_x2; + const uint32_t is0 = (ir0 - src0_start_row); + dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + src0_row_size_padded, src0_row_size, 1); + const uint8_t * ss0 = dma_queue_pop(dma_queue); + mt->vec_dot(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col); + } + + hvx_copy_fp32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row); + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], + src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ids->ne[0] * ids->ne[1] + (i1)] + +struct mmid_row_mapping { + uint32_t i1; + uint32_t i2; +}; + +// q8x4 src1 tensor is already in VTCM spad +static void matmul_id(struct htp_matmul_type * mt, + struct htp_tensor * restrict src0, + struct htp_tensor * restrict src1, + struct htp_tensor * 
restrict ids, + struct htp_tensor * restrict dst, + struct htp_spad * restrict src0_spad, + struct htp_spad * restrict src1_spad, + struct htp_spad * restrict src2_spad, + struct htp_spad * restrict dst_spad, + uint32_t nth, + uint32_t ith, + uint32_t src0_nrows_per_thread, + dma_queue * dma_queue) { + htp_matmul_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + const uint32_t src0_nrows = ne01; // src0 rows per expert + const uint32_t src1_nrows = ne11; + + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U); + + // no work for this thread + if (src0_start_row >= src0_end_row) { + return; + } + + const uint32_t n_ids = ids->ne[0]; // n_expert_used + const uint32_t n_as = ne02; // n_expert + + const size_t matrix_row_counts_size = n_as * sizeof(uint32_t); + const size_t matrix_row_map_size = n_as * ids->ne[0] * ids->ne[1] * sizeof(struct mmid_row_mapping); + + const uint32_t * matrix_row_counts = (const uint32_t *) src2_spad->data + 0; + const struct mmid_row_mapping * matrix_rows = (const void *) src2_spad->data + matrix_row_counts_size; + + const size_t dst_row_size = nb1; + const size_t src0_row_size = nb01; + const size_t src1_row_size = q8x4x2_row_size(ne10); + + const size_t src0_row_size_padded = htp_round_up(src0_row_size, 128); + + // Per-thread VTCM scratchpads for all tensors + // Note that the entire src1 tensor is already in VTCM + // For other tensors we allocate N rows per thread, padded to HVX vector size + uint8_t * restrict spad_dst = dst_spad->data + dst_spad->size_per_thread * ith; + uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith; + uint8_t * restrict src1_data = src1_spad->data; + + for (uint32_t cur_a = 0; cur_a < n_as; ++cur_a) { + const int32_t cne1 = matrix_row_counts[cur_a]; + + if (cne1 == 0) { + continue; + } + + const uint8_t * src0_row = (const uint8_t *) src0->data + (0 + cur_a * nb02 + 0); + +// Prefill spad with src0 rows +#pragma unroll(4) + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { + const int is0 = (ir0 - src0_start_row); + if (is0 >= HTP_SPAD_SRC0_NROWS) { + break; + } + dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + src0_row_size_padded, src0_row_size, 2); + } + + // Process src0 rows + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { + const uint8_t * ss0 = dma_queue_pop(dma_queue); + + for (uint32_t cid = 0; cid < cne1; ++cid) { + struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid); + const int rm1 = row_mapping.i1; // expert idx + const int rm2 = row_mapping.i2; // token idx + + const uint32_t ir1 = src1_nrows == 1 ? 
0 : rm1; // src1 row idx + const uint8_t * restrict src1_col = + (const uint8_t *) (src1_data + (ir1 + rm2 * ne11 + 0) * src1_row_size); + float * dst_row = (float *) (dst->data + (rm1 * nb1 + rm2 * nb2 + 0)); + + mt->vec_dot_rx2(ne00, &dst_row[ir0], ss0, src0_row_size_padded, src1_col); + } + + // Prefetch next (n + spad_nrows) row + const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS); + const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS; + if (pr0 < src0_end_row_x2) { + dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, + src0_row_size_padded, src0_row_size, 2); + } + } + + // Process the last row (if any) + if (src0_end_row != src0_end_row_x2) { + uint32_t ir0 = src0_end_row_x2; + const uint32_t is0 = (ir0 - src0_start_row); + dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + src0_row_size_padded, src0_row_size, 1); + const uint8_t * ss0 = dma_queue_pop(dma_queue); + + for (uint32_t cid = 0; cid < cne1; ++cid) { + struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid); + const int rm1 = row_mapping.i1; // expert idx + const int rm2 = row_mapping.i2; // token idx + + const uint32_t ir1 = src1_nrows == 1 ? 0 : rm1; // src1 row idx + const uint8_t * restrict src1_col = + (const uint8_t *) (src1_data + (ir1 + rm2 * ne11 + 0) * src1_row_size); + float * dst_row = (float *) (dst->data + (rm1 * nb1 + rm2 * nb2 + 0)); + + mt->vec_dot(ne00, &dst_row[ir0], ss0, src1_col); + } + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type, + ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], + src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], dst->ne[0], dst->ne[1], + dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// q8x4 src1 tensor is already in VTCM spad +static void matvec_id(struct htp_matmul_type * mt, + struct htp_tensor * restrict src0, + struct htp_tensor * restrict src1, + struct htp_tensor * restrict src2, + struct htp_tensor * restrict dst, + struct htp_spad * restrict src0_spad, + struct htp_spad * restrict src1_spad, + struct htp_spad * restrict src2_spad, + struct htp_spad * restrict dst_spad, + uint32_t nth, + uint32_t ith, + uint32_t src0_nrows_per_thread, + dma_queue * dma_queue) { + htp_matmul_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + const uint32_t src0_nrows = ne01; // src0 rows per expert + + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U); + + // no work for this thread + if (src0_start_row >= src0_end_row) { + return; + } + + assert(ne13 % ne03 == 0); + + const size_t dst_row_size = nb1; + const size_t src0_row_size = nb01; + const size_t src1_row_size = q8x4x2_row_size(ne10); + + const size_t src0_row_size_padded = htp_round_up(src0_row_size, 128); + + const uint32_t n_aids = src2->ne[0]; // num activated experts + const uint32_t n_ids = ne02; // num experts + + // Per-thread VTCM scratchpads for all tensors + // Note that the entire src1 tensor is already in VTCM + // For other tensors we allocate N rows per thread, padded to HVX vector size + uint8_t * restrict spad_dst = dst_spad->data + dst_spad->size_per_thread * ith; 
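+    // For each activated expert, the loops below double-buffer src0 rows through the
+    // per-thread VTCM slice:
+    //   1) prefill: queue DMA reads for up to HTP_SPAD_SRC0_NROWS rows of the expert's src0 matrix (in pairs)
+    //   2) steady state: pop a completed pair, run vec_dot_rx2 against the VTCM-resident q8x4x2 src1
+    //      vector, then queue the pair HTP_SPAD_SRC0_NROWS rows ahead
+    //   3) a trailing odd row (if any) is handled with the single-row vec_dot
+    // Results are written directly into the dst row for that expert (dst->data + ie1 * nb1).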
+ uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith; + uint8_t * restrict src1_data = src1_spad->data; + + for (uint32_t ie1 = 0; ie1 < n_aids; ++ie1) { // for each expert + const uint32_t eid = *(const int32_t *) ((const uint8_t *) src2->data + ie1 * src2->nb[0]); + assert(eid < n_ids); + + const uint8_t * restrict src0_row = (const uint8_t *) src0->data + eid * nb02; + const uint8_t * restrict src1_col = (const uint8_t *) src1_data; + float * restrict dst_row = (float *) (dst->data + ie1 * nb1); + +// Prefill spad with src0 rows +#pragma unroll(4) + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { + const int is0 = (ir0 - src0_start_row); + if (is0 >= HTP_SPAD_SRC0_NROWS) { + break; + } + dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + src0_row_size_padded, src0_row_size, 2); + } + + // Process src0 rows + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { + const uint8_t * ss0 = dma_queue_pop(dma_queue); + mt->vec_dot_rx2(ne00, &dst_row[ir0], ss0, src0_row_size_padded, src1_col); + + // Prefetch next (n + spad_nrows) row + const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS); + const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS; + if (pr0 < src0_end_row_x2) { + dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, + src0_row_size_padded, src0_row_size, 2); + } + } + + // Process the last row (if any) + if (src0_end_row != src0_end_row_x2) { + uint32_t ir0 = src0_end_row_x2; + const uint32_t is0 = (ir0 - src0_start_row); + dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + src0_row_size_padded, src0_row_size, 1); + const uint8_t * ss0 = dma_queue_pop(dma_queue); + mt->vec_dot(ne00, &dst_row[ir0], ss0, src1_col); + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type, + ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], + src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], + dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// *** matmul in fp16 + +static void matmul_f16_f32(struct htp_tensor * restrict src0, + struct htp_tensor * restrict src1, + struct htp_tensor * restrict dst, + struct htp_spad * restrict src0_spad, + struct htp_spad * restrict src1_spad, + struct htp_spad * restrict dst_spad, + uint32_t nth, + uint32_t ith, + uint32_t src0_nrows_per_thread, + dma_queue * dma_queue) { + htp_matmul_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + const size_t src0_row_size = sizeof(__fp16) * ne00; + const size_t src1_row_size = sizeof(float) * ne10; + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) + const uint32_t nr0 = ne0; + + // This is the size of the rest of the dimensions of the result + const uint32_t nr1 = ne1 * ne2 * ne3; + + uint32_t chunk_size = 64; + + // distribute the thread work across the inner or outer loop based on which one is larger + uint32_t nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + uint32_t nchunk1 = nr0 > nr1 ? 
1 : nth; // parallelize by src1 rows + + // The number of elements in each chunk + const uint32_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; + const uint32_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; + + uint32_t current_chunk = ith; + + const uint32_t ith0 = current_chunk % nchunk0; + const uint32_t ith1 = current_chunk / nchunk0; + + const uint32_t ir0_start = dr0 * ith0; + const uint32_t ir0_end = MIN(ir0_start + dr0, nr0); + + const uint32_t ir1_start = dr1 * ith1; + const uint32_t ir1_end = MIN(ir1_start + dr1, nr1); + + // broadcast factors + const uint32_t r2 = ne12 / ne02; + const uint32_t r3 = ne13 / ne03; + + // no work for this thread + if (ir0_start >= ir0_end || ir1_start >= ir1_end) { + return; + } + + // block-tiling attempt + const uint32_t blck_0 = 64; + const uint32_t blck_1 = 64; + + float tmp[32]; + + for (uint32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { + for (uint32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { + for (uint32_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1++) { + const uint32_t i13 = (ir1 / (ne12 * ne1)); + const uint32_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; + const uint32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); + + // broadcast src0 into src1 + const uint32_t i03 = i13 / r3; + const uint32_t i02 = i12 / r2; + + const uint32_t i1 = i11; + const uint32_t i2 = i12; + const uint32_t i3 = i13; + + const uint8_t * restrict src0_row = (const uint8_t *) src0->data + (0 + i02 * nb02 + i03 * nb03); + const uint8_t * restrict src1_col = + (const uint8_t *) src1->data + (i11 + i12 * ne11 + i13 * ne12 * ne11) * src1_row_size; + float * dst_col = (float *) ((uint8_t * restrict) dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); + + for (uint32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0++) { + vec_dot_f16_f32(ne00, &tmp[ir0 - iir0], src0_row + ir0 * src0_row_size, src1_col); + } + + hvx_copy_fp32_ua((uint8_t *) &dst_col[iir0], (uint8_t *) tmp, MIN(iir0 + blck_0, ir0_end) - iir0); + } + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end, src1->ne[0], + src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// *** dynamic quant + +static inline void quantize_block_fp32_q8x4(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { + assert((unsigned long) x % 128 == 0); + assert((unsigned long) y_q % 128 == 0); + + HVX_Vector * vx = (HVX_Vector *) x; + + // Load and convert into QF32 + HVX_Vector zero = Q6_V_vsplat_R(0); + HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements + HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements + HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements + HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); // 32 elements + + // Convert into fp16 + HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf))); + HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); + + // Compute max and scale + HVX_Vector vmax_hf = hvx_vec_reduce_max_fp16(hvx_vec_abs_fp16(vx01_hf)); + vmax_hf = hvx_vec_reduce_max2_fp16(hvx_vec_abs_fp16(vx23_hf), vmax_hf); + + // Replicate first fp16 scale across all lanes + HVX_Vector ctrl = *(const HVX_Vector *) repl_1x_fp16; + vmax_hf = Q6_V_vdelta_VV(vmax_hf, ctrl); + + 
HVX_Vector vd_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 + HVX_Vector vd_hf = Q6_Vhf_equals_Vqf16(vd_qf16); + + *(HVX_UVector *) y_d = vd_hf; + + // Divide input by the scale + HVX_Vector vd_inv_hf = hvx_vec_inverse_fp16(vd_hf); + vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd_inv_hf)); + vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd_inv_hf)); + + // Convert to int8 + HVX_Vector vx01_i16 = hvx_vec_i16_from_hf_rnd_sat(vx01_hf); + HVX_Vector vx23_i16 = hvx_vec_i16_from_hf_rnd_sat(vx23_hf); + HVX_Vector vx_i8 = Q6_Vb_vpack_VhVh_sat(vx23_i16, vx01_i16); + + *(HVX_Vector *) y_q = vx_i8; +} + +// Overrides input x +static void quantize_row_fp32_q8x4x2(float * restrict x, uint8_t * restrict y, uint32_t k) { + assert(k % 32 == 0); + const uint32_t qk = QK_Q8_0x4x2; + const uint32_t nb = (k + qk - 1) / qk; + + const uint32_t qrow_size = k; // int8 + + const uint32_t dblk_size = 8 * 2; // 8x __fp16 + const uint32_t qblk_size = QK_Q8_0x4x2; // int8 + + uint8_t * restrict y_q = (y + 0); // quants first + uint8_t * restrict y_d = (y + qrow_size); // then scales + + // Temp scales override input since we're working off of the aligned temp buffer in VTCM + uint8_t * restrict t_d = (uint8_t *) x; + + for (uint32_t i = 0; i < nb; i++) { + quantize_block_fp32_q8x4(x + (i * 2 + 0) * qk / 2, y_q + (i * 2 + 0) * qblk_size / 2, + t_d + (i * 2 + 0) * dblk_size / 2); + quantize_block_fp32_q8x4(x + (i * 2 + 1) * qk / 2, y_q + (i * 2 + 1) * qblk_size / 2, + t_d + (i * 2 + 1) * dblk_size / 2); + } + + // now copy the scales into final location + hvx_copy_fp16_ua(y_d, t_d, nb * 8); +} + +static void quantize_fp32_q8x4x2(const struct htp_tensor * src, + uint8_t * restrict dst, + struct htp_spad * spad, + uint32_t nth, + uint32_t ith, + uint32_t nrows_per_thread) { + uint64_t t1 = HAP_perf_get_qtimer_count(); + + const uint32_t ne0 = src->ne[0]; + const uint32_t ne1 = src->ne[1]; + const uint32_t ne2 = src->ne[2]; + const uint32_t ne3 = src->ne[3]; + + const uint32_t nrows = ne1 * ne2 * ne3; // total n_rows + + const uint32_t ir_first = nrows_per_thread * ith; // first row + const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); // last row + + const size_t src_row_size = src->nb[1]; + const size_t dst_row_size = q8x4x2_row_size(ne0); + + uint8_t * restrict src_data = (uint8_t *) src->data + (src_row_size * ir_first); + uint8_t * restrict dst_data = (uint8_t *) dst + (dst_row_size * ir_first); + uint8_t * restrict tmp_data = (uint8_t *) spad->data + (spad->size_per_thread * ith); + + const size_t src_row_size_padded = htp_round_up(src_row_size, QK_Q8_0x4x2 * sizeof(float)); + memset(tmp_data, 0, src_row_size_padded); // zero-out temp row data for padding + + for (uint32_t i = ir_first; i < ir_last; ++i) { + htp_l2fetch(src_data, 2, src_row_size, src_row_size); + hvx_copy_fp32_aa(tmp_data, src_data, ne0); + + // FARF(HIGH, "quantize-q8x4-row: %u\n", i); + quantize_row_fp32_q8x4x2((float *) tmp_data, dst_data, ne0); + dst_data += dst_row_size; + src_data += src_row_size; + } + + uint64_t t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first, + ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +static void htp_quantize_fp32_q8x4x2(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = data; + quantize_fp32_q8x4x2(&octx->src1, octx->src1_spad.data, &octx->src0_spad, n, i, 
octx->src1_nrows_per_thread); +} + +// ** matmul callbacks for worker_pool + +static void htp_matvec_q4x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = data; + + struct htp_matmul_type mt; + mt.type = "q4x4x2-q8x4x2"; + mt.vec_dot = vec_dot_q4x4x2_q8x4x2; + mt.vec_dot_rx2 = vec_dot_q4x4x2_q8x4x2_rx2; + + matvec(&mt, &octx->src0, &octx->src1, &octx->dst, &octx->src0_spad, &octx->src1_spad, &octx->dst_spad, n, i, + octx->src0_nrows_per_thread, octx->ctx->dma[i]); +} + +static void htp_matmul_q4x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = data; + + struct htp_matmul_type mt; + mt.type = "q4x4x2-q8x4x2"; + mt.vec_dot = vec_dot_q4x4x2_q8x4x2; + mt.vec_dot_rx2 = vec_dot_q4x4x2_q8x4x2_rx2; + + matmul(&mt, &octx->src0, &octx->src1, &octx->dst, &octx->src0_spad, &octx->src1_spad, &octx->dst_spad, n, i, + octx->src0_nrows_per_thread, octx->ctx->dma[i]); +} + +static void htp_matvec_q8x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = data; + + struct htp_matmul_type mt; + mt.type = "q8x4x2-q8x4x2"; + mt.vec_dot = vec_dot_q8x4x2_q8x4x2; + mt.vec_dot_rx2 = vec_dot_q8x4x2_q8x4x2_rx2; + + matvec(&mt, &octx->src0, &octx->src1, &octx->dst, &octx->src0_spad, &octx->src1_spad, &octx->dst_spad, n, i, + octx->src0_nrows_per_thread, octx->ctx->dma[i]); +} + +static void htp_matmul_q8x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = data; + + struct htp_matmul_type mt; + mt.type = "q8x4x2-q8x4x2"; + mt.vec_dot = vec_dot_q8x4x2_q8x4x2; + mt.vec_dot_rx2 = vec_dot_q8x4x2_q8x4x2_rx2; + + matmul(&mt, &octx->src0, &octx->src1, &octx->dst, &octx->src0_spad, &octx->src1_spad, &octx->dst_spad, n, i, + octx->src0_nrows_per_thread, octx->ctx->dma[i]); +} + +static void htp_matvec_mxfp4x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = data; + + struct htp_matmul_type mt; + mt.type = "mxfp4x4x2-q8x4x2"; + mt.vec_dot = vec_dot_mxfp4x4x2_q8x4x2; + mt.vec_dot_rx2 = vec_dot_mxfp4x4x2_q8x4x2_rx2; + + matvec(&mt, &octx->src0, &octx->src1, &octx->dst, &octx->src0_spad, &octx->src1_spad, &octx->dst_spad, n, i, + octx->src0_nrows_per_thread, octx->ctx->dma[i]); +} + +static void htp_matmul_mxfp4x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = data; + + struct htp_matmul_type mt; + mt.type = "mxfp4x4x2-q8x4x2"; + mt.vec_dot = vec_dot_mxfp4x4x2_q8x4x2; + mt.vec_dot_rx2 = vec_dot_mxfp4x4x2_q8x4x2_rx2; + + matmul(&mt, &octx->src0, &octx->src1, &octx->dst, &octx->src0_spad, &octx->src1_spad, &octx->dst_spad, n, i, + octx->src0_nrows_per_thread, octx->ctx->dma[i]); +} + +static void htp_matmul_f16_f32(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = data; + matmul_f16_f32(&octx->src0, &octx->src1, &octx->dst, &octx->src0_spad, &octx->src1_spad, &octx->dst_spad, n, i, + octx->src0_nrows_per_thread, octx->ctx->dma[i]); +} + +// ** matmul-id callbacks for worker_pool + +static void htp_matvec_id_q4x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = data; + + struct htp_matmul_type mt; + mt.type = "q4x4x2-q8x4x2"; + mt.vec_dot = vec_dot_q4x4x2_q8x4x2; + mt.vec_dot_rx2 = vec_dot_q4x4x2_q8x4x2_rx2; + + matvec_id(&mt, &octx->src0, &octx->src1, &octx->src2, &octx->dst, &octx->src0_spad, &octx->src1_spad, + &octx->src2_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); +} + +static void 
htp_matmul_id_q4x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = data; + + struct htp_matmul_type mt; + mt.type = "q4x4x2-q8x4x2"; + mt.vec_dot = vec_dot_q4x4x2_q8x4x2; + mt.vec_dot_rx2 = vec_dot_q4x4x2_q8x4x2_rx2; + + matmul_id(&mt, &octx->src0, &octx->src1, &octx->src2, &octx->dst, &octx->src0_spad, &octx->src1_spad, + &octx->src2_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); +} + +static void htp_matvec_id_q8x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = data; + + struct htp_matmul_type mt; + mt.type = "q8x4x2-q8x4x2"; + mt.vec_dot = vec_dot_q8x4x2_q8x4x2; + mt.vec_dot_rx2 = vec_dot_q8x4x2_q8x4x2_rx2; + + matvec_id(&mt, &octx->src0, &octx->src1, &octx->src2, &octx->dst, &octx->src0_spad, &octx->src1_spad, + &octx->src2_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); +} + +static void htp_matmul_id_q8x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = data; + + struct htp_matmul_type mt; + mt.type = "q8x4x2-q8x4x2"; + mt.vec_dot = vec_dot_q8x4x2_q8x4x2; + mt.vec_dot_rx2 = vec_dot_q8x4x2_q8x4x2_rx2; + + matmul_id(&mt, &octx->src0, &octx->src1, &octx->src2, &octx->dst, &octx->src0_spad, &octx->src1_spad, + &octx->src2_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); +} + +static void htp_matvec_id_mxfp4x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = data; + + struct htp_matmul_type mt; + mt.type = "mxfp4x4x2-q8x4x2"; + mt.vec_dot = vec_dot_mxfp4x4x2_q8x4x2; + mt.vec_dot_rx2 = vec_dot_mxfp4x4x2_q8x4x2_rx2; + + matvec_id(&mt, &octx->src0, &octx->src1, &octx->src2, &octx->dst, &octx->src0_spad, &octx->src1_spad, + &octx->src2_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); +} + +static void htp_matmul_id_mxfp4x4x2_q8x4x2(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = data; + + struct htp_matmul_type mt; + mt.type = "mxfp4x4x2-q8x4x2"; + mt.vec_dot = vec_dot_mxfp4x4x2_q8x4x2; + mt.vec_dot_rx2 = vec_dot_mxfp4x4x2_q8x4x2_rx2; + + matmul_id(&mt, &octx->src0, &octx->src1, &octx->src2, &octx->dst, &octx->src0_spad, &octx->src1_spad, + &octx->src2_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); +} + +// ** main matmul entry point + +int op_matmul(struct htp_ops_context * octx) { + const struct htp_tensor * src0 = &octx->src0; + const struct htp_tensor * src1 = &octx->src1; + struct htp_tensor * dst = &octx->dst; + + htp_matmul_preamble; + + const char * op_type; + + const uint32_t src0_nrows = ne01 * ne02 * ne03; + const uint32_t src1_nrows = ne11 * ne12 * ne13; + + const size_t src0_row_size = nb01; + const size_t dst_row_size = nb1; + size_t src1_row_size = nb11; + + const size_t src0_row_size_padded = htp_round_up(src0_row_size, 128); + size_t src1_row_size_padded; + + worker_callback_t quant_job_func; + worker_callback_t matmul_job_func; + + bool need_quant = !(octx->flags & HTP_OPFLAGS_SKIP_QUANTIZE); + + switch (src0->type) { + case HTP_TYPE_Q4_0: + op_type = "q4x4x2-fp32"; + quant_job_func = htp_quantize_fp32_q8x4x2; + if (src1_nrows > 1) { + matmul_job_func = htp_matmul_q4x4x2_q8x4x2; + } else { + matmul_job_func = htp_matvec_q4x4x2_q8x4x2; + } + + src1_row_size = q8x4x2_row_size(ne10); // row size post quantization + + // Entire src1 tensor is placed into the VTCM + // For other tensors we allocate N rows per thread, padded to HVX 
vector size + + octx->dst_spad.size_per_thread = htp_round_up(HTP_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = htp_round_up(HTP_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); + + // src0 spad is also used in dynamic quantizer to store padded src1 rows + src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); + if (octx->src0_spad.size_per_thread < src1_row_size_padded) { + octx->src0_spad.size_per_thread = src1_row_size_padded; + } + + octx->src1_spad.size = octx->src1_spad.size_per_thread; + octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; + octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads; + break; + + case HTP_TYPE_Q8_0: + op_type = "q8x4x2-fp32"; + quant_job_func = htp_quantize_fp32_q8x4x2; + if (src1_nrows > 1) { + matmul_job_func = htp_matmul_q8x4x2_q8x4x2; + } else { + matmul_job_func = htp_matvec_q8x4x2_q8x4x2; + } + + src1_row_size = q8x4x2_row_size(ne10); // row size post quantization + + // Entire src1 tensor is placed into the VTCM + // For other tensors we allocate N rows per thread, padded to HVX vector size + + octx->dst_spad.size_per_thread = htp_round_up(HTP_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = htp_round_up(HTP_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); + + // src0 spad is also used in dynamic quantizer to store padded src1 rows + src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); + if (octx->src0_spad.size_per_thread < src1_row_size_padded) { + octx->src0_spad.size_per_thread = src1_row_size_padded; + } + + octx->src1_spad.size = octx->src1_spad.size_per_thread; + octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; + octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads; + break; + + case HTP_TYPE_MXFP4: + op_type = "mxfp4x4x2-f32"; + quant_job_func = htp_quantize_fp32_q8x4x2; + if (src1_nrows > 1) { + matmul_job_func = htp_matmul_mxfp4x4x2_q8x4x2; + } else { + matmul_job_func = htp_matvec_mxfp4x4x2_q8x4x2; + } + + src1_row_size = q8x4x2_row_size(ne10); // row size post quantization + + // Entire src1 tensor is placed into the VTCM + // For other tensors we allocate N rows per thread, padded to HVX vector size + + octx->dst_spad.size_per_thread = htp_round_up(HTP_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = htp_round_up(HTP_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); + + // src0 spad is also used in dynamic quantizer to store padded src1 rows + src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); + if (octx->src0_spad.size_per_thread < src1_row_size_padded) { + octx->src0_spad.size_per_thread = src1_row_size_padded; + } + + octx->src1_spad.size = octx->src1_spad.size_per_thread; + octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; + octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads; + break; + + case HTP_TYPE_F16: + op_type = "f16-f32"; + quant_job_func = NULL; // htp_quantize_f32_f16; + matmul_job_func = htp_matmul_f16_f32; + + // For all tensors we allocate N rows per thread, padded to HVX vector size + octx->dst_spad.size_per_thread = htp_round_up(HTP_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread 
= htp_round_up(HTP_SPAD_SRC0_NROWS * src0_row_size, 256); + octx->src1_spad.size_per_thread = htp_round_up(HTP_SPAD_SRC1_NROWS * src1_row_size, 256); + + octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; + octx->src1_spad.size = octx->src1_spad.size_per_thread * octx->n_threads; + octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads; + + need_quant = false; + break; + + default: + return HTP_STATUS_NO_SUPPORT; + } + + // VTCM scratchpads for all tensors + size_t spad_size = octx->src1_spad.size + octx->src0_spad.size + octx->dst_spad.size; + + FARF(HIGH, "matmul-%s : src0-spad-size %u src1-spad-size %u dst-spad-size %u (%zu)\n", op_type, + octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size, spad_size); + + FARF(HIGH, "matmul-%s : %ux%ux%ux%u * %ux%ux%ux%u-> %ux%ux%ux%u (0x%p, 0x%p, 0x%p)\n", op_type, src0->ne[0], + src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], + dst->ne[1], dst->ne[2], dst->ne[3], src0->data, src1->data, dst->data); + + // Make sure the reserved vtcm size is sufficient + if (octx->ctx->vtcm_size < spad_size) { + FARF(ERROR, "matmul-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, + octx->ctx->vtcm_size, spad_size); + return HTP_STATUS_VTCM_TOO_SMALL; + } + + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; + octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; + + octx->src0_nrows_per_thread = (src0_nrows + octx->n_threads - 1) / octx->n_threads; + octx->src0_nrows_per_thread += (octx->src0_nrows_per_thread & 1); // round up to even + + if (need_quant) { + // Run quant jobs + const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads); + octx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs; + worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, octx, n_quant_jobs); + } + + if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { + // Run matmul jobs + const uint32_t n_matmul_jobs = octx->n_threads; + worker_pool_run_func(octx->ctx->worker_pool, matmul_job_func, octx, n_matmul_jobs); + } + + return HTP_STATUS_OK; +} + +// ** main matmul-id entry point + +int op_matmul_id(struct htp_ops_context * octx) { + const struct htp_tensor * src0 = &octx->src0; + const struct htp_tensor * src1 = &octx->src1; + const struct htp_tensor * ids = &octx->src2; + struct htp_tensor * dst = &octx->dst; + + htp_matmul_preamble; + + const char * op_type; + + worker_callback_t quant_job_func; + worker_callback_t matmul_id_job_func; + + const size_t src0_row_size = nb01; + const size_t dst_row_size = nb1; + + const size_t src0_row_size_padded = htp_round_up(src0_row_size, 128); + + const uint32_t src0_nrows = ne01; // per expert + const uint32_t src1_nrows = ne11 * ne12 * ne13; + + size_t src1_row_size; + size_t src1_row_size_padded; + + // row groups + const int n_ids = ids->ne[0]; // n_expert_used + const int n_as = ne02; // n_expert + + size_t matrix_row_counts_size = n_as * sizeof(uint32_t); + size_t matrix_row_map_size = n_as * ids->ne[0] * ids->ne[1] * sizeof(struct mmid_row_mapping); + + switch (src0->type) { + case HTP_TYPE_Q4_0: + op_type = "q4x2x2-f32"; + quant_job_func = htp_quantize_fp32_q8x4x2; + src1_row_size = q8x4x2_row_size(ne10); // row size post quantization + if (src1_nrows > 1) { + matmul_id_job_func = htp_matmul_id_q4x4x2_q8x4x2; + } else { + matmul_id_job_func = htp_matvec_id_q4x4x2_q8x4x2; + } + + // Entire src1 tensor is placed into 
the VTCM + // For other tensors we allocate N rows per thread, padded to HVX vector size + octx->dst_spad.size_per_thread = htp_round_up(HTP_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = htp_round_up(HTP_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); + octx->src2_spad.size_per_thread = htp_round_up(matrix_row_counts_size + matrix_row_map_size, 256); + + // src0 spad is also used in dynamic quantizer to store padded src1 rows + src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); + if (octx->src0_spad.size_per_thread < src1_row_size_padded) { + octx->src0_spad.size_per_thread = src1_row_size_padded; + } + + octx->src2_spad.size = octx->src2_spad.size_per_thread; + octx->src1_spad.size = octx->src1_spad.size_per_thread; + octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; + octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads; + break; + + case HTP_TYPE_Q8_0: + op_type = "q8x2x2-f32"; + quant_job_func = htp_quantize_fp32_q8x4x2; + src1_row_size = q8x4x2_row_size(ne10); // row size post quantization + if (src1_nrows > 1) { + matmul_id_job_func = htp_matmul_id_q8x4x2_q8x4x2; + } else { + matmul_id_job_func = htp_matvec_id_q8x4x2_q8x4x2; + } + + // Entire src1 tensor is placed into the VTCM + // For other tensors we allocate N rows per thread, padded to HVX vector size + octx->dst_spad.size_per_thread = htp_round_up(HTP_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = htp_round_up(HTP_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); + octx->src2_spad.size_per_thread = htp_round_up(matrix_row_counts_size + matrix_row_map_size, 256); + + // src0 spad is also used in dynamic quantizer to store padded src1 rows + src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); + if (octx->src0_spad.size_per_thread < src1_row_size_padded) { + octx->src0_spad.size_per_thread = src1_row_size_padded; + } + + octx->src2_spad.size = octx->src2_spad.size_per_thread; + octx->src1_spad.size = octx->src1_spad.size_per_thread; + octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; + octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads; + break; + + case HTP_TYPE_MXFP4: + op_type = "mxfp4x2x2-f32"; + quant_job_func = htp_quantize_fp32_q8x4x2; + src1_row_size = q8x4x2_row_size(ne10); // row size post quantization + if (src1_nrows > 1) { + matmul_id_job_func = htp_matmul_id_mxfp4x4x2_q8x4x2; + } else { + matmul_id_job_func = htp_matvec_id_mxfp4x4x2_q8x4x2; + } + + // Entire src1 tensor is placed into the VTCM + // For other tensors we allocate N rows per thread, padded to HVX vector size + octx->dst_spad.size_per_thread = htp_round_up(HTP_SPAD_DST_NROWS * dst_row_size, 256); + octx->src0_spad.size_per_thread = htp_round_up(HTP_SPAD_SRC0_NROWS * src0_row_size_padded, 256); + octx->src1_spad.size_per_thread = htp_round_up(src1_row_size * src1_nrows, 256); + octx->src2_spad.size_per_thread = htp_round_up(matrix_row_counts_size + matrix_row_map_size, 256); + + // src0 spad is also used in dynamic quantizer to store padded src1 rows + src1_row_size_padded = htp_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); + if (octx->src0_spad.size_per_thread < src1_row_size_padded) { + octx->src0_spad.size_per_thread = src1_row_size_padded; + } + + octx->src2_spad.size = 
octx->src2_spad.size_per_thread; + octx->src1_spad.size = octx->src1_spad.size_per_thread; + octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; + octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads; + break; + + default: + return HTP_STATUS_NO_SUPPORT; + } + + size_t spad_size = octx->src2_spad.size + octx->src1_spad.size + octx->src0_spad.size + octx->dst_spad.size; + + FARF(HIGH, "matmul-id-%s : src0-spad-size %u src1-spad-size %u src2-spad-size %u dst-spad-size %u (%zu)\n", op_type, + octx->src0_spad.size, octx->src1_spad.size, octx->src2_spad.size, octx->dst_spad.size, spad_size); + + FARF(HIGH, "matmul-id-%s : %ux%ux%ux%u * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u (0x%p, 0x%p, 0x%p)\n", op_type, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], + ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], src0->data, + src1->data, dst->data); + + // Make sure the reserved vtcm size is sufficient + if (octx->ctx->vtcm_size < spad_size) { + FARF(ERROR, "matmul-id-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, + octx->ctx->vtcm_size, spad_size); + return HTP_STATUS_VTCM_TOO_SMALL; + } + + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; + octx->src2_spad.data = octx->src1_spad.data + octx->src1_spad.size; + octx->dst_spad.data = octx->src2_spad.data + octx->src2_spad.size; + + octx->src0_nrows_per_thread = (src0_nrows + octx->n_threads - 1) / octx->n_threads; + octx->src0_nrows_per_thread += (octx->src0_nrows_per_thread & 1); // round up to even + + if (src1_nrows > 1) { + // initialize matrix_row_counts and map + uint32_t * matrix_row_counts = (uint32_t *) octx->src2_spad.data + 0; + struct mmid_row_mapping * matrix_rows = (void *) octx->src2_spad.data + matrix_row_counts_size; + + memset(matrix_row_counts, 0, n_as * sizeof(uint32_t)); + + // group rows by src0 matrix + for (uint32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { // token idx + for (uint32_t id = 0; id < n_ids; ++id) { // expert idx + const uint32_t i02 = + *(const uint32_t *) ((const uint8_t *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]); + + assert(i02 >= 0 && i02 < n_as); + + MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) { id, iid1 }; + matrix_row_counts[i02] += 1; + } + } + } + + // Setup worker pool callbacks + if (!(octx->flags & HTP_OPFLAGS_SKIP_QUANTIZE)) { + // Run quant jobs + const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads); + octx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs; + worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, octx, n_quant_jobs); + } + + if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { + // Run matmul-id jobs + const uint32_t n_matmul_jobs = octx->n_threads; + worker_pool_run_func(octx->ctx->worker_pool, matmul_id_job_func, octx, n_matmul_jobs); + } + + return HTP_STATUS_OK; +} diff --git a/ggml/src/ggml-hexagon/htp/ops-utils.h b/ggml/src/ggml-hexagon/htp/ops-utils.h new file mode 100644 index 00000000000..f03ff34028f --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/ops-utils.h @@ -0,0 +1,116 @@ +#ifndef OPS_UTILS_H +#define OPS_UTILS_H + +#include "htp-msg.h" + +#ifndef MAX +# define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif + +#ifndef MIN +# define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +#endif + +static inline uint64_t htp_get_cycles() { + uint64_t cycles = 0; + asm volatile(" %0 = c15:14\n" : "=r"(cycles)); + return cycles; +} + +static inline uint64_t htp_get_pktcnt() { + uint64_t pktcnt; + asm volatile(" %0 = c19:18\n" : "=r"(pktcnt)); + return pktcnt; +} + +static inline int32_t htp_is_aligned(void * addr, uint32_t align) { + return ((size_t) addr & (align - 1)) == 0; +} + +static inline uint32_t htp_round_up(uint32_t n, uint32_t m) { + return m * ((n + m - 1) / m); +} + +static inline void htp_l2fetch(const void * p, uint32_t height, uint32_t width, uint32_t stride) { + const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height)); + asm volatile(" l2fetch(%0,%1) " : : "r"(p), "r"(control)); +} + +static inline int32_t htp_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { + uint32_t left_off = (size_t) addr & (chunk_size - 1); + uint32_t right_off = left_off + n; + return right_off <= chunk_size; +} + +static inline void htp_dump_int8_line(char * pref, const int8_t * x, int n) { + char str[1024], *p = str; + p += sprintf(p, "%s: ", pref); + for (int i = 0; i < 16; i++) { + p += sprintf(p, "%d, ", x[i]); + } + FARF(HIGH, "%s\n", str); +} + +static inline void htp_dump_uint8_line(char * pref, const uint8_t * x, uint32_t n) { + char str[1024], *p = str; + p += sprintf(p, "%s: ", pref); + for (int i = 0; i < n; i++) { + p += sprintf(p, "%d, ", x[i]); + } + FARF(HIGH, "%s\n", str); +} + +static inline void htp_dump_int32_line(char * pref, const int32_t * x, uint32_t n) { + char str[1024], *p = str; + p += sprintf(p, "%s: ", pref); + for (int i = 0; i < n; i++) { + p += sprintf(p, "%d, ", (int) x[i]); + } + FARF(HIGH, "%s\n", str); +} + +static inline void htp_dump_fp16_line(char * pref, const __fp16 * x, uint32_t n) { + char str[1024], *p = str; + p += sprintf(p, "%s: ", pref); + for (int i = 0; i < n; i++) { + p += sprintf(p, "%.6f, ", (float) x[i]); + } + FARF(HIGH, "%s\n", str); +} + +static inline void htp_dump_fp32_line(char * pref, const float * x, uint32_t n) { + char str[1024], *p = str; + p += sprintf(p, "%s: ", pref); + for (int i = 0; i < n; i++) { + p += sprintf(p, "%.6f, ", x[i]); + } + FARF(HIGH, "%s\n", str); +} + +static inline void htp_dump_f32(char * pref, const float * x, uint32_t n) { + uint32_t n0 = n / 16; + uint32_t n1 = n % 16; + + uint32_t i = 0; + for (; i < n0; i++) { + htp_dump_fp32_line(pref, x + (16 * i), 16); + } + if (n1) { + htp_dump_fp32_line(pref, x + (16 * i), n1); + } +} + +static inline void htp_dump_f16(char * pref, const __fp16 * x, uint32_t n) { + uint32_t n0 = n / 16; + uint32_t n1 = n % 16; + + uint32_t i = 0; + for (; i < n0; i++) { + htp_dump_fp16_line(pref, x + (16 * i), 16); + } + if (n1) { + htp_dump_fp16_line(pref, x + (16 * i), n1); + } +} + +#endif /* OPS_UTILS_H */ diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c new file mode 100644 index 00000000000..16afa50f5b0 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -0,0 +1,418 @@ +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" + +#ifdef HTP_DEBUG +# define FARF_HIGH 1 +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "htp-ctx.h" +#include "htp-dma.h" +#include "htp-msg.h" +#include "htp-ops.h" +#include "hvx-utils.h" +#include "ops-utils.h" + 
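+// RoPE (rotary position embedding) for fp32 rows.
+//
+// Scalar reference for one dimension pair (x0, x1) at pair index i0 and position p
+// (ignoring freq_scale, freq_factors and the YaRN ext_factor correction), which the
+// HVX path in hvx_calc_rope_f32() vectorizes two vectors at a time:
+//
+//   theta       = p * freq_base^(-i0 / n_dims)      // theta_scale = freq_base^(-2 / n_dims)
+//   dst[i0 + 0] = (x0 * cos(theta) - x1 * sin(theta)) * attn_factor
+//   dst[i0 + 1] = (x0 * sin(theta) + x1 * cos(theta)) * attn_factor
+//
+// rope_cache_init() precomputes the per-row cos/sin table (including the YaRN ramp and
+// magnitude scaling) and rope_hex_f32() applies it, falling back to the scalar loop when
+// the tensors are not VLEN-aligned.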
+#define htp_rope_preamble \ + const uint32_t ne00 = src0->ne[0]; \ + const uint32_t ne01 = src0->ne[1]; \ + const uint32_t ne02 = src0->ne[2]; \ + const uint32_t ne03 = src0->ne[3]; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb00 = src0->nb[0]; \ + const uint32_t nb01 = src0->nb[1]; \ + const uint32_t nb02 = src0->nb[2]; \ + const uint32_t nb03 = src0->nb[3]; \ + \ + const uint32_t nb0 = dst->nb[0]; \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ + const uint32_t nb3 = dst->nb[3]; + +struct rope_th_ctx { + int32_t n_dims; + int32_t mode; + int32_t n_ctx_orig; + int32_t sections[4]; + + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + float theta_scale; + float corr_dims[2]; + + struct htp_ops_context * octx; +}; + +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / MAX(0.001f, high - low); + + return (1 - MIN(1, MAX(0, y))); +} + +static void rope_cache_init(const float theta_base, + float freq_scale, + const float * freq_factors, + float * corr_dims, + uint32_t ne0, + float ext_factor, + float mscale, + float * cache, + float theta_scale) { + // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py + float theta = theta_base; + + for (uint32_t i0 = 0; i0 < ne0; i0 += 2) { + const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f; + + float theta_extrap = theta / ff; + + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta2 = theta_interp; + + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta2 = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } + + cache[i0 + 0] = cosf(theta2) * mscale; + cache[i0 + 1] = sinf(theta2) * mscale; + + theta *= theta_scale; + } +} + +#define M_PI 3.1415926535897932384626433 + +static void rope_corr_dims(int n_dims, + int n_ctx_orig, + float freq_base, + float beta_fast, + float beta_slow, + float * dims) { + float start = floorf(n_dims * logf(n_ctx_orig / (beta_fast * 2 * (float) M_PI)) / (2 * logf(freq_base))); + float end = ceilf(n_dims * logf(n_ctx_orig / (beta_slow * 2 * (float) M_PI)) / (2 * logf(freq_base))); + dims[0] = MAX(0, start); + dims[1] = MIN(n_dims - 1, end); +} + +static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context * octx) { + memset(rope_ctx, 0, sizeof(struct rope_th_ctx)); + + const int32_t * op_params = &octx->op_params[0]; + + rope_ctx->n_dims = ((const int32_t *) op_params)[1]; + rope_ctx->mode = ((const int32_t *) op_params)[2]; + rope_ctx->n_ctx_orig = ((const int32_t *) op_params)[4]; + + memcpy(&rope_ctx->freq_base, (int32_t *) op_params + 5, sizeof(float)); + memcpy(&rope_ctx->freq_scale, (int32_t *) op_params + 6, sizeof(float)); + memcpy(&rope_ctx->ext_factor, (int32_t *) op_params + 7, sizeof(float)); + memcpy(&rope_ctx->attn_factor, (int32_t *) op_params + 8, sizeof(float)); + memcpy(&rope_ctx->beta_fast, (int32_t *) op_params + 9, sizeof(float)); + memcpy(&rope_ctx->beta_slow, (int32_t *) op_params + 10, sizeof(float)); + memcpy(&rope_ctx->sections, (int32_t *) op_params + 11, sizeof(int) * 4); + + 
rope_ctx->theta_scale = powf(rope_ctx->freq_base, -2.0f / rope_ctx->n_dims); + + rope_corr_dims(rope_ctx->n_dims, rope_ctx->n_ctx_orig, rope_ctx->freq_base, rope_ctx->beta_fast, + rope_ctx->beta_slow, rope_ctx->corr_dims); + + rope_ctx->octx = octx; + FARF(HIGH, "rope-f32 n_dims:%d, ext_factor:%.6f, theta_scale:%.6f, attn_factor:%.6f\n", rope_ctx->n_dims, + rope_ctx->ext_factor, rope_ctx->theta_scale, rope_ctx->attn_factor); +} + +static void hvx_calc_rope_f32(const float * restrict src0, + float * restrict dst, + const int num_elems, + const float * restrict theta_cache) { + // for (int i = 0; i < num_elems; i += 2) { + //const float cos_theta = theta_cache[i + 0]; + //const float sin_theta = theta_cache[i + 1]; + + //const float x0 = src[0]; + //const float x1 = src[1]; + + //dst[0] = x0*cos_theta - x1*sin_theta; + //dst[1] = x0*sin_theta + x1*cos_theta; + + //src += 2; + //dst += 2; + // } + + const uint8_t * restrict src0_curr = (const uint8_t *) src0; + const uint8_t * restrict theta_curr = (const uint8_t *) theta_cache; + uint8_t * restrict dst_curr = (uint8_t *) dst; + + int step_of_1 = num_elems >> 6; // 6 because we process two vectors at once + + for (int i = 0; i < step_of_1; i++) { + HVX_Vector v0 = *(HVX_Vector *) src0_curr; + HVX_Vector v1 = *(HVX_Vector *) (src0_curr + VLEN); + + HVX_Vector v2 = *(HVX_Vector *) theta_curr; + HVX_Vector v3 = *(HVX_Vector *) (theta_curr + VLEN); + + HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(v1, v0, -4); // vx0_x1[0] = x0, vx0_x1[1] = x1 + HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta + + HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin)); + HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin)); + HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_lo_W(vcos_sin)); + HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_hi_W(vcos_sin)); + + HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s); + HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c); + + HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4); + + *(HVX_Vector *) dst_curr = Q6_V_lo_W(vstore); + *(HVX_Vector *) (dst_curr + VLEN) = Q6_V_hi_W(vstore); + + src0_curr += 2 * VLEN; + theta_curr += 2 * VLEN; + dst_curr += 2 * VLEN; + } +} + +static void rope_hex_f32(struct rope_th_ctx * rope_ctx, + const uint32_t ir0, + const uint32_t ir1, + int nth, + int ith, + int opt_path) { + struct htp_ops_context * octx = rope_ctx->octx; + + const struct htp_tensor * src0 = &octx->src0; + const struct htp_tensor * src1 = &octx->src1; + const struct htp_tensor * src2 = &octx->src2; + struct htp_tensor * dst = &octx->dst; + + htp_rope_preamble; + + const int32_t * pos = (const int32_t *) src1->data; + + float * wp0 = (float *) (octx->src0_spad.data + (ith * nb01)); + + const float * freq_factors = NULL; + if (src2 != NULL) { + freq_factors = (const float *) src2->data; + } + + int ir = 0; + + for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch + for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len + const int32_t p = pos[i2]; + + rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor, + rope_ctx->attn_factor, wp0, rope_ctx->theta_scale); + + for (uint32_t i1 = 0; i1 < ne1; i1++) { // attn-heads + if (ir++ < ir0) { + continue; + } + if (ir > ir1) { + break; + } + + const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01); + float * dst_data = (float *) 
((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1); + + const float * src_loc = src; + float * dst_data_loc = dst_data; + + if (1 == opt_path) { + hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0); + } else { + for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) { + const float cos_theta = wp0[i0 + 0]; + const float sin_theta = wp0[i0 + 1]; + + const float x0 = src_loc[0]; + const float x1 = src_loc[1]; + + dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; + dst_data_loc[1] = x0 * sin_theta + x1 * cos_theta; + + src_loc += 2; + dst_data_loc += 2; + } + } + + for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) { + dst_data_loc[0] = src_loc[0]; + dst_data_loc[1] = src_loc[1]; + + src_loc += 2; + dst_data_loc += 2; + } + } + } + } +} + +static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int ith) { + struct htp_ops_context * octx = rope_ctx->octx; + + const struct htp_tensor * src0 = &octx->src0; + const struct htp_tensor * src1 = &octx->src1; + struct htp_tensor * dst = &octx->dst; + + htp_rope_preamble; + + const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread; + + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + + // no work for this thread + if (src0_start_row >= src0_end_row) { + return; + } + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + int is_aligned = 1; + int opt_path = 0; + if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) || + (0 == htp_is_aligned((void *) dst->data, VLEN))) { + FARF(HIGH, "rope-f32: unaligned addresses in rope op, possibly slower execution\n"); + is_aligned = 0; + } + if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { + opt_path = 1; + } + + rope_hex_f32(rope_ctx, src0_start_row, src0_end_row, nth, ith, opt_path); + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, src0_end_row, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +static void rope_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) { + struct rope_th_ctx * rope_ctx = (struct rope_th_ctx *) data; + + rope_job_f32_per_thread(rope_ctx, n, i); +} + +static int execute_op_rope_f32(struct htp_ops_context * octx) { + int err = HTP_STATUS_OK; + + const struct htp_tensor * src0 = &octx->src0; + const struct htp_tensor * src1 = &octx->src1; + const struct htp_tensor * src2 = &octx->src2; + struct htp_tensor * dst = &octx->dst; + + worker_callback_t op_func; + const char * op_type = NULL; + + struct rope_th_ctx rope_ctx; + + switch (octx->op) { + case HTP_OP_ROPE: + op_func = rope_job_dispatcher_f32; + op_type = "rope-f32"; + + init_rope_ctx(&rope_ctx, octx); + break; + + default: + FARF(ERROR, "Unsupported Op %u\n", octx->op); + return HTP_STATUS_NO_SUPPORT; + } + + const uint32_t n_threads = octx->n_threads; + + const size_t src0_row_size = src0->nb[1]; + const size_t src1_row_size = src0_row_size; + const size_t dst_row_size = dst->nb[1]; + + // VTCM scratchpads for all tensors + // N rows per thread, padded to HVX vector size + octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads; + octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads; + octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads; + + size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; + 
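To make the scratchpad sizing above concrete, here is a small self-contained sketch of the same arithmetic (standard C only, hypothetical tensor sizes): each scratchpad holds one row per thread rounded up to the 128-byte HVX vector length, and rows are split across jobs with a ceiling division.

```c
// Illustrative arithmetic only (values are hypothetical): how the VTCM
// scratchpad sizing and per-thread row split used above behave for a
// 128-wide F32 row processed by 4 threads.
#include <stdio.h>
#include <stdint.h>

static size_t round_up(size_t n, size_t m) { return (n + m - 1) / m * m; }

int main(void) {
    const size_t   row_size  = 128 * sizeof(float); // nb[1] for ne0 = 128, F32
    const uint32_t n_threads = 4;
    const uint32_t nrows     = 10;                  // ne1 * ne2 * ne3

    size_t spad_per_tensor = round_up(row_size, 128) * n_threads; // 512 * 4 = 2048 bytes
    size_t spad_total      = 3 * spad_per_tensor;                 // src0 + src1 + dst = 6144 bytes

    uint32_t n_jobs          = nrows < n_threads ? nrows : n_threads; // 4 jobs
    uint32_t rows_per_thread = (nrows + n_jobs - 1) / n_jobs;         // ceil(10 / 4) = 3

    printf("spad_total=%zu rows_per_thread=%u\n", spad_total, rows_per_thread);
    return 0;
}
```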
+ if (src2->ne[0]) { + FARF(HIGH, + "%s: %ux%ux%ux%u (x %ux%ux%ux%u x %ux%ux%ux%u) -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u " + "dst-spad-size %u\n", + op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], + src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], + dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size); + } else { + FARF(HIGH, + "%s: %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", + op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], + src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, + octx->dst_spad.size); + } + + // Make sure the reserved vtcm size is sufficient + if (octx->ctx->vtcm_size < spad_size) { + FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, + spad_size); + return HTP_STATUS_VTCM_TOO_SMALL; + } + + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; + octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; + + uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; + + if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { + uint32_t n_jobs = MIN(n_threads, src0_nrows); + octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; + worker_pool_run_func(octx->ctx->worker_pool, op_func, &rope_ctx, n_jobs); + } + + return err; +} + +int op_rope(struct htp_ops_context * octx) { + int err = HTP_STATUS_OK; + + switch (octx->src0.type) { + case HTP_TYPE_F32: + err = execute_op_rope_f32(octx); + break; + + default: + err = HTP_STATUS_NO_SUPPORT; + break; + } + + return err; +} diff --git a/ggml/src/ggml-hexagon/htp/softmax-ops.c b/ggml/src/ggml-hexagon/htp/softmax-ops.c new file mode 100644 index 00000000000..5fea4221afa --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/softmax-ops.c @@ -0,0 +1,402 @@ +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" + +#ifdef HTP_DEBUG +# define FARF_HIGH 1 +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "htp-ctx.h" +#include "htp-dma.h" +#include "htp-msg.h" +#include "htp-ops.h" +#include "hvx-utils.h" +#include "ops-utils.h" + +#define htp_softmax_preamble3 \ + const uint32_t ne00 = src0->ne[0]; \ + const uint32_t ne01 = src0->ne[1]; \ + const uint32_t ne02 = src0->ne[2]; \ + const uint32_t ne03 = src0->ne[3]; \ + \ + const uint32_t nb00 = src0->nb[0]; \ + const uint32_t nb01 = src0->nb[1]; \ + const uint32_t nb02 = src0->nb[2]; \ + const uint32_t nb03 = src0->nb[3]; \ + \ + const uint32_t ne10 = (src1->ne[0]) ? src1->ne[0] : 1; \ + const uint32_t ne11 = (src1->ne[0]) ? src1->ne[1] : 1; \ + const uint32_t ne12 = (src1->ne[0]) ? src1->ne[2] : 1; \ + const uint32_t ne13 = (src1->ne[0]) ? src1->ne[3] : 1; \ + \ + const uint32_t nb10 = (src1->ne[0]) ? src1->nb[0] : 1; \ + const uint32_t nb11 = (src1->ne[0]) ? src1->nb[1] : 1; \ + const uint32_t nb12 = (src1->ne[0]) ? src1->nb[2] : 1; \ + const uint32_t nb13 = (src1->ne[0]) ? 
src1->nb[3] : 1; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb0 = dst->nb[0]; \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ + const uint32_t nb3 = dst->nb[3]; + +struct softmax_th_ctx { + bool use_f16; + bool use_src1; + uint32_t n_head; + uint32_t n_head_log2; + + float scale; + float max_bias; + float m0; + float m1; + + struct htp_ops_context * octx; +}; + +static void init_softmax_ctx(struct softmax_th_ctx * softmax_ctx, struct htp_ops_context * octx) { + const struct htp_tensor * src0 = &octx->src0; + const struct htp_tensor * src1 = &octx->src1; + + memset(softmax_ctx, 0, sizeof(struct softmax_th_ctx)); + + memcpy(&softmax_ctx->scale, (float *) octx->op_params, sizeof(float)); + memcpy(&softmax_ctx->max_bias, (float *) octx->op_params + 1, sizeof(float)); + + softmax_ctx->n_head = src0->ne[2]; + softmax_ctx->n_head_log2 = 1u << (uint32_t) floor(log2(softmax_ctx->n_head)); + + softmax_ctx->m0 = powf(2.0f, -(softmax_ctx->max_bias) / softmax_ctx->n_head_log2); + softmax_ctx->m1 = powf(2.0f, -(softmax_ctx->max_bias / 2.0f) / softmax_ctx->n_head_log2); + + softmax_ctx->use_src1 = (src1->ne[0] != 0); + softmax_ctx->use_f16 = (src1->ne[0] != 0) && (src1->type == HTP_TYPE_F16); + + softmax_ctx->octx = octx; +} + +static void hvx_fast_softmax_prep_f32(const uint8_t * restrict src, + uint8_t * restrict dst, + const int num_elems, + float scale, + const uint8_t * restrict mask, + float slope) { + const uint8_t * restrict src_curr = src; + uint8_t * restrict dst_curr = dst; + const uint8_t * restrict mask_curr = mask; + + HVX_Vector scale_vec = hvx_vec_splat_fp32(scale); + HVX_Vector slope_vec = hvx_vec_splat_fp32(slope); + + int step_of_1 = num_elems >> 5; + +#pragma unroll(4) + for (int i = 0; i < step_of_1; i++) { + HVX_Vector v1 = *(HVX_Vector *) src_curr; + + HVX_Vector v3 = *(HVX_Vector *) mask_curr; + + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_vec); + + HVX_Vector v4 = Q6_Vqf32_vmpy_VsfVsf(v3, slope_vec); + + HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(v2, v4); + + *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v5); + + src_curr += VLEN; + dst_curr += VLEN; + mask_curr += VLEN; + } +} + +static void hvx_fast_softmax_f32(const uint8_t * restrict src, + uint8_t * restrict dst, + uint8_t * restrict pad, + const int num_elems) { + const HVX_Vector * restrict v_src = (HVX_Vector *) src; + HVX_Vector * restrict v_pad = (HVX_Vector *) pad; + HVX_Vector * restrict v_dst = (HVX_Vector *) dst; + + HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000); + HVX_Vector max_vec = hvx_vec_splat_fp32(((const float *) src)[0]); + HVX_Vector zero_v = Q6_V_vzero(); + HVX_Vector one_v = hvx_vec_splat_fp32(1.0); + + int step_of_1 = num_elems >> 5; + +#pragma unroll(4) + for (int i = 0; i < step_of_1; i++) { + HVX_Vector v1 = v_src[i]; + max_vec = Q6_Vsf_vmax_VsfVsf(max_vec, v1); + } + + HVX_Vector v = hvx_vec_reduce_max_fp32(max_vec); + max_vec = hvx_vec_repl4(v); + +#pragma unroll(4) + for (int i = 0; i < step_of_1; i++) { + HVX_Vector v1 = v_src[i]; + HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, max_vec); + + HVX_Vector v3 = hvx_vec_exp_fp32(Q6_Vsf_equals_Vqf32(v2)); + + sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), v3); + + v_pad[i] = v3; + } + + v = hvx_vec_qf32_reduce_sum(sum_vec); + sum_vec = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(v)); + + HVX_VectorPred pos_sum = Q6_Q_vcmp_gt_VwVw(sum_vec, zero_v); + HVX_Vector v4 = 
hvx_vec_inverse_fp32(sum_vec); + HVX_Vector scale_vec = Q6_V_vmux_QVV(pos_sum, v4, one_v); + +#pragma unroll(4) + for (int i = 0; i < step_of_1; i++) { + HVX_Vector v1 = v_pad[i]; + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_vec); + v_dst[i] = Q6_Vsf_equals_Vqf32(v2); + } +} + +static float hvx_softmax_f32(const uint8_t * restrict src, + uint8_t * restrict dst, + uint8_t * restrict spad, + const int num_elems, + const float max) { + hvx_sub_scalar_f32(src, max, spad, num_elems); + + hvx_exp_f32(spad, dst, num_elems, false); + + float sum = hvx_self_sum_f32(dst, num_elems); + + return sum; +} + +static void softmax_htp_f32(int nth, int ith, struct softmax_th_ctx * softmax_ctx, int opt_path) { + struct htp_ops_context * octx = softmax_ctx->octx; + + const struct htp_tensor * src0 = &octx->src0; + const struct htp_tensor * src1 = &octx->src1; + const struct htp_tensor * dst = &octx->dst; + + htp_softmax_preamble3; + + uint8_t * src0_spad_data = octx->src0_spad.data + (ith * nb01); + uint8_t * src1_spad_data = octx->src1_spad.data + (ith * nb01); + uint8_t * dst_spad_data = octx->dst_spad.data + (ith * nb1); + + float * wp0 = (float *) src0_spad_data; + float * wp1 = (float *) src1_spad_data; + float * wp2 = (float *) dst_spad_data; + + for (uint32_t i03 = 0; i03 < ne03; i03++) { + for (uint32_t i02 = 0; i02 < ne02; i02++) { + for (uint32_t i01 = ith; i01 < ne01; i01 += nth) { + const uint32_t i11 = i01; + const uint32_t i12 = i02 % ne12; + const uint32_t i13 = i03 % ne13; + + // ALiBi + const uint32_t h = i02; // head + + const float slope = (softmax_ctx->max_bias > 0.0f) ? + h < softmax_ctx->n_head_log2 ? + powf(softmax_ctx->m0, h + 1) : + powf(softmax_ctx->m1, 2 * (h - softmax_ctx->n_head_log2) + 1) : + 1.0f; + + float * sp = (float *) ((char *) octx->src0.data + i01 * nb01 + i02 * nb02 + i03 * nb03); + float * dp = (float *) ((char *) octx->dst.data + i01 * nb1 + i02 * nb2 + i03 * nb3); + + // broadcast the mask across rows + __fp16 * mp_f16 = (softmax_ctx->use_src1) ? + (__fp16 *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) : + NULL; + float * mp_f32 = (softmax_ctx->use_src1) ? + (float *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) : + NULL; + + if ((1 == opt_path) && (mp_f32) && !(softmax_ctx->use_f16)) { + hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, softmax_ctx->scale, + (const uint8_t *) mp_f32, slope); + } else { + hvx_scale_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, softmax_ctx->scale); + if (mp_f32) { + if (softmax_ctx->use_f16) { + for (int i = 0; i < ne00; ++i) { + wp0[i] += slope * (float) mp_f16[i]; + } + } else { + for (int i = 0; i < ne00; ++i) { + wp0[i] += slope * mp_f32[i]; + } + } + } + } + + if (1 == opt_path) { + hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00); + } else { + float max = hvx_self_max_f32((const uint8_t *) wp0, ne00); + float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max); + sum = sum > 0.0 ? 
(1.0 / sum) : 1; + hvx_scale_f32((const uint8_t *) wp2, (uint8_t *) dp, ne00, sum); + } + } + } + } +} + +static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int nth, int ith) { + struct htp_ops_context * octx = softmax_ctx->octx; + + const struct htp_tensor * src0 = &octx->src0; + const struct htp_tensor * src1 = &octx->src1; + struct htp_tensor * dst = &octx->dst; + + htp_softmax_preamble3; + + const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread; + + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + + // no work for this thread + if (src0_start_row >= src0_end_row) { + return; + } + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + int is_aligned = 1; + int opt_path = 0; + if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) { + is_aligned = 0; + FARF(HIGH, "softmax-f32: unaligned addresses in elementwise op, possibly slower execution\n"); + } + if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { + opt_path = 1; + } + + softmax_htp_f32(nth, ith, softmax_ctx, opt_path); + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "softmax-f32 %d/%d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, + softmax_ctx->use_f16, opt_path, ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, + ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +static void softmax_job_dispatcher_f32(unsigned int n, unsigned int i, void * p_data) { + struct softmax_th_ctx * p_softmax_ctx = (struct softmax_th_ctx *) p_data; + softmax_job_f32_per_thread(p_softmax_ctx, n, i); +} + +static int execute_op_softmax_f32(struct htp_ops_context * octx) { + int err = HTP_STATUS_OK; + + const struct htp_tensor * src0 = &octx->src0; + const struct htp_tensor * src1 = &octx->src1; + struct htp_tensor * dst = &octx->dst; + + worker_callback_t op_func; + const char * op_type = NULL; + + struct softmax_th_ctx softmax_ctx; + + switch (octx->op) { + case HTP_OP_SOFTMAX: + op_func = softmax_job_dispatcher_f32; + op_type = "softmax-f32"; + + init_softmax_ctx(&softmax_ctx, octx); + break; + + default: + FARF(ERROR, "Unsupported Op %u\n", octx->op); + return HTP_STATUS_NO_SUPPORT; + } + + const uint32_t n_threads = octx->n_threads; + + const size_t src0_row_size = src0->nb[1]; + const size_t src1_row_size = src0_row_size; + const size_t dst_row_size = dst->nb[1]; + + // VTCM scratchpads for all tensors + // N rows per thread, padded to HVX vector size + octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads; + octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads; + octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads; + + size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; + + if (src1->ne[0]) { + FARF(HIGH, + "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", + op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], + src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, + octx->dst_spad.size); + } else { + FARF(HIGH, "%s: %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], 
dst->ne[3], + octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size); + } + + // Make sure the reserved vtcm size is sufficient + if (octx->ctx->vtcm_size < spad_size) { + FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, + spad_size); + return HTP_STATUS_VTCM_TOO_SMALL; + } + + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; + octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; + + uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; + + if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { + uint32_t n_jobs = MIN(n_threads, src0_nrows); + octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; + worker_pool_run_func(octx->ctx->worker_pool, op_func, &softmax_ctx, n_jobs); + } + + return err; +} + +int op_softmax(struct htp_ops_context * octx) { + int err = HTP_STATUS_OK; + + switch (octx->src0.type) { + case HTP_TYPE_F32: + err = execute_op_softmax_f32(octx); + break; + + default: + err = HTP_STATUS_NO_SUPPORT; + break; + } + + return err; +} diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c new file mode 100644 index 00000000000..85b60779b79 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/unary-ops.c @@ -0,0 +1,255 @@ +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" + +#ifdef HTP_DEBUG +# define FARF_HIGH 1 +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "htp-ctx.h" +#include "htp-dma.h" +#include "htp-msg.h" +#include "htp-ops.h" +#include "hvx-utils.h" +#include "ops-utils.h" + +#define htp_unary_preamble \ + const uint32_t ne00 = src->ne[0]; \ + const uint32_t ne01 = src->ne[1]; \ + const uint32_t ne02 = src->ne[2]; \ + const uint32_t ne03 = src->ne[3]; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb00 = src->nb[0]; \ + const uint32_t nb01 = src->nb[1]; \ + const uint32_t nb02 = src->nb[2]; \ + const uint32_t nb03 = src->nb[3]; \ + \ + const uint32_t nb0 = dst->nb[0]; \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ + const uint32_t nb3 = dst->nb[3]; + +static void hvx_fast_rms_norm_f32(const uint8_t * restrict src, + uint8_t * restrict dst, + uint8_t * restrict pad, + const int num_elems, + float epsilon) { + const HVX_Vector * restrict v_src = (HVX_Vector *) src; + HVX_Vector * restrict v_dst = (HVX_Vector *) dst; + + HVX_Vector sum_v = Q6_V_vsplat_R(0x00000000); + HVX_Vector epsilon_v = hvx_vec_splat_fp32(epsilon); + + int step_of_1 = num_elems >> 5; +#pragma unroll(4) + for (int i = 0; i < step_of_1; i++) { + HVX_Vector v1 = v_src[i]; + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1); + sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2); + } + + HVX_Vector reduced_sum = hvx_vec_qf32_reduce_sum(sum_v); + sum_v = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(reduced_sum)); + + HVX_Vector t_v = hvx_vec_splat_fp32((float) num_elems); + HVX_Vector denom_v = hvx_vec_inverse_fp32(t_v); + HVX_Vector mean_v = Q6_Vqf32_vmpy_VsfVsf(sum_v, denom_v); + HVX_Vector mean_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(mean_v, epsilon_v); + + HVX_Vector scale_v = hvx_vec_rsqrt_fp32(Q6_Vsf_equals_Vqf32(mean_epsilon_v)); + +#pragma unroll(4) + 
for (int i = 0; i < step_of_1; i++) { + HVX_Vector v1 = v_src[i]; + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v); + v_dst[i] = Q6_Vsf_equals_Vqf32(v2); + } +} + +static void rms_norm_htp_f32(const float * restrict src, + float * restrict dst, + uint8_t * restrict spad, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + int32_t * op_params, + int opt_path) { + float epsilon = 0.f; + memcpy(&epsilon, op_params, sizeof(float)); + + for (uint32_t ir = 0; ir < num_rows; ir++) { + const float * restrict src_local = src + (ir * row_elems); + float * restrict dst_local = dst + (ir * row_elems); + + if (ir + 1 < num_rows) { + htp_l2fetch(src_local + row_elems, 1, row_size, row_size); + } + + if (1 == opt_path) { + hvx_fast_rms_norm_f32((const uint8_t *) src_local, (uint8_t *) dst_local, spad, row_elems, epsilon); + } else { + float sum = hvx_sum_of_squares_f32((const uint8_t *) src_local, row_elems); + + const float mean = sum / row_elems; + const float scale = 1.0f / sqrtf(mean + epsilon); + + hvx_scale_f32((const uint8_t *) src_local, (uint8_t *) dst_local, row_elems, scale); + } + } +} + +static void unary_job_f32_per_thread(const struct htp_tensor * src, + struct htp_tensor * dst, + uint8_t * spad, + int htp_op, + int32_t * op_params, + uint32_t nth, + uint32_t ith, + uint32_t src0_nrows_per_thread) { + htp_unary_preamble; + + const size_t src0_row_size = nb01; + const size_t dst_row_size = nb1; + + const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + + // no work for this thread + if (src0_start_row >= src0_end_row) { + return; + } + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + int is_aligned = 1; + int opt_path = 0; + if ((0 == htp_is_aligned((void *) src->data, VLEN)) || (0 == htp_is_aligned((void *) dst->data, VLEN))) { + is_aligned = 0; + FARF(HIGH, "unary-f32: unaligned addresses in unary op, possibly slower execution\n"); + } + if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { + opt_path = 1; + } + + const uint8_t * restrict data_src = (const uint8_t *) src->data; + uint8_t * restrict data_dst = (uint8_t *) dst->data; + + const float * restrict src_th = (float *) (data_src + (src0_start_row * src0_row_size)); + float * restrict dst_th = (float *) (data_dst + (src0_start_row * dst_row_size)); + uint8_t * restrict spad_th = (uint8_t *) spad + (ith * nb01); + + switch (htp_op) { + case HTP_OP_RMS_NORM: + rms_norm_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path); + break; + + default: + break; + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "unary-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, src->ne[0], + src->ne[1], src->ne[2], src->ne[3], src0_start_row, src0_end_row, dst->ne[0], dst->ne[1], dst->ne[2], + dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +static void unary_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) { + struct htp_ops_context * octx = (struct htp_ops_context *) data; + + unary_job_f32_per_thread(&octx->src0, &octx->dst, octx->src0_spad.data, octx->op, octx->op_params, n, i, + octx->src0_nrows_per_thread); +} + +static int execute_op_unary_f32(struct htp_ops_context * octx) { + int err = HTP_STATUS_OK; + + const struct htp_tensor * src0 = &octx->src0; + struct htp_tensor * dst = &octx->dst; + + worker_callback_t unary_op_func; + const char 
* op_type = NULL; + + switch (octx->op) { + case HTP_OP_RMS_NORM: + unary_op_func = unary_job_dispatcher_f32; + op_type = "rmsnorm-f32"; + break; + + default: + FARF(ERROR, "Unsupported unary Op %u\n", octx->op); + return HTP_STATUS_NO_SUPPORT; + } + + const int n_threads = octx->n_threads; + const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; + + const size_t src0_row_size = src0->nb[1]; + const size_t dst_row_size = dst->nb[1]; + + // VTCM scratchpads for all tensors + octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads; + octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads; + + size_t spad_size = octx->src0_spad.size + octx->dst_spad.size; + + FARF(HIGH, "%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size); + + // Make sure the reserved vtcm size is sufficient + if (octx->ctx->vtcm_size < spad_size) { + FARF(ERROR, "unary-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, + spad_size); + return HTP_STATUS_VTCM_TOO_SMALL; + } + + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + + if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { + uint32_t n_jobs = MIN(n_threads, src0_nrows); + + octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; + + worker_pool_run_func(octx->ctx->worker_pool, unary_op_func, octx, n_jobs); + } + + return err; +} + +int op_unary(struct htp_ops_context * octx) { + int err = HTP_STATUS_OK; + + switch (octx->src0.type) { + case HTP_TYPE_F32: + err = execute_op_unary_f32(octx); + break; + + default: + err = HTP_STATUS_NO_SUPPORT; + break; + } + + return err; +} diff --git a/ggml/src/ggml-hexagon/htp/worker-pool.c b/ggml/src/ggml-hexagon/htp/worker-pool.c new file mode 100644 index 00000000000..cd38c2126c7 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/worker-pool.c @@ -0,0 +1,297 @@ +#include "worker-pool.h" + +#include +#include +#include +#include +#include +#include + +#ifdef HTP_DEBUG +# define FARF_HIGH 1 +#endif + +#include "HAP_farf.h" + +#define WORKER_THREAD_STACK_SZ (2 * 16384) +#define LOWEST_USABLE_QURT_PRIO (254) + +struct worker_pool_s; + +// internal structure kept in thread-local storage per instance of worker pool +typedef struct { + struct worker_pool_s * pool; + unsigned int id; +} worker_context_t; + +// internal structure kept in thread-local storage per instance of worker pool +typedef struct worker_pool_s { + worker_pool_job_t job[MAX_NUM_WORKERS]; // list of job descriptors + qurt_thread_t thread[MAX_NUM_WORKERS]; // thread ID's of the workers + worker_context_t context[MAX_NUM_WORKERS]; // worker contexts + void * stack[MAX_NUM_WORKERS]; // thread stack pointers + unsigned int n_threads; // number of workers in this pool + + atomic_uint seqn; // seqno used to detect new jobs + atomic_uint next_job; // next job index + atomic_uint n_pending; // number of pending jobs + atomic_uint n_jobs; // number of current jobs + atomic_bool killed; // threads need to exit +} worker_pool_t; + +static void worker_pool_main(void * context) { + worker_context_t * me = (worker_context_t *) context; + worker_pool_t * pool = me->pool; + + FARF(HIGH, "worker-pool: thread %u started", me->id); + + unsigned int prev_seqn = 0; + while (!atomic_load(&pool->killed)) { + unsigned int seqn = 
atomic_load(&pool->seqn); + if (seqn == prev_seqn) { + // Nothing to do + qurt_futex_wait(&pool->seqn, prev_seqn); + continue; + } + + // New job + prev_seqn = seqn; + + unsigned int n = atomic_load(&pool->n_jobs); + unsigned int i = atomic_fetch_add(&pool->next_job, 1); + if (i >= n) { + // Spurios wakeup + continue; + } + + pool->job[i].func(n, i, pool->job[i].data); + + atomic_fetch_sub(&pool->n_pending, 1); + } + + FARF(HIGH, "worker-pool: thread %u stopped", me->id); +} + +AEEResult worker_pool_init_with_stack_size(worker_pool_context_t * context, uint32_t n_threads, uint32_t stack_size) { + int err = 0; + + if (NULL == context) { + FARF(ERROR, "NULL context passed to worker_pool_init()."); + return AEE_EBADPARM; + } + + // Allocations + int size = (stack_size * n_threads) + (sizeof(worker_pool_t)); + + unsigned char * mem_blob = (unsigned char *) malloc(size); + if (!mem_blob) { + FARF(ERROR, "Could not allocate memory for worker pool!!"); + return AEE_ENOMEMORY; + } + + worker_pool_t * me = (worker_pool_t *) (mem_blob + stack_size * n_threads); + + // name for the first worker, useful in debugging threads + char name[19]; + snprintf(name, 12, "0x%8x:", (int) me); + strcat(name, "worker0"); + me->n_threads = n_threads; + + // initializations + for (unsigned int i = 0; i < me->n_threads; i++) { + me->stack[i] = NULL; + me->thread[i] = 0; + + me->context[i].id = i; + me->context[i].pool = me; + } + + // initialize job queue + me->n_pending = 0; + me->n_jobs = 0; + me->next_job = 0; + me->seqn = 0; + me->killed = 0; + + // launch the workers + qurt_thread_attr_t attr; + qurt_thread_attr_init(&attr); + + for (unsigned int i = 0; i < me->n_threads; i++) { + // set up stack + me->stack[i] = mem_blob; + mem_blob += stack_size; + qurt_thread_attr_set_stack_addr(&attr, me->stack[i]); + qurt_thread_attr_set_stack_size(&attr, stack_size); + + // set up name + qurt_thread_attr_set_name(&attr, name); + name[17] = (name[17] + 1); + // name threads context:worker0, context:worker1, .. (recycle at 9, but num threads should be less than that anyway) + if (name[17] > '9') { + name[17] = '0'; + } + + // set up priority - by default, match the creating thread's prio + int prio = qurt_thread_get_priority(qurt_thread_get_id()); + + if (prio < 1) { + prio = 1; + } + if (prio > LOWEST_USABLE_QURT_PRIO) { + prio = LOWEST_USABLE_QURT_PRIO; + } + + qurt_thread_attr_set_priority(&attr, prio); + + // launch + err = qurt_thread_create(&me->thread[i], &attr, worker_pool_main, (void *) &me->context[i]); + if (err) { + FARF(ERROR, "Could not launch worker threads!"); + worker_pool_release((worker_pool_context_t *) &me); + return AEE_EQURTTHREADCREATE; + } + } + *context = (worker_pool_context_t *) me; + return AEE_SUCCESS; +} + +AEEResult worker_pool_init(worker_pool_context_t * context, uint32_t n_threads) { + return worker_pool_init_with_stack_size(context, n_threads, WORKER_THREAD_STACK_SZ); +} + +// clean up worker pool +void worker_pool_release(worker_pool_context_t * context) { + worker_pool_t * me = (worker_pool_t *) *context; + + // if no worker pool exists, return error. 
+ if (NULL == me) { + return; + } + + atomic_store(&me->killed, 1); + atomic_fetch_add(&me->seqn, 1); + qurt_futex_wake(&me->seqn, me->n_threads); + + // de-initializations + for (unsigned int i = 0; i < me->n_threads; i++) { + if (me->thread[i]) { + int status; + (void) qurt_thread_join(me->thread[i], &status); + } + } + + // free allocated memory (were allocated as a single buffer starting at stack[0]) + if (me->stack[0]) { + free(me->stack[0]); + } + + *context = NULL; +} + +// run jobs +AEEResult worker_pool_run_jobs(worker_pool_context_t context, worker_pool_job_t * job, unsigned int n) { + worker_pool_t * me = (worker_pool_t *) context; + if (NULL == me) { + FARF(ERROR, "worker-pool: invalid context"); + return AEE_EBADPARM; + } + + if (n > me->n_threads) { + FARF(ERROR, "worker-pool: invalid number of jobs %u for n-threads %u", n, me->n_threads); + return AEE_EBADPARM; + } + + memcpy(me->job, job, sizeof(worker_pool_job_t) * n); + + if (n > 1) { + atomic_store(&me->next_job, 1); + atomic_store(&me->n_jobs, n); + atomic_store(&me->n_pending, n - 1); + + // wake up workers + atomic_fetch_add(&me->seqn, 1); + qurt_futex_wake(&me->seqn, n - 1); + } + + // main thread runs job #0 + me->job[0].func(n, 0, me->job[0].data); + + if (n > 1) { + while (atomic_load(&me->n_pending)) + ; + } + + return 0; +} + +// run func +AEEResult worker_pool_run_func(worker_pool_context_t context, worker_callback_t func, void * data, unsigned int n) { + worker_pool_job_t job[n]; + + for (unsigned int i = 0; i < n; i++) { + job[i].func = func; + job[i].data = data; + } + + return worker_pool_run_jobs(context, job, n); +} + +AEEResult worker_pool_set_thread_priority(worker_pool_context_t context, unsigned int prio) { + worker_pool_t * me = (worker_pool_t *) context; + + // if no worker pool exists, return error. + if (!me) { + return AEE_ENOMORE; + } + + int result = AEE_SUCCESS; + if (prio < 1) { + prio = 1; + } + if (prio > LOWEST_USABLE_QURT_PRIO) { + prio = LOWEST_USABLE_QURT_PRIO; + } + + for (unsigned int i = 0; i < me->n_threads; i++) { + int res = qurt_thread_set_priority(me->thread[i], (unsigned short) prio); + if (0 != res) { + result = AEE_EBADPARM; + FARF(ERROR, "QURT failed to set priority of thread %d, ERROR = %d", me->thread[i], res); + } + } + + return result; +} + +AEEResult worker_pool_retrieve_thread_id(worker_pool_context_t context, unsigned int * tids) { + worker_pool_t * me = (worker_pool_t *) context; + if (!me) { + FARF(ERROR, "worker-pool: invalid context"); + return AEE_EBADPARM; + ; + } + + for (int i = 0; i < me->n_threads; i++) { + tids[i] = me->thread[i]; + } + + return AEE_SUCCESS; +} + +AEEResult worker_pool_get_thread_priority(worker_pool_context_t context, unsigned int * prio) { + worker_pool_t * me = (worker_pool_t *) context; + if (!me) { + FARF(ERROR, "worker-pool: invalid context"); + return AEE_EBADPARM; + } + + int priority = qurt_thread_get_priority(me->thread[0]); + if (priority > 0) { + *prio = priority; + return 0; + } else { + *prio = 0; + return AEE_EBADSTATE; + } +} diff --git a/ggml/src/ggml-hexagon/htp/worker-pool.h b/ggml/src/ggml-hexagon/htp/worker-pool.h new file mode 100644 index 00000000000..6f8c9056c4b --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/worker-pool.h @@ -0,0 +1,57 @@ +#ifndef HTP_WORKER_POOL_H +#define HTP_WORKER_POOL_H + +// MACRO enables function to be visible in shared-library case. 
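As a usage reference for the API declared in this header, here is a minimal illustrative sketch of the expected call sequence (init, run, release). It is not part of the patch and only assumes the signatures declared in worker-pool.h plus the behavior of worker_pool_run_jobs() above: job 0 runs on the calling thread and the call blocks until all jobs complete.

```c
// Minimal usage sketch (illustrative, not part of the patch).
#include "worker-pool.h"

static void my_job(unsigned int n, unsigned int i, void * data) {
    // n = total number of jobs, i = this job's index; split the work over [0, n)
    (void) data;
}

static int run_example(void) {
    worker_pool_context_t pool = NULL;

    if (worker_pool_init(&pool, 4) != AEE_SUCCESS) {   // spawn 4 worker threads
        return -1;
    }

    // job 0 runs on the calling thread, jobs 1..3 on the workers;
    // the call returns once all jobs have finished
    worker_pool_run_func(pool, my_job, NULL, 4);

    worker_pool_release(&pool);                        // joins threads, frees memory
    return 0;
}
```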
+#define WORKERPOOL_API __attribute__((visibility("default"))) + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/// signature of callbacks to be invoked by worker threads +typedef void (*worker_callback_t)(unsigned int n, unsigned int i, void *); + +/// Typedef of worker_pool context +typedef void * worker_pool_context_t; + +/// descriptor for requested callback +typedef struct { + worker_callback_t func; + void * data; +} worker_pool_job_t; + +/// Maximum supported number of worker threads. +#define MAX_NUM_WORKERS 10 + +// Initialize worker pool. +WORKERPOOL_API AEEResult worker_pool_init(worker_pool_context_t * context, uint32_t n_threads); + +// Initialize worker pool with custom stack size +WORKERPOOL_API AEEResult worker_pool_init_with_stack_size(worker_pool_context_t * context, + uint32_t n_threads, + uint32_t stack_size); + +// Kill worker threads and release worker pool resources +WORKERPOOL_API void worker_pool_release(worker_pool_context_t * context); + +// Run jobs with the worker pool. +WORKERPOOL_API AEEResult worker_pool_run_jobs(worker_pool_context_t context, worker_pool_job_t * job, unsigned int n); + +WORKERPOOL_API AEEResult worker_pool_run_func(worker_pool_context_t context, + worker_callback_t func, + void * data, + unsigned int n); + +WORKERPOOL_API AEEResult worker_pool_set_thread_priority(worker_pool_context_t context, unsigned int prio); +WORKERPOOL_API AEEResult worker_pool_get_thread_priority(worker_pool_context_t context, unsigned int * prio); +WORKERPOOL_API AEEResult worker_pool_retrieve_thread_id(worker_pool_context_t context, unsigned int * tids); + +#ifdef __cplusplus +} +#endif + +#endif // #ifndef HTP_WORKER_POOL_H From ec4436f7927f81f5d6b1ce92d2bae7ffff45434e Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sun, 12 Oct 2025 19:00:55 -0700 Subject: [PATCH 03/37] hexagon: fix format checker errors --- docs/backend/hexagon/CMakeUserPresets.json | 6 +- docs/backend/hexagon/README.md | 298 +++++++++++---------- docs/backend/hexagon/developer.md | 142 +++++----- docs/backend/hexagon/run-cli.sh | 6 +- docs/backend/hexagon/run-tool.sh | 2 +- ggml/src/ggml-hexagon/CMakeLists.txt | 2 +- ggml/src/ggml-hexagon/htp-utils.c | 30 +-- ggml/src/ggml-hexagon/htp-utils.h | 198 +++++++------- ggml/src/ggml-hexagon/htp/CMakeLists.txt | 2 +- ggml/src/ggml-hexagon/htp/hvx-exp.c | 4 +- ggml/src/ggml-hexagon/htp/hvx-inverse.c | 4 +- ggml/src/ggml-hexagon/htp/hvx-utils.c | 42 +-- ggml/src/ggml-hexagon/htp/hvx-utils.h | 16 +- ggml/src/ggml-hexagon/htp/matmul-ops.c | 20 +- ggml/src/ggml-hexagon/htp/softmax-ops.c | 8 +- ggml/src/ggml-hexagon/htp/unary-ops.c | 4 +- 16 files changed, 402 insertions(+), 382 deletions(-) diff --git a/docs/backend/hexagon/CMakeUserPresets.json b/docs/backend/hexagon/CMakeUserPresets.json index d413ec46fca..da0c03c1d9b 100644 --- a/docs/backend/hexagon/CMakeUserPresets.json +++ b/docs/backend/hexagon/CMakeUserPresets.json @@ -25,8 +25,8 @@ } }, - { - "name": "arm64-windows-snapdragon", + { + "name": "arm64-windows-snapdragon", "inherits": [ "base", "arm64-windows-llvm" ], "cacheVariables": { "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}", @@ -37,7 +37,7 @@ "LLAMA_CURL": "OFF" } }, - + { "name": "arm64-android-snapdragon-debug" , "inherits": [ "base", "arm64-android-snapdragon", "debug" ] }, { "name": "arm64-android-snapdragon-release", "inherits": [ "base", "arm64-android-snapdragon", "release" ] }, diff --git a/docs/backend/hexagon/README.md b/docs/backend/hexagon/README.md index db22d47bf0f..51e0026c629 100644 --- 
a/docs/backend/hexagon/README.md +++ b/docs/backend/hexagon/README.md @@ -7,57 +7,63 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc. This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop. - ~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.1 - [d]/> cd /workspace +``` +~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.1 +[d]/> cd /workspace +``` The rest of the Android build process assumes that you're running inside the toolchain container. Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets: - [d]/workspace> cp docs/backend/hexagon/CMakeUserPresets.json . - - [d]/workspace> cmake --preset arm64-android-snapdragon-release -B build-snapdragon - Preset CMake variables: - ANDROID_ABI="arm64-v8a" - ... - CMAKE_TOOLCHAIN_FILE="/opt/android-ndk-r28b/build/cmake/android.toolchain.cmake" - GGML_HEXAGON="ON" - GGML_OPENCL="ON" - GGML_OPENMP="OFF" - HEXAGON_SDK_ROOT="/opt/hexagon/6.3.0.0" - ... - -- Including OpenCL backend - -- Including Hexagon backend - ... - -- Build files have been written to: /workspace/build-snapdragon - - [d]/workspace> cmake --build build-snapdragon - ... - [144/356] Performing build step for 'htp-v73' - [1/16] Generating htp_iface_skel.c, htp_iface_stub.c, htp_iface.h - [2/16] Building C object CMakeFiles/ggml-htp-v73.dir/hvx-sigmoid.c.obj - [3/16] Building C object CMakeFiles/ggml-htp-v73.dir/htp-dma.c.obj - [4/16] Building C object CMakeFiles/ggml-htp-v73.dir/worker-pool.c.obj - ... - -- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v73.so - -- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v75.so - ... - +``` +[d]/workspace> cp docs/backend/hexagon/CMakeUserPresets.json . + +[d]/workspace> cmake --preset arm64-android-snapdragon-release -B build-snapdragon +Preset CMake variables: + ANDROID_ABI="arm64-v8a" + ... + CMAKE_TOOLCHAIN_FILE="/opt/android-ndk-r28b/build/cmake/android.toolchain.cmake" + GGML_HEXAGON="ON" + GGML_OPENCL="ON" + GGML_OPENMP="OFF" + HEXAGON_SDK_ROOT="/opt/hexagon/6.3.0.0" +... +-- Including OpenCL backend +-- Including Hexagon backend +... +-- Build files have been written to: /workspace/build-snapdragon + +[d]/workspace> cmake --build build-snapdragon +... +[144/356] Performing build step for 'htp-v73' +[1/16] Generating htp_iface_skel.c, htp_iface_stub.c, htp_iface.h +[2/16] Building C object CMakeFiles/ggml-htp-v73.dir/hvx-sigmoid.c.obj +[3/16] Building C object CMakeFiles/ggml-htp-v73.dir/htp-dma.c.obj +[4/16] Building C object CMakeFiles/ggml-htp-v73.dir/worker-pool.c.obj +... +-- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v73.so +-- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v75.so +... 
+``` + To generate an installable "package" simply use cmake --install: - [d]/workspace> cmake --install build-snapdragon --prefix pkg-snapdragon - -- Install configuration: "Release" - -- Installing: /workspace/pkg-snapdragon/lib/libggml-cpu.so - -- Installing: /workspace/pkg-snapdragon/lib/libggml-opencl.so - -- Installing: /workspace/pkg-snapdragon/lib/libggml-hexagon.so - -- Installing: /workspace/pkg-snapdragon/lib/libggml-htp-v73.so - -- Installing: /workspace/pkg-snapdragon/lib/libggml-htp-v75.so - -- Installing: /workspace/pkg-snapdragon/lib/libggml-htp-v79.so - -- Installing: /workspace/pkg-snapdragon/lib/libggml-htp-v81.so - -- Installing: /workspace/pkg-snapdragon/lib/libggml.so - ... - -- Installing: /workspace/pkg-snapdragon/bin/llama-bench - -- Installing: /workspace/pkg-snapdragon/bin/llama-cli - ... +``` +[d]/workspace> cmake --install build-snapdragon --prefix pkg-snapdragon +-- Install configuration: "Release" +-- Installing: /workspace/pkg-snapdragon/lib/libggml-cpu.so +-- Installing: /workspace/pkg-snapdragon/lib/libggml-opencl.so +-- Installing: /workspace/pkg-snapdragon/lib/libggml-hexagon.so +-- Installing: /workspace/pkg-snapdragon/lib/libggml-htp-v73.so +-- Installing: /workspace/pkg-snapdragon/lib/libggml-htp-v75.so +-- Installing: /workspace/pkg-snapdragon/lib/libggml-htp-v79.so +-- Installing: /workspace/pkg-snapdragon/lib/libggml-htp-v81.so +-- Installing: /workspace/pkg-snapdragon/lib/libggml.so +... +-- Installing: /workspace/pkg-snapdragon/bin/llama-bench +-- Installing: /workspace/pkg-snapdragon/bin/llama-cli +... +``` ## How to Install @@ -67,20 +73,24 @@ Please see https://developer.android.com/studio/debug/dev-options for details. Once ADB is enabled, use `adb push` to install `pkg-snapdragon` on the device. **Note that the toolchain Docker image doesn't have ADB and doesn't set up the ADB bridge. Please use native ADB on the host.** - ~/src/llama.cpp$ adb push pkg-snapdragon/* /data/local/tmp/llama.cpp - pkg-snapdragon/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s) - pkg-snapdragon/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s) - pkg-snapdragon/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s) - 102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s) +``` +~/src/llama.cpp$ adb push pkg-snapdragon/* /data/local/tmp/llama.cpp +pkg-snapdragon/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s) +pkg-snapdragon/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s) +pkg-snapdragon/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s) +102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s) +``` At this point, you should also install some models: - ~/src/llama.cpp$ wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf - ... - 2025-10-11 12:04:52 (10.7 MB/s) - ‘Llama-3.2-1B-Instruct-Q4_0.gguf’ saved [773025920/773025920] +``` +~/src/llama.cpp$ wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf +... +2025-10-11 12:04:52 (10.7 MB/s) - ‘Llama-3.2-1B-Instruct-Q4_0.gguf’ saved [773025920/773025920] - ~/src/llama.cpp$ adb push Llama-3.2-1B-Instruct-Q4_0.gguf /data/local/tmp/gguf - Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 
38.3 MB/s (773025920 bytes in 19.250s) +~/src/llama.cpp$ adb push Llama-3.2-1B-Instruct-Q4_0.gguf /data/local/tmp/gguf +Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920 bytes in 19.250s) +``` ## How to Run @@ -95,102 +105,108 @@ Here are some examples of running various llama.cpp tools via ADB. Simple question for Llama-3.2-1B - ~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./docs/backend/snapdragon/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?" - ... - ggml-hex: HTP arch version v79 - ggml-hex: allocating new session: HTP0 - ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb4000072c7955e50 - ... - load_tensors: offloading output layer to GPU - load_tensors: offloaded 17/17 layers to GPU - load_tensors: CPU model buffer size = 225.49 MiB - load_tensors: HTP0 model buffer size = 0.26 MiB - load_tensors: HTP0-REPACK model buffer size = 504.00 MiB - ... - I hope this helps you understand the world's most popular cookies! [end of text] - ... - llama_perf_sampler_print: sampling time = 30.08 ms / 487 runs ( 0.06 ms per token, 16191.77 tokens per second) - llama_perf_context_print: load time = 617.94 ms - llama_perf_context_print: prompt eval time = 80.76 ms / 11 tokens ( 7.34 ms per token, 136.21 tokens per second) - llama_perf_context_print: eval time = 9210.59 ms / 475 runs ( 19.39 ms per token, 51.57 tokens per second) - llama_perf_context_print: total time = 9454.92 ms / 486 tokens - llama_perf_context_print: graphs reused = 473 - llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | - llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | - llama_memory_breakdown_print: | - Host | 439 = 225 + 136 + 77 | - llama_memory_breakdown_print: | - HTP0-REPACK | 504 = 504 + 0 + 0 | +``` +~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./docs/backend/snapdragon/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?" +... +ggml-hex: HTP arch version v79 +ggml-hex: allocating new session: HTP0 +ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb4000072c7955e50 +... +load_tensors: offloading output layer to GPU +load_tensors: offloaded 17/17 layers to GPU +load_tensors: CPU model buffer size = 225.49 MiB +load_tensors: HTP0 model buffer size = 0.26 MiB +load_tensors: HTP0-REPACK model buffer size = 504.00 MiB +... +I hope this helps you understand the world's most popular cookies! [end of text] +... 
+llama_perf_sampler_print: sampling time = 30.08 ms / 487 runs ( 0.06 ms per token, 16191.77 tokens per second) +llama_perf_context_print: load time = 617.94 ms +llama_perf_context_print: prompt eval time = 80.76 ms / 11 tokens ( 7.34 ms per token, 136.21 tokens per second) +llama_perf_context_print: eval time = 9210.59 ms / 475 runs ( 19.39 ms per token, 51.57 tokens per second) +llama_perf_context_print: total time = 9454.92 ms / 486 tokens +llama_perf_context_print: graphs reused = 473 +llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | +llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | +llama_memory_breakdown_print: | - Host | 439 = 225 + 136 + 77 | +llama_memory_breakdown_print: | - HTP0-REPACK | 504 = 504 + 0 + 0 | +``` Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices - M=../gguf/OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 docs/backend/hexagon/run-cli.sh -f surfing.txt -no-cnv - ... - ggml-hex: allocating new registry : ndev 2 - ggml-hex: HTP arch version v81 - ggml-hex: allocating new session: HTP0 - ggml-hex: allocating new session: HTP1 - ... - load_tensors: offloading output layer to GPU - load_tensors: offloaded 17/17 layers to GPU - load_tensors: CPU model buffer size = 143.86 MiB - load_tensors: HTP1 model buffer size = 0.23 MiB - load_tensors: HTP1-REPACK model buffer size = 1575.00 MiB - load_tensors: HTP0 model buffer size = 0.28 MiB - load_tensors: HTP0-REPACK model buffer size = 2025.00 MiB - ... - llama_context: CPU output buffer size = 0.19 MiB - llama_kv_cache: HTP1 KV buffer size = 238.00 MiB - llama_kv_cache: HTP0 KV buffer size = 306.00 MiB - llama_kv_cache: size = 544.00 MiB ( 8192 cells, 16 layers, 1/1 seqs), K (q8_0): 272.00 MiB, V (q8_0): 272.00 MiB - llama_context: HTP0 compute buffer size = 15.00 MiB - llama_context: HTP1 compute buffer size = 15.00 MiB - llama_context: CPU compute buffer size = 24.56 MiB - ... - llama_perf_context_print: prompt eval time = 1730.57 ms / 212 tokens ( 8.16 ms per token, 122.50 tokens per second) - llama_perf_context_print: eval time = 5624.75 ms / 257 runs ( 21.89 ms per token, 45.69 tokens per second) - llama_perf_context_print: total time = 7377.33 ms / 469 tokens - llama_perf_context_print: graphs reused = 255 - llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | - llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | - llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | - llama_memory_breakdown_print: | - Host | 742 = 144 + 544 + 54 | - llama_memory_breakdown_print: | - HTP1-REPACK | 1575 = 1575 + 0 + 0 | - llama_memory_breakdown_print: | - HTP0-REPACK | 2025 = 2025 + 0 + 0 | +``` +M=../gguf/OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 docs/backend/hexagon/run-cli.sh -f surfing.txt -no-cnv +... +ggml-hex: allocating new registry : ndev 2 +ggml-hex: HTP arch version v81 +ggml-hex: allocating new session: HTP0 +ggml-hex: allocating new session: HTP1 +... +load_tensors: offloading output layer to GPU +load_tensors: offloaded 17/17 layers to GPU +load_tensors: CPU model buffer size = 143.86 MiB +load_tensors: HTP1 model buffer size = 0.23 MiB +load_tensors: HTP1-REPACK model buffer size = 1575.00 MiB +load_tensors: HTP0 model buffer size = 0.28 MiB +load_tensors: HTP0-REPACK model buffer size = 2025.00 MiB +... 
+llama_context: CPU output buffer size = 0.19 MiB +llama_kv_cache: HTP1 KV buffer size = 238.00 MiB +llama_kv_cache: HTP0 KV buffer size = 306.00 MiB +llama_kv_cache: size = 544.00 MiB ( 8192 cells, 16 layers, 1/1 seqs), K (q8_0): 272.00 MiB, V (q8_0): 272.00 MiB +llama_context: HTP0 compute buffer size = 15.00 MiB +llama_context: HTP1 compute buffer size = 15.00 MiB +llama_context: CPU compute buffer size = 24.56 MiB +... +llama_perf_context_print: prompt eval time = 1730.57 ms / 212 tokens ( 8.16 ms per token, 122.50 tokens per second) +llama_perf_context_print: eval time = 5624.75 ms / 257 runs ( 21.89 ms per token, 45.69 tokens per second) +llama_perf_context_print: total time = 7377.33 ms / 469 tokens +llama_perf_context_print: graphs reused = 255 +llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | +llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | +llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | +llama_memory_breakdown_print: | - Host | 742 = 144 + 544 + 54 | +llama_memory_breakdown_print: | - HTP1-REPACK | 1575 = 1575 + 0 + 0 | +llama_memory_breakdown_print: | - HTP0-REPACK | 2025 = 2025 + 0 + 0 | +``` Op test for MUL_MAT - ~/src/llama.cpp$ HB=0 docs/backend/snapdragon/run-tool.sh test-backend-ops -b HTP0 -o MUL_MAT - ... - Backend 2/3: HTP0 - Device description: Hexagon - Device memory: 2048 MB (2048 MB free) - MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK - MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK - MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK - - ~/src/llama.cpp-hexagon$ M=Llama-3.2-1B-Instruct-Q4_0.gguf docs/backend/hexagon/run-bench.sh -p 128 -n 64 - ... - ggml-hex: allocating new registry : ndev 1 - ggml-hex: HTP arch version v79 - ggml-hex: allocating new session: HTP0 - ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb400007d4b231090 - | model | size | params | backend | ngl | threads | n_batch | mmap | test | t/s | - | ---------------| ---------: | -----: | ---------- | --: | ------: | ------: | ---: | ----: | ------------: | - | llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | pp128 | 169.42 ± 1.75 | - | llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | tg64 | 51.54 ± 1.13 | - - build: 6a8cf8914 (6733) +``` +~/src/llama.cpp$ HB=0 docs/backend/snapdragon/run-tool.sh test-backend-ops -b HTP0 -o MUL_MAT +... +Backend 2/3: HTP0 +Device description: Hexagon +Device memory: 2048 MB (2048 MB free) +MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK +MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK +MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK + +~/src/llama.cpp-hexagon$ M=Llama-3.2-1B-Instruct-Q4_0.gguf docs/backend/hexagon/run-bench.sh -p 128 -n 64 +... 
+ggml-hex: allocating new registry : ndev 1 +ggml-hex: HTP arch version v79 +ggml-hex: allocating new session: HTP0 +ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb400007d4b231090 +| model | size | params | backend | ngl | threads | n_batch | mmap | test | t/s | +| ---------------| ---------: | -----: | ---------- | --: | ------: | ------: | ---: | ----: | ------------: | +| llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | pp128 | 169.42 ± 1.75 | +| llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | tg64 | 51.54 ± 1.13 | + +build: 6a8cf8914 (6733) +``` ## Environment variables - `GGML_HEXAGON_NDEV=1` - Controls the number of devices/sessions to allocate. The default is 1. + Controls the number of devices/sessions to allocate. The default is 1. Most quantized models under 4B fit into a single session; an 8B model needs two, and a 20B model needs four. - `GGML_HEXAGON_NHVX=0` Controls the number of HVX hardware threads to use. The default is all (actual number varies depending on the hardware version). - `GGML_HEXAGON_HOSTBUF=1` - Controls whether the Hexagon backend allocates host buffers. By default, all buffers except for REPACK are host buffers. + Controls whether the Hexagon backend allocates host buffers. By default, all buffers except for REPACK are host buffers. This option is required for testing Ops that require REPACK buffers (MUL_MAT and MUL_MAT_ID). - `GGML_HEXAGON_VERBOSE=1` @@ -205,18 +221,18 @@ Op test for MUL_MAT ggml-hex: HTP0 get-tensor result_output : data 0x7592487000 offset 0 size 513024 ``` -- `GGML_HEXAGON_PROFILE=1` +- `GGML_HEXAGON_PROFILE=1` Generates a host-side profile for the ggml-hexagon Ops. -- `GGML_HEXAGON_OPMASK=0x0` +- `GGML_HEXAGON_OPMASK=0x0` Allows enabling specific stages of the processing pipeline: - - `0x1` Enable Op Queue (i.e., queuing Ops into NPU) - - `0x2` Enable Dynamic Quantizer (if needed for the Op) + - `0x1` Enable Op Queue (i.e., queuing Ops into NPU) + - `0x2` Enable Dynamic Quantizer (if needed for the Op) - `0x4` Enable Op Compute (MUL_MAT, etc.) 
Examples: - `GGML_HEXAGON_OPMASK=0x1 llama-cli ...` - Ops are enqueued but NPU-side processing is stubbed out - `GGML_HEXAGON_OPMASK=0x3 llama-cli ...` - NPU performs dynamic quantization and skips the rest + `GGML_HEXAGON_OPMASK=0x1 llama-cli ...` - Ops are enqueued but NPU-side processing is stubbed out + `GGML_HEXAGON_OPMASK=0x3 llama-cli ...` - NPU performs dynamic quantization and skips the rest `GGML_HEXAGON_OPMASK=0x7 llama-cli ...` - Full queuing and processing of Ops (default) diff --git a/docs/backend/hexagon/developer.md b/docs/backend/hexagon/developer.md index 55164f92acb..98885ae0f2d 100644 --- a/docs/backend/hexagon/developer.md +++ b/docs/backend/hexagon/developer.md @@ -4,7 +4,7 @@ The Hexagon backend consist of two parts: - - `libggml-hexagon` + - `libggml-hexagon` This is the regular CPU-side GGML backend library, either shared or statically linked - `libggm-htp-vNN` @@ -13,26 +13,28 @@ The Hexagon backend consist of two parts: Here is an example of the build artifacts - ~/src/llama.cpp$ ls -l pkg-snapdragon/lib/libggml* - pkg-snapdragon/lib/libggml-base.so - pkg-snapdragon/lib/libggml-cpu.so - pkg-snapdragon/lib/libggml-hexagon.so <<< CPU library - pkg-snapdragon/lib/libggml-htp-v73.so <<< HTP op/kernels for Hexagon v73 - pkg-snapdragon/lib/libggml-htp-v75.so - pkg-snapdragon/lib/libggml-htp-v79.so - pkg-snapdragon/lib/libggml-htp-v81.so +``` +~/src/llama.cpp$ ls -l pkg-snapdragon/lib/libggml* +pkg-snapdragon/lib/libggml-base.so +pkg-snapdragon/lib/libggml-cpu.so +pkg-snapdragon/lib/libggml-hexagon.so <<< CPU library +pkg-snapdragon/lib/libggml-htp-v73.so <<< HTP op/kernels for Hexagon v73 +pkg-snapdragon/lib/libggml-htp-v75.so +pkg-snapdragon/lib/libggml-htp-v79.so +pkg-snapdragon/lib/libggml-htp-v81.so +``` ## Memory buffers Hexagon NPU backend takes advantage of the Snapdragon's unified memory model where all buffers are fully accessible by the CPU and GPU. -The NPU does have a dedicated tightly-coupled memory called VTCM but that memory is used only for intermediate data (e.g. dynamically +The NPU does have a dedicated tightly-coupled memory called VTCM but that memory is used only for intermediate data (e.g. dynamically quantized tensors) or temporary data (chunks of the weight tensors fetched via DMA). -Please note that currently the Hexagon backend does not implement SET/GET_ROWS Ops because there is no advantage in offloading those +Please note that currently the Hexagon backend does not implement SET/GET_ROWS Ops because there is no advantage in offloading those to the NPU at this point. The backend does allocates non-host buffers for the tensors with datatypes that require repacking: Q4_0, Q8_0, MXFP4. -From the MMU perspective these buffers are still regular buffers (normal access by the CPU) they are marked as non-host simply to force +From the MMU perspective these buffers are still regular buffers (normal access by the CPU) they are marked as non-host simply to force the repacking. ## Large model handling @@ -46,60 +48,62 @@ Each Hexagon device behaves like a GPU from the offload and model splitting pers Here is an example of running GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR. - M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt docs/backend/hexagon/run-cli.sh -no-cnv -f surfing.txt -n 32 - ... 
- LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib - ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib - GGML_HEXAGON_NDEV=4 ./bin/llama-cli --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf - -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on -ngl 99 --device HTP0,HTP1,HTP2,HTP3 -no-cnv -f surfing.txt - ... - llama_model_loader: - type f32: 289 tensors - llama_model_loader: - type q4_0: 96 tensors - llama_model_loader: - type q8_0: 2 tensors - llama_model_loader: - type mxfp4: 72 tensors - ... - load_tensors: offloaded 25/25 layers to GPU - load_tensors: CPU model buffer size = 1182.09 MiB - load_tensors: HTP1 model buffer size = 6.64 MiB - load_tensors: HTP1-REPACK model buffer size = 2505.94 MiB - load_tensors: HTP3 model buffer size = 5.55 MiB - load_tensors: HTP3-REPACK model buffer size = 2088.28 MiB - load_tensors: HTP0 model buffer size = 7.75 MiB - load_tensors: HTP0-REPACK model buffer size = 2923.59 MiB - load_tensors: HTP2 model buffer size = 6.64 MiB - load_tensors: HTP2-REPACK model buffer size = 2505.94 MiB - ... - llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized - llama_context: CPU output buffer size = 0.77 MiB - llama_kv_cache_iswa: creating non-SWA KV cache, size = 8192 cells - llama_kv_cache: HTP1 KV buffer size = 25.50 MiB - llama_kv_cache: HTP3 KV buffer size = 25.50 MiB - llama_kv_cache: HTP0 KV buffer size = 25.50 MiB - llama_kv_cache: HTP2 KV buffer size = 25.50 MiB - llama_kv_cache: size = 102.00 MiB ( 8192 cells, 12 layers, 1/1 seqs), K (q8_0): 51.00 MiB, V (q8_0): 51.00 MiB - llama_kv_cache_iswa: creating SWA KV cache, size = 256 cells - llama_kv_cache: HTP1 KV buffer size = 0.80 MiB - llama_kv_cache: HTP3 KV buffer size = 0.53 MiB - llama_kv_cache: HTP0 KV buffer size = 1.06 MiB - llama_kv_cache: HTP2 KV buffer size = 0.80 MiB - llama_kv_cache: size = 3.19 MiB ( 256 cells, 12 layers, 1/1 seqs), K (q8_0): 1.59 MiB, V (q8_0): 1.59 MiB - llama_context: HTP0 compute buffer size = 16.06 MiB - llama_context: HTP1 compute buffer size = 16.06 MiB - llama_context: HTP2 compute buffer size = 16.06 MiB - llama_context: HTP3 compute buffer size = 16.06 MiB - llama_context: CPU compute buffer size = 98.19 MiB - ... 
- llama_perf_context_print: prompt eval time = 3843.67 ms / 197 tokens ( 19.51 ms per token, 51.25 tokens per second) - llama_perf_context_print: eval time = 1686.13 ms / 31 runs ( 54.39 ms per token, 18.39 tokens per second) - llama_perf_context_print: total time = 6266.30 ms / 228 tokens - llama_perf_context_print: graphs reused = 30 - llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | - llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | - llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | - llama_memory_breakdown_print: | - HTP2 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | - llama_memory_breakdown_print: | - HTP3 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | - llama_memory_breakdown_print: | - Host | 1476 = 1208 + 105 + 162 | - llama_memory_breakdown_print: | - HTP1-REPACK | 2505 = 2505 + 0 + 0 | - llama_memory_breakdown_print: | - HTP3-REPACK | 2088 = 2088 + 0 + 0 | - llama_memory_breakdown_print: | - HTP0-REPACK | 2923 = 2923 + 0 + 0 | - llama_memory_breakdown_print: | - HTP2-REPACK | 2505 = 2505 + 0 + 0 | +``` +M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt docs/backend/hexagon/run-cli.sh -no-cnv -f surfing.txt -n 32 +... +LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib +ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib +GGML_HEXAGON_NDEV=4 ./bin/llama-cli --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf + -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on -ngl 99 --device HTP0,HTP1,HTP2,HTP3 -no-cnv -f surfing.txt +... +llama_model_loader: - type f32: 289 tensors +llama_model_loader: - type q4_0: 96 tensors +llama_model_loader: - type q8_0: 2 tensors +llama_model_loader: - type mxfp4: 72 tensors +... +load_tensors: offloaded 25/25 layers to GPU +load_tensors: CPU model buffer size = 1182.09 MiB +load_tensors: HTP1 model buffer size = 6.64 MiB +load_tensors: HTP1-REPACK model buffer size = 2505.94 MiB +load_tensors: HTP3 model buffer size = 5.55 MiB +load_tensors: HTP3-REPACK model buffer size = 2088.28 MiB +load_tensors: HTP0 model buffer size = 7.75 MiB +load_tensors: HTP0-REPACK model buffer size = 2923.59 MiB +load_tensors: HTP2 model buffer size = 6.64 MiB +load_tensors: HTP2-REPACK model buffer size = 2505.94 MiB +... +llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: CPU output buffer size = 0.77 MiB +llama_kv_cache_iswa: creating non-SWA KV cache, size = 8192 cells +llama_kv_cache: HTP1 KV buffer size = 25.50 MiB +llama_kv_cache: HTP3 KV buffer size = 25.50 MiB +llama_kv_cache: HTP0 KV buffer size = 25.50 MiB +llama_kv_cache: HTP2 KV buffer size = 25.50 MiB +llama_kv_cache: size = 102.00 MiB ( 8192 cells, 12 layers, 1/1 seqs), K (q8_0): 51.00 MiB, V (q8_0): 51.00 MiB +llama_kv_cache_iswa: creating SWA KV cache, size = 256 cells +llama_kv_cache: HTP1 KV buffer size = 0.80 MiB +llama_kv_cache: HTP3 KV buffer size = 0.53 MiB +llama_kv_cache: HTP0 KV buffer size = 1.06 MiB +llama_kv_cache: HTP2 KV buffer size = 0.80 MiB +llama_kv_cache: size = 3.19 MiB ( 256 cells, 12 layers, 1/1 seqs), K (q8_0): 1.59 MiB, V (q8_0): 1.59 MiB +llama_context: HTP0 compute buffer size = 16.06 MiB +llama_context: HTP1 compute buffer size = 16.06 MiB +llama_context: HTP2 compute buffer size = 16.06 MiB +llama_context: HTP3 compute buffer size = 16.06 MiB +llama_context: CPU compute buffer size = 98.19 MiB +... 
+llama_perf_context_print: prompt eval time = 3843.67 ms / 197 tokens ( 19.51 ms per token, 51.25 tokens per second) +llama_perf_context_print: eval time = 1686.13 ms / 31 runs ( 54.39 ms per token, 18.39 tokens per second) +llama_perf_context_print: total time = 6266.30 ms / 228 tokens +llama_perf_context_print: graphs reused = 30 +llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | +llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | +llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | +llama_memory_breakdown_print: | - HTP2 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | +llama_memory_breakdown_print: | - HTP3 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | +llama_memory_breakdown_print: | - Host | 1476 = 1208 + 105 + 162 | +llama_memory_breakdown_print: | - HTP1-REPACK | 2505 = 2505 + 0 + 0 | +llama_memory_breakdown_print: | - HTP3-REPACK | 2088 = 2088 + 0 + 0 | +llama_memory_breakdown_print: | - HTP0-REPACK | 2923 = 2923 + 0 + 0 | +llama_memory_breakdown_print: | - HTP2-REPACK | 2505 = 2505 + 0 + 0 | +``` diff --git a/docs/backend/hexagon/run-cli.sh b/docs/backend/hexagon/run-cli.sh index a236bcd43b1..763482e55ab 100755 --- a/docs/backend/hexagon/run-cli.sh +++ b/docs/backend/hexagon/run-cli.sh @@ -25,7 +25,7 @@ experimental= [ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E" sched= -[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" +[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" profile= [ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" @@ -47,6 +47,6 @@ adb $adbserial shell " \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \ $verbose $experimental $sched $opmask $profile $nhvx $ndev \ ./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \ - -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \ - -ngl 99 --device $device $cli_opts $@ \ + -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \ + -ngl 99 --device $device $cli_opts $@ \ " diff --git a/docs/backend/hexagon/run-tool.sh b/docs/backend/hexagon/run-tool.sh index 71ac1dc98c3..bfc213e4c5f 100755 --- a/docs/backend/hexagon/run-tool.sh +++ b/docs/backend/hexagon/run-tool.sh @@ -22,7 +22,7 @@ experimental= [ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$V" sched= -[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" +[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" profile= [ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index c8a48dc9034..6f7eb3f1832 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -33,7 +33,7 @@ ggml_add_backend_library(${TARGET_NAME} ggml-hexagon.cpp ../../include/ggml-hexa target_link_libraries(${TARGET_NAME} PRIVATE htp_iface) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/htp ${CMAKE_CURRENT_BINARY_DIR}) -# Build HTP bits +# Build HTP bits set(HTP_CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/htp/cmake-toolchain.cmake -DCMAKE_BUILD_TYPE=Release diff --git a/ggml/src/ggml-hexagon/htp-utils.c b/ggml/src/ggml-hexagon/htp-utils.c index 0b31438bbb5..0c64b657fa7 100644 --- a/ggml/src/ggml-hexagon/htp-utils.c +++ b/ggml/src/ggml-hexagon/htp-utils.c @@ -177,9 +177,9 @@ int get_vtcm_info(int domain, uint32_t * 
capability, uint32_t attr) { } if (remote_handle_control) { if (domain == ADSP_DOMAIN_ID || domain == CDSP_DOMAIN_ID) { - /* - * Query the DSP for VTCM information - * Since the ADSP does not have a dedicated VTCM, we expect the output to be 0 + /* + * Query the DSP for VTCM information + * Since the ADSP does not have a dedicated VTCM, we expect the output to be 0 */ struct remote_dsp_capability dsp_capability_vtcm_dsp; dsp_capability_vtcm_dsp.domain = (uint32_t) domain; @@ -244,9 +244,9 @@ bool is_async_fastrpc_supported(int domain) { int nErr = AEE_SUCCESS; if (remote_handle_control) { if (domain == CDSP_DOMAIN_ID) { - /* - * Query the DSP for ASYNC_FASTRPC_SUPPORT information - * Async fastrpc is supported only on CDSP + /* + * Query the DSP for ASYNC_FASTRPC_SUPPORT information + * Async fastrpc is supported only on CDSP */ struct remote_dsp_capability dsp_capability_async_support; dsp_capability_async_support.domain = (uint32_t) domain; @@ -284,9 +284,9 @@ bool is_status_notification_supported(int domain) { int nErr = AEE_SUCCESS; if (remote_handle_control) { - /* - * Query the DSP for STATUS_NOTIFICATION_SUPPORT information - * DSP User PD status notification Support + /* + * Query the DSP for STATUS_NOTIFICATION_SUPPORT information + * DSP User PD status notification Support */ struct remote_dsp_capability dsp_capability_status_notification_support; dsp_capability_status_notification_support.domain = (uint32_t) domain; @@ -326,9 +326,9 @@ int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) { } if (remote_handle_control) { if (domain == CDSP_DOMAIN_ID) { - /* - * Query the DSP for HMX SUPPORT information - * HMX is supported on CDSP only + /* + * Query the DSP for HMX SUPPORT information + * HMX is supported on CDSP only */ struct remote_dsp_capability dsp_capability_hmx_dsp; dsp_capability_hmx_dsp.domain = (uint32_t) domain; @@ -406,9 +406,9 @@ int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr) { if (remote_handle_control) { if (domain == CDSP_DOMAIN_ID) { - /* - * Query the DSP for HVX SUPPORT information - * HVX is supported on CDSP only + /* + * Query the DSP for HVX SUPPORT information + * HVX is supported on CDSP only */ struct remote_dsp_capability dsp_capability_hvx_dsp; dsp_capability_hvx_dsp.domain = (uint32_t) domain; diff --git a/ggml/src/ggml-hexagon/htp-utils.h b/ggml/src/ggml-hexagon/htp-utils.h index caba3e174a2..66f9fd373e2 100644 --- a/ggml/src/ggml-hexagon/htp-utils.h +++ b/ggml/src/ggml-hexagon/htp-utils.h @@ -10,7 +10,7 @@ extern "C" { #include #include -/* Offset to differentiate HLOS and Hexagon error codes. +/* Offset to differentiate HLOS and Hexagon error codes. Stores the value of AEE_EOFFSET for Hexagon. */ #ifndef DSP_OFFSET # define DSP_OFFSET 0x80000400 @@ -23,7 +23,7 @@ extern "C" { # endif #endif -/* Abstraction of different OS specific sleep APIs. +/* Abstraction of different OS specific sleep APIs. SLEEP accepts input in seconds. */ #ifndef SLEEP # ifdef __hexagon__ @@ -69,146 +69,146 @@ extern "C" { #if !defined(_WINDOWS) # pragma weak remote_system_request #endif -/** - * Wrapper for FastRPC Capability API: query DSP support. - * - * @param[out] domain pointer to supported domain. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. +/** + * Wrapper for FastRPC Capability API: query DSP support. + * + * @param[out] domain pointer to supported domain. + * @return 0 if query is successful. + * non-zero if error, return value points to the error. 
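+ *
+ * Illustrative usage sketch (not part of the original header; it assumes only
+ * the declaration below):
+ *
+ *   int domain = -1;
+ *   if (get_dsp_support(&domain) == 0) {
+ *       // query succeeded: 'domain' now holds a DSP domain id supported on this target
+ *   }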
*/ int get_dsp_support(int * domain); -/** - * Wrapper for FastRPC Capability API: query VTCM information. - * - * @param[in] domain value of domain in the queried. - * @param[out] capability capability value of the attribute queried. - * @param[in] attr value of the attribute to the queried. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. +/** + * Wrapper for FastRPC Capability API: query VTCM information. + * + * @param[in] domain value of domain in the queried. + * @param[out] capability capability value of the attribute queried. + * @param[in] attr value of the attribute to the queried. + * @return 0 if query is successful. + * non-zero if error, return value points to the error. */ int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr); -/** - * Wrapper for FastRPC Capability API: query unsigned pd support on CDSP domain. - * - * @return true if unsigned pd is supported. - * false if unsigned pd is not supported, capability query failed. +/** + * Wrapper for FastRPC Capability API: query unsigned pd support on CDSP domain. + * + * @return true if unsigned pd is supported. + * false if unsigned pd is not supported, capability query failed. */ bool get_unsignedpd_support(void); -/** - * Wrapper for FastRPC Capability API: query unsigned pd support. - * - * @param[in] domain value of domain in the queried. - * @return true if unsigned pd is supported. - * false if unsigned pd is not supported, capability query failed. +/** + * Wrapper for FastRPC Capability API: query unsigned pd support. + * + * @param[in] domain value of domain in the queried. + * @return true if unsigned pd is supported. + * false if unsigned pd is not supported, capability query failed. */ bool is_unsignedpd_supported(int domain_id); -/** - * is_valid_domain_id API: query a domain id is valid. - * - * @param[in] domain value of domain in the queried. - * @param[in] compute_only value of domain is only compared with CDSP domains supported by the target when enabled. - * @return true if value of domain is valid. - * false if value of domain is not valid. +/** + * is_valid_domain_id API: query a domain id is valid. + * + * @param[in] domain value of domain in the queried. + * @param[in] compute_only value of domain is only compared with CDSP domains supported by the target when enabled. + * @return true if value of domain is valid. + * false if value of domain is not valid. */ bool is_valid_domain_id(int domain_id, int compute_only); -/** - * get_domain API: get domain struct from domain value. - * - * @param[in] domain value of a domain - * @return Returns domain struct of the domain if it is supported or else - * returns NULL. - * +/** + * get_domain API: get domain struct from domain value. + * + * @param[in] domain value of a domain + * @return Returns domain struct of the domain if it is supported or else + * returns NULL. + * */ domain * get_domain(int domain_id); -/** - * get_domains_info API: get information for all the domains available on the device - * - * @param[in] domain_type pointer to domain type - * @param[in] num_domains pointer to number of domains - * @param[in] domains_info pointer to save discovered domains information. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - * - * It is user's responsibility to free the memory used to store the domains info whose address is present in domains_info before closing the application. 
- * +/** + * get_domains_info API: get information for all the domains available on the device + * + * @param[in] domain_type pointer to domain type + * @param[in] num_domains pointer to number of domains + * @param[in] domains_info pointer to save discovered domains information. + * @return 0 if query is successful. + * non-zero if error, return value points to the error. + * + * It is user's responsibility to free the memory used to store the domains info whose address is present in domains_info before closing the application. + * */ int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info); -/** - * get_effective_domain_id API: get effective domain id for given session id - * - * @param[in] domain_name pointer to domain name - * @param[in] session_id - * @param[in] effec_domain_id pointer to save obtained effective domain id. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - * +/** + * get_effective_domain_id API: get effective domain id for given session id + * + * @param[in] domain_name pointer to domain name + * @param[in] session_id + * @param[in] effec_domain_id pointer to save obtained effective domain id. + * @return 0 if query is successful. + * non-zero if error, return value points to the error. + * */ int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id); -/** - * is_async_fastrpc_supported API: query a domain id has async fastrpc supported or not - * - * @param[in] domain_id value of a domain - * @return Returns true or false stating support of Async FastRPC - * +/** + * is_async_fastrpc_supported API: query a domain id has async fastrpc supported or not + * + * @param[in] domain_id value of a domain + * @return Returns true or false stating support of Async FastRPC + * */ bool is_async_fastrpc_supported(int domain_id); -/** - * is_status_notification_supported API: query the DSP for STATUS_NOTIFICATION_SUPPORT information - * - * @param[in] domain_id value of a domain - * @return Returns true or false stating status notification support information - * +/** + * is_status_notification_supported API: query the DSP for STATUS_NOTIFICATION_SUPPORT information + * + * @param[in] domain_id value of a domain + * @return Returns true or false stating status notification support information + * */ bool is_status_notification_supported(int domain_id); -/** - * get_hmx_support_info API: query the DSP for HMX SUPPORT information - * - * @param[in] domain_id value of a domain - * @param[out] capability capability value of the attribute queried. - * @param[in] attr value of the attribute to the queried. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - * +/** + * get_hmx_support_info API: query the DSP for HMX SUPPORT information + * + * @param[in] domain_id value of a domain + * @param[out] capability capability value of the attribute queried. + * @param[in] attr value of the attribute to the queried. + * @return 0 if query is successful. + * non-zero if error, return value points to the error. + * */ int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr); -/** - * get_hex_arch_ver API: query the Hexagon processor architecture version information - * - * @param[in] domain_id value of a domain - * @param[out] Arch version (73, 75, ...) - * @return 0 if query is successful. - * non-zero if error, return value points to the error. 
- * +/** + * get_hex_arch_ver API: query the Hexagon processor architecture version information + * + * @param[in] domain_id value of a domain + * @param[out] Arch version (73, 75, ...) + * @return 0 if query is successful. + * non-zero if error, return value points to the error. + * */ int get_hex_arch_ver(int domain, int * arch); -/** - * get_hvx_support_info API: query the DSP for HVX SUPPORT information - * - * @param[in] domain_id value of a domain - * @param[out] capability capability value of the attribute queried. - * @param[in] attr value of the attribute to the queried. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - * +/** + * get_hvx_support_info API: query the DSP for HVX SUPPORT information + * + * @param[in] domain_id value of a domain + * @param[out] capability capability value of the attribute queried. + * @param[in] attr value of the attribute to the queried. + * @return 0 if query is successful. + * non-zero if error, return value points to the error. + * */ int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr); diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt index 9be5df55083..22e3fea11d8 100644 --- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt @@ -31,7 +31,7 @@ add_library(${HTP_LIB} SHARED ) target_compile_definitions(${HTP_LIB} PRIVATE - $,HTP_DEBUG=1,NDEBUG=1>) + $,HTP_DEBUG=1,NDEBUG=1>) build_idl(htp_iface.idl ${HTP_LIB}) diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c index d2964a04271..19f6795083c 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-exp.c +++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c @@ -38,7 +38,7 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int HVX_Vector * p_vec_in1 = (HVX_Vector *) src; HVX_Vector * p_vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++); @@ -48,7 +48,7 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int } } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); diff --git a/ggml/src/ggml-hexagon/htp/hvx-inverse.c b/ggml/src/ggml-hexagon/htp/hvx-inverse.c index 4b4043dcbde..4cf588a8781 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-inverse.c +++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.c @@ -36,12 +36,12 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const HVX_Vector * p_vec_in = (HVX_Vector *) src; HVX_Vector * p_vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { *p_vec_out++ = hvx_vec_inverse_fp32(*p_vec_in++); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32(in); diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index d59f8a303fc..a056e165326 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -54,13 +54,13 @@ void hvx_mul_f32(const uint8_t * restrict src0, HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; 
-#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); @@ -263,13 +263,13 @@ void hvx_add_f32(const uint8_t * restrict src0, HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); @@ -404,13 +404,13 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, val_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -453,13 +453,13 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, val_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -505,13 +505,13 @@ void hvx_sub_f32(const uint8_t * restrict src0, HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, *vec_in2++); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32); HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32); @@ -646,13 +646,13 @@ void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, val_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -688,7 +688,7 @@ float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) HVX_Vector sum_vec_acc = Q6_V_vsplat_R(0x00000000); HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000); -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; 
i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1, *vec_in1); sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, v); @@ -732,13 +732,13 @@ float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) { if (0 == unaligned_loop) { HVX_Vector * vec_in = (HVX_Vector *) src; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, *vec_in++); sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), *vec_in++); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -781,13 +781,13 @@ void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const i HVX_Vector * vec_in1 = (HVX_Vector *) src; HVX_Vector * vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, scale_vec); *vec_out++ = Q6_Vsf_equals_Vqf32(v); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -830,12 +830,12 @@ float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) { if (0 == unaligned_loop) { HVX_Vector * restrict vec_in = (HVX_Vector *) src; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, *vec_in++); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); @@ -873,7 +873,7 @@ void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector * restrict vec_in = (HVX_Vector *) src; HVX_Vector * restrict vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { vec_min = Q6_Vsf_vmin_VsfVsf(vec_min, *vec_in++); *vec_out++ = Q6_Vsf_equals_Vqf32(vec_min); @@ -911,7 +911,7 @@ void hvx_clamp_scalar_f32(const uint8_t * restrict src, HVX_Vector range_left = hvx_vec_splat_fp32(limit_left); HVX_Vector range_right = hvx_vec_splat_fp32(limit_right); -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in_vec = *vec_in++; HVX_Vector temp_v = in_vec; diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 9a88b46306e..650ad48476b 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -80,7 +80,7 @@ static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -104,7 +104,7 @@ static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -128,7 +128,7 @@ static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -153,7 +153,7 @@ static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -177,7 +177,7 @@ static 
inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -201,7 +201,7 @@ static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -226,7 +226,7 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { vdst[i] = velem; } @@ -926,7 +926,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * const HVX_Vector * restrict v_src = (HVX_Vector *) src; HVX_Vector * restrict v_dst = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { v_dst[i] = hvx_vec_fast_sigmoid_fp32(v_src[i]); } diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 10d9ebbd597..f92773ac1c6 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -1089,8 +1089,8 @@ static void matmul(struct htp_matmul_type * mt, const uint8_t * restrict src0_row = (const uint8_t *) src0->data; -// Prefill spad with src0 rows -#pragma unroll(4) + // Prefill spad with src0 rows + #pragma unroll(4) for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { const int is0 = (ir0 - src0_start_row); if (is0 >= HTP_SPAD_SRC0_NROWS) { @@ -1104,7 +1104,7 @@ static void matmul(struct htp_matmul_type * mt, for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { const uint8_t * ss0 = dma_queue_pop(dma_queue); -#pragma unroll(2) + #pragma unroll(2) for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) { const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_row_size); float * restrict dst_row = (float *) (dst->data + (ir1 * dst_row_size)); @@ -1128,7 +1128,7 @@ static void matmul(struct htp_matmul_type * mt, src0_row_size_padded, src0_row_size, 1); const uint8_t * ss0 = dma_queue_pop(dma_queue); -#pragma unroll(2) + #pragma unroll(2) for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) { const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_row_size); float * restrict dst_row = (float *) (dst->data + (ir1 * dst_row_size)); @@ -1191,8 +1191,8 @@ static void matvec(struct htp_matmul_type * mt, const uint8_t * restrict src1_col = (const uint8_t *) src1_data; float * restrict dst_col = (float *) dst->data; -// Prefill spad with 2x src0 rows -#pragma unroll(2) + // Prefill spad with 2x src0 rows + #pragma unroll(2) for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { const uint32_t is0 = (ir0 - src0_start_row); if (is0 >= HTP_SPAD_SRC0_NROWS) { @@ -1305,8 +1305,8 @@ static void matmul_id(struct htp_matmul_type * mt, const uint8_t * src0_row = (const uint8_t *) src0->data + (0 + cur_a * nb02 + 0); -// Prefill spad with src0 rows -#pragma unroll(4) + // Prefill spad with src0 rows + #pragma unroll(4) for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { const int is0 = (ir0 - src0_start_row); if (is0 >= HTP_SPAD_SRC0_NROWS) { @@ -1429,8 +1429,8 @@ static void matvec_id(struct htp_matmul_type * mt, const uint8_t * restrict src1_col = (const uint8_t *) src1_data; float * restrict dst_row = (float *) (dst->data + ie1 * nb1); -// Prefill spad with src0 rows -#pragma unroll(4) + // Prefill spad with src0 rows + #pragma unroll(4) for 
(uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { const int is0 = (ir0 - src0_start_row); if (is0 >= HTP_SPAD_SRC0_NROWS) { diff --git a/ggml/src/ggml-hexagon/htp/softmax-ops.c b/ggml/src/ggml-hexagon/htp/softmax-ops.c index 5fea4221afa..5bf0cbf7922 100644 --- a/ggml/src/ggml-hexagon/htp/softmax-ops.c +++ b/ggml/src/ggml-hexagon/htp/softmax-ops.c @@ -105,7 +105,7 @@ static void hvx_fast_softmax_prep_f32(const uint8_t * restrict src, int step_of_1 = num_elems >> 5; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { HVX_Vector v1 = *(HVX_Vector *) src_curr; @@ -140,7 +140,7 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src, int step_of_1 = num_elems >> 5; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { HVX_Vector v1 = v_src[i]; max_vec = Q6_Vsf_vmax_VsfVsf(max_vec, v1); @@ -149,7 +149,7 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src, HVX_Vector v = hvx_vec_reduce_max_fp32(max_vec); max_vec = hvx_vec_repl4(v); -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { HVX_Vector v1 = v_src[i]; HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, max_vec); @@ -168,7 +168,7 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src, HVX_Vector v4 = hvx_vec_inverse_fp32(sum_vec); HVX_Vector scale_vec = Q6_V_vmux_QVV(pos_sum, v4, one_v); -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { HVX_Vector v1 = v_pad[i]; HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_vec); diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c index 85b60779b79..bb7557b0252 100644 --- a/ggml/src/ggml-hexagon/htp/unary-ops.c +++ b/ggml/src/ggml-hexagon/htp/unary-ops.c @@ -58,7 +58,7 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src, HVX_Vector epsilon_v = hvx_vec_splat_fp32(epsilon); int step_of_1 = num_elems >> 5; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { HVX_Vector v1 = v_src[i]; HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1); @@ -75,7 +75,7 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src, HVX_Vector scale_v = hvx_vec_rsqrt_fp32(Q6_Vsf_equals_Vqf32(mean_epsilon_v)); -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { HVX_Vector v1 = v_src[i]; HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v); From aa65f212615bbecbb1902ecc1876df8691f7d80b Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 14 Oct 2025 16:40:41 -0700 Subject: [PATCH 04/37] hexagon: update readme and cmake presets --- docs/backend/hexagon/CMakeUserPresets.json | 2 +- docs/backend/hexagon/README.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/backend/hexagon/CMakeUserPresets.json b/docs/backend/hexagon/CMakeUserPresets.json index da0c03c1d9b..e00e5c4d436 100644 --- a/docs/backend/hexagon/CMakeUserPresets.json +++ b/docs/backend/hexagon/CMakeUserPresets.json @@ -9,7 +9,7 @@ "cacheVariables": { "ANDROID_ABI": "arm64-v8a", "ANDROID_PLATFORM": "android-31", - "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK}/build/cmake/android.toolchain.cmake", + "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake", "CMAKE_C_FLAGS": "-march=armv8.7a -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", "CMAKE_CXX_FLAGS": "-march=armv8.7a -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", diff --git a/docs/backend/hexagon/README.md b/docs/backend/hexagon/README.md index 
51e0026c629..f35fb0413f4 100644 --- a/docs/backend/hexagon/README.md +++ b/docs/backend/hexagon/README.md @@ -8,7 +8,7 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc. This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop. ``` -~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.1 +~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.2 [d]/> cd /workspace ``` @@ -106,7 +106,7 @@ Here are some examples of running various llama.cpp tools via ADB. Simple question for Llama-3.2-1B ``` -~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./docs/backend/snapdragon/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?" +~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./docs/backend/hexagon/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?" ... ggml-hex: HTP arch version v79 ggml-hex: allocating new session: HTP0 From 647fa3de4d59f563075e17aeb0bfaaff31b564b1 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 14 Oct 2025 18:30:19 -0700 Subject: [PATCH 05/37] ci: add android-ndk-build jobs that build plain ARM64 and Snapdragon versions --- .github/workflows/build.yml | 75 +++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index fe868638936..1c769d74236 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1305,6 +1305,81 @@ jobs: cd examples/llama.android ./gradlew build --no-daemon + android-ndk-build: + runs-on: ubuntu-latest + + env: + OPENCL_VERSION: 2025.07.22 + + strategy: + matrix: + include: + - build: 'arm64-cpu' + defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_CURL=OFF -D GGML_OPENMP=OFF' + - build: 'arm64-snapdragon' + defines: '--preset arm64-android-snapdragon-release' + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Install OpenCL Headers and Libs + id: install_opencl + if: ${{ matrix.build == 'arm64-snapdragon' }} + run: | + mkdir opencl + curl -L -o opencl/clhpp.tar.gz https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz + curl -L -o opencl/headers.tar.gz https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz + curl -L -o opencl/icd-loader.tar.gz https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz + tar -xaf opencl/headers.tar.gz -C opencl + tar -xaf opencl/clhpp.tar.gz -C opencl + tar -xaf opencl/icd-loader.tar.gz -C opencl + sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include + sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL + cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION} + cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 
-DANDROID_STL=c++_shared + cmake --build build + sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android + rm -rf opencl + + - name: Install Hexagon SDK + id: install_hexsdk + if: ${{ matrix.build == 'arm64-snapdragon' }} + env: + HEXSDK_VER: 6.3.0 + HEXTLS_VER: 19.0.04 + run: | + curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz + mkdir hex-sdk + tar -xaf hex-sdk.tar.gz -C hex-sdk + ls -l hex-sdk + sudo mv hex-sdk /opt/hexagon + echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER.0" >> "$GITHUB_ENV" + echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER.0/tools/HEXAGON_Tools/$HEXTLS_VER" >> "$GITHUB_ENV" + echo "DEFAULT_HLOS_ARCH=64" >> "$GITHUB_ENV" + echo "DEFAULT_TOOLS_VARIANT=toolv19" >> "$GITHUB_ENV" + echo "DEFAULT_NO_QURT_INC=0" >> "$GITHUB_ENV" + echo "DEFAULT_DSP_ARCH=v73" >> "$GITHUB_ENV" + + - name: Update CMake presets + id: update_presets + if: ${{ matrix.build == 'arm64-snapdragon' }} + run: | + cp docs/backend/hexagon/CMakeUserPresets.json . + + - name: Build + id: ndk_build + run: | + cmake ${{ matrix.defines }} -B build + cmake --build build + cmake --install build --prefix pkg + + - name: Test + id: cmake_test + run: | + echo "FIXME: test on devices" + openEuler-latest-cmake-cann: if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }} defaults: From da7caac44470a5ee206f51e806770d914290e13c Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 14 Oct 2025 12:13:47 -0700 Subject: [PATCH 06/37] hexagon: add simple graph optimizer for stacking MUL_MAT ops with the same input --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 165 ++++++++++++++++++++++++- 1 file changed, 163 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index e2f05cb2ad5..f592fff1b12 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2404,7 +2404,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag t2 = ggml_time_us(); - HEX_VERBOSE( + HEX_PROFILE( "ggml-hex: %s matmul-id %s %u:%u:%u:%u x %s %u:%u:%u:%u (%s %u:%u:%u:%u) -> %s %u:%u:%u:%u : op-usec %u " "op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess->name.c_str(), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], @@ -3211,6 +3211,167 @@ static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) { } } +struct node_info { + ggml_tensor * node; + + std::vector fused; + + ggml_op op() const { + return node->op; + } + + const ggml_tensor * dst() const { + return fused.empty() ? node : fused.back(); + } + + const ggml_tensor * src1() const { + return node->src[1]; + } + + bool is_empty() const { + return ggml_op_is_empty(node->op); + } + + void add_fused(ggml_tensor * t) { + fused.push_back(t); + } + + bool stackable() const { + switch (this->op()) { + case GGML_OP_MUL_MAT: + case GGML_OP_MUL_MAT_ID: + return true; + default: + return false; + } + } + + bool same_input(const node_info& n) const { + return n.src1() == this->src1(); + } +}; + +static std::vector ggml_hexagon_graph_optimize_reorder(const std::vector & nodes) { + const int n = nodes.size(); + + std::vector res; + res.reserve(n); + + std::vector used(n, false); + + // The main goal here is to stack the MUL_MAT ops with the same src1 input. 
+ // This allows use to reuse dynamically quantized src1 in VTCM. + + for (int i0 = 0; i0 < n; i0++) { + if (used[i0]) { + continue; + } + + res.push_back(i0); + + const auto & node0 = nodes[i0]; + + if (!node0.stackable()) { + continue; + } + + // that many nodes forward to search for stackable nodes that can reuse VTCM + constexpr int N_FORWARD = 8; + + for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) { + if (used[i1]) { + continue; + } + + const auto & node1 = nodes[i1]; + + if (node1.stackable() && node1.same_input(node0)) { + res.push_back(i1); + used[i1] = true; + } + } + } + + return res; +} + +static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgraph * gf) { + auto sess = static_cast(backend->context); + const int n = gf->n_nodes; + + constexpr int MAX_FUSE = 16; + + enum ggml_op ops[MAX_FUSE]; + + std::vector nodes; + nodes.reserve(gf->n_nodes); + + // fuse nodes: + // we don't want to make reorders that break fusing, so we first pack all fusable tensors + // and perform the reorder over the fused nodes. after the reorder is done, we unfuse + for (int i = 0; i < n; i++) { + node_info node = { + /*.node =*/ gf->nodes[i], + /*.fused =*/ {}, + }; + + // fuse only ops that start with these operations + // can be expanded when needed + if (node.op() == GGML_OP_ADD || + node.op() == GGML_OP_NORM || + node.op() == GGML_OP_RMS_NORM) { + ops[0] = node.op(); + + int f = i + 1; + while (f < n && f < i + MAX_FUSE) { + // conservatively allow fusing only these ops + // can be expanded when needed + if (gf->nodes[f]->op != GGML_OP_ADD && + gf->nodes[f]->op != GGML_OP_MUL && + gf->nodes[f]->op != GGML_OP_NORM && + gf->nodes[f]->op != GGML_OP_RMS_NORM) { + break; + } + ops[f - i] = gf->nodes[f]->op; + f++; + } + + f -= i; + for (; f > 1; f--) { + if (ggml_can_fuse(gf, i, ops, f)) { + break; + } + } + + // add the fused tensors into the node info so we can unfuse them later + for (int k = 1; k < f; k++) { + ++i; + + // the .dst() becomes the last fused tensor + node.add_fused(gf->nodes[i]); + } + } + + nodes.push_back(std::move(node)); + } + + const auto order = ggml_hexagon_graph_optimize_reorder(nodes); + + // unfuse + { + int j = 0; + for (const auto i : order) { + const auto & node = nodes[i]; + + gf->nodes[j++] = node.node; + + for (auto * fused : node.fused) { + gf->nodes[j++] = fused; + } + } + } +} + static struct ggml_backend_i hexagon_backend_i = { /* .get_name = */ ggml_backend_hexagon_name, /* .free = */ ggml_backend_hexagon_free, @@ -3225,7 +3386,7 @@ static struct ggml_backend_i hexagon_backend_i = { /* .graph_compute = */ ggml_backend_hexagon_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, - /* .graph_optimize = */ NULL, + /* .graph_optimize = */ ggml_backend_hexagon_graph_optimize, }; static ggml_guid_t ggml_backend_hexagon_guid() { From bbbc8eae03c7c248a6af1d674b069bc61467e954 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 15 Oct 2025 16:58:41 -0700 Subject: [PATCH 07/37] hexagon: move ADB helper scripts into scripts/snapdragon/adb --- docs/backend/hexagon/README.md | 19 ++++++++++--------- .../snapdragon/adb}/llama-cli.farf | 0 .../snapdragon/adb}/run-bench.sh | 2 +- .../snapdragon/adb}/run-cli.sh | 0 .../snapdragon/adb}/run-tool.sh | 0 5 files changed, 11 insertions(+), 10 deletions(-) rename {docs/backend/hexagon => scripts/snapdragon/adb}/llama-cli.farf (100%) rename {docs/backend/hexagon => scripts/snapdragon/adb}/run-bench.sh (86%) rename {docs/backend/hexagon => scripts/snapdragon/adb}/run-cli.sh (100%) 
rename {docs/backend/hexagon => scripts/snapdragon/adb}/run-tool.sh (100%) diff --git a/docs/backend/hexagon/README.md b/docs/backend/hexagon/README.md index f35fb0413f4..d15fbb5e0fc 100644 --- a/docs/backend/hexagon/README.md +++ b/docs/backend/hexagon/README.md @@ -106,9 +106,10 @@ Here are some examples of running various llama.cpp tools via ADB. Simple question for Llama-3.2-1B ``` -~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./docs/backend/hexagon/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?" +~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?" ... -ggml-hex: HTP arch version v79 +ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1 +ggml-hex: Hexagon Arch version v79 ggml-hex: allocating new session: HTP0 ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb4000072c7955e50 ... @@ -135,10 +136,10 @@ llama_memory_breakdown_print: | - HTP0-REPACK | 504 = Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices ``` -M=../gguf/OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 docs/backend/hexagon/run-cli.sh -f surfing.txt -no-cnv +~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-cli.sh -f surfing.txt -no-cnv ... -ggml-hex: allocating new registry : ndev 2 -ggml-hex: HTP arch version v81 +ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1 +ggml-hex: Hexagon Arch version v81 ggml-hex: allocating new session: HTP0 ggml-hex: allocating new session: HTP1 ... @@ -173,7 +174,7 @@ llama_memory_breakdown_print: | - HTP0-REPACK | 2025 = Op test for MUL_MAT ``` -~/src/llama.cpp$ HB=0 docs/backend/snapdragon/run-tool.sh test-backend-ops -b HTP0 -o MUL_MAT +~/src/llama.cpp$ HB=0 ./scripts/snapdragon/adb/run-tool.sh test-backend-ops -b HTP0 -o MUL_MAT ... Backend 2/3: HTP0 Device description: Hexagon @@ -182,10 +183,10 @@ MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v= MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK -~/src/llama.cpp-hexagon$ M=Llama-3.2-1B-Instruct-Q4_0.gguf docs/backend/hexagon/run-bench.sh -p 128 -n 64 +~/src/llama.cpp-hexagon$ M=Llama-3.2-1B-Instruct-Q4_0.gguf ./scripts/snapdragon/adb/run-bench.sh -p 128 -n 64 ... 
-ggml-hex: allocating new registry : ndev 1 -ggml-hex: HTP arch version v79 +ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1 +ggml-hex: Hexagon Arch version v79 ggml-hex: allocating new session: HTP0 ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb400007d4b231090 | model | size | params | backend | ngl | threads | n_batch | mmap | test | t/s | diff --git a/docs/backend/hexagon/llama-cli.farf b/scripts/snapdragon/adb/llama-cli.farf similarity index 100% rename from docs/backend/hexagon/llama-cli.farf rename to scripts/snapdragon/adb/llama-cli.farf diff --git a/docs/backend/hexagon/run-bench.sh b/scripts/snapdragon/adb/run-bench.sh similarity index 86% rename from docs/backend/hexagon/run-bench.sh rename to scripts/snapdragon/adb/run-bench.sh index 1e8a33abc0c..4e2b7798092 100755 --- a/docs/backend/hexagon/run-bench.sh +++ b/scripts/snapdragon/adb/run-bench.sh @@ -31,6 +31,6 @@ adb $adbserial shell " \ cd $basedir; \ LD_LIBRARY_PATH=$basedir/$branch/lib \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \ - $ndev $nhvx ./$branch/bin/llama-bench --mmap 0 -m $basedir/../gguf/$model \ + $ndev $nhvx ./$branch/bin/llama-bench --device $device --mmap 0 -m $basedir/../gguf/$model \ -t 4 --batch-size 128 -ngl 99 $@ \ " diff --git a/docs/backend/hexagon/run-cli.sh b/scripts/snapdragon/adb/run-cli.sh similarity index 100% rename from docs/backend/hexagon/run-cli.sh rename to scripts/snapdragon/adb/run-cli.sh diff --git a/docs/backend/hexagon/run-tool.sh b/scripts/snapdragon/adb/run-tool.sh similarity index 100% rename from docs/backend/hexagon/run-tool.sh rename to scripts/snapdragon/adb/run-tool.sh From cc7dbd4bf3491aca5738a966707e15b034e2fe1d Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 15 Oct 2025 18:31:53 -0700 Subject: [PATCH 08/37] hexagon: replace all f/printfs with GGML_LOG_... 
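
A rough sketch of the substitution pattern (illustrative only; the real changes
are in the diff below, and the exact format strings/arguments shown here are
placeholders):

    // before: ad-hoc stderr logging and hard exits
    HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p\n", name, base);
    fprintf(stderr, "ggml-hex: dspqueue_read_noblock failed: 0x%08x\n", err);
    exit(1);

    // after: routed through the common ggml logging/abort helpers
    GGML_LOG_DEBUG("ggml-hex: %s allocated buffer: base %p\n", name, base);
    GGML_ABORT("ggml-hex: dspqueue_read_noblock failed: 0x%08x\n", err);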
--- ggml/src/ggml-hexagon/CMakeLists.txt | 6 +- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 182 ++++++++++--------------- ggml/src/ggml-hexagon/htp-utils.c | 86 ++++++------ 3 files changed, 122 insertions(+), 152 deletions(-) diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index 6f7eb3f1832..60a154694b3 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -3,8 +3,7 @@ include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake) option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF) add_library(htp_iface OBJECT - ${CMAKE_CURRENT_BINARY_DIR}/htp_iface_stub.c - ${CMAKE_CURRENT_SOURCE_DIR}/htp-utils.c) + ${CMAKE_CURRENT_BINARY_DIR}/htp_iface_stub.c) set_target_properties(htp_iface PROPERTIES POSITION_INDEPENDENT_CODE ON) target_include_directories(htp_iface PUBLIC @@ -28,7 +27,8 @@ link_custom_library(htp_iface cdsprpc) link_custom_library(htp_iface rpcmem) set(TARGET_NAME ggml-hexagon) -ggml_add_backend_library(${TARGET_NAME} ggml-hexagon.cpp ../../include/ggml-hexagon.h) +ggml_add_backend_library(${TARGET_NAME} + ggml-hexagon.cpp htp-utils.c htp-utils.h ../../include/ggml-hexagon.h) target_link_libraries(${TARGET_NAME} PRIVATE htp_iface) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/htp ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index f592fff1b12..5dc7188b70e 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -51,14 +51,8 @@ static int opt_experimental = 0; static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMPUTE; static int opt_opsync = 0; // synchronous ops -#define HEX_VERBOSE(...) \ - if (opt_verbose) { \ - fprintf(stderr, __VA_ARGS__); \ - fflush(stderr); \ - } #define HEX_PROFILE(...) \ - if (opt_profile) \ - fprintf(stderr, __VA_ARGS__) + if (opt_profile) GGML_LOG_INFO(__VA_ARGS__) static inline uint64_t hex_is_aligned(void * addr, uint32_t align) { return ((size_t) addr & (align - 1)) == 0; @@ -270,18 +264,16 @@ static void htp_packet_callback(dspqueue_t queue, AEEResult error, void * contex } if (err != 0) { - fprintf(stderr, "ggml-hex: dspqueue_read_noblock failed: 0x%08x\n", (unsigned) err); - exit(1); + GGML_ABORT("ggml-hex: dspqueue_read_noblock failed: 0x%08x\n", (unsigned) err); } // Basic sanity checks if (rsp_size != sizeof(rsp)) { - fprintf(stderr, "ggml-hex: dspcall : bad response (size)\n"); - exit(1); + GGML_ABORT("ggml-hex: dspcall : bad response (size)\n"); } if (rsp.status != HTP_STATUS_OK) { - fprintf(stderr, "ggml-hex: dspcall : dsp-rsp: %s\n", status_to_str(rsp.status)); + GGML_LOG_ERROR("ggml-hex: dspcall : dsp-rsp: %s\n", status_to_str(rsp.status)); // TODO: handle errors } @@ -297,8 +289,7 @@ static void htp_packet_callback(dspqueue_t queue, AEEResult error, void * contex // Error callback - simply terminates with an error. Used where we don't // expect errors. 
[[noreturn]] static void htp_error_callback(dspqueue_t queue, AEEResult error, void * context) { - fprintf(stderr, "ggml-hex: dspcall general error 0x%x: for queue %p\n", error, (void *) queue); - exit(1); + GGML_ABORT("ggml-hex: dspcall general error 0x%x: for queue %p\n", error, (void *) queue); } // ** backend buffers @@ -315,13 +306,13 @@ struct ggml_backend_hexagon_buffer_type_context { struct ggml_backend_hexagon_buffer_context { bool mmap_to(ggml_hexagon_session * s) { - HEX_VERBOSE("ggml-hex: %s mmaping buffer: base %p domain-id %d session-id %d size %zu fd %d repack %d\n", + GGML_LOG_DEBUG("ggml-hex: %s mmaping buffer: base %p domain-id %d session-id %d size %zu fd %d repack %d\n", s->name.c_str(), (void *) this->base, s->domain_id, s->session_id, this->size, this->fd, (int) this->repack); int err = fastrpc_mmap(s->domain_id, this->fd, (void *) this->base, 0, this->size, FASTRPC_MAP_FD); if (err != 0) { - fprintf(stderr, "ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", + GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", s->domain_id, this->size, this->fd, (unsigned) err); return false; } @@ -354,19 +345,19 @@ struct ggml_backend_hexagon_buffer_context { this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); if (!this->base) { - fprintf(stderr, "ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size); + GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size); return; } this->fd = rpcmem_to_fd(this->base); if (this->fd < 0) { - fprintf(stderr, "ggml-hex: %s failed to get FD for buffer %p\n", sess->name.c_str(), (void *) this->base); + GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->name.c_str(), (void *) this->base); rpcmem_free(this->base); this->base = NULL; return; } - HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d repack %d\n", sess->name.c_str(), + GGML_LOG_DEBUG("ggml-hex: %s allocated buffer: base %p size %zu fd %d repack %d\n", sess->name.c_str(), (void *) this->base, size, this->fd, (int) repack); this->sess = sess; @@ -409,7 +400,7 @@ static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buf auto ctx = static_cast(buffer->context); auto sess = ctx->sess; - HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d repack %d\n", sess->name.c_str(), + GGML_LOG_DEBUG("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d repack %d\n", sess->name.c_str(), tensor->name, (void *) ctx->base, tensor->data, ggml_nbytes(tensor), (int) buffer->usage, (int) ctx->repack); @@ -436,7 +427,7 @@ static x2_q4 unpack_q4(uint8_t v) { } static void dump_block_q4_0(const block_q4_0 * b, int i) { - HEX_VERBOSE("ggml-hex: repack q4_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_q4(b->qs[0]).v[0], + GGML_LOG_DEBUG("ggml-hex: repack q4_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_q4(b->qs[0]).v[0], unpack_q4(b->qs[1]).v[0], unpack_q4(b->qs[2]).v[0], unpack_q4(b->qs[3]).v[0], unpack_q4(b->qs[12]).v[1], unpack_q4(b->qs[13]).v[1], unpack_q4(b->qs[14]).v[1], unpack_q4(b->qs[15]).v[1], GGML_FP16_TO_FP32(b->d)); @@ -454,13 +445,13 @@ static void dump_packed_block_q4x4x2(const uint8_t * v, unsigned int i, size_t k const uint8_t * q = v_q + i * qblk_size; const ggml_half * d = (const ggml_half *) (v_d + i * dblk_size); - HEX_VERBOSE("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... 
%d %d %d %d : %.6f %.6f %.6f %.6f\n", i, + GGML_LOG_DEBUG("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i, unpack_q4(q[0]).v[0], unpack_q4(q[1]).v[0], unpack_q4(q[2]).v[0], unpack_q4(q[3]).v[0], unpack_q4(q[60]).v[0], unpack_q4(q[61]).v[0], unpack_q4(q[62]).v[0], unpack_q4(q[63]).v[0], unpack_q4(q[124]).v[0], unpack_q4(q[125]).v[0], unpack_q4(q[126]).v[0], unpack_q4(q[127]).v[0], GGML_FP16_TO_FP32(d[0]), GGML_FP16_TO_FP32(d[1]), GGML_FP16_TO_FP32(d[2]), GGML_FP16_TO_FP32(d[3])); - HEX_VERBOSE("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", + GGML_LOG_DEBUG("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i + 1, unpack_q4(q[0]).v[1], unpack_q4(q[1]).v[1], unpack_q4(q[2]).v[1], unpack_q4(q[3]).v[1], unpack_q4(q[60]).v[1], unpack_q4(q[61]).v[1], unpack_q4(q[62]).v[1], unpack_q4(q[63]).v[1], unpack_q4(q[124]).v[1], unpack_q4(q[125]).v[1], unpack_q4(q[126]).v[1], unpack_q4(q[127]).v[1], @@ -669,7 +660,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) void * buf_rp = ggml_aligned_malloc(row_size_rp); GGML_ASSERT(buf_rp != NULL); - HEX_VERBOSE("ggml-hex: repack-q4_0-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, + GGML_LOG_DEBUG("ggml-hex: repack-q4_0-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, t->ne[0], nrows, row_size); init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros @@ -701,7 +692,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) void * buf_rp = ggml_aligned_malloc(row_size_rp); GGML_ASSERT(buf_rp != NULL); - HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, + GGML_LOG_DEBUG("ggml-hex: repack-q4x4x2-q4_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, t->ne[0], nrows, row_size); memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros @@ -721,7 +712,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) // ======== Q8x4x2 ==================== static void dump_block_q8_0(const block_q8_0 * b, int i) { - HEX_VERBOSE("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2], + GGML_LOG_DEBUG("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2], b->qs[3], b->qs[28], b->qs[29], b->qs[30], b->qs[31], GGML_FP16_TO_FP32(b->d)); } @@ -737,11 +728,11 @@ static void dump_packed_block_q8x4x2(const uint8_t * v, unsigned int i, size_t k const uint8_t * q = v_q + i * qblk_size; const ggml_half * d = (const ggml_half *) (v_d + i * dblk_size); - HEX_VERBOSE("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i, + GGML_LOG_DEBUG("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i, q[0], q[1], q[2], q[3], q[60], q[61], q[62], q[63], q[124], q[125], q[126], q[127], GGML_FP16_TO_FP32(d[0]), GGML_FP16_TO_FP32(d[1]), GGML_FP16_TO_FP32(d[2]), GGML_FP16_TO_FP32(d[3])); - HEX_VERBOSE("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", + GGML_LOG_DEBUG("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... 
%d %d %d %d : %.6f %.6f %.6f %.6f\n", i + 1, q[128], q[129], q[130], q[131], q[192], q[193], q[194], q[195], q[252], q[253], q[254], q[255], GGML_FP16_TO_FP32(d[4]), GGML_FP16_TO_FP32(d[5]), GGML_FP16_TO_FP32(d[6]), GGML_FP16_TO_FP32(d[7])); } @@ -943,7 +934,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) void * buf_rp = ggml_aligned_malloc(row_size_rp); GGML_ASSERT(buf_rp != NULL); - HEX_VERBOSE("ggml-hex: repack-q8_0-q8x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, + GGML_LOG_DEBUG("ggml-hex: repack-q8_0-q8x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, t->ne[0], nrows, row_size); init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros @@ -975,7 +966,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) void * buf_rp = ggml_aligned_malloc(row_size_rp); GGML_ASSERT(buf_rp != NULL); - HEX_VERBOSE("ggml-hex: repack-q8x4x2-q8_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, + GGML_LOG_DEBUG("ggml-hex: repack-q8x4x2-q8_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, t->ne[0], nrows, row_size); memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros @@ -1006,7 +997,7 @@ static x2_mxfp4 unpack_mxfp4(uint8_t v) { } static void dump_block_mxfp4(const block_mxfp4 * b, int i) { - HEX_VERBOSE("ggml-hex: repack mxfp4 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_mxfp4(b->qs[0]).v[0], + GGML_LOG_DEBUG("ggml-hex: repack mxfp4 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_mxfp4(b->qs[0]).v[0], unpack_mxfp4(b->qs[1]).v[0], unpack_mxfp4(b->qs[2]).v[0], unpack_mxfp4(b->qs[3]).v[0], unpack_mxfp4(b->qs[12]).v[1], unpack_mxfp4(b->qs[13]).v[1], unpack_mxfp4(b->qs[14]).v[1], unpack_mxfp4(b->qs[15]).v[1], GGML_E8M0_TO_FP32_HALF(b->e)); @@ -1024,14 +1015,14 @@ static void dump_packed_block_mxfp4x4x2(const uint8_t * v, unsigned int i, size_ const uint8_t * q = v_q + i * qblk_size; const uint8_t * e = (const uint8_t *) (v_e + i * eblk_size); - HEX_VERBOSE("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i, + GGML_LOG_DEBUG("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i, unpack_mxfp4(q[0]).v[0], unpack_mxfp4(q[1]).v[0], unpack_mxfp4(q[2]).v[0], unpack_mxfp4(q[3]).v[0], unpack_mxfp4(q[60]).v[0], unpack_mxfp4(q[61]).v[0], unpack_mxfp4(q[62]).v[0], unpack_mxfp4(q[63]).v[0], unpack_mxfp4(q[124]).v[0], unpack_mxfp4(q[125]).v[0], unpack_mxfp4(q[126]).v[0], unpack_mxfp4(q[127]).v[0], GGML_E8M0_TO_FP32_HALF(e[0]), GGML_E8M0_TO_FP32_HALF(e[1]), GGML_E8M0_TO_FP32_HALF(e[2]), GGML_E8M0_TO_FP32_HALF(e[3])); - HEX_VERBOSE("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", + GGML_LOG_DEBUG("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... 
%d %d %d %d : %.6f %.6f %.6f %.6f\n", i + 1, unpack_mxfp4(q[0]).v[1], unpack_mxfp4(q[1]).v[1], unpack_mxfp4(q[2]).v[1], unpack_mxfp4(q[3]).v[1], unpack_mxfp4(q[60]).v[1], unpack_mxfp4(q[61]).v[1], unpack_mxfp4(q[62]).v[1], unpack_mxfp4(q[63]).v[1], unpack_mxfp4(q[124]).v[1], unpack_mxfp4(q[125]).v[1], @@ -1242,7 +1233,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si void * buf_rp = ggml_aligned_malloc(row_size_rp); GGML_ASSERT(buf_rp != NULL); - HEX_VERBOSE("ggml-hex: repack-mxfp4-mxfp4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, + GGML_LOG_DEBUG("ggml-hex: repack-mxfp4-mxfp4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, t->ne[0], nrows, row_size); init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros @@ -1274,7 +1265,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si void * buf_rp = ggml_aligned_malloc(row_size_rp); GGML_ASSERT(buf_rp != NULL); - HEX_VERBOSE("ggml-hex: repack-mxfp4x4x2-mxfp4 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, + GGML_LOG_DEBUG("ggml-hex: repack-mxfp4x4x2-mxfp4 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, t->ne[0], nrows, row_size); memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros @@ -1300,7 +1291,7 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer, auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; auto sess = ctx->sess; - HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data, + GGML_LOG_DEBUG("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data, offset, size); switch (tensor->type) { @@ -1333,7 +1324,7 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer, auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; auto sess = ctx->sess; - HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data, + GGML_LOG_DEBUG("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data, offset, size); switch (tensor->type) { @@ -1364,7 +1355,7 @@ static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t bu auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; auto sess = ctx->sess; - HEX_VERBOSE("ggml-hex: %s copy-tensor %s -> %s size %zu\n", sess->name.c_str(), src->name, dst->name, + GGML_LOG_DEBUG("ggml-hex: %s copy-tensor %s -> %s size %zu\n", sess->name.c_str(), src->name, dst->name, ggml_nbytes(src)); memcpy(dst->data, src->data, ggml_nbytes(src)); @@ -1376,7 +1367,7 @@ static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t bu static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; auto sess = ctx->sess; - HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->name.c_str(), (void *) ctx->base, ctx->size); + GGML_LOG_DEBUG("ggml-hex: %s clear-buff base %p size %zu\n", sess->name.c_str(), (void *) ctx->base, ctx->size); memset(ctx->base, value, ctx->size); } @@ -1467,12 +1458,11 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id) { this->prof_cycles = 0; this->prof_pkts = 0; - fprintf(stderr, "ggml-hex: allocating new session: %s\n", 
this->name.c_str()); + GGML_LOG_INFO("ggml-hex: allocating new session: %s\n", this->name.c_str()); domain * my_domain = get_domain(this->domain_id); if (my_domain == NULL) { - fprintf(stderr, "ggml-hex: unable to get domain struct for CDSP\n"); - exit(1); + GGML_ABORT("ggml-hex: unable to get domain struct for CDSP\n"); } // Create new session @@ -1485,9 +1475,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id) { int err = remote_session_control(FASTRPC_RESERVE_NEW_SESSION, (void *) &n, sizeof(n)); if (err != AEE_SUCCESS) { - fprintf(stderr, "ggml-hex: remote_session_control failed to reserve new session %d : error 0x%x\n", dev_id, - err); - exit(1); + GGML_ABORT("ggml-hex: remote_session_control failed to reserve new session %d : error 0x%x\n", dev_id, err); } // Save the IDs @@ -1512,9 +1500,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id) { int err = remote_session_control(FASTRPC_GET_URI, (void *) &u, sizeof(u)); if (err != AEE_SUCCESS) { - fprintf(stderr, "ggml-hex: remote_session_control failed to get URI for session %d : error 0x%x\n", dev_id, - err); - exit(1); + GGML_ABORT("ggml-hex: remote_session_control failed to get URI for session %d : error 0x%x\n", dev_id, err); } } @@ -1525,21 +1511,18 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id) { u.enable = 1; int err = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, (void *) &u, sizeof(u)); if (err != AEE_SUCCESS) { - fprintf(stderr, - "ggml-hex: remote_session_control failed to enable unsigned PD for session %d : error 0x%x\n", + GGML_ABORT("ggml-hex: remote_session_control failed to enable unsigned PD for session %d : error 0x%x\n", dev_id, err); - exit(1); } } // Open session int err = htp_iface_open(session_uri, &this->handle); if (err != AEE_SUCCESS) { - fprintf(stderr, "ggml-hex: failed to open session %d : error 0x%x\n", dev_id, err); - exit(1); + GGML_ABORT("ggml-hex: failed to open session %d : error 0x%x\n", dev_id, err); } - fprintf(stderr, "ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(), + GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(), this->session_id, this->domain_id, session_uri, (unsigned long) this->handle); // Enable FastRPC QoS mode @@ -1549,8 +1532,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id) { int err = remote_handle64_control(this->handle, DSPRPC_CONTROL_LATENCY, (void *) &l, sizeof(l)); if (err != 0) { - fprintf(stderr, "ggml-hex: failed to enable fastrpc QOS mode: 0x%08x\n", (unsigned) err); - exit(1); + GGML_LOG_ERROR("ggml-hex: failed to enable fastrpc QOS mode: 0x%08x\n", (unsigned) err); } } @@ -1564,22 +1546,19 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id) { &queue); if (err != 0) { - fprintf(stderr, "ggml-hex: dspqueue_create failed: 0x%08x\n", (unsigned) err); - exit(1); + GGML_ABORT("ggml-hex: dspqueue_create failed: 0x%08x\n", (unsigned) err); } // Export queue for use on the DSP err = dspqueue_export(queue, &this->queue_id); if (err != 0) { - fprintf(stderr, "ggml-hex: dspqueue_export failed: 0x%08x\n", (unsigned) err); - exit(1); + GGML_ABORT("ggml-hex: dspqueue_export failed: 0x%08x\n", (unsigned) err); } if (opt_etm) { err = htp_iface_enable_etm(this->handle); if (err != 0) { - fprintf(stderr, "ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err); - exit(1); + GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err); } } @@ -1588,8 +1567,7 @@ 
ggml_hexagon_session::ggml_hexagon_session(int dev_id) { // listening for packets in a callback. err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx); if (err != 0) { - fprintf(stderr, "ggml-hex: htp_iface_start failed: 0x%08x\n", (unsigned) err); - exit(1); + GGML_ABORT("ggml-hex: htp_iface_start failed: 0x%08x\n", (unsigned) err); } buffer_type.iface = ggml_backend_hexagon_buffer_type_interface; @@ -1600,26 +1578,24 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id) { } ggml_hexagon_session::~ggml_hexagon_session() { - fprintf(stderr, "ggml-hex: releasing session: %s\n", this->name.c_str()); + GGML_LOG_INFO("ggml-hex: releasing session: %s\n", this->name.c_str()); // Stop the DSP-side service and close the queue int err = htp_iface_stop(this->handle); if (err != 0) { - fprintf(stderr, "ggml-hex: htp_iface_stop failed: 0x%08x\n", (unsigned) err); - exit(1); + GGML_ABORT("ggml-hex: htp_iface_stop failed: 0x%08x\n", (unsigned) err); } if (opt_etm) { err = htp_iface_disable_etm(this->handle); if (err != 0) { - fprintf(stderr, "ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err); + GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err); } } err = dspqueue_close(queue); if (err != 0) { - fprintf(stderr, "ggml-hex: dspqueue_close failed: 0x%08x\n", (unsigned) err); - exit(1); + GGML_ABORT("ggml-hex: dspqueue_close failed: 0x%08x\n", (unsigned) err); } htp_iface_close(this->handle); @@ -2145,7 +2121,7 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer auto buf = static_cast(t->buffer->context); auto sess = buf->sess; - HEX_VERBOSE("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(), + GGML_LOG_DEBUG("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(), t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset, (unsigned int) d->size); } @@ -2228,7 +2204,7 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) hex_format_op_buffs(buffs, op); hex_format_op_names(names, op); - HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), + GGML_LOG_DEBUG("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), names, dims, types, strides, buffs, req.flags); if (opt_verbose > 1) { hex_dump_dspbuf(src0, &bufs[0]); @@ -2251,8 +2227,7 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) ); if (err != 0) { - fprintf(stderr, "ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); - exit(1); + GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); } } @@ -2366,7 +2341,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag hex_format_op_buffs(buffs, op); hex_format_op_names(names, op); - HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), + GGML_LOG_DEBUG("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), names, dims, types, strides, buffs, req.flags); if (opt_verbose > 1) { @@ -2391,8 +2366,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag ); if (err != 0) { - fprintf(stderr, "ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); - exit(1); + 
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); } } @@ -2453,8 +2427,7 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { req.op = HTP_OP_SUB; break; default: - fprintf(stderr, "ggml-hex: unsupported op:%d\n", node->op); - exit(1); + GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op); } init_htp_tensor(&req.src0, src0); @@ -2516,7 +2489,7 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { hex_format_op_buffs(buffs, op); hex_format_op_names(names, op); - HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), + GGML_LOG_DEBUG("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags); if (opt_verbose > 1) { hex_dump_dspbuf(src0, &bufs[0]); @@ -2538,8 +2511,7 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { 1000000); // Timeout if (0 != err) { - fprintf(stderr, "ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); - exit(1); + GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); } } @@ -2595,8 +2567,7 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) { req.op = HTP_OP_ADD_ID; break; default: - fprintf(stderr, "ggml-hex: unsupported op:%d\n", node->op); - exit(1); + GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op); } init_htp_tensor(&req.src0, src0); @@ -2657,7 +2628,7 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) { hex_format_op_buffs(buffs, op); hex_format_op_names(names, op); - HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), + GGML_LOG_DEBUG("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags); if (opt_verbose > 1) { @@ -2681,8 +2652,7 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) { 1000000); // Timeout if (0 != err) { - fprintf(stderr, "ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); - exit(1); + GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); } } @@ -2755,8 +2725,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { } if (!supported) { - fprintf(stderr, "ggml-hex: unsupported op:%d\n", op->op); - exit(1); + GGML_ABORT("ggml-hex: unary : unsupported op:%d\n", op->op); } init_htp_tensor(&req.dst, dst); @@ -2839,7 +2808,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { hex_format_op_buffs(buffs, op); hex_format_op_names(names, op); - HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), + GGML_LOG_DEBUG("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), names, dims, types, strides, buffs, req.flags); if (opt_verbose > 1) { hex_dump_dspbuf(src0, &bufs[0]); @@ -2865,8 +2834,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { 1000000); // Timeout if (0 != err) { - fprintf(stderr, "ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); - exit(1); + GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); } } @@ -3013,7 +2981,7 @@ static void 
ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) { hex_format_op_buffs(buffs, op); hex_format_op_names(names, op); - HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), + GGML_LOG_DEBUG("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), names, dims, types, strides, buffs, req.flags); if (opt_verbose > 1) { hex_dump_dspbuf(src0, &bufs[0]); @@ -3039,8 +3007,7 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) { 1000000); // Timeout if (0 != err) { - fprintf(stderr, "ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); - exit(1); + GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err); } } @@ -3117,7 +3084,7 @@ static inline int last_compute_op(ggml_cgraph * graph) { static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { auto sess = static_cast(backend->context); - HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->name.c_str(), graph->n_nodes); + GGML_LOG_DEBUG("ggml-hex: %s graph-compute n_nodes %d\n", sess->name.c_str(), graph->n_nodes); const int last = last_compute_op(graph); @@ -3186,9 +3153,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg break; default: - fprintf(stderr, "\nggml-hex: %s not supported\n", ggml_op_desc(node)); - fflush(stderr); - GGML_ASSERT(false); + GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(node)); } } @@ -3203,7 +3168,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) { auto sess = static_cast(backend->context); - HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->name.c_str()); + GGML_LOG_DEBUG("ggml-hex: %s synchronize\n", sess->name.c_str()); // Wait until all pending ops complete while (sess->op_pending) { @@ -3296,7 +3261,6 @@ static std::vector ggml_hexagon_graph_optimize_reorder(const std::vector(backend->context); const int n = gf->n_nodes; constexpr int MAX_FUSE = 16; @@ -3536,7 +3500,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons hex_format_op_buffs(buffs, op); hex_format_op_names(names, op); - HEX_VERBOSE("ggml-hex: %s device-supports-op %s : %s : %s : %s : %s : %s : (%d)\n", sess->name.c_str(), + GGML_LOG_DEBUG("ggml-hex: %s device-supports-op %s : %s : %s : %s : %s : %s : (%d)\n", sess->name.c_str(), ggml_op_name(op->op), names, dims, types, strides, buffs, (int) supp); } @@ -3556,14 +3520,14 @@ static bool ggml_backend_hexagon_device_supports_buft(ggml_backend_dev_t dev, gg // Need session/domain-id for buffers to be compatible bool supp = (s0->session_id == s1->session_id); - HEX_VERBOSE("ggml-hex: %s device-supports-buft %s (%d)\n", s0->name.c_str(), s1->name.c_str(), (int) supp); + GGML_LOG_DEBUG("ggml-hex: %s device-supports-buft %s (%d)\n", s0->name.c_str(), s1->name.c_str(), (int) supp); return supp; } static ggml_backend_buffer_type_t * ggml_backend_hexagon_device_get_extra_buffers_type(ggml_backend_dev_t dev) { auto s0 = static_cast(dev->context); - HEX_VERBOSE("ggml-hex: device-get-extra-buft : %s \n", s0->name.c_str()); + GGML_LOG_DEBUG("ggml-hex: device-get-extra-buft : %s \n", s0->name.c_str()); static ggml_backend_buffer_type_t bufts[2]; bufts[0] = ggml_backend_hexagon_device_get_repack_buffer_type(dev); @@ -3601,17 +3565,17 @@ struct ggml_hexagon_registry 
{ }; ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { - fprintf(stderr, "ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev %zu\n", opt_ndev); + GGML_LOG_INFO("ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev %zu\n", opt_ndev); if (!opt_arch) { int err = get_hex_arch_ver(CDSP_DOMAIN_ID, &opt_arch); if (err != 0) { - fprintf(stderr, "ggml-hex: failed to query HTP version (err %d) defaulting to v73\n", err); + GGML_LOG_ERROR("ggml-hex: failed to query HTP version (err %d) defaulting to v73\n", err); opt_arch = 73; } } - fprintf(stderr, "ggml-hex: Hexagon Arch version v%d\n", opt_arch); + GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch); // Create devices / sessions for (size_t i = 0; i < opt_ndev; i++) { @@ -3622,7 +3586,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { } ggml_hexagon_registry::~ggml_hexagon_registry() { - fprintf(stderr, "ggml-hex: releasing registry\n"); + GGML_LOG_INFO("ggml-hex: releasing registry\n"); // Release devices / sessions for (size_t i = 0; i < opt_ndev; i++) { @@ -3708,7 +3672,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { reg->context = new ggml_hexagon_registry(reg); - HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req), + GGML_LOG_DEBUG("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req), sizeof(struct htp_general_rsp)); } diff --git a/ggml/src/ggml-hexagon/htp-utils.c b/ggml/src/ggml-hexagon/htp-utils.c index 0c64b657fa7..e8a035af8c6 100644 --- a/ggml/src/ggml-hexagon/htp-utils.c +++ b/ggml/src/ggml-hexagon/htp-utils.c @@ -3,6 +3,12 @@ #pragma clang diagnostic ignored "-Wmissing-prototypes" #pragma clang diagnostic ignored "-Wsign-compare" +#define GGML_COMMON_IMPL_C +#include "ggml-backend-impl.h" +#include "ggml-common.h" +#include "ggml-hexagon.h" +#include "ggml-impl.h" + #include "htp-utils.h" #include @@ -71,20 +77,20 @@ int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** do if (remote_system_request) { nErr = remote_system_request(&req); if (nErr != AEE_SUCCESS) { - printf("Failure in remote_system_request call: %d.\n", nErr); + GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr); goto bail; } // Allocate memory for domain-info array req.sys.max_domains = req.sys.num_domains; if ((req.sys.domains = calloc(req.sys.num_domains, sizeof(fastrpc_domain))) == NULL) { nErr = AEE_ENOMEMORY; - printf("Unable to allocate memory for req.sys.domains"); + GGML_LOG_ERROR("Unable to allocate memory for req.sys.domains"); goto bail; } nErr = remote_system_request(&req); if (nErr != AEE_SUCCESS) { - printf("Failure in remote_system_request call: %d.\n", nErr); + GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr); goto bail; } @@ -93,7 +99,7 @@ int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** do domain = &req.sys.domains[i]; if (domain->type != ss_info && domain_type != NULL) { nErr = -1; - printf("Incorrect data received from remote_system_request.\n"); + GGML_LOG_ERROR("Incorrect data received from remote_system_request.\n"); goto bail; } } @@ -120,7 +126,7 @@ int get_effective_domain_id(char * domain_name, int session_id, int * effec_doma err = remote_session_control(FASTRPC_GET_EFFECTIVE_DOMAIN_ID, &sess, sizeof(sess)); if (err) { - printf("Error 0x%x: failed to get effective domain id for %s, session id %d\n", err, sess.domain_name, + GGML_LOG_ERROR("Error 
0x%x: failed to get effective domain id for %s, session id %d\n", err, sess.domain_name, session_id); return err; } @@ -137,7 +143,7 @@ int get_dsp_support(int * domain) { struct remote_dsp_capability dsp_capability_domain = { CDSP_DOMAIN_ID, DOMAIN_SUPPORT, 0 }; nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - printf("\nFastRPC Capability API is not supported on this device\n"); + GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); goto bail; } @@ -153,12 +159,12 @@ int get_dsp_support(int * domain) { } if (nErr != AEE_SUCCESS) { - printf("\nget_dsp_support failed with Error 0x%x\n", nErr); + GGML_LOG_ERROR("\nget_dsp_support failed with Error 0x%x\n", nErr); goto bail; } } else { nErr = AEE_EUNSUPPORTEDAPI; - printf("remote_dsp_capability interface is not supported on this device\n"); + GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); } bail: @@ -172,7 +178,7 @@ int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr) { if (attr == VTCM_PAGE || attr == VTCM_COUNT) { } else { nErr = AEE_EBADPARM; - printf("Unsupported attr. Only VTCM_PAGE and VTCM_COUNT supported\n"); + GGML_LOG_ERROR("Unsupported attr. Only VTCM_PAGE and VTCM_COUNT supported\n"); goto bail; } if (remote_handle_control) { @@ -188,24 +194,24 @@ int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr) { nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp, sizeof(struct remote_dsp_capability)); if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - printf("\nFastRPC Capability API is not supported on this device\n"); - printf("Running the usecase without checking the capability\n"); + GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); + GGML_LOG_ERROR("Running the usecase without checking the capability\n"); nErr = AEE_SUCCESS; goto bail; } else if (nErr == AEE_SUCCESS) { *capability = dsp_capability_vtcm_dsp.capability; } else { - printf("\nget_vtcm_info failed with Error 0x%x\n", nErr); + GGML_LOG_ERROR("\nget_vtcm_info failed with Error 0x%x\n", nErr); goto bail; } } else { nErr = AEE_EUNSUPPORTED; - printf("Unsupported domain %d\n", domain); + GGML_LOG_ERROR("Unsupported domain %d\n", domain); goto bail; } } else { nErr = AEE_EUNSUPPORTEDAPI; - printf("remote_dsp_capability interface is not supported on this device\n"); + GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); } bail: @@ -218,11 +224,11 @@ bool is_unsignedpd_supported(int domain_id) { struct remote_dsp_capability dsp_capability_domain = { domain_id, UNSIGNED_PD_SUPPORT, 0 }; nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - printf("\nFastRPC Capability API is not supported on this device. Falling back to signed pd.\n"); + GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device. Falling back to signed pd.\n"); return false; } if (nErr) { - printf("\nERROR 0x%x: FastRPC Capability API failed. Falling back to signed pd.", nErr); + GGML_LOG_ERROR("\nERROR 0x%x: FastRPC Capability API failed. 
Falling back to signed pd.", nErr); return false; } if (dsp_capability_domain.capability == 1) { @@ -230,7 +236,7 @@ bool is_unsignedpd_supported(int domain_id) { } } else { nErr = AEE_EUNSUPPORTEDAPI; - printf("remote_dsp_capability interface is not supported on this device. Falling back to signed pd.\n"); + GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device. Falling back to signed pd.\n"); return false; } return false; @@ -255,25 +261,25 @@ bool is_async_fastrpc_supported(int domain) { nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support, sizeof(struct remote_dsp_capability)); if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - printf("\nFastRPC Capability API is not supported on this device\n"); - printf("Running the usecase without checking the capability\n"); + GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); + GGML_LOG_ERROR("Running the usecase without checking the capability\n"); nErr = AEE_SUCCESS; goto bail; } else if (dsp_capability_async_support.capability == 1) { return true; } if (nErr != AEE_SUCCESS) { - printf("\nis_async_fastrpc_supported failed with Error 0x%x\n", nErr); + GGML_LOG_ERROR("\nis_async_fastrpc_supported failed with Error 0x%x\n", nErr); goto bail; } } else { nErr = AEE_EUNSUPPORTED; - printf("Async fastrpc is not supported on domain %d\n", domain); + GGML_LOG_ERROR("Async fastrpc is not supported on domain %d\n", domain); goto bail; } } else { nErr = AEE_EUNSUPPORTEDAPI; - printf("remote_dsp_capability interface is not supported on this device\n"); + GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); } bail: @@ -295,20 +301,20 @@ bool is_status_notification_supported(int domain) { nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support, sizeof(struct remote_dsp_capability)); if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - printf("\nFastRPC Capability API is not supported on this device\n"); - printf("Running the usecase without checking the capability\n"); + GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); + GGML_LOG_ERROR("Running the usecase without checking the capability\n"); nErr = AEE_SUCCESS; goto bail; } else if (dsp_capability_status_notification_support.capability == 1) { return true; } if (nErr != AEE_SUCCESS) { - printf("\nis_status_notification_supported failed with Error 0x%x\n", nErr); + GGML_LOG_ERROR("\nis_status_notification_supported failed with Error 0x%x\n", nErr); goto bail; } } else { nErr = AEE_EUNSUPPORTEDAPI; - printf("remote_dsp_capability interface is not supported on this device\n"); + GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); } bail: @@ -321,7 +327,7 @@ int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) { if (attr != HMX_SUPPORT_SPATIAL && attr != HMX_SUPPORT_DEPTH) { nErr = AEE_EBADPARM; - printf("Unsupported attr. Only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported\n"); + GGML_LOG_ERROR("Unsupported attr. 
Only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported\n"); goto bail; } if (remote_handle_control) { @@ -337,24 +343,24 @@ int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) { nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp, sizeof(struct remote_dsp_capability)); if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - printf("\nFastRPC Capability API is not supported on this device\n"); - printf("Running the usecase without checking the capability\n"); + GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); + GGML_LOG_ERROR("Running the usecase without checking the capability\n"); nErr = AEE_SUCCESS; goto bail; } else if (nErr == AEE_SUCCESS) { *capability = dsp_capability_hmx_dsp.capability; } else { - printf("\nget_hmx_support_info failed with Error 0x%x\n", nErr); + GGML_LOG_ERROR("\nget_hmx_support_info failed with Error 0x%x\n", nErr); goto bail; } } else { nErr = AEE_EUNSUPPORTED; - printf("HMX support is not there for domain %d\n", domain); + GGML_LOG_ERROR("HMX support is not there for domain %d\n", domain); goto bail; } } else { nErr = AEE_EUNSUPPORTEDAPI; - printf("remote_dsp_capability interface is not supported on this device\n"); + GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); } bail: @@ -363,7 +369,7 @@ int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) { int get_hex_arch_ver(int domain, int * arch) { if (!remote_handle_control) { - fprintf(stderr, "ggml-hex: remote_handle_control is not supported on this device\n"); + GGML_LOG_ERROR("ggml-hex: remote_handle_control is not supported on this device\n"); return AEE_EUNSUPPORTEDAPI; } @@ -374,12 +380,12 @@ int get_hex_arch_ver(int domain, int * arch) { int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &arch_ver, sizeof(arch_ver)); if ((err & 0xff) == (AEE_EUNSUPPORTEDAPI & 0xff)) { - fprintf(stderr, "ggml-hex: FastRPC capability API is not supported on this device\n"); + GGML_LOG_ERROR("ggml-hex: FastRPC capability API is not supported on this device\n"); return AEE_EUNSUPPORTEDAPI; } if (err != AEE_SUCCESS) { - fprintf(stderr, "ggml-hex: FastRPC capability query failed (err %d)\n", err); + GGML_LOG_ERROR("ggml-hex: FastRPC capability query failed (err %d)\n", err); return err; } @@ -417,24 +423,24 @@ int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr) { nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp, sizeof(struct remote_dsp_capability)); if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - printf("\nFastRPC Capability API is not supported on this device\n"); - printf("Running the usecase without checking the capability\n"); + GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); + GGML_LOG_ERROR("Running the usecase without checking the capability\n"); nErr = AEE_SUCCESS; goto bail; } else if (nErr == AEE_SUCCESS) { *capability = dsp_capability_hvx_dsp.capability; } else { - printf("\nget_hvx_support_info failed with Error 0x%x\n", nErr); + GGML_LOG_ERROR("\nget_hvx_support_info failed with Error 0x%x\n", nErr); goto bail; } } else { nErr = AEE_EUNSUPPORTED; - printf("HVX support is not available on domain %d\n", domain); + GGML_LOG_ERROR("HVX support is not available on domain %d\n", domain); goto bail; } } else { nErr = AEE_EUNSUPPORTEDAPI; - printf("remote_dsp_capability interface is not supported on this device\n"); + GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); } bail: 
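For reference, the logging changes in the patch above converge on a single host-side convention: verbose tracing goes through GGML_LOG_DEBUG, recoverable failures are reported with GGML_LOG_ERROR and returned to the caller, and unrecoverable setup failures terminate through GGML_ABORT. The sketch below illustrates that convention only; `example_map_buffer` is a hypothetical helper (not part of the patch), and it assumes the session fields (`name`, `domain_id`) and the includes already present in ggml-hexagon.cpp.

```cpp
// Illustrative sketch of the logging convention adopted above; not patch code.
// Verbose tracing -> GGML_LOG_DEBUG, recoverable errors -> GGML_LOG_ERROR + return,
// fatal setup errors elsewhere in the backend -> GGML_ABORT.
static bool example_map_buffer(ggml_hexagon_session * s, void * base, size_t size, int fd) {
    GGML_LOG_DEBUG("ggml-hex: %s mapping buffer: base %p size %zu fd %d\n",
                   s->name.c_str(), base, size, fd);

    int err = fastrpc_mmap(s->domain_id, fd, base, 0, size, FASTRPC_MAP_FD);
    if (err != 0) {
        // Recoverable: report the failure and let the caller back out.
        GGML_LOG_ERROR("ggml-hex: buffer mapping failed : fd %d error 0x%08x\n", fd, (unsigned) err);
        return false;
    }
    return true;
}
```

The same split shows up throughout the diff: buffer allocation/mapping, QoS, and ETM failures are logged and handled, while queue creation, session setup, and dspqueue_write failures abort, since the backend cannot make progress without them.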
From 69a8047e8374509e08164023ca9246fd9ee9842f Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 15 Oct 2025 18:37:50 -0700 Subject: [PATCH 09/37] readme: add hexagon to the list supported backends --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e373611051e..6d30a8bdab0 100644 --- a/README.md +++ b/README.md @@ -280,6 +280,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo | [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE | | [WebGPU [In Progress]](docs/build.md#webgpu) | All | | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All | +| [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon | ## Obtaining and quantizing models From debdb3b48286aac34c923770f631f4ca2d42f2f3 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 16 Oct 2025 09:57:06 -0700 Subject: [PATCH 10/37] hexagon: stack malmuts with quantized inputs only --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 5dc7188b70e..6ce31691bf6 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -3189,6 +3189,10 @@ struct node_info { return fused.empty() ? node : fused.back(); } + const ggml_tensor * src0() const { + return node->src[0]; + } + const ggml_tensor * src1() const { return node->src[1]; } @@ -3205,7 +3209,7 @@ struct node_info { switch (this->op()) { case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: - return true; + return ggml_is_quantized(this->src0()->type); default: return false; } From 3475e29b5721579c965848d85c8633eee0448bc8 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 16 Oct 2025 22:37:39 -0700 Subject: [PATCH 11/37] hexagon: add TODO for fixing issues in hexagon_graph_optimize --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 6ce31691bf6..ee0111ec2d2 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -3231,6 +3231,9 @@ static std::vector ggml_hexagon_graph_optimize_reorder(const std::vector Date: Thu, 16 Oct 2025 23:38:05 -0700 Subject: [PATCH 12/37] hexagon: update to hex-sdk 6.4.0 and add scripts for running on QDC --- .github/workflows/build.yml | 8 ++-- docs/backend/hexagon/README.md | 34 +++++++------- docs/backend/hexagon/developer.md | 18 ++++---- scripts/snapdragon/qdc/readme.md | 1 + scripts/snapdragon/qdc/requirements.txt | 25 ++++++++++ scripts/snapdragon/qdc/tests/test_bench.py | 54 ++++++++++++++++++++++ 6 files changed, 110 insertions(+), 30 deletions(-) create mode 100644 scripts/snapdragon/qdc/readme.md create mode 100644 scripts/snapdragon/qdc/requirements.txt create mode 100644 scripts/snapdragon/qdc/tests/test_bench.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1c769d74236..15e11330952 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1347,7 +1347,7 @@ jobs: id: install_hexsdk if: ${{ matrix.build == 'arm64-snapdragon' }} env: - HEXSDK_VER: 6.3.0 + HEXSDK_VER: 6.4.0.2 HEXTLS_VER: 19.0.04 run: | curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz @@ -1355,8 +1355,8 @@ jobs: tar -xaf hex-sdk.tar.gz -C hex-sdk ls -l hex-sdk sudo mv 
hex-sdk /opt/hexagon - echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER.0" >> "$GITHUB_ENV" - echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER.0/tools/HEXAGON_Tools/$HEXTLS_VER" >> "$GITHUB_ENV" + echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER" >> "$GITHUB_ENV" + echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER" >> "$GITHUB_ENV" echo "DEFAULT_HLOS_ARCH=64" >> "$GITHUB_ENV" echo "DEFAULT_TOOLS_VARIANT=toolv19" >> "$GITHUB_ENV" echo "DEFAULT_NO_QURT_INC=0" >> "$GITHUB_ENV" @@ -1373,7 +1373,7 @@ jobs: run: | cmake ${{ matrix.defines }} -B build cmake --build build - cmake --install build --prefix pkg + cmake --install build --prefix pkg-adb/llama.cpp - name: Test id: cmake_test diff --git a/docs/backend/hexagon/README.md b/docs/backend/hexagon/README.md index d15fbb5e0fc..85f136ef9ee 100644 --- a/docs/backend/hexagon/README.md +++ b/docs/backend/hexagon/README.md @@ -8,7 +8,7 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc. This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop. ``` -~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.2 +~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3 [d]/> cd /workspace ``` @@ -26,7 +26,7 @@ Preset CMake variables: GGML_HEXAGON="ON" GGML_OPENCL="ON" GGML_OPENMP="OFF" - HEXAGON_SDK_ROOT="/opt/hexagon/6.3.0.0" + HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2" ... -- Including OpenCL backend -- Including Hexagon backend @@ -49,19 +49,19 @@ Preset CMake variables: To generate an installable "package" simply use cmake --install: ``` -[d]/workspace> cmake --install build-snapdragon --prefix pkg-snapdragon +[d]/workspace> cmake --install build-snapdragon --prefix pkg-adb/llama.cpp -- Install configuration: "Release" --- Installing: /workspace/pkg-snapdragon/lib/libggml-cpu.so --- Installing: /workspace/pkg-snapdragon/lib/libggml-opencl.so --- Installing: /workspace/pkg-snapdragon/lib/libggml-hexagon.so --- Installing: /workspace/pkg-snapdragon/lib/libggml-htp-v73.so --- Installing: /workspace/pkg-snapdragon/lib/libggml-htp-v75.so --- Installing: /workspace/pkg-snapdragon/lib/libggml-htp-v79.so --- Installing: /workspace/pkg-snapdragon/lib/libggml-htp-v81.so --- Installing: /workspace/pkg-snapdragon/lib/libggml.so +-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-cpu.so +-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-opencl.so +-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-hexagon.so +-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v73.so +-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v75.so +-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v79.so +-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v81.so +-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml.so ... --- Installing: /workspace/pkg-snapdragon/bin/llama-bench --- Installing: /workspace/pkg-snapdragon/bin/llama-cli +-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-bench +-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-cli ... ``` @@ -74,10 +74,10 @@ Once ADB is enabled, use `adb push` to install `pkg-snapdragon` on the device. **Note that the toolchain Docker image doesn't have ADB and doesn't set up the ADB bridge. 
Please use native ADB on the host.** ``` -~/src/llama.cpp$ adb push pkg-snapdragon/* /data/local/tmp/llama.cpp -pkg-snapdragon/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s) -pkg-snapdragon/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s) -pkg-snapdragon/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s) +~/src/llama.cpp$ adb push pkg-adb/llama.cpp /data/local/tmp/ +pkg-adb/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s) +pkg-adb/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s) +pkg-adb/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s) 102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s) ``` diff --git a/docs/backend/hexagon/developer.md b/docs/backend/hexagon/developer.md index 98885ae0f2d..aa7aa2cb62a 100644 --- a/docs/backend/hexagon/developer.md +++ b/docs/backend/hexagon/developer.md @@ -14,14 +14,14 @@ The Hexagon backend consist of two parts: Here is an example of the build artifacts ``` -~/src/llama.cpp$ ls -l pkg-snapdragon/lib/libggml* -pkg-snapdragon/lib/libggml-base.so -pkg-snapdragon/lib/libggml-cpu.so -pkg-snapdragon/lib/libggml-hexagon.so <<< CPU library -pkg-snapdragon/lib/libggml-htp-v73.so <<< HTP op/kernels for Hexagon v73 -pkg-snapdragon/lib/libggml-htp-v75.so -pkg-snapdragon/lib/libggml-htp-v79.so -pkg-snapdragon/lib/libggml-htp-v81.so +~/src/llama.cpp$ ls -l pkg-adb/llama.cpp/lib/libggml* +pkg-adb/llama.cpp/lib/libggml-base.so +pkg-adb/llama.cpp/lib/libggml-cpu.so +pkg-adb/llama.cpp/lib/libggml-hexagon.so <<< CPU library +pkg-adb/llama.cpp/lib/libggml-htp-v73.so <<< HTP op/kernels for Hexagon v73 +pkg-adb/llama.cpp/lib/libggml-htp-v75.so +pkg-adb/llama.cpp/lib/libggml-htp-v79.so +pkg-adb/llama.cpp/lib/libggml-htp-v81.so ``` ## Memory buffers @@ -49,7 +49,7 @@ Each Hexagon device behaves like a GPU from the offload and model splitting pers Here is an example of running GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR. ``` -M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt docs/backend/hexagon/run-cli.sh -no-cnv -f surfing.txt -n 32 +M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-cli.sh -no-cnv -f surfing.txt -n 32 ... LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib diff --git a/scripts/snapdragon/qdc/readme.md b/scripts/snapdragon/qdc/readme.md new file mode 100644 index 00000000000..b92cf243aaa --- /dev/null +++ b/scripts/snapdragon/qdc/readme.md @@ -0,0 +1 @@ +This directory includes pytest based scripts for running CI jobs on Qualcomm Device Cloud (QDC). 
diff --git a/scripts/snapdragon/qdc/requirements.txt b/scripts/snapdragon/qdc/requirements.txt new file mode 100644 index 00000000000..f04bd682ea0 --- /dev/null +++ b/scripts/snapdragon/qdc/requirements.txt @@ -0,0 +1,25 @@ +Appium-Python-Client==5.2.4 +attrs==25.4.0 +certifi==2025.10.5 +exceptiongroup==1.3.0 +h11==0.16.0 +idna==3.11 +iniconfig==2.1.0 +outcome==1.3.0.post0 +packaging==25.0 +pluggy==1.6.0 +Pygments==2.19.2 +PySocks==1.7.1 +pytest==8.4.2 +pytest-dependency==0.6.0 +selenium==4.36.0 +setuptools==80.9.0 +sniffio==1.3.1 +sortedcontainers==2.4.0 +tomli==2.3.0 +trio==0.31.0 +trio-websocket==0.12.2 +typing_extensions==4.15.0 +urllib3==2.5.0 +websocket-client==1.9.0 +wsproto==1.2.0 diff --git a/scripts/snapdragon/qdc/tests/test_bench.py b/scripts/snapdragon/qdc/tests/test_bench.py new file mode 100644 index 00000000000..b1213ea4bb6 --- /dev/null +++ b/scripts/snapdragon/qdc/tests/test_bench.py @@ -0,0 +1,54 @@ +import pytest,subprocess,sys + +tmp_path='/data/local/tmp' +pkg_path=f'{tmp_path}/llama.cpp' +lib_path=f'{pkg_path}/lib' +bin_path=f'{pkg_path}/bin' + +model='../gguf/Llama-3.2-1B-Instruct-Q4_0.gguf' +cli_pref=f'cd {pkg_path} && LD_LIBRARY_PATH={lib_path} ADSP_LIBRARY_PATH={lib_path} {bin_path}' + +def run_cmd(cmd): + p = subprocess.run(cmd, text = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT) + sys.stdout.write(p.stdout) + assert(p.returncode == 0) + +@pytest.mark.dependency() +def test_install(): + run_cmd([ 'adb', 'push', 'llama.cpp', f'{tmp_path}' ]) + run_cmd([ 'adb', 'shell', f'chmod 755 {bin_path}/*' ]) + +## Basic cli tests +def run_llama_cli(dev, opts): + run_cmd([ 'adb', 'shell', + f'{cli_pref}/llama-cli -m {model} --device {dev} -ngl 99 --batch-size 128 -n 128 -t 4 -no-cnv --seed 42 {opts} ' + + f'-p "what is the most popular cookie in the world?\nPlease provide a very brief bullet point summary.\nBegin your answer with **BEGIN**."' + ]) + +@pytest.mark.dependency(depends=['test_install']) +def test_llama_cli_cpu(): + run_llama_cli('none', '-ctk q8_0 -ctv q8_0 -fa on') + +@pytest.mark.dependency(depends=['test_install']) +def test_llama_cli_gpu(): + run_llama_cli('GPUOpenCL', '-fa on') + +@pytest.mark.dependency(depends=['test_install']) +def test_llama_cli_npu(): + run_llama_cli('HTP0', '-ctk q8_0 -ctv q8_0 -fa on') + +## Basic bench tests +def run_llama_bench(dev): + run_cmd([ 'adb', 'shell', + f'{cli_pref}/llama-bench -m {model} --device {dev} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32' + ]) + +@pytest.mark.dependency(depends=['test_install']) +def test_llama_bench_cpu(): + run_llama_bench('none') + +def test_llama_bench_gpu(): + run_llama_bench('GPUOpenCL') + +def test_llama_bench_npu(): + run_llama_bench('HTP0') From 8e7d8b5a8af11663a523b9b239c2e947b0106fdf Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Fri, 17 Oct 2025 00:08:58 -0700 Subject: [PATCH 13/37] scripts: fix lint errors --- scripts/snapdragon/qdc/tests/test_bench.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/scripts/snapdragon/qdc/tests/test_bench.py b/scripts/snapdragon/qdc/tests/test_bench.py index b1213ea4bb6..39a3542e76e 100644 --- a/scripts/snapdragon/qdc/tests/test_bench.py +++ b/scripts/snapdragon/qdc/tests/test_bench.py @@ -8,15 +8,18 @@ model='../gguf/Llama-3.2-1B-Instruct-Q4_0.gguf' cli_pref=f'cd {pkg_path} && LD_LIBRARY_PATH={lib_path} ADSP_LIBRARY_PATH={lib_path} {bin_path}' + def run_cmd(cmd): p = subprocess.run(cmd, text = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT) sys.stdout.write(p.stdout) 
assert(p.returncode == 0) + @pytest.mark.dependency() def test_install(): - run_cmd([ 'adb', 'push', 'llama.cpp', f'{tmp_path}' ]) - run_cmd([ 'adb', 'shell', f'chmod 755 {bin_path}/*' ]) + run_cmd(['adb', 'push', 'llama.cpp', f'{tmp_path}']) + run_cmd(['adb', 'shell', f'chmod 755 {bin_path}/*']) + ## Basic cli tests def run_llama_cli(dev, opts): @@ -25,30 +28,35 @@ def run_llama_cli(dev, opts): f'-p "what is the most popular cookie in the world?\nPlease provide a very brief bullet point summary.\nBegin your answer with **BEGIN**."' ]) + @pytest.mark.dependency(depends=['test_install']) def test_llama_cli_cpu(): run_llama_cli('none', '-ctk q8_0 -ctv q8_0 -fa on') + @pytest.mark.dependency(depends=['test_install']) def test_llama_cli_gpu(): run_llama_cli('GPUOpenCL', '-fa on') + @pytest.mark.dependency(depends=['test_install']) def test_llama_cli_npu(): run_llama_cli('HTP0', '-ctk q8_0 -ctv q8_0 -fa on') + ## Basic bench tests def run_llama_bench(dev): - run_cmd([ 'adb', 'shell', - f'{cli_pref}/llama-bench -m {model} --device {dev} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32' - ]) + run_cmd(['adb', 'shell', f'{cli_pref}/llama-bench -m {model} --device {dev} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32']) + @pytest.mark.dependency(depends=['test_install']) def test_llama_bench_cpu(): run_llama_bench('none') + def test_llama_bench_gpu(): run_llama_bench('GPUOpenCL') + def test_llama_bench_npu(): run_llama_bench('HTP0') From 20aa68974bdcb07d55154cdaaf3754f7a5f5c7c6 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Fri, 17 Oct 2025 09:41:43 -0700 Subject: [PATCH 14/37] scripts: update qdc pytest script to make linter happy --- scripts/snapdragon/qdc/tests/test_bench.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/snapdragon/qdc/tests/test_bench.py b/scripts/snapdragon/qdc/tests/test_bench.py index 39a3542e76e..651ab5b7172 100644 --- a/scripts/snapdragon/qdc/tests/test_bench.py +++ b/scripts/snapdragon/qdc/tests/test_bench.py @@ -1,4 +1,6 @@ -import pytest,subprocess,sys +import pytest +import subprocess +import sys tmp_path='/data/local/tmp' pkg_path=f'{tmp_path}/llama.cpp' @@ -23,10 +25,9 @@ def test_install(): ## Basic cli tests def run_llama_cli(dev, opts): - run_cmd([ 'adb', 'shell', - f'{cli_pref}/llama-cli -m {model} --device {dev} -ngl 99 --batch-size 128 -n 128 -t 4 -no-cnv --seed 42 {opts} ' + - f'-p "what is the most popular cookie in the world?\nPlease provide a very brief bullet point summary.\nBegin your answer with **BEGIN**."' - ]) + prompt='what is the most popular cookie in the world?\nPlease provide a very brief bullet point summary.\nBegin your answer with **BEGIN**.' 
+ opts = '--batch-size 128 -n 128 -no-cnv --seed 42 ' + opts + run_cmd(['adb', 'shell', f'{cli_pref}/llama-cli -m {model} --device {dev} -ngl 99 -t 4 {opts} -p "{prompt}"']) @pytest.mark.dependency(depends=['test_install']) From 03e2b9c328c047ceefa7a6d4ab8ce547b49593fd Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Fri, 17 Oct 2025 18:28:37 -0700 Subject: [PATCH 15/37] hexagon: add reduce sum in fp32 --- ggml/src/ggml-hexagon/htp/hvx-utils.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 650ad48476b..b2ca8e88f46 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -405,6 +405,23 @@ static inline HVX_Vector hvx_vec_qf32_reduce_sum(HVX_Vector in) { return hvx_vec_qf32_reduce_sum_n(in, 32); } +static inline HVX_Vector hvx_vec_fp32_reduce_sum_n(HVX_Vector in, unsigned int n) { + unsigned int total = n * 4; // total vec nbytes + unsigned int width = 4; // fp32 nbytes + + HVX_Vector sum = in, sum_t; + while (width < total) { + sum_t = Q6_V_vror_VR(sum, width); // rotate right + sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum + width = width << 1; + } + return sum; +} + +static inline HVX_Vector hvx_vec_fp32_reduce_sum(HVX_Vector in) { + return hvx_vec_fp32_reduce_sum_n(in, 32); +} + static inline HVX_Vector hvx_vec_reduce_max_fp16(HVX_Vector in) { unsigned total = 128; // total vec nbytes unsigned width = 2; // fp16 nbytes From 384164dc00080dfd89ee9a11715625816020d114 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Fri, 17 Oct 2025 19:14:56 -0700 Subject: [PATCH 16/37] hexagon: reduce number of vector stores in matmul output --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 30 +++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index f92773ac1c6..006b8e2a6a4 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -464,12 +464,12 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); } - // Reduce and convert into fp32 - r0_sum = hvx_vec_qf32_reduce_sum(r0_sum); - r1_sum = hvx_vec_qf32_reduce_sum(r1_sum); + // Convert into fp32 and reduce + r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); + r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); + HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); - hvx_vec_store_u(&s[0], 4, Q6_Vsf_equals_Vqf32(r0_sum)); - hvx_vec_store_u(&s[1], 4, Q6_Vsf_equals_Vqf32(r1_sum)); + hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); } static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { @@ -637,12 +637,12 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); } - // Reduce and convert into fp32 - r0_sum = hvx_vec_qf32_reduce_sum(r0_sum); - r1_sum = hvx_vec_qf32_reduce_sum(r1_sum); + // Convert into fp32 and reduce + r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); + r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); + HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); - hvx_vec_store_u(&s[0], 4, Q6_Vsf_equals_Vqf32(r0_sum)); - hvx_vec_store_u(&s[1], 4, Q6_Vsf_equals_Vqf32(r1_sum)); + hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); } static void vec_dot_mxfp4x4x2_q8x4x2(const int n, @@ -879,12 +879,12 @@ static void 
vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n, r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); } - // Reduce and convert into fp32 - r0_sum = hvx_vec_qf32_reduce_sum(r0_sum); - r1_sum = hvx_vec_qf32_reduce_sum(r1_sum); + // Convert into fp32 and reduce + r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); + r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); + HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); - hvx_vec_store_u(&s[0], 4, Q6_Vsf_equals_Vqf32(r0_sum)); - hvx_vec_store_u(&s[1], 4, Q6_Vsf_equals_Vqf32(r1_sum)); + hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); } #if 1 From a314eb6978a31851f9d78dcbd243c4d206f13f28 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 18 Oct 2025 16:29:09 -0700 Subject: [PATCH 17/37] hexagon: remove the need for vdelta in reduce-multiply-x8 --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 48 ++++++-------------------- 1 file changed, 11 insertions(+), 37 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 006b8e2a6a4..26dd2ce1ec8 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -253,48 +253,22 @@ static inline HVX_Vector hvx_vec_rmpy_x8(HVX_Vector_x8 x, HVX_Vector_x8 y) { HVX_Vector r6 = Q6_Vw_vrmpy_VbVb(x.v[6], y.v[6]); HVX_Vector r7 = Q6_Vw_vrmpy_VbVb(x.v[7], y.v[7]); - HVX_VectorPair p0 = Q6_W_vshuff_VVR(r4, r0, 16); - HVX_VectorPair p1 = Q6_W_vshuff_VVR(r5, r1, 16); - HVX_VectorPair p2 = Q6_W_vshuff_VVR(r6, r2, 16); - HVX_VectorPair p3 = Q6_W_vshuff_VVR(r7, r3, 16); - + HVX_VectorPair p0 = Q6_W_vdeal_VVR(r1, r0, -4); + HVX_VectorPair p1 = Q6_W_vdeal_VVR(r3, r2, -4); + HVX_VectorPair p2 = Q6_W_vdeal_VVR(r5, r4, -4); + HVX_VectorPair p3 = Q6_W_vdeal_VVR(r7, r6, -4); r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1)); r2 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p2), Q6_V_hi_W(p2)); r3 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p3), Q6_V_hi_W(p3)); - p0 = Q6_W_vshuff_VVR(r2, r0, 8); - p1 = Q6_W_vshuff_VVR(r3, r1, 8); - r0 = Q6_Vw_vadd_VwVw(Q6_V_hi_W(p0), Q6_V_lo_W(p0)); - r1 = Q6_Vw_vadd_VwVw(Q6_V_hi_W(p1), Q6_V_lo_W(p1)); - - p0 = Q6_W_vshuff_VVR(r1, r0, 4); - r0 = Q6_Vw_vadd_VwVw(Q6_V_hi_W(p0), Q6_V_lo_W(p0)); - - static const uint8_t vrd[128] __attribute__((aligned(128))) = { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, - 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, 0x54, - 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, - 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, - 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, - 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, 0x7C, - 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, 0x3C, - }; - - static const uint8_t vd[128] __attribute__((aligned(128))) = { - 0x00, 0x00, 0x00, 0x00, 0x10, 0x10, 0x10, 0x10, 0x20, 0x20, 0x20, 0x20, 0x30, 0x30, 0x30, 0x30, - 0x24, 0x24, 0x24, 0x24, 0x34, 0x34, 0x34, 0x34, 0x04, 0x04, 0x04, 0x04, 0x14, 0x14, 0x14, 0x14, - 0x08, 0x08, 0x08, 0x08, 0x18, 0x18, 0x18, 0x18, 0x28, 0x28, 0x28, 0x28, 0x38, 0x38, 0x38, 0x38, - 0x2C, 0x2C, 0x2C, 0x2C, 0x3C, 0x3C, 0x3C, 0x3C, 0x0C, 0x0C, 0x0C, 0x0C, 0x1C, 0x1C, 
0x1C, 0x1C, - 0x30, 0x30, 0x30, 0x30, 0x20, 0x20, 0x20, 0x20, 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00, - 0x14, 0x14, 0x14, 0x14, 0x04, 0x04, 0x04, 0x04, 0x34, 0x34, 0x34, 0x34, 0x24, 0x24, 0x24, 0x24, - 0x38, 0x38, 0x38, 0x38, 0x28, 0x28, 0x28, 0x28, 0x18, 0x18, 0x18, 0x18, 0x08, 0x08, 0x08, 0x08, - 0x1C, 0x1C, 0x1C, 0x1C, 0x0C, 0x0C, 0x0C, 0x0C, 0x3C, 0x3C, 0x3C, 0x3C, 0x2C, 0x2C, 0x2C, 0x2C, - }; - - r0 = Q6_V_vrdelta_VV(r0, *(HVX_Vector *) vrd); - r0 = Q6_V_vdelta_VV(r0, *(HVX_Vector *) vd); + p0 = Q6_W_vdeal_VVR(r1, r0, -4); + p1 = Q6_W_vdeal_VVR(r3, r2, -4); + r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); + r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1)); + + p0 = Q6_W_vdeal_VVR(r1, r0, -4); + r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); return r0; } From 7f2d00bde3e333bc97344c53fa5945d2e8b81674 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 18 Oct 2025 18:15:40 -0700 Subject: [PATCH 18/37] hexagon: consistent use of reduce_sum_fp32 for row_sums --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 26dd2ce1ec8..21eedf77561 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -343,9 +343,9 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void * } // Reduce and convert into fp32 - r0_sum = hvx_vec_qf32_reduce_sum(r0_sum); + r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); - hvx_vec_store_u(&s[0], 4, Q6_Vsf_equals_Vqf32(r0_sum)); + hvx_vec_store_u(&s[0], 4, r0_sum); } static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, @@ -516,9 +516,9 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void * } // Reduce and convert into fp32 - r0_sum = hvx_vec_qf32_reduce_sum(r0_sum); + r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); - hvx_vec_store_u(&s[0], 4, Q6_Vsf_equals_Vqf32(r0_sum)); + hvx_vec_store_u(&s[0], 4, r0_sum); } static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, @@ -722,9 +722,9 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n, } // Reduce and convert into fp32 - r0_sum = hvx_vec_qf32_reduce_sum(r0_sum); + r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); - hvx_vec_store_u(&s[0], 4, Q6_Vsf_equals_Vqf32(r0_sum)); + hvx_vec_store_u(&s[0], 4, r0_sum); } static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n, From 5de19f8b07a7943453a31d1ef7002f5378087b4f Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sun, 19 Oct 2025 13:22:27 -0700 Subject: [PATCH 19/37] hexagon: some more matmul optimizations and comments Optimize cases where tensor dims are not multiple of 1024 (e.g in Qwen models). We've handled those cases already but at a higher overhead. 
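As a minimal scalar sketch of the dispatch idea only (not the HVX code in the diff below): the inner dot-product helper takes the element count as a compile-time constant so unused chunks fold away, and a thin wrapper rounds a runtime tail up to the nearest specialization. This is only safe if the packed tail past n is zero-filled, which the repack path is assumed to arrange; the names dot_i8_n and dot_i8_nloe are illustrative and not part of this patch.

    #include <stdint.h>

    // Specialized length: call with a compile-time constant so the compiler can
    // drop the chunks that are never reached.
    static inline int32_t dot_i8_n(const int8_t * x, const int8_t * y, unsigned n) {
        int32_t acc = 0;
        for (unsigned i = 0; i < n; i++) {
            acc += (int32_t) x[i] * (int32_t) y[i];
        }
        return acc;
    }

    // Runtime tail: round up to the nearest specialization. The elements past n
    // are assumed to be zero-padded, so the extra products contribute nothing.
    static inline int32_t dot_i8_nloe(const int8_t * x, const int8_t * y, unsigned n) {
        if (n <= 256) { return dot_i8_n(x, y, 256); }
        if (n <= 512) { return dot_i8_n(x, y, 512); }
        if (n <= 768) { return dot_i8_n(x, y, 768); }
        return dot_i8_n(x, y, 1024);
    }

The HVX version below applies the same trick per 128-byte vector instead of per element, which is why only a handful of cut-off points (256/512/768/1024) are needed.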
--- ggml/src/ggml-hexagon/htp/matmul-ops.c | 122 ++++++++++++++++--------- 1 file changed, 78 insertions(+), 44 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 21eedf77561..c99b6a0d18e 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -243,36 +243,70 @@ static inline HVX_Vector_x4 hvx_vec_load_x4_f32_as_f16(const uint8_t * restrict return r; } -static inline HVX_Vector hvx_vec_rmpy_x8(HVX_Vector_x8 x, HVX_Vector_x8 y) { - HVX_Vector r0 = Q6_Vw_vrmpy_VbVb(x.v[0], y.v[0]); - HVX_Vector r1 = Q6_Vw_vrmpy_VbVb(x.v[1], y.v[1]); - HVX_Vector r2 = Q6_Vw_vrmpy_VbVb(x.v[2], y.v[2]); - HVX_Vector r3 = Q6_Vw_vrmpy_VbVb(x.v[3], y.v[3]); - HVX_Vector r4 = Q6_Vw_vrmpy_VbVb(x.v[4], y.v[4]); - HVX_Vector r5 = Q6_Vw_vrmpy_VbVb(x.v[5], y.v[5]); - HVX_Vector r6 = Q6_Vw_vrmpy_VbVb(x.v[6], y.v[6]); - HVX_Vector r7 = Q6_Vw_vrmpy_VbVb(x.v[7], y.v[7]); - - HVX_VectorPair p0 = Q6_W_vdeal_VVR(r1, r0, -4); - HVX_VectorPair p1 = Q6_W_vdeal_VVR(r3, r2, -4); - HVX_VectorPair p2 = Q6_W_vdeal_VVR(r5, r4, -4); - HVX_VectorPair p3 = Q6_W_vdeal_VVR(r7, r6, -4); - r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); - r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1)); - r2 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p2), Q6_V_hi_W(p2)); - r3 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p3), Q6_V_hi_W(p3)); - - p0 = Q6_W_vdeal_VVR(r1, r0, -4); - p1 = Q6_W_vdeal_VVR(r3, r2, -4); - r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); - r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1)); - - p0 = Q6_W_vdeal_VVR(r1, r0, -4); - r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); +// Reduce multiply 1024 x 1024 int8 elements (32x q4/8 blocks in 8x HVX vectors). +// Accumulate each block into a single int32 value. +// Return a single HVX vector with 32x int32 accumulators. +// This version is parameterized to support less than 1024 elements. +// if() checks are optimized out at compile time -- make sure to pass N as a constexpr. 
+ +static inline HVX_Vector hvx_vec_rmpy_x8_n(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) { + HVX_Vector r0 = Q6_V_vsplat_R(0); + HVX_Vector r1 = Q6_V_vsplat_R(0); + HVX_Vector r2 = Q6_V_vsplat_R(0); + HVX_Vector r3 = Q6_V_vsplat_R(0); + HVX_Vector r4 = Q6_V_vsplat_R(0); + HVX_Vector r5 = Q6_V_vsplat_R(0); + HVX_Vector r6 = Q6_V_vsplat_R(0); + HVX_Vector r7 = Q6_V_vsplat_R(0); + + HVX_VectorPair p3; + HVX_VectorPair p2; + HVX_VectorPair p1; + HVX_VectorPair p0; + + if (n >= 128) { r0 = Q6_Vw_vrmpy_VbVb(x.v[0], y.v[0]); } + if (n >= 256) { r1 = Q6_Vw_vrmpy_VbVb(x.v[1], y.v[1]); } + if (n >= 384) { r2 = Q6_Vw_vrmpy_VbVb(x.v[2], y.v[2]); } + if (n >= 512) { r3 = Q6_Vw_vrmpy_VbVb(x.v[3], y.v[3]); } + if (n >= 640) { r4 = Q6_Vw_vrmpy_VbVb(x.v[4], y.v[4]); } + if (n >= 768) { r5 = Q6_Vw_vrmpy_VbVb(x.v[5], y.v[5]); } + if (n >= 896) { r6 = Q6_Vw_vrmpy_VbVb(x.v[6], y.v[6]); } + if (n >= 1024) { r7 = Q6_Vw_vrmpy_VbVb(x.v[7], y.v[7]); } + + if (n >= 128) { p0 = Q6_W_vdeal_VVR(r1, r0, -4); } + if (n >= 384) { p1 = Q6_W_vdeal_VVR(r3, r2, -4); } + if (n >= 640) { p2 = Q6_W_vdeal_VVR(r5, r4, -4); } + if (n >= 896) { p3 = Q6_W_vdeal_VVR(r7, r6, -4); } + + if (n >= 128) { r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); } + if (n >= 384) { r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1)); } + if (n >= 640) { r2 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p2), Q6_V_hi_W(p2)); } + if (n >= 896) { r3 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p3), Q6_V_hi_W(p3)); } + + if (n >= 128) { p0 = Q6_W_vdeal_VVR(r1, r0, -4); } + if (n >= 640) { p1 = Q6_W_vdeal_VVR(r3, r2, -4); } + + if (n >= 128) { r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); } + if (n >= 640) { r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1)); } + + if (n >= 128) { p0 = Q6_W_vdeal_VVR(r1, r0, -4); } + if (n >= 128) { r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); } return r0; } +static inline HVX_Vector hvx_vec_rmpy_x8_full(HVX_Vector_x8 x, HVX_Vector_x8 y) { + return hvx_vec_rmpy_x8_n(x, y, 1024); +} + +// Handle most common cases of tensors not multiple of 1024. 
+static inline HVX_Vector hvx_vec_rmpy_x8_nloe(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) { + if (n <= 256) { return hvx_vec_rmpy_x8_n(x, y, 256); }; + if (n <= 512) { return hvx_vec_rmpy_x8_n(x, y, 512); }; + if (n <= 768) { return hvx_vec_rmpy_x8_n(x, y, 768); }; + return hvx_vec_rmpy_x8_n(x, y, 1024); +} + static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % 32 == 0); // min sub-block size assert((unsigned long) vx % 128 == 0); @@ -309,7 +343,7 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void * HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); @@ -326,7 +360,7 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void * HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe)); HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); @@ -393,8 +427,8 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r1_q, vy_q)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); @@ -416,8 +450,8 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r1_q, vy_q)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe)); HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); @@ -482,7 +516,7 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void * HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); @@ -499,7 +533,7 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, 
float * restrict s, const void * HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe)); HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); @@ -566,8 +600,8 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size); HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r1_q, vy_q)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); @@ -589,8 +623,8 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size); HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r1_q, vy_q)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe)); HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); @@ -658,7 +692,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n, HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); @@ -690,7 +724,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n, HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); @@ -772,8 +806,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n, HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size); HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r1_q, vy_q)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); @@ -813,8 +847,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n, HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size); 
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8(r1_q, vy_q)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); From cf0242e306dfec1af10b8c74f23c84f4489fd23f Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 21 Oct 2025 14:08:02 -0700 Subject: [PATCH 20/37] hexagon: update cmake presets --- docs/backend/hexagon/CMakeUserPresets.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/backend/hexagon/CMakeUserPresets.json b/docs/backend/hexagon/CMakeUserPresets.json index e00e5c4d436..5422bc17307 100644 --- a/docs/backend/hexagon/CMakeUserPresets.json +++ b/docs/backend/hexagon/CMakeUserPresets.json @@ -19,6 +19,7 @@ "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}", "PREBUILT_LIB_DIR": "android_aarch64", "GGML_OPENMP": "OFF", + "GGML_LLAMAFILE": "OFF", "GGML_OPENCL": "ON", "GGML_HEXAGON": "ON", "LLAMA_CURL": "OFF" @@ -32,6 +33,7 @@ "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}", "PREBUILT_LIB_DIR": "windows_aarch64", "GGML_OPENMP": "OFF", + "GGML_LLAMAFILE": "OFF", "GGML_OPENCL": "ON", "GGML_HEXAGON": "ON", "LLAMA_CURL": "OFF" From 250e3a66252a8dc18abfcee6872d60113509ffb6 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 21 Oct 2025 14:08:38 -0700 Subject: [PATCH 21/37] hexagon: add OPMASK support for run-bench.sh wrapper --- scripts/snapdragon/adb/run-bench.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/snapdragon/adb/run-bench.sh b/scripts/snapdragon/adb/run-bench.sh index 4e2b7798092..25e0662016c 100755 --- a/scripts/snapdragon/adb/run-bench.sh +++ b/scripts/snapdragon/adb/run-bench.sh @@ -19,6 +19,9 @@ device="HTP0" verbose="" [ "$V" != "" ] && verbose="$V" +opmask= +[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" + nhvx= [ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" @@ -31,6 +34,6 @@ adb $adbserial shell " \ cd $basedir; \ LD_LIBRARY_PATH=$basedir/$branch/lib \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \ - $ndev $nhvx ./$branch/bin/llama-bench --device $device --mmap 0 -m $basedir/../gguf/$model \ + $ndev $nhvx $opmask ./$branch/bin/llama-bench --device $device --mmap 0 -m $basedir/../gguf/$model \ -t 4 --batch-size 128 -ngl 99 $@ \ " From 08a97e63b35221b80bdd1c0a0c1fa06154e5bf74 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 21 Oct 2025 14:14:35 -0700 Subject: [PATCH 22/37] hexagon: update to use GGML_BACKEND_API --- ggml/include/ggml-hexagon.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/include/ggml-hexagon.h b/ggml/include/ggml-hexagon.h index bdb432be949..6e079004103 100644 --- a/ggml/include/ggml-hexagon.h +++ b/ggml/include/ggml-hexagon.h @@ -8,11 +8,11 @@ extern "C" { #endif // backend API -GGML_API ggml_backend_t ggml_backend_hexagon_init(void); +GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(void); -GGML_API bool ggml_backend_is_hexagon(ggml_backend_t backend); +GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend); -GGML_API ggml_backend_reg_t ggml_backend_hexagon_reg(void); +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void); #ifdef __cplusplus } From 6d2d0bd2003ce89f6efe25b8c2da9a79a8f97066 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 21 Oct 
2025 14:21:57 -0700 Subject: [PATCH 23/37] hexagon: remove unused logic for setting tensor flags for the views --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index ee0111ec2d2..d836fac197d 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -405,9 +405,7 @@ static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buf (int) ctx->repack); if (tensor->view_src != NULL && tensor->view_offs == 0) { - // Just a view, reuse the same extra info - tensor->flags = tensor->view_src->flags; - tensor->extra = tensor->view_src->extra; + ; // nothing to do for the view } else { if (!ctx->mapped) { ctx->mmap(); From 18d7d20451b2ccf74f42b3a74bc28ea7e6e28ac2 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 21 Oct 2025 14:34:25 -0700 Subject: [PATCH 24/37] hexagon: add asserts to set/get_tensor to make sure we handle complete tensors Same asserts as the CPU backend. --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index d836fac197d..3b132b8b59a 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1286,6 +1286,9 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer, const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; auto sess = ctx->sess; @@ -1319,6 +1322,9 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer, void * data, size_t offset, size_t size) { + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; auto sess = ctx->sess; From 26a90a0b1e978ba90c0417f8f5aa4d4a4034465a Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 21 Oct 2025 16:13:18 -0700 Subject: [PATCH 25/37] hexagon: use cpy_tensor slow path for non-host buffers --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 3b132b8b59a..60d3971a207 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1356,16 +1356,11 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer, static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { - auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; - auto sess = ctx->sess; - - GGML_LOG_DEBUG("ggml-hex: %s copy-tensor %s -> %s size %zu\n", sess->name.c_str(), src->name, dst->name, - ggml_nbytes(src)); - - memcpy(dst->data, src->data, ggml_nbytes(src)); - return true; - GGML_UNUSED(buffer); + GGML_UNUSED(src); + GGML_UNUSED(dst); + // we might optimize this later, for now take the slow path (ie get/set_tensor) + return false; } static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { From a8e5ad82c391ae383f7f7710f772982fb2c51ce6 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 21 Oct 2025 16:38:59 -0700 Subject: [PATCH 26/37] hexagon: error checks in the buffer allocator --- 
ggml/src/ggml-hexagon/ggml-hexagon.cpp | 28 +++++++++++++++++--------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 60d3971a207..fc7f03725a2 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1388,19 +1388,28 @@ static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_ty return static_cast(buffer_type->context)->name.c_str(); } -static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, - size_t size) { +static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( + ggml_backend_buffer_type_t buffer_type, size_t size) { auto sess = static_cast(buffer_type->context)->sess; - ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/); - return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); + try { + ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/); + return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); + } catch (std::exception const &exc) { + GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); + return nullptr; + } } static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffer( - ggml_backend_buffer_type_t buffer_type, - size_t size) { + ggml_backend_buffer_type_t buffer_type, size_t size) { auto sess = static_cast(buffer_type->context)->sess; - ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/); - return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); + try { + ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/); + return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size); + } catch (std::exception const &exc) { + GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what()); + return nullptr; + } } static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) { @@ -1408,8 +1417,7 @@ static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer GGML_UNUSED(buffer_type); } -static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, - const struct ggml_tensor * t) { +static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * t) { return ggml_nbytes(t); } From dc001b9fd5587fc02eb33f76be550f2a87215ce2 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 21 Oct 2025 16:44:06 -0700 Subject: [PATCH 27/37] cmake: move include(extProj) under ggml-hexagon --- ggml/src/CMakeLists.txt | 1 - ggml/src/ggml-hexagon/CMakeLists.txt | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 8f60fa62f06..ba281b8e6d1 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1,6 +1,5 @@ include(CheckCXXCompilerFlag) include("../cmake/common.cmake") -include(ExternalProject) add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES}) diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt 
b/ggml/src/ggml-hexagon/CMakeLists.txt index 60a154694b3..166825c2c5f 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -1,4 +1,5 @@ include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake) +include(ExternalProject) option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF) From c749b869471c6303002db6941d25814e865212a8 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 21 Oct 2025 17:08:57 -0700 Subject: [PATCH 28/37] hexagon: don't forget to delete the backend on free --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index fc7f03725a2..03dce759e18 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -3054,8 +3054,9 @@ static const char * ggml_backend_hexagon_name(ggml_backend_t backend) { } static void ggml_backend_hexagon_free(ggml_backend_t backend) { - auto sess = static_cast(backend->context); - GGML_UNUSED(sess); + // we just need to delete the backend here + // the sessions are allocated & freed as part of the registry + delete backend; } static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) { From 0c01229e4cf11286c897307dc61a48cb9a1702ea Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 21 Oct 2025 17:13:50 -0700 Subject: [PATCH 29/37] hexagon: set/get_tensor size assert apply only to quantized tensors --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 03dce759e18..d6bdc5f0755 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1286,9 +1286,6 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer, const void * data, size_t offset, size_t size) { - GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); - auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; auto sess = ctx->sess; @@ -1298,16 +1295,19 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer, switch (tensor->type) { case GGML_TYPE_Q4_0: GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); repack_q4_0_q4x4x2(tensor, data, size); break; case GGML_TYPE_Q8_0: GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); repack_q8_0_q8x4x2(tensor, data, size); break; case GGML_TYPE_MXFP4: GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); repack_mxfp4_mxfp4x4x2(tensor, data, size); break; @@ -1322,9 +1322,6 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer, void * data, size_t offset, size_t size) { - GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); - auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; auto sess = ctx->sess; @@ -1334,16 +1331,19 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer, switch (tensor->type) { case GGML_TYPE_Q4_0: GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); repack_q4x4x2_q4_0(data, tensor, size); break; case GGML_TYPE_Q8_0: GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); repack_q8x4x2_q8_0(data, tensor, size); break; case GGML_TYPE_MXFP4: GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); 
repack_mxfp4x4x2_mxfp4(data, tensor, size); break; From 62ef4ebadfdbfa753931b9b7a8280adeb7d8f448 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 21 Oct 2025 17:39:29 -0700 Subject: [PATCH 30/37] hexagon: reintroduce HEX_VERBOSE wrapper for GGML_LOG_DEBUG for now GGML_LOG_DEBUG is always enabled for test-backend-ops and the output gets in the way. Ideally we need a bit more finer log levels. --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 71 ++++++++++++++------------ 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index d6bdc5f0755..03e736a84dc 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -51,6 +51,9 @@ static int opt_experimental = 0; static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMPUTE; static int opt_opsync = 0; // synchronous ops +#define HEX_VERBOSE(...) \ + if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__) + #define HEX_PROFILE(...) \ if (opt_profile) GGML_LOG_INFO(__VA_ARGS__) @@ -306,7 +309,7 @@ struct ggml_backend_hexagon_buffer_type_context { struct ggml_backend_hexagon_buffer_context { bool mmap_to(ggml_hexagon_session * s) { - GGML_LOG_DEBUG("ggml-hex: %s mmaping buffer: base %p domain-id %d session-id %d size %zu fd %d repack %d\n", + HEX_VERBOSE("ggml-hex: %s mmaping buffer: base %p domain-id %d session-id %d size %zu fd %d repack %d\n", s->name.c_str(), (void *) this->base, s->domain_id, s->session_id, this->size, this->fd, (int) this->repack); @@ -357,7 +360,7 @@ struct ggml_backend_hexagon_buffer_context { return; } - GGML_LOG_DEBUG("ggml-hex: %s allocated buffer: base %p size %zu fd %d repack %d\n", sess->name.c_str(), + HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d repack %d\n", sess->name.c_str(), (void *) this->base, size, this->fd, (int) repack); this->sess = sess; @@ -400,7 +403,7 @@ static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buf auto ctx = static_cast(buffer->context); auto sess = ctx->sess; - GGML_LOG_DEBUG("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d repack %d\n", sess->name.c_str(), + HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d repack %d\n", sess->name.c_str(), tensor->name, (void *) ctx->base, tensor->data, ggml_nbytes(tensor), (int) buffer->usage, (int) ctx->repack); @@ -425,7 +428,7 @@ static x2_q4 unpack_q4(uint8_t v) { } static void dump_block_q4_0(const block_q4_0 * b, int i) { - GGML_LOG_DEBUG("ggml-hex: repack q4_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_q4(b->qs[0]).v[0], + HEX_VERBOSE("ggml-hex: repack q4_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_q4(b->qs[0]).v[0], unpack_q4(b->qs[1]).v[0], unpack_q4(b->qs[2]).v[0], unpack_q4(b->qs[3]).v[0], unpack_q4(b->qs[12]).v[1], unpack_q4(b->qs[13]).v[1], unpack_q4(b->qs[14]).v[1], unpack_q4(b->qs[15]).v[1], GGML_FP16_TO_FP32(b->d)); @@ -443,13 +446,13 @@ static void dump_packed_block_q4x4x2(const uint8_t * v, unsigned int i, size_t k const uint8_t * q = v_q + i * qblk_size; const ggml_half * d = (const ggml_half *) (v_d + i * dblk_size); - GGML_LOG_DEBUG("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i, + HEX_VERBOSE("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... 
%d %d %d %d : %.6f %.6f %.6f %.6f\n", i, unpack_q4(q[0]).v[0], unpack_q4(q[1]).v[0], unpack_q4(q[2]).v[0], unpack_q4(q[3]).v[0], unpack_q4(q[60]).v[0], unpack_q4(q[61]).v[0], unpack_q4(q[62]).v[0], unpack_q4(q[63]).v[0], unpack_q4(q[124]).v[0], unpack_q4(q[125]).v[0], unpack_q4(q[126]).v[0], unpack_q4(q[127]).v[0], GGML_FP16_TO_FP32(d[0]), GGML_FP16_TO_FP32(d[1]), GGML_FP16_TO_FP32(d[2]), GGML_FP16_TO_FP32(d[3])); - GGML_LOG_DEBUG("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", + HEX_VERBOSE("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i + 1, unpack_q4(q[0]).v[1], unpack_q4(q[1]).v[1], unpack_q4(q[2]).v[1], unpack_q4(q[3]).v[1], unpack_q4(q[60]).v[1], unpack_q4(q[61]).v[1], unpack_q4(q[62]).v[1], unpack_q4(q[63]).v[1], unpack_q4(q[124]).v[1], unpack_q4(q[125]).v[1], unpack_q4(q[126]).v[1], unpack_q4(q[127]).v[1], @@ -658,7 +661,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) void * buf_rp = ggml_aligned_malloc(row_size_rp); GGML_ASSERT(buf_rp != NULL); - GGML_LOG_DEBUG("ggml-hex: repack-q4_0-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, + HEX_VERBOSE("ggml-hex: repack-q4_0-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, t->ne[0], nrows, row_size); init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros @@ -690,7 +693,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) void * buf_rp = ggml_aligned_malloc(row_size_rp); GGML_ASSERT(buf_rp != NULL); - GGML_LOG_DEBUG("ggml-hex: repack-q4x4x2-q4_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, + HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, t->ne[0], nrows, row_size); memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros @@ -710,7 +713,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) // ======== Q8x4x2 ==================== static void dump_block_q8_0(const block_q8_0 * b, int i) { - GGML_LOG_DEBUG("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2], + HEX_VERBOSE("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2], b->qs[3], b->qs[28], b->qs[29], b->qs[30], b->qs[31], GGML_FP16_TO_FP32(b->d)); } @@ -726,11 +729,11 @@ static void dump_packed_block_q8x4x2(const uint8_t * v, unsigned int i, size_t k const uint8_t * q = v_q + i * qblk_size; const ggml_half * d = (const ggml_half *) (v_d + i * dblk_size); - GGML_LOG_DEBUG("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i, + HEX_VERBOSE("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i, q[0], q[1], q[2], q[3], q[60], q[61], q[62], q[63], q[124], q[125], q[126], q[127], GGML_FP16_TO_FP32(d[0]), GGML_FP16_TO_FP32(d[1]), GGML_FP16_TO_FP32(d[2]), GGML_FP16_TO_FP32(d[3])); - GGML_LOG_DEBUG("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", + HEX_VERBOSE("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... 
%d %d %d %d : %.6f %.6f %.6f %.6f\n", i + 1, q[128], q[129], q[130], q[131], q[192], q[193], q[194], q[195], q[252], q[253], q[254], q[255], GGML_FP16_TO_FP32(d[4]), GGML_FP16_TO_FP32(d[5]), GGML_FP16_TO_FP32(d[6]), GGML_FP16_TO_FP32(d[7])); } @@ -932,7 +935,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) void * buf_rp = ggml_aligned_malloc(row_size_rp); GGML_ASSERT(buf_rp != NULL); - GGML_LOG_DEBUG("ggml-hex: repack-q8_0-q8x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, + HEX_VERBOSE("ggml-hex: repack-q8_0-q8x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, t->ne[0], nrows, row_size); init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros @@ -964,7 +967,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) void * buf_rp = ggml_aligned_malloc(row_size_rp); GGML_ASSERT(buf_rp != NULL); - GGML_LOG_DEBUG("ggml-hex: repack-q8x4x2-q8_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, + HEX_VERBOSE("ggml-hex: repack-q8x4x2-q8_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, t->ne[0], nrows, row_size); memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros @@ -995,7 +998,7 @@ static x2_mxfp4 unpack_mxfp4(uint8_t v) { } static void dump_block_mxfp4(const block_mxfp4 * b, int i) { - GGML_LOG_DEBUG("ggml-hex: repack mxfp4 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_mxfp4(b->qs[0]).v[0], + HEX_VERBOSE("ggml-hex: repack mxfp4 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_mxfp4(b->qs[0]).v[0], unpack_mxfp4(b->qs[1]).v[0], unpack_mxfp4(b->qs[2]).v[0], unpack_mxfp4(b->qs[3]).v[0], unpack_mxfp4(b->qs[12]).v[1], unpack_mxfp4(b->qs[13]).v[1], unpack_mxfp4(b->qs[14]).v[1], unpack_mxfp4(b->qs[15]).v[1], GGML_E8M0_TO_FP32_HALF(b->e)); @@ -1013,14 +1016,14 @@ static void dump_packed_block_mxfp4x4x2(const uint8_t * v, unsigned int i, size_ const uint8_t * q = v_q + i * qblk_size; const uint8_t * e = (const uint8_t *) (v_e + i * eblk_size); - GGML_LOG_DEBUG("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i, + HEX_VERBOSE("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i, unpack_mxfp4(q[0]).v[0], unpack_mxfp4(q[1]).v[0], unpack_mxfp4(q[2]).v[0], unpack_mxfp4(q[3]).v[0], unpack_mxfp4(q[60]).v[0], unpack_mxfp4(q[61]).v[0], unpack_mxfp4(q[62]).v[0], unpack_mxfp4(q[63]).v[0], unpack_mxfp4(q[124]).v[0], unpack_mxfp4(q[125]).v[0], unpack_mxfp4(q[126]).v[0], unpack_mxfp4(q[127]).v[0], GGML_E8M0_TO_FP32_HALF(e[0]), GGML_E8M0_TO_FP32_HALF(e[1]), GGML_E8M0_TO_FP32_HALF(e[2]), GGML_E8M0_TO_FP32_HALF(e[3])); - GGML_LOG_DEBUG("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", + HEX_VERBOSE("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... 
%d %d %d %d : %.6f %.6f %.6f %.6f\n", i + 1, unpack_mxfp4(q[0]).v[1], unpack_mxfp4(q[1]).v[1], unpack_mxfp4(q[2]).v[1], unpack_mxfp4(q[3]).v[1], unpack_mxfp4(q[60]).v[1], unpack_mxfp4(q[61]).v[1], unpack_mxfp4(q[62]).v[1], unpack_mxfp4(q[63]).v[1], unpack_mxfp4(q[124]).v[1], unpack_mxfp4(q[125]).v[1], @@ -1231,7 +1234,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si void * buf_rp = ggml_aligned_malloc(row_size_rp); GGML_ASSERT(buf_rp != NULL); - GGML_LOG_DEBUG("ggml-hex: repack-mxfp4-mxfp4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, + HEX_VERBOSE("ggml-hex: repack-mxfp4-mxfp4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, t->ne[0], nrows, row_size); init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros @@ -1263,7 +1266,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si void * buf_rp = ggml_aligned_malloc(row_size_rp); GGML_ASSERT(buf_rp != NULL); - GGML_LOG_DEBUG("ggml-hex: repack-mxfp4x4x2-mxfp4 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, + HEX_VERBOSE("ggml-hex: repack-mxfp4x4x2-mxfp4 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, t->ne[0], nrows, row_size); memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros @@ -1289,7 +1292,7 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer, auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; auto sess = ctx->sess; - GGML_LOG_DEBUG("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data, + HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data, offset, size); switch (tensor->type) { @@ -1325,7 +1328,7 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer, auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; auto sess = ctx->sess; - GGML_LOG_DEBUG("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data, + HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data, offset, size); switch (tensor->type) { @@ -1366,7 +1369,7 @@ static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t bu static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; auto sess = ctx->sess; - GGML_LOG_DEBUG("ggml-hex: %s clear-buff base %p size %zu\n", sess->name.c_str(), (void *) ctx->base, ctx->size); + HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->name.c_str(), (void *) ctx->base, ctx->size); memset(ctx->base, value, ctx->size); } @@ -2128,7 +2131,7 @@ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer auto buf = static_cast(t->buffer->context); auto sess = buf->sess; - GGML_LOG_DEBUG("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(), + HEX_VERBOSE("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(), t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset, (unsigned int) d->size); } @@ -2211,7 +2214,7 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) hex_format_op_buffs(buffs, op); 
hex_format_op_names(names, op); - GGML_LOG_DEBUG("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), + HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), names, dims, types, strides, buffs, req.flags); if (opt_verbose > 1) { hex_dump_dspbuf(src0, &bufs[0]); @@ -2348,7 +2351,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag hex_format_op_buffs(buffs, op); hex_format_op_names(names, op); - GGML_LOG_DEBUG("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), + HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), names, dims, types, strides, buffs, req.flags); if (opt_verbose > 1) { @@ -2496,7 +2499,7 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { hex_format_op_buffs(buffs, op); hex_format_op_names(names, op); - GGML_LOG_DEBUG("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), + HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags); if (opt_verbose > 1) { hex_dump_dspbuf(src0, &bufs[0]); @@ -2635,7 +2638,7 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) { hex_format_op_buffs(buffs, op); hex_format_op_names(names, op); - GGML_LOG_DEBUG("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), + HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags); if (opt_verbose > 1) { @@ -2815,7 +2818,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { hex_format_op_buffs(buffs, op); hex_format_op_names(names, op); - GGML_LOG_DEBUG("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), + HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), names, dims, types, strides, buffs, req.flags); if (opt_verbose > 1) { hex_dump_dspbuf(src0, &bufs[0]); @@ -2988,7 +2991,7 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) { hex_format_op_buffs(buffs, op); hex_format_op_names(names, op); - GGML_LOG_DEBUG("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), + HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), names, dims, types, strides, buffs, req.flags); if (opt_verbose > 1) { hex_dump_dspbuf(src0, &bufs[0]); @@ -3092,7 +3095,7 @@ static inline int last_compute_op(ggml_cgraph * graph) { static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { auto sess = static_cast(backend->context); - GGML_LOG_DEBUG("ggml-hex: %s graph-compute n_nodes %d\n", sess->name.c_str(), graph->n_nodes); + HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->name.c_str(), graph->n_nodes); const int last = last_compute_op(graph); @@ -3176,7 +3179,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) { auto sess = static_cast(backend->context); - GGML_LOG_DEBUG("ggml-hex: %s synchronize\n", sess->name.c_str()); + HEX_VERBOSE("ggml-hex: %s 
synchronize\n", sess->name.c_str()); // Wait until all pending ops complete while (sess->op_pending) { @@ -3515,7 +3518,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons hex_format_op_buffs(buffs, op); hex_format_op_names(names, op); - GGML_LOG_DEBUG("ggml-hex: %s device-supports-op %s : %s : %s : %s : %s : %s : (%d)\n", sess->name.c_str(), + HEX_VERBOSE("ggml-hex: %s device-supports-op %s : %s : %s : %s : %s : %s : (%d)\n", sess->name.c_str(), ggml_op_name(op->op), names, dims, types, strides, buffs, (int) supp); } @@ -3535,14 +3538,14 @@ static bool ggml_backend_hexagon_device_supports_buft(ggml_backend_dev_t dev, gg // Need session/domain-id for buffers to be compatible bool supp = (s0->session_id == s1->session_id); - GGML_LOG_DEBUG("ggml-hex: %s device-supports-buft %s (%d)\n", s0->name.c_str(), s1->name.c_str(), (int) supp); + HEX_VERBOSE("ggml-hex: %s device-supports-buft %s (%d)\n", s0->name.c_str(), s1->name.c_str(), (int) supp); return supp; } static ggml_backend_buffer_type_t * ggml_backend_hexagon_device_get_extra_buffers_type(ggml_backend_dev_t dev) { auto s0 = static_cast(dev->context); - GGML_LOG_DEBUG("ggml-hex: device-get-extra-buft : %s \n", s0->name.c_str()); + HEX_VERBOSE("ggml-hex: device-get-extra-buft : %s \n", s0->name.c_str()); static ggml_backend_buffer_type_t bufts[2]; bufts[0] = ggml_backend_hexagon_device_get_repack_buffer_type(dev); @@ -3687,7 +3690,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { reg->context = new ggml_hexagon_registry(reg); - GGML_LOG_DEBUG("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req), + HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req), sizeof(struct htp_general_rsp)); } From 19041f7d183a9d5647106d925bd7b9d2122df579 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 22 Oct 2025 08:32:21 -0700 Subject: [PATCH 31/37] docs: typos in hexagon developer docs (libggm-...) --- docs/backend/hexagon/developer.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/backend/hexagon/developer.md b/docs/backend/hexagon/developer.md index aa7aa2cb62a..200a7aabc01 100644 --- a/docs/backend/hexagon/developer.md +++ b/docs/backend/hexagon/developer.md @@ -7,8 +7,8 @@ The Hexagon backend consist of two parts: - `libggml-hexagon` This is the regular CPU-side GGML backend library, either shared or statically linked - - `libggm-htp-vNN` - This is the NPU-side (HTP stands for Hexagon Tensor Process) shared library that contains the Op dispatcher and kernels. + - `libggml-htp-vNN` + This is the NPU-side (HTP stands for Hexagon Tensor Processor) shared library that contains the Op dispatcher and kernels. The correct library is selected automatically at runtime based on the HW version. Here is an example of the build artifacts From 3e4ff7398df892d90513eb2ce29d1a50d86203c4 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 22 Oct 2025 09:54:37 -0700 Subject: [PATCH 32/37] hexagon: overhaul error handling in the session/device allocation this should handle all failure paths in the session allocation. 
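The shape of the change, as a hedged standalone sketch (illustrative names, not the backend's real FastRPC/dspqueue calls): construction is split into allocate()/release(), each acquired resource flips a valid_* flag, and release() only tears down what was actually acquired, so allocate() can throw at any point without leaking or double-freeing.

    #include <stdexcept>

    struct session {
        bool valid_handle = false;
        bool valid_queue  = false;

        session()  { try { allocate(); } catch (...) { release(); throw; } }
        ~session() { release(); }

        void allocate() {
            if (!open_handle()) { throw std::runtime_error("open failed"); }
            valid_handle = true;
            if (!open_queue())  { throw std::runtime_error("queue failed"); }
            valid_queue = true;
        }

        void release() noexcept {
            // tear down only what was acquired, in reverse order
            if (valid_queue)  { close_queue();  }
            if (valid_handle) { close_handle(); }
        }

        // stand-ins for the real open/close calls
        bool open_handle() { return true; }
        bool open_queue()  { return true; }
        void close_queue()  noexcept {}
        void close_handle() noexcept {}
    };

Tracking per-resource flags rather than aborting on the first failure is what lets a half-initialized session be torn down safely and the device be reported as unavailable instead of crashing the process.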
--- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 131 +++++++++++++++++-------- 1 file changed, 90 insertions(+), 41 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 03e736a84dc..9b8220e0e1d 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -216,20 +216,26 @@ static inline void hex_format_op_names(char * str, const struct ggml_tensor * t) // ** backend sessions struct ggml_hexagon_session { - ggml_hexagon_session(int dev_id); - ~ggml_hexagon_session(); + ggml_hexagon_session(int dev_id) noexcept(false); + ~ggml_hexagon_session() noexcept(true); + + void allocate(int dev_id) noexcept(false); + void release() noexcept(true); ggml_backend_buffer_type buffer_type; ggml_backend_buffer_type repack_buffer_type; - std::string name; - remote_handle64 handle; - dspqueue_t queue; - uint32_t session_id; - uint32_t domain_id; - uint64_t queue_id; - int dev_id; - + std::string name; + remote_handle64 handle; + dspqueue_t queue; + uint32_t session_id; + uint32_t domain_id; + uint64_t queue_id; + int dev_id; + bool valid_session; + bool valid_handle; + bool valid_queue; + bool valid_iface; std::atomic op_pending; uint32_t prof_usecs; uint32_t prof_cycles; @@ -349,7 +355,7 @@ struct ggml_backend_hexagon_buffer_context { this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); if (!this->base) { GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size); - return; + throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)"); } this->fd = rpcmem_to_fd(this->base); @@ -357,7 +363,7 @@ struct ggml_backend_hexagon_buffer_context { GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->name.c_str(), (void *) this->base); rpcmem_free(this->base); this->base = NULL; - return; + throw std::runtime_error("ggml-hex: rpcmem_to_fd failed (see log for details)"); } HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d repack %d\n", sess->name.c_str(), @@ -1457,7 +1463,12 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf /* .is_host = */ ggml_backend_hexagon_repack_buffer_type_is_host, }; -ggml_hexagon_session::ggml_hexagon_session(int dev_id) { +void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { + this->valid_session = false; + this->valid_handle = false; + this->valid_queue = false; + this->valid_iface = false; + this->domain_id = 3; // Default for CDSP, updated after the session is created this->session_id = 0; // Default for CDSP, updated after the session is created this->dev_id = dev_id; @@ -1472,7 +1483,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id) { domain * my_domain = get_domain(this->domain_id); if (my_domain == NULL) { - GGML_ABORT("ggml-hex: unable to get domain struct for CDSP\n"); + GGML_LOG_ERROR("ggml-hex: unable to get domain struct for CDSP\n"); + throw std::runtime_error("ggml-hex: failed to get CDSP domain (see log for details)"); } // Create new session @@ -1485,12 +1497,14 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id) { int err = remote_session_control(FASTRPC_RESERVE_NEW_SESSION, (void *) &n, sizeof(n)); if (err != AEE_SUCCESS) { - GGML_ABORT("ggml-hex: remote_session_control failed to reserve new session %d : error 0x%x\n", dev_id, err); + GGML_LOG_ERROR("ggml-hex: failed to reserve new session %d : error 0x%x\n", dev_id, err); + throw std::runtime_error("ggml-hex: 
remote_session_control(new-sess) failed (see log for details)"); } // Save the IDs this->session_id = n.session_id; this->domain_id = n.effective_domain_id; + this->valid_session = true; } // Get session URI @@ -1510,7 +1524,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id) { int err = remote_session_control(FASTRPC_GET_URI, (void *) &u, sizeof(u)); if (err != AEE_SUCCESS) { - GGML_ABORT("ggml-hex: remote_session_control failed to get URI for session %d : error 0x%x\n", dev_id, err); + GGML_LOG_ERROR("ggml-hex: failed to get URI for session %d : error 0x%x\n", dev_id, err); + throw std::runtime_error("ggml-hex: remote_session_control(get-uri) failed (see log for details)"); } } @@ -1521,17 +1536,20 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id) { u.enable = 1; int err = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, (void *) &u, sizeof(u)); if (err != AEE_SUCCESS) { - GGML_ABORT("ggml-hex: remote_session_control failed to enable unsigned PD for session %d : error 0x%x\n", - dev_id, err); + GGML_LOG_ERROR("ggml-hex: failed to enable unsigned PD for session %d : error 0x%x\n", dev_id, err); + throw std::runtime_error("ggml-hex: remote_session_control(unsign) failed (see log for details)"); } } // Open session int err = htp_iface_open(session_uri, &this->handle); if (err != AEE_SUCCESS) { - GGML_ABORT("ggml-hex: failed to open session %d : error 0x%x\n", dev_id, err); + GGML_LOG_ERROR("ggml-hex: failed to open session %d : error 0x%x\n", dev_id, err); + throw std::runtime_error("ggml-hex: failed to open session (see log for details)"); } + this->valid_handle = true; + GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(), this->session_id, this->domain_id, session_uri, (unsigned long) this->handle); @@ -1542,7 +1560,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id) { int err = remote_handle64_control(this->handle, DSPRPC_CONTROL_LATENCY, (void *) &l, sizeof(l)); if (err != 0) { - GGML_LOG_ERROR("ggml-hex: failed to enable fastrpc QOS mode: 0x%08x\n", (unsigned) err); + GGML_LOG_WARN("ggml-hex: failed to enable fastrpc QOS mode: 0x%08x\n", (unsigned) err); } } @@ -1554,15 +1572,18 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id) { htp_packet_callback, htp_error_callback, (void *) this, // Callback context &queue); - if (err != 0) { - GGML_ABORT("ggml-hex: dspqueue_create failed: 0x%08x\n", (unsigned) err); + GGML_LOG_ERROR("ggml-hex: %s dspqueue_create failed: 0x%08x\n", this->name.c_str(), (unsigned) err); + throw std::runtime_error("ggml-hex: failed to create dspqueue (see log for details)"); } + this->valid_queue = true; + // Export queue for use on the DSP err = dspqueue_export(queue, &this->queue_id); if (err != 0) { - GGML_ABORT("ggml-hex: dspqueue_export failed: 0x%08x\n", (unsigned) err); + GGML_LOG_ERROR("ggml-hex: dspqueue_export failed: 0x%08x\n", (unsigned) err); + throw std::runtime_error("ggml-hex: dspqueue export failed (see log for details)"); } if (opt_etm) { @@ -1577,23 +1598,23 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id) { // listening for packets in a callback. 
err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx); if (err != 0) { - GGML_ABORT("ggml-hex: htp_iface_start failed: 0x%08x\n", (unsigned) err); + GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err); + throw std::runtime_error("ggml-hex: iface start failed (see log for details)"); } - - buffer_type.iface = ggml_backend_hexagon_buffer_type_interface; - buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name, this); - - repack_buffer_type.iface = ggml_backend_hexagon_repack_buffer_type_interface; - repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this); + this->valid_iface = true; } -ggml_hexagon_session::~ggml_hexagon_session() { +void ggml_hexagon_session::release() noexcept(true) { GGML_LOG_INFO("ggml-hex: releasing session: %s\n", this->name.c_str()); + int err; + // Stop the DSP-side service and close the queue - int err = htp_iface_stop(this->handle); - if (err != 0) { - GGML_ABORT("ggml-hex: htp_iface_stop failed: 0x%08x\n", (unsigned) err); + if (this->valid_iface) { + err = htp_iface_stop(this->handle); + if (err != 0) { + GGML_ABORT("ggml-hex: htp_iface_stop failed: 0x%08x\n", (unsigned) err); + } } if (opt_etm) { @@ -1603,12 +1624,35 @@ ggml_hexagon_session::~ggml_hexagon_session() { } } - err = dspqueue_close(queue); - if (err != 0) { - GGML_ABORT("ggml-hex: dspqueue_close failed: 0x%08x\n", (unsigned) err); + if (this->valid_queue) { + err = dspqueue_close(queue); + if (err != 0) { + GGML_ABORT("ggml-hex: dspqueue_close failed: 0x%08x\n", (unsigned) err); + } + } + + if (this->valid_handle) { + htp_iface_close(this->handle); + } +} + +ggml_hexagon_session::ggml_hexagon_session(int dev_id) noexcept(false) { + try { + allocate(dev_id); + + buffer_type.iface = ggml_backend_hexagon_buffer_type_interface; + buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name, this); + + repack_buffer_type.iface = ggml_backend_hexagon_repack_buffer_type_interface; + repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this); + } catch (std::exception const &exc) { + release(); + throw; } +} - htp_iface_close(this->handle); +ggml_hexagon_session::~ggml_hexagon_session() noexcept(true) { + release(); } // ** backend interface @@ -3599,7 +3643,12 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { for (size_t i = 0; i < opt_ndev; i++) { devices[i].iface = ggml_backend_hexagon_device_i; devices[i].reg = reg; - devices[i].context = new ggml_hexagon_session(i); + try { + devices[i].context = new ggml_hexagon_session(i); + } catch (std::exception const &exc) { + GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i); + devices[i].context = nullptr; + } } } @@ -3626,8 +3675,8 @@ static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg) static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t reg, size_t index) { auto hreg = static_cast(reg->context); - if (index >= opt_ndev) { - return NULL; + if (index >= opt_ndev || !hreg->devices[index].context) { + return nullptr; } return &hreg->devices[index]; From 6acc28548cc4ef1b1bb9880ae058bd3c377950f4 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 22 Oct 2025 11:13:36 -0700 Subject: [PATCH 33/37] hexagon: update cmake presets to enable fp16 vectors --- docs/backend/hexagon/CMakeUserPresets.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/docs/backend/hexagon/CMakeUserPresets.json b/docs/backend/hexagon/CMakeUserPresets.json index 5422bc17307..e0b19db0f5a 100644 --- a/docs/backend/hexagon/CMakeUserPresets.json +++ b/docs/backend/hexagon/CMakeUserPresets.json @@ -10,8 +10,8 @@ "ANDROID_ABI": "arm64-v8a", "ANDROID_PLATFORM": "android-31", "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake", - "CMAKE_C_FLAGS": "-march=armv8.7a -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", - "CMAKE_CXX_FLAGS": "-march=armv8.7a -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", From dda466cf923234caf3346f74788a9df072e9d497 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 22 Oct 2025 11:15:52 -0700 Subject: [PATCH 34/37] hexagon: remove unused time_usec function --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 9b8220e0e1d..b2b2ef0e631 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -82,11 +82,6 @@ static const char * status_to_str(uint32_t status) { } } -static uint64_t time_usec(void) { - auto now = std::chrono::high_resolution_clock::now().time_since_epoch(); - return std::chrono::duration_cast(now).count(); -} - // ** debug helpers static inline int hex_format_tensor_dims(char * str, const struct ggml_tensor * t) { From b0e5beb9c5219d571dc3bee02c6fe8c53e4a84b8 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 22 Oct 2025 11:31:17 -0700 Subject: [PATCH 35/37] hexagon: don't forget to release buffer contexts --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index b2b2ef0e631..848d107b5c1 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1632,6 +1632,9 @@ void ggml_hexagon_session::release() noexcept(true) { } ggml_hexagon_session::ggml_hexagon_session(int dev_id) noexcept(false) { + buffer_type.context = nullptr; + repack_buffer_type.context = nullptr; + try { allocate(dev_id); @@ -1648,6 +1651,9 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id) noexcept(false) { ggml_hexagon_session::~ggml_hexagon_session() noexcept(true) { release(); + + delete static_cast(buffer_type.context); + delete static_cast(repack_buffer_type.context); } // ** backend interface From 3049de502e4eaacc1056fe471390130aeea17046 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 22 Oct 2025 11:42:47 -0700 Subject: [PATCH 36/37] hexagon: fixed indents in hvx-utils (missed clang-format auto-format failure) --- ggml/src/ggml-hexagon/htp/hvx-utils.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index a056e165326..d3599bc9c12 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -88,9 +88,9 @@ void hvx_mul_f32_opt(const uint8_t * restrict src0, const uint8_t * restrict src1, uint8_t * restrict dst, 
const int num_elems) { - htp_binary_ops_preamble + htp_binary_ops_preamble; - for (int i = 0; i < step_of_4; i++) { + for (int i = 0; i < step_of_4; i++) { HVX_Vector v1a = *(HVX_Vector *) src0_curr; HVX_Vector v1b = *(HVX_Vector *) src1_curr; @@ -129,6 +129,7 @@ void hvx_mul_f32_opt(const uint8_t * restrict src0, dst_curr += 4 * VLEN; } + for (int i = 0; i < step_of_2; i++) { HVX_Vector v1a = *(HVX_Vector *) src0_curr; @@ -152,6 +153,7 @@ void hvx_mul_f32_opt(const uint8_t * restrict src0, dst_curr += 2 * VLEN; } + for (int i = 0; i < step_of_1; i++) { HVX_Vector va = *(HVX_Vector *) src0_curr; @@ -167,6 +169,7 @@ void hvx_mul_f32_opt(const uint8_t * restrict src0, dst_curr += VLEN; } + if (remaining > 0) { HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr); hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v)); @@ -297,9 +300,9 @@ void hvx_add_f32_opt(const uint8_t * restrict src0, const uint8_t * restrict src1, uint8_t * restrict dst, const int num_elems) { - htp_binary_ops_preamble + htp_binary_ops_preamble; - for (int i = 0; i < step_of_4; i++) { + for (int i = 0; i < step_of_4; i++) { HVX_Vector v1a = *(HVX_Vector *) src0_curr; HVX_Vector v1b = *(HVX_Vector *) src1_curr; @@ -539,9 +542,9 @@ void hvx_sub_f32_opt(const uint8_t * restrict src0, const uint8_t * restrict src1, uint8_t * restrict dst, const int num_elems) { - htp_binary_ops_preamble + htp_binary_ops_preamble; - for (int i = 0; i < step_of_4; i++) { + for (int i = 0; i < step_of_4; i++) { HVX_Vector v1a = *(HVX_Vector *) src0_curr; HVX_Vector v1b = *(HVX_Vector *) src1_curr; From f7d74118d103e790fe9a21dccbf853296907aded Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 22 Oct 2025 12:21:35 -0700 Subject: [PATCH 37/37] hexagon: remove custom can_repeat function and use ggml_can_repeat --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 848d107b5c1..ecfc1c856cb 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1683,23 +1683,6 @@ static bool hex_supported_dims2(const struct ggml_tensor * x, const struct ggml_ return true; } -static bool hex_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { - if ((t1->ne[0] % t0->ne[0]) != 0) { - return false; - } - if ((t1->ne[1] % t0->ne[1]) != 0) { - return false; - } - if ((t1->ne[2] % t0->ne[2]) != 0) { - return false; - } - if ((t1->ne[3] % t0->ne[3]) != 0) { - return false; - } - - return true; -} - static bool hex_supported_src0_type(ggml_type t) { return t == GGML_TYPE_F32; } @@ -1879,7 +1862,7 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se if (!hex_supported_dims2(src0, dst)) { return false; } - if (!hex_can_repeat(src1, src0)) { + if (!ggml_can_repeat(src1, src0)) { return false; }
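
Note on the last hunk: ggml_can_repeat(src1, src0) applies the same broadcast-compatibility rule as the removed hex_can_repeat helper, namely that every dimension of src0 must be an integer multiple of the corresponding dimension of src1. A minimal sketch of that rule follows, with illustrative names only; the real check lives in ggml itself, not in this file.

#include <array>
#include <cstdint>
#include <cstdio>

// Broadcast-compatibility rule: `big` can be produced by repeating `small`
// iff each of the four dims of `big` is a multiple of the matching dim of `small`.
static bool can_repeat(const std::array<int64_t, 4> & small, const std::array<int64_t, 4> & big) {
    for (int i = 0; i < 4; ++i) {
        if (big[i] % small[i] != 0) {
            return false;
        }
    }
    return true;
}

int main() {
    std::printf("%d\n", can_repeat({1, 8, 1, 1}, {4, 8, 2, 1}));  // 1: broadcastable
    std::printf("%d\n", can_repeat({3, 8, 1, 1}, {4, 8, 2, 1}));  // 0: 4 %% 3 != 0 -- not broadcastable
    return 0;
}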
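
More broadly, the session refactor earlier in this series (the allocate()/release() split guarded by valid_* flags) follows a standard partial-initialization pattern: the constructor delegates to allocate(), which throws at the first failure; the validity flags record how far initialization got; and release() tears down only what was actually acquired, both from the constructor's catch block and from the destructor. Below is a minimal, self-contained sketch of that pattern with stand-in calls rather than the real FastRPC/dspqueue API; names and return values are illustrative only.

#include <cstdio>
#include <stdexcept>

struct session {
    bool valid_handle = false;
    bool valid_queue  = false;

    session() {
        try {
            allocate();
        } catch (...) {
            release();  // undo whatever allocate() managed to set up
            throw;      // the destructor never runs for a failed constructor
        }
    }

    ~session() { release(); }

    void allocate() {
        if (!open_handle()) {
            throw std::runtime_error("open_handle failed");
        }
        valid_handle = true;

        if (!create_queue()) {
            throw std::runtime_error("create_queue failed");
        }
        valid_queue = true;
    }

    void release() noexcept {
        // Tear down in reverse order, and only the parts that were acquired.
        if (valid_queue)  { close_queue();  valid_queue  = false; }
        if (valid_handle) { close_handle(); valid_handle = false; }
    }

    // Stand-ins for the real remote-handle / queue calls.
    bool open_handle()  { return true;  }
    bool create_queue() { return false; }  // simulate a mid-initialization failure
    void close_queue()  { std::puts("queue closed");  }
    void close_handle() { std::puts("handle closed"); }
};

int main() {
    try {
        session s;
    } catch (const std::exception & e) {
        std::printf("session setup failed: %s\n", e.what());
    }
    return 0;
}

At the registry level the series applies the same idea non-fatally: ggml_hexagon_registry catches the exception, logs it, leaves devices[i].context as nullptr, and ggml_backend_hexagon_reg_get_device() returns nullptr for such slots, so a single failed NPU session no longer aborts the whole process.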