@@ -1128,6 +1128,26 @@ public static void matrixVectorGenericWithResidual(KernelContext context, FloatA
         }
     }

+    public static void matrixVectorGenericWithResidualQ8_0Byte(KernelContext context, FloatArray x, FloatArray hb, ByteArray w, int n, int d, int localWorkGroupSize) {
+        // One row per workgroup (not per thread)
+        int rowId = context.groupIdx;
+        int localId = context.localIdx;
+        int localSize = localWorkGroupSize;
+
+        // Early exit if this workgroup is beyond the output dimension
+        if (rowId >= d) {
+            return;
+        }
+
+        // All threads in the workgroup cooperatively reduce one row's dot product
+        float sum = matrixVectorRowMajorOptimizedQ8_0Byte(context, localSize, x, w, n);
+
+        // Thread 0 in each workgroup adds the residual and writes the final result
+        if (localId == 0) {
+            float result = hb.get(rowId) + sum;
+            hb.set(rowId, result);
+        }
+    }
+
     public static void fusedFeedForwardWithSiLUAndGLUActivation(KernelContext context, FloatArray x, FloatArray hb, Int8Array w1_quants, HalfFloatArray w1_scales, Int8Array w3_quants,
             HalfFloatArray w3_scales, int n, int d, int localWorkGroupSize) {
         // One row per workgroup (not per thread)
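Note: the cooperative per-row reduction is delegated to matrixVectorRowMajorOptimizedQ8_0Byte, which is defined outside this hunk. For reference, a minimal host-side sketch of the dot product it parallelizes, assuming the standard GGUF Q8_0 layout (per 32-value block: a 2-byte little-endian fp16 scale followed by 32 int8 quants); the helper name and layout here are assumptions, not the committed code:

    // Reference sketch (hypothetical): scalar Q8_0 row dot product.
    // Assumes n is a multiple of the 32-element block size.
    static float dotQ8_0Row(byte[] w, int rowOffsetBytes, float[] x, int n) {
        final int BLOCK = 32;
        final int BLOCK_BYTES = 2 + BLOCK; // fp16 scale + 32 int8 quants
        float sum = 0f;
        for (int block = 0; block < n / BLOCK; block++) {
            int base = rowOffsetBytes + block * BLOCK_BYTES;
            // Decode the per-block fp16 scale (Float.float16ToFloat requires Java 20+)
            short scaleBits = (short) ((w[base] & 0xFF) | (w[base + 1] << 8));
            float scale = Float.float16ToFloat(scaleBits);
            for (int j = 0; j < BLOCK; j++) {
                sum += scale * w[base + 2 + j] * x[block * BLOCK + j];
            }
        }
        return sum;
    }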
@@ -1149,6 +1169,29 @@ public static void fusedFeedForwardWithSiLUAndGLUActivation(KernelContext contex
         }
     }

+    public static void fusedFeedForwardWithSiLUAndGLUActivationQ8_0Byte(KernelContext context, FloatArray x, FloatArray hb,
+            ByteArray w1,
+            ByteArray w3,
+            int n, int d, int localWorkGroupSize) {
+        // One row per workgroup (not per thread)
+        int rowId = context.groupIdx;
+        int localId = context.localIdx;
+
+        if (rowId >= d) {
+            return;
+        }
+
+        // Two cooperative row reductions: the gate projection (w1) and the up projection (w3)
+        float sum1 = matrixVectorRowMajorOptimizedQ8_0Byte(context, localWorkGroupSize, x, w1, n);
+        float sum3 = matrixVectorRowMajorOptimizedQ8_0Byte(context, localWorkGroupSize, x, w3, n);
+
+        // Thread 0 in each workgroup applies the SwiGLU gate and writes the final result
+        if (localId == 0) {
+            float silu = siluActivation(sum1); // silu(x) = x * sigmoid(x)
+            float result = silu * sum3;
+            hb.set(rowId, result);
+        }
+    }
+
     /**
      * Orchestrates parallel multi-head attention computation across all heads. Each head processes attention independently in parallel.
      *
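The fused kernel above computes hb[row] = silu(x · w1[row]) * (x · w3[row]), the SwiGLU gating used in Llama-style feed-forward blocks. The siluActivation helper is defined elsewhere in the file; a minimal sketch of the function it computes, using java.lang.Math.exp as a stand-in for whatever math intrinsic the device kernel actually calls:

    // Sketch (hypothetical stand-in for the siluActivation helper):
    // silu(x) = x * sigmoid(x) = x / (1 + e^(-x))
    static float silu(float x) {
        return x / (1f + (float) Math.exp(-x));
    }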