Add matrixVectorGenericQ8Byte and matrixVectorRowMajorOptimizedQ8_0Byte kernels for Q8_0 matrix-vector computations

orionpapadakis · orionpapadakis · commit 04db93d0bb57 · 2025-12-04T19:46:59.000+02:00
diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/kernels/TransformerComputeKernelsLayered.java b/src/main/java/org/beehive/gpullama3/tornadovm/kernels/TransformerComputeKernelsLayered.java
@@ -3,10 +3,8 @@
 import uk.ac.manchester.tornado.api.KernelContext;
 import uk.ac.manchester.tornado.api.annotations.Parallel;
 import uk.ac.manchester.tornado.api.math.TornadoMath;
-import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
-import uk.ac.manchester.tornado.api.types.arrays.HalfFloatArray;
-import uk.ac.manchester.tornado.api.types.arrays.Int8Array;
-import uk.ac.manchester.tornado.api.types.arrays.IntArray;
+import uk.ac.manchester.tornado.api.types.HalfFloat;
+import uk.ac.manchester.tornado.api.types.arrays.*;
 
 public class TransformerComputeKernelsLayered {
 
@@ -1015,6 +1013,101 @@ public static float matrixVectorRowMajorOptimizedQ8_0(KernelContext context, int
         return localSums[0];
     }
 
+    public static void matrixVectorGenericQ8Byte(KernelContext context, FloatArray x, FloatArray output, ByteArray q, int dim1, int dim0, int localWorkGroupSize) {
+        int rowId = context.groupIdx;
+        int localId = context.localIdx;
+
+        if (rowId >= dim0) {
+            return;
+        }
+
+        float sum = matrixVectorRowMajorOptimizedQ8_0Byte(context, localWorkGroupSize, x, q, dim1);
+
+        // Thread 0 writes the result
+        if (localId == 0) {
+            output.set(rowId, sum);
+        }
+    }
+
+    public static float matrixVectorRowMajorOptimizedQ8_0Byte(KernelContext context, int localSize, FloatArray x, ByteArray q, int n) {
+        int rowId = context.groupIdx;
+        int localId = context.localIdx;
+        int blockSize = 32;
+        final int Q8_0_BLOCK_BYTES = 34; // 2 bytes scale + 32 bytes quants
+
+        // Allocate local memory for reduction
+        float[] localSums = context.allocateFloatLocalArray(localSize);
+
+        int blocksPerRow = (n + blockSize - 1) / blockSize;
+        int rowBlockOffset = rowId * blocksPerRow; // Starting block index for this row
+
+        // 4-way unrolling
+        float partialSum1 = 0.0f;
+        float partialSum2 = 0.0f;
+        float partialSum3 = 0.0f;
+        float partialSum4 = 0.0f;
+
+        // Main loop - process 4 elements at a time
+        for (int j = localId * 4; j < n - 3; j += localSize * 4) {
+            int blockIdx = j / blockSize;
+            int withinBlockIdx = j % blockSize;
+
+            // Calculate byte offset for this Q8_0 block
+            int blockByteOffset = (rowBlockOffset + blockIdx) * Q8_0_BLOCK_BYTES;
+
+            // Load scale (first 2 bytes of block as HalfFloat)
+            HalfFloat scale = q.getHalf(blockByteOffset);
+            float scaleFloat = scale.getFloat32();
+
+            // Load 4 consecutive quantized values
+            int quantsOffset = blockByteOffset + 2 + withinBlockIdx; // Skip 2-byte scale
+            byte quant1 = q.get(quantsOffset);
+            byte quant2 = q.get(quantsOffset + 1);
+            byte quant3 = q.get(quantsOffset + 2);
+            byte quant4 = q.get(quantsOffset + 3);
+
+            // Dequantize and multiply
+            partialSum1 += ((float) quant1 * scaleFloat) * x.get(j);
+            partialSum2 += ((float) quant2 * scaleFloat) * x.get(j + 1);
+            partialSum3 += ((float) quant3 * scaleFloat) * x.get(j + 2);
+            partialSum4 += ((float) quant4 * scaleFloat) * x.get(j + 3);
+        }
+
+        float partialSum = partialSum1 + partialSum2 + partialSum3 + partialSum4;
+
+        // Handle remaining elements
+        for (int j = ((n / 4) * 4) + localId; j < n; j += localSize) {
+            int blockIdx = j / blockSize;
+            int withinBlockIdx = j % blockSize;
+
+            // Calculate byte offset for this Q8_0 block
+            int blockByteOffset = (rowBlockOffset + blockIdx) * Q8_0_BLOCK_BYTES;
+
+            // Load scale
+            HalfFloat scale = q.getHalf(blockByteOffset);
+            float scaleFloat = scale.getFloat32();
+
+            // Load quantized value
+            byte quant = q.get(blockByteOffset + 2 + withinBlockIdx);
+
+            partialSum += ((float) quant * scaleFloat) * x.get(j);
+        }
+
+        localSums[localId] = partialSum;
+        context.localBarrier();
+
+        // Parallel reduction
+        for (int stride = localSize / 2; stride > 0; stride >>= 1) {
+            if (localId < stride) {
+                localSums[localId] += localSums[localId + stride];
+            }
+            context.localBarrier();
+        }
+
+        return localSums[0];
+
+    }
+
     public static void matrixVectorGenericWithResidual(KernelContext context, FloatArray x, FloatArray hb, Int8Array w_quants, HalfFloatArray w_scales, int n, int d, int localWorkGroupSize) {
         // One row per workgroup (not per thread)
         int rowId = context.groupIdx;