feat(ggml-metal): Efficient implementation of cumsum for metal

gabe-l-hart · gabe-l-hart · commit e5587cb156ab · 2025-10-15T08:30:38.000-06:00
Branch: Mamba2SSD

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp
@@ -330,15 +330,16 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_cumsum(ggml_metal_library_
 
     snprintf(name, 256, "%s", base);
 
+    // reuse existing precompiled pipeline, but allow memory size setting
     ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
-    if (res) {
-        return res;
+    if (!res) {
+        res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
     }
 
-    res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
-
-    // shared memory buffer for a single simd group size
-    ggml_metal_pipeline_set_smem(res, 32*sizeof(float));
+    // one shared memory element for each simd group in the threadgroup
+    GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+    const int nsg = (ne00 + 31)/32;
+    ggml_metal_pipeline_set_smem(res, nsg*sizeof(float));
 
     return res;
 }
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
@@ -1801,7 +1801,7 @@ kernel void kernel_cumsum(
         constant ggml_metal_kargs_cumsum & args,
         device const char * src0,
         device const char * dst,
-        threadgroup  float * shmem_f32 [[threadgroup(0)]],
+        threadgroup float * shmem_f32 [[threadgroup(0)]],
         uint3   tgpig[[threadgroup_position_in_grid]],
         ushort3 tpitg[[thread_position_in_threadgroup]],
         ushort  sgitg[[simdgroup_index_in_threadgroup]],
@@ -1822,40 +1822,31 @@ kernel void kernel_cumsum(
     // threadgroup, so this will loop once for each index that this thread is
     // responsible for
     for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
-        //DEBUG -- This is the _very_ neive version
-        dst_row[i0] = src_row[i0];
-        for (int64_t j = 0; j < i0; ++j) {
-            dst_row[i0] = static_cast<T>(static_cast<float>(src_row[j]) + static_cast<float>(dst_row[i0]));
-        }
-    }
-
-    // if (sgitg == 0) {
-    //     shmem_f32[tiisg] = 0.0f;
-    // }
-
-
-    // float sumf = 0;
-
-    // for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
-    //     sumf += src_row[i0];
-    // }
 
-    // sumf = simd_sum(sumf);
+        // Each thread does simd_prefix_inclusive_sum => every element of row
+        // now holds cumsum of the simd group
+        float sumf = static_cast<float>(src_row[i0]);
+        sumf = simd_prefix_inclusive_sum(sumf);
+        dst_row[i0] = static_cast<T>(sumf);
 
-    // threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // if (tiisg == 0) {
-    //     shmem_f32[sgitg] = sumf;
-    // }
-
-    // threadgroup_barrier(mem_flags::mem_threadgroup);
+        // If this is the last element of the simd group, store its value in
+        // shared memory
+        if (tiisg == N_SIMDWIDTH - 1 || i0 == args.ne00 - 1) {
+            const ushort shmem_idx = i0 / N_SIMDWIDTH;
+            shmem_f32[shmem_idx] = sumf;
+        }
+    }
 
-    // sumf = shmem_f32[tiisg];
-    // sumf = simd_sum(sumf);
+    // Ensure all simd groups sync here before proceeding
+    threadgroup_barrier(mem_flags::mem_threadgroup);
 
-    // if (tpitg.x == 0) {
-    //     dst_row[0] = norm ? sumf / args.ne00 : sumf;
-    // }
+    // Each element then adds the final value of all preceding simd groups
+    for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
+        const ushort shmem_idx = i0 / N_SIMDWIDTH;
+        for (ushort j = 0; j < shmem_idx; ++j) {
+            dst_row[i0] += static_cast<T>(shmem_f32[j]);
+        }
+    }
 }
 
 typedef decltype(kernel_cumsum<float>) kernel_cumsum_t;