feat(ggml-metal): Metal impl of tri

gabe-l-hart · gabe-l-hart · commit 5f0d2a1ed6f1 · 2025-10-15T10:17:08.000-06:00
Branch: Mamba2SSD

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -586,6 +586,27 @@ typedef struct {
     uint64_t nb3;
 } ggml_metal_kargs_cumsum;
 
+typedef struct {
+    int64_t  ne00; // src[0] element counts, dims 0..3
+    int64_t  ne01;
+    int64_t  ne02;
+    int64_t  ne03;
+    uint64_t nb00; // src[0] byte strides, dims 0..3
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int64_t  ne0;  // dst element counts, dims 0..3
+    int64_t  ne1;
+    int64_t  ne2;
+    int64_t  ne3;
+    uint64_t nb0;  // dst byte strides, dims 0..3
+    uint64_t nb1;
+    uint64_t nb2;
+    uint64_t nb3;
+    float    c;     // value written to kept elements; NaN means "copy the source element" (see kernel_tri)
+    uint32_t ttype; // ggml_tri_type cast to uint32_t; selects which triangle kernel_tri keeps
+} ggml_metal_kargs_tri;
+
 typedef struct {
     int32_t  ne00;
     int32_t  ne01;
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -1004,8 +1004,57 @@ int ggml_metal_op_cumsum(ggml_metal_op_t ctx, int idx) {
 
 int ggml_metal_op_tri(ggml_metal_op_t ctx, int idx) {
     ggml_tensor * op = ctx->node(idx);
-    //DEBUG
-    GGML_ASSERT(false);
+    // Encode the TRI op for node `idx`: one threadgroup per row of src[0], writing into `op`. Returns 1.
+    ggml_metal_library_t lib = ctx->lib;
+    ggml_metal_encoder_t enc = ctx->enc;
+    // Sizes are narrowed to int32_t (nth is clamped against ne00 below); byte strides stay 64-bit like the kargs fields.
+    GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+    GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+    GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
+    GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb); // was uint32_t: truncated dst strides >= 4 GiB
+    // op_params[0] is the triangle type; op_params[1] stores the float fill value (NaN makes the kernel keep the source).
+    const ggml_tri_type ttype = (ggml_tri_type) op->op_params[0];
+    const float         c     = *((float *) &(op->op_params[1]));
+
+    ggml_metal_kargs_tri args = {
+        /*.ne00  =*/ ne00,
+        /*.ne01  =*/ ne01,
+        /*.ne02  =*/ ne02,
+        /*.ne03  =*/ ne03,
+        /*.nb00  =*/ nb00,
+        /*.nb01  =*/ nb01,
+        /*.nb02  =*/ nb02,
+        /*.nb03  =*/ nb03,
+        /*.ne0   =*/ ne0,
+        /*.ne1   =*/ ne1,
+        /*.ne2   =*/ ne2,
+        /*.ne3   =*/ ne3,
+        /*.nb0   =*/ nb0,
+        /*.nb1   =*/ nb1,
+        /*.nb2   =*/ nb2,
+        /*.nb3   =*/ nb3,
+        /*.c     =*/ c,
+        /*.ttype =*/ static_cast<uint32_t>(ttype)
+    };
+
+    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_tri(lib, op);
+
+    int nth = 32; // SIMD width
+    // Double the thread count until it covers the row or reaches the pipeline limit, then clamp to both.
+    while (nth < ne00 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+        nth *= 2;
+    }
+
+    nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+    nth = std::min(nth, ne00);
+
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);
+    // Grid is (ne01, ne02, ne03) threadgroups of nth threads; the kernel strides each group across one row.
+    ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
+
     return 1;
 }
 
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
@@ -1857,6 +1857,56 @@ template [[host_name("kernel_cumsum_f16")]] kernel kernel_cumsum_t kernel_cumsum
 template [[host_name("kernel_cumsum_bf16")]] kernel kernel_cumsum_t kernel_cumsum<bfloat>;
 #endif
 
+inline static bool _ggml_vec_tri_cmp(const int i, const int r, const uint32_t type) {
+    switch (type) { // true => column i of row r is kept; case values mirror enum ggml_tri_type in ggml.h
+        case /* GGML_TRI_TYPE_LOWER      */ 3: return i <  r;
+        case /* GGML_TRI_TYPE_LOWER_DIAG */ 2: return i <= r;
+        case /* GGML_TRI_TYPE_UPPER      */ 1: return i >  r;
+        case /* GGML_TRI_TYPE_UPPER_DIAG */ 0: return i >= r;
+        default:                               return false; // unknown type keeps nothing (was: fell off the end, UB)
+    }
+}
+
+template<typename T>
+kernel void kernel_tri(
+        constant ggml_metal_kargs_tri & args,
+        device const char * src0, // input tensor, addressed in bytes via args.nb01..nb03
+        device       char * dst,  // output tensor; written below, so it must not be const
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i3 = tgpig.z;
+    const int64_t i2 = tgpig.y;
+    const int64_t i1 = tgpig.x; // row index handled by this threadgroup
+
+    if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) {
+        return;
+    }
+
+    device const T * src_row = (device const T *) (src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03);
+    device       T * dst_row = (device       T *) (dst  + i1*args.nb1  + i2*args.nb2  + i3*args.nb3);
+
+    // Threads of the group stride across the row: thread t handles columns t, t + ntg.x, ...
+    // A kept element (column i0 vs. row i1, per args.ttype) receives args.c, or the source
+    // value when args.c is NaN; every other element is written as zero.
+    const bool keep_org_val = isnan(args.c);
+    for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
+        dst_row[i0] = _ggml_vec_tri_cmp(i0, i1, args.ttype)
+            ? (keep_org_val ? src_row[i0] : static_cast<T>(args.c))
+            : static_cast<T>(0.f);
+    }
+}
+
+typedef decltype(kernel_tri<float>) kernel_tri_t; // function type reused by the explicit instantiations below
+
+template [[host_name("kernel_tri_f32")]] kernel kernel_tri_t kernel_tri<float>;
+template [[host_name("kernel_tri_f16")]] kernel kernel_tri_t kernel_tri<half>;
+#if defined(GGML_METAL_HAS_BF16) // the bfloat variant is only instantiated when this macro is defined
+template [[host_name("kernel_tri_bf16")]] kernel kernel_tri_t kernel_tri<bfloat>;
+#endif
+
 template<typename T>
 kernel void kernel_soft_max(
         constant ggml_metal_kargs_soft_max & args,
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -6951,6 +6951,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval(int verbose
     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER, GGML_TYPE_F32,  {8, 8, 4, 16}, 42.f));
     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER, GGML_TYPE_F16,  {8, 8, 4, 16}, 42.f));
     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER, GGML_TYPE_BF16, {8, 8, 4, 16}, 42.f));
+    test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER, GGML_TYPE_F32,  {2025, 2025, 1, 1}));
 
     for (bool v : {false, true}) {
         test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {512, 512, 1, 1}, 0, 1, 0, 1, 0, 0, 0, 0, v));
@@ -7123,6 +7124,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER, GGML_TYPE_F32,  {8, 8, 4, 16}, 42.f));
     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER, GGML_TYPE_F16,  {8, 8, 4, 16}, 42.f));
     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER, GGML_TYPE_BF16, {8, 8, 4, 16}, 42.f));
+    test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER, GGML_TYPE_F32,  {2025, 2025, 1, 1}));
 
     for (int bs : {1, 2, 3, 4, 5, 8, 512}) {
         for (ggml_type type_a : all_types) {