metal : add top_k support

ggerganov · ggerganov · commit 4dea5dd051b4 · 2025-11-20T10:29:42.000+02:00
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
@@ -7888,7 +7888,7 @@ static void ggml_compute_forward_top_k_f32(
 
     const int64_t nr = ggml_nrows(src0);
 
-    const int k = ggml_get_op_params_i32(dst, 0);
+    const int top_k = ne0;
 
     int32_t * tmp = (int32_t *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
 
@@ -7899,11 +7899,11 @@ static void ggml_compute_forward_top_k_f32(
             tmp[j] = j;
         }
 
-        std::partial_sort(tmp, tmp + k, tmp + ne00, cmp_top_k{src_data});
+        std::partial_sort(tmp, tmp + top_k, tmp + ne00, cmp_top_k{src_data});
 
         int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
 
-        std::copy(tmp, tmp + k, dst_data);
+        std::copy(tmp, tmp + top_k, dst_data);
     }
 }
 
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -905,6 +905,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
         case GGML_OP_LEAKY_RELU:
             return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_ARGSORT:
+        case GGML_OP_TOP_K:
         case GGML_OP_ARANGE:
             return true;
         case GGML_OP_FLASH_ATTN_EXT:
diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -832,14 +832,19 @@ typedef struct {
 } ggml_metal_kargs_leaky_relu;
 
 typedef struct {
-    int64_t  ne00;
-    int64_t  ne01;
-    int64_t  ne02;
-    int64_t  ne03;
+    int32_t  ne00;
+    int32_t  ne01;
+    int32_t  ne02;
+    int32_t  ne03;
     uint64_t nb00;
     uint64_t nb01;
     uint64_t nb02;
     uint64_t nb03;
+    int32_t  ne0;
+    int32_t  ne1;
+    int32_t  ne2;
+    int32_t  ne3;
+    int32_t  top_k;
 } ggml_metal_kargs_argsort;
 
 typedef struct {
@@ -851,6 +856,11 @@ typedef struct {
     uint64_t nb01;
     uint64_t nb02;
     uint64_t nb03;
+    int32_t  ne0;
+    int32_t  ne1;
+    int32_t  ne2;
+    int32_t  ne3;
+    int32_t  top_k;
     int32_t  len;
 } ggml_metal_kargs_argsort_merge;
 
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -405,6 +405,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
             {
                 n_fuse = ggml_metal_op_argsort(ctx, idx);
             } break;
+        case GGML_OP_TOP_K:
+            {
+                n_fuse = ggml_metal_op_top_k(ctx, idx);
+            } break;
         case GGML_OP_LEAKY_RELU:
             {
                 n_fuse = ggml_metal_op_leaky_relu(ctx, idx);
@@ -3677,14 +3681,19 @@ int ggml_metal_op_argsort(ggml_metal_op_t ctx, int idx) {
     }
 
     ggml_metal_kargs_argsort args = {
-        /*.ne00 =*/ ne00,
-        /*.ne01 =*/ ne01,
-        /*.ne02 =*/ ne02,
-        /*.ne03 =*/ ne03,
-        /*.nb00 =*/ nb00,
-        /*.nb01 =*/ nb01,
-        /*.nb02 =*/ nb02,
-        /*.nb03 =*/ nb03,
+        /*.ne00  =*/ ne00,
+        /*.ne01  =*/ ne01,
+        /*.ne02  =*/ ne02,
+        /*.ne03  =*/ ne03,
+        /*.nb00  =*/ nb00,
+        /*.nb01  =*/ nb01,
+        /*.nb02  =*/ nb02,
+        /*.nb03  =*/ nb03,
+        /*.ne0   =*/ ne0,
+        /*.ne1   =*/ ne1,
+        /*.ne2   =*/ ne2,
+        /*.ne3   =*/ ne3,
+        /*.top_k =*/ nth,
     };
 
     ggml_metal_encoder_set_pipeline(enc, pipeline);
@@ -3704,15 +3713,20 @@ int ggml_metal_op_argsort(ggml_metal_op_t ctx, int idx) {
         ggml_metal_op_concurrency_reset(ctx);
 
         ggml_metal_kargs_argsort_merge args_merge = {
-            .ne00 = ne00,
-            .ne01 = ne01,
-            .ne02 = ne02,
-            .ne03 = ne03,
-            .nb00 = nb00,
-            .nb01 = nb01,
-            .nb02 = nb02,
-            .nb03 = nb03,
-            .len  = len,
+            /*.ne00  =*/ ne00,
+            /*.ne01  =*/ ne01,
+            /*.ne02  =*/ ne02,
+            /*.ne03  =*/ ne03,
+            /*.nb00  =*/ nb00,
+            /*.nb01  =*/ nb01,
+            /*.nb02  =*/ nb02,
+            /*.nb03  =*/ nb03,
+            /*.ne0   =*/ ne0,
+            /*.ne1   =*/ ne1,
+            /*.ne2   =*/ ne2,
+            /*.ne3   =*/ ne3,
+            /*.top_k =*/ ne00,
+            /*.len   =*/ len,
         };
 
         // merges per row
@@ -3736,6 +3750,119 @@ int ggml_metal_op_argsort(ggml_metal_op_t ctx, int idx) {
     return 1;
 }
 
+int ggml_metal_op_top_k(ggml_metal_op_t ctx, int idx) {
+    ggml_tensor * op = ctx->node(idx);
+
+    ggml_metal_library_t lib = ctx->lib;
+    ggml_metal_encoder_t enc = ctx->enc;
+
+    GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));
+
+    GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+    GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+    GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
+    GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
+
+    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_argsort(lib, op);
+
+    // bitonic sort requires the number of elements to be power of 2
+    int nth = 1;
+    while (nth < ne00 && 2*nth <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+        nth *= 2;
+    }
+
+    const int npr = (ne00 + nth - 1)/nth;
+
+    // Metal kernels require the buffer size to be multiple of 16 bytes
+    // https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength
+    const size_t smem = GGML_PAD(nth*sizeof(int32_t), 16);
+
+    ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
+    ggml_metal_buffer_id bid_dst  = ggml_metal_get_buffer_id(op);
+
+    ggml_metal_buffer_id bid_tmp = bid_dst;
+    bid_tmp.offs += sizeof(int32_t)*ggml_nelements(op->src[0]);
+
+    if ((int) ceil(std::log(npr) / std::log(2)) % 2 == 1) {
+        std::swap(bid_dst, bid_tmp);
+    }
+
+    const int top_k = ne0;
+
+    ggml_metal_kargs_argsort args = {
+        /*.ne00  =*/ ne00,
+        /*.ne01  =*/ ne01,
+        /*.ne02  =*/ ne02,
+        /*.ne03  =*/ ne03,
+        /*.nb00  =*/ nb00,
+        /*.nb01  =*/ nb01,
+        /*.nb02  =*/ nb02,
+        /*.nb03  =*/ nb03,
+        /*.ne0   =*/ ne0,
+        /*.ne1   =*/ ne1,
+        /*.ne2   =*/ ne2,
+        /*.ne3   =*/ ne3,
+        /*.top_k =*/ std::min(nth, top_k),
+    };
+
+    if (npr > 1) {
+        args.ne0 = (npr - 1)*args.top_k + std::min(ne00 - (npr - 1)*nth, args.top_k);
+    }
+
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+    ggml_metal_encoder_set_buffer  (enc, bid_dst,  2);
+
+    ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, npr*ne01, ne02, ne03, nth, 1, 1);
+
+    ggml_metal_pipeline_t pipeline_merge = ggml_metal_library_get_pipeline_argsort_merge(lib, op);
+
+    int len = args.top_k;
+
+    while (len < args.ne0) {
+        ggml_metal_op_concurrency_reset(ctx);
+
+        ggml_metal_kargs_argsort_merge args_merge = {
+            /*.ne00  =*/ ne00,
+            /*.ne01  =*/ ne01,
+            /*.ne02  =*/ ne02,
+            /*.ne03  =*/ ne03,
+            /*.nb00  =*/ nb00,
+            /*.nb01  =*/ nb01,
+            /*.nb02  =*/ nb02,
+            /*.nb03  =*/ nb03,
+            /*.ne0   =*/ args.ne0,
+            /*.ne1   =*/ ne1,
+            /*.ne2   =*/ ne2,
+            /*.ne3   =*/ ne3,
+            /*.top_k =*/ 2*len >= args.ne0 ? top_k : args.ne0,
+            /*.len   =*/ len,
+        };
+
+        // merges per row
+        const int nm = (args.ne0 + 2*len - 1) / (2*len);
+
+        const int nth = std::min(512, std::min(len, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline_merge)));
+
+        ggml_metal_encoder_set_pipeline(enc, pipeline_merge);
+        ggml_metal_encoder_set_bytes   (enc, &args_merge, sizeof(args_merge), 0);
+        ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
+        ggml_metal_encoder_set_buffer  (enc, bid_dst,  2);
+        ggml_metal_encoder_set_buffer  (enc, bid_tmp,  3);
+
+        ggml_metal_encoder_dispatch_threadgroups(enc, nm*ne01, ne02, ne03, nth, 1, 1);
+
+        std::swap(bid_dst, bid_tmp);
+
+        len <<= 1;
+    }
+
+    return 1;
+}
+
 int ggml_metal_op_leaky_relu(ggml_metal_op_t ctx, int idx) {
     ggml_tensor * op = ctx->node(idx);
 
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.h b/ggml/src/ggml-metal/ggml-metal-ops.h
@@ -81,6 +81,7 @@ int ggml_metal_op_arange            (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_timestep_embedding(ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_argmax            (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_argsort           (ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_top_k             (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_leaky_relu        (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_opt_step_adamw    (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_opt_step_sgd      (ggml_metal_op_t ctx, int idx);
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -202,6 +202,10 @@ static size_t ggml_backend_metal_buffer_type_get_alloc_size(ggml_backend_buffer_
             {
                 res *= 2;
             } break;
+        case GGML_OP_TOP_K:
+            {
+                res = 2*sizeof(int32_t)*ggml_nelements(tensor->src[0]);
+            } break;
         default:
             break;
     }
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4670,11 +4670,12 @@ kernel void kernel_argsort_f32_i32(
         ushort3   ntg[[threads_per_threadgroup]]) {
     // bitonic sort
     const int col = tpitg[0];
+    const int ib  = tgpig[0] / args.ne01;
 
-    const int i00 = (tgpig[0]/args.ne01)*ntg.x;
-    const int i01 =  tgpig[0]%args.ne01;
-    const int i02 =  tgpig[1];
-    const int i03 =  tgpig[2];
+    const int i00 = ib*ntg.x;
+    const int i01 = tgpig[0] % args.ne01;
+    const int i02 = tgpig[1];
+    const int i03 = tgpig[2];
 
     device const float * src0_row = (device const float *) (src0 + args.nb01*i01 + args.nb02*i02 + args.nb03*i03);
 
@@ -4710,9 +4711,11 @@ kernel void kernel_argsort_f32_i32(
         }
     }
 
+    const int64_t i0 = ib*args.top_k;
+
     // copy the result to dst without the padding
-    if (i00 + col < args.ne00) {
-        dst += i00 + args.ne00*i01 + args.ne00*args.ne01*i02 + args.ne00*args.ne01*args.ne02*i03;
+    if (i0 + col < args.ne0 && col < args.top_k) {
+        dst += i0 + args.ne0*i01 + args.ne0*args.ne1*i02 + args.ne0*args.ne1*args.ne2*i03;
 
         dst[col] = shmem_i32[col];
     }
@@ -4747,22 +4750,22 @@ kernel void kernel_argsort_merge_f32_i32(
 
     const int start = im * (2 * args.len);
 
-    const int len0 = MIN(args.len, MAX(0, args.ne00 - (int)(start)));
-    const int len1 = MIN(args.len, MAX(0, args.ne00 - (int)(start + args.len)));
+    const int len0 = MIN(args.len, MAX(0, args.ne0 - (int)(start)));
+    const int len1 = MIN(args.len, MAX(0, args.ne0 - (int)(start + args.len)));
 
     const int total = len0 + len1;
 
     device const int32_t * tmp0 = tmp + start
-        + i01*args.ne00
-        + i02*args.ne00*args.ne01
-        + i03*args.ne00*args.ne01*args.ne02;
+        + i01*args.ne0
+        + i02*args.ne0*args.ne01
+        + i03*args.ne0*args.ne01*args.ne02;
 
     device const int32_t * tmp1 = tmp0 + args.len;
 
     dst += start
-        + i01*args.ne00
-        + i02*args.ne00*args.ne01
-        + i03*args.ne00*args.ne01*args.ne02;
+        + i01*args.top_k
+        + i02*args.top_k*args.ne01
+        + i03*args.top_k*args.ne01*args.ne02;
 
     device const float * src0_row = (device const float *)(src0
         + args.nb01*i01
@@ -4827,16 +4830,16 @@ kernel void kernel_argsort_merge_f32_i32(
         val1 = src0_row[idx1];
     }
 
-    for (int k = k0; k < k1; ++k) {
+    for (int k = k0; k < k1 && k < args.top_k; ++k) {
         int32_t out_idx;
 
         if (i >= len0) {
-            while (k < k1) {
+            while (k < k1 && k < args.top_k) {
                 dst[k++] = tmp1[j++];
             }
             break;
         } else if (j >= len1) {
-            while (k < k1) {
+            while (k < k1 && k < args.top_k) {
                 dst[k++] = tmp0[i++];
             }
             break;
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
@@ -5158,7 +5158,8 @@ struct ggml_tensor * ggml_top_k(
 
     struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_I32, k, a->ne[1], a->ne[2], a->ne[3]);
 
-    ggml_set_op_params_i32(result, 0, (int32_t) k);
+    // TODO: tmp
+    ggml_set_op_params_i32(result, 0, (int32_t) GGML_SORT_ORDER_DESC);
 
     result->op     = GGML_OP_TOP_K;
     result->src[0] = a;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp

Original file line number	Diff line number	Diff line change
`@@ -7888,7 +7888,7 @@ static void ggml_compute_forward_top_k_f32(`
`7888`	`7888`
`7889`	`7889`	`const int64_t nr = ggml_nrows(src0);`
`7890`	`7890`
`7891`		`- const int k = ggml_get_op_params_i32(dst, 0);`
	`7891`	`+ const int top_k = ne0;`
`7892`	`7892`
`7893`	`7893`	`int32_t * tmp = (int32_t ) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) ith;`
`7894`	`7894`
`@@ -7899,11 +7899,11 @@ static void ggml_compute_forward_top_k_f32(`
`7899`	`7899`	`tmp[j] = j;`
`7900`	`7900`	`}`
`7901`	`7901`
`7902`		`- std::partial_sort(tmp, tmp + k, tmp + ne00, cmp_top_k{src_data});`
	`7902`	`+ std::partial_sort(tmp, tmp + top_k, tmp + ne00, cmp_top_k{src_data});`
`7903`	`7903`
`7904`	`7904`	`int32_t * dst_data = (int32_t )((char ) dst->data + i*nb1);`
`7905`	`7905`
`7906`		`- std::copy(tmp, tmp + k, dst_data);`
	`7906`	`+ std::copy(tmp, tmp + top_k, dst_data);`
`7907`	`7907`	`}`
`7908`	`7908`	`}`
`7909`	`7909`
Original file line number	Diff line number	Diff line change
`@@ -202,6 +202,10 @@ static size_t ggml_backend_metal_buffer_type_get_alloc_size(ggml_backend_buffer_`
`202`	`202`	`{`
`203`	`203`	`res *= 2;`
`204`	`204`	`} break;`
	`205`	`+ case GGML_OP_TOP_K:`
	`206`	`+ {`
	`207`	`+ res = 2sizeof(int32_t)ggml_nelements(tensor->src[0]);`
	`208`	`+ } break;`
`205`	`209`	`default:`
`206`	`210`	`break;`
`207`	`211`	`}`