From 1dc94065039c44e82c31da027293cf7e44dc4210 Mon Sep 17 00:00:00 2001 From: Ilia Ilmer Date: Sun, 12 Oct 2025 15:36:52 -0400 Subject: [PATCH 01/11] initial: headers and metal-device.cpp updates --- ggml/src/ggml-metal/ggml-metal-device.cpp | 25 +++++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal-device.h | 1 + ggml/src/ggml-metal/ggml-metal-device.m | 1 + ggml/src/ggml-metal/ggml-metal-ops.h | 1 + 4 files changed, 28 insertions(+) diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index e23abdda97..3634c050fa 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -1387,6 +1387,31 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_1d(ggml_met return res; } +ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_2d(ggml_metal_library_t lib, const ggml_tensor * op) { + assert(op->op == GGML_OP_CONV_TRANSPOSE_2D); + + GGML_ASSERT(ggml_is_contiguous(op->src[0])); + GGML_ASSERT(ggml_is_contiguous(op->src[1])); + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + + char base[256]; + char name[256]; + + snprintf(base, 256, "kernel_conv_transpose_2d_%s_%s", ggml_type_name(op->src[0]->type), ggml_type_name(op->src[1]->type)); + snprintf(name, 256, "%s", base); + + ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name); + if (res) { + return res; + } + + res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + + return res; +} + ggml_metal_pipeline_t ggml_metal_library_get_pipeline_upscale(ggml_metal_library_t lib, const ggml_tensor * op) { assert(op->op == GGML_OP_UPSCALE); diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index 1034e4bbf6..76af8be10b 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ 
b/ggml/src/ggml-metal/ggml-metal-device.h @@ -129,6 +129,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm (ggml_me ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rope (ggml_metal_library_t lib, const struct ggml_tensor * op); ggml_metal_pipeline_t ggml_metal_library_get_pipeline_im2col (ggml_metal_library_t lib, const struct ggml_tensor * op); ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_1d (ggml_metal_library_t lib, const struct ggml_tensor * op); +ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_2d (ggml_metal_library_t lib, const struct ggml_tensor * op); ggml_metal_pipeline_t ggml_metal_library_get_pipeline_upscale (ggml_metal_library_t lib, const struct ggml_tensor * op); ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pad (ggml_metal_library_t lib, const struct ggml_tensor * op); ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pad_reflect_1d (ggml_metal_library_t lib, const struct ggml_tensor * op); diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 9527973015..cacdbebf65 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -647,6 +647,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te case GGML_OP_REPEAT: case GGML_OP_SCALE: case GGML_OP_CONV_TRANSPOSE_1D: + case GGML_OP_CONV_TRANSPOSE_2D: return true; case GGML_OP_CLAMP: return op->src[0]->type == GGML_TYPE_F32; diff --git a/ggml/src/ggml-metal/ggml-metal-ops.h b/ggml/src/ggml-metal/ggml-metal-ops.h index d4cb944621..4ef826b737 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.h +++ b/ggml/src/ggml-metal/ggml-metal-ops.h @@ -70,6 +70,7 @@ int ggml_metal_op_norm (ggml_metal_op_t ctx, int idx); int ggml_metal_op_rope (ggml_metal_op_t ctx, int idx); int ggml_metal_op_im2col (ggml_metal_op_t ctx, int idx); int ggml_metal_op_conv_transpose_1d (ggml_metal_op_t ctx, int idx); +int 
ggml_metal_op_conv_transpose_2d (ggml_metal_op_t ctx, int idx); int ggml_metal_op_upscale (ggml_metal_op_t ctx, int idx); int ggml_metal_op_pad (ggml_metal_op_t ctx, int idx); int ggml_metal_op_pad_reflect_1d (ggml_metal_op_t ctx, int idx); From 09661b92fd2ec2f9b155f16fe8f9373b13002e7b Mon Sep 17 00:00:00 2001 From: Ilia Ilmer Date: Sun, 12 Oct 2025 15:38:08 -0400 Subject: [PATCH 02/11] adding conv_transpose_2d --- ggml/src/ggml-metal/ggml-metal-impl.h | 13 +++++ ggml/src/ggml-metal/ggml-metal-ops.cpp | 56 +++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.metal | 75 ++++++++++++++++++++++++++ 3 files changed, 144 insertions(+) diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index c9dff87305..2f1f5fdbfc 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -513,6 +513,19 @@ typedef struct { uint64_t nb1; } ggml_metal_kargs_conv_transpose_1d; +typedef struct { + int32_t IC; + int32_t IH; + int32_t IW; + int32_t KH; + int32_t KW; + int32_t OC; + int32_t s0; + uint64_t nb0; + uint64_t nb1; + uint64_t nb2; +} ggml_metal_kargs_conv_transpose_2d; + typedef struct { uint64_t ofs0; uint64_t ofs1; diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 5f9370449b..38106e35eb 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -364,6 +364,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) { { n_fuse = ggml_metal_op_conv_transpose_1d(ctx, idx); } break; + case GGML_OP_CONV_TRANSPOSE_2D: + { + n_fuse = ggml_metal_op_conv_transpose_2d(ctx, idx); + } break; case GGML_OP_UPSCALE: { n_fuse = ggml_metal_op_upscale(ctx, idx); @@ -3068,6 +3072,58 @@ int ggml_metal_op_conv_transpose_1d(ggml_metal_op_t ctx, int idx) { return 1; } +int ggml_metal_op_conv_transpose_2d(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + 
ggml_metal_encoder_t enc = ctx->enc; + + GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne); + GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); + GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne); + GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb); + GGML_TENSOR_LOCALS( int32_t, ne, op, ne); + GGML_TENSOR_LOCALS(uint32_t, nb, op, nb); + + const int32_t s0 = ((const int32_t *)(op->op_params))[0]; + + const int32_t IC = op->src[1]->ne[2]; + const int32_t IH = op->src[1]->ne[1]; + const int32_t IW = op->src[1]->ne[0]; + + const int32_t KH = op->src[0]->ne[1]; + const int32_t KW = op->src[0]->ne[0]; + + const int32_t OW = op->ne[0]; + const int32_t OH = op->ne[1]; + const int32_t OC = op->ne[2]; + + ggml_metal_kargs_conv_transpose_2d args = { + /*.IC =*/ IC, + /*.IH =*/ IH, + /*.IW =*/ IW, + /*.KH =*/ KH, + /*.KW =*/ KW, + /*.OC =*/ OC, + /*.s0 =*/ s0, + /*.nb0 =*/ nb0, + /*.nb1 =*/ nb1, + /*.nb2 =*/ nb2, + }; + + ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_transpose_2d(lib, op); + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3); + + ggml_metal_encoder_dispatch_threadgroups(enc, OW, OH, OC, 1, 1, 1); + + return 1; +} + int ggml_metal_op_upscale(ggml_metal_op_t ctx, int idx) { ggml_tensor * op = ctx->node(idx); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index ddc285042d..6e6566f961 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4131,6 +4131,81 @@ kernel void kernel_conv_transpose_1d( uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpg[[threadgroups_per_grid]]); + +typedef void (conv_transpose_2d_t)( + constant ggml_metal_kargs_conv_transpose_2d & args, + 
typedef void (conv_transpose_2d_t)(
        constant ggml_metal_kargs_conv_transpose_2d & args,
        device const float * src0,
        device const float * src1,
        device       char  * dst,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tgpg[[threadgroups_per_grid]]);

// Transposed 2D convolution: one thread computes one output element
// (out_x, out_y, out_c) by summing over input channels and kernel taps.
// NOTE(review): the pasted source lost the text between '<' and '>' — the loop
// bounds and template header below are reconstructed to match the 1D variant.
template <typename T>
kernel void kernel_conv_transpose_2d(
        constant ggml_metal_kargs_conv_transpose_2d & args,
        device const T     * src0,
        device const float * src1,
        device       char  * dst,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tgpg[[threadgroups_per_grid]]) {

    // 64-bit indices so the flattened input/kernel offsets cannot overflow
    const int64_t out_x = tgpig[0];
    const int64_t out_y = tgpig[1];
    const int64_t out_c = tgpig[2];

    float v = 0.0f;

    for (int64_t in_c = 0; in_c < args.IC; in_c++) {
        for (int64_t kh = 0; kh < args.KH; kh++) {
            int64_t in_y = out_y - kh;

            // output position must map back onto the stride-s0 input grid
            if (in_y < 0 || in_y % args.s0) continue;

            in_y /= args.s0;

            if (in_y >= args.IH) continue;

            for (int64_t kw = 0; kw < args.KW; kw++) {
                int64_t in_x = out_x - kw;

                if (in_x < 0 || in_x % args.s0) continue;

                in_x /= args.s0;

                if (in_x >= args.IW) continue;

                // src1 layout: [IW, IH, IC]; src0 layout: [KW, KH, OC, IC]
                const int64_t input_idx  = (args.IW * args.IH) * in_c + (args.IW) * in_y + in_x;
                const int64_t kernel_idx = (args.KH * args.KW * args.OC) * in_c + (args.KH * args.KW) * out_c + (args.KW) * kh + kw;

                v += (float)src0[kernel_idx] * src1[input_idx];
            }
        }
    }

    device float * dst_ptr = (device float *) (dst + out_x*args.nb0 + out_y*args.nb1 + out_c*args.nb2);

    dst_ptr[0] = v;
}

template [[host_name("kernel_conv_transpose_2d_f32_f32")]]
kernel void kernel_conv_transpose_2d<float>(
        constant ggml_metal_kargs_conv_transpose_2d & args,
        device const float * src0,
        device const float * src1,
        device       char  * dst,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tgpg[[threadgroups_per_grid]]);

template [[host_name("kernel_conv_transpose_2d_f16_f32")]]
kernel void kernel_conv_transpose_2d<half>(
        constant ggml_metal_kargs_conv_transpose_2d & args,
        device const half  * src0,
        device const float * src1,
        device       char  * dst,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tgpg[[threadgroups_per_grid]]);
+++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 6e6566f961..dacb32ef5b 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4155,10 +4155,10 @@ kernel void kernel_conv_transpose_2d( float v = 0.0f; - for (int32_t in_c = 0; in_c= args.IH) continue; - for (int32_t kw = 0; kw= args.IW) continue; - const int32_t input_idx = (args.IW * args.IH) * in_c + (args.IW) * in_y + in_x; - const int32_t kernel_idx = (args.KH * args.KW * args.OC) * in_c + (args.KH * args.KW) * out_c + (args.KW) * kh + kw; + const int64_t input_idx = (args.IW * args.IH) * in_c + (args.IW) * in_y + in_x; + const int64_t kernel_idx = (args.KH * args.KW * args.OC) * in_c + (args.KH * args.KW) * out_c + (args.KW) * kh + kw; v += (float)src0[kernel_idx] * src1[input_idx]; From a190a9dd468014babe836768ad1b4b7bd3c1967d Mon Sep 17 00:00:00 2001 From: Ilia Ilmer Date: Sun, 12 Oct 2025 16:22:04 -0400 Subject: [PATCH 04/11] fix type: int32->int64 --- ggml/src/ggml-metal/ggml-metal.metal | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index dacb32ef5b..9b9321abcf 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4149,9 +4149,9 @@ kernel void kernel_conv_transpose_2d( uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpg[[threadgroups_per_grid]]) { - const int32_t out_x = tgpig[0]; - const int32_t out_y = tgpig[1]; - const int32_t out_c = tgpig[2]; + const int64_t out_x = tgpig[0]; + const int64_t out_y = tgpig[1]; + const int64_t out_c = tgpig[2]; float v = 0.0f; From aa4b222e599a7e1451f6ee5d519aeeea67cd36ff Mon Sep 17 00:00:00 2001 From: Ilia Ilmer Date: Mon, 13 Oct 2025 19:57:48 -0400 Subject: [PATCH 05/11] Update ggml/src/ggml-metal/ggml-metal.metal Co-authored-by: Georgi Gerganov --- 
ggml/src/ggml-metal/ggml-metal.metal | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 9b9321abcf..8c4393c5cd 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4155,8 +4155,8 @@ kernel void kernel_conv_transpose_2d( float v = 0.0f; - for (int64_t in_c = 0; in_c Date: Mon, 13 Oct 2025 19:57:57 -0400 Subject: [PATCH 06/11] Update ggml/src/ggml-metal/ggml-metal.metal Co-authored-by: Georgi Gerganov --- ggml/src/ggml-metal/ggml-metal.metal | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 8c4393c5cd..65d842d7ee 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4169,7 +4169,7 @@ kernel void kernel_conv_transpose_2d( for (int64_t kw = 0; kw Date: Mon, 13 Oct 2025 19:58:06 -0400 Subject: [PATCH 07/11] Update ggml/src/ggml-metal/ggml-metal.metal Co-authored-by: Georgi Gerganov --- ggml/src/ggml-metal/ggml-metal.metal | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 65d842d7ee..14b3cdb0f9 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4166,7 +4166,7 @@ kernel void kernel_conv_transpose_2d( if (in_y >= args.IH) continue; - for (int64_t kw = 0; kw Date: Mon, 13 Oct 2025 20:18:37 -0400 Subject: [PATCH 08/11] add checks for src[0] and src[1]; add type checks --- ggml/src/ggml-metal/ggml-metal-device.m | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index cacdbebf65..9b9fdcfa29 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -647,8 +647,12 @@ bool 
ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te case GGML_OP_REPEAT: case GGML_OP_SCALE: case GGML_OP_CONV_TRANSPOSE_1D: - case GGML_OP_CONV_TRANSPOSE_2D: return true; + case GGML_OP_CONV_TRANSPOSE_2D: + return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) && + (op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32) && + op->src[1]->type == GGML_TYPE_F32 && + op->type == GGML_TYPE_F32; case GGML_OP_CLAMP: return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_SQR: From 2f1ed3ccc41e73c3a026aa190f425bdd468a641c Mon Sep 17 00:00:00 2001 From: Ilia Ilmer Date: Tue, 14 Oct 2025 06:35:54 -0400 Subject: [PATCH 09/11] Update ggml-metal.metal Co-authored-by: Georgi Gerganov --- ggml/src/ggml-metal/ggml-metal.metal | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 14b3cdb0f9..101ae4e4d3 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4166,7 +4166,7 @@ kernel void kernel_conv_transpose_2d( if (in_y >= args.IH) continue; - for (int64_t kw = 0; kw Date: Wed, 15 Oct 2025 22:00:06 -0400 Subject: [PATCH 10/11] add more tests, add optimization to threading --- ggml/src/ggml-metal/ggml-metal-ops.cpp | 4 +- ggml/src/ggml-metal/ggml-metal.metal | 54 ++++++++++++++++---------- tests/test-backend-ops.cpp | 2 + 3 files changed, 39 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 38106e35eb..b85da862a7 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -3094,6 +3094,8 @@ int ggml_metal_op_conv_transpose_2d(ggml_metal_op_t ctx, int idx) { const int32_t KH = op->src[0]->ne[1]; const int32_t KW = op->src[0]->ne[0]; + GGML_ASSERT(KW * KH <= 64 && "conv_transpose_2d kernel size exceeds threadgroup memory limit"); + const int32_t OW = op->ne[0]; const int32_t 
// Parallel variant: one threadgroup per output element (out_x, out_y, out_c);
// each thread owns a single kernel tap (kw, kh) and the partial sums are
// reduced through threadgroup memory. The reduction buffer is dynamically
// sized by the host (KW*KH floats, padded to 16 bytes) instead of a
// hard-coded shared_sum[64], so kernels larger than 8x8 are supported.
template <typename T>
kernel void kernel_conv_transpose_2d(
        constant ggml_metal_kargs_conv_transpose_2d & args,
        device const T     * src0,
        device const float * src1,
        device       char  * dst,
        threadgroup  float * shared_sum [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3 ntg[[threads_per_threadgroup]]) {

    const int64_t out_x = tgpig[0];
    const int64_t out_y = tgpig[1];
    const int64_t out_c = tgpig[2];

    const int64_t kw = tpitg[0];
    const int64_t kh = tpitg[1];

    float v = 0.0f;

    // the (out, tap) -> input mapping does not depend on the channel, so it is
    // hoisted out of the in_c loop; threads with an invalid tap contribute 0
    // but still reach the barrier below
    int64_t in_y = out_y - kh;
    int64_t in_x = out_x - kw;

    const bool valid =
        in_y >= 0 && in_y % args.s0 == 0 &&
        in_x >= 0 && in_x % args.s0 == 0;

    if (valid) {
        in_y /= args.s0;
        in_x /= args.s0;

        if (in_y < args.IH && in_x < args.IW) {
            for (int64_t in_c = 0; in_c < args.IC; in_c++) {
                // src1 layout: [IW, IH, IC]; src0 layout: [KW, KH, OC, IC]
                const int64_t input_idx  = (args.IW * args.IH) * in_c + (args.IW) * in_y + in_x;
                const int64_t kernel_idx = (args.KH * args.KW * args.OC) * in_c + (args.KH * args.KW) * out_c + (args.KW) * kh + kw;

                v += (float)src0[kernel_idx] * src1[input_idx];
            }
        }
    }

    const uint tid = tpitg.y * ntg.x + tpitg.x;
    shared_sum[tid] = v;

    threadgroup_barrier(mem_flags::mem_threadgroup);

    // thread 0 folds all partial sums and writes the output element
    if (tid == 0) {
        float total = 0.0f;
        const uint num_threads = ntg.x * ntg.y;
        for (uint i = 0; i < num_threads; i++) {
            total += shared_sum[i];
        }

        device float * dst_ptr = (device float *) (dst + out_x*args.nb0 + out_y*args.nb1 + out_c*args.nb2);
        dst_ptr[0] = total;
    }
}

template [[host_name("kernel_conv_transpose_2d_f32_f32")]]
kernel void kernel_conv_transpose_2d<float>(
        constant ggml_metal_kargs_conv_transpose_2d & args,
        device const float * src0,
        device const float * src1,
        device       char  * dst,
        threadgroup  float * shared_sum [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3 ntg[[threads_per_threadgroup]]);

template [[host_name("kernel_conv_transpose_2d_f16_f32")]]
kernel void kernel_conv_transpose_2d<half>(
        constant ggml_metal_kargs_conv_transpose_2d & args,
        device const half  * src0,
        device const float * src1,
        device       char  * dst,
        threadgroup  float * shared_sum [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3 ntg[[threads_per_threadgroup]]);
test_conv_transpose_2d({256, 256, 256, 1}, {3, 3, 16, 256}, 1)); + test_cases.emplace_back(new test_conv_transpose_2d({16, 16, 16, 1}, {3, 3, 8, 16}, 1)); + test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2)); test_cases.emplace_back(new test_mean(GGML_TYPE_F32, {256, 256, 3, 1})); From 9f3e11c7981f57383d08fa0572c1ac88b30c85e7 Mon Sep 17 00:00:00 2001 From: Ilia Ilmer Date: Thu, 16 Oct 2025 08:59:43 -0400 Subject: [PATCH 11/11] add dynamic memory allocation in metal --- ggml/src/ggml-metal/ggml-metal-ops.cpp | 6 ++++-- ggml/src/ggml-metal/ggml-metal.metal | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index b85da862a7..427b77ae5b 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -3094,8 +3094,6 @@ int ggml_metal_op_conv_transpose_2d(ggml_metal_op_t ctx, int idx) { const int32_t KH = op->src[0]->ne[1]; const int32_t KW = op->src[0]->ne[0]; - GGML_ASSERT(KW * KH <= 64 && "conv_transpose_2d kernel size exceeds threadgroup memory limit"); - const int32_t OW = op->ne[0]; const int32_t OH = op->ne[1]; const int32_t OC = op->ne[2]; @@ -3121,6 +3119,10 @@ int ggml_metal_op_conv_transpose_2d(ggml_metal_op_t ctx, int idx) { ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3); + // Metal requires buffer size to be multiple of 16 bytes + const size_t smem = GGML_PAD(KW * KH * sizeof(float), 16); + ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0); + ggml_metal_encoder_dispatch_threadgroups(enc, OW, OH, OC, KW, KH, 1); return 1; diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index fcc30c8213..994f9a0e49 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4146,6 +4146,7 @@ kernel void 
kernel_conv_transpose_2d( device const T * src0, device const float * src1, device char * dst, + threadgroup float * shared_sum [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]) { @@ -4182,7 +4183,6 @@ kernel void kernel_conv_transpose_2d( v += (float)src0[kernel_idx] * src1[input_idx]; } - threadgroup float shared_sum[64]; const uint tid = tpitg.y * ntg.x + tpitg.x; shared_sum[tid] = v; @@ -4206,6 +4206,7 @@ kernel void kernel_conv_transpose_2d( device const float * src0, device const float * src1, device char * dst, + threadgroup float * shared_sum [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]); @@ -4216,6 +4217,7 @@ kernel void kernel_conv_transpose_2d( device const half * src0, device const float * src1, device char * dst, + threadgroup float * shared_sum [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]);