From 1dc94065039c44e82c31da027293cf7e44dc4210 Mon Sep 17 00:00:00 2001 From: Ilia Ilmer Date: Sun, 12 Oct 2025 15:36:52 -0400 Subject: [PATCH 01/11] initial: headers and metal-device.cpp updates --- ggml/src/ggml-metal/ggml-metal-device.cpp | 25 +++++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal-device.h | 1 + ggml/src/ggml-metal/ggml-metal-device.m | 1 + ggml/src/ggml-metal/ggml-metal-ops.h | 1 + 4 files changed, 28 insertions(+) diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index e23abdda97..3634c050fa 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -1387,6 +1387,31 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_1d(ggml_met return res; } +ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_2d(ggml_metal_library_t lib, const ggml_tensor * op) { + assert(op->op == GGML_OP_CONV_TRANSPOSE_2D); + + GGML_ASSERT(ggml_is_contiguous(op->src[0])); + GGML_ASSERT(ggml_is_contiguous(op->src[1])); + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + + char base[256]; + char name[256]; + + snprintf(base, 256, "kernel_conv_transpose_2d_%s_%s", ggml_type_name(op->src[0]->type), ggml_type_name(op->src[1]->type)); + snprintf(name, 256, "%s", base); + + ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name); + if (res) { + return res; + } + + res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + + return res; +} + ggml_metal_pipeline_t ggml_metal_library_get_pipeline_upscale(ggml_metal_library_t lib, const ggml_tensor * op) { assert(op->op == GGML_OP_UPSCALE); diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index 1034e4bbf6..76af8be10b 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ 
b/ggml/src/ggml-metal/ggml-metal-device.h @@ -129,6 +129,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm (ggml_me ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rope (ggml_metal_library_t lib, const struct ggml_tensor * op); ggml_metal_pipeline_t ggml_metal_library_get_pipeline_im2col (ggml_metal_library_t lib, const struct ggml_tensor * op); ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_1d (ggml_metal_library_t lib, const struct ggml_tensor * op); +ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_2d (ggml_metal_library_t lib, const struct ggml_tensor * op); ggml_metal_pipeline_t ggml_metal_library_get_pipeline_upscale (ggml_metal_library_t lib, const struct ggml_tensor * op); ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pad (ggml_metal_library_t lib, const struct ggml_tensor * op); ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pad_reflect_1d (ggml_metal_library_t lib, const struct ggml_tensor * op); diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 9527973015..cacdbebf65 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -647,6 +647,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te case GGML_OP_REPEAT: case GGML_OP_SCALE: case GGML_OP_CONV_TRANSPOSE_1D: + case GGML_OP_CONV_TRANSPOSE_2D: return true; case GGML_OP_CLAMP: return op->src[0]->type == GGML_TYPE_F32; diff --git a/ggml/src/ggml-metal/ggml-metal-ops.h b/ggml/src/ggml-metal/ggml-metal-ops.h index d4cb944621..4ef826b737 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.h +++ b/ggml/src/ggml-metal/ggml-metal-ops.h @@ -70,6 +70,7 @@ int ggml_metal_op_norm (ggml_metal_op_t ctx, int idx); int ggml_metal_op_rope (ggml_metal_op_t ctx, int idx); int ggml_metal_op_im2col (ggml_metal_op_t ctx, int idx); int ggml_metal_op_conv_transpose_1d (ggml_metal_op_t ctx, int idx); +int 
ggml_metal_op_conv_transpose_2d (ggml_metal_op_t ctx, int idx); int ggml_metal_op_upscale (ggml_metal_op_t ctx, int idx); int ggml_metal_op_pad (ggml_metal_op_t ctx, int idx); int ggml_metal_op_pad_reflect_1d (ggml_metal_op_t ctx, int idx); From 09661b92fd2ec2f9b155f16fe8f9373b13002e7b Mon Sep 17 00:00:00 2001 From: Ilia Ilmer Date: Sun, 12 Oct 2025 15:38:08 -0400 Subject: [PATCH 02/11] adding conv_transpose_2d --- ggml/src/ggml-metal/ggml-metal-impl.h | 13 +++++ ggml/src/ggml-metal/ggml-metal-ops.cpp | 56 +++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.metal | 75 ++++++++++++++++++++++++++ 3 files changed, 144 insertions(+) diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index c9dff87305..2f1f5fdbfc 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -513,6 +513,19 @@ typedef struct { uint64_t nb1; } ggml_metal_kargs_conv_transpose_1d; +typedef struct { + int32_t IC; + int32_t IH; + int32_t IW; + int32_t KH; + int32_t KW; + int32_t OC; + int32_t s0; + uint64_t nb0; + uint64_t nb1; + uint64_t nb2; +} ggml_metal_kargs_conv_transpose_2d; + typedef struct { uint64_t ofs0; uint64_t ofs1; diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 5f9370449b..38106e35eb 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -364,6 +364,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) { { n_fuse = ggml_metal_op_conv_transpose_1d(ctx, idx); } break; + case GGML_OP_CONV_TRANSPOSE_2D: + { + n_fuse = ggml_metal_op_conv_transpose_2d(ctx, idx); + } break; case GGML_OP_UPSCALE: { n_fuse = ggml_metal_op_upscale(ctx, idx); @@ -3068,6 +3072,58 @@ int ggml_metal_op_conv_transpose_1d(ggml_metal_op_t ctx, int idx) { return 1; } +int ggml_metal_op_conv_transpose_2d(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + 
ggml_metal_encoder_t enc = ctx->enc; + + GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne); + GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); + GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne); + GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb); + GGML_TENSOR_LOCALS( int32_t, ne, op, ne); + GGML_TENSOR_LOCALS(uint32_t, nb, op, nb); + + const int32_t s0 = ((const int32_t *)(op->op_params))[0]; + + const int32_t IC = op->src[1]->ne[2]; + const int32_t IH = op->src[1]->ne[1]; + const int32_t IW = op->src[1]->ne[0]; + + const int32_t KH = op->src[0]->ne[1]; + const int32_t KW = op->src[0]->ne[0]; + + const int32_t OW = op->ne[0]; + const int32_t OH = op->ne[1]; + const int32_t OC = op->ne[2]; + + ggml_metal_kargs_conv_transpose_2d args = { + /*.IC =*/ IC, + /*.IH =*/ IH, + /*.IW =*/ IW, + /*.KH =*/ KH, + /*.KW =*/ KW, + /*.OC =*/ OC, + /*.s0 =*/ s0, + /*.nb0 =*/ nb0, + /*.nb1 =*/ nb1, + /*.nb2 =*/ nb2, + }; + + ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_transpose_2d(lib, op); + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3); + + ggml_metal_encoder_dispatch_threadgroups(enc, OW, OH, OC, 1, 1, 1); + + return 1; +} + int ggml_metal_op_upscale(ggml_metal_op_t ctx, int idx) { ggml_tensor * op = ctx->node(idx); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index ddc285042d..6e6566f961 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4131,6 +4131,81 @@ kernel void kernel_conv_transpose_1d( uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpg[[threadgroups_per_grid]]); + +typedef void (conv_transpose_2d_t)( + constant ggml_metal_kargs_conv_transpose_2d & args, + 
typedef void (conv_transpose_2d_t)(
        constant ggml_metal_kargs_conv_transpose_2d & args,
        device const float * src0,
        device const float * src1,
        device       char  * dst,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tgpg[[threadgroups_per_grid]]);

// Transposed 2D convolution: one thread computes one output element
// (out_x, out_y, out_c) by summing over input channels and kernel taps.
// NOTE(review): the pasted source lost the text between '<' and '>' — the loop
// bounds and template header below are reconstructed to match the 1D variant.
template <typename T>
kernel void kernel_conv_transpose_2d(
        constant ggml_metal_kargs_conv_transpose_2d & args,
        device const T     * src0,
        device const float * src1,
        device       char  * dst,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tgpg[[threadgroups_per_grid]]) {

    // 64-bit indices so the flattened input/kernel offsets cannot overflow
    const int64_t out_x = tgpig[0];
    const int64_t out_y = tgpig[1];
    const int64_t out_c = tgpig[2];

    float v = 0.0f;

    for (int64_t in_c = 0; in_c < args.IC; in_c++) {
        for (int64_t kh = 0; kh < args.KH; kh++) {
            int64_t in_y = out_y - kh;

            // output position must map back onto the stride-s0 input grid
            if (in_y < 0 || in_y % args.s0) continue;

            in_y /= args.s0;

            if (in_y >= args.IH) continue;

            for (int64_t kw = 0; kw < args.KW; kw++) {
                int64_t in_x = out_x - kw;

                if (in_x < 0 || in_x % args.s0) continue;

                in_x /= args.s0;

                if (in_x >= args.IW) continue;

                // src1 layout: [IW, IH, IC]; src0 layout: [KW, KH, OC, IC]
                const int64_t input_idx  = (args.IW * args.IH) * in_c + (args.IW) * in_y + in_x;
                const int64_t kernel_idx = (args.KH * args.KW * args.OC) * in_c + (args.KH * args.KW) * out_c + (args.KW) * kh + kw;

                v += (float)src0[kernel_idx] * src1[input_idx];
            }
        }
    }

    device float * dst_ptr = (device float *) (dst + out_x*args.nb0 + out_y*args.nb1 + out_c*args.nb2);

    dst_ptr[0] = v;
}

template [[host_name("kernel_conv_transpose_2d_f32_f32")]]
kernel void kernel_conv_transpose_2d<float>(
        constant ggml_metal_kargs_conv_transpose_2d & args,
        device const float * src0,
        device const float * src1,
        device       char  * dst,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tgpg[[threadgroups_per_grid]]);

template [[host_name("kernel_conv_transpose_2d_f16_f32")]]
kernel void kernel_conv_transpose_2d<half>(
        constant ggml_metal_kargs_conv_transpose_2d & args,
        device const half  * src0,
        device const float * src1,
        device       char  * dst,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tgpg[[threadgroups_per_grid]]);
+++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 6e6566f961..dacb32ef5b 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4155,10 +4155,10 @@ kernel void kernel_conv_transpose_2d( float v = 0.0f; - for (int32_t in_c = 0; in_c= args.IH) continue; - for (int32_t kw = 0; kw= args.IW) continue; - const int32_t input_idx = (args.IW * args.IH) * in_c + (args.IW) * in_y + in_x; - const int32_t kernel_idx = (args.KH * args.KW * args.OC) * in_c + (args.KH * args.KW) * out_c + (args.KW) * kh + kw; + const int64_t input_idx = (args.IW * args.IH) * in_c + (args.IW) * in_y + in_x; + const int64_t kernel_idx = (args.KH * args.KW * args.OC) * in_c + (args.KH * args.KW) * out_c + (args.KW) * kh + kw; v += (float)src0[kernel_idx] * src1[input_idx]; From a190a9dd468014babe836768ad1b4b7bd3c1967d Mon Sep 17 00:00:00 2001 From: Ilia Ilmer Date: Sun, 12 Oct 2025 16:22:04 -0400 Subject: [PATCH 04/11] fix type: int32->int64 --- ggml/src/ggml-metal/ggml-metal.metal | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index dacb32ef5b..9b9321abcf 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4149,9 +4149,9 @@ kernel void kernel_conv_transpose_2d( uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpg[[threadgroups_per_grid]]) { - const int32_t out_x = tgpig[0]; - const int32_t out_y = tgpig[1]; - const int32_t out_c = tgpig[2]; + const int64_t out_x = tgpig[0]; + const int64_t out_y = tgpig[1]; + const int64_t out_c = tgpig[2]; float v = 0.0f; From aa4b222e599a7e1451f6ee5d519aeeea67cd36ff Mon Sep 17 00:00:00 2001 From: Ilia Ilmer Date: Mon, 13 Oct 2025 19:57:48 -0400 Subject: [PATCH 05/11] Update ggml/src/ggml-metal/ggml-metal.metal Co-authored-by: Georgi Gerganov --- 
ggml/src/ggml-metal/ggml-metal.metal | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 9b9321abcf..8c4393c5cd 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4155,8 +4155,8 @@ kernel void kernel_conv_transpose_2d( float v = 0.0f; - for (int64_t in_c = 0; in_c Date: Mon, 13 Oct 2025 19:57:57 -0400 Subject: [PATCH 06/11] Update ggml/src/ggml-metal/ggml-metal.metal Co-authored-by: Georgi Gerganov --- ggml/src/ggml-metal/ggml-metal.metal | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 8c4393c5cd..65d842d7ee 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4169,7 +4169,7 @@ kernel void kernel_conv_transpose_2d( for (int64_t kw = 0; kw Date: Mon, 13 Oct 2025 19:58:06 -0400 Subject: [PATCH 07/11] Update ggml/src/ggml-metal/ggml-metal.metal Co-authored-by: Georgi Gerganov --- ggml/src/ggml-metal/ggml-metal.metal | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 65d842d7ee..14b3cdb0f9 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4166,7 +4166,7 @@ kernel void kernel_conv_transpose_2d( if (in_y >= args.IH) continue; - for (int64_t kw = 0; kw Date: Mon, 13 Oct 2025 20:18:37 -0400 Subject: [PATCH 08/11] add checks for src[0] and src[1]; add type checks --- ggml/src/ggml-metal/ggml-metal-device.m | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index cacdbebf65..9b9fdcfa29 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -647,8 +647,12 @@ bool 
ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te case GGML_OP_REPEAT: case GGML_OP_SCALE: case GGML_OP_CONV_TRANSPOSE_1D: - case GGML_OP_CONV_TRANSPOSE_2D: return true; + case GGML_OP_CONV_TRANSPOSE_2D: + return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) && + (op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32) && + op->src[1]->type == GGML_TYPE_F32 && + op->type == GGML_TYPE_F32; case GGML_OP_CLAMP: return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_SQR: From 2f1ed3ccc41e73c3a026aa190f425bdd468a641c Mon Sep 17 00:00:00 2001 From: Ilia Ilmer Date: Tue, 14 Oct 2025 06:35:54 -0400 Subject: [PATCH 09/11] Update ggml-metal.metal Co-authored-by: Georgi Gerganov --- ggml/src/ggml-metal/ggml-metal.metal | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 14b3cdb0f9..101ae4e4d3 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4166,7 +4166,7 @@ kernel void kernel_conv_transpose_2d( if (in_y >= args.IH) continue; - for (int64_t kw = 0; kw Date: Wed, 15 Oct 2025 22:00:06 -0400 Subject: [PATCH 10/11] add more tests, add optimization to threading --- ggml/src/ggml-metal/ggml-metal-ops.cpp | 4 +- ggml/src/ggml-metal/ggml-metal.metal | 54 ++++++++++++++++---------- tests/test-backend-ops.cpp | 2 + 3 files changed, 39 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 38106e35eb..b85da862a7 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -3094,6 +3094,8 @@ int ggml_metal_op_conv_transpose_2d(ggml_metal_op_t ctx, int idx) { const int32_t KH = op->src[0]->ne[1]; const int32_t KW = op->src[0]->ne[0]; + GGML_ASSERT(KW * KH <= 64 && "conv_transpose_2d kernel size exceeds threadgroup memory limit"); + const int32_t OW = op->ne[0]; const int32_t 
// Parallel variant: one threadgroup per output element (out_x, out_y, out_c);
// each thread owns a single kernel tap (kw, kh) and the partial sums are
// reduced through threadgroup memory. The reduction buffer is dynamically
// sized by the host (KW*KH floats, padded to 16 bytes) instead of a
// hard-coded shared_sum[64], so kernels larger than 8x8 are supported.
template <typename T>
kernel void kernel_conv_transpose_2d(
        constant ggml_metal_kargs_conv_transpose_2d & args,
        device const T     * src0,
        device const float * src1,
        device       char  * dst,
        threadgroup  float * shared_sum [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3 ntg[[threads_per_threadgroup]]) {

    const int64_t out_x = tgpig[0];
    const int64_t out_y = tgpig[1];
    const int64_t out_c = tgpig[2];

    const int64_t kw = tpitg[0];
    const int64_t kh = tpitg[1];

    float v = 0.0f;

    // the (out, tap) -> input mapping does not depend on the channel, so it is
    // hoisted out of the in_c loop; threads with an invalid tap contribute 0
    // but still reach the barrier below
    int64_t in_y = out_y - kh;
    int64_t in_x = out_x - kw;

    const bool valid =
        in_y >= 0 && in_y % args.s0 == 0 &&
        in_x >= 0 && in_x % args.s0 == 0;

    if (valid) {
        in_y /= args.s0;
        in_x /= args.s0;

        if (in_y < args.IH && in_x < args.IW) {
            for (int64_t in_c = 0; in_c < args.IC; in_c++) {
                // src1 layout: [IW, IH, IC]; src0 layout: [KW, KH, OC, IC]
                const int64_t input_idx  = (args.IW * args.IH) * in_c + (args.IW) * in_y + in_x;
                const int64_t kernel_idx = (args.KH * args.KW * args.OC) * in_c + (args.KH * args.KW) * out_c + (args.KW) * kh + kw;

                v += (float)src0[kernel_idx] * src1[input_idx];
            }
        }
    }

    const uint tid = tpitg.y * ntg.x + tpitg.x;
    shared_sum[tid] = v;

    threadgroup_barrier(mem_flags::mem_threadgroup);

    // thread 0 folds all partial sums and writes the output element
    if (tid == 0) {
        float total = 0.0f;
        const uint num_threads = ntg.x * ntg.y;
        for (uint i = 0; i < num_threads; i++) {
            total += shared_sum[i];
        }

        device float * dst_ptr = (device float *) (dst + out_x*args.nb0 + out_y*args.nb1 + out_c*args.nb2);
        dst_ptr[0] = total;
    }
}

template [[host_name("kernel_conv_transpose_2d_f32_f32")]]
kernel void kernel_conv_transpose_2d<float>(
        constant ggml_metal_kargs_conv_transpose_2d & args,
        device const float * src0,
        device const float * src1,
        device       char  * dst,
        threadgroup  float * shared_sum [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3 ntg[[threads_per_threadgroup]]);

template [[host_name("kernel_conv_transpose_2d_f16_f32")]]
kernel void kernel_conv_transpose_2d<half>(
        constant ggml_metal_kargs_conv_transpose_2d & args,
        device const half  * src0,
        device const float * src1,
        device       char  * dst,
        threadgroup  float * shared_sum [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3 ntg[[threads_per_threadgroup]]);
test_conv_transpose_2d({256, 256, 256, 1}, {3, 3, 16, 256}, 1)); + test_cases.emplace_back(new test_conv_transpose_2d({16, 16, 16, 1}, {3, 3, 8, 16}, 1)); + test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2)); test_cases.emplace_back(new test_mean(GGML_TYPE_F32, {256, 256, 3, 1})); From 9f3e11c7981f57383d08fa0572c1ac88b30c85e7 Mon Sep 17 00:00:00 2001 From: Ilia Ilmer Date: Thu, 16 Oct 2025 08:59:43 -0400 Subject: [PATCH 11/11] add dynamic memory allocation in metal --- ggml/src/ggml-metal/ggml-metal-ops.cpp | 6 ++++-- ggml/src/ggml-metal/ggml-metal.metal | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index b85da862a7..427b77ae5b 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -3094,8 +3094,6 @@ int ggml_metal_op_conv_transpose_2d(ggml_metal_op_t ctx, int idx) { const int32_t KH = op->src[0]->ne[1]; const int32_t KW = op->src[0]->ne[0]; - GGML_ASSERT(KW * KH <= 64 && "conv_transpose_2d kernel size exceeds threadgroup memory limit"); - const int32_t OW = op->ne[0]; const int32_t OH = op->ne[1]; const int32_t OC = op->ne[2]; @@ -3121,6 +3119,10 @@ int ggml_metal_op_conv_transpose_2d(ggml_metal_op_t ctx, int idx) { ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3); + // Metal requires buffer size to be multiple of 16 bytes + const size_t smem = GGML_PAD(KW * KH * sizeof(float), 16); + ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0); + ggml_metal_encoder_dispatch_threadgroups(enc, OW, OH, OC, KW, KH, 1); return 1; diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index fcc30c8213..994f9a0e49 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4146,6 +4146,7 @@ kernel void 
kernel_conv_transpose_2d( device const T * src0, device const float * src1, device char * dst, + threadgroup float * shared_sum [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]) { @@ -4182,7 +4183,6 @@ kernel void kernel_conv_transpose_2d( v += (float)src0[kernel_idx] * src1[input_idx]; } - threadgroup float shared_sum[64]; const uint tid = tpitg.y * ntg.x + tpitg.x; shared_sum[tid] = v; @@ -4206,6 +4206,7 @@ kernel void kernel_conv_transpose_2d( device const float * src0, device const float * src1, device char * dst, + threadgroup float * shared_sum [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]); @@ -4216,6 +4217,7 @@ kernel void kernel_conv_transpose_2d( device const half * src0, device const float * src1, device char * dst, + threadgroup float * shared_sum [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]);