diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 605fcfcb9c29f..a088f0c0a3a77 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2190,6 +2190,15 @@ extern "C" {
             int                  p2,
             int                  p3);
 
+    // pad each dimension with values on the other side of the torus (looping around)
+    GGML_API struct ggml_tensor * ggml_pad_circular(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                  p0,
+            int                  p1,
+            int                  p2,
+            int                  p3);
+
     GGML_API struct ggml_tensor * ggml_pad_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -2203,6 +2212,20 @@ extern "C" {
             int                  rp3
             );
 
+    // pad each dimension with values on the other side of the torus (looping around)
+    GGML_API struct ggml_tensor * ggml_pad_ext_circular(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                  lp0,
+            int                  rp0,
+            int                  lp1,
+            int                  rp1,
+            int                  lp2,
+            int                  rp2,
+            int                  lp3,
+            int                  rp3
+            );
+
     // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
     GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
             struct ggml_context * ctx,
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index b6209588db1e4..c7207ec1c0a69 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6554,8 +6554,13 @@ static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params
     ggml_compute_forward_mul_mat(params, &dst);
 }
 
+static inline int64_t ggml_wrap_coord(int64_t coord, int64_t size) {
+    return (coord  + size) % size; // adding size avoids negative number weirdness
+}
+
 // ggml_compute_forward_conv_2d
 
+
 static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params,
                                               const ggml_tensor *         kernel,  // [KW, KH, IC, OC]
                                               const ggml_tensor *         src,     // [W, H, C, N]
@@ -7555,24 +7560,51 @@ static void ggml_compute_forward_pad_f32(
     const int32_t rp2 = ggml_get_op_params_i32(dst, 5);
     const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
     const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
+    const int32_t circular = ggml_get_op_params_i32(dst, 8);
 
 
     // TODO: optimize
 
-    for (int64_t i2 = 0; i2 < ne2; ++i2) {
-        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
-            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                for (int64_t i3 = 0; i3 < ne3; ++i3) {
-                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
-                    if ((i0 >= lp0 && i0 < ne0 - rp0) \
-                         && (i1 >= lp1 && i1 < ne1 - rp1) \
-                         && (i2 >= lp2 && i2 < ne2 - rp2) \
-                         && (i3 >= lp3 && i3 < ne3 - rp3)) {
-                        const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
+    if (circular == 0)  {
+        for (int64_t i2 = 0; i2 < ne2; ++i2) {
+            for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+                for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                    for (int64_t i3 = 0; i3 < ne3; ++i3) {
+                        const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+                        if ((i0 >= lp0 && i0 < ne0 - rp0) \
+                            && (i1 >= lp1 && i1 < ne1 - rp1) \
+                            && (i2 >= lp2 && i2 < ne2 - rp2) \
+                            && (i3 >= lp3 && i3 < ne3 - rp3)) {
+                            const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
+                            const float * src_ptr = (const float *)((char *) src0->data + src_idx);
+                            dst_ptr[dst_idx] = *src_ptr;
+                        } else {
+                            dst_ptr[dst_idx] = 0;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    else {
+        for (int64_t i2 = 0; i2 < ne2; ++i2) {
+            for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+                for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                    for (int64_t i3 = 0; i3 < ne3; ++i3) {
+                        const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+                        const int64_t src_i0 = ggml_wrap_coord(i0 - lp0, ne00);
+                        const int64_t src_i1 = ggml_wrap_coord(i1 - lp1, ne01);
+                        const int64_t src_i2 = ggml_wrap_coord(i2 - lp2, ne02);
+                        const int64_t src_i3 = ggml_wrap_coord(i3 - lp3, ne03);
+
+                        const int64_t src_idx =
+                            src_i3*nb03 +
+                            src_i2*nb02 +
+                            src_i1*nb01 +
+                            src_i0*nb00;
+                        
                         const float * src_ptr = (const float *)((char *) src0->data + src_idx);
                         dst_ptr[dst_idx] = *src_ptr;
-                    } else {
-                        dst_ptr[dst_idx] = 0;
                     }
                 }
             }
diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
index 29aef33c1a4b8..f3f06897e4299 100644
--- a/ggml/src/ggml-cuda/pad.cu
+++ b/ggml/src/ggml-cuda/pad.cu
@@ -1,9 +1,18 @@
+#include <stdint.h>
+
 #include "pad.cuh"
 
+
+
+__device__ __forceinline__ int64_t wrap_coord(int64_t coord, int64_t size) {
+    return (coord % size + size) % size;
+}
+
 static __global__ void pad_f32(const float * src, float * dst,
                                const int lp0, const int rp0, const int lp1, const int rp1,
                                const int lp2, const int rp2, const int lp3, const int rp3,
-                               const int ne0, const int ne1, const int ne2, const int ne3) {
+                               const int ne0, const int ne1, const int ne2, const int ne3,
+                               const int circular) {
     // blockIdx.z: i3*ne2+i2
     // blockIdx.y: i1
     // blockIDx.x: i0 / CUDA_PAD_BLOCK_SIZE
@@ -12,39 +21,59 @@ static __global__ void pad_f32(const float * src, float * dst,
     int i1 = blockIdx.y;
     int i2 = blockIdx.z % ne2;
     int i3 = blockIdx.z / ne2;
+
     if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
         return;
     }
-
-    // operation
+    
     const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
-    if ((i0 >= lp0 && i0 < ne0 - rp0) &&
-        (i1 >= lp1 && i1 < ne1 - rp1) &&
-        (i2 >= lp2 && i2 < ne2 - rp2) &&
-        (i3 >= lp3 && i3 < ne3 - rp3)) {
-        const int64_t i00 = i0 - lp0;
-        const int64_t i01 = i1 - lp1;
-        const int64_t i02 = i2 - lp2;
-        const int64_t i03 = i3 - lp3;
-        const int64_t ne02 = ne2 - lp2 - rp2;
-        const int64_t ne01 = ne1 - lp1 - rp1;
+
+    if (circular == 0) {
+        // operation
+        if ((i0 >= lp0 && i0 < ne0 - rp0) &&
+            (i1 >= lp1 && i1 < ne1 - rp1) &&
+            (i2 >= lp2 && i2 < ne2 - rp2) &&
+            (i3 >= lp3 && i3 < ne3 - rp3)) {
+            const int64_t i00 = i0 - lp0;
+            const int64_t i01 = i1 - lp1;
+            const int64_t i02 = i2 - lp2;
+            const int64_t i03 = i3 - lp3;
+            const int64_t ne02 = ne2 - lp2 - rp2;
+            const int64_t ne01 = ne1 - lp1 - rp1;
+            const int64_t ne00 = ne0 - lp0 - rp0;
+
+            const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00;
+
+            dst[dst_idx] = src[src_idx];
+        } else {
+            dst[dst_idx] = 0.0f;
+        }
+    }
+    else {
         const int64_t ne00 = ne0 - lp0 - rp0;
+        const int64_t ne01 = ne1 - lp1 - rp1;
+        const int64_t ne02 = ne2 - lp2 - rp2;
+        const int64_t ne03 = ne3 - lp3 - rp3;
+
+        const int64_t i00 = wrap_coord(i0 - lp0, ne00);
+        const int64_t i01 = wrap_coord(i1 - lp1, ne01);
+        const int64_t i02 = wrap_coord(i2 - lp2, ne02);
+        const int64_t i03 = wrap_coord(i3 - lp3, ne03);
 
         const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00;
 
         dst[dst_idx] = src[src_idx];
-    } else {
-        dst[dst_idx] = 0.0f;
     }
 }
 
 static void pad_f32_cuda(const float * src, float * dst,
     const int lp0, const int rp0, const int lp1, const int rp1,
     const int lp2, const int rp2, const int lp3, const int rp3,
-    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
+    const int ne0, const int ne1, const int ne2, const int ne3,
+    const int circular, cudaStream_t stream) {
     int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
     dim3 gridDim(num_blocks, ne1, ne2*ne3);
-    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1, ne2, ne3);
+    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1, ne2, ne3, circular);
 }
 
 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -65,8 +94,9 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int32_t rp2 = ((const int32_t*)(dst->op_params))[5];
     const int32_t lp3 = ((const int32_t*)(dst->op_params))[6];
     const int32_t rp3 = ((const int32_t*)(dst->op_params))[7];
+    const int32_t circular = ((const int32_t*)(dst->op_params))[8]; 
 
     pad_f32_cuda(src0_d, dst_d,
                  lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3,
-                 dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+                 dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], circular, stream);
 }
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index c6503f0326031..41fa79283e313 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -966,6 +966,7 @@ struct vk_op_pad_push_constants {
     uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
     uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
     uint32_t misalign_offsets;
+    uint32_t circular;
 
     uint32_t lp0; uint32_t rp0;
     uint32_t lp1; uint32_t rp1;
@@ -1008,6 +1009,7 @@ static vk_op_pad_push_constants vk_op_pad_push_constants_init(const ggml_tensor
     p.rp2 = dst->op_params[5];
     p.lp3 = dst->op_params[6];
     p.rp3 = dst->op_params[7];
+    p.circular = dst->op_params[8];
 
     return p; // fastdiv values and offsets are initialized later in ggml_vk_op
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp b/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp
index f3c8176872758..f2fd5929bf41d 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp
@@ -8,6 +8,7 @@ layout (push_constant) uniform parameter
     uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
     uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
     uint misalign_offsets;
+    uint circular;
 
     uint lp0; uint rp0;
     uint lp1; uint rp1;
@@ -18,6 +19,10 @@ layout (push_constant) uniform parameter
 uint get_aoffset() { return p.misalign_offsets >> 16; }
 uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
 
+uint wrap_coord(int coord, uint size) {
+    return (uint(coord + int(size))) % size; // add size to avoid issues with negative
+}
+
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 
@@ -40,10 +45,21 @@ void main() {
     const uint src0_idx = (i3 - p.lp3)*p.nb03 + (i2 - p.lp2)*p.nb02 + (i1 - p.lp1)*p.nb01 + (i0 - p.lp0)*p.nb00;
     const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10;
 
-    const bool is_src0 = i0 >= p.lp0 && i0 < p.ne10 - p.rp0 &&
-                         i1 >= p.lp1 && i1 < p.ne11 - p.rp1 &&
-                         i2 >= p.lp2 && i2 < p.ne12 - p.rp2 &&
-                         i3 >= p.lp3 && i3 < p.ne13 - p.rp3;
+    if (p.circular != 0u) {
+        const uint ci0 = wrap_coord(int(i0) - int(p.lp0), p.ne00);
+        const uint ci1 = wrap_coord(int(i1) - int(p.lp1), p.ne01);
+        const uint ci2 = wrap_coord(int(i2) - int(p.lp2), p.ne02);
+        const uint ci3 = wrap_coord(int(i3) - int(p.lp3), p.ne03);
+        const uint circular_src_idx = ci3*p.nb03 + ci2*p.nb02 + ci1*p.nb01 + ci0*p.nb00;
+        data_d[get_doffset() + dst_idx] = D_TYPE(data_a[get_aoffset() + circular_src_idx]);
+    }
+    else {
+        const bool is_src0 = i0 >= p.lp0 && i0 < p.ne10 - p.rp0 &&
+                            i1 >= p.lp1 && i1 < p.ne11 - p.rp1 &&
+                            i2 >= p.lp2 && i2 < p.ne12 - p.rp2 &&
+                            i3 >= p.lp3 && i3 < p.ne13 - p.rp3;
+        data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
+    }
+    
 
-    data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
 }
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index a5846a23937ce..bfca9a1157491 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4943,6 +4943,18 @@ struct ggml_tensor * ggml_pad(
     return ggml_pad_ext(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3);
 }
 
+// ggml_pad_circular
+
+struct ggml_tensor * ggml_pad_circular(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   p0,
+        int                   p1,
+        int                   p2,
+        int                   p3) {
+    return ggml_pad_ext_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3);
+}
+
 struct ggml_tensor * ggml_pad_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -4969,6 +4981,7 @@ struct ggml_tensor * ggml_pad_ext(
     ggml_set_op_params_i32(result, 5, rp2);
     ggml_set_op_params_i32(result, 6, lp3);
     ggml_set_op_params_i32(result, 7, rp3);
+    ggml_set_op_params_i32(result, 8, 0); // not circular by default
 
 
     result->op     = GGML_OP_PAD;
@@ -4977,6 +4990,25 @@ struct ggml_tensor * ggml_pad_ext(
     return result;
 }
 
+// ggml_pad_ext_circular
+
+struct ggml_tensor * ggml_pad_ext_circular(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                  lp0,
+        int                  rp0,
+        int                  lp1,
+        int                  rp1,
+        int                  lp2,
+        int                  rp2,
+        int                  lp3,
+        int                  rp3
+        ) {
+    struct ggml_tensor * result = ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
+    ggml_set_op_params_i32(result, 8, 1); // circular
+    return result;
+}
+
 // ggml_pad_reflect_1d
 
 struct ggml_tensor * ggml_pad_reflect_1d(
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index a7707eb03fe87..fd7f6de584265 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -5407,21 +5407,24 @@ struct test_pad : public test_case {
     const std::array<int64_t, 4> ne_a;
     const int pad_0;
     const int pad_1;
+    const bool circular;
 
     std::string vars() override {
-        return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
+        return VARS_TO_STR5(type, ne_a, pad_0, pad_1, circular);
     }
 
     test_pad(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne_a = {512, 512, 1, 1},
-            int pad_0 = 1, int pad_1 = 1)
-        : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1)  {}
+            int pad_0 = 1, int pad_1 = 1, bool circular = false)
+        : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1), circular(circular) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
         ggml_set_name(a, "a");
 
-        ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
+        ggml_tensor * out = circular
+            ? ggml_pad_circular(ctx, a, pad_0, pad_1, 0, 0)
+            : ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
         ggml_set_name(out, "out");
 
         return out;
@@ -5441,17 +5444,19 @@ struct test_pad_ext : public test_case {
     const int lp3;
     const int rp3;
     const bool v;
+    const bool circular;
 
     std::string vars() override {
-        return VARS_TO_STR11(type, ne_a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, v);
+        return VARS_TO_STR12(type, ne_a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, v, circular);
     }
 
     test_pad_ext(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne_a = {512, 512, 3, 1},
             int lp0 = 1, int rp0 = 1, int lp1 = 1, int rp1 = 1,
             int lp2 = 1, int rp2 = 1, int lp3 = 1, int rp3 = 1,
-            bool v = false)
-        : type(type), ne_a(ne_a), lp0(lp0), rp0(rp0), lp1(lp1), rp1(rp1), lp2(lp2), rp2(rp2), lp3(lp3), rp3(rp3), v(v) {}
+            bool v = false, bool circular = false)
+        : type(type), ne_a(ne_a), lp0(lp0), rp0(rp0), lp1(lp1), rp1(rp1), lp2(lp2), rp2(rp2), lp3(lp3), rp3(rp3),
+          v(v), circular(circular) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
@@ -5462,13 +5467,94 @@ struct test_pad_ext : public test_case {
             ggml_set_name(a, "view of a");
         }
 
-        ggml_tensor * out = ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
+        ggml_tensor * out = circular
+            ? ggml_pad_ext_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
+            : ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
         ggml_set_name(out, "out");
 
         return out;
     }
 };
 
+// GGML_OP_PAD_REFLECT_1D
+
+struct test_pad_ext_circular : public test_case {
+    const std::array<int64_t, 4> ne_src{4, 3, 1, 1};
+    const std::array<int64_t, 4> pads_l{1, 2, 0, 0};
+    const std::array<int64_t, 4> pads_r{2, 1, 0, 0};
+
+    ggml_tensor * input     = nullptr;
+    ggml_tensor * expected  = nullptr;
+
+    std::string vars() override {
+        return "manual_pad_ext_circular";
+    }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_src.data());
+        ggml_set_name(input, "input");
+
+        ggml_tensor * actual = ggml_pad_ext_circular(ctx, input,
+                pads_l[0], pads_r[0], pads_l[1], pads_r[1], pads_l[2], pads_r[2], pads_l[3], pads_r[3]);
+        ggml_set_name(actual, "actual");
+
+        int64_t ne_dst[4] = {
+            ne_src[0] + pads_l[0] + pads_r[0],
+            ne_src[1] + pads_l[1] + pads_r[1],
+            ne_src[2] + pads_l[2] + pads_r[2],
+            ne_src[3] + pads_l[3] + pads_r[3],
+        };
+
+        expected = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_dst);
+        ggml_set_name(expected, "expected");
+
+        ggml_tensor * diff = ggml_sub(ctx, actual, expected);
+        ggml_tensor * sq   = ggml_sqr(ctx, diff);
+        ggml_tensor * loss = ggml_sum(ctx, sq);
+        ggml_set_name(loss, "loss");
+        return loss;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        test_case::initialize_tensors(ctx);
+
+        std::vector<float> src_data(ggml_nelements(input));
+        for (size_t i = 0; i < src_data.size(); ++i) {
+            src_data[i] = static_cast<float>(i + 1);
+        }
+        ggml_backend_tensor_set(input, src_data.data(), 0, src_data.size() * sizeof(float));
+
+        int64_t ne_dst[4] = {
+            ne_src[0] + pads_l[0] + pads_r[0],
+            ne_src[1] + pads_l[1] + pads_r[1],
+            ne_src[2] + pads_l[2] + pads_r[2],
+            ne_src[3] + pads_l[3] + pads_r[3],
+        };
+
+        std::vector<float> exp_data(ggml_nelements(expected));
+        for (int64_t i3 = 0; i3 < ne_dst[3]; ++i3) {
+            for (int64_t i2 = 0; i2 < ne_dst[2]; ++i2) {
+                for (int64_t i1 = 0; i1 < ne_dst[1]; ++i1) {
+                    for (int64_t i0 = 0; i0 < ne_dst[0]; ++i0) {
+                        const int64_t src_i0 = wrap_coord_circular(i0 - pads_l[0], ne_src[0]);
+                        const int64_t src_i1 = wrap_coord_circular(i1 - pads_l[1], ne_src[1]);
+                        const int64_t src_i2 = wrap_coord_circular(i2 - pads_l[2], ne_src[2]);
+                        const int64_t src_i3 = wrap_coord_circular(i3 - pads_l[3], ne_src[3]);
+                        exp_data[offset4d(ne_dst, i0, i1, i2, i3)] =
+                            src_data[offset4d(ne_src.data(), src_i0, src_i1, src_i2, src_i3)];
+                    }
+                }
+            }
+        }
+        ggml_backend_tensor_set(expected, exp_data.data(), 0, exp_data.size() * sizeof(float));
+    }
+
+    double max_nmse_err() override {
+        return 1e-8;
+    }
+};
+
+
 // GGML_OP_PAD_REFLECT_1D
 struct test_pad_reflect_1d : public test_case {
     const ggml_type type;
@@ -7539,7 +7625,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_group_norm_mul_add(GGML_TYPE_F32, {9, 9, 1280, 1}));
     test_cases.emplace_back(new test_acc());
     test_cases.emplace_back(new test_pad());
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {33, 17, 2, 1}, 4, 3, true)); // circular
     test_cases.emplace_back(new test_pad_ext());
+    test_cases.emplace_back(new test_pad_ext_circular());
     test_cases.emplace_back(new test_pad_reflect_1d());
     test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1}));
     test_cases.emplace_back(new test_roll());