From adb72c52f1e7442e7959600e5c8ce24c82366635 Mon Sep 17 00:00:00 2001 From: bssrdf Date: Wed, 3 Dec 2025 21:39:05 -0500 Subject: [PATCH 1/3] relax the transposed copy condition check a bit and add another case --- ggml/src/ggml-cuda/cpy.cu | 60 ++++++++++++++++++++++++++------------ tests/test-backend-ops.cpp | 5 ++++ 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index c4ceb4fc579..1222b11fdf8 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -200,23 +200,13 @@ static void ggml_cpy_scalar_cuda( if (transposed) { GGML_ASSERT(ne == ne00*ne01*ne02); // ne[3] is 1 assumed - int ne00n, ne01n, ne02n; - if (nb00 <= nb02) { // most likely safe to handle nb00 = nb02 case here - ne00n = ne00; - ne01n = ne01; - ne02n = ne02; - } else { - ne00n = ne00; - ne01n = ne01*ne02; - ne02n = 1; - } - dim3 dimGrid( (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D, - (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D, - (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM); + dim3 dimGrid( (ne01 + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D, + (ne00 + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D, + (ne/(ne01*ne00) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM); dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1); cpy_scalar_transpose<<>> - (cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } else { const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; cpy_scalar><<>> @@ -359,9 +349,10 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; + int64_t ne00 = src0->ne[0]; + int64_t ne01 = src0->ne[1]; + int64_t ne02 = src0->ne[2]; + //GGML_ASSERT(src0->ne[3] == 1); @@ -387,8 +378,39 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg char * src1_ddc = (char *) src1->data; const bool contiguous_srcs = ggml_is_contiguous(src0) && ggml_is_contiguous(src1); - const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) && - src0->ne[3] == 1 && nb02 == ne00 * ne01 * (int64_t)ggml_element_size(src0); + + bool can_be_transposed = false; + if (src0->ne[3] == 1 ) { + int64_t ne00n, ne01n, ne02n; + if (nb01 == (int64_t)ggml_element_size(src0) && + (nb02 == ne00 * ne01 * (int64_t)ggml_element_size(src0) || + nb00 == ne01 * ne02 * (int64_t)ggml_element_size(src0))) { + if (nb00 <= nb02) { // most likely safe to handle nb00 = nb02 case here + ne00n = ne00; + ne01n = ne01; + ne02n = ne02; + } else { + ne00n = ne00; + ne01n = ne01*ne02; + ne02n = 1; + } + ne00 = ne00n ; + ne01 = ne01n; + ne02 = ne02n; + can_be_transposed = true; + } + if ((nb02 == (int64_t)ggml_element_size(src0) && + nb01 == ne02 * ne00 * (int64_t)ggml_element_size(src0))) { + GGML_ASSERT(nb00 <= nb01); + ne00n = ne00*ne01; + ne01n = ne02; + ne02n = 1; // not used + ne00 = ne00n ; + ne01 = ne01n; + ne02 = ne02n; + can_be_transposed = true; + } + } if (src0->type == src1->type && contiguous_srcs) { GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1)); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 7ef7f2ad81e..45a6dbc498e 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7928,6 +7928,11 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {4352, 1, 9216, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4352, 1, 9216, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {21504, 4352, 1, 1}, {2, 0, 1, 3}, {0, 0, 0, 0})); + + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {12888, 256, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); From 3683721688cc79a3d73c7ae938925ea8df6efe2e Mon Sep 17 00:00:00 2001 From: bssrdf Date: Thu, 4 Dec 2025 08:44:08 -0500 Subject: [PATCH 2/3] minor tweak --- ggml/src/ggml-cuda/cpy.cu | 8 ++++---- tests/test-backend-ops.cpp | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 1222b11fdf8..deed207687a 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -394,18 +394,18 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg ne01n = ne01*ne02; ne02n = 1; } - ne00 = ne00n ; + ne00 = ne00n; ne01 = ne01n; ne02 = ne02n; can_be_transposed = true; } - if ((nb02 == (int64_t)ggml_element_size(src0) && + if ((nb02 == (int64_t)ggml_element_size(src0) && nb00 <= nb01 && nb01 == ne02 * ne00 * (int64_t)ggml_element_size(src0))) { - GGML_ASSERT(nb00 <= nb01); + // GGML_ASSERT(nb00 <= nb01); ne00n = ne00*ne01; ne01n = ne02; ne02n = 1; // not used - ne00 = ne00n ; + ne00 = ne00n; ne01 = ne01n; ne02 = ne02n; can_be_transposed = true; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 45a6dbc498e..30992103de2 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7928,6 +7928,7 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + // sd.cpp cases test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {4352, 1, 9216, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4352, 1, 9216, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {21504, 4352, 1, 1}, {2, 0, 1, 3}, {0, 0, 0, 0})); From 74a391287457303ef6b7822bd8a8e69f37736e58 Mon Sep 17 00:00:00 2001 From: bssrdf Date: Thu, 4 Dec 2025 20:45:56 -0500 Subject: [PATCH 3/3] swap block dims in case number of blocks bigger than allowed gridDimY --- ggml/src/ggml-cuda/cpy.cu | 32 ++++++++++++++++++++------------ tests/test-backend-ops.cpp | 3 +++ 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index deed207687a..b1e41d2156f 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -39,7 +39,7 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int ne, cpy_1(cx + x_offset, cdst + dst_offset); } -template +template static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, @@ -51,10 +51,10 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const const int64_t nmat = ne / (ne00 * ne01); const int64_t n = ne00 * ne01; - const int x = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.x; - const int y = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.y; - const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset - const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y; + const int x = (swap == 0 ? blockIdx.x : blockIdx.y) * CUDA_CPY_TILE_DIM_2D + threadIdx.x; + const int y = (swap == 0 ? blockIdx.y : blockIdx.x) * CUDA_CPY_TILE_DIM_2D + threadIdx.y; + const int tx = (swap == 0 ? blockIdx.y : blockIdx.x) * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset + const int ty = (swap == 0 ? blockIdx.x : blockIdx.y) * CUDA_CPY_TILE_DIM_2D + threadIdx.y; __shared__ float tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1]; @@ -200,13 +200,21 @@ static void ggml_cpy_scalar_cuda( if (transposed) { GGML_ASSERT(ne == ne00*ne01*ne02); // ne[3] is 1 assumed - - dim3 dimGrid( (ne01 + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D, - (ne00 + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D, - (ne/(ne01*ne00) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM); - dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1); - cpy_scalar_transpose<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + if(ne01 > ne00) { + dim3 dimGrid( (ne01 + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D, + (ne00 + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D, + (ne/(ne01*ne00) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM); + dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1); + cpy_scalar_transpose<<>> + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + } else { + dim3 dimGrid( (ne00 + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D, + (ne01 + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D, + (ne/(ne01*ne00) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM); + dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1); + cpy_scalar_transpose<<>> + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + } } else { const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; cpy_scalar><<>> diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 30992103de2..1be4efbe8ad 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7073,6 +7073,9 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {4, 1, 256, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {16, 256, 1, 1}, {2, 0, 1, 3}, {0, 0, 0, 0})); + for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_BF16 }) { for (bool use_view_slice : { true, false }) {