Skip to content

Commit 97677f7

Browse files
committed
fix
1 parent f8f5be1 commit 97677f7

File tree

2 files changed

+1
-2
lines changed

2 files changed

+1
-2
lines changed

ggml/src/ggml-cuda/pad.cu

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,6 @@ static void pad_f32_cuda(const float * x, float * dst,
5757
dim3 gridDim(num_blocks, ne1, ne2*ne3);
5858
pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
5959
}
60-
#include "ggml-impl.h"
6160

6261
static void pad_f16_cuda(const half * x, half * dst,
6362
const int ne00, const int ne01, const int ne02, const int ne03,

src/llama.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9568,7 +9568,7 @@ struct llama_context * llama_init_from_model(
95689568
}
95699569

95709570
if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
9571-
LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k != n_embd_head_v - forcing off\n", __func__);
9571+
LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
95729572
params.flash_attn = false;
95739573
}
95749574

0 commit comments

Comments (0)