
Commit 2be720d

Sync llama: use FA + max. GPU layers by default
1 parent d4f87d3 commit 2be720d

File tree

5 files changed: +78 -27 lines changed


llama_cpp/llama.py

Lines changed: 23 additions & 7 deletions
@@ -79,8 +79,10 @@ def __init__(
         n_threads_batch: Optional[int] = None,
         rope_scaling_type: Optional[
             int
-        ] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
+        ] = llama_cpp.llama_rope_scaling_type.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
+        attention_type: Optional[int] = llama_cpp.llama_attention_type.LLAMA_ATTENTION_TYPE_UNSPECIFIED,
+        flash_attn_type: Optional[int] = llama_cpp.llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_AUTO,
         rope_freq_base: float = 0.0,
         rope_freq_scale: float = 0.0,
         yarn_ext_factor: float = -1.0,
@@ -91,7 +93,6 @@ def __init__(
         logits_all: bool = False,
         embedding: bool = False,
         offload_kqv: bool = True,
-        flash_attn: bool = False,
         op_offload: Optional[bool] = None,
         swa_full: Optional[bool] = None,
         kv_unified: Optional[bool] = None,
@@ -164,6 +165,8 @@ def __init__(
             n_threads_batch: Number of threads to use for batch processing
             rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggml-org/llama.cpp/pull/2054
             pooling_type: Pooling type, from `enum llama_pooling_type`.
+            attention_type: attention type to use for embeddings
+            flash_attn_type: when to enable Flash Attention
             rope_freq_base: RoPE base frequency, 0 = from model
             rope_freq_scale: RoPE frequency scaling factor, 0 = from model
             yarn_ext_factor: YaRN extrapolation mix factor, negative = from model
@@ -174,7 +177,6 @@ def __init__(
             logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
-            flash_attn: Use flash attention.
             op_offload: whether to offload host tensor operations to device
             swa_full: whether to use full-size SWA cache
             kv_unified: use single unified KV buffer for the KV cache of all sequences
@@ -318,9 +320,23 @@ def __init__(
         self.context_params.rope_scaling_type = (
             rope_scaling_type
             if rope_scaling_type is not None
-            else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
+            else llama_cpp.llama_rope_scaling_type.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
+        )
+        self.context_params.pooling_type = (
+            pooling_type
+            if pooling_type is not None
+            else llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED
+        )
+        self.context_params.attention_type = (
+            attention_type
+            if attention_type is not None
+            else llama_cpp.llama_attention_type.LLAMA_ATTENTION_TYPE_UNSPECIFIED
+        )
+        self.context_params.flash_attn_type = (
+            flash_attn_type
+            if flash_attn_type is not None
+            else llama_cpp.llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_AUTO
         )
-        self.context_params.pooling_type = pooling_type
         self.context_params.rope_freq_base = (
             rope_freq_base if rope_freq_base != 0.0 else 0
         )
@@ -343,7 +359,6 @@ def __init__(
         self._logits_all = logits_all if draft_model is None else True
         self.context_params.embeddings = embedding # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
-        self.context_params.flash_attn = flash_attn
 
         if op_offload is not None:
             self.context_params.op_offload = op_offload
@@ -2201,6 +2216,8 @@ def __getstate__(self):
            n_threads_batch=self.context_params.n_threads_batch,
            rope_scaling_type=self.context_params.rope_scaling_type,
            pooling_type=self.context_params.pooling_type,
+           attention_type=self.context_params.attention_type,
+           flash_attn_type=self.context_params.flash_attn_type,
            rope_freq_base=self.context_params.rope_freq_base,
            rope_freq_scale=self.context_params.rope_freq_scale,
            yarn_ext_factor=self.context_params.yarn_ext_factor,
@@ -2211,7 +2228,6 @@ def __getstate__(self):
            logits_all=self._logits_all,
            embedding=self.context_params.embeddings,
            offload_kqv=self.context_params.offload_kqv,
-           flash_attn=self.context_params.flash_attn,
            op_offload=self.context_params.op_offload,
            swa_full=self.context_params.swa_full,
            kv_unified= self.context_params.kv_unified,
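
Taken together, the llama.py changes replace the old flash_attn boolean with an enum-valued flash_attn_type (defaulting to AUTO) and expose attention_type alongside pooling_type. Below is a minimal usage sketch, not part of the commit: the model path is a placeholder and the remaining constructor arguments are assumed to keep their defaults.

import llama_cpp
from llama_cpp import Llama

# Hypothetical GGUF path, for illustration only.
llm = Llama(
    model_path="./models/example-7b-q4_k_m.gguf",
    n_gpu_layers=-1,  # offload as many layers as possible to the GPU
    # flash_attn=True/False is gone; the enum below replaces it.
    # AUTO (the new default) lets llama.cpp decide; DISABLED/ENABLED force it.
    flash_attn_type=llama_cpp.llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_ENABLED,
    attention_type=llama_cpp.llama_attention_type.LLAMA_ATTENTION_TYPE_UNSPECIFIED,
)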

llama_cpp/llama_cpp.py

Lines changed: 38 additions & 13 deletions
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import ctypes
+import enum
 import os
 import pathlib
 
@@ -451,12 +452,13 @@
 # LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3,
 # LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
 # };
-LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1
-LLAMA_ROPE_SCALING_TYPE_NONE = 0
-LLAMA_ROPE_SCALING_TYPE_LINEAR = 1
-LLAMA_ROPE_SCALING_TYPE_YARN = 2
-LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3
-LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN
+class llama_rope_scaling_type(enum.IntEnum):
+    LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1
+    LLAMA_ROPE_SCALING_TYPE_NONE = 0
+    LLAMA_ROPE_SCALING_TYPE_LINEAR = 1
+    LLAMA_ROPE_SCALING_TYPE_YARN = 2
+    LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3
+    LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN
 
 # enum llama_pooling_type {
 # LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
@@ -478,10 +480,33 @@
 # LLAMA_ATTENTION_TYPE_CAUSAL = 0,
 # LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
 # };
-LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1
-LLAMA_ATTENTION_TYPE_CAUSAL = 0
-LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1
+class llama_attention_type(enum.IntEnum):
+    LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1
+    LLAMA_ATTENTION_TYPE_CAUSAL = 0
+    LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1
+
+# enum llama_flash_attn_type {
+# LLAMA_FLASH_ATTN_TYPE_AUTO = -1,
+# LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
+# LLAMA_FLASH_ATTN_TYPE_ENABLED = 1,
+# };
+class llama_flash_attn_type(enum.IntEnum):
+    LLAMA_FLASH_ATTN_TYPE_AUTO = -1
+    LLAMA_FLASH_ATTN_TYPE_DISABLED = 0
+    LLAMA_FLASH_ATTN_TYPE_ENABLED = 1
 
+# LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
+@ctypes_function(
+    "llama_flash_attn_type_name",
+    [ctypes.c_int],
+    ctypes.c_char_p,
+)
+def llama_flash_attn_type_name(
+    flash_attn_type: llama_flash_attn_type, /
+) -> bytes:
+    """
+    Gets the name of a llama_flash_attn_type.
+    """
 
 # enum llama_split_mode {
 # LLAMA_SPLIT_MODE_NONE = 0, // single GPU
@@ -793,6 +818,7 @@ class llama_model_params(ctypes.Structure):
 # enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 # enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
 # enum llama_attention_type attention_type; // attention type to use for embeddings
+# enum llama_flash_attn_type flash_attn_type; // when to enable Flash Attention
 
 # // ref: https://github.com/ggml-org/llama.cpp/pull/2054
 # float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -818,7 +844,6 @@ class llama_model_params(ctypes.Structure):
 # // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
 # bool embeddings; // if true, extract embeddings (together with logits)
 # bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
-# bool flash_attn; // use flash attention [EXPERIMENTAL]
 # bool no_perf; // measure performance timings
 # bool op_offload; // offload host tensor operations to device
 # bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
@@ -841,6 +866,7 @@ class llama_context_params(ctypes.Structure):
         rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
         pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
         attention_type (int): attention type to use for embeddings
+        flash_attn_type (int): when to enable Flash Attention
         rope_freq_base (float): RoPE base frequency, 0 = from model
         rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
         yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model
@@ -857,7 +883,6 @@ class llama_context_params(ctypes.Structure):
         abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
         embeddings (bool): if true, extract embeddings (together with logits)
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
-        flash_attn (bool): whether to use flash attention
         no_perf (bool): whether to measure performance timings
         op_offload(bool): whether to offload host tensor operations to device
         swa_full(bool): whether to use full-size SWA cache
@@ -874,6 +899,7 @@ class llama_context_params(ctypes.Structure):
     rope_scaling_type: int
     pooling_type: int
    attention_type: int
+    flash_attn_type: int
     rope_freq_base: float
     rope_freq_scale: float
     yarn_ext_factor: float
@@ -890,7 +916,6 @@ class llama_context_params(ctypes.Structure):
     abort_callback_data: ctypes.c_void_p
     embeddings: bool
     offload_kqv: bool
-    flash_attn: bool
     no_perf: bool
     op_offload:bool
     swa_full:bool
@@ -906,6 +931,7 @@ class llama_context_params(ctypes.Structure):
         ("rope_scaling_type", ctypes.c_int),
         ("pooling_type", ctypes.c_int),
         ("attention_type", ctypes.c_int),
+        ("flash_attn_type", ctypes.c_int),
         ("rope_freq_base", ctypes.c_float),
         ("rope_freq_scale", ctypes.c_float),
         ("yarn_ext_factor", ctypes.c_float),
@@ -922,7 +948,6 @@ class llama_context_params(ctypes.Structure):
         ("abort_callback_data", ctypes.c_void_p),
         ("embeddings", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
-        ("flash_attn", ctypes.c_bool),
         ("no_perf", ctypes.c_bool),
         ("op_offload", ctypes.c_bool),
         ("swa_full", ctypes.c_bool),

llama_cpp/server/model.py

Lines changed: 3 additions & 1 deletion
@@ -281,6 +281,9 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         n_threads=settings.n_threads,
         n_threads_batch=settings.n_threads_batch,
         rope_scaling_type=settings.rope_scaling_type,
+        pooling_type=settings.pooling_type,
+        attention_type=settings.attention_type,
+        flash_attn_type=settings.flash_attn_type,
         rope_freq_base=settings.rope_freq_base,
         rope_freq_scale=settings.rope_freq_scale,
         yarn_ext_factor=settings.yarn_ext_factor,
@@ -292,7 +295,6 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         logits_all=settings.logits_all,
         embedding=settings.embedding,
         offload_kqv=settings.offload_kqv,
-        flash_attn=settings.flash_attn,
         op_offload=settings.op_offload,
         swa_full=settings.swa_full,
         kv_unified=settings.kv_unified,

llama_cpp/server/settings.py

Lines changed: 14 additions & 4 deletions
@@ -84,7 +84,20 @@ class ModelSettings(BaseSettings):
         description="The number of threads to use when batch processing. Use -1 for max cpu threads",
     )
     rope_scaling_type: int = Field(
-        default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
+        default=llama_cpp.llama_rope_scaling_type.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
+        description="RoPE scaling type, from `enum llama_rope_scaling_type",
+    )
+    pooling_type: int = Field(
+        default=llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
+        description="whether to pool (sum) embedding results by sequence id",
+    )
+    attention_type: int = Field(
+        default=llama_cpp.llama_attention_type.LLAMA_ATTENTION_TYPE_UNSPECIFIED,
+        description="attention type to use for embeddings",
+    )
+    flash_attn_type: int = Field(
+        default=llama_cpp.llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_AUTO,
+        description="when to enable Flash Attention",
     )
     rope_freq_base: float = Field(default=0.0, description="RoPE base frequency")
     rope_freq_scale: float = Field(
@@ -103,9 +116,6 @@ class ModelSettings(BaseSettings):
     offload_kqv: bool = Field(
         default=True, description="Whether to offload kqv to the GPU."
     )
-    flash_attn: bool = Field(
-        default=False, description="Whether to use flash attention."
-    )
     op_offload: bool = Field(
         default=True, description="Whether to offload host tensor operations to device"
     )
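
On the server side, the removed flash_attn setting is superseded by flash_attn_type, pooling_type, and attention_type fields, which load_llama_from_model_settings (llama_cpp/server/model.py above) now forwards to the Llama constructor. A sketch of programmatic use, assuming ModelSettings still takes the model path via its model field; the path is a placeholder and all other fields keep their defaults:

import llama_cpp
from llama_cpp.server.model import load_llama_from_model_settings
from llama_cpp.server.settings import ModelSettings

# Hypothetical GGUF path; only the fields touched by this commit are spelled out.
settings = ModelSettings(
    model="./models/example-7b-q4_k_m.gguf",
    flash_attn_type=llama_cpp.llama_flash_attn_type.LLAMA_FLASH_ATTN_TYPE_AUTO,
    pooling_type=llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
    attention_type=llama_cpp.llama_attention_type.LLAMA_ATTENTION_TYPE_UNSPECIFIED,
)
llama = load_llama_from_model_settings(settings)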

tests/test_llama.py

Lines changed: 0 additions & 2 deletions
@@ -129,7 +129,6 @@ def test_real_llama(llama_cpp_model_path):
         n_threads=multiprocessing.cpu_count(),
         n_threads_batch=multiprocessing.cpu_count(),
         logits_all=False,
-        flash_attn=True,
         swa_full=True,
         kv_unified=True,
     )
@@ -234,7 +233,6 @@ def test_real_llama_embeddings(llama_cpp_model_path):
         n_threads=multiprocessing.cpu_count(),
         n_threads_batch=multiprocessing.cpu_count(),
         logits_all=False,
-        flash_attn=True,
         swa_full=True,
         kv_unified=True,
         embedding=True