@@ -381,6 +381,7 @@
 # //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
 # LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
 #
 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -419,6 +420,7 @@
 # LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35
 LLAMA_FTYPE_MOSTLY_TQ1_0 = 36
 LLAMA_FTYPE_MOSTLY_TQ2_0 = 37
+LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38
 LLAMA_FTYPE_GUESSED = 1024
 
 # enum llama_rope_scaling_type {
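
A minimal sketch of how the new MXFP4 MoE file type might be selected when quantizing through these bindings; llama_backend_init, llama_model_quantize_default_params, and llama_model_quantize already exist in this module, while the GGUF paths are placeholders:

import ctypes

import llama_cpp

llama_cpp.llama_backend_init()
qparams = llama_cpp.llama_model_quantize_default_params()
qparams.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_MXFP4_MOE  # the new ftype above
# "in.gguf" / "out.gguf" are placeholder paths for this sketch
ret = llama_cpp.llama_model_quantize(b"in.gguf", b"out.gguf", ctypes.byref(qparams))
assert ret == 0  # llama_model_quantize returns 0 on success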
@@ -691,6 +693,7 @@ class llama_model_kv_override(ctypes.Structure):
 # bool use_mmap; // use mmap if possible
 # bool use_mlock; // force system to keep model in RAM
 # bool check_tensors; // validate model tensor data
+# bool use_extra_bufts; // use extra buffer types (used for weight repacking)
 # };
 class llama_model_params(ctypes.Structure):
     """Parameters for llama_model
@@ -708,7 +711,8 @@ class llama_model_params(ctypes.Structure):
         vocab_only (bool): only load the vocabulary, no weights
         use_mmap (bool): use mmap if possible
         use_mlock (bool): force system to keep model in RAM
-        check_tensors (bool): validate model tensor data"""
+        check_tensors (bool): validate model tensor data
+        use_extra_bufts (bool): use extra buffer types (used for weight repacking)"""
 
     if TYPE_CHECKING:
         devices: CtypesArray[ctypes.c_void_p]  # NOTE: unused
@@ -724,6 +728,7 @@ class llama_model_params(ctypes.Structure):
         use_mmap: bool
         use_mlock: bool
         check_tensors: bool
+        use_extra_bufts: bool
 
     _fields_ = [
         ("devices", ctypes.c_void_p),  # NOTE: unused
@@ -739,6 +744,7 @@ class llama_model_params(ctypes.Structure):
739744 ("use_mmap", ctypes.c_bool),
740745 ("use_mlock", ctypes.c_bool),
741746 ("check_tensors", ctypes.c_bool),
747+ ("use_extra_bufts", ctypes.c_bool),
742748 ]
743749
744750
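
A minimal sketch of toggling the new flag from Python, assuming a local GGUF file at the hypothetical path "model.gguf":

import llama_cpp

llama_cpp.llama_backend_init()
mparams = llama_cpp.llama_model_default_params()
mparams.use_extra_bufts = True  # opt in to extra buffer types (weight repacking)
model = llama_cpp.llama_model_load_from_file(b"model.gguf", mparams)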
@@ -787,6 +793,9 @@ class llama_model_params(ctypes.Structure):
 # bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 #                // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
 #                // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+# bool kv_unified; // use a unified buffer across the input sequences when computing the attention
+#                  // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+#                  // ref: https://github.com/ggml-org/llama.cpp/pull/14363
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context
@@ -821,6 +830,7 @@ class llama_context_params(ctypes.Structure):
         no_perf (bool): whether to measure performance timings
         op_offload (bool): offload host tensor operations to device
         swa_full (bool): use full-size SWA cache
+        kv_unified (bool): use a unified buffer across the input sequences when computing the attention
     """
 
     if TYPE_CHECKING:
@@ -853,6 +863,7 @@ class llama_context_params(ctypes.Structure):
         no_perf: bool
         op_offload: bool
         swa_full: bool
+        kv_unified: bool
 
     _fields_ = [
         ("n_ctx", ctypes.c_uint32),
@@ -884,6 +895,7 @@ class llama_context_params(ctypes.Structure):
884895 ("no_perf", ctypes.c_bool),
885896 ("op_offload", ctypes.c_bool),
886897 ("swa_full", ctypes.c_bool),
898+ ("kv_unified", ctypes.c_bool),
887899 ]
888900
889901
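
Likewise for the new context flag; a sketch that reuses the model handle from the previous snippet:

import llama_cpp

cparams = llama_cpp.llama_context_default_params()
cparams.n_seq_max = 4      # several parallel sequences
cparams.kv_unified = True  # one shared KV buffer; per the note above, consider
                           # False when sequences do not share a large prefix
ctx = llama_cpp.llama_init_from_model(model, cparams)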
@@ -1651,6 +1663,14 @@ def llama_model_is_recurrent(model: llama_model_p, /) -> bool:
     ...
 
 
+# // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
+# LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
+@ctypes_function("llama_model_is_diffusion", [llama_model_p_ctypes], ctypes.c_bool)
+def llama_model_is_diffusion(model: llama_model_p, /) -> bool:
+    """Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)"""
+    ...
+
+
 # // Returns 0 on success
 # LLAMA_API uint32_t llama_model_quantize(
 #         const char * fname_inp,
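
The new predicate has the same shape as llama_model_is_recurrent above; a sketch, again assuming a loaded model handle:

import llama_cpp

if llama_cpp.llama_model_is_diffusion(model):
    # LLaDA/Dream-style models fill in masked positions over several
    # denoising steps rather than decoding strictly left to right
    print("diffusion-based model")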
@@ -2833,6 +2853,7 @@ def llama_synchronize(ctx: llama_context_p, /):
 # // in the order they have appeared in the batch.
 # // Rows: number of tokens for which llama_batch.logits[i] != 0
 # // Cols: n_vocab
+# // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
 # LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 @ctypes_function(
     "llama_get_logits", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
@@ -2873,6 +2894,7 @@ def llama_get_logits_ith(
 # // in the order they have appeared in the batch.
 # // shape: [n_outputs*n_embd]
 # // Otherwise, returns NULL.
+# // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
 # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 @ctypes_function(
     "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
@@ -3020,6 +3042,13 @@ def llama_vocab_pad(vocab: llama_vocab_p, /) -> llama_token:
     ...
 
 
+# LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
+@ctypes_function("llama_vocab_mask", [llama_vocab_p_ctypes], llama_token)
+def llama_vocab_mask(vocab: llama_vocab_p, /) -> llama_token:
+    """mask"""
+    ...
+
+
 # LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
 @ctypes_function(
     "llama_vocab_get_add_bos",
@@ -4176,6 +4205,7 @@ def llama_log_set(
 
 # int32_t n_p_eval;
 # int32_t n_eval;
+# int32_t n_reused; // number of times a ggml compute graph has been reused
 # };
 class llama_perf_context_data(ctypes.Structure):
     _fields_ = [
@@ -4185,6 +4215,7 @@ class llama_perf_context_data(ctypes.Structure):
41854215 ("t_eval_ms", ctypes.c_double),
41864216 ("n_p_eval", ctypes.c_int32),
41874217 ("n_eval", ctypes.c_int32),
4218+ ("n_reused", ctypes.c_int32),
41884219 ]
41894220
41904221
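
A sketch of reading the new counter via llama_perf_context, assuming a live ctx:

import llama_cpp

perf = llama_cpp.llama_perf_context(ctx)  # returns llama_perf_context_data by value
print(f"compute graphs reused: {perf.n_reused}")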