
Commit 7034082

Merge pull request #284 from menloresearch/update-dev-from-master-2025-10-10-00-33
Sync master with upstream release b6724
2 parents b28a935 + 1deee0f

Note: large commits have some content hidden by default; only a subset of the changed files is shown below.

48 files changed: +2196 / -1053 lines

.github/workflows/build.yml

Lines changed: 26 additions & 0 deletions
@@ -1515,3 +1515,29 @@ jobs:
       run: |
         vulkaninfo --summary
         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  ggml-ci-arm64-cpu-kleidiai:
+    runs-on: ubuntu-22.04-arm
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ggml-ci-arm64-cpu-kleidiai
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential libcurl4-openssl-dev
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
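
The new `ggml-ci-arm64-cpu-kleidiai` job only wraps the CI script, so it should be reproducible locally on an arm64 Ubuntu host; a minimal sketch using the same packages and scratch paths as the workflow:

```console
# reproduce the new KleidiAI CI job locally on an arm64 host
sudo apt-get update
sudo apt-get install -y build-essential libcurl4-openssl-dev
GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```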

ci/run.sh

Lines changed: 31 additions & 0 deletions
@@ -22,6 +22,9 @@
 # # with MUSA support
 # GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with KLEIDIAI support
+# GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
 
 if [ -z "$2" ]; then
     echo "usage: $0 <output-dir> <mnt-dir>"
@@ -115,6 +118,34 @@ if [ ! -z ${GG_BUILD_NO_SVE} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
 fi
 
+if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
+    echo ">>===== Enabling KleidiAI support"
+
+    CANDIDATES=("armv9-a+dotprod+i8mm" "armv8.6-a+dotprod+i8mm" "armv8.2-a+dotprod")
+    CPU=""
+
+    for cpu in "${CANDIDATES[@]}"; do
+        if echo 'int main(){}' | ${CXX:-c++} -march="$cpu" -x c++ - -c -o /dev/null >/dev/null 2>&1; then
+            CPU="$cpu"
+            break
+        fi
+    done
+
+    if [ -z "$CPU" ]; then
+        echo "ERROR: None of the required ARM baselines (armv9/armv8.6/armv8.2 + dotprod) are supported by this compiler."
+        exit 1
+    fi
+
+    echo ">>===== Using ARM baseline: ${CPU}"
+
+    CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } \
+        -DGGML_NATIVE=OFF \
+        -DGGML_CPU_KLEIDIAI=ON \
+        -DGGML_CPU_AARCH64=ON \
+        -DGGML_CPU_ARM_ARCH=${CPU} \
+        -DBUILD_SHARED_LIBS=OFF"
+fi
+
 ## helpers
 
 # download a file if it does not exist or if it is outdated
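
The probe loop above compiles an empty translation unit against each candidate `-march` baseline and keeps the first one the compiler accepts. A standalone check of a single candidate follows the same pattern (the echoed messages are illustrative, not part of the script):

```console
# check whether the local compiler accepts one of the KleidiAI candidate baselines
echo 'int main(){}' | ${CXX:-c++} -march=armv8.6-a+dotprod+i8mm -x c++ - -c -o /dev/null \
    && echo "baseline supported" || echo "baseline not supported"
```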

common/arg.cpp

Lines changed: 8 additions & 0 deletions
@@ -1935,6 +1935,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_ctx_checkpoints = value;
         }
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--cache-ram", "-cram"}, "N",
+        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
+                      "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
+        [](common_params & params, int value) {
+            params.cache_ram_mib = value;
+        }
+    ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--kv-unified", "-kvu"},
         string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"

common/chat.h

Lines changed: 3 additions & 3 deletions
@@ -33,8 +33,8 @@ struct common_chat_msg_content_part {
 struct common_chat_msg {
     std::string role;
     std::string content;
-    std::vector<common_chat_msg_content_part> content_parts = {};
-    std::vector<common_chat_tool_call> tool_calls = {};
+    std::vector<common_chat_msg_content_part> content_parts;
+    std::vector<common_chat_tool_call> tool_calls;
     std::string reasoning_content;
     std::string tool_name;
     std::string tool_call_id;
@@ -44,7 +44,7 @@ struct common_chat_msg {
     bool empty() const {
         return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
     }
-    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
+    void set_tool_call_ids(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
         for (auto i = 0u; i < tool_calls.size(); i++) {
             if (ids_cache.size() <= i) {
                 auto id = tool_calls[i].id;
common/common.h

Lines changed: 3 additions & 2 deletions
@@ -378,7 +378,7 @@ struct common_params {
     bool simple_io     = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true;  // insert new sequences for decoding on-the-fly
     bool no_perf       = false; // disable performance metrics
-    bool ctx_shift     = false; // context shift on infinite text generation
+    bool ctx_shift     = false; // context shift on infinite text generation (whitespace-only alignment change)
     bool swa_full      = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified    = false; // enable unified KV cache
@@ -425,7 +425,8 @@ struct common_params {
     int32_t timeout_write     = timeout_read; // http write timeout in seconds
     int32_t n_threads_http    = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse     = 0;            // min chunk size to reuse from the cache via KV shifting
-    int32_t n_ctx_checkpoints = 3;            // max number of context checkpoints per slot
+    int32_t n_ctx_checkpoints = 8;            // max number of context checkpoints per slot
+    int32_t cache_ram_mib     = 8192;         // 0 = no limit, 1 = 1 MiB, etc.
 
     std::string hostname    = "127.0.0.1";
     std::string public_path = ""; // NOLINT

convert_hf_to_gguf.py

Lines changed: 69 additions & 3 deletions
@@ -93,13 +93,15 @@ class ModelBase:
     # Mistral format specifics
     is_mistral_format: bool = False
     disable_mistral_community_chat_template: bool = False
+    sentence_transformers_dense_modules: bool = False
 
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
                  small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
-                 disable_mistral_community_chat_template: bool = False):
+                 disable_mistral_community_chat_template: bool = False,
+                 sentence_transformers_dense_modules: bool = False):
         if type(self) is ModelBase or \
                 type(self) is TextModel or \
                 type(self) is MmprojModel:
@@ -114,6 +116,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         self.lazy = not eager or (remote_hf_model_id is not None)
         self.dry_run = dry_run
         self.remote_hf_model_id = remote_hf_model_id
+        self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
         if remote_hf_model_id is not None:
             self.is_safetensors = True
 
@@ -5269,6 +5272,53 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 @ModelBase.register("Gemma3TextModel")
 class EmbeddingGemma(Gemma3Model):
     model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING
+    module_paths = []
+    dense_features_dims = {}
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.sentence_transformers_dense_modules:
+            # read modules.json to determine if model has Dense layers
+            modules_file = self.dir_model / "modules.json"
+            if modules_file.is_file():
+                with open(modules_file, encoding="utf-8") as modules_json_file:
+                    mods = json.load(modules_json_file)
+                    for mod in mods:
+                        if mod["type"] == "sentence_transformers.models.Dense":
+                            mod_path = mod["path"]
+                            # check if model.safetensors file for Dense layer exists
+                            model_tensors_file = self.dir_model / mod_path / "model.safetensors"
+                            if model_tensors_file.is_file():
+                                self.module_paths.append(mod_path)
+                                # read config.json of the Dense layer to get in/out features
+                                mod_conf_file = self.dir_model / mod_path / "config.json"
+                                if mod_conf_file.is_file():
+                                    with open(mod_conf_file, encoding="utf-8") as mod_conf_json_file:
+                                        mod_conf = json.load(mod_conf_json_file)
+                                        # hparams dense_2_feat_out and dense_3_feat_in are required when loading model's dense weights
+                                        prefix = self._get_dense_prefix(mod_path)
+                                        if mod_conf["in_features"] is not None and mod_conf["out_features"] is not None:
+                                            self.dense_features_dims[prefix] = (mod_conf["in_features"], mod_conf["out_features"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        from safetensors.torch import load_file
+        module_paths = list(self.module_paths)
+        for i, module_path in enumerate(module_paths):
+            tensors_file = self.dir_model / module_path / "model.safetensors"
+            local_tensors = load_file(tensors_file)
+            tensor_name = self._get_dense_prefix(module_path)
+            for name, local_tensor in local_tensors.items():
+                if not name.endswith(".weight"):
+                    continue
+                orig_name = name.replace("linear", tensor_name)
+                name = self.map_tensor_name(orig_name)
+                yield name, local_tensor.clone()
+
+    @staticmethod
+    def _get_dense_prefix(module_path) -> str:
+        """Get the tensor name prefix for the Dense layer from module path."""
+        tensor_name = "dense_2" if module_path == "2_Dense" else "dense_3"
+        return tensor_name
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -5285,6 +5335,10 @@ def set_gguf_parameters(self):
             logger.info(f"Using original sliding_window from config: {orig_sliding_window} "
                         f"instead of {self.hparams['sliding_window']}")
             self.gguf_writer.add_sliding_window(orig_sliding_window)
+        if self.sentence_transformers_dense_modules:
+            for dense, dims in self.dense_features_dims.items():
+                logger.info(f"Setting dense layer {dense} in/out features to {dims}")
+                self.gguf_writer.add_dense_features_dims(dense, dims[0], dims[1])
 
         self._try_set_pooling_type()
 
@@ -9335,6 +9389,13 @@ def parse_args() -> argparse.Namespace:
         )
     )
 
+    parser.add_argument(
+        "--sentence-transformers-dense-modules", action="store_true",
+        help=("Whether to include sentence-transformers dense modules."
+              "It can be used for sentence-transformers models, like google/embeddinggemma-300m"
+              "Default these modules are not included.")
+    )
+
     args = parser.parse_args()
     if not args.print_supported_models and args.model is None:
         parser.error("the following arguments are required: model")
@@ -9397,9 +9458,13 @@ def main() -> None:
     if args.remote:
        hf_repo_id = args.model
        from huggingface_hub import snapshot_download
+       allowed_patterns = ["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"]
+       if args.sentence_transformers_dense_modules:
+           # include sentence-transformers dense modules safetensors files
+           allowed_patterns.append("*.safetensors")
        local_dir = snapshot_download(
            repo_id=hf_repo_id,
-           allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
+           allow_patterns=allowed_patterns)
        dir_model = Path(local_dir)
        logger.info(f"Downloaded config and tokenizer to {local_dir}")
     else:
@@ -9467,7 +9532,8 @@ def main() -> None:
             split_max_tensors=args.split_max_tensors,
             split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
             small_first_shard=args.no_tensor_first_split,
-            remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template
+            remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
+            sentence_transformers_dense_modules=args.sentence_transformers_dense_modules
         )
 
     if args.vocab_only:
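
With the new flag, the converter reads the sentence-transformers Dense modules from the model directory (and, when `--remote` is used, also downloads their `*.safetensors` files). A sketch of a local conversion of an embeddinggemma checkout; the paths are placeholders and `--outtype`/`--outfile` are the converter's usual options:

```console
# convert a local embeddinggemma checkout with the Dense modules included
python convert_hf_to_gguf.py /path/to/embeddinggemma-300m \
    --sentence-transformers-dense-modules \
    --outtype f16 \
    --outfile embeddinggemma-300m-f16.gguf
```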

examples/model-conversion/Makefile

Lines changed: 21 additions & 2 deletions
@@ -116,20 +116,39 @@ embedding-convert-model:
 	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
 	./scripts/embedding/convert-model.sh
 
+embedding-convert-model-st:
+	$(call validate_embedding_model_path,embedding-convert-model-st)
+	@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
+	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
+	./scripts/embedding/convert-model.sh -st
+
 embedding-run-original-model:
 	$(call validate_embedding_model_path,embedding-run-original-model)
 	@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
+	USE_SENTENCE_TRANSFORMERS="$(USE_SENTENCE_TRANSFORMERS)" \
 	./scripts/embedding/run-original-model.py \
-	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
+	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)") \
+	$(if $(USE_SENTENCE_TRANSFORMERS),--use-sentence-transformers)
+
+embedding-run-original-model-st: USE_SENTENCE_TRANSFORMERS=1
+embedding-run-original-model-st: embedding-run-original-model
 
 embedding-run-converted-model:
 	@./scripts/embedding/run-converted-model.sh $(CONVERTED_EMBEDDING_MODEL) \
-	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
+	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)") \
+	$(if $(USE_POOLING),--pooling)
+
+embedding-run-converted-model-st: USE_POOLING=1
+embedding-run-converted-model-st: embedding-run-converted-model
 
 embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
 	@./scripts/embedding/compare-embeddings-logits.sh \
 	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
 
+embedding-verify-logits-st: embedding-run-original-model-st embedding-run-converted-model-st
+	@./scripts/embedding/compare-embeddings-logits.sh \
+	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
+
 embedding-inspect-original-model:
 	$(call validate_embedding_model_path,embedding-inspect-original-model)
 	@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/utils/inspect-org-model.py -m ${EMBEDDING_MODEL_PATH}
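
The `-st` targets are thin wrappers that set `USE_SENTENCE_TRANSFORMERS`/`USE_POOLING` and reuse the existing targets. A minimal sketch of the dense-module workflow, assuming `EMBEDDING_MODEL_PATH` points at a local checkout of the model and the other variables keep their defaults:

```console
# convert with the -st variant, then compare original vs converted embeddings
export EMBEDDING_MODEL_PATH=/path/to/embeddinggemma-300m
make embedding-convert-model-st
make embedding-verify-logits-st
```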

examples/model-conversion/README.md

Lines changed: 24 additions & 0 deletions
@@ -189,6 +189,23 @@ This command will save two files to the `data` directory, one is a binary
 file containing logits which will be used for comparison with the converted
 model, and the other is a text file which allows for manual visual inspection.
 
+#### Using SentenceTransformer with numbered layers
+For models that have numbered SentenceTransformer layers (01_Pooling, 02_Dense,
+03_Dense, 04_Normalize), use the `-st` targets to apply all these layers:
+
+```console
+# Run original model with SentenceTransformer (applies all numbered layers)
+(venv) $ make embedding-run-original-model-st
+
+# Run converted model with pooling enabled
+(venv) $ make embedding-run-converted-model-st
+```
+
+This will use the SentenceTransformer library to load and run the model, which
+automatically applies all the numbered layers in the correct order. This is
+particularly useful when comparing with models that should include these
+additional transformation layers beyond just the base model output.
+
 ### Model conversion
 After updates have been made to [gguf-py](../../gguf-py) to add support for the
 new model the model can be converted to GGUF format using the following command:
@@ -208,6 +225,13 @@ was done manually in the previous steps) and compare the logits:
 (venv) $ make embedding-verify-logits
 ```
 
+For models with SentenceTransformer layers, use the `-st` verification target:
+```console
+(venv) $ make embedding-verify-logits-st
+```
+This convenience target automatically runs both the original model with SentenceTransformer
+and the converted model with pooling enabled, then compares the results.
+
 ### llama-server verification
 To verify that the converted model works with llama-server, the following
 command can be used:
