From c4b630f25d67bd9c9e9093c95f7f288f091f1bbb Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Fri, 29 Aug 2025 11:49:09 -0400
Subject: [PATCH 1/3] convert : parse safetensors directly

---
 convert_hf_to_gguf.py   | 27 ++++++++++-----
 gguf-py/gguf/utility.py | 75 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+), 9 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 222f6ed6dc4..7a32f64fae8 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -218,8 +218,7 @@ def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Call
             logger.info(f"gguf: indexing model part '{part_name}'")
             ctx: ContextManager[Any]
             if is_safetensors:
-                from safetensors import safe_open
-                ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
+                ctx = cast(ContextManager[Any], gguf.utility.SafetensorsLocal(self.dir_model / part_name))
             else:
                 ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))

@@ -228,18 +227,18 @@

                 for name in model_part.keys():
                     if is_safetensors:
+                        data: gguf.utility.LocalTensor = model_part[name]
                         if self.lazy:
-                            data = model_part.get_slice(name)
-                            data_gen = lambda data=data: LazyTorchTensor.from_safetensors_slice(data)  # noqa: E731
+                            data_gen = lambda data=data: LazyTorchTensor.from_safetensors_meta(data)  # noqa: E731
                         else:
-                            data = model_part.get_tensor(name)
-                            data_gen = lambda data=data: data  # noqa: E731
+                            dtype = LazyTorchTensor._dtype_str_map[data.dtype]
+                            data_gen = lambda data=data: torch.from_numpy(data.mmap_bytes()).view(dtype).reshape(data.shape)  # noqa: E731
                     else:
-                        data = model_part[name]
+                        data_torch: Tensor = model_part[name]
                         if self.lazy:
-                            data_gen = lambda data=data: LazyTorchTensor.from_eager(data)  # noqa: E731
+                            data_gen = lambda data=data_torch: LazyTorchTensor.from_eager(data)  # noqa: E731
                         else:
-                            data_gen = lambda data=data: data  # noqa: E731
+                            data_gen = lambda data=data_torch: data  # noqa: E731
                     tensors[name] = data_gen

         # verify tensor name presence and identify potentially missing files
@@ -10002,6 +10001,16 @@ def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
         lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[...] if len(s.get_shape()) == 0 else s[:])
         return cast(torch.Tensor, lazy)

+    @classmethod
+    def from_safetensors_meta(cls, t: gguf.utility.LocalTensor) -> Tensor:
+        def load_tensor(tensor: gguf.utility.LocalTensor) -> Tensor:
+            dtype = cls._dtype_str_map[tensor.dtype]
+            return torch.from_numpy(tensor.mmap_bytes()).view(dtype).reshape(tensor.shape)
+        dtype = cls._dtype_str_map[t.dtype]
+        shape = t.shape
+        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(t,), func=lambda r: load_tensor(r))
+        return cast(torch.Tensor, lazy)
+
     @classmethod
     def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor):
         dtype = cls._dtype_str_map[remote_tensor.dtype]
diff --git a/gguf-py/gguf/utility.py b/gguf-py/gguf/utility.py
index 769ccb02f0d..88f055746a9 100644
--- a/gguf-py/gguf/utility.py
+++ b/gguf-py/gguf/utility.py
@@ -1,10 +1,12 @@
 from __future__ import annotations

 from dataclasses import dataclass
+from pathlib import Path
 from typing import Literal

 import os
 import json
+import numpy as np


 def fill_templated_filename(filename: str, output_type: str | None) -> str:
@@ -266,3 +268,76 @@ def _get_request_headers(cls) -> dict[str, str]:
         if os.environ.get("HF_TOKEN"):
             headers["Authorization"] = f"Bearer {os.environ['HF_TOKEN']}"
         return headers
+
+
+@dataclass
+class LocalTensorRange:
+    filename: Path
+    offset: int
+    size: int
+
+
+@dataclass
+class LocalTensor:
+    dtype: str
+    shape: tuple[int, ...]
+    data_range: LocalTensorRange
+
+    def mmap_bytes(self) -> np.ndarray:
+        return np.memmap(self.data_range.filename, offset=self.data_range.offset, shape=self.data_range.size)
+
+
+class SafetensorsLocal:
+    """
+    Read a safetensors file from the local filesystem.
+
+    Custom parsing gives a bit more control over the memory usage.
+    The official safetensors library doesn't expose file ranges.
+    """
+    ALIGNMENT = 8  # bytes
+
+    tensors: dict[str, LocalTensor]
+
+    def __init__(self, filename: Path):
+        with open(filename, "rb") as f:
+            metadata_length = int.from_bytes(f.read(8), byteorder='little')
+            file_size = os.stat(filename).st_size
+            if file_size < 8 + metadata_length:
+                raise ValueError(f"Could not read complete metadata. Need {8 + metadata_length} bytes, got {file_size}")
+
+            metadata_str = f.read(metadata_length).decode('utf-8')
+            try:
+                metadata = json.loads(metadata_str)
+            except json.JSONDecodeError as e:
+                raise ValueError(f"Failed to parse safetensors metadata as JSON: {e}")
+
+            data_start_offset = f.tell()
+            alignment = self.ALIGNMENT
+            if data_start_offset % alignment != 0:
+                data_start_offset += alignment - (data_start_offset % alignment)
+
+            tensors: dict[str, LocalTensor] = {}
+            for name, meta in metadata.items():
+                if name == "__metadata__":
+                    # ignore metadata, it's not a tensor
+                    continue
+
+                tensors[name] = LocalTensor(
+                    dtype=meta["dtype"],
+                    shape=tuple(meta["shape"]),
+                    data_range=LocalTensorRange(
+                        filename,
+                        data_start_offset + meta["data_offsets"][0],
+                        meta["data_offsets"][1] - meta["data_offsets"][0],
+                    ),
+                )
+
+            # order by offset
+            self.tensors = dict(sorted(tensors.items(), key=lambda t: t[1].data_range.offset))
+
+    def __enter__(self, *args, **kwargs):
+        del args, kwargs  # unused
+        return self.tensors
+
+    def __exit__(self, *args, **kwargs):
+        del args, kwargs  # unused
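For context, the container that SafetensorsLocal parses above is minimal: an
8-byte little-endian header length, a JSON header mapping tensor names to
dtype/shape/data_offsets, then the raw tensor data. A round-trip sketch,
assuming the patched gguf-py is importable; the file name and tensor name
here are made up:

    import json
    import struct
    from pathlib import Path

    import numpy as np
    from gguf.utility import SafetensorsLocal

    values = np.arange(6, dtype=np.float32).reshape(2, 3)
    header = {
        "embd.weight": {
            "dtype": "F32",
            "shape": [2, 3],
            # begin/end byte offsets, relative to the start of the data section
            "data_offsets": [0, values.nbytes],
        }
    }
    header_bytes = json.dumps(header).encode("utf-8")
    # pad with spaces so the data section lands on the 8-byte boundary that
    # SafetensorsLocal.ALIGNMENT expects (the official writer pads the same way)
    header_bytes += b" " * (-(8 + len(header_bytes)) % 8)

    with open("tiny.safetensors", "wb") as f:
        f.write(struct.pack("<Q", len(header_bytes)))  # little-endian header length
        f.write(header_bytes)
        f.write(values.tobytes())

    with SafetensorsLocal(Path("tiny.safetensors")) as tensors:
        t = tensors["embd.weight"]
        print(t.dtype, t.shape)       # F32 (2, 3)
        print(t.mmap_bytes().nbytes)  # 24, memory-mapped rather than copied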
From e7b7ed8ab146d549902fc53afd71c659af3b0eed Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Tue, 9 Sep 2025 13:31:06 -0400
Subject: [PATCH 2/3] gguf-py : order safetensors tensors by name

Applies to both local and remote safetensors custom parsing.
This matches the behavior of the official safetensors implementation.

* convert : rename from_safetensors_meta to from_local_tensor

For consistency with from_remote_tensor
---
 convert_hf_to_gguf.py   | 4 ++--
 gguf-py/gguf/utility.py | 9 +++++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 7a32f64fae8..faf43d56a3c 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -229,7 +229,7 @@ def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Call
                     if is_safetensors:
                         data: gguf.utility.LocalTensor = model_part[name]
                         if self.lazy:
-                            data_gen = lambda data=data: LazyTorchTensor.from_safetensors_meta(data)  # noqa: E731
+                            data_gen = lambda data=data: LazyTorchTensor.from_local_tensor(data)  # noqa: E731
                         else:
                             dtype = LazyTorchTensor._dtype_str_map[data.dtype]
                             data_gen = lambda data=data: torch.from_numpy(data.mmap_bytes()).view(dtype).reshape(data.shape)  # noqa: E731
@@ -10002,7 +10002,7 @@ def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
         return cast(torch.Tensor, lazy)

     @classmethod
-    def from_safetensors_meta(cls, t: gguf.utility.LocalTensor) -> Tensor:
+    def from_local_tensor(cls, t: gguf.utility.LocalTensor) -> Tensor:
         def load_tensor(tensor: gguf.utility.LocalTensor) -> Tensor:
             dtype = cls._dtype_str_map[tensor.dtype]
             return torch.from_numpy(tensor.mmap_bytes()).view(dtype).reshape(tensor.shape)
diff --git a/gguf-py/gguf/utility.py b/gguf-py/gguf/utility.py
index 88f055746a9..c9401a1c0a2 100644
--- a/gguf-py/gguf/utility.py
+++ b/gguf-py/gguf/utility.py
@@ -179,6 +179,10 @@ def get_list_tensors(cls, url: str) -> dict[str, RemoteTensor]:
            except KeyError as e:
                raise ValueError(f"Missing key in metadata for tensor '{name}': {e}, meta = {meta}")

+        # order by name (same as default safetensors behavior)
+        # ref: https://github.com/huggingface/safetensors/blob/0816a1ae1d6b731cefd67f061d80d1cadd0dd7bb/bindings/python/src/lib.rs#L606
+        res = dict(sorted(res.items(), key=lambda t: t[0]))
+
         return res

     @classmethod
@@ -332,8 +336,9 @@ def __init__(self, filename: Path):
                    ),
                )

-            # order by offset
-            self.tensors = dict(sorted(tensors.items(), key=lambda t: t[1].data_range.offset))
+            # order by name (same as default safetensors behavior)
+            # ref: https://github.com/huggingface/safetensors/blob/0816a1ae1d6b731cefd67f061d80d1cadd0dd7bb/bindings/python/src/lib.rs#L606
+            self.tensors = dict(sorted(tensors.items(), key=lambda t: t[0]))

     def __enter__(self, *args, **kwargs):
         del args, kwargs  # unused
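The ordering change relies on plain dict semantics: Python dicts preserve
insertion order, so rebuilding the dict from sorted items is all it takes to
control iteration order. A minimal sketch with made-up tensor names:

    # offset order (patch 1) vs. name order (this patch)
    tensors = {"model.b": 0, "model.a": 1024}  # as laid out in the file
    tensors = dict(sorted(tensors.items(), key=lambda t: t[0]))
    print(list(tensors))  # ['model.a', 'model.b'], like official safetensors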
From e996f3aef80bd19c65e1085b13c9b045073edd98 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Tue, 9 Sep 2025 13:51:05 -0400
Subject: [PATCH 3/3] convert : fix no-lazy dtypes from direct safetensors

---
 convert_hf_to_gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index faf43d56a3c..aa0324ab043 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -232,7 +232,7 @@ def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Call
                             data_gen = lambda data=data: LazyTorchTensor.from_local_tensor(data)  # noqa: E731
                         else:
                             dtype = LazyTorchTensor._dtype_str_map[data.dtype]
-                            data_gen = lambda data=data: torch.from_numpy(data.mmap_bytes()).view(dtype).reshape(data.shape)  # noqa: E731
+                            data_gen = lambda data=data, dtype=dtype: torch.from_numpy(data.mmap_bytes()).view(dtype).reshape(data.shape)  # noqa: E731
                     else:
                         data_torch: Tensor = model_part[name]
                         if self.lazy:
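The fix above works around Python's late-binding closures: a free variable in
a lambda is looked up when the lambda is called, not when it is defined, so
every no-lazy data_gen created in the loop saw the dtype of the last tensor
indexed. Binding dtype as a default argument evaluates it per iteration. A
reduced sketch with made-up dtype strings:

    fns = []
    for dtype in ("F16", "BF16", "F32"):
        fns.append(lambda: dtype)  # late binding: all three share one 'dtype'
    print([f() for f in fns])      # ['F32', 'F32', 'F32'], the bug

    fns = []
    for dtype in ("F16", "BF16", "F32"):
        fns.append(lambda dtype=dtype: dtype)  # default evaluated at definition
    print([f() for f in fns])      # ['F16', 'BF16', 'F32'], the fix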