From 3b533a61c68e046a591fd7a9ab49a4f8b963ac60 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 4 Dec 2024 11:36:48 +0100 Subject: [PATCH 01/30] First draft for a DDUF parser --- .../en/package_reference/serialization.md | 52 +++++++ src/huggingface_hub/__init__.py | 10 ++ src/huggingface_hub/errors.py | 7 + src/huggingface_hub/serialization/_dduf.py | 133 ++++++++++++++++++ 4 files changed, 202 insertions(+) create mode 100644 src/huggingface_hub/serialization/_dduf.py diff --git a/docs/source/en/package_reference/serialization.md b/docs/source/en/package_reference/serialization.md index 0149855e02..3b67fb06e8 100644 --- a/docs/source/en/package_reference/serialization.md +++ b/docs/source/en/package_reference/serialization.md @@ -6,6 +6,58 @@ rendered properly in your Markdown viewer. `huggingface_hub` contains helpers to help ML libraries serialize models weights in a standardized way. This part of the lib is still under development and will be improved in future releases. The goal is to harmonize how weights are serialized on the Hub, both to remove code duplication across libraries and to foster conventions on the Hub. +## DDUF file format + +DDUF is a file format designed for diffusers models. It allows saving all the information to run a model in a single file. This work is inspired by the GGUF format. `huggingface_hub` provides helpers to save and load DDUF files, ensuring the file format is respected. + + + +This is a very early version of the parser. The API and implementation can evolve in the near future. + +The parser currently does very little validation. For more details about the file format, check out https://github.com/huggingface/huggingface.js/tree/main/packages/dduf. + + + +### How to read a DDUF file? + +```python +>>> import json +>>> import safetensors.load +>>> from huggingface_hub import read_dduf_file + +# Read DDUF metadata +>>> dduf_entries = read_dduf_file("FLUX.1-dev.dduf") + +# Returns a mapping filename <> DDUFEntry +>>> dduf_entries["model_index.json"] +DDUFEntry(filename='model_index.json', offset=66, length=587) + +# Load model index as JSON +>>> json.loads(dduf_entries["model_index.json"].read_text()) +{'_class_name': 'FluxPipeline', '_diffusers_version': '0.32.0.dev0', '_name_or_path': 'black-forest-labs/FLUX.1-dev', 'scheduler': ['diffusers', 'FlowMatchEulerDiscreteScheduler'], 'text_encoder': ['transformers', 'CLIPTextModel'], 'text_encoder_2': ['transformers', 'T5EncoderModel'], 'tokenizer': ['transformers', 'CLIPTokenizer'], 'tokenizer_2': ['transformers', 'T5TokenizerFast'], 'transformer': ['diffusers', 'FluxTransformer2DModel'], 'vae': ['diffusers', 'AutoencoderKL']} + +# Load VAE weights using safetensors +>>> with dduf_entries["vae/diffusion_pytorch_model.safetensors"].as_mmap() as mm: +... state_dict = safetensors.torch.load(mm) +``` + +### How to write a DDUF file? + +```python +>>> from huggingface_hub import write_dduf_file +>>> write_dduf_file("FLUX.1-dev.dduf", diffuser_path="path/to/FLUX.1-dev") +``` + +### Helpers + +[[autodoc]] huggingface_hub.write_dduf_file + +[[autodoc]] huggingface_hub.read_dduf_file + +[[autodoc]] huggingface_hub.DDUFEntry + +[[autodoc]] huggingface_hub.errors.DDUFCorruptedFileError + ## Save torch state dict The main helper of the `serialization` module takes a torch `nn.Module` as input and saves it to disk. 
It handles the logic to save shared tensors (see [safetensors explanation](https://huggingface.co/docs/safetensors/torch_shared_tensors)) as well as logic to split the state dictionary into shards, using [`split_torch_state_dict_into_shards`] under the hood. At the moment, only `torch` framework is supported. diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index 3b013cd02d..1fdedf323a 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -468,6 +468,11 @@ "split_tf_state_dict_into_shards", "split_torch_state_dict_into_shards", ], + "serialization._dduf": [ + "DDUFEntry", + "read_dduf_file", + "write_dduf_file", + ], "utils": [ "CacheNotFound", "CachedFileInfo", @@ -995,6 +1000,11 @@ def __dir__(): split_tf_state_dict_into_shards, # noqa: F401 split_torch_state_dict_into_shards, # noqa: F401 ) + from .serialization._dduf import ( + DDUFEntry, # noqa: F401 + read_dduf_file, # noqa: F401 + write_dduf_file, # noqa: F401 + ) from .utils import ( CachedFileInfo, # noqa: F401 CachedRepoInfo, # noqa: F401 diff --git a/src/huggingface_hub/errors.py b/src/huggingface_hub/errors.py index 1dae6ddf97..ed7ff00947 100644 --- a/src/huggingface_hub/errors.py +++ b/src/huggingface_hub/errors.py @@ -308,3 +308,10 @@ class BadRequestError(HfHubHTTPError, ValueError): huggingface_hub.utils._errors.BadRequestError: Bad request for check endpoint: {details} (Request ID: XXX) ``` """ + + +# DDUF file format ERROR + + +class DDUFCorruptedFileError(Exception): + pass diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py new file mode 100644 index 0000000000..1bc4c0bae1 --- /dev/null +++ b/src/huggingface_hub/serialization/_dduf.py @@ -0,0 +1,133 @@ +import logging +import mmap +import shutil +import zipfile +from contextlib import contextmanager +from dataclasses import dataclass, field +from pathlib import Path +from typing import Dict, Generator, Union + +from ..errors import DDUFCorruptedFileError + + +logger = logging.getLogger(__name__) + +DDUF_ALLOWED_ENTRIES = {".json", ".gguf", ".txt", ".safetensors"} + + +@dataclass +class DDUFEntry: + """Object representing a file entry in a DDUF file. + + See [`read_dduf_file`] for how to read a DDUF file. + + Attributes: + filename (str): + The name of the file in the DDUF archive. + offset (int): + The offset of the file in the DDUF archive. + length (int): + The length of the file in the DDUF archive. + dduf_path (str): + The path to the DDUF archive (for internal use). + """ + + filename: str + length: int + offset: int + + dduf_path: Path = field(repr=False) + + @contextmanager + def as_mmap(self) -> Generator[bytes, None, None]: + """Open the file as a memory-mapped file. + + Useful to load safetensors directly from the file. + + Example: + ```py + >>> import safetensors.torch + >>> with entry.as_mmap() as mm: + ... tensors = safetensors.torch.load(mm) + ``` + """ + with self.dduf_path.open("rb") as f: + with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mm: + yield mm[self.offset : self.offset + self.length] + + def read_text(self, encoding="utf-8") -> str: + """Read the file as text. + + Useful for '.txt' and '.json' entries. + """ + with self.dduf_path.open("rb") as f: + f.seek(self.offset) + return f.read(self.length).decode(encoding=encoding) + + +def read_dduf_file(dduf_path: Union[Path, str]) -> Dict[str, DDUFEntry]: + """ + Read a DDUF file and return a dictionary of entries. 
+ + Only the metadata is read, the data is not loaded in memory. + + Args: + dduf_path (`str` or `Path`): + The path to the DDUF file to read. + + Returns: + `Dict[str, DDUFEntry]`: + A dictionary of [`DDUFEntry`] indexed by filename. + + Raises: + - [`DDUFCorruptedFileError`]: If the DDUF file is corrupted (i.e. doesn't follow the DDUF format). + """ + entries = {} + dduf_path = Path(dduf_path) + with zipfile.ZipFile(str(dduf_path), "r") as zf: + for info in zf.infolist(): + if info.compress_type != zipfile.ZIP_STORED: + raise DDUFCorruptedFileError("Data must not be compressed in GGUF file.") + + # Use private attribute to get data range for this file. + # Let's reconsider later if it's too problematic (worse case, we can build our own metadata parser). + # Note: simply doing `info.header_offset + len(info.FileHeader())` doesn't work because of the ZIP64 extension. + offset = info._end_offset - info.compress_size + + entries[info.filename] = DDUFEntry( + filename=info.filename, offset=offset, length=info.file_size, dduf_path=dduf_path + ) + return entries + + +def write_dduf_file(dduf_path: Union[str, Path], diffuser_path: Union[str, Path]) -> None: + """ + Write a DDUF file from a diffusers folder. + + A DDUF file is simply a ZIP archive with a few constraints (force ZIP64, no compression, only certain files). + + Args: + dduf_path (`str` or `Path`): + The path to the DDUF file to write. + diffuser_path (`str` or `Path`): + The path to the folder containing the diffusers model. + """ + # TODO: update method signature. + # DDUF filename should be inferred as much as possible from high-level info (precision, model, etc.) to ensure consistency. + # Example: "stable-diffusion-3.5-Q4-BNB.dduf" + # See https://github.com/huggingface/diffusers/pull/10037#discussion_r1862275730. + logger.info("Writing DDUF file %s from folder %s", dduf_path, diffuser_path) + diffuser_path = Path(diffuser_path) + with zipfile.ZipFile(str(dduf_path), "w", zipfile.ZIP_STORED) as archive: + for path in diffuser_path.glob("**/*"): + if path.is_dir(): + logger.debug("Skipping directory %s", path) + continue + if path.suffix not in DDUF_ALLOWED_ENTRIES: + logger.debug("Skipping file %s", path) + continue + logger.debug("Adding file %s", path) + with archive.open(str(path.relative_to(diffuser_path)), "w", force_zip64=True) as f: + with path.open("rb") as src: + shutil.copyfileobj(src, f, 1024 * 8) + logger.info("Done writing DDUF file %s", dduf_path) From 953bbae49f2b68c3c7f47fde99868d83ed3774a0 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 4 Dec 2024 11:42:23 +0100 Subject: [PATCH 02/30] write before read --- docs/source/en/package_reference/serialization.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/en/package_reference/serialization.md b/docs/source/en/package_reference/serialization.md index 3b67fb06e8..12301c2ef4 100644 --- a/docs/source/en/package_reference/serialization.md +++ b/docs/source/en/package_reference/serialization.md @@ -18,6 +18,13 @@ The parser currently does very little validation. For more details about the fil +### How to write a DDUF file? + +```python +>>> from huggingface_hub import write_dduf_file +>>> write_dduf_file("FLUX.1-dev.dduf", diffuser_path="path/to/FLUX.1-dev") +``` + ### How to read a DDUF file? ```python @@ -41,13 +48,6 @@ DDUFEntry(filename='model_index.json', offset=66, length=587) ... state_dict = safetensors.torch.load(mm) ``` -### How to write a DDUF file? 
- -```python ->>> from huggingface_hub import write_dduf_file ->>> write_dduf_file("FLUX.1-dev.dduf", diffuser_path="path/to/FLUX.1-dev") -``` - ### Helpers [[autodoc]] huggingface_hub.write_dduf_file From d30558be700cb0289007b9067650a972b3b677fb Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 4 Dec 2024 11:47:33 +0100 Subject: [PATCH 03/30] comments and lint --- src/huggingface_hub/serialization/_dduf.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index 1bc4c0bae1..0c9826740f 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -84,19 +84,22 @@ def read_dduf_file(dduf_path: Union[Path, str]) -> Dict[str, DDUFEntry]: """ entries = {} dduf_path = Path(dduf_path) + logger.info("Reading DDUF file %s", dduf_path) with zipfile.ZipFile(str(dduf_path), "r") as zf: for info in zf.infolist(): + logger.debug("Reading entry %s", info.filename) if info.compress_type != zipfile.ZIP_STORED: raise DDUFCorruptedFileError("Data must not be compressed in GGUF file.") # Use private attribute to get data range for this file. # Let's reconsider later if it's too problematic (worse case, we can build our own metadata parser). # Note: simply doing `info.header_offset + len(info.FileHeader())` doesn't work because of the ZIP64 extension. - offset = info._end_offset - info.compress_size + offset = info._end_offset - info.compress_size # type: ignore[attr-defined] entries[info.filename] = DDUFEntry( filename=info.filename, offset=offset, length=info.file_size, dduf_path=dduf_path ) + logger.info("Done reading DDUF file %s. Found %d entries", dduf_path, len(entries)) return entries @@ -129,5 +132,7 @@ def write_dduf_file(dduf_path: Union[str, Path], diffuser_path: Union[str, Path] logger.debug("Adding file %s", path) with archive.open(str(path.relative_to(diffuser_path)), "w", force_zip64=True) as f: with path.open("rb") as src: - shutil.copyfileobj(src, f, 1024 * 8) + # taken from zipfile source code + # TODO: optimize this for large files + shutil.copyfileobj(src, f, 1024 * 8) # type: ignore[misc] logger.info("Done writing DDUF file %s", dduf_path) From 0f21bd3a69af4350de39cba077bf28ee00a8e66a Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 4 Dec 2024 12:22:08 +0100 Subject: [PATCH 04/30] forbid nested directoroes --- src/huggingface_hub/serialization/_dduf.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index 0c9826740f..0a04fc17fa 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -127,10 +127,14 @@ def write_dduf_file(dduf_path: Union[str, Path], diffuser_path: Union[str, Path] logger.debug("Skipping directory %s", path) continue if path.suffix not in DDUF_ALLOWED_ENTRIES: - logger.debug("Skipping file %s", path) + logger.debug("Skipping file %s (file type not allowed)", path) + continue + path_in_archive = path.relative_to(diffuser_path) + if len(path_in_archive.parts) > 3: + logger.debug("Skipping file %s (nested directories not allowed)", path) continue logger.debug("Adding file %s", path) - with archive.open(str(path.relative_to(diffuser_path)), "w", force_zip64=True) as f: + with archive.open(str(path_in_archive), "w", force_zip64=True) as f: with path.open("rb") as src: # taken from zipfile source code # TODO: optimize this for large files From 
b4bf0302e867802f045591f3f75a60f8b24a9276 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 4 Dec 2024 12:39:10 +0100 Subject: [PATCH 05/30] gguf typo --- src/huggingface_hub/serialization/_dduf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index 0a04fc17fa..838d93f6bd 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -89,7 +89,7 @@ def read_dduf_file(dduf_path: Union[Path, str]) -> Dict[str, DDUFEntry]: for info in zf.infolist(): logger.debug("Reading entry %s", info.filename) if info.compress_type != zipfile.ZIP_STORED: - raise DDUFCorruptedFileError("Data must not be compressed in GGUF file.") + raise DDUFCorruptedFileError("Data must not be compressed in DDUF file.") # Use private attribute to get data range for this file. # Let's reconsider later if it's too problematic (worse case, we can build our own metadata parser). From 1b11f0b38c71c3d6ad3e1a7adfb9d2956e219741 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 4 Dec 2024 17:03:50 +0100 Subject: [PATCH 06/30] export_from_entries --- .../en/package_reference/serialization.md | 22 ++- src/huggingface_hub/__init__.py | 6 +- src/huggingface_hub/errors.py | 12 +- src/huggingface_hub/serialization/_dduf.py | 151 +++++++++++++++--- 4 files changed, 159 insertions(+), 32 deletions(-) diff --git a/docs/source/en/package_reference/serialization.md b/docs/source/en/package_reference/serialization.md index 12301c2ef4..cd90e960d8 100644 --- a/docs/source/en/package_reference/serialization.md +++ b/docs/source/en/package_reference/serialization.md @@ -20,9 +20,17 @@ The parser currently does very little validation. For more details about the fil ### How to write a DDUF file? +Here is how to export a folder containing different parts of a diffusion model: + +```python +# Export a folder as a DDUF file +>>> from huggingface_hub import export_folder_as_dduf +>>> export_folder_as_dduf("FLUX.1-dev.dduf", diffuser_path="path/to/FLUX.1-dev") +``` + +If your model is loaded in memory, you can directly serialize it to a GGUF file without saving to disk first. + ```python ->>> from huggingface_hub import write_dduf_file ->>> write_dduf_file("FLUX.1-dev.dduf", diffuser_path="path/to/FLUX.1-dev") ``` ### How to read a DDUF file? @@ -50,14 +58,22 @@ DDUFEntry(filename='model_index.json', offset=66, length=587) ### Helpers -[[autodoc]] huggingface_hub.write_dduf_file +[[autodoc]] huggingface_hub.export_as_dduf + +[[autodoc]] huggingface_hub.export_folder_as_dduf [[autodoc]] huggingface_hub.read_dduf_file [[autodoc]] huggingface_hub.DDUFEntry +### Errors + +[[autodoc]] huggingface_hub.errors.DDUFError + [[autodoc]] huggingface_hub.errors.DDUFCorruptedFileError +[[autodoc]] huggingface_hub.errors.DDUFExportError + ## Save torch state dict The main helper of the `serialization` module takes a torch `nn.Module` as input and saves it to disk. It handles the logic to save shared tensors (see [safetensors explanation](https://huggingface.co/docs/safetensors/torch_shared_tensors)) as well as logic to split the state dictionary into shards, using [`split_torch_state_dict_into_shards`] under the hood. At the moment, only `torch` framework is supported. 
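As a rough sketch of what this looks like in practice (assuming the `save_torch_model` helper exported by `huggingface_hub` and a local `torch.nn.Module` instance; names are illustrative):

```python
# Minimal sketch: save a torch model with the serialization helpers.
import torch
from huggingface_hub import save_torch_model

model = torch.nn.Linear(8, 8)  # stand-in for any nn.Module
# Weights are written as safetensors files; shared tensors are handled and the
# state dict is split into shards once it exceeds the maximum shard size.
save_torch_model(model, save_directory="path/to/folder")
```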
diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index 1fdedf323a..d222831443 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -470,8 +470,9 @@ ], "serialization._dduf": [ "DDUFEntry", + "export_as_dduf", + "export_folder_as_dduf", "read_dduf_file", - "write_dduf_file", ], "utils": [ "CacheNotFound", @@ -1002,8 +1003,9 @@ def __dir__(): ) from .serialization._dduf import ( DDUFEntry, # noqa: F401 + export_as_dduf, # noqa: F401 + export_folder_as_dduf, # noqa: F401 read_dduf_file, # noqa: F401 - write_dduf_file, # noqa: F401 ) from .utils import ( CachedFileInfo, # noqa: F401 diff --git a/src/huggingface_hub/errors.py b/src/huggingface_hub/errors.py index ed7ff00947..e4881e7596 100644 --- a/src/huggingface_hub/errors.py +++ b/src/huggingface_hub/errors.py @@ -313,5 +313,13 @@ class BadRequestError(HfHubHTTPError, ValueError): # DDUF file format ERROR -class DDUFCorruptedFileError(Exception): - pass +class DDUFError(Exception): + """Base exception for errors related to the DDUF format.""" + + +class DDUFCorruptedFileError(DDUFError): + """Exception thrown when the DDUF file is corrupted.""" + + +class DDUFExportError(DDUFError): + """Base exception for errors during DDUF export.""" diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index 838d93f6bd..97385e3934 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -5,14 +5,20 @@ from contextlib import contextmanager from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, Generator, Union +from typing import Dict, Generator, Iterable, Tuple, Union -from ..errors import DDUFCorruptedFileError +from ..errors import DDUFCorruptedFileError, DDUFExportError logger = logging.getLogger(__name__) -DDUF_ALLOWED_ENTRIES = {".json", ".gguf", ".txt", ".safetensors"} +DDUF_ALLOWED_ENTRIES = { + ".gguf", + ".json", + ".model", + ".safetensors", + ".txt", +} @dataclass @@ -59,6 +65,12 @@ def read_text(self, encoding="utf-8") -> str: """Read the file as text. Useful for '.txt' and '.json' entries. + + Example: + ```py + >>> import json + >>> index = json.loads(entry.read_text()) + ``` """ with self.dduf_path.open("rb") as f: f.seek(self.offset) @@ -81,6 +93,28 @@ def read_dduf_file(dduf_path: Union[Path, str]) -> Dict[str, DDUFEntry]: Raises: - [`DDUFCorruptedFileError`]: If the DDUF file is corrupted (i.e. doesn't follow the DDUF format). + + Example: + ```python + >>> import json + >>> import safetensors.load + >>> from huggingface_hub import read_dduf_file + + # Read DDUF metadata + >>> dduf_entries = read_dduf_file("FLUX.1-dev.dduf") + + # Returns a mapping filename <> DDUFEntry + >>> dduf_entries["model_index.json"] + DDUFEntry(filename='model_index.json', offset=66, length=587) + + # Load model index as JSON + >>> json.loads(dduf_entries["model_index.json"].read_text()) + {'_class_name': 'FluxPipeline', '_diffusers_version': '0.32.0.dev0', '_name_or_path': 'black-forest-labs/FLUX.1-dev', ... + + # Load VAE weights using safetensors + >>> with dduf_entries["vae/diffusion_pytorch_model.safetensors"].as_mmap() as mm: + ... 
state_dict = safetensors.torch.load(mm) + ``` """ entries = {} dduf_path = Path(dduf_path) @@ -103,40 +137,107 @@ def read_dduf_file(dduf_path: Union[Path, str]) -> Dict[str, DDUFEntry]: return entries -def write_dduf_file(dduf_path: Union[str, Path], diffuser_path: Union[str, Path]) -> None: +def export_as_dduf(dduf_path: Union[str, Path], entries: Iterable[Tuple[str, Union[str, Path, bytes]]]) -> None: + """Write a DDUF file from an iterable of entries. + + Args: + dduf_path (`str` or `Path`): + The path to the DDUF file to write. + entries (`Iterable[Tuple[str, Union[str, Path, bytes]]]`): + An iterable of entries to write in the DDUF file. Each entry is a tuple with the filename and the content. + The filename should be the path to the file in the DDUF archive. + The content can be a string or a pathlib.Path representing a path to a file on the local disk or directly the content as bytes. + + Raises: + - [`DDUFExportError`]: If entry type is not supported (must be str, Path or bytes). """ - Write a DDUF file from a diffusers folder. + logger.info("Exporting DDUF file '%s'", dduf_path) + with zipfile.ZipFile(str(dduf_path), "w", zipfile.ZIP_STORED) as archive: + for filename, content in entries: + if "." + filename.split(".")[-1] not in DDUF_ALLOWED_ENTRIES: + raise DDUFExportError(f"File type not allowed: {filename}") + logger.debug("Adding file %s to DDUF file", filename) + _dump_content_in_archive(archive, filename, content) + + logger.info("Done writing DDUF file %s", dduf_path) - A DDUF file is simply a ZIP archive with a few constraints (force ZIP64, no compression, only certain files). + +def export_folder_as_dduf(dduf_path: Union[str, Path], folder_path: Union[str, Path]) -> None: + """ + Export a folder as a DDUF file. + + AUses [`export_as_dduf`] under the hood. Args: dduf_path (`str` or `Path`): The path to the DDUF file to write. - diffuser_path (`str` or `Path`): - The path to the folder containing the diffusers model. + folder_path (`str` or `Path`): + The path to the folder containing the diffusion model. + + Example: + ```python + >>> from huggingface_hub import export_folder_as_dduf + >>> export_folder_as_dduf("FLUX.1-dev.dduf", diffuser_path="path/to/FLUX.1-dev") + ``` """ - # TODO: update method signature. - # DDUF filename should be inferred as much as possible from high-level info (precision, model, etc.) to ensure consistency. - # Example: "stable-diffusion-3.5-Q4-BNB.dduf" - # See https://github.com/huggingface/diffusers/pull/10037#discussion_r1862275730. 
- logger.info("Writing DDUF file %s from folder %s", dduf_path, diffuser_path) - diffuser_path = Path(diffuser_path) - with zipfile.ZipFile(str(dduf_path), "w", zipfile.ZIP_STORED) as archive: - for path in diffuser_path.glob("**/*"): + folder_path = Path(folder_path) + + def _iterate_over_folder() -> Iterable[Tuple[str, Path]]: + for path in Path(folder_path).glob("**/*"): if path.is_dir(): - logger.debug("Skipping directory %s", path) continue if path.suffix not in DDUF_ALLOWED_ENTRIES: logger.debug("Skipping file %s (file type not allowed)", path) continue - path_in_archive = path.relative_to(diffuser_path) + path_in_archive = path.relative_to(folder_path) if len(path_in_archive.parts) > 3: logger.debug("Skipping file %s (nested directories not allowed)", path) continue - logger.debug("Adding file %s", path) - with archive.open(str(path_in_archive), "w", force_zip64=True) as f: - with path.open("rb") as src: - # taken from zipfile source code - # TODO: optimize this for large files - shutil.copyfileobj(src, f, 1024 * 8) # type: ignore[misc] - logger.info("Done writing DDUF file %s", dduf_path) + yield path_in_archive.as_posix(), path + + export_as_dduf(dduf_path, _iterate_over_folder()) + + +def add_entry_to_dduf(dduf_path: Union[str, Path], filename: str, content: Union[str, Path, bytes]) -> None: + """ + Add an entry to an existing DDUF file. + + Args: + dduf_path (`str` or `Path`): + The path to the DDUF file to write. + filename (`str`): + The path to the file in the DDUF archive. + content (`str`, `Path` or `bytes`): + The content of the file to add to the DDUF archive. + + Raises: + - [`DDUFExportError`]: If the entry already exists in the DDUF file. + """ + dduf_path = str(dduf_path) + # Ensure the zip file exists + try: + with zipfile.ZipFile(dduf_path, "r") as zf: + # Check if the file already exists in the zip + if filename in zf.namelist(): + raise DDUFExportError(f"Entry '{filename}' already exists in DDUF file.") + except FileNotFoundError: + # If the zip doesn't exist, create it + with zipfile.ZipFile(dduf_path, "w") as _: + pass + + # Reopen the zip in append mode and add the new file + with zipfile.ZipFile(dduf_path, "a", zipfile.ZIP_STORED) as archive: + logger.debug("Adding file %s to DDUF file", filename) + _dump_content_in_archive(archive, filename, content) + + +def _dump_content_in_archive(archive: zipfile.ZipFile, filename: str, content: Union[str, Path, bytes]) -> None: + with archive.open(filename, "w", force_zip64=True) as archive_fh: + if isinstance(content, (str, Path)): + content_path = Path(content) + with content_path.open("rb") as content_fh: + shutil.copyfileobj(content_fh, archive_fh, 1024 * 1024 * 8) # type: ignore[misc] + elif isinstance(content, bytes): + archive_fh.write(content) + else: + raise DDUFExportError(f"Invalid content type for {filename}. 
Must be str, Path or bytes.") From f349cbeaa8e6d9bcc64bad6e5a7c5f6470d7a3d6 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 4 Dec 2024 17:44:57 +0100 Subject: [PATCH 07/30] some docs --- .../en/package_reference/serialization.md | 48 +++++++++++++++-- src/huggingface_hub/__init__.py | 4 +- src/huggingface_hub/serialization/_dduf.py | 52 +++++++++++++++++-- 3 files changed, 95 insertions(+), 9 deletions(-) diff --git a/docs/source/en/package_reference/serialization.md b/docs/source/en/package_reference/serialization.md index cd90e960d8..99ffd043d8 100644 --- a/docs/source/en/package_reference/serialization.md +++ b/docs/source/en/package_reference/serialization.md @@ -20,19 +20,59 @@ The parser currently does very little validation. For more details about the fil ### How to write a DDUF file? -Here is how to export a folder containing different parts of a diffusion model: +Here is how to export a folder containing different parts of a diffusion model using [`export_folder_as_dduf`]: ```python # Export a folder as a DDUF file >>> from huggingface_hub import export_folder_as_dduf ->>> export_folder_as_dduf("FLUX.1-dev.dduf", diffuser_path="path/to/FLUX.1-dev") +>>> export_folder_as_dduf("FLUX.1-dev.dduf", folder_path="path/to/FLUX.1-dev") ``` -If your model is loaded in memory, you can directly serialize it to a GGUF file without saving to disk first. +For more flexibility, to can use [`export_entries_as_dduf`] and pass a list of files to include in the final DDUF file: ```python +# Export specific files from the local disk. +>>> from huggingface_hub import export_entries_as_dduf +>>> export_entries_as_dduf( +... "stable-diffusion-v1-4-FP16.dduf", +... entries=[ # List entries to add to the DDUF file (here, only FP16 weights) +... ("model_index.json", "path/to/model_index.json"), +... ("vae/config.json", "path/to/vae/config.json"), +... ("vae/diffusion_pytorch_model.fp16.safetensors", "path/to/vae/diffusion_pytorch_model.fp16.safetensors"), +... ("text_encoder/config.json", "path/to/text_encoder/config.json"), +... ("text_encoder/model.fp16.safetensors", "path/to/text_encoder/model.fp16.safetensors"), +... # ... add more entries here +... ] +... ) ``` +The `entries` parameter also support passing an iterable of paths or bytes. This can prove useful if you have a loaded model and want to serialize it directly in a DDUF file instead of having to serialize each component to disk first and then as a DDUF file. Here is an example on how a `StableDiffusionPipeline` can be serialized as DDUF: + + +```python +# Export state_dicts one by one from a loaded pipeline +>>> from diffusers import DiffusionPipeline +>>> from typing import Generator, Tuple +>>> import safetensors.torch +>>> from huggingface_hub import export_entries_as_dduf +>>> pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") +... # ... do some work with the pipeline + +>>> def as_entries(pipe: DiffusionPipeline) -> Generator[Tuple[str, bytes], None, None]: +... # Build an generator that yields the entries to add to the DDUF file. +... # The first element of the tuple is the filename in the DDUF archive (must use UNIX separator!). The second element is the content of the file. +... # Entries will be evaluated lazily when the DDUF file is created (only 1 entry is loaded in memory at a time) +... yield "vae/config.json", pipe.vae.to_json_string().encode() +... yield "vae/diffusion_pytorch_model.safetensors", safetensors.torch.save(pipe.vae.state_dict()) +... 
yield "text_encoder/config.json", pipe.text_encoder.config.to_json_string().encode() +... yield "text_encoder/model.safetensors", safetensors.torch.save(pipe.text_encoder.state_dict()) +... # ... add more entries here + +>>> export_entries_as_dduf("stable-diffusion-v1-4.dduf", as_entries=as_entries(pipe)) +``` + +**Note:** in practice, `diffusers` provides a method to directly serialize a pipeline in a DDUF file. The snippet above is only meant as an example. + ### How to read a DDUF file? ```python @@ -58,7 +98,7 @@ DDUFEntry(filename='model_index.json', offset=66, length=587) ### Helpers -[[autodoc]] huggingface_hub.export_as_dduf +[[autodoc]] huggingface_hub.export_entries_as_dduf [[autodoc]] huggingface_hub.export_folder_as_dduf diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index d222831443..e010eef022 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -470,7 +470,7 @@ ], "serialization._dduf": [ "DDUFEntry", - "export_as_dduf", + "export_entries_as_dduf", "export_folder_as_dduf", "read_dduf_file", ], @@ -1003,7 +1003,7 @@ def __dir__(): ) from .serialization._dduf import ( DDUFEntry, # noqa: F401 - export_as_dduf, # noqa: F401 + export_entries_as_dduf, # noqa: F401 export_folder_as_dduf, # noqa: F401 read_dduf_file, # noqa: F401 ) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index 97385e3934..a29812db3c 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -137,9 +137,14 @@ def read_dduf_file(dduf_path: Union[Path, str]) -> Dict[str, DDUFEntry]: return entries -def export_as_dduf(dduf_path: Union[str, Path], entries: Iterable[Tuple[str, Union[str, Path, bytes]]]) -> None: +def export_entries_as_dduf( + dduf_path: Union[str, Path], entries: Iterable[Tuple[str, Union[str, Path, bytes]]] +) -> None: """Write a DDUF file from an iterable of entries. + This is a lower-level helper than [`export_folder_as_dduf`] that allows more flexibility when serializing data. + In particular, you don't need to save the data on disk before exporting it in the DDUF file. + Args: dduf_path (`str` or `Path`): The path to the DDUF file to write. @@ -150,12 +155,53 @@ def export_as_dduf(dduf_path: Union[str, Path], entries: Iterable[Tuple[str, Uni Raises: - [`DDUFExportError`]: If entry type is not supported (must be str, Path or bytes). + + Example: + ```python + # Export specific files from the local disk. + >>> from huggingface_hub import export_entries_as_dduf + >>> export_entries_as_dduf( + ... "stable-diffusion-v1-4-FP16.dduf", + ... entries=[ # List entries to add to the DDUF file (here, only FP16 weights) + ... ("model_index.json", "path/to/model_index.json"), + ... ("vae/config.json", "path/to/vae/config.json"), + ... ("vae/diffusion_pytorch_model.fp16.safetensors", "path/to/vae/diffusion_pytorch_model.fp16.safetensors"), + ... ("text_encoder/config.json", "path/to/text_encoder/config.json"), + ... ("text_encoder/model.fp16.safetensors", "path/to/text_encoder/model.fp16.safetensors"), + ... # ... add more entries here + ... ] + ... ) + ``` + + ```python + # Export state_dicts one by one from a loaded pipeline + >>> from diffusers import DiffusionPipeline + >>> from typing import Generator, Tuple + >>> import safetensors.torch + >>> from huggingface_hub import export_entries_as_dduf + >>> pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") + ... # ... 
do some work with the pipeline + + >>> def as_entries(pipe: DiffusionPipeline) -> Generator[Tuple[str, bytes], None, None]: + ... # Build an generator that yields the entries to add to the DDUF file. + ... # The first element of the tuple is the filename in the DDUF archive (must use UNIX separator!). The second element is the content of the file. + ... # Entries will be evaluated lazily when the DDUF file is created (only 1 entry is loaded in memory at a time) + ... yield "vae/config.json", pipe.vae.to_json_string().encode() + ... yield "vae/diffusion_pytorch_model.safetensors", safetensors.torch.save(pipe.vae.state_dict()) + ... yield "text_encoder/config.json", pipe.text_encoder.config.to_json_string().encode() + ... yield "text_encoder/model.safetensors", safetensors.torch.save(pipe.text_encoder.state_dict()) + ... # ... add more entries here + + >>> export_entries_as_dduf("stable-diffusion-v1-4.dduf", as_entries=as_entries(pipe)) + ``` """ logger.info("Exporting DDUF file '%s'", dduf_path) with zipfile.ZipFile(str(dduf_path), "w", zipfile.ZIP_STORED) as archive: for filename, content in entries: if "." + filename.split(".")[-1] not in DDUF_ALLOWED_ENTRIES: raise DDUFExportError(f"File type not allowed: {filename}") + if "\\" in filename: + raise DDUFExportError(f"Filenames must use UNIX separators: {filename}") logger.debug("Adding file %s to DDUF file", filename) _dump_content_in_archive(archive, filename, content) @@ -166,7 +212,7 @@ def export_folder_as_dduf(dduf_path: Union[str, Path], folder_path: Union[str, P """ Export a folder as a DDUF file. - AUses [`export_as_dduf`] under the hood. + AUses [`export_entries_as_dduf`] under the hood. Args: dduf_path (`str` or `Path`): @@ -195,7 +241,7 @@ def _iterate_over_folder() -> Iterable[Tuple[str, Path]]: continue yield path_in_archive.as_posix(), path - export_as_dduf(dduf_path, _iterate_over_folder()) + export_entries_as_dduf(dduf_path, _iterate_over_folder()) def add_entry_to_dduf(dduf_path: Union[str, Path], filename: str, content: Union[str, Path, bytes]) -> None: From bf7dc846830557b322707f1054a00a25903e1bfb Mon Sep 17 00:00:00 2001 From: Lucain Date: Fri, 6 Dec 2024 15:20:02 +0100 Subject: [PATCH 08/30] Update docs/source/en/package_reference/serialization.md Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- docs/source/en/package_reference/serialization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/package_reference/serialization.md b/docs/source/en/package_reference/serialization.md index 99ffd043d8..4653d74780 100644 --- a/docs/source/en/package_reference/serialization.md +++ b/docs/source/en/package_reference/serialization.md @@ -68,7 +68,7 @@ The `entries` parameter also support passing an iterable of paths or bytes. This ... yield "text_encoder/model.safetensors", safetensors.torch.save(pipe.text_encoder.state_dict()) ... # ... add more entries here ->>> export_entries_as_dduf("stable-diffusion-v1-4.dduf", as_entries=as_entries(pipe)) +>>> export_entries_as_dduf("stable-diffusion-v1-4.dduf", entries=as_entries(pipe)) ``` **Note:** in practice, `diffusers` provides a method to directly serialize a pipeline in a DDUF file. The snippet above is only meant as an example. 
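A quick way to verify any of the exports above is to read the archive back with `read_dduf_file`. The sketch below assumes the `stable-diffusion-v1-4.dduf` file produced by the `as_entries` example, with only the four entries shown there:

```python
# Read back the archive to check what was written (illustrative only)
>>> from huggingface_hub import read_dduf_file
>>> entries = read_dduf_file("stable-diffusion-v1-4.dduf")
>>> sorted(entries)  # filenames stored in the archive
['text_encoder/config.json', 'text_encoder/model.safetensors', 'vae/config.json', 'vae/diffusion_pytorch_model.safetensors']
```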
From 16c3e150baa861312aa470d31d80ca2d69c4d348 Mon Sep 17 00:00:00 2001 From: Lucain Date: Fri, 6 Dec 2024 16:03:07 +0100 Subject: [PATCH 09/30] Update src/huggingface_hub/serialization/_dduf.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/huggingface_hub/serialization/_dduf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index a29812db3c..1674cbbbfc 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -192,7 +192,7 @@ def export_entries_as_dduf( ... yield "text_encoder/model.safetensors", safetensors.torch.save(pipe.text_encoder.state_dict()) ... # ... add more entries here - >>> export_entries_as_dduf("stable-diffusion-v1-4.dduf", as_entries=as_entries(pipe)) + >>> export_entries_as_dduf("stable-diffusion-v1-4.dduf", entries=as_entries(pipe)) ``` """ logger.info("Exporting DDUF file '%s'", dduf_path) From ba1e6a4e05070d7a902b4d66db29fc9955150b2c Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Tue, 10 Dec 2024 10:10:07 +0100 Subject: [PATCH 10/30] compute data offset without private arg --- src/huggingface_hub/serialization/_dduf.py | 42 +++++++++++++++++++--- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index 1674cbbbfc..656de74e60 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -125,10 +125,7 @@ def read_dduf_file(dduf_path: Union[Path, str]) -> Dict[str, DDUFEntry]: if info.compress_type != zipfile.ZIP_STORED: raise DDUFCorruptedFileError("Data must not be compressed in DDUF file.") - # Use private attribute to get data range for this file. - # Let's reconsider later if it's too problematic (worse case, we can build our own metadata parser). - # Note: simply doing `info.header_offset + len(info.FileHeader())` doesn't work because of the ZIP64 extension. - offset = info._end_offset - info.compress_size # type: ignore[attr-defined] + offset = _get_data_offset(zf, info) entries[info.filename] = DDUFEntry( filename=info.filename, offset=offset, length=info.file_size, dduf_path=dduf_path @@ -287,3 +284,40 @@ def _dump_content_in_archive(archive: zipfile.ZipFile, filename: str, content: U archive_fh.write(content) else: raise DDUFExportError(f"Invalid content type for {filename}. Must be str, Path or bytes.") + + +def _get_data_offset(zf: zipfile.ZipFile, info: zipfile.ZipInfo) -> int: + """ + Calculate the data offset for a file in a ZIP archive. + + Args: + zf (`zipfile.ZipFile`): + The opened ZIP file. Must be opened in read mode. + info (`zipfile.ZipInfo`): + The file info. + + Returns: + int: The offset of the file data in the ZIP archive. 
+ """ + if zf.fp is None: + raise DDUFCorruptedFileError("ZipFile object must be opened in read mode.") + + # Step 1: Get the local file header offset + header_offset = info.header_offset + + # Step 2: Read the local file header + zf.fp.seek(header_offset) + local_file_header = zf.fp.read(30) # Fixed-size part of the local header + + if len(local_file_header) < 30: + raise DDUFCorruptedFileError("Incomplete local file header.") + + # Step 3: Parse the header fields to calculate the start of file data + # Local file header: https://en.wikipedia.org/wiki/ZIP_(file_format)#File_headers + filename_len = int.from_bytes(local_file_header[26:28], "little") + extra_field_len = int.from_bytes(local_file_header[28:30], "little") + + # Data offset is after the fixed header, filename, and extra fields + data_offset = header_offset + 30 + filename_len + extra_field_len + + return data_offset From 0546ca1c034f189b812b047a2bea9bbdd4f02a5e Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 11 Dec 2024 12:17:41 +0100 Subject: [PATCH 11/30] type annotations --- src/huggingface_hub/serialization/_dduf.py | 25 ++++++++++++---------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index 656de74e60..87d5c6ee71 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -1,5 +1,6 @@ import logging import mmap +import os import shutil import zipfile from contextlib import contextmanager @@ -61,7 +62,7 @@ def as_mmap(self) -> Generator[bytes, None, None]: with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mm: yield mm[self.offset : self.offset + self.length] - def read_text(self, encoding="utf-8") -> str: + def read_text(self, encoding: str = "utf-8") -> str: """Read the file as text. Useful for '.txt' and '.json' entries. @@ -77,14 +78,14 @@ def read_text(self, encoding="utf-8") -> str: return f.read(self.length).decode(encoding=encoding) -def read_dduf_file(dduf_path: Union[Path, str]) -> Dict[str, DDUFEntry]: +def read_dduf_file(dduf_path: Union[os.PathLike, str]) -> Dict[str, DDUFEntry]: """ Read a DDUF file and return a dictionary of entries. Only the metadata is read, the data is not loaded in memory. Args: - dduf_path (`str` or `Path`): + dduf_path (`str` or `os.PathLike`): The path to the DDUF file to read. Returns: @@ -135,7 +136,7 @@ def read_dduf_file(dduf_path: Union[Path, str]) -> Dict[str, DDUFEntry]: def export_entries_as_dduf( - dduf_path: Union[str, Path], entries: Iterable[Tuple[str, Union[str, Path, bytes]]] + dduf_path: Union[str, os.PathLike], entries: Iterable[Tuple[str, Union[str, Path, bytes]]] ) -> None: """Write a DDUF file from an iterable of entries. @@ -143,7 +144,7 @@ def export_entries_as_dduf( In particular, you don't need to save the data on disk before exporting it in the DDUF file. Args: - dduf_path (`str` or `Path`): + dduf_path (`str` or `os.PathLike`): The path to the DDUF file to write. entries (`Iterable[Tuple[str, Union[str, Path, bytes]]]`): An iterable of entries to write in the DDUF file. Each entry is a tuple with the filename and the content. @@ -205,16 +206,16 @@ def export_entries_as_dduf( logger.info("Done writing DDUF file %s", dduf_path) -def export_folder_as_dduf(dduf_path: Union[str, Path], folder_path: Union[str, Path]) -> None: +def export_folder_as_dduf(dduf_path: Union[str, os.PathLike], folder_path: Union[str, os.PathLike]) -> None: """ Export a folder as a DDUF file. 
AUses [`export_entries_as_dduf`] under the hood. Args: - dduf_path (`str` or `Path`): + dduf_path (`str` or `os.PathLike`): The path to the DDUF file to write. - folder_path (`str` or `Path`): + folder_path (`str` or `os.PathLike`): The path to the folder containing the diffusion model. Example: @@ -241,12 +242,14 @@ def _iterate_over_folder() -> Iterable[Tuple[str, Path]]: export_entries_as_dduf(dduf_path, _iterate_over_folder()) -def add_entry_to_dduf(dduf_path: Union[str, Path], filename: str, content: Union[str, Path, bytes]) -> None: +def add_entry_to_dduf( + dduf_path: Union[str, os.PathLike], filename: str, content: Union[str, os.PathLike, bytes] +) -> None: """ Add an entry to an existing DDUF file. Args: - dduf_path (`str` or `Path`): + dduf_path (`str` or `os.PathLike`): The path to the DDUF file to write. filename (`str`): The path to the file in the DDUF archive. @@ -274,7 +277,7 @@ def add_entry_to_dduf(dduf_path: Union[str, Path], filename: str, content: Union _dump_content_in_archive(archive, filename, content) -def _dump_content_in_archive(archive: zipfile.ZipFile, filename: str, content: Union[str, Path, bytes]) -> None: +def _dump_content_in_archive(archive: zipfile.ZipFile, filename: str, content: Union[str, os.PathLike, bytes]) -> None: with archive.open(filename, "w", force_zip64=True) as archive_fh: if isinstance(content, (str, Path)): content_path = Path(content) From 706597ec10a69c60f5daaead5112acf92d4b8cc1 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 11 Dec 2024 12:27:37 +0100 Subject: [PATCH 12/30] enforce 1 level of directory only --- src/huggingface_hub/serialization/_dduf.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index 87d5c6ee71..b27829bd2a 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -125,6 +125,8 @@ def read_dduf_file(dduf_path: Union[os.PathLike, str]) -> Dict[str, DDUFEntry]: logger.debug("Reading entry %s", info.filename) if info.compress_type != zipfile.ZIP_STORED: raise DDUFCorruptedFileError("Data must not be compressed in DDUF file.") + if info.filename.count("/") > 1: + raise DDUFCorruptedFileError(f"DDUF only supports 1 level of directory. Got {info.filename}.") offset = _get_data_offset(zf, info) @@ -196,10 +198,7 @@ def export_entries_as_dduf( logger.info("Exporting DDUF file '%s'", dduf_path) with zipfile.ZipFile(str(dduf_path), "w", zipfile.ZIP_STORED) as archive: for filename, content in entries: - if "." + filename.split(".")[-1] not in DDUF_ALLOWED_ENTRIES: - raise DDUFExportError(f"File type not allowed: {filename}") - if "\\" in filename: - raise DDUFExportError(f"Filenames must use UNIX separators: {filename}") + filename = _validate_dduf_entry_name(filename) logger.debug("Adding file %s to DDUF file", filename) _dump_content_in_archive(archive, filename, content) @@ -260,6 +259,8 @@ def add_entry_to_dduf( - [`DDUFExportError`]: If the entry already exists in the DDUF file. """ dduf_path = str(dduf_path) + filename = _validate_dduf_entry_name(filename) + # Ensure the zip file exists try: with zipfile.ZipFile(dduf_path, "r") as zf: @@ -289,6 +290,17 @@ def _dump_content_in_archive(archive: zipfile.ZipFile, filename: str, content: U raise DDUFExportError(f"Invalid content type for {filename}. Must be str, Path or bytes.") +def _validate_dduf_entry_name(entry_name: str) -> str: + if "." 
+ entry_name.split(".")[-1] not in DDUF_ALLOWED_ENTRIES: + raise DDUFExportError(f"File type not allowed: {entry_name}") + if "\\" in entry_name: + raise DDUFExportError(f"Entry names must use UNIX separators ('/'). Got {entry_name}.") + entry_name = entry_name.strip("/") + if entry_name.count("/") > 1: + raise DDUFExportError(f"DDUF only supports 1 level of directory. Got {entry_name}.") + return entry_name + + def _get_data_offset(zf: zipfile.ZipFile, info: zipfile.ZipInfo) -> int: """ Calculate the data offset for a file in a ZIP archive. From a6588aafdac9302949b64bfd6c56fd29af8e2226 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 11 Dec 2024 12:47:26 +0100 Subject: [PATCH 13/30] raise correct error DDUFInvalidEntryNameError --- src/huggingface_hub/errors.py | 4 ++++ src/huggingface_hub/serialization/_dduf.py | 25 +++++++++++++++------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/huggingface_hub/errors.py b/src/huggingface_hub/errors.py index e4881e7596..226c8bb400 100644 --- a/src/huggingface_hub/errors.py +++ b/src/huggingface_hub/errors.py @@ -323,3 +323,7 @@ class DDUFCorruptedFileError(DDUFError): class DDUFExportError(DDUFError): """Base exception for errors during DDUF export.""" + + +class DDUFInvalidEntryNameError(DDUFExportError): + """Exception thrown when the entry name is invalid.""" diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index b27829bd2a..ce5a87c06d 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Dict, Generator, Iterable, Tuple, Union -from ..errors import DDUFCorruptedFileError, DDUFExportError +from ..errors import DDUFCorruptedFileError, DDUFExportError, DDUFInvalidEntryNameError logger = logging.getLogger(__name__) @@ -125,8 +125,11 @@ def read_dduf_file(dduf_path: Union[os.PathLike, str]) -> Dict[str, DDUFEntry]: logger.debug("Reading entry %s", info.filename) if info.compress_type != zipfile.ZIP_STORED: raise DDUFCorruptedFileError("Data must not be compressed in DDUF file.") - if info.filename.count("/") > 1: - raise DDUFCorruptedFileError(f"DDUF only supports 1 level of directory. Got {info.filename}.") + + try: + _validate_dduf_entry_name(info.filename) + except DDUFInvalidEntryNameError as e: + raise DDUFCorruptedFileError(f"Invalid entry name in DDUF file: {info.filename}") from e offset = _get_data_offset(zf, info) @@ -198,7 +201,10 @@ def export_entries_as_dduf( logger.info("Exporting DDUF file '%s'", dduf_path) with zipfile.ZipFile(str(dduf_path), "w", zipfile.ZIP_STORED) as archive: for filename, content in entries: - filename = _validate_dduf_entry_name(filename) + try: + filename = _validate_dduf_entry_name(filename) + except DDUFInvalidEntryNameError as e: + raise DDUFExportError(f"Invalid entry name: {filename}") from e logger.debug("Adding file %s to DDUF file", filename) _dump_content_in_archive(archive, filename, content) @@ -259,7 +265,10 @@ def add_entry_to_dduf( - [`DDUFExportError`]: If the entry already exists in the DDUF file. 
""" dduf_path = str(dduf_path) - filename = _validate_dduf_entry_name(filename) + try: + filename = _validate_dduf_entry_name(filename) + except DDUFInvalidEntryNameError as e: + raise DDUFExportError(f"Invalid entry name: {filename}") from e # Ensure the zip file exists try: @@ -292,12 +301,12 @@ def _dump_content_in_archive(archive: zipfile.ZipFile, filename: str, content: U def _validate_dduf_entry_name(entry_name: str) -> str: if "." + entry_name.split(".")[-1] not in DDUF_ALLOWED_ENTRIES: - raise DDUFExportError(f"File type not allowed: {entry_name}") + raise DDUFInvalidEntryNameError(f"File type not allowed: {entry_name}") if "\\" in entry_name: - raise DDUFExportError(f"Entry names must use UNIX separators ('/'). Got {entry_name}.") + raise DDUFInvalidEntryNameError(f"Entry names must use UNIX separators ('/'). Got {entry_name}.") entry_name = entry_name.strip("/") if entry_name.count("/") > 1: - raise DDUFExportError(f"DDUF only supports 1 level of directory. Got {entry_name}.") + raise DDUFInvalidEntryNameError(f"DDUF only supports 1 level of directory. Got {entry_name}.") return entry_name From 947a593ea129c31c867cf2ef6c77bcf19095ac13 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 11 Dec 2024 13:29:49 +0100 Subject: [PATCH 14/30] add tests --- .../en/package_reference/serialization.md | 2 + src/huggingface_hub/serialization/_dduf.py | 7 +- tests/test_dduf.py | 153 ++++++++++++++++++ 3 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 tests/test_dduf.py diff --git a/docs/source/en/package_reference/serialization.md b/docs/source/en/package_reference/serialization.md index 4653d74780..68682a8f3a 100644 --- a/docs/source/en/package_reference/serialization.md +++ b/docs/source/en/package_reference/serialization.md @@ -114,6 +114,8 @@ DDUFEntry(filename='model_index.json', offset=66, length=587) [[autodoc]] huggingface_hub.errors.DDUFExportError +[[autodoc]] huggingface_hub.errors.DDUFInvalidEntryNameError + ## Save torch state dict The main helper of the `serialization` module takes a torch `nn.Module` as input and saves it to disk. It handles the logic to save shared tensors (see [safetensors explanation](https://huggingface.co/docs/safetensors/torch_shared_tensors)) as well as logic to split the state dictionary into shards, using [`split_torch_state_dict_into_shards`] under the hood. At the moment, only `torch` framework is supported. 
diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index ce5a87c06d..3a0e3d143a 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -199,8 +199,13 @@ def export_entries_as_dduf( ``` """ logger.info("Exporting DDUF file '%s'", dduf_path) + filenames = set() with zipfile.ZipFile(str(dduf_path), "w", zipfile.ZIP_STORED) as archive: for filename, content in entries: + if filename in filenames: + raise DDUFExportError(f"Can't add duplicate entry: {filename}") + filenames.add(filename) + try: filename = _validate_dduf_entry_name(filename) except DDUFInvalidEntryNameError as e: @@ -239,7 +244,7 @@ def _iterate_over_folder() -> Iterable[Tuple[str, Path]]: logger.debug("Skipping file %s (file type not allowed)", path) continue path_in_archive = path.relative_to(folder_path) - if len(path_in_archive.parts) > 3: + if len(path_in_archive.parts) >= 3: logger.debug("Skipping file %s (nested directories not allowed)", path) continue yield path_in_archive.as_posix(), path diff --git a/tests/test_dduf.py b/tests/test_dduf.py new file mode 100644 index 0000000000..614094f891 --- /dev/null +++ b/tests/test_dduf.py @@ -0,0 +1,153 @@ +from pathlib import Path + +import pytest +from pytest_mock import MockerFixture +from typing import Iterable, Tuple, Union +import json +from huggingface_hub.errors import DDUFInvalidEntryNameError, DDUFExportError +from huggingface_hub.serialization._dduf import ( + DDUFEntry, + _validate_dduf_entry_name, + export_entries_as_dduf, + read_dduf_file, + export_folder_as_dduf, +) +import zipfile + + +class TestDDUFEntry: + @pytest.fixture + def dummy_entry(self, tmp_path: Path) -> DDUFEntry: + dummy_dduf = tmp_path / "dummy_dduf.dduf" + dummy_dduf.write_bytes(b"somethingCONTENTsomething") + return DDUFEntry(filename="dummy.json", length=7, offset=9, dduf_path=dummy_dduf) + + def test_dataclass(self, dummy_entry: DDUFEntry): + assert dummy_entry.filename == "dummy.json" + assert dummy_entry.length == 7 + assert dummy_entry.offset == 9 + assert str(dummy_entry.dduf_path).endswith("dummy_dduf.dduf") + + def test_read_text(self, dummy_entry: DDUFEntry): + assert dummy_entry.read_text() == "CONTENT" + + def test_as_mmap(self, dummy_entry: DDUFEntry): + with dummy_entry.as_mmap() as mmap: + assert mmap == b"CONTENT" + + +class TestUtils: + @pytest.mark.parametrize("filename", ["dummy.txt", "dummy.json", "dummy.safetensors"]) + def test_entry_name_valid_extension(self, filename: str): + assert _validate_dduf_entry_name(filename) == filename + + @pytest.mark.parametrize("filename", ["dummy", "dummy.bin", "dummy.dduf"]) + def test_entry_name_invalid_extension(self, filename: str): + with pytest.raises(DDUFInvalidEntryNameError): + _validate_dduf_entry_name(filename) + + @pytest.mark.parametrize("filename", ["encoder\\dummy.json", "C:\\dummy.json"]) + def test_entry_name_no_windows_path(self, filename: str): + with pytest.raises(DDUFInvalidEntryNameError): + _validate_dduf_entry_name(filename) + + def test_entry_name_stripped( + self, + ): + assert _validate_dduf_entry_name("/dummy.json") == "dummy.json" + + def test_entry_name_no_nested_directory(self): + _validate_dduf_entry_name("bar/dummy.json") # 1 level is ok + with pytest.raises(DDUFInvalidEntryNameError): + _validate_dduf_entry_name("foo/bar/dummy.json") # not more + + +class TestExportFolder: + @pytest.fixture + def dummy_folder(self, tmp_path: Path): + folder_path = tmp_path / "dummy_folder" + folder_path.mkdir() + encoder_path 
= folder_path / "encoder" + encoder_path.mkdir() + subdir_path = encoder_path / "subdir" + subdir_path.mkdir() + + (folder_path / "config.json").touch() + (folder_path / "model.safetensors").touch() + (folder_path / "model.bin").touch() # won't be included + (encoder_path / "config.json").touch() + (encoder_path / "model.safetensors").touch() + (encoder_path / "model.bin").touch() # won't be included + (subdir_path / "config.json").touch() # won't be included + return folder_path + + def test_export_folder(self, dummy_folder: Path, mocker: MockerFixture): + mock = mocker.patch("huggingface_hub.serialization._dduf.export_entries_as_dduf") + export_folder_as_dduf("dummy.dduf", dummy_folder) + mock.assert_called_once() + args = mock.call_args_list[0].args + + assert args[0] == "dummy.dduf" + assert list(args[1]) == [ + # args[1] is a generator of tuples (path_in_archive, path_on_disk) + ("config.json", dummy_folder / "config.json"), + ("model.safetensors", dummy_folder / "model.safetensors"), + ("encoder/config.json", dummy_folder / "encoder/config.json"), + ("encoder/model.safetensors", dummy_folder / "encoder/model.safetensors"), + ] + +class TestExportEntries: + @pytest.fixture + def dummy_entries(self, tmp_path: Path) -> Iterable[Tuple[str, Union[str, Path, bytes]]]: + (tmp_path / "config.json").write_text(json.dumps({"foo": "bar"})) + (tmp_path / "does_have_to_be_same_name.safetensors").write_bytes(b"this is safetensors content") + + return [ + ("config.json", str(tmp_path / "config.json")), # string path + ("model.safetensors", tmp_path / "does_have_to_be_same_name.safetensors"), # pathlib path + ("hello.txt", b"hello world"), # raw bytes + ] + + def test_export_entries(self, tmp_path: Path, dummy_entries: Iterable[Tuple[str, Union[str, Path, bytes]]]): + export_entries_as_dduf(tmp_path / "dummy.dduf", dummy_entries) + + with zipfile.ZipFile(tmp_path / "dummy.dduf", "r") as archive: + assert archive.compression == zipfile.ZIP_STORED # uncompressed! 
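+ # DDUF requires uncompressed (stored) entries so readers can locate and mmap each file by byte offset.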
+ assert archive.namelist() == ["config.json", "model.safetensors", "hello.txt"] + assert archive.read("config.json") == b'{"foo": "bar"}' + assert archive.read("model.safetensors") == b"this is safetensors content" + assert archive.read("hello.txt") == b"hello world" + + def test_export_entries_invalid_name(self, tmp_path: Path): + with pytest.raises(DDUFExportError, match="Invalid entry name") as e: + export_entries_as_dduf(tmp_path / "dummy.dduf", [("config", "config.json")]) + assert isinstance(e.value.__cause__, DDUFInvalidEntryNameError) + + def test_export_entries_no_duplicate(self, tmp_path: Path): + with pytest.raises(DDUFExportError, match="Can't add duplicate entry"): + export_entries_as_dduf(tmp_path / "dummy.dduf", [("config.json", b"content1"), ("config.json", b"content2")]) + +class TestReadDDUFFile: + @pytest.fixture + def dummy_dduf_file(self, tmp_path: Path) -> Path: + with zipfile.ZipFile(tmp_path / "dummy.dduf", "w") as archive: + archive.writestr("config.json", b'{"foo": "bar"}') + archive.writestr("model.safetensors", b"this is safetensors content") + archive.writestr("hello.txt", b"hello world") + return tmp_path / "dummy.dduf" + + def test_read_dduf_file(self, dummy_dduf_file: Path): + entries = read_dduf_file(dummy_dduf_file) + assert len(entries) == 3 + + assert entries["config.json"].filename == "config.json" + assert entries["config.json"].dduf_path == dummy_dduf_file + assert entries["config.json"].read_text() == '{"foo": "bar"}' + + assert entries["model.safetensors"].filename == "model.safetensors" + assert entries["model.safetensors"].dduf_path == dummy_dduf_file + assert entries["model.safetensors"].read_text() == "this is safetensors content" + + assert entries["hello.txt"].filename == "hello.txt" + assert entries["hello.txt"].dduf_path == dummy_dduf_file + assert entries["hello.txt"].read_text() == "hello world" \ No newline at end of file From 2881a57f21d54d0c1aeb0fe087077b8e9bbe77e4 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 11 Dec 2024 13:31:53 +0100 Subject: [PATCH 15/30] note --- src/huggingface_hub/serialization/_dduf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index 3a0e3d143a..b0d214e2d6 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -258,6 +258,8 @@ def add_entry_to_dduf( """ Add an entry to an existing DDUF file. + Note: this method is not tested, not documented and not publicly exposed. Used for internal tests + future development. + Args: dduf_path (`str` or `os.PathLike`): The path to the DDUF file to write. 
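The duplicate-entry and nesting checks introduced above surface as a `DDUFExportError` at export time. Below is a minimal sketch of that behaviour, assuming a scratch `dummy.dduf` path and illustrative byte contents (placeholders, not real model files):

```python
from huggingface_hub import export_entries_as_dduf
from huggingface_hub.errors import DDUFExportError

entries = [
    ("model_index.json", b'{"encoder": ["transformers", "CLIPTextModel"]}'),
    ("encoder/config.json", b"{}"),
    ("encoder/config.json", b"{}"),  # same archive name twice -> rejected
    # ("a/b/config.json", b"{}"),    # more than one folder level -> "Invalid entry name"
]

try:
    export_entries_as_dduf(dduf_path="dummy.dduf", entries=entries)
except DDUFExportError as err:
    print(err)  # Can't add duplicate entry: encoder/config.json
```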
From 2b7baf5484e14888e8f731cc1bfadf3c5f37c7bb Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 11 Dec 2024 13:34:37 +0100 Subject: [PATCH 16/30] test uncompress --- tests/test_dduf.py | 65 +++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/tests/test_dduf.py b/tests/test_dduf.py index 614094f891..476ebaad8b 100644 --- a/tests/test_dduf.py +++ b/tests/test_dduf.py @@ -1,18 +1,19 @@ +import json +import zipfile from pathlib import Path +from typing import Iterable, Tuple, Union import pytest from pytest_mock import MockerFixture -from typing import Iterable, Tuple, Union -import json -from huggingface_hub.errors import DDUFInvalidEntryNameError, DDUFExportError + +from huggingface_hub.errors import DDUFExportError, DDUFInvalidEntryNameError from huggingface_hub.serialization._dduf import ( DDUFEntry, _validate_dduf_entry_name, export_entries_as_dduf, - read_dduf_file, export_folder_as_dduf, + read_dduf_file, ) -import zipfile class TestDDUFEntry: @@ -96,6 +97,7 @@ def test_export_folder(self, dummy_folder: Path, mocker: MockerFixture): ("encoder/model.safetensors", dummy_folder / "encoder/model.safetensors"), ] + class TestExportEntries: @pytest.fixture def dummy_entries(self, tmp_path: Path) -> Iterable[Tuple[str, Union[str, Path, bytes]]]: @@ -103,16 +105,16 @@ def dummy_entries(self, tmp_path: Path) -> Iterable[Tuple[str, Union[str, Path, (tmp_path / "does_have_to_be_same_name.safetensors").write_bytes(b"this is safetensors content") return [ - ("config.json", str(tmp_path / "config.json")), # string path - ("model.safetensors", tmp_path / "does_have_to_be_same_name.safetensors"), # pathlib path - ("hello.txt", b"hello world"), # raw bytes + ("config.json", str(tmp_path / "config.json")), # string path + ("model.safetensors", tmp_path / "does_have_to_be_same_name.safetensors"), # pathlib path + ("hello.txt", b"hello world"), # raw bytes ] - + def test_export_entries(self, tmp_path: Path, dummy_entries: Iterable[Tuple[str, Union[str, Path, bytes]]]): export_entries_as_dduf(tmp_path / "dummy.dduf", dummy_entries) with zipfile.ZipFile(tmp_path / "dummy.dduf", "r") as archive: - assert archive.compression == zipfile.ZIP_STORED # uncompressed! + assert archive.compression == zipfile.ZIP_STORED # uncompressed! 
assert archive.namelist() == ["config.json", "model.safetensors", "hello.txt"] assert archive.read("config.json") == b'{"foo": "bar"}' assert archive.read("model.safetensors") == b"this is safetensors content" @@ -125,7 +127,10 @@ def test_export_entries_invalid_name(self, tmp_path: Path): def test_export_entries_no_duplicate(self, tmp_path: Path): with pytest.raises(DDUFExportError, match="Can't add duplicate entry"): - export_entries_as_dduf(tmp_path / "dummy.dduf", [("config.json", b"content1"), ("config.json", b"content2")]) + export_entries_as_dduf( + tmp_path / "dummy.dduf", [("config.json", b"content1"), ("config.json", b"content2")] + ) + class TestReadDDUFFile: @pytest.fixture @@ -135,19 +140,31 @@ def dummy_dduf_file(self, tmp_path: Path) -> Path: archive.writestr("model.safetensors", b"this is safetensors content") archive.writestr("hello.txt", b"hello world") return tmp_path / "dummy.dduf" - + def test_read_dduf_file(self, dummy_dduf_file: Path): entries = read_dduf_file(dummy_dduf_file) assert len(entries) == 3 - - assert entries["config.json"].filename == "config.json" - assert entries["config.json"].dduf_path == dummy_dduf_file - assert entries["config.json"].read_text() == '{"foo": "bar"}' - - assert entries["model.safetensors"].filename == "model.safetensors" - assert entries["model.safetensors"].dduf_path == dummy_dduf_file - assert entries["model.safetensors"].read_text() == "this is safetensors content" - - assert entries["hello.txt"].filename == "hello.txt" - assert entries["hello.txt"].dduf_path == dummy_dduf_file - assert entries["hello.txt"].read_text() == "hello world" \ No newline at end of file + config_entry = entries["config.json"] + model_entry = entries["model.safetensors"] + hello_entry = entries["hello.txt"] + + assert config_entry.filename == "config.json" + assert config_entry.dduf_path == dummy_dduf_file + assert config_entry.read_text() == '{"foo": "bar"}' + with dummy_dduf_file.open("rb") as f: + f.seek(config_entry.offset) + assert f.read(config_entry.length) == b'{"foo": "bar"}' + + assert model_entry.filename == "model.safetensors" + assert model_entry.dduf_path == dummy_dduf_file + assert model_entry.read_text() == "this is safetensors content" + with dummy_dduf_file.open("rb") as f: + f.seek(model_entry.offset) + assert f.read(model_entry.length) == b"this is safetensors content" + + assert hello_entry.filename == "hello.txt" + assert hello_entry.dduf_path == dummy_dduf_file + assert hello_entry.read_text() == "hello world" + with dummy_dduf_file.open("rb") as f: + f.seek(hello_entry.offset) + assert f.read(hello_entry.length) == b"hello world" From f5f0f252fe0afc2d414e849833a33f98475054d9 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 11 Dec 2024 14:38:46 +0100 Subject: [PATCH 17/30] required model_index.json --- src/huggingface_hub/serialization/_dduf.py | 6 ++- tests/test_dduf.py | 44 +++++++++++++--------- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index b0d214e2d6..743e63f657 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -14,7 +14,6 @@ logger = logging.getLogger(__name__) DDUF_ALLOWED_ENTRIES = { - ".gguf", ".json", ".model", ".safetensors", @@ -136,6 +135,8 @@ def read_dduf_file(dduf_path: Union[os.PathLike, str]) -> Dict[str, DDUFEntry]: entries[info.filename] = DDUFEntry( filename=info.filename, offset=offset, length=info.file_size, dduf_path=dduf_path ) + if 
"model_index.json" not in entries: + raise DDUFCorruptedFileError("Missing required 'model_index.json' entry in DDUF file.") logger.info("Done reading DDUF file %s. Found %d entries", dduf_path, len(entries)) return entries @@ -213,6 +214,9 @@ def export_entries_as_dduf( logger.debug("Adding file %s to DDUF file", filename) _dump_content_in_archive(archive, filename, content) + if "model_index.json" not in filenames: + raise DDUFExportError("Missing required 'model_index.json' entry in DDUF file.") + logger.info("Done writing DDUF file %s", dduf_path) diff --git a/tests/test_dduf.py b/tests/test_dduf.py index 476ebaad8b..73b1d9e506 100644 --- a/tests/test_dduf.py +++ b/tests/test_dduf.py @@ -6,7 +6,7 @@ import pytest from pytest_mock import MockerFixture -from huggingface_hub.errors import DDUFExportError, DDUFInvalidEntryNameError +from huggingface_hub.errors import DDUFCorruptedFileError, DDUFExportError, DDUFInvalidEntryNameError from huggingface_hub.serialization._dduf import ( DDUFEntry, _validate_dduf_entry_name, @@ -42,7 +42,7 @@ class TestUtils: def test_entry_name_valid_extension(self, filename: str): assert _validate_dduf_entry_name(filename) == filename - @pytest.mark.parametrize("filename", ["dummy", "dummy.bin", "dummy.dduf"]) + @pytest.mark.parametrize("filename", ["dummy", "dummy.bin", "dummy.dduf", "dummy.gguf"]) def test_entry_name_invalid_extension(self, filename: str): with pytest.raises(DDUFInvalidEntryNameError): _validate_dduf_entry_name(filename) @@ -101,12 +101,12 @@ def test_export_folder(self, dummy_folder: Path, mocker: MockerFixture): class TestExportEntries: @pytest.fixture def dummy_entries(self, tmp_path: Path) -> Iterable[Tuple[str, Union[str, Path, bytes]]]: - (tmp_path / "config.json").write_text(json.dumps({"foo": "bar"})) - (tmp_path / "does_have_to_be_same_name.safetensors").write_bytes(b"this is safetensors content") + (tmp_path / "model_index.json").write_text(json.dumps({"foo": "bar"})) + (tmp_path / "doesnt_have_to_be_same_name.safetensors").write_bytes(b"this is safetensors content") return [ - ("config.json", str(tmp_path / "config.json")), # string path - ("model.safetensors", tmp_path / "does_have_to_be_same_name.safetensors"), # pathlib path + ("model_index.json", str(tmp_path / "model_index.json")), # string path + ("model.safetensors", tmp_path / "doesnt_have_to_be_same_name.safetensors"), # pathlib path ("hello.txt", b"hello world"), # raw bytes ] @@ -115,28 +115,32 @@ def test_export_entries(self, tmp_path: Path, dummy_entries: Iterable[Tuple[str, with zipfile.ZipFile(tmp_path / "dummy.dduf", "r") as archive: assert archive.compression == zipfile.ZIP_STORED # uncompressed! 
- assert archive.namelist() == ["config.json", "model.safetensors", "hello.txt"] - assert archive.read("config.json") == b'{"foo": "bar"}' + assert archive.namelist() == ["model_index.json", "model.safetensors", "hello.txt"] + assert archive.read("model_index.json") == b'{"foo": "bar"}' assert archive.read("model.safetensors") == b"this is safetensors content" assert archive.read("hello.txt") == b"hello world" def test_export_entries_invalid_name(self, tmp_path: Path): with pytest.raises(DDUFExportError, match="Invalid entry name") as e: - export_entries_as_dduf(tmp_path / "dummy.dduf", [("config", "config.json")]) + export_entries_as_dduf(tmp_path / "dummy.dduf", [("config", "model_index.json")]) assert isinstance(e.value.__cause__, DDUFInvalidEntryNameError) def test_export_entries_no_duplicate(self, tmp_path: Path): with pytest.raises(DDUFExportError, match="Can't add duplicate entry"): export_entries_as_dduf( - tmp_path / "dummy.dduf", [("config.json", b"content1"), ("config.json", b"content2")] + tmp_path / "dummy.dduf", [("model_index.json", b"content1"), ("model_index.json", b"content2")] ) + def test_export_entries_model_index_required(self, tmp_path: Path): + with pytest.raises(DDUFExportError, match="Missing required 'model_index.json' entry"): + export_entries_as_dduf(tmp_path / "dummy.dduf", [("model.safetensors", b"content")]) + class TestReadDDUFFile: @pytest.fixture def dummy_dduf_file(self, tmp_path: Path) -> Path: with zipfile.ZipFile(tmp_path / "dummy.dduf", "w") as archive: - archive.writestr("config.json", b'{"foo": "bar"}') + archive.writestr("model_index.json", b'{"foo": "bar"}') archive.writestr("model.safetensors", b"this is safetensors content") archive.writestr("hello.txt", b"hello world") return tmp_path / "dummy.dduf" @@ -144,16 +148,16 @@ def dummy_dduf_file(self, tmp_path: Path) -> Path: def test_read_dduf_file(self, dummy_dduf_file: Path): entries = read_dduf_file(dummy_dduf_file) assert len(entries) == 3 - config_entry = entries["config.json"] + index_entry = entries["model_index.json"] model_entry = entries["model.safetensors"] hello_entry = entries["hello.txt"] - assert config_entry.filename == "config.json" - assert config_entry.dduf_path == dummy_dduf_file - assert config_entry.read_text() == '{"foo": "bar"}' + assert index_entry.filename == "model_index.json" + assert index_entry.dduf_path == dummy_dduf_file + assert index_entry.read_text() == '{"foo": "bar"}' with dummy_dduf_file.open("rb") as f: - f.seek(config_entry.offset) - assert f.read(config_entry.length) == b'{"foo": "bar"}' + f.seek(index_entry.offset) + assert f.read(index_entry.length) == b'{"foo": "bar"}' assert model_entry.filename == "model.safetensors" assert model_entry.dduf_path == dummy_dduf_file @@ -168,3 +172,9 @@ def test_read_dduf_file(self, dummy_dduf_file: Path): with dummy_dduf_file.open("rb") as f: f.seek(hello_entry.offset) assert f.read(hello_entry.length) == b"hello world" + + def test_model_index_required(self, tmp_path: Path): + with zipfile.ZipFile(tmp_path / "dummy.dduf", "w") as archive: + archive.writestr("model.safetensors", b"this is safetensors content") + with pytest.raises(DDUFCorruptedFileError, match="Missing required 'model_index.json' entry"): + read_dduf_file(tmp_path / "dummy.dduf") From 0d1045dd8afb3788eecf45f20a6b5b4d41de49fd Mon Sep 17 00:00:00 2001 From: Lucain Date: Wed, 11 Dec 2024 15:10:57 +0100 Subject: [PATCH 18/30] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Co-authored-by: Célina --- docs/source/en/package_reference/serialization.md | 6 +++--- src/huggingface_hub/serialization/_dduf.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/en/package_reference/serialization.md b/docs/source/en/package_reference/serialization.md index 68682a8f3a..f4aedc4ec5 100644 --- a/docs/source/en/package_reference/serialization.md +++ b/docs/source/en/package_reference/serialization.md @@ -8,7 +8,7 @@ rendered properly in your Markdown viewer. ## DDUF file format -DDUF is a file format designed for diffusers models. It allows saving all the information to run a model in a single file. This work is inspired by the GGUF format. `huggingface_hub` provides helpers to save and load DDUF files, ensuring the file format is respected. +DDUF is a file format designed for diffusion models. It allows saving all the information to run a model in a single file. This work is inspired by the GGUF format. `huggingface_hub` provides helpers to save and load DDUF files, ensuring the file format is respected. @@ -28,7 +28,7 @@ Here is how to export a folder containing different parts of a diffusion model u >>> export_folder_as_dduf("FLUX.1-dev.dduf", folder_path="path/to/FLUX.1-dev") ``` -For more flexibility, to can use [`export_entries_as_dduf`] and pass a list of files to include in the final DDUF file: +For more flexibility, you can use [`export_entries_as_dduf`] and pass a list of files to include in the final DDUF file: ```python # Export specific files from the local disk. @@ -46,7 +46,7 @@ For more flexibility, to can use [`export_entries_as_dduf`] and pass a list of f ... ) ``` -The `entries` parameter also support passing an iterable of paths or bytes. This can prove useful if you have a loaded model and want to serialize it directly in a DDUF file instead of having to serialize each component to disk first and then as a DDUF file. Here is an example on how a `StableDiffusionPipeline` can be serialized as DDUF: +The `entries` parameter also supports passing an iterable of paths or bytes. This can prove useful if you have a loaded model and want to serialize it directly into a DDUF file instead of having to serialize each component to disk first and then as a DDUF file. 
Here is an example of how a `StableDiffusionPipeline` can be serialized as DDUF: ```python diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index 743e63f657..e6aeb5ba66 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -97,7 +97,7 @@ def read_dduf_file(dduf_path: Union[os.PathLike, str]) -> Dict[str, DDUFEntry]: Example: ```python >>> import json - >>> import safetensors.load + >>> import safetensors.torch >>> from huggingface_hub import read_dduf_file # Read DDUF metadata @@ -235,14 +235,14 @@ def export_folder_as_dduf(dduf_path: Union[str, os.PathLike], folder_path: Union Example: ```python >>> from huggingface_hub import export_folder_as_dduf - >>> export_folder_as_dduf("FLUX.1-dev.dduf", diffuser_path="path/to/FLUX.1-dev") + >>> export_folder_as_dduf("FLUX.1-dev.dduf", folder_path="path/to/FLUX.1-dev") ``` """ folder_path = Path(folder_path) def _iterate_over_folder() -> Iterable[Tuple[str, Path]]: for path in Path(folder_path).glob("**/*"): - if path.is_dir(): + if not path.is_file(): continue if path.suffix not in DDUF_ALLOWED_ENTRIES: logger.debug("Skipping file %s (file type not allowed)", path) From dca1586c901de2c44de50c562c21bbd5ee2f81b3 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Wed, 11 Dec 2024 15:32:54 +0100 Subject: [PATCH 19/30] use f-string in logs --- src/huggingface_hub/serialization/_dduf.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index 743e63f657..1a3af60d9b 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -118,10 +118,10 @@ def read_dduf_file(dduf_path: Union[os.PathLike, str]) -> Dict[str, DDUFEntry]: """ entries = {} dduf_path = Path(dduf_path) - logger.info("Reading DDUF file %s", dduf_path) + logger.info(f"Reading DDUF file {dduf_path}") with zipfile.ZipFile(str(dduf_path), "r") as zf: for info in zf.infolist(): - logger.debug("Reading entry %s", info.filename) + logger.debug(f"Reading entry {info.filename}") if info.compress_type != zipfile.ZIP_STORED: raise DDUFCorruptedFileError("Data must not be compressed in DDUF file.") @@ -137,7 +137,7 @@ def read_dduf_file(dduf_path: Union[os.PathLike, str]) -> Dict[str, DDUFEntry]: ) if "model_index.json" not in entries: raise DDUFCorruptedFileError("Missing required 'model_index.json' entry in DDUF file.") - logger.info("Done reading DDUF file %s. Found %d entries", dduf_path, len(entries)) + logger.info(f"Done reading DDUF file {dduf_path}. 
Found {len(entries)} entries") return entries @@ -199,7 +199,7 @@ def export_entries_as_dduf( >>> export_entries_as_dduf("stable-diffusion-v1-4.dduf", entries=as_entries(pipe)) ``` """ - logger.info("Exporting DDUF file '%s'", dduf_path) + logger.info(f"Exporting DDUF file '{dduf_path}'") filenames = set() with zipfile.ZipFile(str(dduf_path), "w", zipfile.ZIP_STORED) as archive: for filename, content in entries: @@ -211,13 +211,13 @@ def export_entries_as_dduf( filename = _validate_dduf_entry_name(filename) except DDUFInvalidEntryNameError as e: raise DDUFExportError(f"Invalid entry name: {filename}") from e - logger.debug("Adding file %s to DDUF file", filename) + logger.debug(f"Adding entry '{filename}' to DDUF file") _dump_content_in_archive(archive, filename, content) if "model_index.json" not in filenames: raise DDUFExportError("Missing required 'model_index.json' entry in DDUF file.") - logger.info("Done writing DDUF file %s", dduf_path) + logger.info(f"Done writing DDUF file {dduf_path}") def export_folder_as_dduf(dduf_path: Union[str, os.PathLike], folder_path: Union[str, os.PathLike]) -> None: @@ -245,11 +245,11 @@ def _iterate_over_folder() -> Iterable[Tuple[str, Path]]: if path.is_dir(): continue if path.suffix not in DDUF_ALLOWED_ENTRIES: - logger.debug("Skipping file %s (file type not allowed)", path) + logger.debug(f"Skipping file '{path}' (file type not allowed)") continue path_in_archive = path.relative_to(folder_path) if len(path_in_archive.parts) >= 3: - logger.debug("Skipping file %s (nested directories not allowed)", path) + logger.debug(f"Skipping file '{path}' (nested directories not allowed)") continue yield path_in_archive.as_posix(), path @@ -294,7 +294,7 @@ def add_entry_to_dduf( # Reopen the zip in append mode and add the new file with zipfile.ZipFile(dduf_path, "a", zipfile.ZIP_STORED) as archive: - logger.debug("Adding file %s to DDUF file", filename) + logger.debug(f"Adding file '{filename}' to DDUF file") _dump_content_in_archive(archive, filename, content) From 157633c9480151f8f8645af9204ec4f388807d4b Mon Sep 17 00:00:00 2001 From: Lucain Date: Wed, 11 Dec 2024 15:34:44 +0100 Subject: [PATCH 20/30] Update docs/source/en/package_reference/serialization.md Co-authored-by: Pedro Cuenca --- docs/source/en/package_reference/serialization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/package_reference/serialization.md b/docs/source/en/package_reference/serialization.md index f4aedc4ec5..b50c26ed10 100644 --- a/docs/source/en/package_reference/serialization.md +++ b/docs/source/en/package_reference/serialization.md @@ -59,7 +59,7 @@ The `entries` parameter also supports passing an iterable of paths or bytes. Thi ... # ... do some work with the pipeline >>> def as_entries(pipe: DiffusionPipeline) -> Generator[Tuple[str, bytes], None, None]: -... # Build an generator that yields the entries to add to the DDUF file. +... # Build a generator that yields the entries to add to the DDUF file. ... # The first element of the tuple is the filename in the DDUF archive (must use UNIX separator!). The second element is the content of the file. ... # Entries will be evaluated lazily when the DDUF file is created (only 1 entry is loaded in memory at a time) ... 
yield "vae/config.json", pipe.vae.to_json_string().encode() From 5cf560d7fa52893614922173ce1f0cb54570b019 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Thu, 12 Dec 2024 09:11:30 +0100 Subject: [PATCH 21/30] remove add_entry_to_dduf --- src/huggingface_hub/serialization/_dduf.py | 42 ---------------------- 1 file changed, 42 deletions(-) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index fa09390b65..d2e4e22c7a 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -256,48 +256,6 @@ def _iterate_over_folder() -> Iterable[Tuple[str, Path]]: export_entries_as_dduf(dduf_path, _iterate_over_folder()) -def add_entry_to_dduf( - dduf_path: Union[str, os.PathLike], filename: str, content: Union[str, os.PathLike, bytes] -) -> None: - """ - Add an entry to an existing DDUF file. - - Note: this method is not tested, not documented and not publicly exposed. Used for internal tests + future development. - - Args: - dduf_path (`str` or `os.PathLike`): - The path to the DDUF file to write. - filename (`str`): - The path to the file in the DDUF archive. - content (`str`, `Path` or `bytes`): - The content of the file to add to the DDUF archive. - - Raises: - - [`DDUFExportError`]: If the entry already exists in the DDUF file. - """ - dduf_path = str(dduf_path) - try: - filename = _validate_dduf_entry_name(filename) - except DDUFInvalidEntryNameError as e: - raise DDUFExportError(f"Invalid entry name: {filename}") from e - - # Ensure the zip file exists - try: - with zipfile.ZipFile(dduf_path, "r") as zf: - # Check if the file already exists in the zip - if filename in zf.namelist(): - raise DDUFExportError(f"Entry '{filename}' already exists in DDUF file.") - except FileNotFoundError: - # If the zip doesn't exist, create it - with zipfile.ZipFile(dduf_path, "w") as _: - pass - - # Reopen the zip in append mode and add the new file - with zipfile.ZipFile(dduf_path, "a", zipfile.ZIP_STORED) as archive: - logger.debug(f"Adding file '{filename}' to DDUF file") - _dump_content_in_archive(archive, filename, content) - - def _dump_content_in_archive(archive: zipfile.ZipFile, filename: str, content: Union[str, os.PathLike, bytes]) -> None: with archive.open(filename, "w", force_zip64=True) as archive_fh: if isinstance(content, (str, Path)): From e6c62da99804ef690985cbf1b4af7bef107cbcd2 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Thu, 12 Dec 2024 10:19:50 +0100 Subject: [PATCH 22/30] new rules: folders in model_index.json + config files in folders --- src/huggingface_hub/serialization/_dduf.py | 76 +++++++++++++++++++++- tests/test_dduf.py | 64 +++++++++++++++++- 2 files changed, 134 insertions(+), 6 deletions(-) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index d2e4e22c7a..747f0b58fc 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -1,3 +1,4 @@ +import json import logging import mmap import os @@ -6,7 +7,7 @@ from contextlib import contextmanager from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, Generator, Iterable, Tuple, Union +from typing import Any, Dict, Generator, Iterable, Tuple, Union from ..errors import DDUFCorruptedFileError, DDUFExportError, DDUFInvalidEntryNameError @@ -14,12 +15,20 @@ logger = logging.getLogger(__name__) DDUF_ALLOWED_ENTRIES = { + # Allowed file extensions in a DDUF file ".json", ".model", ".safetensors", 
".txt", } +DDUF_FOLDER_REQUIRED_ENTRIES = { + # Each folder must contain at least one of these entries + "config.json", + "tokenizer_config.json", + "image_processor.json", +} + @dataclass class DDUFEntry: @@ -135,8 +144,13 @@ def read_dduf_file(dduf_path: Union[os.PathLike, str]) -> Dict[str, DDUFEntry]: entries[info.filename] = DDUFEntry( filename=info.filename, offset=offset, length=info.file_size, dduf_path=dduf_path ) + + # Consistency checks on the DDUF file if "model_index.json" not in entries: raise DDUFCorruptedFileError("Missing required 'model_index.json' entry in DDUF file.") + index = json.loads(entries["model_index.json"].read_text()) + _validate_dduf_structure(index, entries.keys()) + logger.info(f"Done reading DDUF file {dduf_path}. Found {len(entries)} entries") return entries @@ -158,7 +172,7 @@ def export_entries_as_dduf( The content can be a string or a pathlib.Path representing a path to a file on the local disk or directly the content as bytes. Raises: - - [`DDUFExportError`]: If entry type is not supported (must be str, Path or bytes). + - [`DDUFExportError`]: If anything goes wrong during the export (e.g. invalid entry name, missing 'model_index.json', etc.). Example: ```python @@ -201,12 +215,19 @@ def export_entries_as_dduf( """ logger.info(f"Exporting DDUF file '{dduf_path}'") filenames = set() + index = None with zipfile.ZipFile(str(dduf_path), "w", zipfile.ZIP_STORED) as archive: for filename, content in entries: if filename in filenames: raise DDUFExportError(f"Can't add duplicate entry: {filename}") filenames.add(filename) + if filename == "model_index.json": + try: + index = json.loads(_load_content(content).decode()) + except json.JSONDecodeError as e: + raise DDUFExportError("Failed to parse 'model_index.json'.") from e + try: filename = _validate_dduf_entry_name(filename) except DDUFInvalidEntryNameError as e: @@ -214,8 +235,13 @@ def export_entries_as_dduf( logger.debug(f"Adding entry '{filename}' to DDUF file") _dump_content_in_archive(archive, filename, content) - if "model_index.json" not in filenames: + # Consistency checks on the DDUF file + if index is None: raise DDUFExportError("Missing required 'model_index.json' entry in DDUF file.") + try: + _validate_dduf_structure(index, filenames) + except DDUFCorruptedFileError as e: + raise DDUFExportError("Invalid DDUF file structure.") from e logger.info(f"Done writing DDUF file {dduf_path}") @@ -268,6 +294,19 @@ def _dump_content_in_archive(archive: zipfile.ZipFile, filename: str, content: U raise DDUFExportError(f"Invalid content type for {filename}. Must be str, Path or bytes.") +def _load_content(content: Union[str, Path, bytes]) -> bytes: + """Load the content of an entry as bytes. + + Used only for small checks (not to dump content into archive). + """ + if isinstance(content, (str, Path)): + return Path(content).read_bytes() + elif isinstance(content, bytes): + return content + else: + raise DDUFExportError(f"Invalid content type. Must be str, Path or bytes. Got {type(content)}.") + + def _validate_dduf_entry_name(entry_name: str) -> str: if "." + entry_name.split(".")[-1] not in DDUF_ALLOWED_ENTRIES: raise DDUFInvalidEntryNameError(f"File type not allowed: {entry_name}") @@ -279,6 +318,37 @@ def _validate_dduf_entry_name(entry_name: str) -> str: return entry_name +def _validate_dduf_structure(index: Any, entry_names: Iterable[str]) -> None: + """ + Consistency checks on the DDUF file structure. + + Rules: + - The 'model_index.json' entry is required and must contain a dictionary. 
+ - Each folder name must correspond to an entry in 'model_index.json'. + - Each folder must contain at least a config file ('config.json', 'tokenizer_config.json', 'image_processor.json'). + + Args: + index (Any): + The content of the 'model_index.json' entry. + entry_names (Iterable[str]): + The list of entry names in the DDUF file. + + Raises: + - [`DDUFCorruptedFileError`]: If the DDUF file is corrupted (i.e. doesn't follow the DDUF format). + """ + if not isinstance(index, dict): + raise DDUFCorruptedFileError(f"Invalid 'model_index.json' content. Must be a dictionary. Got {type(index)}.") + + dduf_folders = {entry.split("/")[0] for entry in entry_names if "/" in entry} + for folder in dduf_folders: + if folder not in index: + raise DDUFCorruptedFileError(f"Missing required entry '{folder}' in 'model_index.json'.") + if not any(f"{folder}/{required_entry}" in entry_names for required_entry in DDUF_FOLDER_REQUIRED_ENTRIES): + raise DDUFCorruptedFileError( + f"Missing required file in folder '{folder}'. Must contains at least one of {DDUF_FOLDER_REQUIRED_ENTRIES}." + ) + + def _get_data_offset(zf: zipfile.ZipFile, info: zipfile.ZipInfo) -> int: """ Calculate the data offset for a file in a ZIP archive. diff --git a/tests/test_dduf.py b/tests/test_dduf.py index 73b1d9e506..43f8d6c9fc 100644 --- a/tests/test_dduf.py +++ b/tests/test_dduf.py @@ -9,7 +9,9 @@ from huggingface_hub.errors import DDUFCorruptedFileError, DDUFExportError, DDUFInvalidEntryNameError from huggingface_hub.serialization._dduf import ( DDUFEntry, + _load_content, _validate_dduf_entry_name, + _validate_dduf_structure, export_entries_as_dduf, export_folder_as_dduf, read_dduf_file, @@ -62,6 +64,50 @@ def test_entry_name_no_nested_directory(self): with pytest.raises(DDUFInvalidEntryNameError): _validate_dduf_entry_name("foo/bar/dummy.json") # not more + def test_load_content(self, tmp_path: Path): + content = b"hello world" + path = tmp_path / "hello.txt" + path.write_bytes(content) + + assert _load_content(content) == content # from bytes + assert _load_content(path) == content # from Path + assert _load_content(str(path)) == content # from str + + def test_validate_dduf_structure_valid(self): + _validate_dduf_structure( + { # model_index.json content + "_some_key": "some_value", + "encoder": { + "config.json": {}, + "model.safetensors": {}, + }, + }, + { # entries in DDUF archive + "model_index.json", + "something.txt", + "encoder/config.json", + "encoder/model.safetensors", + }, + ) + + def test_validate_dduf_structure_not_a_dict(self): + with pytest.raises(DDUFCorruptedFileError, match="Must be a dictionary."): + _validate_dduf_structure(["not a dict"], {}) # content from 'model_index.json' + + def test_validate_dduf_structure_missing_folder(self): + with pytest.raises(DDUFCorruptedFileError, match="Missing required entry 'encoder' in 'model_index.json'."): + _validate_dduf_structure({}, {"encoder/config.json", "encoder/model.safetensors"}) + + def test_validate_dduf_structure_missing_config_file(self): + with pytest.raises(DDUFCorruptedFileError, match="Missing required file in folder 'encoder'."): + _validate_dduf_structure( + {"encoder": {}}, + { + "encoder/not_a_config.json", # expecting a config.json / tokenizer_config.json / image_processor.json + "encoder/model.safetensors", + }, + ) + class TestExportFolder: @pytest.fixture @@ -110,8 +156,12 @@ def dummy_entries(self, tmp_path: Path) -> Iterable[Tuple[str, Union[str, Path, ("hello.txt", b"hello world"), # raw bytes ] - def test_export_entries(self, tmp_path: 
Path, dummy_entries: Iterable[Tuple[str, Union[str, Path, bytes]]]): + def test_export_entries( + self, tmp_path: Path, dummy_entries: Iterable[Tuple[str, Union[str, Path, bytes]]], mocker: MockerFixture + ): + mock = mocker.patch("huggingface_hub.serialization._dduf._validate_dduf_structure") export_entries_as_dduf(tmp_path / "dummy.dduf", dummy_entries) + mock.assert_called_once_with({"foo": "bar"}, {"model_index.json", "model.safetensors", "hello.txt"}) with zipfile.ZipFile(tmp_path / "dummy.dduf", "r") as archive: assert archive.compression == zipfile.ZIP_STORED # uncompressed! @@ -128,7 +178,11 @@ def test_export_entries_invalid_name(self, tmp_path: Path): def test_export_entries_no_duplicate(self, tmp_path: Path): with pytest.raises(DDUFExportError, match="Can't add duplicate entry"): export_entries_as_dduf( - tmp_path / "dummy.dduf", [("model_index.json", b"content1"), ("model_index.json", b"content2")] + tmp_path / "dummy.dduf", + [ + ("model_index.json", b'{"key": "content1"}'), + ("model_index.json", b'{"key": "content2"}'), + ], ) def test_export_entries_model_index_required(self, tmp_path: Path): @@ -145,13 +199,17 @@ def dummy_dduf_file(self, tmp_path: Path) -> Path: archive.writestr("hello.txt", b"hello world") return tmp_path / "dummy.dduf" - def test_read_dduf_file(self, dummy_dduf_file: Path): + def test_read_dduf_file(self, dummy_dduf_file: Path, mocker: MockerFixture): + mock = mocker.patch("huggingface_hub.serialization._dduf._validate_dduf_structure") + entries = read_dduf_file(dummy_dduf_file) assert len(entries) == 3 index_entry = entries["model_index.json"] model_entry = entries["model.safetensors"] hello_entry = entries["hello.txt"] + mock.assert_called_once_with({"foo": "bar"}, {"model_index.json", "model.safetensors", "hello.txt"}) + assert index_entry.filename == "model_index.json" assert index_entry.dduf_path == dummy_dduf_file assert index_entry.read_text() == '{"foo": "bar"}' From 381ac7e0fbb732b36dae3fb4dab521e6b529db29 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Thu, 12 Dec 2024 10:42:07 +0100 Subject: [PATCH 23/30] add arg --- src/huggingface_hub/serialization/_dduf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index 747f0b58fc..23d4050572 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -179,7 +179,7 @@ def export_entries_as_dduf( # Export specific files from the local disk. >>> from huggingface_hub import export_entries_as_dduf >>> export_entries_as_dduf( - ... "stable-diffusion-v1-4-FP16.dduf", + ... dduf_path="stable-diffusion-v1-4-FP16.dduf", ... entries=[ # List entries to add to the DDUF file (here, only FP16 weights) ... ("model_index.json", "path/to/model_index.json"), ... ("vae/config.json", "path/to/vae/config.json"), @@ -210,7 +210,7 @@ def export_entries_as_dduf( ... yield "text_encoder/model.safetensors", safetensors.torch.save(pipe.text_encoder.state_dict()) ... # ... 
add more entries here - >>> export_entries_as_dduf("stable-diffusion-v1-4.dduf", entries=as_entries(pipe)) + >>> export_entries_as_dduf(dduf_path="stable-diffusion-v1-4.dduf", entries=as_entries(pipe)) ``` """ logger.info(f"Exporting DDUF file '{dduf_path}'") @@ -261,7 +261,7 @@ def export_folder_as_dduf(dduf_path: Union[str, os.PathLike], folder_path: Union Example: ```python >>> from huggingface_hub import export_folder_as_dduf - >>> export_folder_as_dduf("FLUX.1-dev.dduf", folder_path="path/to/FLUX.1-dev") + >>> export_folder_as_dduf(dduf_path="FLUX.1-dev.dduf", folder_path="path/to/FLUX.1-dev") ``` """ folder_path = Path(folder_path) From 76265b4d35be4f52bf8ef1254274cea259a5e930 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Thu, 12 Dec 2024 10:42:43 +0100 Subject: [PATCH 24/30] add arg --- docs/source/en/package_reference/serialization.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/package_reference/serialization.md b/docs/source/en/package_reference/serialization.md index b50c26ed10..c180114388 100644 --- a/docs/source/en/package_reference/serialization.md +++ b/docs/source/en/package_reference/serialization.md @@ -34,7 +34,7 @@ For more flexibility, you can use [`export_entries_as_dduf`] and pass a list of # Export specific files from the local disk. >>> from huggingface_hub import export_entries_as_dduf >>> export_entries_as_dduf( -... "stable-diffusion-v1-4-FP16.dduf", +... dduf_path="stable-diffusion-v1-4-FP16.dduf", ... entries=[ # List entries to add to the DDUF file (here, only FP16 weights) ... ("model_index.json", "path/to/model_index.json"), ... ("vae/config.json", "path/to/vae/config.json"), @@ -68,7 +68,7 @@ The `entries` parameter also supports passing an iterable of paths or bytes. Thi ... yield "text_encoder/model.safetensors", safetensors.torch.save(pipe.text_encoder.state_dict()) ... # ... add more entries here ->>> export_entries_as_dduf("stable-diffusion-v1-4.dduf", entries=as_entries(pipe)) +>>> export_entries_as_dduf(dduf_path="stable-diffusion-v1-4.dduf", entries=as_entries(pipe)) ``` **Note:** in practice, `diffusers` provides a method to directly serialize a pipeline in a DDUF file. The snippet above is only meant as an example. From d7962520821a791ea9811e3d4a76c0643736b947 Mon Sep 17 00:00:00 2001 From: Lucain Date: Thu, 12 Dec 2024 10:49:55 +0100 Subject: [PATCH 25/30] Update docs/source/en/package_reference/serialization.md Co-authored-by: Sayak Paul --- docs/source/en/package_reference/serialization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/package_reference/serialization.md b/docs/source/en/package_reference/serialization.md index c180114388..a94f94dc76 100644 --- a/docs/source/en/package_reference/serialization.md +++ b/docs/source/en/package_reference/serialization.md @@ -8,7 +8,7 @@ rendered properly in your Markdown viewer. ## DDUF file format -DDUF is a file format designed for diffusion models. It allows saving all the information to run a model in a single file. This work is inspired by the GGUF format. `huggingface_hub` provides helpers to save and load DDUF files, ensuring the file format is respected. +DDUF is a file format designed for diffusion models. It allows saving all the information to run a model in a single file. This work is inspired by the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) format. `huggingface_hub` provides helpers to save and load DDUF files, ensuring the file format is respected. 
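The structure checks added earlier (every folder in the archive must be declared in `model_index.json`) also apply on the read path. A minimal sketch of the failure mode, assuming a throwaway `broken.dduf` archive built by hand:

```python
import zipfile

from huggingface_hub import read_dduf_file
from huggingface_hub.errors import DDUFCorruptedFileError

# Hand-craft an uncompressed archive whose 'encoder/' folder is not declared
# in model_index.json.
with zipfile.ZipFile("broken.dduf", "w", zipfile.ZIP_STORED) as archive:
    archive.writestr("model_index.json", b"{}")
    archive.writestr("encoder/config.json", b"{}")

try:
    read_dduf_file("broken.dduf")
except DDUFCorruptedFileError as err:
    print(err)  # Missing required entry 'encoder' in 'model_index.json'.
```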
From 5c2bb63d422facf52a408e5b168e1deb21110c0b Mon Sep 17 00:00:00 2001 From: Lucain Date: Thu, 12 Dec 2024 11:57:24 +0100 Subject: [PATCH 26/30] Update docs/source/en/package_reference/serialization.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Célina --- docs/source/en/package_reference/serialization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/package_reference/serialization.md b/docs/source/en/package_reference/serialization.md index a94f94dc76..0fffd31413 100644 --- a/docs/source/en/package_reference/serialization.md +++ b/docs/source/en/package_reference/serialization.md @@ -77,7 +77,7 @@ The `entries` parameter also supports passing an iterable of paths or bytes. Thi ```python >>> import json ->>> import safetensors.load +>>> import safetensors.torch >>> from huggingface_hub import read_dduf_file # Read DDUF metadata From 360ddd1a22b2254aa1c2fc19dd2c4431c9653e00 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Thu, 12 Dec 2024 15:06:24 +0000 Subject: [PATCH 27/30] add scheduler config --- src/huggingface_hub/serialization/_dduf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index 23d4050572..20cdcea38d 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -27,6 +27,7 @@ "config.json", "tokenizer_config.json", "image_processor.json", + "scheduler_config.json" } From c168e23aa7716c1db52f8d926c5288969d580031 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Thu, 12 Dec 2024 16:08:47 +0100 Subject: [PATCH 28/30] scheduler_config --- src/huggingface_hub/serialization/_dduf.py | 2 +- tests/test_dduf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index 20cdcea38d..202868893f 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -326,7 +326,7 @@ def _validate_dduf_structure(index: Any, entry_names: Iterable[str]) -> None: Rules: - The 'model_index.json' entry is required and must contain a dictionary. - Each folder name must correspond to an entry in 'model_index.json'. - - Each folder must contain at least a config file ('config.json', 'tokenizer_config.json', 'image_processor.json'). + - Each folder must contain at least a config file ('config.json', 'tokenizer_config.json', 'image_processor.json', 'scheduler_config.json'). 
Args: index (Any): diff --git a/tests/test_dduf.py b/tests/test_dduf.py index 43f8d6c9fc..6b1fdff845 100644 --- a/tests/test_dduf.py +++ b/tests/test_dduf.py @@ -103,7 +103,7 @@ def test_validate_dduf_structure_missing_config_file(self): _validate_dduf_structure( {"encoder": {}}, { - "encoder/not_a_config.json", # expecting a config.json / tokenizer_config.json / image_processor.json + "encoder/not_a_config.json", # expecting a config.json / tokenizer_config.json / image_processor.json / scheduler_config.json "encoder/model.safetensors", }, ) From 4553f4c7e476f56b15fb737ade4d73f2bd417f8b Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Thu, 12 Dec 2024 16:09:24 +0100 Subject: [PATCH 29/30] style --- src/huggingface_hub/serialization/_dduf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index 202868893f..ac457225e6 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -27,7 +27,7 @@ "config.json", "tokenizer_config.json", "image_processor.json", - "scheduler_config.json" + "scheduler_config.json", } From 6ad29e77104d28d07673459678fef38fbaad34bf Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 13 Dec 2024 10:40:32 +0100 Subject: [PATCH 30/30] preprocessor_config.json --- src/huggingface_hub/serialization/_dduf.py | 4 ++-- tests/test_dduf.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py index ac457225e6..a1debadb3a 100644 --- a/src/huggingface_hub/serialization/_dduf.py +++ b/src/huggingface_hub/serialization/_dduf.py @@ -26,7 +26,7 @@ # Each folder must contain at least one of these entries "config.json", "tokenizer_config.json", - "image_processor.json", + "preprocessor_config.json", "scheduler_config.json", } @@ -326,7 +326,7 @@ def _validate_dduf_structure(index: Any, entry_names: Iterable[str]) -> None: Rules: - The 'model_index.json' entry is required and must contain a dictionary. - Each folder name must correspond to an entry in 'model_index.json'. - - Each folder must contain at least a config file ('config.json', 'tokenizer_config.json', 'image_processor.json', 'scheduler_config.json'). + - Each folder must contain at least a config file ('config.json', 'tokenizer_config.json', 'preprocessor_config.json', 'scheduler_config.json'). Args: index (Any): diff --git a/tests/test_dduf.py b/tests/test_dduf.py index 6b1fdff845..ece8aa9dfc 100644 --- a/tests/test_dduf.py +++ b/tests/test_dduf.py @@ -103,7 +103,7 @@ def test_validate_dduf_structure_missing_config_file(self): _validate_dduf_structure( {"encoder": {}}, { - "encoder/not_a_config.json", # expecting a config.json / tokenizer_config.json / image_processor.json / scheduler_config.json + "encoder/not_a_config.json", # expecting a config.json / tokenizer_config.json / preprocessor_config.json / scheduler_config.json "encoder/model.safetensors", }, )
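With the final set of rules in place (a required `model_index.json`, folders declared in the index, and at least one of `config.json` / `tokenizer_config.json` / `preprocessor_config.json` / `scheduler_config.json` per folder), a round trip looks roughly like the sketch below. File names and byte contents are placeholders, not real model weights:

```python
from huggingface_hub import export_entries_as_dduf, read_dduf_file

entries = [
    ("model_index.json", b'{"text_encoder": ["transformers", "CLIPTextModel"], "scheduler": ["diffusers", "FlowMatchEulerDiscreteScheduler"]}'),
    ("text_encoder/config.json", b"{}"),
    ("text_encoder/model.safetensors", b"dummy bytes, not a real checkpoint"),
    ("scheduler/scheduler_config.json", b"{}"),
]
export_entries_as_dduf(dduf_path="toy.dduf", entries=entries)

dduf_entries = read_dduf_file("toy.dduf")
print(sorted(dduf_entries))  # all four entries are listed
print(dduf_entries["text_encoder/config.json"].read_text())  # "{}"
```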