huggingface
diff --git a/‎docs/source/en/_toctree.yml‎
Lines changed: 6 additions & 2 deletions b/‎docs/source/en/_toctree.yml‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎docs/source/en/api/models/z_image_transformer2d.md‎
Lines changed: 19 additions & 0 deletions b/‎docs/source/en/api/models/z_image_transformer2d.md‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎docs/source/en/api/pipelines/kandinsky5_image.md‎
Lines changed: 5 additions & 1 deletion b/‎docs/source/en/api/pipelines/kandinsky5_image.md‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎docs/source/en/api/pipelines/kandinsky5_video.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/source/en/api/pipelines/kandinsky5_video.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/source/en/api/pipelines/z_image.md‎
Lines changed: 33 additions & 0 deletions b/‎docs/source/en/api/pipelines/z_image.md‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎docs/source/en/optimization/attention_backends.md‎
Lines changed: 2 additions & 2 deletions b/‎docs/source/en/optimization/attention_backends.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/diffusers/hooks/group_offloading.py‎
Lines changed: 106 additions & 51 deletions b/‎src/diffusers/hooks/group_offloading.py‎
Lines changed: 106 additions & 51 deletions
diff --git a/‎src/diffusers/loaders/lora_conversion_utils.py‎
Lines changed: 11 additions & 0 deletions b/‎src/diffusers/loaders/lora_conversion_utils.py‎
Lines changed: 11 additions & 0 deletions
@@ -401,6 +401,8 @@
         title: WanAnimateTransformer3DModel
       - local: api/models/wan_transformer_3d
         title: WanTransformer3DModel
+      - local: api/models/z_image_transformer2d
+        title: ZImageTransformer2DModel        
       title: Transformers
     - sections:
       - local: api/models/stable_cascade_unet
@@ -551,6 +553,8 @@
         title: Kandinsky 2.2
       - local: api/pipelines/kandinsky3
         title: Kandinsky 3
+      - local: api/pipelines/kandinsky5_image
+        title: Kandinsky 5.0 Image
       - local: api/pipelines/kolors
         title: Kolors
       - local: api/pipelines/latent_consistency_models
@@ -646,6 +650,8 @@
         title: VisualCloze
       - local: api/pipelines/wuerstchen
         title: Wuerstchen
+      - local: api/pipelines/z_image
+        title: Z-Image        
       title: Image
     - sections:
       - local: api/pipelines/allegro
@@ -664,8 +670,6 @@
         title: HunyuanVideo1.5
       - local: api/pipelines/i2vgenxl
         title: I2VGen-XL
-      - local: api/pipelines/kandinsky5_image
-        title: Kandinsky 5.0 Image
       - local: api/pipelines/kandinsky5_video
         title: Kandinsky 5.0 Video
       - local: api/pipelines/latte
 
@@ -0,0 +1,19 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# ZImageTransformer2DModel
+
+A Transformer model for image-like data from [Z-Image](https://huggingface.co/Tongyi-MAI/Z-Image-Turbo).
+
+## ZImageTransformer2DModel
+
+[[autodoc]] ZImageTransformer2DModel
@@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License.
 
 [Kandinsky 5.0](https://arxiv.org/abs/2511.14993) is a family of diffusion models for Video & Image generation. 
 
-Kandinsky 5.0 Image Lite is a lightweight image generation model (6B parameters) 
+Kandinsky 5.0 Image Lite is a lightweight image generation model (6B parameters).
 
 The model introduces several key innovations:
 - **Latent diffusion pipeline** with **Flow Matching** for improved training stability
@@ -21,10 +21,14 @@ The model introduces several key innovations:
 
 The original codebase can be found at [kandinskylab/Kandinsky-5](https://github.com/kandinskylab/Kandinsky-5).
 
+> [!TIP]
+> Check out the [Kandinsky Lab](https://huggingface.co/kandinskylab) organization on the Hub for the official model checkpoints for text-to-video generation, including pretrained, SFT, no-CFG, and distilled variants.
+
 
 ## Available Models
 
 Kandinsky 5.0 Image Lite:
+
 | model_id | Description | Use Cases |
 |------------|-------------|-----------|
 | [**kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers**](https://huggingface.co/kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers) | 6B image Supervised Fine-Tuned model | Highest generation quality |
 
@@ -30,6 +30,7 @@ The original codebase can be found at [kandinskylab/Kandinsky-5](https://github.
 ## Available Models
 
 Kandinsky 5.0 T2V Pro:
+
 | model_id | Description | Use Cases |
 |------------|-------------|-----------|
 | **kandinskylab/Kandinsky-5.0-T2V-Pro-sft-5s-Diffusers** | 5 second Text-to-Video Pro model | High-quality text-to-video generation |
 
@@ -0,0 +1,33 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Z-Image
+
+<div class="flex flex-wrap space-x-1">
+  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
+</div>
+
+[Z-Image](https://huggingface.co/papers/2511.22699) is a powerful and highly efficient image generation model with 6B parameters. Currently there's only one model with two more to be released:
+
+|Model|Hugging Face|
+|---|---|
+|Z-Image-Turbo|https://huggingface.co/Tongyi-MAI/Z-Image-Turbo|
+
+## Z-Image-Turbo
+
+Z-Image-Turbo is a distilled version of Z-Image that matches or exceeds leading competitors with only 8 NFEs (Number of Function Evaluations). It offers sub-second inference latency on enterprise-grade H800 GPUs and fits comfortably within 16G VRAM consumer devices. It excels in photorealistic image generation, bilingual text rendering (English & Chinese), and robust instruction adherence.
+
+## ZImagePipeline
+
+[[autodoc]] ZImagePipeline
+	- all
+	- __call__
@@ -32,7 +32,7 @@ This guide will show you how to set and use the different attention backends.
 
 The [`~ModelMixin.set_attention_backend`] method iterates through all the modules in the model and sets the appropriate attention backend to use. The attention backend setting persists until [`~ModelMixin.reset_attention_backend`] is called.
 
-The example below demonstrates how to enable the `_flash_3_hub` implementation for FlashAttention-3 from the [kernel](https://github.com/huggingface/kernels) library, which allows you to instantly use optimized compute kernels from the Hub without requiring any setup.
+The example below demonstrates how to enable the `_flash_3_hub` implementation for FlashAttention-3 from the [`kernels`](https://github.com/huggingface/kernels) library, which allows you to instantly use optimized compute kernels from the Hub without requiring any setup.
 
 > [!NOTE]
 > FlashAttention-3 is not supported for non-Hopper architectures, in which case, use FlashAttention with `set_attention_backend("flash")`.
@@ -156,4 +156,4 @@ Refer to the table below for a complete list of available attention backends and
 | `_sage_qk_int8_pv_fp16_triton` | [SageAttention](https://github.com/thu-ml/SageAttention) | INT8 QK + FP16 PV (Triton) |
 | `xformers` | [xFormers](https://github.com/facebookresearch/xformers) | Memory-efficient attention |
 
-</details>
+</details>
@@ -15,7 +15,7 @@
 import hashlib
 import os
 from contextlib import contextmanager, nullcontext
-from dataclasses import dataclass
+from dataclasses import dataclass, replace
 from enum import Enum
 from typing import Dict, List, Optional, Set, Tuple, Union
 
@@ -59,6 +59,9 @@ class GroupOffloadingConfig:
     num_blocks_per_group: Optional[int] = None
     offload_to_disk_path: Optional[str] = None
     stream: Optional[Union[torch.cuda.Stream, torch.Stream]] = None
+    block_modules: Optional[List[str]] = None
+    exclude_kwargs: Optional[List[str]] = None
+    module_prefix: Optional[str] = ""
 
 
 class ModuleGroup:
@@ -77,7 +80,7 @@ def __init__(
         low_cpu_mem_usage: bool = False,
         onload_self: bool = True,
         offload_to_disk_path: Optional[str] = None,
-        group_id: Optional[int] = None,
+        group_id: Optional[Union[int, str]] = None,
     ) -> None:
         self.modules = modules
         self.offload_device = offload_device
@@ -322,7 +325,21 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
                 self.group.stream.synchronize()
 
         args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)
-        kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
+
+        # Some Autoencoder models use a feature cache that is passed through submodules
+        # and modified in place. The `send_to_device` call returns a copy of this feature cache object
+        # which breaks the inplace updates. Use `exclude_kwargs` to mark these cache features
+        exclude_kwargs = self.config.exclude_kwargs or []
+        if exclude_kwargs:
+            moved_kwargs = send_to_device(
+                {k: v for k, v in kwargs.items() if k not in exclude_kwargs},
+                self.group.onload_device,
+                non_blocking=self.group.non_blocking,
+            )
+            kwargs.update(moved_kwargs)
+        else:
+            kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
+
         return args, kwargs
 
     def post_forward(self, module: torch.nn.Module, output):
@@ -455,6 +472,8 @@ def apply_group_offloading(
     record_stream: bool = False,
     low_cpu_mem_usage: bool = False,
     offload_to_disk_path: Optional[str] = None,
+    block_modules: Optional[List[str]] = None,
+    exclude_kwargs: Optional[List[str]] = None,
 ) -> None:
     r"""
     Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is, and
@@ -512,6 +531,13 @@ def apply_group_offloading(
             If True, the CPU memory usage is minimized by pinning tensors on-the-fly instead of pre-pinning them. This
             option only matters when using streamed CPU offloading (i.e. `use_stream=True`). This can be useful when
             the CPU memory is a bottleneck but may counteract the benefits of using streams.
+        block_modules (`List[str]`, *optional*):
+            List of module names that should be treated as blocks for offloading. If provided, only these modules will
+            be considered for block-level offloading. If not provided, the default block detection logic will be used.
+        exclude_kwargs (`List[str]`, *optional*):
+            List of kwarg keys that should not be processed by send_to_device. This is useful for mutable state like
+            caching lists that need to maintain their object identity across forward passes. If not provided, will be
+            inferred from the module's `_skip_keys` attribute if it exists.
 
     Example:
         ```python
@@ -553,6 +579,12 @@ def apply_group_offloading(
 
     _raise_error_if_accelerate_model_or_sequential_hook_present(module)
 
+    if block_modules is None:
+        block_modules = getattr(module, "_group_offload_block_modules", None)
+
+    if exclude_kwargs is None:
+        exclude_kwargs = getattr(module, "_skip_keys", None)
+
     config = GroupOffloadingConfig(
         onload_device=onload_device,
         offload_device=offload_device,
@@ -563,6 +595,8 @@ def apply_group_offloading(
         record_stream=record_stream,
         low_cpu_mem_usage=low_cpu_mem_usage,
         offload_to_disk_path=offload_to_disk_path,
+        block_modules=block_modules,
+        exclude_kwargs=exclude_kwargs,
     )
     _apply_group_offloading(module, config)
 
@@ -578,46 +612,66 @@ def _apply_group_offloading(module: torch.nn.Module, config: GroupOffloadingConf
 
 def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
     r"""
-    This function applies offloading to groups of torch.nn.ModuleList or torch.nn.Sequential blocks. In comparison to
-    the "leaf_level" offloading, which is more fine-grained, this offloading is done at the top-level blocks.
-    """
+    This function applies offloading to groups of torch.nn.ModuleList or torch.nn.Sequential blocks, and explicitly
+    defined block modules. In comparison to the "leaf_level" offloading, which is more fine-grained, this offloading is
+    done at the top-level blocks and modules specified in block_modules.
 
+    When block_modules is provided, only those modules will be treated as blocks for offloading. For each specified
+    module, recursively apply block offloading to it.
+    """
     if config.stream is not None and config.num_blocks_per_group != 1:
         logger.warning(
             f"Using streams is only supported for num_blocks_per_group=1. Got {config.num_blocks_per_group=}. Setting it to 1."
         )
         config.num_blocks_per_group = 1
 
-    # Create module groups for ModuleList and Sequential blocks
+    block_modules = set(config.block_modules) if config.block_modules is not None else set()
+
+    # Create module groups for ModuleList and Sequential blocks, and explicitly defined block modules
     modules_with_group_offloading = set()
     unmatched_modules = []
     matched_module_groups = []
+
     for name, submodule in module.named_children():
-        if not isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
-            unmatched_modules.append((name, submodule))
+        # Check if this is an explicitly defined block module
+        if name in block_modules:
+            # Track submodule using a prefix to avoid filename collisions during disk offload.
+            # Without this, submodules sharing the same model class would be assigned identical
+            # filenames (derived from the class name).
+            prefix = f"{config.module_prefix}{name}." if config.module_prefix else f"{name}."
+            submodule_config = replace(config, module_prefix=prefix)
+
+            _apply_group_offloading_block_level(submodule, submodule_config)
             modules_with_group_offloading.add(name)
-            continue
 
-        for i in range(0, len(submodule), config.num_blocks_per_group):
-            current_modules = submodule[i : i + config.num_blocks_per_group]
-            group_id = f"{name}_{i}_{i + len(current_modules) - 1}"
-            group = ModuleGroup(
-                modules=current_modules,
-                offload_device=config.offload_device,
-                onload_device=config.onload_device,
-                offload_to_disk_path=config.offload_to_disk_path,
-                offload_leader=current_modules[-1],
-                onload_leader=current_modules[0],
-                non_blocking=config.non_blocking,
-                stream=config.stream,
-                record_stream=config.record_stream,
-                low_cpu_mem_usage=config.low_cpu_mem_usage,
-                onload_self=True,
-                group_id=group_id,
-            )
-            matched_module_groups.append(group)
-            for j in range(i, i + len(current_modules)):
-                modules_with_group_offloading.add(f"{name}.{j}")
+        elif isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
+            # Handle ModuleList and Sequential blocks as before
+            for i in range(0, len(submodule), config.num_blocks_per_group):
+                current_modules = list(submodule[i : i + config.num_blocks_per_group])
+                if len(current_modules) == 0:
+                    continue
+
+                group_id = f"{config.module_prefix}{name}_{i}_{i + len(current_modules) - 1}"
+                group = ModuleGroup(
+                    modules=current_modules,
+                    offload_device=config.offload_device,
+                    onload_device=config.onload_device,
+                    offload_to_disk_path=config.offload_to_disk_path,
+                    offload_leader=current_modules[-1],
+                    onload_leader=current_modules[0],
+                    non_blocking=config.non_blocking,
+                    stream=config.stream,
+                    record_stream=config.record_stream,
+                    low_cpu_mem_usage=config.low_cpu_mem_usage,
+                    onload_self=True,
+                    group_id=group_id,
+                )
+                matched_module_groups.append(group)
+                for j in range(i, i + len(current_modules)):
+                    modules_with_group_offloading.add(f"{name}.{j}")
+        else:
+            # This is an unmatched module
+            unmatched_modules.append((name, submodule))
 
     # Apply group offloading hooks to the module groups
     for i, group in enumerate(matched_module_groups):
@@ -632,28 +686,29 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
     parameters = [param for _, param in parameters]
     buffers = [buffer for _, buffer in buffers]
 
-    # Create a group for the unmatched submodules of the top-level module so that they are on the correct
-    # device when the forward pass is called.
+    # Create a group for the remaining unmatched submodules of the top-level
+    # module so that they are on the correct device when the forward pass is called.
     unmatched_modules = [unmatched_module for _, unmatched_module in unmatched_modules]
-    unmatched_group = ModuleGroup(
-        modules=unmatched_modules,
-        offload_device=config.offload_device,
-        onload_device=config.onload_device,
-        offload_to_disk_path=config.offload_to_disk_path,
-        offload_leader=module,
-        onload_leader=module,
-        parameters=parameters,
-        buffers=buffers,
-        non_blocking=False,
-        stream=None,
-        record_stream=False,
-        onload_self=True,
-        group_id=f"{module.__class__.__name__}_unmatched_group",
-    )
-    if config.stream is None:
-        _apply_group_offloading_hook(module, unmatched_group, config=config)
-    else:
-        _apply_lazy_group_offloading_hook(module, unmatched_group, config=config)
+    if len(unmatched_modules) > 0 or len(parameters) > 0 or len(buffers) > 0:
+        unmatched_group = ModuleGroup(
+            modules=unmatched_modules,
+            offload_device=config.offload_device,
+            onload_device=config.onload_device,
+            offload_to_disk_path=config.offload_to_disk_path,
+            offload_leader=module,
+            onload_leader=module,
+            parameters=parameters,
+            buffers=buffers,
+            non_blocking=False,
+            stream=None,
+            record_stream=False,
+            onload_self=True,
+            group_id=f"{config.module_prefix}{module.__class__.__name__}_unmatched_group",
+        )
+        if config.stream is None:
+            _apply_group_offloading_hook(module, unmatched_group, config=config)
+        else:
+            _apply_lazy_group_offloading_hook(module, unmatched_group, config=config)
 
 
 def _apply_group_offloading_leaf_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
 
@@ -2417,6 +2417,17 @@ def convert_key(key: str) -> str:
 
         state_dict = {convert_key(k): v for k, v in state_dict.items()}
 
+    def normalize_out_key(k: str) -> str:
+        if ".to_out" in k:
+            return k
+        return re.sub(
+            r"\.out(?=\.(?:lora_down|lora_up)\.weight$|\.alpha$)",
+            ".to_out.0",
+            k,
+        )
+
+    state_dict = {normalize_out_key(k): v for k, v in state_dict.items()}
+
     has_default = any("default." in k for k in state_dict)
     if has_default:
         state_dict = {k.replace("default.", ""): v for k, v in state_dict.items()}