From 016367da93cd2f61542446852b35a73a1d690eeb Mon Sep 17 00:00:00 2001
From: Yu Zheng <zhengyu.davy@foxmail.com>
Date: Tue, 22 Oct 2024 14:45:17 -0400
Subject: [PATCH 01/15] support sd3.5 in controlnet

---
 src/diffusers/models/controlnet_sd3.py         |  6 ++++++
 .../controlnet_sd3/test_controlnet_sd3.py      | 18 +++++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/models/controlnet_sd3.py b/src/diffusers/models/controlnet_sd3.py
index 43b52a645a0d..9bd862017c6d 100644
--- a/src/diffusers/models/controlnet_sd3.py
+++ b/src/diffusers/models/controlnet_sd3.py
@@ -55,6 +55,10 @@ def __init__(
         pooled_projection_dim: int = 2048,
         out_channels: int = 16,
         pos_embed_max_size: int = 96,
+        dual_attention_layers: Tuple[
+            int, ...
+        ] = (),  # () for sd3.0; (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) for sd3.5
+        qk_norm: Optional[str] = None,
         extra_conditioning_channels: int = 0,
     ):
         super().__init__()
@@ -84,6 +88,8 @@ def __init__(
                     num_attention_heads=num_attention_heads,
                     attention_head_dim=self.config.attention_head_dim,
                     context_pre_only=False,
+                    qk_norm=qk_norm,
+                    use_dual_attention=True if i in dual_attention_layers else False,
                 )
                 for i in range(num_layers)
             ]
diff --git a/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py b/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
index aae1dc0ebcb0..16faa1f21dd4 100644
--- a/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
+++ b/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
@@ -59,7 +59,7 @@ class StableDiffusion3ControlNetPipelineFastTests(unittest.TestCase, PipelineTes
     )
     batch_params = frozenset(["prompt", "negative_prompt"])
 
-    def get_dummy_components(self):
+    def get_dummy_components(self, qk_norm=None, use_dual_attention=False):
         torch.manual_seed(0)
         transformer = SD3Transformer2DModel(
             sample_size=32,
@@ -72,6 +72,8 @@ def get_dummy_components(self):
             caption_projection_dim=32,
             pooled_projection_dim=64,
             out_channels=8,
+            qk_norm=qk_norm,
+            dual_attention_layers=() if not use_dual_attention else (0, 1),
         )
 
         torch.manual_seed(0)
@@ -86,7 +88,10 @@ def get_dummy_components(self):
             caption_projection_dim=32,
             pooled_projection_dim=64,
             out_channels=8,
+            qk_norm=qk_norm,
+            dual_attention_layers=() if not use_dual_attention else (0,),
         )
+
         clip_text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
@@ -171,8 +176,7 @@ def get_dummy_inputs(self, device, seed=0):
 
         return inputs
 
-    def test_controlnet_sd3(self):
-        components = self.get_dummy_components()
+    def run_pipe(self, components):
         sd_pipe = StableDiffusion3ControlNetPipeline(**components)
         sd_pipe = sd_pipe.to(torch_device, dtype=torch.float16)
         sd_pipe.set_progress_bar_config(disable=None)
@@ -191,6 +195,14 @@ def test_controlnet_sd3(self):
             np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         ), f"Expected: {expected_slice}, got: {image_slice.flatten()}"
 
+    def test_controlnet_sd3(self):
+        components = self.get_dummy_components()
+        self.run_pipe(components)
+
+    def test_controlnet_sd35(self):
+        components = self.get_dummy_components(qk_norm="rms_norm", use_dual_attention=True)
+        self.run_pipe(components)
+
     @unittest.skip("xFormersAttnProcessor does not work with SD3 Joint Attention")
     def test_xformers_attention_forwardGenerator_pass(self):
         pass

From 41664d6b617a054846c178005e6855b0d4744afd Mon Sep 17 00:00:00 2001
From: Yu Zheng <zhengyu.davy@foxmail.com>
Date: Tue, 22 Oct 2024 15:17:15 -0400
Subject: [PATCH 02/15] fix train controlnet sd3 from transformer

---
 examples/controlnet/train_controlnet_sd3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/controlnet/train_controlnet_sd3.py b/examples/controlnet/train_controlnet_sd3.py
index 2bb68220e268..b590e902c48f 100644
--- a/examples/controlnet/train_controlnet_sd3.py
+++ b/examples/controlnet/train_controlnet_sd3.py
@@ -986,7 +986,7 @@ def main(args):
         controlnet = SD3ControlNetModel.from_pretrained(args.controlnet_model_name_or_path)
     else:
         logger.info("Initializing controlnet weights from transformer")
-        controlnet = SD3ControlNetModel.from_transformer(transformer)
+        controlnet = SD3ControlNetModel.from_transformer(transformer, num_extra_conditioning_channels=0)
 
     transformer.requires_grad_(False)
     vae.requires_grad_(False)

From f35a31efe9edc8775c444163eb6a6222127c0820 Mon Sep 17 00:00:00 2001
From: Yu Zheng <zhengyu.davy@foxmail.com>
Date: Tue, 22 Oct 2024 15:17:51 -0400
Subject: [PATCH 03/15] add test script for training controlnet sd3.5

---
 examples/controlnet/test_controlnet.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/examples/controlnet/test_controlnet.py b/examples/controlnet/test_controlnet.py
index 3c508f80f1a4..7f744e3b284d 100644
--- a/examples/controlnet/test_controlnet.py
+++ b/examples/controlnet/test_controlnet.py
@@ -138,6 +138,27 @@ def test_controlnet_sd3(self):
             self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))
 
 
+class ControlNetSD35(ExamplesTestsAccelerate):
+    def test_controlnet_sd3(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+            examples/controlnet/train_controlnet_sd3.py
+            --pretrained_model_name_or_path=DavyMorgan/tiny-sd35-pipe
+            --dataset_name=hf-internal-testing/fill10
+            --output_dir={tmpdir}
+            --resolution=64
+            --train_batch_size=1
+            --gradient_accumulation_steps=1
+            --controlnet_model_name_or_path=DavyMorgan/tiny-controlnet-sd35
+            --max_train_steps=4
+            --checkpointing_steps=2
+            """.split()
+
+            run_command(self._launch_args + test_args)
+
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))
+
+
 class ControlNetflux(ExamplesTestsAccelerate):
     def test_controlnet_flux(self):
         with tempfile.TemporaryDirectory() as tmpdir:

From b9b70288e2d6ddcf4531dc484342976ef84a39a0 Mon Sep 17 00:00:00 2001
From: Yu Zheng <zhengyu.davy@foxmail.com>
Date: Tue, 22 Oct 2024 17:07:28 -0400
Subject: [PATCH 04/15] fix controlnet

---
 src/diffusers/models/transformers/transformer_sd3.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/models/transformers/transformer_sd3.py b/src/diffusers/models/transformers/transformer_sd3.py
index b28350b8ed9c..45688f41acaa 100644
--- a/src/diffusers/models/transformers/transformer_sd3.py
+++ b/src/diffusers/models/transformers/transformer_sd3.py
@@ -11,8 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
+import math
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
@@ -344,7 +343,7 @@ def custom_forward(*inputs):
 
             # controlnet residual
             if block_controlnet_hidden_states is not None and block.context_pre_only is False:
-                interval_control = len(self.transformer_blocks) // len(block_controlnet_hidden_states)
+                interval_control = int(math.ceil(len(self.transformer_blocks) / len(block_controlnet_hidden_states)))
                 hidden_states = hidden_states + block_controlnet_hidden_states[index_block // interval_control]
 
         hidden_states = self.norm_out(hidden_states, temb)

From 6ac42510ec05f54b4be7e438e204a3dbd875e91a Mon Sep 17 00:00:00 2001
From: Yu Zheng <zhengyu.davy@foxmail.com>
Date: Tue, 22 Oct 2024 17:37:24 -0400
Subject: [PATCH 05/15] add sd3.5 to readme

---
 examples/controlnet/README_sd3.md | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/examples/controlnet/README_sd3.md b/examples/controlnet/README_sd3.md
index 7a7b4841125f..c48ab8b950df 100644
--- a/examples/controlnet/README_sd3.md
+++ b/examples/controlnet/README_sd3.md
@@ -1,6 +1,6 @@
-# ControlNet training example for Stable Diffusion 3 (SD3)
+# ControlNet training example for Stable Diffusion 3/3.5 (SD3/3.5)
 
-The `train_controlnet_sd3.py` script shows how to implement the ControlNet training procedure and adapt it for [Stable Diffusion 3](https://arxiv.org/abs/2403.03206).
+The `train_controlnet_sd3.py` script shows how to implement the ControlNet training procedure and adapt it for [Stable Diffusion 3](https://arxiv.org/abs/2403.03206) and [Stable Diffusion 3.5](https://stability.ai/news/introducing-stable-diffusion-3-5).
 
 ## Running locally with PyTorch
 
@@ -51,9 +51,9 @@ Please download the dataset and unzip it in the directory `fill50k` in the `exam
 
 ## Training
 
-First download the SD3 model from [Hugging Face Hub](https://huggingface.co/stabilityai/stable-diffusion-3-medium). We will use it as a base model for the ControlNet training.
+First download the SD3 model from [Hugging Face Hub](https://huggingface.co/stabilityai/stable-diffusion-3-medium) or the SD3.5 model from [Hugging Face Hub](https://huggingface.co/stabilityai/stable-diffusion-3.5-large). We will use it as a base model for the ControlNet training.
 > [!NOTE]
-> As the model is gated, before using it with diffusers you first need to go to the [Stable Diffusion 3 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
+> As the model is gated, before using it with diffusers you first need to go to the [Stable Diffusion 3 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers) or [Stable Diffusion 3.5 Large Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3.5-large), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
 
 ```bash
 huggingface-cli login
@@ -73,7 +73,7 @@ wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/ma
 Then run the following commands to train a ControlNet model.
 
 ```bash
-export MODEL_DIR="stabilityai/stable-diffusion-3-medium-diffusers"
+export MODEL_DIR="stabilityai/stable-diffusion-3-medium"
 export OUTPUT_DIR="sd3-controlnet-out"
 
 accelerate launch train_controlnet_sd3.py \
@@ -90,6 +90,8 @@ accelerate launch train_controlnet_sd3.py \
     --gradient_accumulation_steps=4
 ```
 
+To train a ControlNet model for Stable Diffusion 3.5, replace the `MODEL_DIR` with `stabilityai/stable-diffusion-3.5-large`.
+
 To better track our training experiments, we're using flags `validation_image`, `validation_prompt`, and `validation_steps` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
 
 Our experiments were conducted on a single 40GB A100 GPU.
@@ -103,7 +105,7 @@ from diffusers import StableDiffusion3ControlNetPipeline, SD3ControlNetModel
 from diffusers.utils import load_image
 import torch
 
-base_model_path = "stabilityai/stable-diffusion-3-medium-diffusers"
+base_model_path = "stabilityai/stable-diffusion-3-medium"
 controlnet_path = "DavyMorgan/sd3-controlnet-out"
 
 controlnet = SD3ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)
@@ -124,6 +126,8 @@ image = pipe(
 image.save("./output.png")
 ```
 
+Similarly, for SD3.5, replace the `base_model_path` with `stabilityai/stable-diffusion-3.5-large` and controlnet_path `DavyMorgan/sd35-controlnet-out'.
+
 ## Notes
 
 ### GPU usage
@@ -135,6 +139,8 @@ Make sure to use the right GPU when configuring the [accelerator](https://huggin
 
 ## Example results
 
+### SD3
+
 #### After 500 steps with batch size 8
 
 | |  |

From fb1ae0ce5f4d5e03b8b9befb801dd40512d6aa3b Mon Sep 17 00:00:00 2001
From: Yu Zheng <zhengyu.davy@foxmail.com>
Date: Fri, 25 Oct 2024 21:49:22 -0400
Subject: [PATCH 06/15] add preprocessing batchsize

---
 examples/controlnet/train_controlnet_sd3.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/examples/controlnet/train_controlnet_sd3.py b/examples/controlnet/train_controlnet_sd3.py
index b590e902c48f..069cd064e16f 100644
--- a/examples/controlnet/train_controlnet_sd3.py
+++ b/examples/controlnet/train_controlnet_sd3.py
@@ -539,6 +539,9 @@ def parse_args(input_args=None):
         default=77,
         help="Maximum sequence length to use with with the T5 text encoder",
     )
+    parser.add_argument(
+        "--dataset_preprocess_batch_size", type=int, default=1000, help="Batch size for preprocessing dataset."
+    )
     parser.add_argument(
         "--validation_prompt",
         type=str,
@@ -1123,7 +1126,11 @@ def compute_text_embeddings(batch, text_encoders, tokenizers):
         # fingerprint used by the cache for the other processes to load the result
         # details: https://github.com/huggingface/diffusers/pull/4038#discussion_r1266078401
         new_fingerprint = Hasher.hash(args)
-        train_dataset = train_dataset.map(compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint)
+        train_dataset = train_dataset.map(
+            compute_embeddings_fn,
+            batched=True,
+            batch_size=args.dataset_preprocess_batch_size,
+            new_fingerprint=new_fingerprint)
 
     del text_encoder_one, text_encoder_two, text_encoder_three
     del tokenizer_one, tokenizer_two, tokenizer_three

From dbd60e30c3e8368bcd5d4d19ce114262c39a31d6 Mon Sep 17 00:00:00 2001
From: Yu Zheng <zhengyu.davy@foxmail.com>
Date: Mon, 4 Nov 2024 10:09:29 -0500
Subject: [PATCH 07/15] update readme

---
 examples/controlnet/README_sd3.md | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/examples/controlnet/README_sd3.md b/examples/controlnet/README_sd3.md
index c48ab8b950df..c95f34e32f38 100644
--- a/examples/controlnet/README_sd3.md
+++ b/examples/controlnet/README_sd3.md
@@ -51,9 +51,9 @@ Please download the dataset and unzip it in the directory `fill50k` in the `exam
 
 ## Training
 
-First download the SD3 model from [Hugging Face Hub](https://huggingface.co/stabilityai/stable-diffusion-3-medium) or the SD3.5 model from [Hugging Face Hub](https://huggingface.co/stabilityai/stable-diffusion-3.5-large). We will use it as a base model for the ControlNet training.
+First download the SD3 model from [Hugging Face Hub](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers) or the SD3.5 model from [Hugging Face Hub](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium). We will use it as a base model for the ControlNet training.
 > [!NOTE]
-> As the model is gated, before using it with diffusers you first need to go to the [Stable Diffusion 3 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers) or [Stable Diffusion 3.5 Large Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3.5-large), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
+> As the model is gated, before using it with diffusers you first need to go to the [Stable Diffusion 3 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers) or [Stable Diffusion 3.5 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
 
 ```bash
 huggingface-cli login
@@ -73,7 +73,7 @@ wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/ma
 Then run the following commands to train a ControlNet model.
 
 ```bash
-export MODEL_DIR="stabilityai/stable-diffusion-3-medium"
+export MODEL_DIR="stabilityai/stable-diffusion-3-medium-diffusers"
 export OUTPUT_DIR="sd3-controlnet-out"
 
 accelerate launch train_controlnet_sd3.py \
@@ -90,7 +90,7 @@ accelerate launch train_controlnet_sd3.py \
     --gradient_accumulation_steps=4
 ```
 
-To train a ControlNet model for Stable Diffusion 3.5, replace the `MODEL_DIR` with `stabilityai/stable-diffusion-3.5-large`.
+To train a ControlNet model for Stable Diffusion 3.5, replace the `MODEL_DIR` with `stabilityai/stable-diffusion-3.5-medium`.
 
 To better track our training experiments, we're using flags `validation_image`, `validation_prompt`, and `validation_steps` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
 
@@ -105,7 +105,7 @@ from diffusers import StableDiffusion3ControlNetPipeline, SD3ControlNetModel
 from diffusers.utils import load_image
 import torch
 
-base_model_path = "stabilityai/stable-diffusion-3-medium"
+base_model_path = "stabilityai/stable-diffusion-3-medium-diffusers"
 controlnet_path = "DavyMorgan/sd3-controlnet-out"
 
 controlnet = SD3ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)
@@ -126,7 +126,7 @@ image = pipe(
 image.save("./output.png")
 ```
 
-Similarly, for SD3.5, replace the `base_model_path` with `stabilityai/stable-diffusion-3.5-large` and controlnet_path `DavyMorgan/sd35-controlnet-out'.
+Similarly, for SD3.5, replace the `base_model_path` with `stabilityai/stable-diffusion-3.5-medium` and the `controlnet_path` with `DavyMorgan/sd35-controlnet-out`.
 
 ## Notes
 
@@ -156,3 +156,20 @@ Make sure to use the right GPU when configuring the [accelerator](https://huggin
 || pale golden rod circle with old lace background |
  ![conditioning image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png) | ![pale golden rod circle with old lace background](https://huggingface.co/datasets/DavyMorgan/sd3-controlnet-results/resolve/main/step-6500.png) |
 
+### SD3.5
+
+#### After 500 steps with batch size 8
+
+| |                                                                                                                                                     |
+|-------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------:|
+||                                                   pale golden rod circle with old lace background                                                   |
+ ![conditioning image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png) | ![pale golden rod circle with old lace background](https://huggingface.co/datasets/DavyMorgan/sd3-controlnet-results/resolve/main/step-500-3.5.png) |
+
+
+#### After 3000 steps with batch size 8:
+
+| |                                                                                                                                                      |
+|-------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------:|
+||                                                   pale golden rod circle with old lace background                                                    |
+ ![conditioning image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png) | ![pale golden rod circle with old lace background](https://huggingface.co/datasets/DavyMorgan/sd3-controlnet-results/resolve/main/step-3000-3.5.png) |
+

From 97656012a8c795150e9eac5f3202340d00498991 Mon Sep 17 00:00:00 2001
From: Yu Zheng <zhengyu.davy@foxmail.com>
Date: Mon, 4 Nov 2024 10:13:46 -0500
Subject: [PATCH 08/15] pass make quality

---
 examples/controlnet/train_controlnet_sd3.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/controlnet/train_controlnet_sd3.py b/examples/controlnet/train_controlnet_sd3.py
index 069cd064e16f..a986e9f47714 100644
--- a/examples/controlnet/train_controlnet_sd3.py
+++ b/examples/controlnet/train_controlnet_sd3.py
@@ -1130,7 +1130,8 @@ def compute_text_embeddings(batch, text_encoders, tokenizers):
             compute_embeddings_fn,
             batched=True,
             batch_size=args.dataset_preprocess_batch_size,
-            new_fingerprint=new_fingerprint)
+            new_fingerprint=new_fingerprint,
+        )
 
     del text_encoder_one, text_encoder_two, text_encoder_three
     del tokenizer_one, tokenizer_two, tokenizer_three

From 4f8b3b00b6233f1954dc601a5752bdfdea7c22ff Mon Sep 17 00:00:00 2001
From: Yu Zheng <zhengyu.davy@foxmail.com>
Date: Mon, 4 Nov 2024 14:12:14 -0500
Subject: [PATCH 09/15] update test

---
 tests/pipelines/controlnet_sd3/test_controlnet_sd3.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py b/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
index 16faa1f21dd4..88d6b02f3e8f 100644
--- a/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
+++ b/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
@@ -176,7 +176,7 @@ def get_dummy_inputs(self, device, seed=0):
 
         return inputs
 
-    def run_pipe(self, components):
+    def run_pipe(self, components, use_sd35=False):
         sd_pipe = StableDiffusion3ControlNetPipeline(**components)
         sd_pipe = sd_pipe.to(torch_device, dtype=torch.float16)
         sd_pipe.set_progress_bar_config(disable=None)
@@ -189,7 +189,10 @@ def run_pipe(self, components):
 
         assert image.shape == (1, 32, 32, 3)
 
-        expected_slice = np.array([0.5767, 0.7100, 0.5981, 0.5674, 0.5952, 0.4102, 0.5093, 0.5044, 0.6030])
+        if not use_sd35:
+            expected_slice = np.array([0.5767, 0.7100, 0.5981, 0.5674, 0.5952, 0.4102, 0.5093, 0.5044, 0.6030])
+        else:
+            expected_slice = np.array([1.0000, 0.9072, 0.4209, 0.2744, 0.5737, 0.3840, 0.6113, 0.6250, 0.6328])
 
         assert (
             np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -201,7 +204,7 @@ def test_controlnet_sd3(self):
 
     def test_controlnet_sd35(self):
         components = self.get_dummy_components(qk_norm="rms_norm", use_dual_attention=True)
-        self.run_pipe(components)
+        self.run_pipe(components, use_sd35=True)
 
     @unittest.skip("xFormersAttnProcessor does not work with SD3 Joint Attention")
     def test_xformers_attention_forwardGenerator_pass(self):

From c0f1a872a14a4741575bb35ce095583045578c0a Mon Sep 17 00:00:00 2001
From: Yu Zheng <zhengyu.davy@foxmail.com>
Date: Sat, 16 Nov 2024 21:58:48 -0500
Subject: [PATCH 10/15] fix sd3.5

---
 src/diffusers/models/controlnets/controlnet_sd3.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/diffusers/models/controlnets/controlnet_sd3.py b/src/diffusers/models/controlnets/controlnet_sd3.py
index 209aad93244e..1fd80604a84f 100644
--- a/src/diffusers/models/controlnets/controlnet_sd3.py
+++ b/src/diffusers/models/controlnets/controlnet_sd3.py
@@ -55,6 +55,10 @@ def __init__(
         pooled_projection_dim: int = 2048,
         out_channels: int = 16,
         pos_embed_max_size: int = 96,
+        dual_attention_layers: Tuple[
+            int, ...
+        ] = (),  # () for sd3.0; (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) for sd3.5
+        qk_norm: Optional[str] = None,
         extra_conditioning_channels: int = 0,
     ):
         super().__init__()
@@ -84,6 +88,8 @@ def __init__(
                     num_attention_heads=num_attention_heads,
                     attention_head_dim=self.config.attention_head_dim,
                     context_pre_only=False,
+                    qk_norm=qk_norm,
+                    use_dual_attention=True if i in dual_attention_layers else False,
                 )
                 for i in range(num_layers)
             ]

From df9712bc081d8c1d6b2265235a466728b0fd830f Mon Sep 17 00:00:00 2001
From: Yu Zheng <zhengyu.davy@foxmail.com>
Date: Sat, 16 Nov 2024 22:24:54 -0500
Subject: [PATCH 11/15] add extra conditioning channels to cli, use
 hf-internal-testing host

---
 examples/controlnet/test_controlnet.py      |  2 +-
 examples/controlnet/train_controlnet_sd3.py | 10 +++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/examples/controlnet/test_controlnet.py b/examples/controlnet/test_controlnet.py
index 7f744e3b284d..d595a1a312b0 100644
--- a/examples/controlnet/test_controlnet.py
+++ b/examples/controlnet/test_controlnet.py
@@ -143,7 +143,7 @@ def test_controlnet_sd3(self):
         with tempfile.TemporaryDirectory() as tmpdir:
             test_args = f"""
             examples/controlnet/train_controlnet_sd3.py
-            --pretrained_model_name_or_path=DavyMorgan/tiny-sd35-pipe
+            --pretrained_model_name_or_path=hf-internal-testing/tiny-sd35-pipe
             --dataset_name=hf-internal-testing/fill10
             --output_dir={tmpdir}
             --resolution=64
diff --git a/examples/controlnet/train_controlnet_sd3.py b/examples/controlnet/train_controlnet_sd3.py
index a986e9f47714..cbbce2932ef8 100644
--- a/examples/controlnet/train_controlnet_sd3.py
+++ b/examples/controlnet/train_controlnet_sd3.py
@@ -263,6 +263,12 @@ def parse_args(input_args=None):
         help="Path to pretrained controlnet model or model identifier from huggingface.co/models."
         " If not specified controlnet weights are initialized from unet.",
     )
+    parser.add_argument(
+        "--num_extra_conditioning_channels",
+        type=int,
+        default=0,
+        help="Number of extra conditioning channels for controlnet.",
+    )
     parser.add_argument(
         "--revision",
         type=str,
@@ -989,7 +995,9 @@ def main(args):
         controlnet = SD3ControlNetModel.from_pretrained(args.controlnet_model_name_or_path)
     else:
         logger.info("Initializing controlnet weights from transformer")
-        controlnet = SD3ControlNetModel.from_transformer(transformer, num_extra_conditioning_channels=0)
+        controlnet = SD3ControlNetModel.from_transformer(
+            transformer, num_extra_conditioning_channels=args.num_extra_conditioning_channels
+        )
 
     transformer.requires_grad_(False)
     vae.requires_grad_(False)

From 13c8d136f425fc91e93185ca6aa9a89bbbb84ea3 Mon Sep 17 00:00:00 2001
From: Yu Zheng <zhengyu.davy@foxmail.com>
Date: Mon, 2 Dec 2024 22:18:50 -0500
Subject: [PATCH 12/15] rebase

---
 src/diffusers/models/controlnets/controlnet_sd3.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/diffusers/models/controlnets/controlnet_sd3.py b/src/diffusers/models/controlnets/controlnet_sd3.py
index 8e7c43c0a315..4f3253d82f3d 100644
--- a/src/diffusers/models/controlnets/controlnet_sd3.py
+++ b/src/diffusers/models/controlnets/controlnet_sd3.py
@@ -56,10 +56,6 @@ def __init__(
         pooled_projection_dim: int = 2048,
         out_channels: int = 16,
         pos_embed_max_size: int = 96,
-        dual_attention_layers: Tuple[
-            int, ...
-        ] = (),  # () for sd3.0; (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) for sd3.5
-        qk_norm: Optional[str] = None,
         extra_conditioning_channels: int = 0,
         dual_attention_layers: Tuple[int, ...] = (),
         qk_norm: Optional[str] = None,

From f1430657b6f7d05cebb358d8417bc549247eb4d0 Mon Sep 17 00:00:00 2001
From: Yu Zheng <zhengyu.davy@foxmail.com>
Date: Mon, 2 Dec 2024 22:21:19 -0500
Subject: [PATCH 13/15] rebase

---
 src/diffusers/models/transformers/transformer_sd3.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/diffusers/models/transformers/transformer_sd3.py b/src/diffusers/models/transformers/transformer_sd3.py
index 846f356079d6..ef0e3d7595e1 100644
--- a/src/diffusers/models/transformers/transformer_sd3.py
+++ b/src/diffusers/models/transformers/transformer_sd3.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import math
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np

From 3a06a977e06335cc84e127806d58981a94df8b09 Mon Sep 17 00:00:00 2001
From: Yu Zheng <zhengyu.davy@foxmail.com>
Date: Thu, 5 Dec 2024 11:07:15 -0500
Subject: [PATCH 14/15] quality

---
 tests/pipelines/controlnet_sd3/test_controlnet_sd3.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py b/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
index b6a2c32831ee..c1fed5eb0e50 100644
--- a/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
+++ b/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
@@ -61,10 +61,7 @@ class StableDiffusion3ControlNetPipelineFastTests(unittest.TestCase, PipelineTes
     batch_params = frozenset(["prompt", "negative_prompt"])
 
     def get_dummy_components(
-      self, 
-      num_controlnet_layers: int = 3, 
-      qk_norm: Optional[str] = "rms_norm", 
-      use_dual_attention=False
+        self, num_controlnet_layers: int = 3, qk_norm: Optional[str] = "rms_norm", use_dual_attention=False
     ):
         torch.manual_seed(0)
         transformer = SD3Transformer2DModel(

From 0ac2192c424ec4f53be75244b6edab66fddb0d35 Mon Sep 17 00:00:00 2001
From: Yu Zheng <zhengyu.davy@foxmail.com>
Date: Thu, 5 Dec 2024 21:31:39 -0500
Subject: [PATCH 15/15] fix typo

---
 tests/pipelines/controlnet_sd3/test_controlnet_sd3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py b/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
index c1fed5eb0e50..5c547164c29a 100644
--- a/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
+++ b/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
@@ -206,7 +206,7 @@ def test_controlnet_sd3(self):
         self.run_pipe(components)
 
     def test_controlnet_sd35(self):
-        components = self.get_dummy_components(num_control_layers=1, qk_norm="rms_norm", use_dual_attention=True)
+        components = self.get_dummy_components(num_controlnet_layers=1, qk_norm="rms_norm", use_dual_attention=True)
         self.run_pipe(components, use_sd35=True)
 
     @unittest.skip("xFormersAttnProcessor does not work with SD3 Joint Attention")