From 6cf9280bce6a48b98f55a838dc17afab3c284e0d Mon Sep 17 00:00:00 2001
From: DefTruth <qiustudent_r@163.com>
Date: Thu, 16 Oct 2025 10:48:17 +0000
Subject: [PATCH 1/2] bugfix: fix wan-i2v pipeline condition shape mismatch

---
 src/diffusers/pipelines/wan/pipeline_wan_i2v.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
index b7fd0b05980f..e13a2b72ab20 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
@@ -427,12 +427,12 @@ def prepare_latents(
 
         elif last_image is None:
             video_condition = torch.cat(
-                [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2
+                [image, image.new_zeros(image.shape[0], image.shape[1], num_latent_frames - 1, height, width)], dim=2
             )
         else:
             last_image = last_image.unsqueeze(2)
             video_condition = torch.cat(
-                [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 2, height, width), last_image],
+                [image, image.new_zeros(image.shape[0], image.shape[1], num_latent_frames - 2, height, width), last_image],
                 dim=2,
             )
         video_condition = video_condition.to(device=device, dtype=self.vae.dtype)

From c417330bc9a5b3fcf121a94aafd1584b16b06222 Mon Sep 17 00:00:00 2001
From: DefTruth <qiustudent_r@163.com>
Date: Thu, 16 Oct 2025 10:48:55 +0000
Subject: [PATCH 2/2] style: wrap long torch.cat argument list in wan-i2v prepare_latents

---
 src/diffusers/pipelines/wan/pipeline_wan_i2v.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
index e13a2b72ab20..b5022c9f8ada 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
@@ -432,7 +432,11 @@ def prepare_latents(
         else:
             last_image = last_image.unsqueeze(2)
             video_condition = torch.cat(
-                [image, image.new_zeros(image.shape[0], image.shape[1], num_latent_frames - 2, height, width), last_image],
+                [
+                    image,
+                    image.new_zeros(image.shape[0], image.shape[1], num_latent_frames - 2, height, width),
+                    last_image,
+                ],
                 dim=2,
             )
         video_condition = video_condition.to(device=device, dtype=self.vae.dtype)