From 6cf9280bce6a48b98f55a838dc17afab3c284e0d Mon Sep 17 00:00:00 2001 From: DefTruth Date: Thu, 16 Oct 2025 10:48:17 +0000 Subject: [PATCH 1/2] bugfix: fix wan-i2v pipeline condition shape mismatch --- src/diffusers/pipelines/wan/pipeline_wan_i2v.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py index b7fd0b05980f..e13a2b72ab20 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py +++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py @@ -427,12 +427,12 @@ def prepare_latents( elif last_image is None: video_condition = torch.cat( - [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2 + [image, image.new_zeros(image.shape[0], image.shape[1], num_latent_frames - 1, height, width)], dim=2 ) else: last_image = last_image.unsqueeze(2) video_condition = torch.cat( - [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 2, height, width), last_image], + [image, image.new_zeros(image.shape[0], image.shape[1], num_latent_frames - 2, height, width), last_image], dim=2, ) video_condition = video_condition.to(device=device, dtype=self.vae.dtype) From c417330bc9a5b3fcf121a94aafd1584b16b06222 Mon Sep 17 00:00:00 2001 From: DefTruth Date: Thu, 16 Oct 2025 10:48:55 +0000 Subject: [PATCH 2/2] bugfix: fix wan-i2v pipeline condition shape mismatch --- src/diffusers/pipelines/wan/pipeline_wan_i2v.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py index e13a2b72ab20..b5022c9f8ada 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py +++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py @@ -432,7 +432,11 @@ def prepare_latents( else: last_image = last_image.unsqueeze(2) video_condition = torch.cat( - [image, image.new_zeros(image.shape[0], image.shape[1], num_latent_frames - 2, height, width), last_image], + [ + image, + image.new_zeros(image.shape[0], image.shape[1], num_latent_frames - 2, height, width), + last_image, + ], dim=2, ) video_condition = video_condition.to(device=device, dtype=self.vae.dtype)