diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
index b7fd0b05980f..b5022c9f8ada 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
@@ -427,12 +427,16 @@ def prepare_latents(
 
         elif last_image is None:
             video_condition = torch.cat(
-                [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2
+                [image, image.new_zeros(image.shape[0], image.shape[1], num_latent_frames - 1, height, width)], dim=2
             )
         else:
             last_image = last_image.unsqueeze(2)
             video_condition = torch.cat(
-                [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 2, height, width), last_image],
+                [
+                    image,
+                    image.new_zeros(image.shape[0], image.shape[1], num_latent_frames - 2, height, width),
+                    last_image,
+                ],
                 dim=2,
             )
         video_condition = video_condition.to(device=device, dtype=self.vae.dtype)