
Commit f909481

Merge branch 'main' into integrations/wan2.2-s2v
2 parents 065d982 + a9e4883

31 files changed: +1445 -190 lines changed

docs/source/en/api/models/wan_animate_transformer_3d.md

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ The model can be loaded with the following code snippet.
 ```python
 from diffusers import WanAnimateTransformer3DModel
 
-transformer = WanAnimateTransformer3DModel.from_pretrained("Wan-AI/Wan2.2-Animate-14B-720P-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
+transformer = WanAnimateTransformer3DModel.from_pretrained("Wan-AI/Wan2.2-Animate-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
 ```
 
 ## WanAnimateTransformer3DModel
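For context, a minimal sketch of how the renamed checkpoint would typically be combined with the pipeline. This snippet is illustrative and not part of the diff; it assumes the `WanAnimatePipeline` usage documented in `docs/source/en/api/pipelines/wan.md` below.

```python
import torch
from diffusers import WanAnimatePipeline, WanAnimateTransformer3DModel

# Load the transformer from the renamed repository, then hand it to the pipeline.
transformer = WanAnimateTransformer3DModel.from_pretrained(
    "Wan-AI/Wan2.2-Animate-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16
)
pipe = WanAnimatePipeline.from_pretrained(
    "Wan-AI/Wan2.2-Animate-14B-Diffusers", transformer=transformer, torch_dtype=torch.bfloat16
)
pipe.to("cuda")
```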

docs/source/en/api/pipelines/wan.md

Lines changed: 18 additions & 30 deletions
@@ -283,7 +283,7 @@ For replacement mode, you additionally need:
 - **Mask video**: A mask indicating where to generate content (white) vs. preserve original (black)
 
 > [!NOTE]
-> The preprocessing tools are available in the original Wan-Animate repository. Integration of these preprocessing steps into Diffusers is planned for a future release.
+> Raw videos should not be used for inputs such as `pose_video`, which the pipeline expects to be preprocessed to extract the proper information. Preprocessing scripts to prepare these inputs are available in the [original Wan-Animate repository](https://github.com/Wan-Video/Wan2.2?tab=readme-ov-file#1-preprocessing). Integration of these preprocessing steps into Diffusers is planned for a future release.
 
 The example below demonstrates how to use the Wan-Animate pipeline:
 
@@ -295,13 +295,10 @@ import numpy as np
 import torch
 from diffusers import AutoencoderKLWan, WanAnimatePipeline
 from diffusers.utils import export_to_video, load_image, load_video
-from transformers import CLIPVisionModel
 
 model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
 vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-pipe = WanAnimatePipeline.from_pretrained(
-    model_id, vae=vae, torch_dtype=torch.bfloat16
-)
+pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
 pipe.to("cuda")
 
 # Load character image and preprocessed videos
@@ -332,11 +329,11 @@ output = pipe(
     negative_prompt=negative_prompt,
     height=height,
     width=width,
-    num_frames=81,
-    guidance_scale=5.0,
-    mode="animation", # Animation mode (default)
+    segment_frame_length=77,
+    guidance_scale=1.0,
+    mode="animate", # Animation mode (default)
 ).frames[0]
-export_to_video(output, "animated_character.mp4", fps=16)
+export_to_video(output, "animated_character.mp4", fps=30)
 ```
 
 </hfoption>
@@ -347,14 +344,10 @@ import numpy as np
 import torch
 from diffusers import AutoencoderKLWan, WanAnimatePipeline
 from diffusers.utils import export_to_video, load_image, load_video
-from transformers import CLIPVisionModel
 
 model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
-image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float16)
 vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-pipe = WanAnimatePipeline.from_pretrained(
-    model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16
-)
+pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
 pipe.to("cuda")
 
 # Load all required inputs for replacement mode
@@ -389,11 +382,11 @@ output = pipe(
     negative_prompt=negative_prompt,
     height=height,
     width=width,
-    num_frames=81,
-    guidance_scale=5.0,
-    mode="replacement", # Replacement mode
+    segment_frame_lengths=77,
+    guidance_scale=1.0,
+    mode="replace", # Replacement mode
 ).frames[0]
-export_to_video(output, "character_replaced.mp4", fps=16)
+export_to_video(output, "character_replaced.mp4", fps=30)
 ```
 
 </hfoption>
@@ -404,14 +397,10 @@ import numpy as np
 import torch
 from diffusers import AutoencoderKLWan, WanAnimatePipeline
 from diffusers.utils import export_to_video, load_image, load_video
-from transformers import CLIPVisionModel
 
 model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
-image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float16)
 vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
-pipe = WanAnimatePipeline.from_pretrained(
-    model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16
-)
+pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
 pipe.to("cuda")
 
 image = load_image("path/to/character.jpg")
@@ -445,25 +434,24 @@ output = pipe(
     negative_prompt=negative_prompt,
     height=height,
     width=width,
-    num_frames=81,
+    segment_frame_length=77,
     num_inference_steps=50,
     guidance_scale=5.0,
-    num_frames_for_temporal_guidance=5, # Use 5 frames for temporal guidance (1 or 5 recommended)
+    prev_segment_conditioning_frames=5, # Use 5 frames for temporal guidance (1 or 5 recommended)
     callback_on_step_end=callback_fn,
     callback_on_step_end_tensor_inputs=["latents"],
 ).frames[0]
-export_to_video(output, "animated_advanced.mp4", fps=16)
+export_to_video(output, "animated_advanced.mp4", fps=30)
 ```
 
 </hfoption>
 </hfoptions>
 
 #### Key Parameters
 
-- **mode**: Choose between `"animation"` (default) or `"replacement"`
-- **num_frames_for_temporal_guidance**: Number of frames for temporal guidance (1 or 5 recommended). Using 5 provides better temporal consistency but requires more memory
-- **guidance_scale**: Controls how closely the output follows the text prompt. Higher values (5-7) produce results more aligned with the prompt
-- **num_frames**: Total number of frames to generate. Should be divisible by `vae_scale_factor_temporal` (default: 4)
+- **mode**: Choose between `"animate"` (default) or `"replace"`
+- **prev_segment_conditioning_frames**: Number of frames for temporal guidance (1 or 5 recommended). Using 5 provides better temporal consistency but requires more memory
+- **guidance_scale**: Controls how closely the output follows the text prompt. Higher values (5-7) produce results more aligned with the prompt. For Wan-Animate, CFG is disabled by default (`guidance_scale=1.0`) but can be enabled to support negative prompts and finer control over facial expressions. (Note that CFG will only target the text prompt and face conditioning.)
 
 
 ### Wan-S2V: Audio-Driven Cinematic Video Generation
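To make the new `guidance_scale` behaviour concrete, here is an illustrative sketch (not part of the diff) of enabling CFG so that `negative_prompt` takes effect. The conditioning-input argument names (`image`, `pose_video`, `face_video`) and the resolution values are assumptions based on the surrounding examples and may differ from the final pipeline signature.

```python
import torch
from diffusers import AutoencoderKLWan, WanAnimatePipeline
from diffusers.utils import export_to_video, load_image, load_video

model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = load_image("path/to/character.jpg")
pose_video = load_video("path/to/pose_video.mp4")  # preprocessed, see the NOTE above
face_video = load_video("path/to/face_video.mp4")  # assumed input name

output = pipe(
    image=image,
    pose_video=pose_video,
    face_video=face_video,
    prompt="a person dancing in a studio",
    negative_prompt="blurry, distorted",  # only applied when CFG is enabled
    height=720,
    width=1280,
    segment_frame_length=77,
    guidance_scale=2.0,  # > 1.0 enables CFG on the text prompt and face conditioning
    mode="animate",
).frames[0]
export_to_video(output, "animated_character_cfg.mp4", fps=30)
```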

src/diffusers/schedulers/scheduling_consistency_models.py

Lines changed: 41 additions & 3 deletions
@@ -121,7 +121,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -287,7 +287,23 @@ def get_scalings_for_boundary_condition(self, sigma):
         return c_skip, c_out
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
+    def index_for_timestep(
+        self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
+    ) -> int:
+        """
+        Find the index of a given timestep in the timestep schedule.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The timestep value to find in the schedule.
+            schedule_timesteps (`torch.Tensor`, *optional*):
+                The timestep schedule to search in. If `None`, uses `self.timesteps`.
+
+        Returns:
+            `int`:
+                The index of the timestep in the schedule. For the very first step, returns the second index if
+                multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
+        """
         if schedule_timesteps is None:
             schedule_timesteps = self.timesteps
 
@@ -302,7 +318,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None):
         return indices[pos].item()
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
-    def _init_step_index(self, timestep):
+    def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
+        """
+        Initialize the step index for the scheduler based on the given timestep.
+
+        Args:
+            timestep (`float` or `torch.Tensor`):
+                The current timestep to initialize the step index from.
+        """
         if self.begin_index is None:
             if isinstance(timestep, torch.Tensor):
                 timestep = timestep.to(self.timesteps.device)
@@ -410,6 +433,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise tensor to add to the original samples.
+            timesteps (`torch.Tensor`):
+                The timesteps at which to add noise, determining the noise level from the schedule.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples with added noise scaled according to the timestep schedule.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
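The new docstrings describe existing behaviour rather than changing it. As a rough, standalone illustration (not diffusers code) of what `index_for_timestep` and `add_noise` do together: look up the sigma that corresponds to each timestep, then add noise scaled by that sigma.

```python
import torch

# Toy sigma/timestep schedules (five entries each).
sigmas = torch.tensor([80.0, 40.0, 20.0, 10.0, 0.0])
schedule_timesteps = torch.tensor([1.0, 0.75, 0.5, 0.25, 0.0])

# Timesteps at which we want to noise two samples.
timesteps = torch.tensor([0.5, 0.25])

# index_for_timestep: locate each timestep in the schedule.
step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]

original_samples = torch.randn(2, 3)        # stand-in for latents
noise = torch.randn_like(original_samples)
sigma = sigmas[step_indices].view(-1, 1)    # one sigma per sample, broadcastable

noisy_samples = original_samples + noise * sigma  # the core of add_noise
print(noisy_samples.shape)  # torch.Size([2, 3])
```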

src/diffusers/schedulers/scheduling_cosine_dpmsolver_multistep.py

Lines changed: 29 additions & 1 deletion
@@ -137,7 +137,7 @@ def set_begin_index(self, begin_index: int = 0):
         Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
 
         Args:
-            begin_index (`int`):
+            begin_index (`int`, defaults to `0`):
                 The begin index for the scheduler.
         """
         self._begin_index = begin_index
@@ -266,6 +266,19 @@ def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> t
 
     # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
     def _sigma_to_t(self, sigma, log_sigmas):
+        """
+        Convert sigma values to corresponding timestep values through interpolation.
+
+        Args:
+            sigma (`np.ndarray`):
+                The sigma value(s) to convert to timestep(s).
+            log_sigmas (`np.ndarray`):
+                The logarithm of the sigma schedule used for interpolation.
+
+        Returns:
+            `np.ndarray`:
+                The interpolated timestep value(s) corresponding to the input sigma(s).
+        """
         # get log sigma
         log_sigma = np.log(np.maximum(sigma, 1e-10))
 
@@ -537,6 +550,21 @@ def add_noise(
         noise: torch.Tensor,
         timesteps: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        Add noise to the original samples according to the noise schedule at the specified timesteps.
+
+        Args:
+            original_samples (`torch.Tensor`):
+                The original samples to which noise will be added.
+            noise (`torch.Tensor`):
+                The noise tensor to add to the original samples.
+            timesteps (`torch.Tensor`):
+                The timesteps at which to add noise, determining the noise level from the schedule.
+
+        Returns:
+            `torch.Tensor`:
+                The noisy samples with added noise scaled according to the timestep schedule.
+        """
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
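For readers unfamiliar with the conversion that `_sigma_to_t` performs, here is a simplified standalone sketch (not the library implementation) of log-space interpolation between neighbouring schedule entries.

```python
import numpy as np

# Toy sigma schedule, descending as the timestep index grows.
log_sigmas = np.log(np.array([80.0, 20.0, 5.0, 1.25, 0.3]))
timestep_ids = np.arange(len(log_sigmas))  # 0, 1, 2, 3, 4

def sigma_to_t(sigma: float) -> float:
    log_sigma = np.log(max(sigma, 1e-10))
    # Index of the schedule entry just above log_sigma (bracketing pair idx, idx + 1).
    idx = np.clip(np.searchsorted(-log_sigmas, -log_sigma) - 1, 0, len(log_sigmas) - 2)
    low, high = log_sigmas[idx], log_sigmas[idx + 1]
    w = np.clip((low - log_sigma) / (low - high), 0.0, 1.0)  # interpolation weight
    return float((1 - w) * timestep_ids[idx] + w * timestep_ids[idx + 1])

print(sigma_to_t(10.0))  # ~1.5, between schedule entries 1 (sigma=20) and 2 (sigma=5)
```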

src/diffusers/schedulers/scheduling_ddim.py

Lines changed: 3 additions & 2 deletions
@@ -99,10 +99,11 @@ def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
 
     Args:
         betas (`torch.Tensor`):
-            the betas that the scheduler is being initialized with.
+            The betas that the scheduler is being initialized with.
 
     Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.Tensor`:
+            Rescaled betas with zero terminal SNR.
     """
     # Convert betas to alphas_bar_sqrt
     alphas = 1.0 - betas
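The wording fix above only touches the docstring. For reference, a sketch of the rescaling that `rescale_zero_terminal_snr` describes (Algorithm 1 of the zero-terminal-SNR paper), mirrored here rather than quoted from the file.

```python
import torch

def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
    alphas = 1.0 - betas
    alphas_cumprod = torch.cumprod(alphas, dim=0)
    alphas_bar_sqrt = alphas_cumprod.sqrt()

    # Shift so the last cumulative alpha becomes exactly zero (zero terminal SNR)...
    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
    alphas_bar_sqrt -= alphas_bar_sqrt_T
    # ...and scale so the first cumulative alpha keeps its original value.
    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)

    # Convert the rescaled cumulative product back to per-step betas.
    alphas_bar = alphas_bar_sqrt**2
    alphas = alphas_bar[1:] / alphas_bar[:-1]
    alphas = torch.cat([alphas_bar[0:1], alphas])
    return 1.0 - alphas

betas = torch.linspace(0.0001, 0.02, 1000)  # toy DDPM-style schedule
print(rescale_zero_terminal_snr(betas)[-1])  # -> tensor(1.), the terminal step carries zero signal
```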

src/diffusers/schedulers/scheduling_ddim_inverse.py

Lines changed: 3 additions & 2 deletions
@@ -98,10 +98,11 @@ def rescale_zero_terminal_snr(betas):
 
     Args:
         betas (`torch.Tensor`):
-            the betas that the scheduler is being initialized with.
+            The betas that the scheduler is being initialized with.
 
     Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.Tensor`:
+            Rescaled betas with zero terminal SNR.
     """
     # Convert betas to alphas_bar_sqrt
     alphas = 1.0 - betas

src/diffusers/schedulers/scheduling_ddim_parallel.py

Lines changed: 3 additions & 2 deletions
@@ -100,10 +100,11 @@ def rescale_zero_terminal_snr(betas):
 
     Args:
         betas (`torch.Tensor`):
-            the betas that the scheduler is being initialized with.
+            The betas that the scheduler is being initialized with.
 
     Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.Tensor`:
+            Rescaled betas with zero terminal SNR.
     """
     # Convert betas to alphas_bar_sqrt
    alphas = 1.0 - betas

src/diffusers/schedulers/scheduling_ddpm.py

Lines changed: 3 additions & 2 deletions
@@ -97,10 +97,11 @@ def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
 
     Args:
         betas (`torch.Tensor`):
-            the betas that the scheduler is being initialized with.
+            The betas that the scheduler is being initialized with.
 
     Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.Tensor`:
+            Rescaled betas with zero terminal SNR.
     """
     # Convert betas to alphas_bar_sqrt
     alphas = 1.0 - betas

src/diffusers/schedulers/scheduling_ddpm_parallel.py

Lines changed: 3 additions & 2 deletions
@@ -99,10 +99,11 @@ def rescale_zero_terminal_snr(betas):
 
     Args:
         betas (`torch.Tensor`):
-            the betas that the scheduler is being initialized with.
+            The betas that the scheduler is being initialized with.
 
     Returns:
-        `torch.Tensor`: rescaled betas with zero terminal SNR
+        `torch.Tensor`:
+            Rescaled betas with zero terminal SNR.
     """
     # Convert betas to alphas_bar_sqrt
     alphas = 1.0 - betas
