from typing import Optional

import torch

from .modeling_flash_dynamic_mask_attention_utils import _flash_dynamic_mask_attention_forward
from transformers.utils import logging


logger = logging.get_logger(__name__)


def flash_dynamic_mask_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    attention_bias: Optional[torch.Tensor],
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    **kwargs,
) -> tuple[torch.Tensor, None]:
    if kwargs.get("output_attentions", False) or kwargs.get("head_mask") is not None:
        logger.warning_once(
            "`flash_dynamic_mask_attention` does not support `output_attentions=True` or `head_mask`."
            " Please set your attention implementation to `eager` if you need either of these features."
        )

    # `query` is still (batch, num_heads, seq_len, head_dim) here, i.e. before the transpose below
    seq_len = query.shape[2]

    if any(dim == 0 for dim in query.shape):
        raise ValueError(
            "Tensor `query` has a zero-sized dimension.\n"
            "FlashDynamicMaskAttention does not support inputs with dim=0.\n"
            "Please check your input shapes or use SDPA instead."
        )

    # FDMA expects non-transposed inputs, i.e. (batch, seq_len, num_heads, head_dim)
    query = query.transpose(1, 2)
    key = key.transpose(1, 2)
    value = value.transpose(1, 2)

    # In PEFT, the layer norms are usually cast to float32 for training stability, so the
    # input hidden states may get silently upcast to float32. We therefore cast them back
    # to the expected dtype to make sure everything works as expected. This can slow down
    # training and inference, so it is recommended not to cast the LayerNorms to fp32
    # (our RMSNorm modules usually handle this correctly).
    target_dtype = None
    if query.dtype == torch.float32:
        if torch.is_autocast_enabled():
            target_dtype = torch.get_autocast_gpu_dtype()
        # Handle the case where the model is quantized
        elif hasattr(module.config, "_pre_quantization_dtype"):
            target_dtype = module.config._pre_quantization_dtype
        else:
            target_dtype = next(layer for layer in module.modules() if isinstance(layer, torch.nn.Linear)).weight.dtype
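    # Note: the downcast back to `target_dtype` (when one is resolved above) is expected to
    # happen inside `_flash_dynamic_mask_attention_forward`, which receives it below.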

    # FDMA always relies on the `is_causal` value set on the module, so remove it from
    # kwargs if present to avoid passing it twice
    kwargs.pop("is_causal", None)

    attn_output = _flash_dynamic_mask_attention_forward(
        query,
        key,
        value,
        attention_mask,
        attention_bias,
        query_length=seq_len,
        is_causal=module.is_causal,
        softmax_scale=scaling,
        softcap=softcap,
        target_dtype=target_dtype,
        attn_implementation=module.config._attn_implementation,
        layer_idx=module.layer_idx if hasattr(module, "layer_idx") else None,
        **kwargs,
    )

    return attn_output, None
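

# Minimal usage sketch (illustrative only). `DemoConfig`, `DemoAttention`, their attribute
# values, and the tensor layout below are hypothetical stand-ins for a real attention module;
# only `flash_dynamic_mask_attention_forward` above comes from this file. The sketch shows
# which attributes the forward reads off `module` (`is_causal`, `layer_idx`,
# `config._attn_implementation`) and the expected (batch, num_heads, seq_len, head_dim) input layout.
class DemoConfig:
    _attn_implementation = "flash_dynamic_mask_attention"


class DemoAttention(torch.nn.Module):
    is_causal = True
    layer_idx = 0

    def __init__(self, num_heads: int = 8, head_dim: int = 64):
        super().__init__()
        self.config = DemoConfig()
        # Keep a Linear around so the float32 fallback above can infer a target dtype from it
        self.o_proj = torch.nn.Linear(num_heads * head_dim, num_heads * head_dim)

    def forward(self, query, key, value, attention_mask=None, attention_bias=None):
        # query/key/value: (batch, num_heads, seq_len, head_dim)
        attn_output, _ = flash_dynamic_mask_attention_forward(
            self,
            query,
            key,
            value,
            attention_mask=attention_mask,
            attention_bias=attention_bias,
            scaling=query.shape[-1] ** -0.5,
        )
        return attn_output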