
Commit f567c1f

Merge pull request #118 from SmallDoges/Fix-varlen-bug
Integrate Flash Dynamic Mask Attention (FDMA) Into Transformers-Style Attention Flow
2 parents ee43866 + a7ee9bc commit f567c1f
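
The new attention callable is meant to plug into Transformers' pluggable attention dispatch. As a minimal sketch of how it could be wired in, assuming transformers' AttentionInterface registration API (available in recent versions) and a hypothetical import path for the function added in this commit:

from transformers import AttentionInterface

# Hypothetical import path; the actual module location follows this repo's layout.
from flash_dmattn.integrations.flash_dynamic_mask_attention import flash_dynamic_mask_attention_forward

# Register under a name that can then be passed as `attn_implementation=...`.
AttentionInterface.register("flash_dynamic_mask_attention", flash_dynamic_mask_attention_forward)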

File tree

7 files changed: +474 -930 lines changed


demo_varlen_fix.py

Lines changed: 0 additions & 221 deletions
This file was deleted.
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
from typing import Optional

import torch

from .modeling_flash_dynamic_mask_attention_utils import _flash_dynamic_mask_attention_forward
from transformers.utils import logging


logger = logging.get_logger(__name__)


def flash_dynamic_mask_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    attention_bias: Optional[torch.Tensor],
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    **kwargs,
) -> tuple[torch.Tensor, None]:
    if kwargs.get("output_attentions", False) or kwargs.get("head_mask") is not None:
        logger.warning_once(
            "`flash_dynamic_mask_attention` does not support `output_attentions=True` or `head_mask`."
            " Please set your attention to `eager` if you want any of these features."
        )

    # This is before the transpose
    seq_len = query.shape[2]

    if any(dim == 0 for dim in query.shape):
        raise ValueError(
            "Tensor query has shape with a zero dimension.\n"
            "FlashDynamicMaskAttention does not support inputs with dim=0.\n"
            "Please check your input shapes or use SDPA instead."
        )

    # FDMA uses non-transposed inputs
    query = query.transpose(1, 2)
    key = key.transpose(1, 2)
    value = value.transpose(1, 2)

    # In PEFT, the layer norms are usually cast to float32 for training stability,
    # so the input hidden states get silently cast to float32. We therefore cast them
    # back to the expected dtype to make sure everything works as intended.
    # This can slow down training & inference, so it is recommended not to cast the
    # LayerNorms to fp32 (our RMSNorm modules usually handle this correctly).
    target_dtype = None
    if query.dtype == torch.float32:
        if torch.is_autocast_enabled():
            target_dtype = torch.get_autocast_gpu_dtype()
        # Handle the case where the model is quantized
        elif hasattr(module.config, "_pre_quantization_dtype"):
            target_dtype = module.config._pre_quantization_dtype
        else:
            target_dtype = next(layer for layer in module.modules() if isinstance(layer, torch.nn.Linear)).weight.dtype

    # FDMA always relies on the value set in the module, so remove it if present in kwargs to avoid passing it twice
    kwargs.pop("is_causal", None)

    attn_output = _flash_dynamic_mask_attention_forward(
        query,
        key,
        value,
        attention_mask,
        attention_bias,
        query_length=seq_len,
        is_causal=module.is_causal,
        softmax_scale=scaling,
        softcap=softcap,
        target_dtype=target_dtype,
        attn_implementation=module.config._attn_implementation,
        layer_idx=module.layer_idx if hasattr(module, "layer_idx") else None,
        **kwargs,
    )

    return attn_output, None
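
A minimal usage sketch for the wrapper above, not part of this commit: it assumes a CUDA device with the flash_dmattn kernels installed and fakes just enough module state (config, is_causal, layer_idx) for the call to go through. The [batch, heads, seq_len, head_dim] input layout matches what the wrapper expects before its internal transpose; the attention_bias shape is an assumption.

import torch
from types import SimpleNamespace

class ToyAttention(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.config = SimpleNamespace(_attn_implementation="flash_dynamic_mask_attention")
        self.is_causal = True
        self.layer_idx = 0
        # A Linear layer so the fp32 fallback path can infer a target dtype if needed.
        self.o_proj = torch.nn.Linear(64, 64)

batch, heads, seq_len, head_dim = 2, 8, 128, 64
q = torch.randn(batch, heads, seq_len, head_dim, dtype=torch.float16, device="cuda")
k = torch.randn(batch, heads, seq_len, head_dim, dtype=torch.float16, device="cuda")
v = torch.randn(batch, heads, seq_len, head_dim, dtype=torch.float16, device="cuda")
# Assumed bias layout: one score offset per (batch, head, query, key) position.
bias = torch.zeros(batch, heads, seq_len, seq_len, dtype=torch.float16, device="cuda")

out, _ = flash_dynamic_mask_attention_forward(
    ToyAttention().cuda(), q, k, v, attention_mask=None, attention_bias=bias
)
# Expected to follow the usual flash-attention convention of (batch, seq_len, heads, head_dim).
print(out.shape)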
Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
# Copyright 2025 Jingze Shi and the HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Import utilities: Utilities related to imports and our lazy inits.
"""

import importlib.metadata
import importlib.util
from functools import lru_cache
from typing import Union

from transformers import is_torch_available
from transformers.utils import logging


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


# TODO: This doesn't work for all packages (`bs4`, `faiss`, etc.). Talk to Sylvain to see how to handle it better.
def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[tuple[bool, str], bool]:
    # Check if the package spec exists and grab its version to avoid importing a local directory
    package_exists = importlib.util.find_spec(pkg_name) is not None
    package_version = "N/A"
    if package_exists:
        try:
            # TODO: Once python 3.9 support is dropped, `importlib.metadata.packages_distributions()`
            # should be used here to map from package name to distribution names
            # e.g. PIL -> Pillow, Pillow-SIMD; quark -> amd-quark; onnxruntime -> onnxruntime-gpu.
            # `importlib.metadata.packages_distributions()` is not available in Python 3.9.

            # Primary method to get the package version
            package_version = importlib.metadata.version(pkg_name)
        except importlib.metadata.PackageNotFoundError:
            # Fallback method: only for "torch" and versions containing "dev"
            if pkg_name == "torch":
                try:
                    package = importlib.import_module(pkg_name)
                    temp_version = getattr(package, "__version__", "N/A")
                    # Check if the version contains "dev"
                    if "dev" in temp_version:
                        package_version = temp_version
                        package_exists = True
                    else:
                        package_exists = False
                except ImportError:
                    # If the package can't be imported, it's not available
                    package_exists = False
            elif pkg_name == "quark":
                # TODO: remove once `importlib.metadata.packages_distributions()` is supported.
                try:
                    package_version = importlib.metadata.version("amd-quark")
                except Exception:
                    package_exists = False
            elif pkg_name == "triton":
                try:
                    package_version = importlib.metadata.version("pytorch-triton")
                except Exception:
                    package_exists = False
            else:
                # For packages other than "torch", don't attempt the fallback and mark as not available
                package_exists = False
        logger.debug(f"Detected {pkg_name} version: {package_version}")
    if return_version:
        return package_exists, package_version
    else:
        return package_exists


@lru_cache
def is_flash_dmattn_available():
    if not is_torch_available():
        return False

    if not _is_package_available("flash_dmattn"):
        return False

    import torch

    if not torch.cuda.is_available():
        return False

    return True
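
A short, hypothetical example of how the availability check might drive the choice of attention implementation. The checkpoint name and the "flash_dynamic_mask_attention" implementation string are assumptions (the latter only works once the forward above has been registered under that name), not something this commit defines.

from transformers import AutoModelForCausalLM

# Fall back to SDPA when the flash_dmattn kernels or a CUDA device are missing.
attn_impl = "flash_dynamic_mask_attention" if is_flash_dmattn_available() else "sdpa"

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B",  # placeholder checkpoint; any causal LM works
    attn_implementation=attn_impl,
)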
