
Commit d308079

feat(attention): added support for flash attention 4
1 parent: bf0e6d3

4 files changed

Lines changed: 229 additions & 18 deletions


README.md

Lines changed: 8 additions & 0 deletions
````diff
@@ -52,6 +52,14 @@ uv sync --extra [cpu|cu126|cu128|cu130] --extra tests --extra linting
 pre-commit install --install-hooks
 ```
 
+Additionally, flash attention 4 can be installed via:
+
+```sh
+uv pip install --prerelease=allow flash-attn-4
+# or (if you want to install the CUDA 13 version)
+uv pip install --prerelease=allow flash-attn-4[cu13]
+```
+
 ### Option 2: Using pip and manual installation of dependencies
 
 ```sh
````
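A quick availability check mirrors the probe the new tests use (`find_spec("flash_attn.cute")`); this snippet is illustrative and not part of the commit:

```python
from importlib.util import find_spec

# FA4 exposes its kernels under flash_attn.cute; if the module resolves,
# the dao_flash_v4 attention implementation can be selected.
print("FlashAttention-4 available:", find_spec("flash_attn.cute") is not None)
```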

pyproject.toml

Lines changed: 8 additions & 5 deletions
````diff
@@ -46,17 +46,20 @@ cpu = ["torch>=2.10,<2.11.0", "torchvision"]
 cu126 = [
     "torch>=2.10,<2.11.0",
     "torchvision",
-    "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"
+    "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'",
+    # "flash-attn-4; platform_system == 'Linux' and platform_machine != 'aarch64'"
 ]
 cu128 = [
     "torch>=2.10,<2.11.0",
     "torchvision",
-    "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"
+    "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'",
+    # "flash-attn-4; platform_system == 'Linux' and platform_machine != 'aarch64'"
 ]
 cu130 = [
     "torch>=2.10,<2.11.0",
     "torchvision",
-    "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"
+    "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'",
+    # "flash-attn-4[cu13]; platform_system == 'Linux' and platform_machine != 'aarch64'"
 ]
 
 [tool.uv]
@@ -106,8 +109,8 @@ explicit = true
 
 [tool.uv.extra-build-dependencies]
 flash-attn = [
-    { requirement = "torch", match-runtime = true },
-    { requirement = "ninja", match-runtime = true },
+    { requirement = "torch", match-runtime = true },
+    { requirement = "ninja", match-runtime = true },
 ]
 
 [tool.black]
````
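The `flash-attn-4` requirements are deliberately left commented out, keeping FA4 an explicit opt-in install while it is still a prerelease. Once it stabilizes, enabling it per extra would presumably just mean uncommenting the marker-guarded line, e.g. for `cu130` (a sketch, not part of this commit):

```toml
cu130 = [
    "torch>=2.10,<2.11.0",
    "torchvision",
    "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'",
    "flash-attn-4[cu13]; platform_system == 'Linux' and platform_machine != 'aarch64'",
]
```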

src/modalities/models/gpt2/gpt2_model.py

Lines changed: 58 additions & 12 deletions
````diff
@@ -20,9 +20,14 @@
 from modalities.util import parse_enum_by_name
 
 try:
-    from flash_attn import flash_attn_func
+    from flash_attn import flash_attn_func as flash_attn_func_v2
 except ModuleNotFoundError:
-    flash_attn_func = None
+    flash_attn_func_v2 = None
+
+try:
+    from flash_attn.cute import flash_attn_func as flash_attn_func_v4
+except Exception:
+    flash_attn_func_v4 = None
 
 # Logger configuration
 logger = logging.getLogger(__name__)
````
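Note the asymmetry in the import guards: FA2 only needs `except ModuleNotFoundError`, while the FA4 import is wrapped in the broader `except Exception`, presumably because the prerelease `flash_attn.cute` module can fail at import time for reasons other than absence (e.g. a CUDA/driver mismatch) and should still degrade to `None`.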
````diff
@@ -249,12 +254,14 @@ class AttentionImplementation(str, Enum):
     Attributes:
         MANUAL (str): Manual attention implementation.
         PYTORCH_FLASH (str): PyTorch's flash attention implementation.
-        DAO_FLASH (str): DAO's flash attention implementation.
+        DAO_FLASH (str): DAO's FlashAttention-2 implementation.
+        DAO_FLASH_V4 (str): DAO's FlashAttention-4 implementation.
     """
 
     MANUAL = "manual"
     PYTORCH_FLASH = "pytorch_flash"
     DAO_FLASH = "dao_flash"
+    DAO_FLASH_V4 = "dao_flash_v4"
 
 
 class AttentionConfig(BaseModel):
@@ -439,6 +446,14 @@ def __init__(
         super().__init__()
         assert n_embd % n_head_q == 0, "`n_embd` needs to be divisible by `n_head_q`."
         assert n_head_q % n_head_kv == 0, "`n_head_q` needs to be divisible by `n_head_kv`."
+        if attention_impl == AttentionImplementation.DAO_FLASH:
+            if flash_attn_func_v2 is None:
+                raise NotImplementedError("ERROR! Dao Flash Attention 2 is not installed.")
+        if attention_impl == AttentionImplementation.DAO_FLASH_V4:
+            if flash_attn_func_v4 is None:
+                raise NotImplementedError("ERROR! Dao Flash Attention 4 is not installed.")
+            if dropout > 0.0:
+                raise NotImplementedError("ERROR! Dao Flash Attention 4 does not support attention dropout.")
 
         self.n_rep = n_head_q // n_head_kv
         self.attention_impl = attention_impl
````
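With these up-front checks, misconfiguration surfaces at construction time rather than deep inside `execute_attention`. Following the positional signature the new tests use (`n_head_q, n_head_kv, n_embd, attention_config, attention_impl, bias, dropout` is an inference from those test calls, not spelled out in this diff), requesting FA4 together with dropout fails immediately:

```python
import pytest
from modalities.models.gpt2.gpt2_model import AttentionConfig, CausalSelfAttention

# dao_flash_v4 rejects attention dropout up front (and also raises here
# if the FA4 kernels are not installed).
with pytest.raises(NotImplementedError):
    CausalSelfAttention(4, 4, 32, AttentionConfig(qkv_transforms=[]), "dao_flash_v4", False, 0.1)
```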
````diff
@@ -644,19 +659,50 @@ def execute_attention(
             # Due to the lack of GPUs in github actions and the requirement of those in the flash-attn library,
             # we have to check if the library is installed and raise an error if not.
             # Note, that the library is not required for the CPU-only tests.
-            if flash_attn_func is None:
-                raise NotImplementedError("ERROR! Dao Flash Attention is not installed.")
-            # the next three lines are only needed for flash-attn from Dao Lab
-            q = q.transpose(1, 2).contiguous()  # (B, T, nh_q, hd)
-            k = k.transpose(1, 2).contiguous()  # (B, T, nh_kv, hd)
-            v = v.transpose(1, 2).contiguous()  # (B, T, nh_kv, hd)
-            y = flash_attn_func(
-                q, k, v, dropout_p=dropout, causal=True, softmax_scale=None, window_size=(-1, -1)
-            )  # (B, T, nh_q, hd)
+            y = cls._execute_dao_flash_v2(q, k, v, dropout)
+        elif attention_impl == AttentionImplementation.DAO_FLASH_V4:
+            if cls._requires_backward(q, k, v) and torch.cuda.get_device_capability(q.device)[0] < 9:
+                y = cls._execute_dao_flash_v2(q, k, v, dropout)
+            else:
+                # TODO added due to upstream failure in its pack_gqa handling,
+                # can be removed once the issue is resolved:
+                k, v = cls.repeat_kv_heads(q, k, v)
+                q = q.transpose(1, 2).contiguous()  # (B, T, nh_q, hd)
+                k = k.transpose(1, 2).contiguous()  # (B, T, nh_kv, hd)
+                v = v.transpose(1, 2).contiguous()  # (B, T, nh_kv, hd)
+                y = cls._unwrap_flash_attention_output(
+                    flash_attn_func_v4(
+                        q,
+                        k,
+                        v,
+                        causal=True,
+                        softmax_scale=None,
+                        window_size=(None, None),
+                    )
+                )
         else:
             raise NotImplementedError(f"Attention implementation {attention_impl} not supported")
         return y  # (B, T, nh_q, hd)
 
+    @staticmethod
+    def _unwrap_flash_attention_output(
+        output: torch.Tensor | tuple[torch.Tensor, Optional[torch.Tensor]],
+    ) -> torch.Tensor:
+        if isinstance(output, tuple):
+            return output[0]
+        return output
+
+    @staticmethod
+    def _execute_dao_flash_v2(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, dropout: float) -> torch.Tensor:
+        q = q.transpose(1, 2).contiguous()  # (B, T, nh_q, hd)
+        k = k.transpose(1, 2).contiguous()  # (B, T, nh_kv, hd)
+        v = v.transpose(1, 2).contiguous()  # (B, T, nh_kv, hd)
+        return flash_attn_func_v2(q, k, v, dropout_p=dropout, causal=True, softmax_scale=None, window_size=(-1, -1))
+
+    @staticmethod
+    def _requires_backward(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> bool:
+        return torch.is_grad_enabled() and any(tensor.requires_grad for tensor in (q, k, v))
+
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Forward pass of the CausalSelfAttention module.
````
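Taken together, `dao_flash_v4` degrades gracefully: when gradients are required on a pre-Hopper GPU (compute capability below 9), the call silently routes to the FA2 kernel; pure inference, or any GPU at SM90 and above, takes the FA4 path. A hedged usage sketch, assuming a CUDA machine with the kernels installed (not part of the commit itself):

```python
import torch
from modalities.models.gpt2.gpt2_model import CausalSelfAttention

# Grouped-query layout: 8 query heads, 2 KV heads, seq len 12, head dim 64.
q = torch.rand(2, 8, 12, 64, dtype=torch.bfloat16, device="cuda", requires_grad=True)
k = torch.rand(2, 2, 12, 64, dtype=torch.bfloat16, device="cuda", requires_grad=True)
v = torch.rand(2, 2, 12, 64, dtype=torch.bfloat16, device="cuda", requires_grad=True)

# With grad enabled on an SM8x card this dispatches to _execute_dao_flash_v2;
# under torch.no_grad() (or on SM90+) it runs flash_attn_func_v4 instead.
y = CausalSelfAttention.execute_attention(q, k, v, dropout=0.0, attention_impl="dao_flash_v4")
print(y.shape)  # torch.Size([2, 12, 8, 64]), i.e. (B, T, nh_q, hd)
```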

tests/models/test_causal_self_attention.py

Lines changed: 155 additions & 1 deletion
````diff
@@ -1,8 +1,15 @@
 """
-Note: test_attention_types_approximate_equality can print the output of different attention implementations.
+Note: test_attention_types_approximate_equality can print the output of different attention implementations.
 To do so, turn on verbose and run 'pytest tests/models/test_causal_self_attention.py -s'
 """
+
+import os
+import subprocess
+import sys
+import textwrap
 from copy import deepcopy
+from importlib.util import find_spec
+from pathlib import Path
 
 import pytest
 import torch
@@ -17,6 +24,10 @@
 
 torch.manual_seed(0)
 
+FLASH_ATTN_V4_AVAILABLE = find_spec("flash_attn.cute") is not None
+REPO_ROOT = Path(__file__).resolve().parents[2]
+SRC_ROOT = REPO_ROOT / "src"
+
 
 def _get_random_input_seq(embedding_shape):
     flash_attn_supported_dtype = torch.bfloat16
@@ -272,3 +283,146 @@ def test_qk_norm(n_head_q, n_head_kv, n_embd, attention_impl):
 
     assert output_no_norm.shape == output_with_norm.shape == embedding_shape
     assert not torch.allclose(output_no_norm, output_with_norm, atol=1e-6)
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="This test requires 1 GPU.")
+@pytest.mark.skipif(not FLASH_ATTN_V4_AVAILABLE, reason="FA4 not installed")
+def test_dao_flash_v4_forward_mha_subprocess():
+    result = _run_fa4_subprocess(
+        """
+        import torch
+        from modalities.models.gpt2.gpt2_model import CausalSelfAttention
+
+        q = torch.rand(2, 4, 12, 32, dtype=torch.bfloat16, device='cuda')
+        k = torch.rand(2, 4, 12, 32, dtype=torch.bfloat16, device='cuda')
+        v = torch.rand(2, 4, 12, 32, dtype=torch.bfloat16, device='cuda')
+        out = CausalSelfAttention.execute_attention(q, k, v, dropout=0.0, attention_impl='dao_flash_v4')
+        torch.cuda.synchronize()
+        assert tuple(out.shape) == (2, 12, 4, 32)
+        print('ok')
+        """
+    )
+    assert result.stdout.strip().endswith("ok")
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="This test requires 1 GPU.")
+@pytest.mark.skipif(not FLASH_ATTN_V4_AVAILABLE, reason="FA4 not installed")
+def test_dao_flash_v4_forward_gqa_subprocess():
+    result = _run_fa4_subprocess(
+        """
+        import torch
+        from modalities.models.gpt2.gpt2_model import CausalSelfAttention
+
+        q = torch.rand(2, 8, 12, 32, dtype=torch.bfloat16, device='cuda')
+        k = torch.rand(2, 2, 12, 32, dtype=torch.bfloat16, device='cuda')
+        v = torch.rand(2, 2, 12, 32, dtype=torch.bfloat16, device='cuda')
+        out = CausalSelfAttention.execute_attention(q, k, v, dropout=0.0, attention_impl='dao_flash_v4')
+        torch.cuda.synchronize()
+        assert tuple(out.shape) == (2, 12, 8, 32)
+        print('ok')
+        """
+    )
+    assert result.stdout.strip().endswith("ok")
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="This test requires 1 GPU.")
+@pytest.mark.skipif(not FLASH_ATTN_V4_AVAILABLE, reason="FA4 not installed")
+def test_dao_flash_v4_qk_norm_subprocess():
+    result = _run_fa4_subprocess(
+        """
+        import torch
+        from modalities.models.gpt2.gpt2_model import (
+            AttentionConfig,
+            CausalSelfAttention,
+            LayerNorms,
+            LayerNormWrapperConfig,
+            PytorchRMSLayerNormConfig,
+        )
+
+        torch.manual_seed(0)
+        attention_config_no_norm = AttentionConfig(qkv_transforms=[])
+        attention_config_with_norm = AttentionConfig(
+            qkv_transforms=[],
+            qk_norm_config=LayerNormWrapperConfig(
+                norm_type=LayerNorms.pytorch_rms_norm,
+                config=PytorchRMSLayerNormConfig(normalized_shape=8),
+            ),
+        )
+
+        torch.manual_seed(0)
+        layer_no_norm = CausalSelfAttention(
+            4, 4, 32, attention_config_no_norm, 'dao_flash_v4', False, 0.0
+        ).cuda().bfloat16()
+        torch.manual_seed(0)
+        layer_with_norm = CausalSelfAttention(
+            4, 4, 32, attention_config_with_norm, 'dao_flash_v4', False, 0.0
+        ).cuda().bfloat16()
+        x = torch.rand((2, 9, 32), dtype=torch.bfloat16, device='cuda')
+        out_no_norm = layer_no_norm(x)
+        out_with_norm = layer_with_norm(x)
+        torch.cuda.synchronize()
+        assert out_no_norm.shape == out_with_norm.shape == (2, 9, 32)
+        assert not torch.allclose(out_no_norm, out_with_norm, atol=1e-6)
+        print('ok')
+        """
+    )
+    assert result.stdout.strip().endswith("ok")
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="This test requires 1 GPU.")
+@pytest.mark.skipif(not FLASH_ATTN_V4_AVAILABLE, reason="FA4 not installed")
+def test_dao_flash_v4_backward_approximate_equality_subprocess():
+    result = _run_fa4_subprocess(
+        """
+        import torch
+        from modalities.models.gpt2.gpt2_model import CausalSelfAttention
+
+        query_ref = torch.rand((2, 8, 12, 64), dtype=torch.bfloat16, device='cuda', requires_grad=True)
+        key_ref = torch.rand((2, 2, 12, 64), dtype=torch.bfloat16, device='cuda', requires_grad=True)
+        value_ref = torch.rand((2, 2, 12, 64), dtype=torch.bfloat16, device='cuda', requires_grad=True)
+
+        query_fa4 = query_ref.detach().clone().requires_grad_(True)
+        key_fa4 = key_ref.detach().clone().requires_grad_(True)
+        value_fa4 = value_ref.detach().clone().requires_grad_(True)
+
+        output_ref = CausalSelfAttention.execute_attention(
+            query_ref, key_ref, value_ref, dropout=0.0, attention_impl='pytorch_flash'
+        )
+        output_fa4 = CausalSelfAttention.execute_attention(
+            query_fa4, key_fa4, value_fa4, dropout=0.0, attention_impl='dao_flash_v4'
+        )
+        torch.testing.assert_close(output_ref, output_fa4, atol=2.5e-3, rtol=0.016)
+
+        output_ref.float().sum().backward()
+        output_fa4.float().sum().backward()
+        torch.cuda.synchronize()
+
+        torch.testing.assert_close(query_ref.grad, query_fa4.grad, atol=5e-3, rtol=0.02)
+        torch.testing.assert_close(key_ref.grad, key_fa4.grad, atol=5e-3, rtol=0.02)
+        torch.testing.assert_close(value_ref.grad, value_fa4.grad, atol=5e-3, rtol=0.02)
+        print('ok')
+        """
+    )
+    assert result.stdout.strip().endswith("ok")
+
+
+def _run_fa4_subprocess(code: str) -> subprocess.CompletedProcess[str]:
+    """Run flash attention 4 related code in a subprocess to isolate FA4's CUDA context
+    and avoid conflicts with other tests.
+    The code should print 'ok' if it runs successfully.
+    The function returns the CompletedProcess object,
+    which contains stdout and stderr for further inspection if needed.
+    TODO: This might be an A100 / SM80-specific issue, so we can consider removing this subprocess isolation
+    if we confirm that FA4 works well on newer architectures without it.
+    """
+    env = os.environ.copy()
+    existing_pythonpath = env.get("PYTHONPATH")
+    env["PYTHONPATH"] = f"{SRC_ROOT}:{existing_pythonpath}" if existing_pythonpath else str(SRC_ROOT)
+    return subprocess.run(
+        [sys.executable, "-c", textwrap.dedent(code)],
+        cwd=REPO_ROOT,
+        env=env,
+        check=True,
+        capture_output=True,
+        text=True,
+    )
````
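Because every FA4 test skips itself without a GPU or without `flash_attn.cute`, and all of them share the `dao_flash_v4` name prefix, they can be exercised in isolation. An illustrative invocation, assuming a CUDA host (`-k` simply filters by test name):

```sh
uv pip install --prerelease=allow flash-attn-4
pytest tests/models/test_causal_self_attention.py -k dao_flash_v4
```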
