Commit 807930e

fix(attention): Workaround for flash attention 4 not playing nice with torch compile currently.
1 parent: 71bc205

3 files changed: 79 additions & 28 deletions
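The change has two halves, shown in the diffs below: the FlashAttention-4 kernel call moves into a module-level helper marked `@torch.compiler.disable`, so Dynamo never traces into the `flash_attn.cute` internals, and `ModelFactory` downgrades `fullgraph=True` to `fullgraph=False` for affected blocks, since the graph break that the helper introduces would otherwise be a hard error under `fullgraph=True`. As a minimal, self-contained sketch of the eager-escape pattern (standalone illustration, not code from this repository):

import torch

# @torch.compiler.disable excludes a function from Dynamo tracing: the compiled
# caller inserts a graph break and runs the call eagerly instead of failing while
# tracing library internals. torch.softmax stands in for the FA4 kernel here.
@torch.compiler.disable
def eager_kernel(x: torch.Tensor) -> torch.Tensor:
    return torch.softmax(x, dim=-1)

def block(x: torch.Tensor) -> torch.Tensor:
    return eager_kernel(x * 2.0) + 1.0

compiled = torch.compile(block)           # fine: the graph break is tolerated
# torch.compile(block, fullgraph=True)    # would raise, since disable() forces a break
print(compiled(torch.randn(2, 4)).shape)  # torch.Size([2, 4])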

src/modalities/models/gpt2/gpt2_model.py

Lines changed: 27 additions & 20 deletions
@@ -3,7 +3,7 @@
 from abc import abstractmethod
 from enum import Enum
 from importlib import import_module
-from typing import Annotated, Callable, Optional, overload
+from typing import Annotated, Callable, Optional, cast, overload
 
 import torch
 import torch.nn as nn
@@ -73,6 +73,31 @@ def _raise_flash_attn_v4_unavailable() -> None:
     raise NotImplementedError(error_message)
 
 
+@torch.compiler.disable
+def _execute_dao_flash_v4_eager(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
+    flash_attn_v4 = get_flash_attn_func_v4()
+    if flash_attn_v4 is None:
+        _raise_flash_attn_v4_unavailable()
+
+    output = flash_attn_v4(
+        q,
+        k,
+        v,
+        causal=True,
+        softmax_scale=None,
+        window_size=(None, None),
+    )
+    return _unwrap_flash_attention_output(cast(torch.Tensor | tuple[torch.Tensor, Optional[torch.Tensor]], output))
+
+
+def _unwrap_flash_attention_output(
+    output: torch.Tensor | tuple[torch.Tensor, Optional[torch.Tensor]],
+) -> torch.Tensor:
+    if isinstance(output, tuple):
+        return output[0]
+    return output
+
+
 class LayerNorms(LookupEnum):
     """
     Enum lookup class for LayerNorms.
@@ -698,7 +723,6 @@ def execute_attention(
             # Note, that the library is not required for the CPU-only tests.
             y = cls._execute_dao_flash_v2(q, k, v, dropout)
         elif attention_impl == AttentionImplementation.DAO_FLASH_V4:
-            flash_attn_v4 = get_flash_attn_func_v4()
             if cls._requires_backward(q, k, v) and torch.cuda.get_device_capability(q.device)[0] < 9:
                 y = cls._execute_dao_flash_v2(q, k, v, dropout)
             else:
@@ -708,28 +732,11 @@ def execute_attention(
                 q = q.transpose(1, 2).contiguous()  # (B, T, nh_q, hd)
                 k = k.transpose(1, 2).contiguous()  # (B, T, nh_kv, hd)
                 v = v.transpose(1, 2).contiguous()  # (B, T, nh_kv, hd)
-                y = cls._unwrap_flash_attention_output(
-                    flash_attn_v4(
-                        q,
-                        k,
-                        v,
-                        causal=True,
-                        softmax_scale=None,
-                        window_size=(None, None),
-                    )
-                )
+                y = _execute_dao_flash_v4_eager(q, k, v)
         else:
             raise NotImplementedError(f"Attention implementation {attention_impl} not supported")
         return y  # (B, T, nh_q, hd)
 
-    @staticmethod
-    def _unwrap_flash_attention_output(
-        output: torch.Tensor | tuple[torch.Tensor, Optional[torch.Tensor]],
-    ) -> torch.Tensor:
-        if isinstance(output, tuple):
-            return output[0]
-        return output
-
     @staticmethod
     def _execute_dao_flash_v2(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, dropout: float) -> torch.Tensor:
         q = q.transpose(1, 2).contiguous()  # (B, T, nh_q, hd)
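For reference, the `_execute_dao_flash_v4_eager` helper added above computes causal attention over tensors in (B, T, nh, hd) layout, which is why `execute_attention` transposes before the call. A rough functional equivalent using PyTorch's built-in SDPA (a sketch assuming equal query and key/value head counts; it is not the FA4 kernel):

import torch
import torch.nn.functional as F

# Reference semantics only. F.scaled_dot_product_attention expects (B, nh, T, hd),
# hence the transposes around the (B, T, nh, hd) layout used by the FA4 helper.
# Assumes nh_q == nh_kv; grouped-query attention would need head broadcasting.
def reference_causal_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    q, k, v = (t.transpose(1, 2) for t in (q, k, v))  # -> (B, nh, T, hd)
    y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
    return y.transpose(1, 2)  # back to (B, T, nh, hd)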

src/modalities/models/model_factory.py

Lines changed: 19 additions & 1 deletion
@@ -35,6 +35,7 @@
     GPT2LLM,
     AttentionConfig,
     AttentionImplementation,
+    GPT2Block,
     LayerNormWrapperConfig,
     PositionTypes,
     SwiGLU,
@@ -61,6 +62,14 @@
 class ModelFactory:
     """Model factory class to create models."""
 
+    @staticmethod
+    def _requires_graph_break_friendly_compile(module: nn.Module) -> bool:
+        if isinstance(module, GPT2Block):
+            return module.attn.attention_impl == AttentionImplementation.DAO_FLASH_V4
+
+        attention_impl = getattr(module, "attention_impl", None)
+        return attention_impl == AttentionImplementation.DAO_FLASH_V4
+
     @staticmethod
     def _is_model_on_meta_device(model: nn.Module) -> bool:
         """
@@ -402,7 +411,16 @@ def get_parent_module_and_child_name(child_module: nn.Module, model: nn.Module)
         for _, module in model.named_modules():
             if isinstance(module, block_types):
                 options = {"trace.enabled": True} if debug else {}
-                compiled_module = torch.compile(module, fullgraph=fullgraph, options=options)
+                compiled_fullgraph = fullgraph
+                if compiled_fullgraph and ModelFactory._requires_graph_break_friendly_compile(module):
+                    compiled_fullgraph = False
+                    logger.warning(
+                        "Disabling `fullgraph=True` for `%s` because FlashAttention-4 currently graph-breaks under "
+                        "torch.compile when tracing into flash_attn.cute internals.",
+                        module.__class__.__name__,
+                    )
+
+                compiled_module = torch.compile(module, fullgraph=compiled_fullgraph, options=options)
                 parent_module, child_name = get_parent_module_and_child_name(child_module=module, model=model)
                 parent_module.register_module(name=child_name, module=compiled_module)
         return model
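Net effect of the factory change: a user-requested `fullgraph=True` is downgraded, with a warning, only for blocks that use FA4, while every other block still compiles with a full graph. A condensed sketch of that decision (illustrative; the real code dispatches through `ModelFactory._requires_graph_break_friendly_compile` and the project's `AttentionImplementation` enum):

import logging

import torch
import torch.nn as nn

logger = logging.getLogger(__name__)

# Condensed sketch of the downgrade logic added above. The FA4 check is reduced
# to a plain string attribute comparison purely for illustration.
def compile_block(module: nn.Module, fullgraph: bool) -> nn.Module:
    uses_fa4 = getattr(module, "attention_impl", None) == "dao_flash_v4"
    if fullgraph and uses_fa4:
        fullgraph = False  # FA4 graph-breaks under torch.compile; fullgraph=True would raise
        logger.warning("Disabling fullgraph for %s", type(module).__name__)
    return torch.compile(module, fullgraph=fullgraph)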

tests/test_torch_compile.py

Lines changed: 33 additions & 7 deletions
@@ -1,8 +1,12 @@
 import copy
+from typing import Any, cast
 
 import pytest
+import torch
 import torch.nn as nn
+from _pytest.monkeypatch import MonkeyPatch
 
+from modalities.models.components.layer_norms import LayerNormConfig
 from modalities.models.gpt2.gpt2_model import (
     GPT2LLM,
     ActivationType,
@@ -12,27 +16,31 @@
     LayerNormWrapperConfig,
     PositionTypes,
     QueryKeyValueTransformType,
+    is_flash_attn_v4_available,
 )
 from modalities.models.model_factory import ModelFactory
 
 
-def create_gpt2_configs():
+def create_gpt2_configs() -> tuple[AttentionConfig, LayerNormWrapperConfig]:
     attention_config = AttentionConfig(
         qkv_transforms=[
             AttentionConfig.QueryKeyValueTransformConfig(
-                type_hint=QueryKeyValueTransformType.RotaryTransform.name,
+                type_hint=cast(Any, QueryKeyValueTransformType.RotaryTransform.name),
                 config=AttentionConfig.QueryKeyValueTransformConfig.RotaryTransformConfig(
                     n_embd=512, n_head=8, seq_length_dim=-2, base_freq=10000
                 ),
             )
         ]
     )
-    norm_config = LayerNormWrapperConfig(norm_type=LayerNorms.layer_norm, config={"normalized_shape": 512})
+    norm_config = LayerNormWrapperConfig(
+        norm_type=LayerNorms.layer_norm,
+        config=LayerNormConfig(normalized_shape=512, eps=1e-6, elementwise_affine=True, bias=True),
+    )
     return attention_config, norm_config
 
 
 @pytest.fixture
-def gpt2_model():
+def gpt2_model() -> GPT2LLM:
     attention_config, norm_config = create_gpt2_configs()
     model = GPT2LLM(
         sample_key="input_ids",
@@ -58,7 +66,7 @@ def gpt2_model():
     return model
 
 
-def test_get_compiled_model_compiles_blocks(gpt2_model):
+def test_get_compiled_model_compiles_blocks(gpt2_model: GPT2LLM) -> None:
     original_model = copy.deepcopy(gpt2_model)
     original_wte = gpt2_model.transformer.wte
     original_lm_head = gpt2_model.transformer.lm_head
@@ -79,7 +87,7 @@ def test_get_compiled_model_compiles_blocks(gpt2_model):
     assert result_model is gpt2_model, "Should return the same model instance"
 
 
-def test_get_compiled_model_no_matching_blocks(gpt2_model):
+def test_get_compiled_model_no_matching_blocks(gpt2_model: GPT2LLM) -> None:
     """
     Test that get_compiled_model raises a ValueError if no blocks match the specified types.
     """
@@ -88,10 +96,28 @@ def test_get_compiled_model_no_matching_blocks(gpt2_model):
         ModelFactory.get_compiled_model(gpt2_model, block_names=[block_name], fullgraph=True)
 
 
-def test_get_compiled_model_empty_block_names(gpt2_model):
+def test_get_compiled_model_empty_block_names(gpt2_model: GPT2LLM) -> None:
     original_model_dict = dict(gpt2_model.named_modules())
     result_model = ModelFactory.get_compiled_model(gpt2_model, block_names=[], fullgraph=True)
 
     new_model_dict = dict(result_model.named_modules())
     assert new_model_dict == original_model_dict, "Model should remain unchanged with empty block_names"
     assert result_model is gpt2_model, "Should return the same model instance"
+
+
+@pytest.mark.skipif(not is_flash_attn_v4_available(), reason="FA4 not installed")
+def test_get_compiled_model_disables_fullgraph_for_fa4(monkeypatch: MonkeyPatch, gpt2_model: GPT2LLM) -> None:
+    recorded_fullgraph_values: list[bool] = []
+
+    for block in gpt2_model.transformer.h.values():
+        block.attn.attention_impl = AttentionImplementation.DAO_FLASH_V4
+
+    def fake_compile(module: nn.Module, fullgraph: bool, options: dict[str, object]) -> nn.Module:
+        recorded_fullgraph_values.append(fullgraph)
+        return module
+
+    monkeypatch.setattr(torch, "compile", fake_compile)
+
+    ModelFactory.get_compiled_model(gpt2_model, ["GPT2Block"], fullgraph=True)
+
+    assert recorded_fullgraph_values == [False] * len(gpt2_model.transformer.h)
