 from transformers.cache_utils import Cache
 from transformers.models.llama.configuration_llama import LlamaConfig
 from flash_attn import flash_attn_func
+from flash_attn.cute import flash_attn_func as flash_attn_func_v4

 from specforge.modeling.draft.flex_attention import (
     compile_friendly_create_block_mask,
@@ -425,7 +426,6 @@ def yarn_linear_ramp_mask(min_val, max_val, dim):


 class LlamaYarnRotaryEmbedding(LlamaRotaryEmbedding):
-
     def __init__(
         self,
         dim,
@@ -850,6 +850,10 @@ class LlamaFlashAttention(LlamaAttention):
     - cache_hidden: manual cache used for storing past key and value states
     """

+    def __init__(self, config, backend="fa"):
+        super().__init__(config)
+        self.backend = backend
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -866,9 +870,7 @@ def forward(
         key_states = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)

-        query_states = query_states.view(
-            bsz, q_len, self.num_heads, self.head_dim
-        )
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
         key_states = key_states.view(
             bsz, q_len, self.num_key_value_heads, self.head_dim
         )
@@ -908,7 +910,12 @@ def forward(
             k0 = cache_k[0]
             v0 = cache_v[0]

-            attn_output, lse, _ = flash_attn_func(
+            if self.backend == "fa4":
+                attn_func = flash_attn_func_v4
+            else:
+                attn_func = flash_attn_func
+
+            attn_output, lse, _ = attn_func(
                 query_states,
                 k0,
                 v0,
@@ -921,7 +928,13 @@ def forward(

             lck = len(cache_k)
             if lck > 1:
-                q_shape_expanded = (bsz, q_len, self.num_key_value_heads, self.num_key_value_groups, self.head_dim)
+                q_shape_expanded = (
+                    bsz,
+                    q_len,
+                    self.num_key_value_heads,
+                    self.num_key_value_groups,
+                    self.head_dim,
+                )
                 attn_outputs = [attn_output.view(q_shape_expanded)]
                 lses = [lse.view(q_shape_expanded[:-1])]

@@ -1022,7 +1035,9 @@ def __init__(self, config, attention_backend: str = "sdpa"):
             print_with_rank("Using flex attention on draft model training!")
             self.self_attn = LlamaFlexAttention(config=config)
         elif attention_backend == "fa":
-            self.self_attn = LlamaFlashAttention(config=config)
+            self.self_attn = LlamaFlashAttention(config=config, backend="fa")
+        elif attention_backend == "fa4":
+            self.self_attn = LlamaFlashAttention(config=config, backend="fa4")
         else:
             raise ValueError(f"Unknown attention backend {attention_backend}")

@@ -1092,7 +1107,6 @@ def forward(


 class LlamaForCausalLMEagle3(Eagle3DraftModel):
-
     config_class = LlamaConfig

     def __init__(self, config, quant_config=None, attention_backend="sdpa") -> None:
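Usage note (not part of the commit): a minimal sketch of selecting the new backend. The module path for LlamaForCausalLMEagle3 is an assumption, and "fa4" presumes a flash_attn build that ships the CuTe interface (flash_attn.cute); the option simply routes LlamaFlashAttention to flash_attn_func_v4 as shown above.

# Sketch only; the import path below is assumed, adjust to wherever
# LlamaForCausalLMEagle3 lives in specforge.
from transformers.models.llama.configuration_llama import LlamaConfig
from specforge.modeling.draft.llama3_eagle import LlamaForCausalLMEagle3  # hypothetical path

config = LlamaConfig(
    hidden_size=2048,
    num_attention_heads=32,
    num_key_value_heads=8,
)

# attention_backend defaults to "sdpa"; "fa" keeps the existing flash_attn kernel,
# while "fa4" requires flash_attn.cute to be importable in the environment.
draft_model = LlamaForCausalLMEagle3(config, attention_backend="fa4")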