Disable Triton flash attention by default (VLLM_USE_TRITON_FLASH_ATTN now defaults to False)

hliuca · hliuca · commit 1f947b59d942 · 2024-12-12T13:23:35.000-06:00
diff --git a/vllm/envs.py b/vllm/envs.py
@@ -14,7 +14,7 @@
     VLLM_ROCM_PREFER_TORCH: bool = False
     VLLM_ROCM_PREFER_TRITON: bool = True
     VLLM_USE_SDPA_ATTENTION: bool = False
-    VLLM_USE_TRITON_FLASH_ATTN: bool = True
+    VLLM_USE_TRITON_FLASH_ATTN: bool = False
     VLLM_USE_ROCM_SKINNY_GEMM: bool = True
     VLLM_USE_ROCM_CUSTOM_PAGED_ATTN: bool = True
     VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT: bool = True
@@ -227,7 +227,7 @@ def get_default_config_root():
 
     # flag to control if vllm should use triton flash attention
     "VLLM_USE_TRITON_FLASH_ATTN":
-    lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
+    lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "False").lower() in
              ("true", "1")),
 
     # Internal flag to enable Dynamo fullgraph capture