Client-side validation: reject fp8 context fmha without an fp8 kv cache (#1302)

joostinyi · web-flow · commit 9a029be22644 · 2025-01-08T15:38:05.000-08:00
* client-side validation: using fp8 context fmha requires the fp8 kv cache dtype

* enable chunked context by default
diff --git a/truss/base/trt_llm_config.py b/truss/base/trt_llm_config.py
@@ -81,7 +81,7 @@ class TrussSpecDecMode(str, Enum):
 
 class TrussTRTLLMRuntimeConfiguration(BaseModel):
     kv_cache_free_gpu_mem_fraction: float = 0.9
-    enable_chunked_context: bool = False
+    enable_chunked_context: bool = True
     batch_scheduler_policy: TrussTRTLLMBatchSchedulerPolicy = (
         TrussTRTLLMBatchSchedulerPolicy.GUARANTEED_NO_EVICT
     )
@@ -135,6 +135,11 @@ def _validate_kv_cache_flags(self):
             and not self.plugin_configuration.use_paged_context_fmha
         ):
             raise ValueError("Using fp8 context fmha requires paged context fmha")
+        if (
+            self.plugin_configuration.use_fp8_context_fmha
+            and not self.quantization_type == TrussTRTLLMQuantizationType.FP8_KV
+        ):
+            raise ValueError("Using fp8 context fmha requires fp8 kv cache dtype")
         return self
 
     def _validate_speculator_config(self):
diff --git a/truss/tests/test_config.py b/truss/tests/test_config.py
@@ -465,6 +465,22 @@ def test_plugin_paged_fp8_context_fmha_check(trtllm_config):
         TrussConfig.from_dict(trtllm_config)
 
 
+def test_fp8_context_fmha_check_kv_dtype(trtllm_config):
+    trtllm_config["trt_llm"]["build"]["plugin_configuration"] = {
+        "paged_kv_cache": True,
+        "use_paged_context_fmha": True,
+        "use_fp8_context_fmha": True,
+    }
+    trtllm_config["trt_llm"]["build"]["quantization_type"] = (
+        TrussTRTLLMQuantizationType.FP8_KV.value
+    )
+    TrussConfig.from_dict(trtllm_config)
+
+    del trtllm_config["trt_llm"]["build"]["quantization_type"]
+    with pytest.raises(ValueError):
+        TrussConfig.from_dict(trtllm_config)
+
+
 @pytest.mark.parametrize("verbose, expect_equal", [(False, True), (True, False)])
 def test_to_dict_trtllm(
     verbose,