DerekHJH · Kenaz123 · Jan 26, 2025
diff --git a/benchmarks/evals/e2e/main.py b/benchmarks/evals/e2e/main.py
@@ -47,6 +47,7 @@ class EvalConfigs:
     all_approaches: List[str] = field(
         default_factory=lambda: [
             "full",
+            "full_optimized",  # full attention via the optimized Llama model (see load_model_for_approach)
             "sink-64",
             "sink-128",
             "sink-256",
@@ -172,7 +173,8 @@ class and save the dataset.
         dataset: Data_set = str2class[dataset_name](
             tokenizer=tokenizer,
             path=self.configs.result_path,
-            tot_num_data=self.configs.tot_num_data,
+            # Always honor the configured dataset size; do not hard-code a debug cap here.
+            tot_num_data=self.configs.tot_num_data,
         )
         dataset.save_dataset(self.configs.result_path)
 
@@ -197,6 +199,14 @@ def load_model_for_approach(self, model_name: str, approach_name: str) -> AutoMo
                     device_map="cuda:0",
                     trust_remote_code=True,
                 )
+            elif approach_name == "full_optimized":  # dispatch on the approach name itself
+                from quest.models.full_llama_optimized import LlamaForCausalLM
+                model = LlamaForCausalLM.from_pretrained(
+                    model_name,
+                    device_map="cuda:0",
+                    trust_remote_code=True,
+                    torch_dtype=torch.float16,  # Use float16 for the optimized version
+                )
             elif "h2o" in approach_name:
                 from transformers import LlamaForCausalLM
                 from quest.models.h2o_llama import enable_h2o_attention_eval
@@ -337,6 +347,10 @@ def run_inference(self, pipe: Pipeline, dataset: Data_set) -> Data_set:
             results[f"JCT_{self.configs.approach}"].append(JCT)
             results[f"TPOT_{self.configs.approach}"].append(TPOT)
             results[f"num_decode_{self.configs.approach}"].append(num_decode)
+            # Log this sample's prompt, reference answer, model output and timing metrics.
+            logger.info(
+                f"Prompt: {prompt}\nAnswer: {answer}\nOutput: {model_output}\nTTFT: {TTFT:.2f} s\nJCT: {JCT:.2f} s\nTPOT: {TPOT:.2f} s\nNum_decode: {num_decode}"
+            )
         dataset.update(results)
         dataset.save_dataset(self.configs.result_path)
 
@@ -360,7 +374,7 @@ def test_model(
         cache_position = torch.arange(input_ids.shape[1], dtype=torch.int64, device="cuda:0")
 
         # Initialize the cache
-        if self.configs.approach == "full":
+        if self.configs.approach in ("full", "full_optimized"):  # both full-attention variants use a DynamicCache
             past_key_values = DynamicCache()
         elif "sink" in self.configs.approach:
             cache_budget = int(self.configs.approach.split("-")[-1])

diff --git a/benchmarks/evals/e2e/test.sh b/benchmarks/evals/e2e/test.sh
@@ -1,7 +1,7 @@
 
-dataset="aime"
+dataset="math500"
 model="peiyi9979/mistral-7b-sft"
-approach="h2o-84"
+approach="full_optimized"  # presumably must be a name from EvalConfigs.all_approaches in main.py -- confirm
 
 command="python3 main.py --dataset ${dataset} --model ${model} --approach ${approach}"
 echo "Running command: ${command}"