README.md (5 changes: 4 additions & 1 deletion)
@@ -40,7 +40,10 @@ make -j
4. Build end-to-end operators with PyBind
```
# This will automatically build and link the operators
cd quest/ops
cd quest/quest-ops
bash setup.sh
cd -
cd quest/raas-ops
bash setup.sh
```

benchmarks/evals/e2e/main.py (77 changes: 69 additions & 8 deletions)
@@ -49,12 +49,18 @@ class EvalConfigs:
    all_approaches: List[str] = field(
        default_factory=lambda: [
            "full",
            "full_optimized",
            "sink-64",
            "sink-128",
            "sink-256",
            "sink-512",
            "sink-1024",
            "h2o-64",
            "sink_optimized-64",
            "sink_optimized-128",
            "sink_optimized-256",
            "sink_optimized-512",
            "sink_optimized-1024",
"h2o-84",
"h2o-128",
"h2o-256",
"h2o-512",
@@ -64,11 +64,21 @@ class EvalConfigs:
"quest-256",
"quest-512",
"quest-1024",
"quest_optimized-64",
"quest_optimized-128",
"quest_optimized-256",
"quest_optimized-512",
"quest_optimized-1024",
"raas-64",
"raas-128",
"raas-256",
"raas-512",
"raas-1024",
"raas_optimized-64",
"raas_optimized-128",
"raas_optimized-256",
"raas_optimized-512",
"raas_optimized-1024",
]
)
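
The approach identifiers above follow a `<method>[_optimized][-<cache budget>]` convention: the suffix after the dash is the KV-cache budget in tokens, and `_optimized` selects the fused/optimized model variants added by this PR. As a reading aid only, here is a small sketch of how such a string decomposes, mirroring the `split("-")[-1]` and substring checks used in `load_model_for_approach` below (the helper name is ours, not part of the patch):

```python
# Hypothetical helper, for illustration only: the patch inlines this logic
# with substring checks and approach_name.split("-")[-1].
def parse_approach(approach: str):
    """Return (method, optimized, cache_budget) for e.g. 'raas_optimized-512'."""
    name, _, budget = approach.partition("-")   # "raas_optimized", "512"
    optimized = "optimized" in name              # picks the fused/optimized model variants
    method = name.replace("_optimized", "")      # "full", "sink", "h2o", "quest", "raas"
    return method, optimized, int(budget) if budget else None

assert parse_approach("raas_optimized-512") == ("raas", True, 512)
assert parse_approach("full") == ("full", False, None)
```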

@@ -184,15 +200,26 @@ def load_model_for_approach(self, model_name: str, approach_name: str) -> AutoMo

        model_config = self.configs.model_config
        if model_config.model_type == "llama":
            from transformers import LlamaForCausalLM

            if approach_name == "full" or "sink" in approach_name: # They differ only in cache type
            optimized = ("optimized" in approach_name)

            if ("full" in approach_name or "sink" in approach_name) and not optimized: # They differ only in cache type
                from transformers import LlamaForCausalLM
                model = LlamaForCausalLM.from_pretrained(
                    model_name,
                    device_map="cuda:0",
                    trust_remote_code=True,
                )
            elif ("full" in approach_name or "sink" in approach_name) and optimized:
                from quest.models.full_llama_optimized import LlamaForCausalLM
                model = LlamaForCausalLM.from_pretrained(
                    model_name,
                    device_map="cuda:0",
                    trust_remote_code=True,
                    torch_dtype=torch.float16, # Use float16 for optimized version
                )
            elif "h2o" in approach_name:
                from transformers import LlamaForCausalLM
                from quest.models.h2o_llama import enable_h2o_attention_eval

                model = LlamaForCausalLM.from_pretrained(
@@ -204,9 +231,25 @@ def load_model_for_approach(self, model_name: str, approach_name: str) -> AutoMo
                    model,
                    {"cache_budget": int(approach_name.split("-")[-1])},
                )
            elif "quest" in approach_name:
            elif "quest" in approach_name and optimized:
                from quest.models.quest_llama_optimized import LlamaForCausalLM
                from quest.models.quest_llama_optimized import enable_quest_attention_eval
                model = LlamaForCausalLM.from_pretrained(
                    model_name,
                    device_map="cuda:0",
                    trust_remote_code=True,
                    torch_dtype=torch.float16, # Use float16 for optimized version
                )
                enable_quest_attention_eval(
                    model,
                    {
                        "cache_budget": int(approach_name.split("-")[-1]),
                        "page_size": 16, # Fixed as stated in the paper
                    },
                )
            elif "quest" in approach_name and not optimized:
                from transformers import LlamaForCausalLM
                from quest.models.quest_llama import enable_quest_attention_eval

                model = LlamaForCausalLM.from_pretrained(
                    model_name,
                    device_map="cuda:0",
Expand All @@ -219,7 +262,24 @@ def load_model_for_approach(self, model_name: str, approach_name: str) -> AutoMo
"page_size": 16, # Fixed as stated in the paper
},
)
elif "raas" in approach_name:
elif "raas" in approach_name and optimized:
from quest.models.raas_llama_optimized import LlamaForCausalLM
from quest.models.raas_llama_optimized import enable_raas_attention_eval
model = LlamaForCausalLM.from_pretrained(
model_name,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.float16, # Use float16 for optimized version
)
enable_raas_attention_eval(
model,
{
"cache_budget": int(approach_name.split("-")[-1]),
"page_size": 16, # Fixed as stated in the paper
},
)
elif "raas" in approach_name and not optimized:
from transformers import LlamaForCausalLM
from quest.models.raas_llama import enable_raas_attention_eval

model = LlamaForCausalLM.from_pretrained(
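
For the llama path, the patch turns model loading into a dispatch on the approach string: baseline variants keep the stock `transformers.LlamaForCausalLM`, `*_optimized` variants import a drop-in `LlamaForCausalLM` from `quest.models.*_llama_optimized` and load it in float16, and the h2o/quest/raas families additionally call their `enable_*_attention_eval` hook with the parsed cache budget (plus a fixed page size of 16 for quest and raas). Below is a condensed sketch of that pattern for the quest family under those assumptions; it is an illustration, not a verbatim refactor of the patch.

```python
# Condensed sketch of the per-family pattern in load_model_for_approach,
# shown for quest; raas follows the same shape. Module paths, call
# signatures, and the fp16 choice are taken from the diff above.
import torch

def load_quest_llama(model_name: str, approach_name: str):
    optimized = "optimized" in approach_name
    cache_budget = int(approach_name.split("-")[-1])

    if optimized:
        # Optimized variant: project-specific LlamaForCausalLM, loaded in float16.
        from quest.models.quest_llama_optimized import LlamaForCausalLM, enable_quest_attention_eval
    else:
        # Baseline variant: stock HF model, patched in place by the quest hook.
        from transformers import LlamaForCausalLM
        from quest.models.quest_llama import enable_quest_attention_eval

    kwargs = dict(device_map="cuda:0", trust_remote_code=True)
    if optimized:
        kwargs["torch_dtype"] = torch.float16  # use float16 for the optimized version
    model = LlamaForCausalLM.from_pretrained(model_name, **kwargs)

    enable_quest_attention_eval(
        model,
        {"cache_budget": cache_budget, "page_size": 16},  # page size fixed at 16
    )
    return model
```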
@@ -337,7 +397,7 @@ def test_model(
        cache_position = torch.arange(input_ids.shape[1], dtype=torch.int64, device="cuda:0")

        # Initialize the cache
        if self.configs.approach == "full":
        if self.configs.approach in ["full", "full_optimized"]:
            past_key_values = DynamicCache()
        elif "sink" in self.configs.approach:
            cache_budget = int(self.configs.approach.split("-")[-1])
@@ -355,7 +415,6 @@ def test_model(

            cache_budget = int(self.configs.approach.split("-")[-1])
            past_key_values = RaaSCache(page_size=16, cache_budget=cache_budget)

        with torch.no_grad():

            # Prefill
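
In `test_model`, the cache object is likewise chosen from the approach string before prefill: `DynamicCache` for `full`/`full_optimized`, `SinkCache` for the sink variants, and the project's `RaaSCache` with a page size of 16 for raas. A minimal sketch of that selection follows; the `SinkCache` constructor arguments and the `RaaSCache` import path are assumptions, since only the budget parsing is visible in these hunks.

```python
# Sketch of the cache selection in test_model. DynamicCache and SinkCache are
# the Hugging Face cache classes; RaaSCache is the project's cache. The
# SinkCache arguments and the RaaSCache import path are assumed, not shown
# in the diff.
from transformers import DynamicCache, SinkCache
from quest.models.raas_cache import RaaSCache  # import path assumed

def make_cache(approach: str):
    if approach in ["full", "full_optimized"]:
        return DynamicCache()
    cache_budget = int(approach.split("-")[-1])
    if "sink" in approach:
        return SinkCache(window_length=cache_budget, num_sink_tokens=4)  # args assumed
    if "raas" in approach:
        return RaaSCache(page_size=16, cache_budget=cache_budget)
    return DynamicCache()  # h2o/quest branches are not visible in the diff; default assumed
```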
@@ -407,6 +466,8 @@ def test_model(
        JCT = prefill_time + np.sum(decode_time)
        TPOT = np.sum(decode_time) / num_decode

        if "optimized" in self.configs.approach:
            pipe.model.reset_model()
        model_output = pipe.tokenizer.decode(generated_content, skip_special_tokens=True)
        return model_output, TTFT, JCT, TPOT, num_decode
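
The latency metrics returned at the end of `test_model` come straight from the prefill time and the list of per-token decode times: `JCT` (job completion time) is prefill plus total decode time, and `TPOT` (time per output token) is the mean decode time; `TTFT` (time to first token) is returned as well, and although its assignment is not visible in this hunk it conventionally equals the prefill latency. The optimized variants additionally call `pipe.model.reset_model()` to clear per-run state before decoding the output text. A small self-contained sketch of the metric computation:

```python
import numpy as np

def latency_metrics(prefill_time: float, decode_time: list):
    """Mirror the metric formulas used in test_model."""
    num_decode = len(decode_time)
    TTFT = prefill_time                        # time to first token (assumed = prefill latency)
    JCT = prefill_time + np.sum(decode_time)   # job completion time
    TPOT = np.sum(decode_time) / num_decode    # mean time per output token
    return TTFT, JCT, TPOT, num_decode

# Example: 1.2 s prefill plus four decode steps of 50 ms each
# -> TTFT = 1.2 s, JCT = 1.4 s, TPOT = 0.05 s
print(latency_metrics(1.2, [0.05, 0.05, 0.05, 0.05]))
```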
