README.md (5 changes: 4 additions & 1 deletion)
@@ -40,7 +40,10 @@ make -j
4. Build end-to-end operators with PyBind
```
# This will automatically build and link the operators
cd quest/ops
cd quest/quest-ops
bash setup.sh
cd -
cd quest/raas-ops
bash setup.sh
```

benchmarks/evals/e2e/main.py (77 changes: 69 additions & 8 deletions)
@@ -49,12 +49,18 @@ class EvalConfigs:
    all_approaches: List[str] = field(
        default_factory=lambda: [
            "full",
            "full_optimized",
            "sink-64",
            "sink-128",
            "sink-256",
            "sink-512",
            "sink-1024",
            "h2o-64",
            "sink_optimized-64",
            "sink_optimized-128",
            "sink_optimized-256",
            "sink_optimized-512",
            "sink_optimized-1024",
"h2o-84",
"h2o-128",
"h2o-256",
"h2o-512",
@@ -64,11 +64,21 @@ class EvalConfigs:
"quest-256",
"quest-512",
"quest-1024",
"quest_optimized-64",
"quest_optimized-128",
"quest_optimized-256",
"quest_optimized-512",
"quest_optimized-1024",
"raas-64",
"raas-128",
"raas-256",
"raas-512",
"raas-1024",
"raas_optimized-64",
"raas_optimized-128",
"raas_optimized-256",
"raas_optimized-512",
"raas_optimized-1024",
]
)
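
The approach identifiers above follow a `<method>[_optimized][-<cache budget>]` convention: the suffix after the dash is the KV-cache budget in tokens, and `_optimized` selects the fused/optimized model variants added by this PR. As a reading aid only, here is a small sketch of how such a string decomposes, mirroring the `split("-")[-1]` and substring checks used in `load_model_for_approach` below (the helper name is ours, not part of the patch):

```python
# Hypothetical helper, for illustration only: the patch inlines this logic
# with substring checks and approach_name.split("-")[-1].
def parse_approach(approach: str):
    """Return (method, optimized, cache_budget) for e.g. 'raas_optimized-512'."""
    name, _, budget = approach.partition("-")   # "raas_optimized", "512"
    optimized = "optimized" in name              # picks the fused/optimized model variants
    method = name.replace("_optimized", "")      # "full", "sink", "h2o", "quest", "raas"
    return method, optimized, int(budget) if budget else None

assert parse_approach("raas_optimized-512") == ("raas", True, 512)
assert parse_approach("full") == ("full", False, None)
```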

@@ -184,15 +200,26 @@ def load_model_for_approach(self, model_name: str, approach_name: str) -> AutoMo

        model_config = self.configs.model_config
        if model_config.model_type == "llama":
            from transformers import LlamaForCausalLM

            if approach_name == "full" or "sink" in approach_name: # They differ only in cache type
            optimized = ("optimized" in approach_name)

            if ("full" in approach_name or "sink" in approach_name) and not optimized: # They differ only in cache type
                from transformers import LlamaForCausalLM
                model = LlamaForCausalLM.from_pretrained(
                    model_name,
                    device_map="cuda:0",
                    trust_remote_code=True,
                )
            elif ("full" in approach_name or "sink" in approach_name) and optimized:
                from quest.models.full_llama_optimized import LlamaForCausalLM
                model = LlamaForCausalLM.from_pretrained(
                    model_name,
                    device_map="cuda:0",
                    trust_remote_code=True,
                    torch_dtype=torch.float16, # Use float16 for optimized version
                )
            elif "h2o" in approach_name:
                from transformers import LlamaForCausalLM
                from quest.models.h2o_llama import enable_h2o_attention_eval

                model = LlamaForCausalLM.from_pretrained(
@@ -204,9 +231,25 @@ def load_model_for_approach(self, model_name: str, approach_name: str) -> AutoMo
                    model,
                    {"cache_budget": int(approach_name.split("-")[-1])},
                )
            elif "quest" in approach_name:
            elif "quest" in approach_name and optimized:
                from quest.models.quest_llama_optimized import LlamaForCausalLM
                from quest.models.quest_llama_optimized import enable_quest_attention_eval
                model = LlamaForCausalLM.from_pretrained(
                    model_name,
                    device_map="cuda:0",
                    trust_remote_code=True,
                    torch_dtype=torch.float16, # Use float16 for optimized version
                )
                enable_quest_attention_eval(
                    model,
                    {
                        "cache_budget": int(approach_name.split("-")[-1]),
                        "page_size": 16, # Fixed as stated in the paper
                    },
                )
            elif "quest" in approach_name and not optimized:
                from transformers import LlamaForCausalLM
                from quest.models.quest_llama import enable_quest_attention_eval

                model = LlamaForCausalLM.from_pretrained(
                    model_name,
                    device_map="cuda:0",
Expand All @@ -219,7 +262,24 @@ def load_model_for_approach(self, model_name: str, approach_name: str) -> AutoMo
"page_size": 16, # Fixed as stated in the paper
},
)
elif "raas" in approach_name:
elif "raas" in approach_name and optimized:
from quest.models.raas_llama_optimized import LlamaForCausalLM
from quest.models.raas_llama_optimized import enable_raas_attention_eval
model = LlamaForCausalLM.from_pretrained(
model_name,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.float16, # Use float16 for optimized version
)
enable_raas_attention_eval(
model,
{
"cache_budget": int(approach_name.split("-")[-1]),
"page_size": 16, # Fixed as stated in the paper
},
)
elif "raas" in approach_name and not optimized:
from transformers import LlamaForCausalLM
from quest.models.raas_llama import enable_raas_attention_eval

model = LlamaForCausalLM.from_pretrained(
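
For the llama path, the patch turns model loading into a dispatch on the approach string: baseline variants keep the stock `transformers.LlamaForCausalLM`, `*_optimized` variants import a drop-in `LlamaForCausalLM` from `quest.models.*_llama_optimized` and load it in float16, and the h2o/quest/raas families additionally call their `enable_*_attention_eval` hook with the parsed cache budget (plus a fixed page size of 16 for quest and raas). Below is a condensed sketch of that pattern for the quest family under those assumptions; it is an illustration, not a verbatim refactor of the patch.

```python
# Condensed sketch of the per-family pattern in load_model_for_approach,
# shown for quest; raas follows the same shape. Module paths, call
# signatures, and the fp16 choice are taken from the diff above.
import torch

def load_quest_llama(model_name: str, approach_name: str):
    optimized = "optimized" in approach_name
    cache_budget = int(approach_name.split("-")[-1])

    if optimized:
        # Optimized variant: project-specific LlamaForCausalLM, loaded in float16.
        from quest.models.quest_llama_optimized import LlamaForCausalLM, enable_quest_attention_eval
    else:
        # Baseline variant: stock HF model, patched in place by the quest hook.
        from transformers import LlamaForCausalLM
        from quest.models.quest_llama import enable_quest_attention_eval

    kwargs = dict(device_map="cuda:0", trust_remote_code=True)
    if optimized:
        kwargs["torch_dtype"] = torch.float16  # use float16 for the optimized version
    model = LlamaForCausalLM.from_pretrained(model_name, **kwargs)

    enable_quest_attention_eval(
        model,
        {"cache_budget": cache_budget, "page_size": 16},  # page size fixed at 16
    )
    return model
```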
@@ -337,7 +397,7 @@ def test_model(
        cache_position = torch.arange(input_ids.shape[1], dtype=torch.int64, device="cuda:0")

        # Initialize the cache
        if self.configs.approach == "full":
        if self.configs.approach in ["full", "full_optimized"]:
            past_key_values = DynamicCache()
        elif "sink" in self.configs.approach:
            cache_budget = int(self.configs.approach.split("-")[-1])
@@ -355,7 +415,6 @@ def test_model(

            cache_budget = int(self.configs.approach.split("-")[-1])
            past_key_values = RaaSCache(page_size=16, cache_budget=cache_budget)

        with torch.no_grad():

            # Prefill
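
In `test_model`, the cache object is likewise chosen from the approach string before prefill: `DynamicCache` for `full`/`full_optimized`, `SinkCache` for the sink variants, and the project's `RaaSCache` with a page size of 16 for raas. A minimal sketch of that selection follows; the `SinkCache` constructor arguments and the `RaaSCache` import path are assumptions, since only the budget parsing is visible in these hunks.

```python
# Sketch of the cache selection in test_model. DynamicCache and SinkCache are
# the Hugging Face cache classes; RaaSCache is the project's cache. The
# SinkCache arguments and the RaaSCache import path are assumed, not shown
# in the diff.
from transformers import DynamicCache, SinkCache
from quest.models.raas_cache import RaaSCache  # import path assumed

def make_cache(approach: str):
    if approach in ["full", "full_optimized"]:
        return DynamicCache()
    cache_budget = int(approach.split("-")[-1])
    if "sink" in approach:
        return SinkCache(window_length=cache_budget, num_sink_tokens=4)  # args assumed
    if "raas" in approach:
        return RaaSCache(page_size=16, cache_budget=cache_budget)
    return DynamicCache()  # h2o/quest branches are not visible in the diff; default assumed
```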
@@ -407,6 +466,8 @@ def test_model(
        JCT = prefill_time + np.sum(decode_time)
        TPOT = np.sum(decode_time) / num_decode

        if "optimized" in self.configs.approach:
            pipe.model.reset_model()
        model_output = pipe.tokenizer.decode(generated_content, skip_special_tokens=True)
        return model_output, TTFT, JCT, TPOT, num_decode
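
The latency metrics returned at the end of `test_model` come straight from the prefill time and the list of per-token decode times: `JCT` (job completion time) is prefill plus total decode time, and `TPOT` (time per output token) is the mean decode time; `TTFT` (time to first token) is returned as well, and although its assignment is not visible in this hunk it conventionally equals the prefill latency. The optimized variants additionally call `pipe.model.reset_model()` to clear per-run state before decoding the output text. A small self-contained sketch of the metric computation:

```python
import numpy as np

def latency_metrics(prefill_time: float, decode_time: list):
    """Mirror the metric formulas used in test_model."""
    num_decode = len(decode_time)
    TTFT = prefill_time                        # time to first token (assumed = prefill latency)
    JCT = prefill_time + np.sum(decode_time)   # job completion time
    TPOT = np.sum(decode_time) / num_decode    # mean time per output token
    return TTFT, JCT, TPOT, num_decode

# Example: 1.2 s prefill plus four decode steps of 50 ms each
# -> TTFT = 1.2 s, JCT = 1.4 s, TPOT = 0.05 s
print(latency_metrics(1.2, [0.05, 0.05, 0.05, 0.05]))
```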
