
Commit 5fe2ff3

Add longbench benchmark
1 parent 4100647 commit 5fe2ff3

10 files changed, +601 -2 lines changed

.flake8

+2
@@ -3,5 +3,7 @@ max-line-length = 120
 per-file-ignores =
     __init__.py:F401
     evaluation/infinite_bench/create_huggingface_dataset.py:E501
+    evaluation/longbench/create_huggingface_dataset.py:E501
+    evaluation/longbenchv2/create_huggingface_dataset.py:E501
 # E203, W503 - black-compatible config
 extend-ignore = E203, W503

evaluation/README.md

+2
@@ -5,6 +5,8 @@ This directory contains a set of scripts to evaluate the performance of differen
 - [RULER](ruler/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/ruler))
 - [Zero Scrolls](zero_scrolls/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/zero_scrolls))
 - [Infinitebench](infinite_bench/README.md) ([hf link](https://huggingface.co/datasets/MaxJeblick/InfiniteBench))
+- [longbench](longbench/README.md) ([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench))
+- [longbench-v2](longbenchv2/README.md) ([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2))


 Please refer to the README of each dataset for more information on how the Hugging Face dataset was generated.

evaluation/evaluate.py

+29 -1
@@ -10,6 +10,9 @@
 from datasets import load_dataset
 from fire import Fire
 from infinite_bench.calculate_metrics import calculate_metrics as infinite_bench_scorer
+from longbench.calculate_metrics import calculate_metrics as longbench_scorer
+from longbench.calculate_metrics import calculate_metrics_e as longbench_scorer_e
+from longbenchv2.calculate_metrics import calculate_metrics as longbenchv2_scorer
 from loogle.calculate_metrics import calculate_metrics as loogle_scorer
 from ruler.calculate_metrics import calculate_metrics as ruler_scorer
 from tqdm import tqdm
@@ -19,6 +22,7 @@
 from kvpress import (
     AdaKVPress,
     ChunkKVPress,
+    ComposedPress,
     CriticalAdaKVPress,
     CriticalKVPress,
     DuoAttentionPress,
@@ -39,13 +43,19 @@
     "ruler": "simonjegou/ruler",
     "zero_scrolls": "simonjegou/zero_scrolls",
     "infinitebench": "MaxJeblick/InfiniteBench",
+    "longbench": "Xnhyacinth/LongBench",
+    "longbench-e": "Xnhyacinth/LongBench",
+    "longbench-v2": "Xnhyacinth/LongBench-v2",
 }

 SCORER_DICT = {
     "loogle": loogle_scorer,
     "ruler": ruler_scorer,
     "zero_scrolls": zero_scrolls_scorer,
     "infinitebench": infinite_bench_scorer,
+    "longbench": longbench_scorer,
+    "longbench-e": longbench_scorer_e,
+    "longbench-v2": longbenchv2_scorer,
 }

 PRESS_DICT = {
@@ -66,6 +76,8 @@
     "tova": TOVAPress(),
     "duo_attention": DuoAttentionPress(),
     "chunkkv": ChunkKVPress(press=SnapKVPress(), chunk_length=20),
+    "snap_think": ComposedPress([SnapKVPress(), ThinKPress()]),
+    "full_kv": ExpectedAttentionPress(0.0),
 }


@@ -80,6 +92,7 @@ def evaluate(
     max_new_tokens: Optional[int] = None,
     max_context_length: Optional[int] = None,
     compress_questions: bool = False,
+    key_channel_compression_ratio: float = 0.5,
 ):
     """
     Evaluate a model on a dataset using a press and save the results
@@ -106,6 +119,8 @@ def evaluate(
         Maximum number of tokens to use in the context. By default will use the maximum length supported by the model.
     compress_questions : bool, optional
         Whether to compress the questions as well, by default False
+    key_channel_compression_ratio : float, optional
+        Key channel compression ratio used by ThinKPress, by default 0.5
     """

     assert dataset in DATASET_DICT, f"No dataset found for {dataset}"
@@ -146,6 +161,20 @@ def evaluate(

     if isinstance(press, (DuoAttentionPress)):
         press.head_compression_ratio = compression_ratio
+    elif isinstance(press, (ComposedPress)):
+        for ps in press.presses:
+            if isinstance(ps, (ThinKPress)):
+                ps.key_channel_compression_ratio = key_channel_compression_ratio
+                save_filename = save_filename.with_name(
+                    save_filename.stem + f"__channel{key_channel_compression_ratio}" + save_filename.suffix
+                )
+            else:
+                ps.compression_ratio = compression_ratio  # type:ignore[attr-defined]
+    elif isinstance(press, (ThinKPress)):
+        press.key_channel_compression_ratio = key_channel_compression_ratio
+        save_filename = save_filename.with_name(
+            save_filename.stem + f"__channel{key_channel_compression_ratio}" + save_filename.suffix
+        )
     else:
         press.compression_ratio = compression_ratio  # type:ignore[attr-defined]

@@ -165,7 +194,6 @@ def evaluate(
         pipe = pipeline("kv-press-text-generation", model=model, device_map="auto", model_kwargs=model_kwargs)
     else:
         pipe = pipeline("kv-press-text-generation", model=model, device=device, model_kwargs=model_kwargs)
-
     # Run pipeline on each context
     df["predicted_answer"] = None
     df_context = df.groupby("context")
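
The press-configuration branch added above is the core of this change: a ComposedPress holds several presses, and ThinKPress is driven by a key-channel ratio instead of the usual token compression ratio. Below is a minimal standalone sketch of that logic using the kvpress classes referenced in the diff; the ratio values are illustrative, not defaults from this commit.

```python
# Sketch mirroring the new "snap_think" configuration logic above (illustrative ratios).
from kvpress import ComposedPress, SnapKVPress, ThinKPress

press = ComposedPress([SnapKVPress(), ThinKPress()])
for ps in press.presses:
    if isinstance(ps, ThinKPress):
        # ThinKPress prunes key channels, so it is controlled by a channel ratio
        ps.key_channel_compression_ratio = 0.5
    else:
        # token-dropping presses (here SnapKVPress) take the standard compression ratio
        ps.compression_ratio = 0.25
```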

evaluation/longbench/README.md

+7
@@ -0,0 +1,7 @@
# longbench dataset

This benchmark comes from [longbench](https://github.com/THUDM/LongBench/tree/main/LongBench).

## Create Hugging Face dataset

The processed Hugging Face dataset for longbench can be found [here](https://huggingface.co/datasets/Xnhyacinth/LongBench). To reproduce this dataset, simply run the `create_huggingface_dataset.py` script.
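
As a quick sanity check, the processed dataset can also be loaded directly from the Hub. This is a hypothetical sketch: it assumes each LongBench task is exposed through the `data_dir` argument with a `test` split, which should be confirmed on the dataset card.

```python
# Hypothetical loading sketch; the data_dir/split layout is an assumption, not documented here.
from datasets import load_dataset

df = load_dataset("Xnhyacinth/LongBench", data_dir="narrativeqa", split="test").to_pandas()
print(len(df), list(df.columns))
```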
evaluation/longbench/calculate_metrics.py

+236

@@ -0,0 +1,236 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import re
import string
from collections import Counter
import numpy as np
from rouge import Rouge

try:
    import jieba
    from fuzzywuzzy import fuzz
except ImportError as e:
    missing_module = str(e).split()[-1].strip("'")  # Extract missing module name
    print(
        f"Module '{missing_module}' not found. "
        f"To evaluate LongBench, please install it with 'pip install {missing_module}'"
    )


def calculate_metrics(df):
    predictions = df["predicted_answer"].tolist()
    answers = df["answers"].tolist()
    dataset = df["task"].tolist()[0]
    all_classes = df["all_classes"].tolist()[0]
    return scorer(dataset, predictions, answers, all_classes)


def calculate_metrics_e(df):
    predictions = df["predicted_answer"].tolist()
    answers = df["answers"].tolist()
    dataset = df["task"].tolist()[0].removesuffix("-e")
    all_classes = df["all_classes"].tolist()[0]
    lengths = df["length"].tolist()
    return scorer_e(dataset, predictions, answers, lengths, all_classes)


def scorer_e(dataset, predictions, answers, lengths, all_classes):
    scores = {"0-4k": [], "4-8k": [], "8k+": []}  # type:ignore[var-annotated]
    for (prediction, ground_truths, length) in zip(predictions, answers, lengths):
        score = 0.0
        if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
            prediction = prediction.lstrip("\n").split("\n")[0]
        for ground_truth in ground_truths:
            score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
        if length < 4000:
            scores["0-4k"].append(score)
        elif length < 8000:
            scores["4-8k"].append(score)
        else:
            scores["8k+"].append(score)
    for key in scores.keys():
        scores[key] = round(100 * np.mean(scores[key]), 2)
    return scores


def scorer(dataset, predictions, answers, all_classes):
    total_score = 0.0
    for (prediction, ground_truths) in zip(predictions, answers):
        score = 0.0
        if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
            prediction = prediction.lstrip("\n").split("\n")[0]
        for ground_truth in ground_truths:
            score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
        total_score += score
    return round(100 * total_score / len(predictions), 2)


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def normalize_zh_answer(s):
    """Lower text and remove punctuation, extra whitespace."""

    def white_space_fix(text):
        return "".join(text.split())

    def remove_punc(text):
        cn_punctuation = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
        all_punctuation = set(string.punctuation + cn_punctuation)
        return "".join(ch for ch in text if ch not in all_punctuation)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_punc(lower(s)))


def count_score(prediction, ground_truth, **kwargs):
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)


def retrieval_score(prediction, ground_truth, **kwargs):
    pattern = r"Paragraph (\d+)"
    matches = re.findall(pattern, ground_truth)
    ground_truth_id = matches[0]
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth_id):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)


def retrieval_zh_score(prediction, ground_truth, **kwargs):
    pattern = r"段落(\d+)"
    matches = re.findall(pattern, ground_truth)
    ground_truth_id = matches[0]
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth_id):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)


def code_sim_score(prediction, ground_truth, **kwargs):
    all_lines = prediction.lstrip("\n").split("\n")
    prediction = ""
    for line in all_lines:
        if ("`" not in line) and ("#" not in line) and ("//" not in line):
            prediction = line
            break
    return fuzz.ratio(prediction, ground_truth) / 100


def classification_score(prediction, ground_truth, **kwargs):
    em_match_list = []
    all_classes = kwargs["all_classes"]
    for class_name in all_classes:
        if class_name in prediction:
            em_match_list.append(class_name)
    for match_term in em_match_list:
        if match_term in ground_truth and match_term != ground_truth:
            em_match_list.remove(match_term)
    if ground_truth in em_match_list:
        score = 1.0 / len(em_match_list)
    else:
        score = 0.0
    return score


def rouge_score(prediction, ground_truth, **kwargs):
    rouge = Rouge()
    try:
        scores = rouge.get_scores([prediction], [ground_truth], avg=True)
    except Exception as e:
        print(f"An error occurred: {e}")
        return 0.0
    return scores["rouge-l"]["f"]


def rouge_zh_score(prediction, ground_truth, **kwargs):
    prediction = " ".join(list(jieba.cut(prediction, cut_all=False)))
    ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False)))
    score = rouge_score(prediction, ground_truth)
    return score


def f1_score(prediction, ground_truth, **kwargs):
    common = Counter(prediction) & Counter(ground_truth)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction)
    recall = 1.0 * num_same / len(ground_truth)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def qa_f1_score(prediction, ground_truth, **kwargs):
    normalized_prediction = normalize_answer(prediction)
    normalized_ground_truth = normalize_answer(ground_truth)

    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()
    return f1_score(prediction_tokens, ground_truth_tokens)


def qa_f1_zh_score(prediction, ground_truth, **kwargs):
    prediction_tokens = list(jieba.cut(prediction, cut_all=False))
    ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False))
    prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens]
    ground_truth_tokens = [normalize_zh_answer(token) for token in ground_truth_tokens]
    prediction_tokens = [token for token in prediction_tokens if len(token) > 0]
    ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0]
    return f1_score(prediction_tokens, ground_truth_tokens)


dataset2metric = {
    "narrativeqa": qa_f1_score,
    "qasper": qa_f1_score,
    "multifieldqa_en": qa_f1_score,
    "multifieldqa_zh": qa_f1_zh_score,
    "hotpotqa": qa_f1_score,
    "2wikimqa": qa_f1_score,
    "musique": qa_f1_score,
    "dureader": rouge_zh_score,
    "gov_report": rouge_score,
    "qmsum": rouge_score,
    "multi_news": rouge_score,
    "vcsum": rouge_zh_score,
    "trec": classification_score,
    "triviaqa": qa_f1_score,
    "samsum": rouge_score,
    "lsht": classification_score,
    "passage_retrieval_en": retrieval_score,
    "passage_count": count_score,
    "passage_retrieval_zh": retrieval_zh_score,
    "lcc": code_sim_score,
    "repobench-p": code_sim_score,
}
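
For reference, the scorers above expect a pandas DataFrame with `predicted_answer`, `answers`, `task`, and `all_classes` columns (plus `length` for the `-e` variant). A small hypothetical check, not part of the commit:

```python
# Hypothetical usage of the LongBench scorer on a toy DataFrame
# (import path assumes the script is run from the evaluation/ directory, as evaluate.py does).
import pandas as pd

from longbench.calculate_metrics import calculate_metrics

df = pd.DataFrame(
    {
        "task": ["narrativeqa", "narrativeqa"],
        "predicted_answer": ["Paris", "a small village in France"],
        "answers": [["Paris"], ["a village in France"]],
        "all_classes": [None, None],
    }
)
print(calculate_metrics(df))  # mean qa_f1_score over the rows, scaled to 0-100
```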
