From 2a3278540812daff06a638678a85e45cc02e3758 Mon Sep 17 00:00:00 2001
From: AooooooA-C
Date: Fri, 7 Nov 2025 15:04:32 +0800
Subject: [PATCH] Add a script for testing F1 Score

---
 eval/README.md                  |  52 +++++++
 eval/eval.py                    | 257 ++++++++++++++++++++++++++++++++
 eval/eval_inference_F1.sh       | 119 +++++++++++++++
 eval/inference.py               | 195 ++++++++++++++++++++++++
 eval/ucm_sparse_config_esa.json |   9 ++
 5 files changed, 632 insertions(+)
 create mode 100644 eval/README.md
 create mode 100644 eval/eval.py
 create mode 100644 eval/eval_inference_F1.sh
 create mode 100644 eval/inference.py
 create mode 100644 eval/ucm_sparse_config_esa.json

diff --git a/eval/README.md b/eval/README.md
new file mode 100644
index 00000000..cc182947
--- /dev/null
+++ b/eval/README.md
@@ -0,0 +1,52 @@
+## Accuracy Testing of Sparse Methods
+
+### Overview
+We use two Chinese subsets of [LongBench](https://huggingface.co/datasets/zai-org/LongBench) to test accuracy on single-document QA (multifieldqa_zh) and multi-document QA (dureader). The F1 score is used to evaluate the accuracy of these sparse methods. For more information about LongBench, please refer to https://github.com/THUDM/LongBench.
+
+### Quick Start
+
+#### Environment Preparation
+```shell
+pip install jieba fuzzywuzzy rouge
+```
+#### Test Data Preparation
+Download the LongBench dataset:
+
+```shell
+wget https://huggingface.co/datasets/THUDM/LongBench/resolve/main/data.zip && unzip data.zip
+
+```
+
+#### Configure a Specific Sparse Method
+
+Settings for each sparse method are provided in a JSON file, for example:
+```json
+{"ESA":
+    {
+        "init_window_sz": 1,
+        "local_window_sz": 2,
+        "min_blocks": 4,
+        "sparse_ratio": 0.2,
+        "retrieval_stride": 10
+    }
+}
+```
+
+Accuracy testing can be launched with the following command:
+```shell
+cd eval
+bash eval_inference_F1.sh [MODEL_PATH] [SPARSE_CONFIG_JSON] [DATA_DIR] [BATCH_SIZE]
+
+# For example: bash eval_inference_F1.sh /home/models/Qwen2.5-14B-Instruct ./ucm_sparse_config_esa.json ./data
+
+```
+The result files will be saved in the eval/ucm_sparse_predictions folder.
+
+### Results
+Test results with Full Attention (Qwen2.5-14B-Instruct):
+
+| Dataset | F1-Score |
+|-------|-----------:|
+| multifieldqa_zh | 66.6 |
+| dureader | 29.33 |
+
diff --git a/eval/eval.py b/eval/eval.py
new file mode 100644
index 00000000..5d20550b
--- /dev/null
+++ b/eval/eval.py
@@ -0,0 +1,257 @@
+import argparse
+import difflib
+import json
+import os
+import re
+import string
+from collections import Counter
+from typing import List
+
+import jieba
+import numpy as np
+from fuzzywuzzy import fuzz
+from rouge import Rouge
+
+
+def normalize_answer(s):
+    """Lower text and remove punctuation, articles and extra whitespace."""
+
+    def remove_articles(text):
+        return re.sub(r"\b(a|an|the)\b", " ", text)
+
+    def white_space_fix(text):
+        return " ".join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return "".join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def normalize_zh_answer(s):
+    """Lower text and remove punctuation, extra whitespace."""
+
+    def white_space_fix(text):
+        return "".join(text.split())
+
+    def remove_punc(text):
+        cn_punctuation = "!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
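+        # The full-width Chinese marks above are merged with string.punctuation below,
+        # so both ASCII and Chinese punctuation are stripped before token comparison.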
+ all_punctuation = set(string.punctuation + cn_punctuation) + return "".join(ch for ch in text if ch not in all_punctuation) + + def lower(text): + return text.lower() + + return white_space_fix(remove_punc(lower(s))) + + +def count_score(prediction, ground_truth, **kwargs): + numbers = re.findall(r"\d+", prediction) + right_num = 0 + for number in numbers: + if str(number) == str(ground_truth): + right_num += 1 + final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers) + return float(final_score) + + +def retrieval_score(prediction, ground_truth, **kwargs): + pattern = r"Paragraph (\d+)" + matches = re.findall(pattern, ground_truth) + ground_truth_id = matches[0] + numbers = re.findall(r"\d+", prediction) + right_num = 0 + for number in numbers: + if str(number) == str(ground_truth_id): + right_num += 1 + final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers) + return float(final_score) + + +def retrieval_zh_score(prediction, ground_truth, **kwargs): + pattern = r"段落(\d+)" + matches = re.findall(pattern, ground_truth) + ground_truth_id = matches[0] + numbers = re.findall(r"\d+", prediction) + right_num = 0 + for number in numbers: + if str(number) == str(ground_truth_id): + right_num += 1 + final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers) + return float(final_score) + + +def code_sim_score(prediction, ground_truth, **kwargs): + all_lines = prediction.lstrip("\n").split("\n") + prediction = "" + for line in all_lines: + if ("`" not in line) and ("#" not in line) and ("//" not in line): + prediction = line + break + return fuzz.ratio(prediction, ground_truth) / 100 + + +def classification_score(prediction, ground_truth, **kwargs): + em_match_list = [] + all_classes = kwargs["all_classes"] + for class_name in all_classes: + if class_name in prediction: + em_match_list.append(class_name) + for match_term in em_match_list: + if match_term in ground_truth and match_term != ground_truth: + em_match_list.remove(match_term) + if ground_truth in em_match_list: + score = 1.0 / len(em_match_list) + else: + score = 0.0 + return score + + +def rouge_score(prediction, ground_truth, **kwargs): + rouge = Rouge() + try: + scores = rouge.get_scores([prediction], [ground_truth], avg=True) + except: + return 0.0 + return scores["rouge-l"]["f"] + + +def rouge_zh_score(prediction, ground_truth, **kwargs): + prediction = " ".join(list(jieba.cut(prediction, cut_all=False))) + ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False))) + score = rouge_score(prediction, ground_truth) + return score + + +def f1_score(prediction, ground_truth, **kwargs): + common = Counter(prediction) & Counter(ground_truth) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction) + recall = 1.0 * num_same / len(ground_truth) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def qa_f1_score(prediction, ground_truth, **kwargs): + normalized_prediction = normalize_answer(prediction) + normalized_ground_truth = normalize_answer(ground_truth) + prediction_tokens = normalized_prediction.split() + ground_truth_tokens = normalized_ground_truth.split() + return f1_score(prediction_tokens, ground_truth_tokens) + + +def qa_f1_zh_score(prediction, ground_truth, **kwargs): + prediction_tokens = list(jieba.cut(prediction, cut_all=False)) + ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False)) + prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens] + ground_truth_tokens = 
[normalize_zh_answer(token) for token in ground_truth_tokens] + prediction_tokens = [token for token in prediction_tokens if len(token) > 0] + ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0] + return f1_score(prediction_tokens, ground_truth_tokens) + + +dataset2metric = { + "narrativeqa": qa_f1_score, + "qasper": qa_f1_score, + "multifieldqa_en": qa_f1_score, + "multifieldqa_zh": qa_f1_zh_score, + "clongeval": qa_f1_zh_score, + "hotpotqa": qa_f1_score, + "2wikimqa": qa_f1_score, + "musique": qa_f1_score, + "dureader": rouge_zh_score, + "gov_report": rouge_score, + "qmsum": rouge_score, + "multi_news": rouge_score, + "vcsum": rouge_zh_score, + "trec": classification_score, + "triviaqa": qa_f1_score, + "samsum": rouge_score, + "lsht": classification_score, + "passage_retrieval_en": retrieval_score, + "passage_count": count_score, + "passage_retrieval_zh": retrieval_zh_score, + "lcc": code_sim_score, + "repobench-p": code_sim_score, +} + + +def parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, default=None) + parser.add_argument("--predictions", type=str, default=None) + parser.add_argument("--answer", type=str, default=None) + parser.add_argument("--dataset", type=str, default=None) + parser.add_argument("--e", action="store_true", help="Evaluate on LongBench-E") + return parser.parse_args(args) + + +def scorer_e(dataset, predictions, answers, lengths, all_classes): + scores = {"0-4k": [], "4-8k": [], "8k+": []} + for prediction, ground_truths, length in zip(predictions, answers, lengths): + score = 0.0 + if dataset in ["trec", "triviaqa", "samsum", "lsht"]: + prediction = prediction.lstrip("\n").split("\n")[0] + for ground_truth in ground_truths: + score = max( + score, + dataset2metric[dataset]( + prediction, ground_truth, all_classes=all_classes + ), + ) + if length < 4000: + scores["0-4k"].append(score) + elif length < 8000: + scores["4-8k"].append(score) + else: + scores["8k+"].append(score) + for key in scores.keys(): + scores[key] = round(100 * np.mean(scores[key]), 2) + return scores + + +def scorer(dataset, predictions, answers, all_classes): + total_score = 0.0 + for prediction, ground_truths in zip(predictions, answers): + score = 0.0 + if dataset in ["trec", "triviaqa", "samsum", "lsht"]: + prediction = prediction.lstrip("\n").split("\n")[0] + for ground_truth in ground_truths: + score = max( + score, + dataset2metric[dataset]( + prediction, ground_truth, all_classes=all_classes + ), + ) + total_score += score + return round(100 * total_score / len(predictions), 2) + + +if __name__ == "__main__": + args = parse_args() + scores = dict() + predictions, answers, lengths = [], [], [] + all_classes = None + with open(args.predictions, "r", encoding="utf-8") as f: + for line in f: + predictions.append(line.strip()) + with open(args.answer, "r", encoding="utf-8") as f: + for line in f: + data = json.loads(line) + answers.append(data["answers"]) + if "length" in data: + lengths.append(data["length"]) + if args.e: + score = scorer_e(args.dataset, predictions, answers, lengths, all_classes) + print("All score:", score) + else: + score = scorer(args.dataset, predictions[:50], answers[:50], all_classes) + print("50 score:", score) + score = scorer(args.dataset, predictions, answers, all_classes) + print("All score:", score) diff --git a/eval/eval_inference_F1.sh b/eval/eval_inference_F1.sh new file mode 100644 index 00000000..8891fce9 --- /dev/null +++ b/eval/eval_inference_F1.sh @@ -0,0 +1,119 @@ 
+#!/bin/bash
+
+# Check and auto-install required Python packages
+REQUIRED_PACKAGES=("fuzzywuzzy" "jieba" "rouge")
+for pkg in "${REQUIRED_PACKAGES[@]}"; do
+    if ! python3 -c "import $pkg" 2>/dev/null; then
+        echo "❌ $pkg not found, installing..."
+        pip install "$pkg" --upgrade 2>/dev/null && echo "✅ $pkg installed successfully" || echo "❌ Failed to install $pkg (run 'pip3 install $pkg' manually)"
+    fi
+done
+
+CODE_ROOT=$(dirname "$(dirname "$(readlink -f "$0")")")
+
+MODEL_PATH="${1:-/home/models/Qwen2.5-14B-Instruct/}"
+UCM_SPARSE_CONFIG="${2:-${CODE_ROOT}/eval/ucm_sparse_config_esa.json}"
+TEST_DATA_DIR="${3:-${CODE_ROOT}/eval/data}"
+BATCH_SIZE="${4:-20}"
+export BATCH_SIZE
+
+# set vllm version
+export VLLM_VERSION="0.9.2"
+export VLLM_USE_V1="1"
+
+# Model
+export MODEL_PATH
+MODEL_NAME=$(basename "$MODEL_PATH")
+
+# Dataset and storage path
+STORAGE_BACKENDS="${CODE_ROOT}/ucm_kv_cache/${MODEL_NAME}"
+export STORAGE_BACKENDS
+
+SAVE_PATH="${CODE_ROOT}/eval/ucm_sparse_predictions"
+DATASET="LongBench"
+DATASET_SAVE_DIR="${SAVE_PATH}/${MODEL_NAME}/${DATASET}"
+
+mkdir -p "$STORAGE_BACKENDS" "$DATASET_SAVE_DIR" || { echo "Failed to create dirs!"; exit 1; }
+
+
+
+# -------------------------- LongBench --------------------------
+TARGET_FILES=(
+"${TEST_DATA_DIR}/multifieldqa_zh.jsonl"
+"${TEST_DATA_DIR}/dureader.jsonl"
+)
+
+EXISTING_FILES=()
+declare -A seen_files
+for file in "${TARGET_FILES[@]}"; do
+    if [[ -f "$file" && -z "${seen_files[$file]}" ]]; then
+        seen_files["$file"]=1
+        EXISTING_FILES+=("$file")
+    fi
+done
+if [[ ${#EXISTING_FILES[@]} -eq 0 ]]; then
+    echo "❌ No valid data files found for '$DATASET'!"
+    exit 1
+fi
+
+echo -e "\nFound ${#EXISTING_FILES[@]} data files for $DATASET:"
+for file in "${EXISTING_FILES[@]}"; do
+    rel_path="${file#${TEST_DATA_DIR}/}"
+    echo "  - $rel_path"
+done
+
+UCM_CONFIG_NAME=$(basename "$UCM_SPARSE_CONFIG")
+UCM_CONFIG_NAME_NO_EXT="${UCM_CONFIG_NAME%.*}"
+
+for DATASET_FLIE in "${EXISTING_FILES[@]}"; do
+    filename=$(basename "$DATASET_FLIE")
+    file_name_no_ext="${filename%.*}"
+    export DATASET_FLIE
+
+    RES_FILE="${DATASET_SAVE_DIR}/${file_name_no_ext}_${UCM_CONFIG_NAME_NO_EXT}_bs${BATCH_SIZE}.txt"
+    ANS_FILE="${DATASET_SAVE_DIR}/${file_name_no_ext}_answer.jsonl"
+    export RES_FILE
+    export ANS_FILE
+    [[ -f "$RES_FILE" ]] && > "$RES_FILE"
+    [[ -f "$ANS_FILE" ]] && > "$ANS_FILE"
+
+    export UCM_SPARSE_CONFIG
+    echo -e "\n======================================"
+    echo "Executed model: $MODEL_NAME"
+    echo "Using Config: $UCM_SPARSE_CONFIG"
+    echo "======================================"
+
+    python3 "${CODE_ROOT}/eval/inference.py"
+
+    if [[ ! -f "$RES_FILE" ]]; then
+        echo "Warning: test finished but result file not found!"
+        continue
+    fi
+
+    echo -e "\nCalculating F1 score..."
+    F1_FILE="${RES_FILE}.f1.txt"
+    > /tmp/scores
+
+    python3 "${CODE_ROOT}/eval/eval.py" \
+        --predictions "$RES_FILE" \
+        --answer "$ANS_FILE" \
+        --dataset "$file_name_no_ext" 2>&1 | grep -E "50 score:|All score:" > /tmp/scores
+
+    if [[ -s /tmp/scores ]]; then
+        echo "Result file: $RES_FILE" > "$F1_FILE"
+        cat /tmp/scores >> "$F1_FILE"
+        echo "" >> "$F1_FILE"
+        echo "F1 score saved to: $F1_FILE"
+        echo -e "\n\n======================================"
+        echo ""
+        cat "$UCM_SPARSE_CONFIG"
+        echo
+        cat "$F1_FILE"
+        echo "======================================"
+    else
+        echo "Warning: No valid F1 score generated!"
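+        # create an empty F1 result file so the expected output path exists even though no score was produced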
+ touch "$F1_FILE" + fi + +done +rm -rf ${STORAGE_BACKENDS} \ No newline at end of file diff --git a/eval/inference.py b/eval/inference.py new file mode 100644 index 00000000..c25d92a7 --- /dev/null +++ b/eval/inference.py @@ -0,0 +1,195 @@ +import contextlib +import json +import os +import sys +import time +from dataclasses import asdict + +from transformers import AutoTokenizer + +# Third Party +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig +from vllm.engine.arg_utils import EngineArgs + +from ucm.logger import init_logger + +logger = init_logger(__name__) +model = "" +path_to_dataset = "" +data_dir = "" +tokenizer = None + + +def setup_environment_variables(): + os.environ["VLLM_USE_V1"] = "1" + os.environ["PYTHONHASHSEED"] = "123456" + + global model, path_to_dataset, data_dir, ucm_sparse_config, tokenizer + model = os.getenv("MODEL_PATH", "/home/models/Qwen2.5-14B-Instruct") + if not os.path.isdir(model): + model = input("Enter path to model, e.g. /home/models/Qwen2.5-14B-Instruct: ") + if not os.path.isdir(model): + print("Exiting. Incorrect model_path") + sys.exit(1) + + path_to_dataset = os.getenv( + "DATASET_FLIE", "/home/data/Longbench/data/multifieldqa_zh.jsonl" + ) + if not os.path.isfile(path_to_dataset): + path_to_dataset = input( + "Enter path to one of the longbench dataset, e.g. /home/data/Longbench/data/multifieldqa_zh.jsonl: " + ) + if not os.path.isfile(path_to_dataset): + print("Exiting. Incorrect dataset file path") + sys.exit(1) + + data_dir = os.getenv("STORAGE_BACKENDS", "/home/data/kv_cache") + if not os.path.isdir(data_dir): + data_dir = input( + "Enter the directory for UCMStore to save kv cache, e.g. /home/data/kv_cache: " + ) + create = input(f"Directory {data_dir} dose not exist. Create it? (Y/n): ") + if create.lower() == "y": + os.makedirs(data_dir, exist_ok=True) + else: + print("Exiting. Directory not created.") + sys.exit(1) + + sparse_config_path = os.getenv("UCM_SPARSE_CONFIG", "eval/ucm_sparse_config.json") + if not os.path.isfile(sparse_config_path): + sparse_config_path = input( + "Enter path to one of the sparse config json, e.g. eval/ucm_sparse_config.json: " + ) + if not os.path.isfile(sparse_config_path): + print("Exiting. 
Incorrect config json file path") + sys.exit(1) + + with open(sparse_config_path, "r", encoding="utf-8") as f: + ucm_sparse_config = json.load(f) + + tokenizer = AutoTokenizer.from_pretrained(model, use_chat_template=False) + + +@contextlib.contextmanager +def build_llm_with_uc(module_path: str, name: str, model: str): + ktc = KVTransferConfig( + kv_connector=name, + kv_connector_module_path=module_path, + kv_role="kv_both", + kv_connector_extra_config={ + "ucm_connector_name": "UcmNfsStore", + "ucm_connector_config": { + "storage_backends": data_dir, + "kv_block_size": 33554432, + }, + "ucm_sparse_config": ucm_sparse_config, + }, + ) + + llm_args = EngineArgs( + model=model, + kv_transfer_config=ktc, + max_model_len=32768, + gpu_memory_utilization=0.8, + max_num_batched_tokens=30000, + block_size=128, + enforce_eager=True, + trust_remote_code=True, + distributed_executor_backend="mp", + tensor_parallel_size=1, + ) + + llm = LLM(**asdict(llm_args)) + try: + yield llm + finally: + logger.info("LLM engine is exiting.") + + +def print_output( + llm: LLM, + prompt: list[str], + sampling_params: SamplingParams, + req_str: str, +): + + start = time.time() + outputs = llm.generate(prompt, sampling_params) + print("-" * 50) + lines = [] + for output in outputs: + generated_text = output.outputs[0].text + print(f"Generated text: {generated_text!r}") + generated_text = "".join( + [line.strip() for line in generated_text.splitlines() if line.strip()] + ) + lines.append(generated_text) + print(f"Generation took {time.time() - start:.2f} seconds, {req_str} request done.") + return lines + + +def main(): + module_path = "ucm.integration.vllm.uc_connector" + name = "UnifiedCacheConnectorV1" + setup_environment_variables() + + def get_prompt(prompt): + messages = [ + { + "role": "system", + "content": "根据下面的文章内容回答问题,不要进行分析,不要重复问题,用简短的语句给出答案。\n\n例如:“全国美国文学研究会的第十八届年会在哪所大学举办的?”\n回答应该为:“xx大学”。\n\n", + }, + {"role": "user", "content": prompt}, + ] + return tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + add_special_tokens=True, + ) + + with build_llm_with_uc(module_path, name, model) as llm: + res_file = os.getenv("RES_FILE", ".//multifieldqa_zh.jsonl") + ans_file = os.getenv("ANS_FILE", "./multifieldqa_zh.jsonl") + batch_size = int(os.getenv("BATCH_SIZE", 20)) + with open(path_to_dataset, "r") as f: + lines = f.readlines() + + total_data = len(lines) + for start_idx in range(0, total_data, batch_size): + end_idx = min(start_idx + batch_size, total_data) + current_batch = lines[start_idx:end_idx] + prompts = [] + answers = [] + length = [] + for line in current_batch: + data = json.loads(line) + answer = data["answers"][0] + length_t = int(data["length"]) + prompt = f"""阅读以下文字并用中文简短回答:\n\n{data["context"]}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{data["input"]}\n回答:""" + + prompts.append(get_prompt(prompt)) + answers.append(answer) + length.append(length_t) + + sampling_params = SamplingParams( + temperature=0, top_p=0.95, max_tokens=256, ignore_eos=False + ) + + gen_res = print_output( + llm, prompts, sampling_params, f"{len(current_batch)}" + ) + + with open(res_file, "a", encoding="utf-8") as res: + for line in gen_res: + res.write(line + "\n") + + with open(ans_file, "a", encoding="utf-8") as ans: + for context_len, ori_answer in zip(length, answers): + json_obj = {"length": context_len, "answers": [ori_answer]} + ans.write(json.dumps(json_obj, ensure_ascii=False) + "\n") + + +if __name__ == "__main__": + main() diff --git a/eval/ucm_sparse_config_esa.json 
b/eval/ucm_sparse_config_esa.json new file mode 100644 index 00000000..6dc9e317 --- /dev/null +++ b/eval/ucm_sparse_config_esa.json @@ -0,0 +1,9 @@ +{"ESA": + { + "init_window_sz": 1, + "local_window_sz": 2, + "min_blocks":4, + "sparse_ratio": 0.2, + "retrieval_stride": 10 + } +} \ No newline at end of file
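For reference, the snippet below is a minimal, illustrative sketch of the token-level F1 metric that `eval/eval.py` applies to the Chinese QA subsets (`qa_f1_zh_score`): segment prediction and reference with `jieba`, drop empty tokens, and score the multiset overlap. The lower-casing and punctuation stripping done by `normalize_zh_answer` is omitted for brevity, and the example strings are made up.

```python
# Illustrative sketch of the Chinese QA F1 computation (not the exact eval.py code).
from collections import Counter

import jieba


def token_f1(prediction: str, ground_truth: str) -> float:
    # Segment both strings into words; cut_all=False is jieba's precise mode.
    pred_tokens = [t for t in jieba.cut(prediction, cut_all=False) if t.strip()]
    gt_tokens = [t for t in jieba.cut(ground_truth, cut_all=False) if t.strip()]
    # Multiset intersection: how many predicted tokens also appear in the reference.
    num_same = sum((Counter(pred_tokens) & Counter(gt_tokens)).values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)


# Hypothetical prediction/answer pair; real pairs come from RES_FILE and ANS_FILE.
print(round(token_f1("北京大学", "第十八届年会在北京大学举办"), 4))
```

The actual implementation in `eval/eval.py` additionally normalizes each token via `normalize_zh_answer` (lower-casing plus ASCII and full-width punctuation removal) before computing precision, recall, and F1.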