From 2a3278540812daff06a638678a85e45cc02e3758 Mon Sep 17 00:00:00 2001
From: AooooooA-C
Date: Fri, 7 Nov 2025 15:04:32 +0800
Subject: [PATCH] Add a script for testing F1 Score

---
 eval/README.md                  |  52 +++++++
 eval/eval.py                    | 257 ++++++++++++++++++++++++++++++++
 eval/eval_inference_F1.sh       | 119 +++++++++++++++
 eval/inference.py               | 195 ++++++++++++++++++++++++
 eval/ucm_sparse_config_esa.json |   9 ++
 5 files changed, 632 insertions(+)
 create mode 100644 eval/README.md
 create mode 100644 eval/eval.py
 create mode 100644 eval/eval_inference_F1.sh
 create mode 100644 eval/inference.py
 create mode 100644 eval/ucm_sparse_config_esa.json

diff --git a/eval/README.md b/eval/README.md
new file mode 100644
index 00000000..cc182947
--- /dev/null
+++ b/eval/README.md
@@ -0,0 +1,52 @@
+## Accuracy Testing of Sparse Methods
+
+### Overview
+We use two Chinese subsets of [LongBench](https://huggingface.co/datasets/zai-org/LongBench) to test accuracy on single-document QA (multifieldqa_zh) and multi-document QA (dureader). The F1 score is used to evaluate the accuracy of these sparse methods. For more information about LongBench, please refer to https://github.com/THUDM/LongBench.
+
+### Quick Start
+
+#### Environment Preparation
+```shell
+pip install jieba fuzzywuzzy rouge
+```
+#### Test Data Preparation
+Download the LongBench dataset:
+
+```shell
+wget https://huggingface.co/datasets/THUDM/LongBench/resolve/main/data.zip && unzip data.zip
+
+```
+
+#### Configure a Specific Sparse Method
+
+Settings for each sparse method are provided in a JSON file, for example:
+```json
+{"ESA":
+    {
+        "init_window_sz": 1,
+        "local_window_sz": 2,
+        "min_blocks": 4,
+        "sparse_ratio": 0.2,
+        "retrieval_stride": 10
+    }
+}
+```
+
+Accuracy testing can be launched with the following command:
+```shell
+cd eval
+bash eval_inference_F1.sh [MODEL_PATH] [SPARSE_CONFIG_JSON] [DATA_DIR] [BATCH_SIZE]
+
+# For example: bash eval_inference_F1.sh /home/models/Qwen2.5-14B-Instruct ./ucm_sparse_config_esa.json ./data
+
+```
+The result files will be saved in the eval/ucm_sparse_predictions folder.
+
+### Results
+Test results with Full Attention (Qwen2.5-14B-Instruct):
+
+| Dataset | F1-Score |
+|-------|-----------:|
+| multifieldqa_zh | 66.6 |
+| dureader | 29.33 |
+
diff --git a/eval/eval.py b/eval/eval.py
new file mode 100644
index 00000000..5d20550b
--- /dev/null
+++ b/eval/eval.py
@@ -0,0 +1,257 @@
+import argparse
+import difflib
+import json
+import os
+import re
+import string
+from collections import Counter
+from typing import List
+
+import jieba
+import numpy as np
+from fuzzywuzzy import fuzz
+from rouge import Rouge
+
+
+def normalize_answer(s):
+    """Lower text and remove punctuation, articles and extra whitespace."""
+
+    def remove_articles(text):
+        return re.sub(r"\b(a|an|the)\b", " ", text)
+
+    def white_space_fix(text):
+        return " ".join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return "".join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def normalize_zh_answer(s):
+    """Lower text and remove punctuation, extra whitespace."""
+
+    def white_space_fix(text):
+        return "".join(text.split())
+
+    def remove_punc(text):
+        cn_punctuation = "!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
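+        # The full-width Chinese marks above are merged with string.punctuation below,
+        # so both ASCII and Chinese punctuation are stripped before token comparison.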
+ all_punctuation = set(string.punctuation + cn_punctuation) + return "".join(ch for ch in text if ch not in all_punctuation) + + def lower(text): + return text.lower() + + return white_space_fix(remove_punc(lower(s))) + + +def count_score(prediction, ground_truth, **kwargs): + numbers = re.findall(r"\d+", prediction) + right_num = 0 + for number in numbers: + if str(number) == str(ground_truth): + right_num += 1 + final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers) + return float(final_score) + + +def retrieval_score(prediction, ground_truth, **kwargs): + pattern = r"Paragraph (\d+)" + matches = re.findall(pattern, ground_truth) + ground_truth_id = matches[0] + numbers = re.findall(r"\d+", prediction) + right_num = 0 + for number in numbers: + if str(number) == str(ground_truth_id): + right_num += 1 + final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers) + return float(final_score) + + +def retrieval_zh_score(prediction, ground_truth, **kwargs): + pattern = r"段落(\d+)" + matches = re.findall(pattern, ground_truth) + ground_truth_id = matches[0] + numbers = re.findall(r"\d+", prediction) + right_num = 0 + for number in numbers: + if str(number) == str(ground_truth_id): + right_num += 1 + final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers) + return float(final_score) + + +def code_sim_score(prediction, ground_truth, **kwargs): + all_lines = prediction.lstrip("\n").split("\n") + prediction = "" + for line in all_lines: + if ("`" not in line) and ("#" not in line) and ("//" not in line): + prediction = line + break + return fuzz.ratio(prediction, ground_truth) / 100 + + +def classification_score(prediction, ground_truth, **kwargs): + em_match_list = [] + all_classes = kwargs["all_classes"] + for class_name in all_classes: + if class_name in prediction: + em_match_list.append(class_name) + for match_term in em_match_list: + if match_term in ground_truth and match_term != ground_truth: + em_match_list.remove(match_term) + if ground_truth in em_match_list: + score = 1.0 / len(em_match_list) + else: + score = 0.0 + return score + + +def rouge_score(prediction, ground_truth, **kwargs): + rouge = Rouge() + try: + scores = rouge.get_scores([prediction], [ground_truth], avg=True) + except: + return 0.0 + return scores["rouge-l"]["f"] + + +def rouge_zh_score(prediction, ground_truth, **kwargs): + prediction = " ".join(list(jieba.cut(prediction, cut_all=False))) + ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False))) + score = rouge_score(prediction, ground_truth) + return score + + +def f1_score(prediction, ground_truth, **kwargs): + common = Counter(prediction) & Counter(ground_truth) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction) + recall = 1.0 * num_same / len(ground_truth) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def qa_f1_score(prediction, ground_truth, **kwargs): + normalized_prediction = normalize_answer(prediction) + normalized_ground_truth = normalize_answer(ground_truth) + prediction_tokens = normalized_prediction.split() + ground_truth_tokens = normalized_ground_truth.split() + return f1_score(prediction_tokens, ground_truth_tokens) + + +def qa_f1_zh_score(prediction, ground_truth, **kwargs): + prediction_tokens = list(jieba.cut(prediction, cut_all=False)) + ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False)) + prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens] + ground_truth_tokens = 
[normalize_zh_answer(token) for token in ground_truth_tokens] + prediction_tokens = [token for token in prediction_tokens if len(token) > 0] + ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0] + return f1_score(prediction_tokens, ground_truth_tokens) + + +dataset2metric = { + "narrativeqa": qa_f1_score, + "qasper": qa_f1_score, + "multifieldqa_en": qa_f1_score, + "multifieldqa_zh": qa_f1_zh_score, + "clongeval": qa_f1_zh_score, + "hotpotqa": qa_f1_score, + "2wikimqa": qa_f1_score, + "musique": qa_f1_score, + "dureader": rouge_zh_score, + "gov_report": rouge_score, + "qmsum": rouge_score, + "multi_news": rouge_score, + "vcsum": rouge_zh_score, + "trec": classification_score, + "triviaqa": qa_f1_score, + "samsum": rouge_score, + "lsht": classification_score, + "passage_retrieval_en": retrieval_score, + "passage_count": count_score, + "passage_retrieval_zh": retrieval_zh_score, + "lcc": code_sim_score, + "repobench-p": code_sim_score, +} + + +def parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, default=None) + parser.add_argument("--predictions", type=str, default=None) + parser.add_argument("--answer", type=str, default=None) + parser.add_argument("--dataset", type=str, default=None) + parser.add_argument("--e", action="store_true", help="Evaluate on LongBench-E") + return parser.parse_args(args) + + +def scorer_e(dataset, predictions, answers, lengths, all_classes): + scores = {"0-4k": [], "4-8k": [], "8k+": []} + for prediction, ground_truths, length in zip(predictions, answers, lengths): + score = 0.0 + if dataset in ["trec", "triviaqa", "samsum", "lsht"]: + prediction = prediction.lstrip("\n").split("\n")[0] + for ground_truth in ground_truths: + score = max( + score, + dataset2metric[dataset]( + prediction, ground_truth, all_classes=all_classes + ), + ) + if length < 4000: + scores["0-4k"].append(score) + elif length < 8000: + scores["4-8k"].append(score) + else: + scores["8k+"].append(score) + for key in scores.keys(): + scores[key] = round(100 * np.mean(scores[key]), 2) + return scores + + +def scorer(dataset, predictions, answers, all_classes): + total_score = 0.0 + for prediction, ground_truths in zip(predictions, answers): + score = 0.0 + if dataset in ["trec", "triviaqa", "samsum", "lsht"]: + prediction = prediction.lstrip("\n").split("\n")[0] + for ground_truth in ground_truths: + score = max( + score, + dataset2metric[dataset]( + prediction, ground_truth, all_classes=all_classes + ), + ) + total_score += score + return round(100 * total_score / len(predictions), 2) + + +if __name__ == "__main__": + args = parse_args() + scores = dict() + predictions, answers, lengths = [], [], [] + all_classes = None + with open(args.predictions, "r", encoding="utf-8") as f: + for line in f: + predictions.append(line.strip()) + with open(args.answer, "r", encoding="utf-8") as f: + for line in f: + data = json.loads(line) + answers.append(data["answers"]) + if "length" in data: + lengths.append(data["length"]) + if args.e: + score = scorer_e(args.dataset, predictions, answers, lengths, all_classes) + print("All score:", score) + else: + score = scorer(args.dataset, predictions[:50], answers[:50], all_classes) + print("50 score:", score) + score = scorer(args.dataset, predictions, answers, all_classes) + print("All score:", score) diff --git a/eval/eval_inference_F1.sh b/eval/eval_inference_F1.sh new file mode 100644 index 00000000..8891fce9 --- /dev/null +++ b/eval/eval_inference_F1.sh @@ -0,0 +1,119 @@ 
+#!/bin/bash
+
+# Check and auto-install required Python packages
+REQUIRED_PACKAGES=("fuzzywuzzy" "jieba" "rouge")
+for pkg in "${REQUIRED_PACKAGES[@]}"; do
+    if ! python3 -c "import $pkg" 2>/dev/null; then
+        echo "❌ $pkg not found, installing..."
+        pip install "$pkg" --upgrade 2>/dev/null && echo "✅ $pkg installed successfully" || echo "❌ Failed to install $pkg (run 'pip3 install $pkg' manually)"
+    fi
+done
+
+CODE_ROOT=$(dirname "$(dirname "$(readlink -f "$0")")")
+
+MODEL_PATH="${1:-/home/models/Qwen2.5-14B-Instruct/}"
+UCM_SPARSE_CONFIG="${2:-${CODE_ROOT}/eval/ucm_sparse_config_esa.json}"
+TEST_DATA_DIR="${3:-${CODE_ROOT}/eval/data}"
+BATCH_SIZE="${4:-20}"
+export BATCH_SIZE
+
+# set vllm version
+export VLLM_VERSION="0.9.2"
+export VLLM_USE_V1="1"
+
+# Model
+export MODEL_PATH
+MODEL_NAME=$(basename "$MODEL_PATH")
+
+# Dataset and storage path
+STORAGE_BACKENDS="${CODE_ROOT}/ucm_kv_cache/${MODEL_NAME}"
+export STORAGE_BACKENDS
+
+SAVE_PATH="${CODE_ROOT}/eval/ucm_sparse_predictions"
+DATASET="LongBench"
+DATASET_SAVE_DIR="${SAVE_PATH}/${MODEL_NAME}/${DATASET}"
+
+mkdir -p "$STORAGE_BACKENDS" "$DATASET_SAVE_DIR" || { echo "Failed to create dirs!"; exit 1; }
+
+
+
+# -------------------------- LongBench --------------------------
+TARGET_FILES=(
+"${TEST_DATA_DIR}/multifieldqa_zh.jsonl"
+"${TEST_DATA_DIR}/dureader.jsonl"
+)
+
+EXISTING_FILES=()
+declare -A seen_files
+for file in "${TARGET_FILES[@]}"; do
+    if [[ -f "$file" && -z "${seen_files[$file]}" ]]; then
+        seen_files["$file"]=1
+        EXISTING_FILES+=("$file")
+    fi
+done
+if [[ ${#EXISTING_FILES[@]} -eq 0 ]]; then
+    echo "❌ No valid data files found for '$DATASET'!"
+    exit 1
+fi
+
+echo -e "\nFound ${#EXISTING_FILES[@]} data files for $DATASET:"
+for file in "${EXISTING_FILES[@]}"; do
+    rel_path="${file#${TEST_DATA_DIR}/}"
+    echo "  - $rel_path"
+done
+
+UCM_CONFIG_NAME=$(basename "$UCM_SPARSE_CONFIG")
+UCM_CONFIG_NAME_NO_EXT="${UCM_CONFIG_NAME%.*}"
+
+for DATASET_FLIE in "${EXISTING_FILES[@]}"; do
+    filename=$(basename "$DATASET_FLIE")
+    file_name_no_ext="${filename%.*}"
+    export DATASET_FLIE
+
+    RES_FILE="${DATASET_SAVE_DIR}/${file_name_no_ext}_${UCM_CONFIG_NAME_NO_EXT}_bs${BATCH_SIZE}.txt"
+    ANS_FILE="${DATASET_SAVE_DIR}/${file_name_no_ext}_answer.jsonl"
+    export RES_FILE
+    export ANS_FILE
+    [[ -f "$RES_FILE" ]] && > "$RES_FILE"
+    [[ -f "$ANS_FILE" ]] && > "$ANS_FILE"
+
+    export UCM_SPARSE_CONFIG
+    echo -e "\n======================================"
+    echo "Executed model: $MODEL_NAME"
+    echo "Using Config: $UCM_SPARSE_CONFIG"
+    echo "======================================"
+
+    python3 "${CODE_ROOT}/eval/inference.py"
+
+    if [[ ! -f "$RES_FILE" ]]; then
+        echo "Warning: test finished but result file not found!"
+        continue
+    fi
+
+    echo -e "\nCalculating F1 score..."
+    F1_FILE="${RES_FILE}.f1.txt"
+    > /tmp/scores
+
+    python3 "${CODE_ROOT}/eval/eval.py" \
+        --predictions "$RES_FILE" \
+        --answer "$ANS_FILE" \
+        --dataset "$file_name_no_ext" 2>&1 | grep -E "50 score:|All score:" > /tmp/scores
+
+    if [[ -s /tmp/scores ]]; then
+        echo "Result file: $RES_FILE" > "$F1_FILE"
+        cat /tmp/scores >> "$F1_FILE"
+        echo "" >> "$F1_FILE"
+        echo "F1 score saved to: $F1_FILE"
+        echo -e "\n\n======================================"
+        echo ""
+        cat "$UCM_SPARSE_CONFIG"
+        echo
+        cat "$F1_FILE"
+        echo "======================================"
+    else
+        echo "Warning: No valid F1 score generated!"
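+        # create an empty F1 result file so the expected output path exists even though no score was produced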
+ touch "$F1_FILE" + fi + +done +rm -rf ${STORAGE_BACKENDS} \ No newline at end of file diff --git a/eval/inference.py b/eval/inference.py new file mode 100644 index 00000000..c25d92a7 --- /dev/null +++ b/eval/inference.py @@ -0,0 +1,195 @@ +import contextlib +import json +import os +import sys +import time +from dataclasses import asdict + +from transformers import AutoTokenizer + +# Third Party +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig +from vllm.engine.arg_utils import EngineArgs + +from ucm.logger import init_logger + +logger = init_logger(__name__) +model = "" +path_to_dataset = "" +data_dir = "" +tokenizer = None + + +def setup_environment_variables(): + os.environ["VLLM_USE_V1"] = "1" + os.environ["PYTHONHASHSEED"] = "123456" + + global model, path_to_dataset, data_dir, ucm_sparse_config, tokenizer + model = os.getenv("MODEL_PATH", "/home/models/Qwen2.5-14B-Instruct") + if not os.path.isdir(model): + model = input("Enter path to model, e.g. /home/models/Qwen2.5-14B-Instruct: ") + if not os.path.isdir(model): + print("Exiting. Incorrect model_path") + sys.exit(1) + + path_to_dataset = os.getenv( + "DATASET_FLIE", "/home/data/Longbench/data/multifieldqa_zh.jsonl" + ) + if not os.path.isfile(path_to_dataset): + path_to_dataset = input( + "Enter path to one of the longbench dataset, e.g. /home/data/Longbench/data/multifieldqa_zh.jsonl: " + ) + if not os.path.isfile(path_to_dataset): + print("Exiting. Incorrect dataset file path") + sys.exit(1) + + data_dir = os.getenv("STORAGE_BACKENDS", "/home/data/kv_cache") + if not os.path.isdir(data_dir): + data_dir = input( + "Enter the directory for UCMStore to save kv cache, e.g. /home/data/kv_cache: " + ) + create = input(f"Directory {data_dir} dose not exist. Create it? (Y/n): ") + if create.lower() == "y": + os.makedirs(data_dir, exist_ok=True) + else: + print("Exiting. Directory not created.") + sys.exit(1) + + sparse_config_path = os.getenv("UCM_SPARSE_CONFIG", "eval/ucm_sparse_config.json") + if not os.path.isfile(sparse_config_path): + sparse_config_path = input( + "Enter path to one of the sparse config json, e.g. eval/ucm_sparse_config.json: " + ) + if not os.path.isfile(sparse_config_path): + print("Exiting. 
Incorrect config json file path") + sys.exit(1) + + with open(sparse_config_path, "r", encoding="utf-8") as f: + ucm_sparse_config = json.load(f) + + tokenizer = AutoTokenizer.from_pretrained(model, use_chat_template=False) + + +@contextlib.contextmanager +def build_llm_with_uc(module_path: str, name: str, model: str): + ktc = KVTransferConfig( + kv_connector=name, + kv_connector_module_path=module_path, + kv_role="kv_both", + kv_connector_extra_config={ + "ucm_connector_name": "UcmNfsStore", + "ucm_connector_config": { + "storage_backends": data_dir, + "kv_block_size": 33554432, + }, + "ucm_sparse_config": ucm_sparse_config, + }, + ) + + llm_args = EngineArgs( + model=model, + kv_transfer_config=ktc, + max_model_len=32768, + gpu_memory_utilization=0.8, + max_num_batched_tokens=30000, + block_size=128, + enforce_eager=True, + trust_remote_code=True, + distributed_executor_backend="mp", + tensor_parallel_size=1, + ) + + llm = LLM(**asdict(llm_args)) + try: + yield llm + finally: + logger.info("LLM engine is exiting.") + + +def print_output( + llm: LLM, + prompt: list[str], + sampling_params: SamplingParams, + req_str: str, +): + + start = time.time() + outputs = llm.generate(prompt, sampling_params) + print("-" * 50) + lines = [] + for output in outputs: + generated_text = output.outputs[0].text + print(f"Generated text: {generated_text!r}") + generated_text = "".join( + [line.strip() for line in generated_text.splitlines() if line.strip()] + ) + lines.append(generated_text) + print(f"Generation took {time.time() - start:.2f} seconds, {req_str} request done.") + return lines + + +def main(): + module_path = "ucm.integration.vllm.uc_connector" + name = "UnifiedCacheConnectorV1" + setup_environment_variables() + + def get_prompt(prompt): + messages = [ + { + "role": "system", + "content": "根据下面的文章内容回答问题,不要进行分析,不要重复问题,用简短的语句给出答案。\n\n例如:“全国美国文学研究会的第十八届年会在哪所大学举办的?”\n回答应该为:“xx大学”。\n\n", + }, + {"role": "user", "content": prompt}, + ] + return tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + add_special_tokens=True, + ) + + with build_llm_with_uc(module_path, name, model) as llm: + res_file = os.getenv("RES_FILE", ".//multifieldqa_zh.jsonl") + ans_file = os.getenv("ANS_FILE", "./multifieldqa_zh.jsonl") + batch_size = int(os.getenv("BATCH_SIZE", 20)) + with open(path_to_dataset, "r") as f: + lines = f.readlines() + + total_data = len(lines) + for start_idx in range(0, total_data, batch_size): + end_idx = min(start_idx + batch_size, total_data) + current_batch = lines[start_idx:end_idx] + prompts = [] + answers = [] + length = [] + for line in current_batch: + data = json.loads(line) + answer = data["answers"][0] + length_t = int(data["length"]) + prompt = f"""阅读以下文字并用中文简短回答:\n\n{data["context"]}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{data["input"]}\n回答:""" + + prompts.append(get_prompt(prompt)) + answers.append(answer) + length.append(length_t) + + sampling_params = SamplingParams( + temperature=0, top_p=0.95, max_tokens=256, ignore_eos=False + ) + + gen_res = print_output( + llm, prompts, sampling_params, f"{len(current_batch)}" + ) + + with open(res_file, "a", encoding="utf-8") as res: + for line in gen_res: + res.write(line + "\n") + + with open(ans_file, "a", encoding="utf-8") as ans: + for context_len, ori_answer in zip(length, answers): + json_obj = {"length": context_len, "answers": [ori_answer]} + ans.write(json.dumps(json_obj, ensure_ascii=False) + "\n") + + +if __name__ == "__main__": + main() diff --git a/eval/ucm_sparse_config_esa.json 
b/eval/ucm_sparse_config_esa.json new file mode 100644 index 00000000..6dc9e317 --- /dev/null +++ b/eval/ucm_sparse_config_esa.json @@ -0,0 +1,9 @@ +{"ESA": + { + "init_window_sz": 1, + "local_window_sz": 2, + "min_blocks":4, + "sparse_ratio": 0.2, + "retrieval_stride": 10 + } +} \ No newline at end of file
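For reference, the snippet below is a minimal, illustrative sketch of the token-level F1 metric that `eval/eval.py` applies to the Chinese QA subsets (`qa_f1_zh_score`): segment prediction and reference with `jieba`, drop empty tokens, and score the multiset overlap. The lower-casing and punctuation stripping done by `normalize_zh_answer` is omitted for brevity, and the example strings are made up.

```python
# Illustrative sketch of the Chinese QA F1 computation (not the exact eval.py code).
from collections import Counter

import jieba


def token_f1(prediction: str, ground_truth: str) -> float:
    # Segment both strings into words; cut_all=False is jieba's precise mode.
    pred_tokens = [t for t in jieba.cut(prediction, cut_all=False) if t.strip()]
    gt_tokens = [t for t in jieba.cut(ground_truth, cut_all=False) if t.strip()]
    # Multiset intersection: how many predicted tokens also appear in the reference.
    num_same = sum((Counter(pred_tokens) & Counter(gt_tokens)).values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)


# Hypothetical prediction/answer pair; real pairs come from RES_FILE and ANS_FILE.
print(round(token_f1("北京大学", "第十八届年会在北京大学举办"), 4))
```

The actual implementation in `eval/eval.py` additionally normalizes each token via `normalize_zh_answer` (lower-casing plus ASCII and full-width punctuation removal) before computing precision, recall, and F1.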