From 4c874c705ee6f0fc5c80af3df8199ec8d2194247 Mon Sep 17 00:00:00 2001
From: "Xingjun.Wang" <wangxingjun778@163.com>
Date: Fri, 8 Nov 2024 14:27:43 +0800
Subject: [PATCH 1/6] Set pyarrow version (#187)

* add publish workflow

* update pyarrow to <=17.0.0 to avoid installation issues on OSX
---
 requirements/framework.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/framework.txt b/requirements/framework.txt
index 89626ea..956a45c 100644
--- a/requirements/framework.txt
+++ b/requirements/framework.txt
@@ -13,7 +13,7 @@ nltk>=3.9
 openai
 pandas
 plotly
-pyarrow
+pyarrow<=17.0.0
 pympler
 pyyaml
 regex

From fb2e0fd4d951adc0740b3683829ffcd34570540b Mon Sep 17 00:00:00 2001
From: Yunlin Mao <mao.looper@qq.com>
Date: Tue, 12 Nov 2024 14:29:13 +0800
Subject: [PATCH 2/6] fix #192 and make compatible with mteb v1.19 (#196)

---
 .../rag_eval/cmteb/tasks/Clustering.py        | 192 +++++++++---------
 .../backend/rag_eval/cmteb/tasks/Reranking.py | 141 +++++++------
 evalscope/backend/rag_eval/utils/embedding.py |  44 ++--
 examples/example_eval_mteb.py                 | 115 +++++------
 requirements/rag.txt                          |   4 +-
 tests/rag/test_mteb.py                        | 142 ++++++-------
 6 files changed, 321 insertions(+), 317 deletions(-)

diff --git a/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py b/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py
index 0442a96..70c7b11 100644
--- a/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py
+++ b/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py
@@ -17,57 +17,57 @@ class CLSClusteringFastS2S(AbsTaskClusteringFast):
     max_fraction_of_documents_to_embed = None
 
     metadata = TaskMetadata(
-        name="CLSClusteringS2S",
-        description="Clustering of titles from CLS dataset. Clustering of 13 sets on the main category.",
-        reference="https://arxiv.org/abs/2209.05034",
+        name='CLSClusteringS2S',
+        description='Clustering of titles from CLS dataset. Clustering of 13 sets on the main category.',
+        reference='https://arxiv.org/abs/2209.05034',
         dataset={
-            "path": "C-MTEB/CLSClusteringS2S",
-            "revision": "e458b3f5414b62b7f9f83499ac1f5497ae2e869f",
+            'path': 'C-MTEB/CLSClusteringS2S',
+            'revision': 'e458b3f5414b62b7f9f83499ac1f5497ae2e869f',
         },
-        type="Clustering",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["cmn-Hans"],
-        main_score="v_measure",
-        date=("2022-01-01", "2022-09-12"),
-        domains=["Academic", "Written"],
-        task_subtypes=["Thematic clustering", "Topic classification"],
-        license="Apache-2.0",
-        annotations_creators="derived",
+        type='Clustering',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='v_measure',
+        date=('2022-01-01', '2022-09-12'),
+        domains=['Academic', 'Written'],
+        task_subtypes=['Thematic clustering', 'Topic classification'],
+        license='apache-2.0',
+        annotations_creators='derived',
         dialect=[],
-        sample_creation="found",
+        sample_creation='found',
         bibtex_citation="""@misc{li2022csl,
-            title={CSL: A Large-scale Chinese Scientific Literature Dataset}, 
+            title={CSL: A Large-scale Chinese Scientific Literature Dataset},
             author={Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang},
             year={2022},
             eprint={2209.05034},
             archivePrefix={arXiv},
             primaryClass={cs.CL}
-        }""",
+        }""",  # noqa
         descriptive_stats={
-            "n_samples": {"test": NUM_SAMPLES},
-            "avg_character_length": {},
+            'n_samples': {'test': NUM_SAMPLES},
+            'avg_character_length': {},
         },
     )
 
     def dataset_transform(self):
         ds = {}
         for split in self.metadata.eval_splits:
-            labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
+            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
             sentences = list(
-                itertools.chain.from_iterable(self.dataset[split]["sentences"])
+                itertools.chain.from_iterable(self.dataset[split]['sentences'])
             )
 
             check_label_distribution(self.dataset[split])
 
-            ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences})
+            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
         self.dataset = DatasetDict(ds)
         self.dataset = self.stratified_subsampling(
             self.dataset,
             self.seed,
             self.metadata.eval_splits,
-            label="labels",
+            label='labels',
             n_samples=NUM_SAMPLES,
         )
 
@@ -77,57 +77,57 @@ class CLSClusteringFastP2P(AbsTaskClusteringFast):
     max_fraction_of_documents_to_embed = None
 
     metadata = TaskMetadata(
-        name="CLSClusteringP2P",
-        description="Clustering of titles + abstract from CLS dataset. Clustering of 13 sets on the main category.",
-        reference="https://arxiv.org/abs/2209.05034",
+        name='CLSClusteringP2P',
+        description='Clustering of titles + abstract from CLS dataset. Clustering of 13 sets on the main category.',
+        reference='https://arxiv.org/abs/2209.05034',
         dataset={
-            "path": "C-MTEB/CLSClusteringP2P",
-            "revision": "4b6227591c6c1a73bc76b1055f3b7f3588e72476",
+            'path': 'C-MTEB/CLSClusteringP2P',
+            'revision': '4b6227591c6c1a73bc76b1055f3b7f3588e72476',
         },
-        type="Clustering",
-        category="p2p",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["cmn-Hans"],
-        main_score="v_measure",
-        date=("2022-01-01", "2022-09-12"),
-        domains=["Academic", "Written"],
-        task_subtypes=["Thematic clustering", "Topic classification"],
-        license="Apache-2.0",
-        annotations_creators="derived",
+        type='Clustering',
+        category='p2p',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='v_measure',
+        date=('2022-01-01', '2022-09-12'),
+        domains=['Academic', 'Written'],
+        task_subtypes=['Thematic clustering', 'Topic classification'],
+        license='apache-2.0',
+        annotations_creators='derived',
         dialect=[],
-        sample_creation="found",
+        sample_creation='found',
         bibtex_citation="""@misc{li2022csl,
-            title={CSL: A Large-scale Chinese Scientific Literature Dataset}, 
+            title={CSL: A Large-scale Chinese Scientific Literature Dataset},
             author={Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang},
             year={2022},
             eprint={2209.05034},
             archivePrefix={arXiv},
             primaryClass={cs.CL}
-        }""",
+        }""",  # noqa
         descriptive_stats={
-            "n_samples": {"test": NUM_SAMPLES},
-            "avg_character_length": {},
+            'n_samples': {'test': NUM_SAMPLES},
+            'avg_character_length': {},
         },
     )
 
     def dataset_transform(self):
         ds = {}
         for split in self.metadata.eval_splits:
-            labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
+            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
             sentences = list(
-                itertools.chain.from_iterable(self.dataset[split]["sentences"])
+                itertools.chain.from_iterable(self.dataset[split]['sentences'])
             )
 
             check_label_distribution(self.dataset[split])
 
-            ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences})
+            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
         self.dataset = DatasetDict(ds)
         self.dataset = self.stratified_subsampling(
             self.dataset,
             self.seed,
             self.metadata.eval_splits,
-            label="labels",
+            label='labels',
             n_samples=NUM_SAMPLES,
         )
 
@@ -137,26 +137,26 @@ class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
     max_fraction_of_documents_to_embed = None
 
     metadata = TaskMetadata(
-        name="ThuNewsClusteringS2S",
+        name='ThuNewsClusteringS2S',
         dataset={
-            "path": "C-MTEB/ThuNewsClusteringS2S",
-            "revision": "8a8b2caeda43f39e13c4bc5bea0f8a667896e10d",
+            'path': 'C-MTEB/ThuNewsClusteringS2S',
+            'revision': '8a8b2caeda43f39e13c4bc5bea0f8a667896e10d',
         },
-        description="Clustering of titles from the THUCNews dataset",
-        reference="http://thuctc.thunlp.org/",
-        type="Clustering",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["cmn-Hans"],
-        main_score="v_measure",
-        date=("2006-01-01", "2007-01-01"),
-        domains=["News", "Written"],
-        task_subtypes=["Thematic clustering", "Topic classification"],
-        license="Not specified",
-        annotations_creators="derived",
+        description='Clustering of titles from the THUCNews dataset',
+        reference='http://thuctc.thunlp.org/',
+        type='Clustering',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='v_measure',
+        date=('2006-01-01', '2007-01-01'),
+        domains=['News', 'Written'],
+        task_subtypes=['Thematic clustering', 'Topic classification'],
+        license='apache-2.0',
+        annotations_creators='derived',
         dialect=[],
-        sample_creation="found",
+        sample_creation='found',
         bibtex_citation="""@software{THUCTC,
   author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
   title = {THUCTC: An Efficient Chinese Text Classifier},
@@ -166,28 +166,28 @@ class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
   url = {https://github.com/thunlp/THUCTC}
 }""",
         descriptive_stats={
-            "n_samples": {"test": NUM_SAMPLES},
-            "avg_character_length": {},
+            'n_samples': {'test': NUM_SAMPLES},
+            'avg_character_length': {},
         },
     )
 
     def dataset_transform(self):
         ds = {}
         for split in self.metadata.eval_splits:
-            labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
+            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
             sentences = list(
-                itertools.chain.from_iterable(self.dataset[split]["sentences"])
+                itertools.chain.from_iterable(self.dataset[split]['sentences'])
             )
 
             check_label_distribution(self.dataset[split])
 
-            ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences})
+            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
         self.dataset = DatasetDict(ds)
         self.dataset = self.stratified_subsampling(
             self.dataset,
             self.seed,
             self.metadata.eval_splits,
-            label="labels",
+            label='labels',
             n_samples=NUM_SAMPLES,
         )
 
@@ -197,26 +197,26 @@ class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
     max_fraction_of_documents_to_embed = None
 
     metadata = TaskMetadata(
-        name="ThuNewsClusteringP2P",
+        name='ThuNewsClusteringP2P',
         dataset={
-            "path": "C-MTEB/ThuNewsClusteringP2P",
-            "revision": "5798586b105c0434e4f0fe5e767abe619442cf93",
+            'path': 'C-MTEB/ThuNewsClusteringP2P',
+            'revision': '5798586b105c0434e4f0fe5e767abe619442cf93',
         },
-        description="Clustering of titles + abstracts from the THUCNews dataset",
-        reference="http://thuctc.thunlp.org/",
-        type="Clustering",
-        category="p2p",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["cmn-Hans"],
-        main_score="v_measure",
-        date=("2006-01-01", "2007-01-01"),
-        domains=["News", "Written"],
-        task_subtypes=["Thematic clustering", "Topic classification"],
-        license="Not specified",
-        annotations_creators="derived",
+        description='Clustering of titles + abstracts from the THUCNews dataset',
+        reference='http://thuctc.thunlp.org/',
+        type='Clustering',
+        category='p2p',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='v_measure',
+        date=('2006-01-01', '2007-01-01'),
+        domains=['News', 'Written'],
+        task_subtypes=['Thematic clustering', 'Topic classification'],
+        license='apache-2.0',
+        annotations_creators='derived',
         dialect=[],
-        sample_creation="found",
+        sample_creation='found',
         bibtex_citation="""@software{THUCTC,
   author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
   title = {THUCTC: An Efficient Chinese Text Classifier},
@@ -226,27 +226,27 @@ class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
   url = {https://github.com/thunlp/THUCTC}
 }""",
         descriptive_stats={
-            "n_samples": {"test": NUM_SAMPLES},
-            "avg_character_length": {},
+            'n_samples': {'test': NUM_SAMPLES},
+            'avg_character_length': {},
         },
     )
 
     def dataset_transform(self):
         ds = {}
         for split in self.metadata.eval_splits:
-            labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
+            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
             sentences = list(
-                itertools.chain.from_iterable(self.dataset[split]["sentences"])
+                itertools.chain.from_iterable(self.dataset[split]['sentences'])
             )
 
             check_label_distribution(self.dataset[split])
 
-            ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences})
+            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
         self.dataset = DatasetDict(ds)
         self.dataset = self.stratified_subsampling(
             self.dataset,
             self.seed,
             self.metadata.eval_splits,
-            label="labels",
+            label='labels',
             n_samples=NUM_SAMPLES,
         )
diff --git a/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py b/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py
index 41d2ecb..453f2bc 100644
--- a/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py
+++ b/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py
@@ -2,22 +2,21 @@
 from mteb.abstasks.TaskMetadata import HFSubset, TaskMetadata
 
 
-
 class T2Reranking(AbsTaskReranking):
     metadata = TaskMetadata(
-        name="T2Reranking",
-        description="T2Ranking: A large-scale Chinese Benchmark for Passage Ranking",
-        reference="https://arxiv.org/abs/2304.03679",
+        name='T2Reranking',
+        description='T2Ranking: A large-scale Chinese Benchmark for Passage Ranking',
+        reference='https://arxiv.org/abs/2304.03679',
         dataset={
-            "path": "C-MTEB/T2Reranking",
-            "revision": "76631901a18387f85eaa53e5450019b87ad58ef9",
+            'path': 'C-MTEB/T2Reranking',
+            'revision': '76631901a18387f85eaa53e5450019b87ad58ef9',
         },
-        type="Reranking",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["dev"],
-        eval_langs=["cmn-Hans"],
-        main_score="map",
+        type='Reranking',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['dev'],
+        eval_langs=['cmn-Hans'],
+        main_score='map',
         date=None,
         form=None,
         domains=None,
@@ -27,32 +26,32 @@ class T2Reranking(AbsTaskReranking):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xie2023t2ranking,
-      title={T2Ranking: A large-scale Chinese Benchmark for Passage Ranking}, 
+      title={T2Ranking: A large-scale Chinese Benchmark for Passage Ranking},
       author={Xiaohui Xie and Qian Dong and Bingning Wang and Feiyang Lv and Ting Yao and Weinan Gan and Zhijing Wu and Xiangsheng Li and Haitao Li and Yiqun Liu and Jin Ma},
       year={2023},
       eprint={2304.03679},
       archivePrefix={arXiv},
       primaryClass={cs.IR}
-}""",
-        descriptive_stats={"n_samples": None, "avg_character_length": None},
+}""",  # noqa
+        descriptive_stats={'n_samples': None, 'avg_character_length': None},
     )
 
 
 class MMarcoReranking(AbsTaskReranking):
     metadata = TaskMetadata(
-        name="MMarcoReranking",
-        description="mMARCO is a multilingual version of the MS MARCO passage ranking dataset",
-        reference="https://github.com/unicamp-dl/mMARCO",
+        name='MMarcoReranking',
+        description='mMARCO is a multilingual version of the MS MARCO passage ranking dataset',
+        reference='https://github.com/unicamp-dl/mMARCO',
         dataset={
-            "path": "C-MTEB/Mmarco-reranking",
-            "revision": "8e0c766dbe9e16e1d221116a3f36795fbade07f6",
+            'path': 'C-MTEB/Mmarco-reranking',
+            'revision': '8e0c766dbe9e16e1d221116a3f36795fbade07f6',
         },
-        type="Reranking",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["dev"],
-        eval_langs=["cmn-Hans"],
-        main_score="map",
+        type='Reranking',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['dev'],
+        eval_langs=['cmn-Hans'],
+        main_score='map',
         date=None,
         form=None,
         domains=None,
@@ -62,39 +61,39 @@ class MMarcoReranking(AbsTaskReranking):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{bonifacio2021mmarco,
-      title={mMARCO: A Multilingual Version of MS MARCO Passage Ranking Dataset}, 
+      title={mMARCO: A Multilingual Version of MS MARCO Passage Ranking Dataset},
       author={Luiz Henrique Bonifacio and Vitor Jeronymo and Hugo Queiroz Abonizio and Israel Campiotti and Marzieh Fadaee and  and Roberto Lotufo and Rodrigo Nogueira},
       year={2021},
       eprint={2108.13897},
       archivePrefix={arXiv},
       primaryClass={cs.CL}
-}""",
-        descriptive_stats={"n_samples": None, "avg_character_length": None},
+}""",  # noqa
+        descriptive_stats={'n_samples': None, 'avg_character_length': None},
     )
 
 
 class CMedQAv1(AbsTaskReranking):
     metadata = TaskMetadata(
-        name="CMedQAv1",
-        description="Chinese community medical question answering",
-        reference="https://github.com/zhangsheng93/cMedQA",
+        name='CMedQAv1',
+        description='Chinese community medical question answering',
+        reference='https://github.com/zhangsheng93/cMedQA',
         dataset={
-            "path": "C-MTEB/CMedQAv1-reranking",
-            "revision": "8d7f1e942507dac42dc58017c1a001c3717da7df",
+            'path': 'C-MTEB/CMedQAv1-reranking',
+            'revision': '8d7f1e942507dac42dc58017c1a001c3717da7df',
         },
-        type="Reranking",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["cmn-Hans"],
-        main_score="map",
-        date=("2017-01-01", "2017-07-26"),
-        domains=["Medical", "Written"],
+        type='Reranking',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='map',
+        date=('2017-01-01', '2017-07-26'),
+        domains=['Medical', 'Written'],
         task_subtypes=[],
-        license="Not specified",
-        annotations_creators="expert-annotated",
+        license='apache-2.0',
+        annotations_creators='expert-annotated',
         dialect=[],
-        sample_creation="found",
+        sample_creation='found',
         bibtex_citation="""@article{zhang2017chinese,
   title={Chinese Medical Question Answer Matching Using End-to-End Character-Level Multi-Scale CNNs},
   author={Zhang, Sheng and Zhang, Xin and Wang, Hui and Cheng, Jiajun and Li, Pei and Ding, Zhaoyun},
@@ -106,27 +105,27 @@ class CMedQAv1(AbsTaskReranking):
   publisher={Multidisciplinary Digital Publishing Institute}
 }""",
         descriptive_stats={
-            "n_samples": {"test": 2000},
-            "avg_character_length": {"test": 165},
+            'n_samples': {'test': 2000},
+            'avg_character_length': {'test': 165},
         },
     )
 
 
 class CMedQAv2(AbsTaskReranking):
     metadata = TaskMetadata(
-        name="CMedQAv2",
-        description="Chinese community medical question answering",
-        reference="https://github.com/zhangsheng93/cMedQA2",
+        name='CMedQAv2',
+        description='Chinese community medical question answering',
+        reference='https://github.com/zhangsheng93/cMedQA2',
         dataset={
-            "path": "C-MTEB/CMedQAv2-reranking",
-            "revision": "23d186750531a14a0357ca22cd92d712fd512ea0",
+            'path': 'C-MTEB/CMedQAv2-reranking',
+            'revision': '23d186750531a14a0357ca22cd92d712fd512ea0',
         },
-        type="Reranking",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["cmn-Hans"],
-        main_score="map",
+        type='Reranking',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='map',
         date=None,
         form=None,
         domains=None,
@@ -135,17 +134,17 @@ class CMedQAv2(AbsTaskReranking):
         annotations_creators=None,
         dialect=None,
         sample_creation=None,
-        bibtex_citation="""@ARTICLE{8548603, 
-author={S. Zhang and X. Zhang and H. Wang and L. Guo and S. Liu}, 
-journal={IEEE Access}, 
-title={Multi-Scale Attentive Interaction Networks for Chinese Medical Question Answer Selection}, 
-year={2018}, 
-volume={6}, 
-number={}, 
-pages={74061-74071}, 
-keywords={Biomedical imaging;Data mining;Semantics;Medical services;Feature extraction;Knowledge discovery;Medical question answering;interactive attention;deep learning;deep neural networks}, 
-doi={10.1109/ACCESS.2018.2883637}, 
-ISSN={2169-3536}, 
-month={},}""",
-        descriptive_stats={"n_samples": None, "avg_character_length": None},
+        bibtex_citation="""@ARTICLE{8548603,
+author={S. Zhang and X. Zhang and H. Wang and L. Guo and S. Liu},
+journal={IEEE Access},
+title={Multi-Scale Attentive Interaction Networks for Chinese Medical Question Answer Selection},
+year={2018},
+volume={6},
+number={},
+pages={74061-74071},
+keywords={Biomedical imaging;Data mining;Semantics;Medical services;Feature extraction;Knowledge discovery;Medical question answering;interactive attention;deep learning;deep neural networks},
+doi={10.1109/ACCESS.2018.2883637},
+ISSN={2169-3536},
+month={},}""",  # noqa
+        descriptive_stats={'n_samples': None, 'avg_character_length': None},
     )
diff --git a/evalscope/backend/rag_eval/utils/embedding.py b/evalscope/backend/rag_eval/utils/embedding.py
index dc97a1e..0974415 100644
--- a/evalscope/backend/rag_eval/utils/embedding.py
+++ b/evalscope/backend/rag_eval/utils/embedding.py
@@ -17,20 +17,20 @@ def __init__(
         self,
         model_name_or_path: str,
         max_seq_length: int = 512,
-        prompt: str = "",
+        prompt: str = '',
         revision: Optional[str] = None,
         **kwargs,
     ):
         self.model_name_or_path = model_name_or_path
         self.max_seq_length = max_seq_length
-        self.model_kwargs = kwargs.pop("model_kwargs", {})
-        self.model_kwargs["trust_remote_code"] = True
+        self.model_kwargs = kwargs.pop('model_kwargs', {})
+        self.model_kwargs['trust_remote_code'] = True
 
-        self.config_kwargs = kwargs.pop("config_kwargs", {})
-        self.config_kwargs["trust_remote_code"] = True
+        self.config_kwargs = kwargs.pop('config_kwargs', {})
+        self.config_kwargs['trust_remote_code'] = True
 
-        self.encode_kwargs = kwargs.pop("encode_kwargs", {})
-        self.encode_kwargs["convert_to_tensor"] = True
+        self.encode_kwargs = kwargs.pop('encode_kwargs', {})
+        self.encode_kwargs['convert_to_tensor'] = True
 
         self.prompt = prompt
         self.revision = revision
@@ -73,7 +73,6 @@ def encode(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]:
         """Embed text."""
         raise NotImplementedError
 
-
     def encode_queries(self, queries: List[str], **kwargs) -> list[torch.Tensor]:
         """Embed query text. Compact mteb."""
         raise NotImplementedError
@@ -81,7 +80,8 @@ def encode_queries(self, queries: List[str], **kwargs) -> list[torch.Tensor]:
     def encode_corpus(self, corpus: List[str] | List[Dict[str, str]], **kwargs) -> list[torch.Tensor]:
         """Embed search docs . Compact mteb."""
         raise NotImplementedError
-    
+
+
 class SentenceTransformerModel(BaseModel):
     def __init__(
         self, model_name_or_path: str, pooling_mode: Optional[str] = None, **kwargs
@@ -111,16 +111,16 @@ def __init__(
         self.model.max_seq_length = self.max_seq_length
 
     def encode(self, texts: Union[str, List[str]], prompt=None, **kwargs) -> List[torch.Tensor]:
-        kwargs.pop("prompt_name", "")  # remove prompt name, use prompt
+        kwargs.pop('prompt_name', '')  # remove prompt name, use prompt
         self.encode_kwargs.update(kwargs)
-        
+
         embeddings = self.model.encode(texts, prompt=prompt, **self.encode_kwargs)
         assert isinstance(embeddings, Tensor)
         return embeddings.cpu().detach()
-    
+
     def encode_queries(self, queries, **kwargs):
         return self.encode(queries, prompt=self.prompt)
-    
+
     def encode_corpus(self, corpus, **kwargs):
         if isinstance(corpus[0], dict):
             input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
@@ -128,6 +128,7 @@ def encode_corpus(self, corpus, **kwargs):
             input_texts = corpus
         return self.encode(input_texts)
 
+
 class CrossEncoderModel(BaseModel):
     def __init__(self, model_name_or_path: str, **kwargs):
         super().__init__(model_name_or_path, **kwargs)
@@ -141,9 +142,12 @@ def predict(self, sentences: List[List[str]], **kwargs) -> List[List[float]]:
         self.encode_kwargs.update(kwargs)
 
         if len(sentences[0]) == 3:  # Note: For mteb retrieval task
-            sentences = [
-                (self.prompt + query, docs) for query, docs, instruction in sentences
-            ]
+            processed_sentences = []
+            for query, docs, instruction in sentences:
+                if isinstance(docs, dict):
+                    docs = docs['text']
+                processed_sentences.append((self.prompt + query, docs))
+            sentences = processed_sentences
         embeddings = self.model.predict(sentences, **self.encode_kwargs)
         assert isinstance(embeddings, Tensor)
         return embeddings
@@ -154,14 +158,14 @@ class EmbeddingModel:
 
     @staticmethod
     def load(
-        model_name_or_path: str = "",
+        model_name_or_path: str = '',
         is_cross_encoder: bool = False,
-        hub: str = "modelscope",
-        revision: Optional[str] = "master",
+        hub: str = 'modelscope',
+        revision: Optional[str] = 'master',
         **kwargs,
     ):
         # If model path does not exist and hub is 'modelscope', download the model
-        if not os.path.exists(model_name_or_path) and hub == "modelscope":
+        if not os.path.exists(model_name_or_path) and hub == 'modelscope':
             model_name_or_path = download_model(model_name_or_path, revision)
 
         # Return different model instances based on whether it is a cross-encoder and pooling mode
diff --git a/examples/example_eval_mteb.py b/examples/example_eval_mteb.py
index 6b0ea5f..4fc4139 100644
--- a/examples/example_eval_mteb.py
+++ b/examples/example_eval_mteb.py
@@ -17,84 +17,85 @@ def run_eval():
 
     # Prepare the config
 
-    one_stage_task_cfg = {
-        "eval_backend": "RAGEval",
-        "eval_config": {
-            "tool": "MTEB",
-            "model": [
-                {   
-                    "model_name_or_path": "AI-ModelScope/bge-large-zh",
-                    "pooling_mode": "cls",  # if not set, load from model config; use `cls` for bge series model
-                    "max_seq_length": 512,
-                    "prompt": "为这个句子生成表示以用于检索相关文章：",
-                    "encode_kwargs": {
-                        "batch_size": 512,
+    one_stage_task_cfg = {  # noqa
+        'eval_backend': 'RAGEval',
+        'eval_config': {
+            'tool': 'MTEB',
+            'model': [
+                {
+                    'model_name_or_path': 'AI-ModelScope/bge-large-zh',
+                    'pooling_mode': 'cls',  # if not set, load from model config; use `cls` for bge series model
+                    'max_seq_length': 512,
+                    'prompt': '为这个句子生成表示以用于检索相关文章：',
+                    'encode_kwargs': {
+                        'batch_size': 512,
                     },
                 }
             ],
-            "eval": {
-                "tasks": [
-                    "TNews",
-                    "CLSClusteringS2S",
-                    "T2Reranking",
-                    "ATEC",
-                    "T2Retrieval",
-                    "MMarcoRetrieval",
-                    "DuRetrieval",
-                    "CovidRetrieval",
-                    "CmedqaRetrieval",
-                    "EcomRetrieval",
-                    "MedicalRetrieval",
-                    "VideoRetrieval"
+            'eval': {
+                'tasks': [
+                    'TNews',
+                    'CLSClusteringS2S',
+                    'T2Reranking',
+                    'ATEC',
+                    'T2Retrieval',
+                    'MMarcoRetrieval',
+                    'DuRetrieval',
+                    'CovidRetrieval',
+                    'CmedqaRetrieval',
+                    'EcomRetrieval',
+                    'MedicalRetrieval',
+                    'VideoRetrieval'
                 ],
-                "verbosity": 2,
-                "output_folder": "outputs",
-                "overwrite_results": True,
-                "top_k" : 10,
-                "limits": 1000, # don't limit for retrieval task
+                'verbosity': 2,
+                'output_folder': 'outputs',
+                'overwrite_results': True,
+                'top_k': 10,
+                'limits': 1000,  # don't limit for retrieval task
             },
         },
     }
 
     two_stage_task_cfg = {
-        "eval_backend": "RAGEval",
-        "eval_config": {
-            "tool": "MTEB",
-            "model": [
+        'eval_backend': 'RAGEval',
+        'eval_config': {
+            'tool': 'MTEB',
+            'model': [
                 {
-                    "model_name_or_path": "AI-ModelScope/m3e-base",
-                    "is_cross_encoder": False,
-                    "max_seq_length": 512,
-                    "prompt": "",
-                    "model_kwargs": {"torch_dtype": "auto"},
-                    "encode_kwargs": {
-                        "batch_size": 64,
+                    'model_name_or_path': 'AI-ModelScope/m3e-base',
+                    'is_cross_encoder': False,
+                    'max_seq_length': 512,
+                    'prompt': '',
+                    'model_kwargs': {'torch_dtype': 'auto'},
+                    'encode_kwargs': {
+                        'batch_size': 64,
                     },
                 },
                 {
-                    "model_name_or_path": "OpenBMB/MiniCPM-Reranker",
-                    "is_cross_encoder": True,
-                    "max_seq_length": 512,
-                    "prompt": "为这个问题生成一个检索用的表示",
-                    "model_kwargs": {"torch_dtype": "auto"},
-                    "encode_kwargs": {
-                        "batch_size": 32,
+                    'model_name_or_path': 'OpenBMB/MiniCPM-Reranker',
+                    'is_cross_encoder': True,
+                    'max_seq_length': 512,
+                    'prompt': '为这个问题生成一个检索用的表示',
+                    'model_kwargs': {'torch_dtype': 'auto'},
+                    'encode_kwargs': {
+                        'batch_size': 32,
                     },
                 },
             ],
-            "eval": {
-                "tasks": ["T2Retrieval"],
-                "verbosity": 2,
-                "output_folder": "outputs",
-                "overwrite_results": True,
-                "limits": 100,
+            'eval': {
+                'tasks': ['T2Retrieval'],
+                'verbosity': 2,
+                'output_folder': 'outputs',
+                'overwrite_results': True,
+                'limits': 100,
             },
         },
     }
 
     # Run task
-    run_task(task_cfg=one_stage_task_cfg)
+    # run_task(task_cfg=one_stage_task_cfg)
+    run_task(task_cfg=two_stage_task_cfg)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     run_eval()
diff --git a/requirements/rag.txt b/requirements/rag.txt
index a7a9038..8f5df71 100644
--- a/requirements/rag.txt
+++ b/requirements/rag.txt
@@ -1,3 +1,3 @@
-mteb>=0.14.16
-ragas>=0.2.3,<0.3
+mteb==1.19.4
+ragas==0.2.3
 webdataset>0.2.0
diff --git a/tests/rag/test_mteb.py b/tests/rag/test_mteb.py
index a05b326..a5e834d 100644
--- a/tests/rag/test_mteb.py
+++ b/tests/rag/test_mteb.py
@@ -12,7 +12,7 @@
 class TestMTEB(unittest.TestCase):
 
     def setUp(self) -> None:
-        self._check_env("mteb")
+        self._check_env('mteb')
 
     def tearDown(self) -> None:
         pass
@@ -20,111 +20,111 @@ def tearDown(self) -> None:
     @staticmethod
     def _check_env(module_name: str):
         if is_module_installed(module_name):
-            logger.info(f"{module_name} is installed.")
+            logger.info(f'{module_name} is installed.')
         else:
-            raise ModuleNotFoundError(f"run: pip install {module_name}")
+            raise ModuleNotFoundError(f'run: pip install {module_name}')
 
-    @unittest.skipUnless(0 in test_level_list(), "skip test in current test level")
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_one_stage_mteb(self):
         task_cfg = {
-            "eval_backend": "RAGEval",
-            "eval_config": {
-                "tool": "MTEB",
-                "model": [
+            'eval_backend': 'RAGEval',
+            'eval_config': {
+                'tool': 'MTEB',
+                'model': [
                     {
-                        "model_name_or_path": "AI-ModelScope/m3e-base",
-                        "pooling_mode": None,  # load from model config
-                        "max_seq_length": 512,
-                        "prompt": "",
-                        "model_kwargs": {"torch_dtype": "auto"},
-                        "encode_kwargs": {
-                            "batch_size": 128,
+                        'model_name_or_path': 'AI-ModelScope/m3e-base',
+                        'pooling_mode': None,  # load from model config
+                        'max_seq_length': 512,
+                        'prompt': '',
+                        'model_kwargs': {'torch_dtype': 'auto'},
+                        'encode_kwargs': {
+                            'batch_size': 128,
                         },
                     }
                 ],
-                "eval": {
-                    "tasks": [
-                        "TNews",
-                        "CLSClusteringS2S",
-                        "T2Reranking",
-                        "T2Retrieval",
-                        "ATEC",
+                'eval': {
+                    'tasks': [
+                        'TNews',
+                        'CLSClusteringS2S',
+                        'T2Reranking',
+                        'T2Retrieval',
+                        'ATEC',
                     ],
-                    "verbosity": 2,
-                    "output_folder": "outputs",
-                    "overwrite_results": True,
-                    "limits": 500,
+                    'verbosity': 2,
+                    'output_folder': 'outputs',
+                    'overwrite_results': True,
+                    'limits': 500,
                 },
             },
         }
 
         run_task(task_cfg)
 
-    @unittest.skipUnless(0 in test_level_list(), "skip test in current test level")
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_two_stage_mteb(self):
         task_cfg = {
-            "eval_backend": "RAGEval",
-            "eval_config": {
-                "tool": "MTEB",
-                "model": [
+            'eval_backend': 'RAGEval',
+            'eval_config': {
+                'tool': 'MTEB',
+                'model': [
                     {
-                        "model_name_or_path": "AI-ModelScope/m3e-base",
-                        "is_cross_encoder": False,
-                        "max_seq_length": 512,
-                        "prompt": "",
-                        "model_kwargs": {"torch_dtype": "auto"},
-                        "encode_kwargs": {
-                            "batch_size": 64,
+                        'model_name_or_path': 'AI-ModelScope/m3e-base',
+                        'is_cross_encoder': False,
+                        'max_seq_length': 512,
+                        'prompt': '',
+                        'model_kwargs': {'torch_dtype': 'auto'},
+                        'encode_kwargs': {
+                            'batch_size': 64,
                         },
                     },
                     {
-                        "model_name_or_path": "OpenBMB/MiniCPM-Reranker",
-                        "is_cross_encoder": True,
-                        "max_seq_length": 512,
-                        "prompt": "为这个问题生成一个检索用的表示",
-                        "model_kwargs": {"torch_dtype": "auto"},
-                        "encode_kwargs": {
-                            "batch_size": 32,
+                        'model_name_or_path': 'OpenBMB/MiniCPM-Reranker',
+                        'is_cross_encoder': True,
+                        'max_seq_length': 512,
+                        'prompt': '为这个问题生成一个检索用的表示',
+                        'model_kwargs': {'torch_dtype': 'auto'},
+                        'encode_kwargs': {
+                            'batch_size': 32,
                         },
                     },
                 ],
-                "eval": {
-                    "tasks": ["T2Retrieval"],
-                    "verbosity": 2,
-                    "output_folder": "outputs",
-                    "overwrite_results": True,
-                    "limits": 100,
+                'eval': {
+                    'tasks': ['MedicalRetrieval', 'T2Retrieval'],
+                    'verbosity': 2,
+                    'output_folder': 'outputs',
+                    'overwrite_results': True,
+                    'limits': 10,
                 },
             },
         }
 
         run_task(task_cfg)
 
-    @unittest.skipUnless(0 in test_level_list(), "skip test in current test level")
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_custom(self):
         task_cfg = {
-            "eval_backend": "RAGEval",
-            "eval_config": {
-                "tool": "MTEB",
-                "model": [
+            'eval_backend': 'RAGEval',
+            'eval_config': {
+                'tool': 'MTEB',
+                'model': [
                     {
-                        "model_name_or_path": "AI-ModelScope/m3e-base",
-                        "pooling_mode": None,  # load from model config
-                        "max_seq_length": 512,
-                        "prompt": "",
-                        "model_kwargs": {"torch_dtype": "auto"},
-                        "encode_kwargs": {
-                            "batch_size": 128,
+                        'model_name_or_path': 'AI-ModelScope/m3e-base',
+                        'pooling_mode': None,  # load from model config
+                        'max_seq_length': 512,
+                        'prompt': '',
+                        'model_kwargs': {'torch_dtype': 'auto'},
+                        'encode_kwargs': {
+                            'batch_size': 128,
                         },
                     }
                 ],
-                "eval": {
-                    "tasks": ["CustomRetrieval"],
-                    "dataset_path": "custom_eval/text/retrieval",
-                    "verbosity": 2,
-                    "output_folder": "outputs",
-                    "overwrite_results": True,
-                    "limits": 500,
+                'eval': {
+                    'tasks': ['CustomRetrieval'],
+                    'dataset_path': 'custom_eval/text/retrieval',
+                    'verbosity': 2,
+                    'output_folder': 'outputs',
+                    'overwrite_results': True,
+                    'limits': 500,
                 },
             },
         }
@@ -132,5 +132,5 @@ def test_run_custom(self):
         run_task(task_cfg)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     unittest.main(buffer=False)

From 99ca993f832709b13a6d3665ae650b58ec1758f9 Mon Sep 17 00:00:00 2001
From: "Xingjun.Wang" <wangxingjun778@163.com>
Date: Tue, 12 Nov 2024 17:45:13 +0800
Subject: [PATCH 3/6] Add cmmlu (#198)

* add cmmlu

* add cmmlu
---
 evalscope/backend/opencompass/tasks/eval_datasets.py | 1 +
 requirements/opencompass.txt                         | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/evalscope/backend/opencompass/tasks/eval_datasets.py b/evalscope/backend/opencompass/tasks/eval_datasets.py
index 104929b..08c4ec0 100644
--- a/evalscope/backend/opencompass/tasks/eval_datasets.py
+++ b/evalscope/backend/opencompass/tasks/eval_datasets.py
@@ -50,6 +50,7 @@
     from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
     from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
     from opencompass.configs.datasets.cmb.cmb_gen_dfb5c4 import cmb_datasets
+    from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
 
     # Note: to be supported
     # from opencompass.configs.datasets.flores.flores_gen_806ede import flores_datasets
diff --git a/requirements/opencompass.txt b/requirements/opencompass.txt
index ebed2d3..335d288 100644
--- a/requirements/opencompass.txt
+++ b/requirements/opencompass.txt
@@ -1 +1 @@
-ms-opencompass>=0.1.1
+ms-opencompass>=0.1.3

From 776fd5977689b1d3b97735394c609805f0fcd8f1 Mon Sep 17 00:00:00 2001
From: Yunlin Mao <mao.looper@qq.com>
Date: Tue, 12 Nov 2024 18:05:54 +0800
Subject: [PATCH 4/6] update oc docs (#199)

---
 .../backend/opencompass_backend.md            |  14 +-
 .../backend/opencompass_backend.md            |  16 ++-
 tests/rag/test_ragas.py                       | 120 +++++++++---------
 3 files changed, 87 insertions(+), 63 deletions(-)

diff --git a/docs/en/user_guides/backend/opencompass_backend.md b/docs/en/user_guides/backend/opencompass_backend.md
index d3a9fb8..0c80f67 100644
--- a/docs/en/user_guides/backend/opencompass_backend.md
+++ b/docs/en/user_guides/backend/opencompass_backend.md
@@ -15,7 +15,19 @@ There are two ways to download datasets. The automatic download method supports
 You can view the dataset name list using the following code:
 ```python
 from evalscope.backend.opencompass import OpenCompassBackendManager
-print(f'All datasets from OpenCompass backend: {OpenCompassBackendManager.list_datasets()}')
+# list datasets
+OpenCompassBackendManager.list_datasets()
+
+>>> ['summedits', 'humaneval', 'lambada', 
+'ARC_c', 'ARC_e', 'CB', 'C3', 'cluewsc', 'piqa',
+ 'bustm', 'storycloze', 'lcsts', 'Xsum', 'winogrande', 
+ 'ocnli', 'AX_b', 'math', 'race', 'hellaswag', 
+ 'WSC', 'eprstmt', 'siqa', 'agieval', 'obqa',
+ 'afqmc', 'GaokaoBench', 'triviaqa', 'CMRC', 
+ 'chid', 'gsm8k', 'ceval', 'COPA', 'ReCoRD', 
+ 'ocnli_fc', 'mbpp', 'csl', 'tnews', 'RTE', 
+ 'cmnli', 'AX_g', 'nq', 'cmb', 'BoolQ', 'strategyqa', 
+ 'mmlu', 'WiC', 'MultiRC', 'DRCD', 'cmmlu']
 ```
 ````
 
diff --git a/docs/zh/user_guides/backend/opencompass_backend.md b/docs/zh/user_guides/backend/opencompass_backend.md
index cad21b9..7266aba 100644
--- a/docs/zh/user_guides/backend/opencompass_backend.md
+++ b/docs/zh/user_guides/backend/opencompass_backend.md
@@ -18,10 +18,22 @@ pip install evalscope[opencompass] -U
 
 数据集的详细信息可以参考[OpenCompass数据集列表](../../get_started/supported_dataset.md#2-opencompass评测后端支持的数据集)
 
-您可以使用以下方式，来查看数据集的名称列表：
+您可以使用以下方式，来查看支持的数据集的名称列表：
 ```python
 from evalscope.backend.opencompass import OpenCompassBackendManager
-print(f'All datasets from OpenCompass backend: {OpenCompassBackendManager.list_datasets()}')
+# 显示支持的数据集名称列表
+OpenCompassBackendManager.list_datasets()
+
+>>> ['summedits', 'humaneval', 'lambada', 
+'ARC_c', 'ARC_e', 'CB', 'C3', 'cluewsc', 'piqa',
+ 'bustm', 'storycloze', 'lcsts', 'Xsum', 'winogrande', 
+ 'ocnli', 'AX_b', 'math', 'race', 'hellaswag', 
+ 'WSC', 'eprstmt', 'siqa', 'agieval', 'obqa',
+ 'afqmc', 'GaokaoBench', 'triviaqa', 'CMRC', 
+ 'chid', 'gsm8k', 'ceval', 'COPA', 'ReCoRD', 
+ 'ocnli_fc', 'mbpp', 'csl', 'tnews', 'RTE', 
+ 'cmnli', 'AX_g', 'nq', 'cmb', 'BoolQ', 'strategyqa', 
+ 'mmlu', 'WiC', 'MultiRC', 'DRCD', 'cmmlu']
 ```
 ````
 
diff --git a/tests/rag/test_ragas.py b/tests/rag/test_ragas.py
index 0d88806..3cbd9db 100644
--- a/tests/rag/test_ragas.py
+++ b/tests/rag/test_ragas.py
@@ -11,7 +11,7 @@
 class TestRAGAS(unittest.TestCase):
 
     def setUp(self) -> None:
-        self._check_env("ragas")
+        self._check_env('ragas')
 
     def tearDown(self) -> None:
         pass
@@ -19,102 +19,102 @@ def tearDown(self) -> None:
     @staticmethod
     def _check_env(module_name: str):
         if is_module_installed(module_name):
-            logger.info(f"{module_name} is installed.")
+            logger.info(f'{module_name} is installed.')
         else:
-            raise ModuleNotFoundError(f"run: pip install {module_name}")
+            raise ModuleNotFoundError(f'run: pip install {module_name}')
 
-    @unittest.skipUnless(0 in test_level_list(), "skip test in current test level")
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_generate_dataset(self):
         task_cfg = {
-            "eval_backend": "RAGEval",
-            "eval_config": {
-                "tool": "RAGAS",
-                "testset_generation": {
-                    "docs": ["README_zh.md"],
-                    "test_size": 5,
-                    "output_file": "outputs/testset.json",
-                    "distribution": {
-                        "simple": 0.5,
-                        "multi_context": 0.4,
-                        "reasoning": 0.1,
+            'eval_backend': 'RAGEval',
+            'eval_config': {
+                'tool': 'RAGAS',
+                'testset_generation': {
+                    'docs': ['README_zh.md'],
+                    'test_size': 5,
+                    'output_file': 'outputs/testset.json',
+                    'distribution': {
+                        'simple': 0.5,
+                        'multi_context': 0.4,
+                        'reasoning': 0.1,
                     },
-                    "generator_llm": {
-                        "model_name_or_path": "qwen/Qwen2-7B-Instruct",
-                        "template_type": "qwen",
+                    'generator_llm': {
+                        'model_name_or_path': 'qwen/Qwen2-7B-Instruct',
+                        'template_type': 'qwen',
                     },
-                    "embeddings": {
-                        "model_name_or_path": "AI-ModelScope/m3e-base",
+                    'embeddings': {
+                        'model_name_or_path': 'AI-ModelScope/m3e-base',
                     },
-                    "language": "chinese",
+                    'language': 'chinese',
                 },
             },
         }
 
-        logger.info(f">> Start to run task: {task_cfg}")
+        logger.info(f'>> Start to run task: {task_cfg}')
 
         run_task(task_cfg)
 
-    @unittest.skipUnless(0 in test_level_list(), "skip test in current test level")
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_rag_eval(self):
         task_cfg = {
-            "eval_backend": "RAGEval",
-            "eval_config": {
-                "tool": "RAGAS",
-                "eval": {
-                    "testset_file": "outputs/testset.json",
-                    "critic_llm": {
-                        "model_name_or_path": "qwen/Qwen2-7B-Instruct",
-                        "template_type": "qwen",
+            'eval_backend': 'RAGEval',
+            'eval_config': {
+                'tool': 'RAGAS',
+                'eval': {
+                    'testset_file': 'outputs/testset_chinese_with_answer.json',
+                    'critic_llm': {
+                        'model_name_or_path': 'qwen/Qwen2-7B-Instruct',
+                        'template_type': 'qwen',
                     },
-                    "embeddings": {
-                        "model_name_or_path": "AI-ModelScope/m3e-base",
+                    'embeddings': {
+                        'model_name_or_path': 'AI-ModelScope/m3e-base',
                     },
-                    "metrics": [
-                        "Faithfulness",
-                        "AnswerRelevancy",
-                        "ContextPrecision",
-                        "AnswerCorrectness",
+                    'metrics': [
+                        'Faithfulness',
+                        'AnswerRelevancy',
+                        'ContextPrecision',
+                        'AnswerCorrectness',
                     ],
                 },
             },
         }
 
-        logger.info(f">> Start to run task: {task_cfg}")
+        logger.info(f'>> Start to run task: {task_cfg}')
 
         run_task(task_cfg)
 
-    @unittest.skipUnless(0 in test_level_list(), "skip test in current test level")
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_rag_eval_api(self):
         task_cfg = {
-            "eval_backend": "RAGEval",
-            "eval_config": {
-                "tool": "RAGAS",
-                "eval": {
-                    "testset_file": "outputs/testset.json",
-                    "critic_llm": {
-                        "model_name": "gpt-4o-mini",  # 自定义聊天模型名称
-                        "api_base": "http://127.0.0.1:8088/v1",  # 自定义基础URL
-                        "api_key": "xxxx",  # 你的API密钥
+            'eval_backend': 'RAGEval',
+            'eval_config': {
+                'tool': 'RAGAS',
+                'eval': {
+                    'testset_file': 'outputs/testset.json',
+                    'critic_llm': {
+                        'model_name': 'gpt-4o-mini',  # 自定义聊天模型名称
+                        'api_base': 'http://127.0.0.1:8088/v1',  # 自定义基础URL
+                        'api_key': 'xxxx',  # 你的API密钥
                     },
-                    "embeddings": {
-                        "model_name_or_path": "AI-ModelScope/m3e-base",
+                    'embeddings': {
+                        'model_name_or_path': 'AI-ModelScope/m3e-base',
                     },
-                    "metrics": [
-                        "Faithfulness",
-                        "AnswerRelevancy",
-                        "ContextPrecision",
-                        "AnswerCorrectness",
-                        "MultiModalFaithfulness",
-                        "MultiModalRelevance",
+                    'metrics': [
+                        'Faithfulness',
+                        'AnswerRelevancy',
+                        'ContextPrecision',
+                        'AnswerCorrectness',
+                        'MultiModalFaithfulness',
+                        'MultiModalRelevance',
                     ],
                 },
             },
         }
 
-        logger.info(f">> Start to run task: {task_cfg}")
+        logger.info(f'>> Start to run task: {task_cfg}')
 
         run_task(task_cfg)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     unittest.main(buffer=False)

From 96fd22f22b9bacfb9fae2ccb13f2091040318503 Mon Sep 17 00:00:00 2001
From: Yunlin Mao <mao.looper@qq.com>
Date: Thu, 21 Nov 2024 10:30:30 +0800
Subject: [PATCH 5/6] add timeout for downloading punkt.zip (#206)

---
 evalscope/metrics/bundled_rouge_score/rouge_scorer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evalscope/metrics/bundled_rouge_score/rouge_scorer.py b/evalscope/metrics/bundled_rouge_score/rouge_scorer.py
index ffb64b9..3863cd0 100644
--- a/evalscope/metrics/bundled_rouge_score/rouge_scorer.py
+++ b/evalscope/metrics/bundled_rouge_score/rouge_scorer.py
@@ -51,7 +51,7 @@
     punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'
 
     if not os.path.exists(punkt_path):
-        os.system(f'wget -P {nltk_dir} {punkt_tab_url}')
+        os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
         os.system(f'unzip {punkt_path} -d {nltk_dir}')
     else:
         logger.info(f'{punkt_path} already exists, skipping download')

From b2abfe7a00a3c1e8def86bc2666cb16260cc6938 Mon Sep 17 00:00:00 2001
From: Yunlin Mao <mao.looper@qq.com>
Date: Fri, 22 Nov 2024 13:55:11 +0800
Subject: [PATCH 6/6] make compatible with ragas v0.2.5 and update readme (#205)

* modify ragas

* update readme

* make compatible with ragas v0.2.5
---
 .pre-commit-config.yaml                       |  26 +--
 README.md                                     |  11 +-
 README_zh.md                                  |  12 +-
 .../common_theme_prompt_chinese.json          |  30 ---
 .../critic_query_prompt_chinese.json          |  35 ---
 .../generate_reference_prompt_chinese.json    |  17 --
 .../generate_user_input_prompt_chinese.json   |  17 --
 .../query_modification_prompt_chinese.json    |   7 -
 .../correctness_prompt_chinese.json           |  50 ++--
 .../long_form_answer_prompt_chinese.json      |  18 +-
 .../question_generation_chinese.json          |  12 +-
 .../common_concepts_prompt_chinese.json       |  47 ----
 .../critic_query_prompt_chinese.json          |  35 ---
 .../generate_query_prompt_chinese.json        |  28 ---
 .../generate_reference_prompt_chinese.json    |  17 --
 .../query_modification_prompt_chinese.json    |   7 -
 .../context_precision_prompt_chinese.json     |  22 +-
 .../nli_statements_message_chinese.json       |  30 +--
 .../statement_prompt_chinese.json             |  14 +-
 .../HeadlinesExtractor/prompt_chinese.json    |  27 +--
 .../KeyphrasesExtractor/prompt_chinese.json   |  22 --
 .../concept_combination_prompt_chinese.json   |  35 +++
 ...nerate_query_reference_prompt_chinese.json |   7 +
 ...theme_persona_matching_prompt_chinese.json |  39 ++++
 ...nerate_query_reference_prompt_chinese.json |   7 +
 ...theme_persona_matching_prompt_chinese.json |  39 ++++
 .../faithfulness_prompt_chinese.json          |   8 +-
 .../relevance_prompt_chinese.json             |  10 +-
 .../chinese/NERExtractor/prompt_chinese.json  |  25 ++
 ...nerate_query_reference_prompt_chinese.json |   7 +
 ...theme_persona_matching_prompt_chinese.json |  39 ++++
 .../critic_query_prompt_chinese.json          |  35 ---
 .../generate_query_prompt_chinese.json        |  18 --
 .../generate_reference_prompt_chinese.json    |  17 --
 .../query_modification_prompt_chinese.json    |   7 -
 .../SummaryExtractor/prompt_chinese.json      |  12 +-
 .../ThemesExtractor/prompt_chinese.json       |  24 ++
 .../TitleExtractor/prompt_chinese.json        |  16 --
 .../ragas/prompts/multi_modal_prompt.py       | 207 ----------------
 .../rag_eval/ragas/prompts/persona_prompt.py  |  18 ++
 .../ragas/tasks/testset_generation.py         | 220 ++++++++++--------
 evalscope/backend/rag_eval/utils/__init__.py  |   0
 requirements/rag.txt                          |   2 +-
 setup.cfg                                     |   3 +-
 44 files changed, 489 insertions(+), 790 deletions(-)
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/common_theme_prompt_chinese.json
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/critic_query_prompt_chinese.json
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/generate_reference_prompt_chinese.json
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/generate_user_input_prompt_chinese.json
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/query_modification_prompt_chinese.json
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/common_concepts_prompt_chinese.json
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/critic_query_prompt_chinese.json
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/generate_query_prompt_chinese.json
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/generate_reference_prompt_chinese.json
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/query_modification_prompt_chinese.json
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/KeyphrasesExtractor/prompt_chinese.json
 create mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json
 create mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json
 create mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json
 create mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json
 create mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json
 create mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json
 create mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json
 create mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/SpecificQuerySynthesizer/critic_query_prompt_chinese.json
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/SpecificQuerySynthesizer/generate_query_prompt_chinese.json
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/SpecificQuerySynthesizer/generate_reference_prompt_chinese.json
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/SpecificQuerySynthesizer/query_modification_prompt_chinese.json
 create mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/chinese/TitleExtractor/prompt_chinese.json
 delete mode 100644 evalscope/backend/rag_eval/ragas/prompts/multi_modal_prompt.py
 create mode 100644 evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py
 create mode 100644 evalscope/backend/rag_eval/utils/__init__.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6a98a33..5bf22aa 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,19 +8,19 @@ repos:
                 thirdparty/|
                 examples/
             )$
-#  - repo: https://github.com/PyCQA/isort.git
-#    rev: 4.3.21
-#    hooks:
-#      - id: isort
-#  - repo: https://github.com/pre-commit/mirrors-yapf.git
-#    rev: v0.30.0
-#    hooks:
-#      - id: yapf
-#        exclude: |
-#            (?x)^(
-#                thirdparty/|
-#                examples/
-#            )$
+  - repo: https://github.com/PyCQA/isort.git
+    rev: 4.3.21
+    hooks:
+      - id: isort
+  - repo: https://github.com/pre-commit/mirrors-yapf.git
+    rev: v0.30.0
+    hooks:
+      - id: yapf
+        exclude: |
+            (?x)^(
+                thirdparty/|
+                examples/
+            )$
   - repo: https://github.com/pre-commit/pre-commit-hooks.git
     rev: v3.1.0
     hooks:
diff --git a/README.md b/README.md
index 495cff0..a3811ef 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,7 @@
  <a href="https://evalscope.readthedocs.io/en/latest/">📖 Documents</a>
 <p>
 
+> ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!
 
 ## 📋 Table of Contents
 - [Introduction](#introduction)
@@ -42,7 +43,7 @@ EvalScope is the official model evaluation and performance benchmarking framewor
 The architecture includes the following modules:
 1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
 2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
-3. **Evaluation Backend**: 
+3. **Evaluation Backend**:
     - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
     - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
     - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
@@ -129,7 +130,7 @@ You can execute this command from any directory:
 python -m evalscope.run \
  --model qwen/Qwen2-0.5B-Instruct \
  --template-type qwen \
- --datasets arc 
+ --datasets arc
 ```
 
 #### Install from source
@@ -236,13 +237,13 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)
 
 ## Offline Evaluation
-You can use local dataset to evaluate the model without internet connection. 
+You can use local dataset to evaluate the model without internet connection.
 
 Refer to: Offline Evaluation [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/offline_evaluation.html)
 
 
 ## Arena Mode
-The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report. 
+The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
 
 Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
 
@@ -270,4 +271,4 @@ Refer to : Model Serving Performance Evaluation [📖 User Guide](https://evalsc
 
 ## Star History
 
-[![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
\ No newline at end of file
+[![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
diff --git a/README_zh.md b/README_zh.md
index fb8186d..fba7064 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -18,6 +18,8 @@
 <p>
 
 
+> ⭐ 如果你喜欢这个项目，请点击右上角的 "Star" 按钮支持我们。你的支持是我们前进的动力！
+
 ## 📋 目录
 - [简介](#简介)
 - [新闻](#新闻)
@@ -46,7 +48,7 @@ EvalScope包括以下模块：
 
 2. **Data Adapter**: 数据适配器，负责转换和处理输入数据，以便适应不同的评估需求和格式。
 
-3. **Evaluation Backend**: 
+3. **Evaluation Backend**:
     - **Native**：EvalScope自身的**默认评测框架**，支持多种评估模式，包括单模型评估、竞技场模式、Baseline模型对比模式等。
     - **OpenCompass**：支持[OpenCompass](https://github.com/open-compass/opencompass)作为评测后端，对其进行了高级封装和任务简化，您可以更轻松地提交任务进行评估。
     - **VLMEvalKit**：支持[VLMEvalKit](https://github.com/open-compass/VLMEvalKit)作为评测后端，轻松发起多模态评测任务，支持多种多模态模型和数据集。
@@ -138,7 +140,7 @@ pip install -e '.[all]'           # 安装所有 backends (Native, OpenCompass,
 python -m evalscope.run \
  --model qwen/Qwen2-0.5B-Instruct \
  --template-type qwen \
- --datasets arc 
+ --datasets arc
 ```
 
 #### 使用源码安装
@@ -176,7 +178,7 @@ python evalscope/run.py \
 
 **示例2：**
 ```shell
-python evalscope/run.py \ 
+python evalscope/run.py \
  --model qwen/Qwen2-0.5B-Instruct \
  --template-type qwen \
  --generation-config do_sample=false,temperature=0.0 \
@@ -219,7 +221,7 @@ your_task_cfg = {
         'dataset_args': {},
         'dry_run': False,
         'model': 'qwen/Qwen2-0.5B-Instruct',
-        'template_type': 'qwen', 
+        'template_type': 'qwen',
         'datasets': ['arc', 'hellaswag'],
         'work_dir': DEFAULT_ROOT_CACHE_DIR,
         'outputs': DEFAULT_ROOT_CACHE_DIR,
@@ -280,4 +282,4 @@ EvalScope支持自定义数据集评测，具体请参考：自定义数据集
 
 ## Star History
 
-[![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
\ No newline at end of file
+[![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/common_theme_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/common_theme_prompt_chinese.json
deleted file mode 100644
index 4943ea9..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/common_theme_prompt_chinese.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": -6400990569250161398,
-  "language": "chinese",
-  "instruction": "分析以下摘要并确定给定数量的共同主题。这些主题应简洁、描述性强，并突出摘要中共享的关键方面。",
-  "examples": [
-    {
-      "input": {
-        "summaries": [
-          "人工智能的进步已经革新了许多行业。从医疗保健到金融，人工智能算法使流程更加高效和准确。机器学习模型被用于预测疾病、优化投资策略，甚至向用户推荐个性化内容。人工智能融入日常运营对于现代企业来说正变得越来越不可或缺。",
-          "医疗保健行业由于人工智能的进步正在经历重大变革。人工智能驱动的诊断工具提高了医疗诊断的准确性，减少了人为错误，并实现了疾病的早期检测。此外，人工智能简化了行政任务，使医疗专业人员能够更加专注于患者护理。由人工智能分析驱动的个性化治疗计划正在改善患者结果。",
-          "金融科技，或称fintech，已经看到了人工智能应用的激增。欺诈检测、风险管理、自动化交易的算法是该领域的一些关键创新。人工智能驱动的分析帮助公司更好地理解市场趋势并做出明智的决策。人工智能在金融科技中的应用不仅提高了安全性，还提高了效率和盈利能力。"
-        ],
-        "num_themes": 2
-      },
-      "output": {
-        "themes": [
-          {
-            "theme": "人工智能提高了各行业的效率和准确性。",
-            "description": "人工智能算法通过提高效率和准确性，改善了医疗保健、金融等领域的流程。"
-          },
-          {
-            "theme": "人工智能工具改善了决策和结果。",
-            "description": "人工智能在诊断工具、个性化治疗计划和金融科技分析中的应用正在改善决策和结果。"
-          }
-        ]
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/critic_query_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/critic_query_prompt_chinese.json
deleted file mode 100644
index 4c79e87..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/critic_query_prompt_chinese.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": 6368494196383210761,
-  "language": "chinese",
-  "instruction": "根据以下评分标准批评合成生成的问题。为每个标准提供一个分数：独立性和明确意图。分数为低（0），中（1），或高（2）。",
-  "examples": [
-    {
-      "input": {
-        "text": "人工智能如何提高各个行业的效率和准确性？"
-      },
-      "output": {
-        "independence": 2,
-        "clear_intent": 2
-      }
-    },
-    {
-      "input": {
-        "text": "解释人工智能的好处。"
-      },
-      "output": {
-        "independence": 1,
-        "clear_intent": 1
-      }
-    },
-    {
-      "input": {
-        "text": "人工智能如何？"
-      },
-      "output": {
-        "independence": 0,
-        "clear_intent": 0
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/generate_reference_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/generate_reference_prompt_chinese.json
deleted file mode 100644
index 077c87b..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/generate_reference_prompt_chinese.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": -800989296329424606,
-  "language": "chinese",
-  "instruction": "根据给定文本中提供的信息回答以下问题。",
-  "examples": [
-    {
-      "input": {
-        "query": "人工智能如何提高不同行业的效率和准确性？",
-        "context": "人工智能的进步已经革新了许多行业。从医疗保健到金融，人工智能算法使流程更加高效和准确。机器学习模型被用于预测疾病、优化投资策略，甚至向用户推荐个性化内容。人工智能融入日常运营对于现代企业变得越来越不可或缺。"
-      },
-      "output": {
-        "text": "人工智能通过使流程更加高效和准确，从而提高不同行业的效率和准确性。"
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/generate_user_input_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/generate_user_input_prompt_chinese.json
deleted file mode 100644
index 3044191..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/generate_user_input_prompt_chinese.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": 8903202105324427480,
-  "language": "chinese",
-  "instruction": "根据给定的主题生成一个可以从提供的上下文中回答的抽象概念问题。",
-  "examples": [
-    {
-      "input": {
-        "theme": "AI 提高了各个行业的效率和准确性。",
-        "context": "AI 通过提高效率和准确性来改变各个行业。例如，在制造业中，AI 驱动的机器人以高精度自动化重复任务，减少错误并提高生产率。在医疗保健领域，AI 算法分析医学图像和患者数据，提供准确的诊断和个性化的治疗计划。金融服务利用 AI 进行欺诈检测和风险管理，确保更快、更可靠的决策。总体而言，AI 处理大量数据并从中学习的能力使行业能够优化运营、降低成本并提供更好的产品和服务。"
-      },
-      "output": {
-        "text": "AI 是如何提高各个行业的效率和准确性的？"
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/query_modification_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/query_modification_prompt_chinese.json
deleted file mode 100644
index 6e419c4..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/AbstractQuerySynthesizer/query_modification_prompt_chinese.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": 8176750950393070076,
-  "language": "chinese",
-  "instruction": "修改给定的问题以适应给定的风格和长度。",
-  "examples": []
-}
\ No newline at end of file
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json
index bf65d08..072fa7d 100644
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json
@@ -1,57 +1,57 @@
 {
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": -2250499596349978971,
+  "ragas_version": "0.2.5",
+  "original_hash": 963876325390538086,
   "language": "chinese",
-  "instruction": "给定一个真实情况和一个答案陈述，分析每个陈述并将其分类为以下类别之一：TP（真阳性）：答案中存在且由一个或多个真实情况中的陈述直接支持的陈述，FP（假阳性）：答案中存在但未被真实情况中的任何陈述直接支持的陈述，FN（假阴性）：真实情况中存在但答案中未出现的陈述。每个陈述只能属于一个类别。为每个分类提供理由。",
+  "instruction": "给定一个真实情况和一个答案陈述，分析每个陈述并将其分类为以下类别之一：TP（真正）：答案中存在的陈述也直接由一个或多个真实情况中的陈述支持，FP（假正）：答案中存在的陈述但没有被任何真实情况中的陈述直接支持，FN（假负）：在真实情况中发现但在答案中不存在的陈述。每个陈述只能属于一个类别。为每个分类提供理由。",
   "examples": [
     {
       "input": {
-        "question": "太阳的能量来源是什么，它的主要功能是什么？",
+        "question": "是什么为太阳提供能量，它的主要功能是什么？",
         "answer": [
           "太阳的能量来源于核裂变，类似于地球上的核反应堆。",
           "太阳的主要功能是为太阳系提供光。"
         ],
         "ground_truth": [
-          "太阳的能量来源于核聚变，氢原子聚变成氦。",
-          "太阳核心的聚变过程释放出巨大的能量。",
-          "太阳的能量提供了热量和光，这对地球上的生命至关重要。",
+          "太阳的能量来源于核聚变，其中氢原子融合形成氦。",
+          "太阳核心的这种聚变过程释放出巨大的能量。",
+          "来自太阳的能量提供热量和光，这对地球上的生命至关重要。",
           "太阳的光在地球的气候系统中起着关键作用。",
-          "阳光有助于驱动天气和洋流。"
+          "阳光有助于驱动天气和海洋洋流。"
         ]
       },
       "output": {
         "TP": [
           {
             "statement": "太阳的主要功能是为太阳系提供光。",
-            "reason": "这一陈述在某种程度上得到了事实的支持，提到太阳提供光和其作用，但更广泛地关注太阳的能量。"
+            "reason": "这一说法在某种程度上得到了真实情况的支持，提到太阳提供光和它的作用，尽管它更广泛地关注太阳的能量。"
           }
         ],
         "FP": [
           {
             "statement": "太阳的能量来源于核裂变，类似于地球上的核反应堆。",
-            "reason": "这一陈述是不正确的，与事实相矛盾，事实指出太阳的能量来源于核聚变。"
+            "reason": "这一说法是不正确的，与真实情况相矛盾，真实情况指出太阳的能量来源于核聚变。"
           }
         ],
         "FN": [
           {
-            "statement": "太阳的能量来源于核聚变，氢原子聚变成氦。",
-            "reason": "这一对太阳能量来源的准确描述没有包含在答案中。"
+            "statement": "太阳的能量来源于核聚变，其中氢原子融合形成氦。",
+            "reason": "这种对太阳能量来源的准确描述没有包含在答案中。"
           },
           {
-            "statement": "太阳核心的聚变过程释放出巨大的能量。",
-            "reason": "这一过程及其重要性没有在答案中提及。"
+            "statement": "太阳核心的这种聚变过程释放出巨大的能量。",
+            "reason": "这个过程及其重要性没有在答案中提到。"
           },
           {
-            "statement": "太阳的能量提供了热量和光，这对地球上的生命至关重要。",
-            "reason": "答案只提到了光，忽略了热量及其对生命的重要性，而事实涵盖了这一点。"
+            "statement": "来自太阳的能量提供热量和光，这对地球上的生命至关重要。",
+            "reason": "答案中只提到了光，忽略了热量及其对生命的必要性，这些在真实情况中都有涵盖。"
           },
           {
             "statement": "太阳的光在地球的气候系统中起着关键作用。",
-            "reason": "太阳光对地球气候系统的更广泛影响在答案中没有涉及。"
+            "reason": "太阳光对地球气候系统的这种更广泛的影响没有在答案中提到。"
           },
           {
-            "statement": "阳光有助于驱动天气和洋流。",
-            "reason": "阳光对天气模式和洋流的影响在答案中被忽略了。"
+            "statement": "阳光有助于驱动天气和海洋洋流。",
+            "reason": "答案中省略了阳光对天气模式和海洋洋流的影响。"
           }
         ]
       }
@@ -60,28 +60,28 @@
       "input": {
         "question": "水的沸点是多少？",
         "answer": [
-          "水的沸点在海平面是100摄氏度。"
+          "水的沸点在海平面上是100摄氏度。"
         ],
         "ground_truth": [
-          "水的沸点在海平面是100摄氏度（212华氏度）。",
+          "水的沸点在海平面上是100摄氏度（212华氏度）。",
           "水的沸点会随着海拔的变化而变化。"
         ]
       },
       "output": {
         "TP": [
           {
-            "statement": "水的沸点在海平面是100摄氏度。",
-            "reason": "这一陈述直接得到了事实的支持，事实指出了水的沸点在海平面是100摄氏度。"
+            "statement": "水的沸点在海平面上是100摄氏度。",
+            "reason": "这一说法得到了真实情况的直接支持，真实情况明确指出水的沸点在海平面上是100摄氏度。"
           }
         ],
         "FP": [],
         "FN": [
           {
             "statement": "水的沸点会随着海拔的变化而变化。",
-            "reason": "这一关于水的沸点会随海拔变化的附加信息在答案中没有提及。"
+            "reason": "关于水的沸点如何随海拔变化的额外信息没有在答案中提到。"
           }
         ]
       }
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json
index a1e46d7..b84702e 100644
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json
@@ -1,16 +1,16 @@
 {
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": -3898713101712703116,
+  "ragas_version": "0.2.5",
+  "original_hash": 8370494081602031492,
   "language": "chinese",
-  "instruction": "给定一个问题，一个答案，以及答案中的句子，分析每个句子的复杂性。将每个句子分解为一个或多个完全可理解的陈述，同时确保每个陈述中不使用代词。输出格式为JSON。",
+  "instruction": "给定一个问题、一个答案和答案中的句子，分析在“sentences”下给出的每个句子的复杂性，并将每个句子分解为一个或多个完全可理解的陈述，同时确保每个陈述中不使用代词。将输出格式化为JSON。",
   "examples": [
     {
       "input": {
-        "question": "阿尔伯特·爱因斯坦是谁，他最出名的是什么？",
-        "answer": "他是一位出生于德国的理论物理学家，被广泛认为是有史以来最伟大和最有影响力的物理学家之一。他最出名的是发展了相对论，同时对量子力学理论的发展也做出了重要贡献。",
+        "question": "阿尔伯特·爱因斯坦是谁，他以什么而闻名？",
+        "answer": "他是一位出生于德国的理论物理学家，被广泛认为是有史以来最伟大和最有影响力的物理学家之一。他最著名的是发展了相对论，他还对量子力学理论的发展做出了重要贡献。",
         "sentences": {
           "0": "他是一位出生于德国的理论物理学家，被广泛认为是有史以来最伟大和最有影响力的物理学家之一。",
-          "1": "他最出名的是发展了相对论，同时对量子力学理论的发展也做出了重要贡献。"
+          "1": "他最著名的是发展了相对论，他还对量子力学理论的发展做出了重要贡献。"
         }
       },
       "output": {
@@ -19,13 +19,13 @@
             "sentence_index": 0,
             "simpler_statements": [
               "阿尔伯特·爱因斯坦是一位出生于德国的理论物理学家。",
-              "阿尔伯特·爱因斯坦被认为是历史上最伟大和最有影响力的物理学家之一。"
+              "阿尔伯特·爱因斯坦被认为是有史以来最伟大和最有影响力的物理学家之一。"
             ]
           },
           {
             "sentence_index": 1,
             "simpler_statements": [
-              "阿尔伯特·爱因斯坦最出名的是发展了相对论。",
+              "阿尔伯特·爱因斯坦最著名的是发展了相对论。",
               "阿尔伯特·爱因斯坦还对量子力学理论的发展做出了重要贡献。"
             ]
           }
@@ -33,4 +33,4 @@
       }
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json
index 198fe0a..bec27be 100644
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json
@@ -1,12 +1,12 @@
 {
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": -8204697915071564805,
+  "ragas_version": "0.2.5",
+  "original_hash": -6199619726952258368,
   "language": "chinese",
-  "instruction": "为给定的答案生成一个问题，并判断答案是否含糊。如果答案含糊，则给出1；如果答案明确，则给出0。含糊的答案是指那些回避、模糊或模棱两可的回答。例如，“我不知道”或“我不确定”都是含糊的答案。",
+  "instruction": "为给定的答案生成一个问题，并识别答案是否是不明确的。如果答案是不明确的，则给出1；如果答案是明确的，则给出0。不明确的答案是指那些含糊其辞、模棱两可或不清楚的答案。例如，“我不知道”或“我不确定”是不明确的答案。",
   "examples": [
     {
       "input": {
-        "response": "阿尔伯特·爱因斯坦出生于德国。"
+        "response": "阿尔伯特·爱因斯坦出生在德国。"
       },
       "output": {
         "question": "阿尔伯特·爱因斯坦出生在哪里？",
@@ -15,7 +15,7 @@
     },
     {
       "input": {
-        "response": "我不知道2023年发明的智能手机的突破性功能，因为我没有2022年之后的信息。"
+        "response": "我不知道2023年发明的智能手机的突破性功能，因为我对2022年以后的信息不了解。"
       },
       "output": {
         "question": "2023年发明的智能手机的突破性功能是什么？",
@@ -23,4 +23,4 @@
       }
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/common_concepts_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/common_concepts_prompt_chinese.json
deleted file mode 100644
index 84f4c53..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/common_concepts_prompt_chinese.json
+++ /dev/null
@@ -1,47 +0,0 @@
-{
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": 523894698462997543,
-  "language": "chinese",
-  "instruction": "从给定的关键短语列表中识别出用于跨报告比较给定主题的一系列常见概念。",
-  "examples": [
-    {
-      "input": {
-        "keyphrases": [
-          "快速充电",
-          "长电池寿命",
-          "OLED显示屏",
-          "防水"
-        ],
-        "num_concepts": 4
-      },
-      "output": {
-        "concepts": {
-          "Charging": [
-            "快速充电",
-            "长电池寿命",
-            "OLED显示屏",
-            "防水"
-          ],
-          "Battery Life": [
-            "长电池寿命",
-            "扩展电池",
-            "耐用电池",
-            "持久电池"
-          ],
-          "Display": [
-            "OLED显示屏",
-            "高清显示屏",
-            "AMOLED显示屏",
-            "视网膜显示屏"
-          ],
-          "Water/Dust Resistance": [
-            "防水",
-            "防尘",
-            "防溅",
-            "防水"
-          ]
-        }
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/critic_query_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/critic_query_prompt_chinese.json
deleted file mode 100644
index 3924002..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/critic_query_prompt_chinese.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": 6368494196383210761,
-  "language": "chinese",
-  "instruction": "根据以下评分标准批评合成生成的问题。为每个标准提供一个分数：独立性和明确意图。分数为低（0），中（1），或高（2）。",
-  "examples": [
-    {
-      "input": {
-        "text": "AI如何提高各个行业的效率和准确性？"
-      },
-      "output": {
-        "independence": 2,
-        "clear_intent": 2
-      }
-    },
-    {
-      "input": {
-        "text": "解释AI的好处。"
-      },
-      "output": {
-        "independence": 1,
-        "clear_intent": 1
-      }
-    },
-    {
-      "input": {
-        "text": "AI如何？"
-      },
-      "output": {
-        "independence": 0,
-        "clear_intent": 0
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/generate_query_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/generate_query_prompt_chinese.json
deleted file mode 100644
index df7ead6..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/generate_query_prompt_chinese.json
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": -4554662415072061518,
-  "language": "chinese",
-  "instruction": "根据给定的概念、属于该概念的关键短语以及报告的摘要，生成一个抽象的比较问题。",
-  "examples": [
-    {
-      "input": {
-        "concept": "电池寿命",
-        "keyphrases": [
-          "长电池寿命",
-          "延长电池",
-          "耐用电池",
-          "延长的电池"
-        ],
-        "summaries": [
-          "该设备提供长电池寿命，单次充电可使用长达24小时。",
-          "具有延长电池的产品在重度使用下可运行20小时。",
-          "这款具有耐用电池的型号在正常条件下可确保22小时的运行时间。",
-          "电池寿命延长，使该设备单次充电可使用长达18小时。"
-        ]
-      },
-      "output": {
-        "text": "具有长电池寿命、延长电池、耐用电池和延长的电池的设备在不同报告中的电池寿命声明和性能指标如何比较？"
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/generate_reference_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/generate_reference_prompt_chinese.json
deleted file mode 100644
index 077c87b..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/generate_reference_prompt_chinese.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": -800989296329424606,
-  "language": "chinese",
-  "instruction": "根据给定文本中提供的信息回答以下问题。",
-  "examples": [
-    {
-      "input": {
-        "query": "人工智能如何提高不同行业的效率和准确性？",
-        "context": "人工智能的进步已经革新了许多行业。从医疗保健到金融，人工智能算法使流程更加高效和准确。机器学习模型被用于预测疾病、优化投资策略，甚至向用户推荐个性化内容。人工智能融入日常运营对于现代企业变得越来越不可或缺。"
-      },
-      "output": {
-        "text": "人工智能通过使流程更加高效和准确，从而提高不同行业的效率和准确性。"
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/query_modification_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/query_modification_prompt_chinese.json
deleted file mode 100644
index 6e419c4..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/ComparativeAbstractQuerySynthesizer/query_modification_prompt_chinese.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": 8176750950393070076,
-  "language": "chinese",
-  "instruction": "修改给定的问题以适应给定的风格和长度。",
-  "examples": []
-}
\ No newline at end of file
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json
index 0539d45..cd4c189 100644
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json
@@ -1,41 +1,41 @@
 {
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": 1601443977079251276,
+  "ragas_version": "0.2.5",
+  "original_hash": 6611742689846464445,
   "language": "chinese",
-  "instruction": "给定问题、答案和上下文，验证上下文是否有助于得出给定答案。如果有助于，则给出裁决为“1”，如果不有助于，则给出裁决为“0”，并以json格式输出。",
+  "instruction": "给定问题、答案和上下文，验证上下文在得出给定答案时是否有用。如果有用，给出判决为“1”，如果没有用，给出判决为“0”，并以json格式输出。",
   "examples": [
     {
       "input": {
         "question": "你能告诉我关于阿尔伯特·爱因斯坦的什么？",
-        "context": "阿尔伯特·爱因斯坦（1879年3月14日—1955年4月18日）是一位出生于德国的理论物理学家，被广泛认为是有史以来最伟大、最有影响力的科学家之一。他最著名的是发展了相对论，还对量子力学做出了重要贡献，因此成为20世纪初现代物理学在自然理解上的革命性重塑中的核心人物。他的质能等价公式E = mc²，源于相对论，被称为“世界上最著名的方程”。他因“对理论物理学的服务，特别是发现光电效应定律”而获得1921年诺贝尔物理学奖，这是量子理论发展中的一个关键步骤。他的工作还因其对科学哲学的影响而闻名。1999年，英国《物理世界》杂志对130位世界顶尖物理学家进行的一项调查显示，爱因斯坦被评为有史以来最伟大的物理学家。他的智力成就和独创性使爱因斯坦成为天才的代名词。",
-        "answer": "阿尔伯特·爱因斯坦，1879年3月14日出生，是一位出生于德国的理论物理学家，被广泛认为是有史以来最伟大、最有影响力的科学家之一。他因对理论物理学的服务而获得1921年诺贝尔物理学奖。"
+        "context": "阿尔伯特·爱因斯坦（1879年3月14日－1955年4月18日）是一位出生于德国的理论物理学家，被广泛认为是有史以来最伟大和最具影响力的科学家之一。他因发展相对论而闻名，同时也对量子力学做出了重要贡献，因此在20世纪前几十年现代物理学对自然科学理解的革命性重塑中起到了核心作用。他的质能等价公式E=mc²源于相对论，被称为“世界上最著名的方程”。他因“对理论物理学的贡献，特别是发现光电效应定律”而获得1921年诺贝尔物理学奖，这是量子理论发展的关键一步。他的工作也因其对科学哲学的影响而闻名。在1999年由英国《物理世界》杂志对全球130位顶尖物理学家的调查中，爱因斯坦被评为有史以来最伟大的物理学家。他的智力成就和原创性使爱因斯坦成为天才的代名词。",
+        "answer": "阿尔伯特·爱因斯坦，生于1879年3月14日，是一位出生于德国的理论物理学家，被广泛认为是有史以来最伟大和最具影响力的科学家之一。他因对理论物理学的贡献而获得1921年诺贝尔物理学奖。"
       },
       "output": {
-        "reason": "提供的上下文确实有助于得出给定的答案。上下文包括关于阿尔伯特·爱因斯坦的生活和贡献的关键信息，这些信息在答案中得到了反映。",
+        "reason": "提供的上下文确实有助于得出给定的答案。上下文包括关于阿尔伯特·爱因斯坦的生活和贡献的关键信息，这些信息在答案中得到了反映。",
         "verdict": 1
       }
     },
     {
       "input": {
         "question": "谁赢得了2020年ICC世界杯？",
-        "context": "2022年ICC男子T20世界杯于2022年10月16日至11月13日在澳大利亚举行，是该赛事的第八届。原定于2020年举行，但由于COVID-19大流行而推迟。英格兰队在决赛中以5个球的优势击败巴基斯坦队，赢得了他们的第二个ICC男子T20世界杯冠军。",
+        "context": "2022年ICC男子T20世界杯于2022年10月16日至11月13日在澳大利亚举行，是该赛事的第八届。原定于2020年举行，但因COVID-19大流行而推迟。英格兰在决赛中以五个三柱门的优势击败巴基斯坦，赢得了他们的第二个ICC男子T20世界杯冠军。",
         "answer": "英格兰"
       },
       "output": {
-        "reason": "提供的上下文有助于澄清2020年ICC世界杯的情况，并指出英格兰是原定于2020年举行但实际上在2022年举行的比赛中获胜的队伍。",
+        "reason": "上下文有助于澄清关于2020年ICC世界杯的情况，并指出英格兰是原定于2020年举行但实际上在2022年举行的比赛的获胜者。",
         "verdict": 1
       }
     },
     {
       "input": {
         "question": "世界上最高的山是什么？",
-        "context": "安第斯山脉是世界上最长的大陆山脉，位于南美洲。它横跨七个国境，拥有西半球最高的许多山峰。该山脉以其多样的生态系统而闻名，包括高海拔的安第斯高原和亚马逊雨林。",
+        "context": "安第斯山脉是世界上最长的大陆山脉，位于南美洲。它横跨七个国家，拥有西半球许多最高的山峰。该山脉以其多样的生态系统而闻名，包括高海拔的安第斯高原和亚马逊雨林。",
         "answer": "珠穆朗玛峰。"
       },
       "output": {
-        "reason": "提供的上下文讨论了安第斯山脉，虽然令人印象深刻，但不包括珠穆朗玛峰，也与关于世界最高峰的问题没有直接关系。",
+        "reason": "提供的上下文讨论了安第斯山脉，虽然令人印象深刻，但不包括珠穆朗玛峰，也与关于世界最高山的问题没有直接关系。",
         "verdict": 0
       }
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json
index 347d4df..3dcac31 100644
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json
@@ -1,16 +1,16 @@
 {
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": 7290724011641781781,
+  "ragas_version": "0.2.5",
+  "original_hash": -4418228766304805311,
   "language": "chinese",
-  "instruction": "你的任务是根据给定的上下文判断一系列陈述的忠实度。对于每个陈述，如果该陈述可以直接根据上下文推断，则返回裁决为1；如果该陈述不能直接根据上下文推断，则返回裁决为0。",
+  "instruction": "您的任务是根据给定的上下文判断一系列陈述的忠实度。对于每个陈述，如果可以根据上下文直接推断出该陈述，则必须返回判决为1；如果不能根据上下文直接推断出该陈述，则返回判决为0。",
   "examples": [
     {
       "input": {
-        "context": "约翰是XYZ大学的学生。他正在攻读计算机科学学位。本学期他注册了几门课程，包括数据结构、算法和数据库管理。约翰是一个勤奋的学生，花大量时间学习和完成作业。他经常在图书馆加班做项目。",
+        "context": "约翰是XYZ大学的学生。他正在攻读计算机科学学位。本学期他选修了几门课程，包括数据结构、算法和数据库管理。约翰是一个勤奋的学生，花费大量时间学习和完成作业。他经常在图书馆待到很晚以完成他的项目。",
         "statements": [
           "约翰主修生物学。",
-          "约翰正在上一门关于人工智能的课程。",
-          "约翰是一个敬业的学生。",
+          "约翰正在学习人工智能课程。",
+          "约翰是一个勤奋的学生。",
           "约翰有一份兼职工作。"
         ]
       },
@@ -18,22 +18,22 @@
         "statements": [
           {
             "statement": "约翰主修生物学。",
-            "reason": "约翰的专业被明确提到是计算机科学。没有信息表明他主修生物学。",
+            "reason": "约翰的专业明确提到是计算机科学。没有信息表明他主修生物学。",
             "verdict": 0
           },
           {
-            "statement": "约翰正在上一门关于人工智能的课程。",
-            "reason": "上下文提到了约翰当前注册的课程，人工智能没有被提及。因此，不能推断约翰正在上一门关于人工智能的课程。",
+            "statement": "约翰正在学习人工智能课程。",
+            "reason": "上下文提到约翰目前选修的课程，并未提到人工智能。因此，无法推断出约翰正在学习人工智能课程。",
             "verdict": 0
           },
           {
-            "statement": "约翰是一个敬业的学生。",
-            "reason": "上下文提到他花大量时间学习和完成作业。此外，还提到他经常在图书馆加班做项目，这表明他的敬业。",
+            "statement": "约翰是一个勤奋的学生。",
+            "reason": "上下文指出他花费大量时间学习和完成作业。此外，还提到他经常在图书馆待到很晚以完成他的项目，这表明了他的勤奋。",
             "verdict": 1
           },
           {
             "statement": "约翰有一份兼职工作。",
-            "reason": "上下文中没有提到约翰有一份兼职工作。",
+            "reason": "上下文中没有提供关于约翰有兼职工作的信息。",
             "verdict": 0
           }
         ]
@@ -43,13 +43,13 @@
       "input": {
         "context": "光合作用是植物、藻类和某些细菌用来将光能转化为化学能的过程。",
         "statements": [
-          "爱因斯坦是一个天才。"
+          "阿尔伯特·爱因斯坦是个天才。"
         ]
       },
       "output": {
         "statements": [
           {
-            "statement": "爱因斯坦是一个天才。",
+            "statement": "阿尔伯特·爱因斯坦是个天才。",
             "reason": "上下文和陈述无关",
             "verdict": 0
           }
@@ -57,4 +57,4 @@
       }
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json
index a1e46d7..345fe8d 100644
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json
@@ -1,16 +1,16 @@
 {
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": -3898713101712703116,
+  "ragas_version": "0.2.5",
+  "original_hash": 8370494081602031492,
   "language": "chinese",
-  "instruction": "给定一个问题，一个答案，以及答案中的句子，分析每个句子的复杂性。将每个句子分解为一个或多个完全可理解的陈述，同时确保每个陈述中不使用代词。输出格式为JSON。",
+  "instruction": "给定一个问题、一个答案和答案中的句子，分析在“sentences”下给出的每个句子的复杂性，并将每个句子分解为一个或多个完全可理解的陈述，同时确保每个陈述中不使用代词。将输出格式化为JSON。",
   "examples": [
     {
       "input": {
         "question": "阿尔伯特·爱因斯坦是谁，他最出名的是什么？",
-        "answer": "他是一位出生于德国的理论物理学家，被广泛认为是有史以来最伟大和最有影响力的物理学家之一。他最出名的是发展了相对论，同时对量子力学理论的发展也做出了重要贡献。",
+        "answer": "他是一位出生于德国的理论物理学家，被广泛认为是有史以来最伟大和最有影响力的物理学家之一。他最出名的是发展了相对论，他还对量子力学理论的发展做出了重要贡献。",
         "sentences": {
           "0": "他是一位出生于德国的理论物理学家，被广泛认为是有史以来最伟大和最有影响力的物理学家之一。",
-          "1": "他最出名的是发展了相对论，同时对量子力学理论的发展也做出了重要贡献。"
+          "1": "他最出名的是发展了相对论，他还对量子力学理论的发展做出了重要贡献。"
         }
       },
       "output": {
@@ -19,7 +19,7 @@
             "sentence_index": 0,
             "simpler_statements": [
               "阿尔伯特·爱因斯坦是一位出生于德国的理论物理学家。",
-              "阿尔伯特·爱因斯坦被认为是历史上最伟大和最有影响力的物理学家之一。"
+              "阿尔伯特·爱因斯坦被认为是有史以来最伟大和最有影响力的物理学家之一。"
             ]
           },
           {
@@ -33,4 +33,4 @@
       }
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json
index 0cbe908..05cfc3d 100644
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json
@@ -1,25 +1,22 @@
 {
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": -2516793524213563754,
+  "ragas_version": "0.2.5",
+  "original_hash": 5488893487931336269,
   "language": "chinese",
-  "instruction": "从给定的文本中提取标题。",
+  "instruction": "从给定文本中仅提取第2级和第3级标题。",
   "examples": [
     {
       "input": {
-        "text": "某个标题\n1. 引言和相关工作\n\n1.1 条件计算\n利用训练数据和模型规模的规模效应一直是深度学习成功的核心...\n1.2 我们的方法：稀疏门控专家混合层\n我们对条件计算的方法是引入一种新的通用神经网络组件...\n1.3 专家混合的相关工作\n自二十多年前引入以来（Jacobs 等，1991；Jordan & Jacobs，1994），专家混合方法..\n\n2. 稀疏门控专家混合层\n2.1 架构\n稀疏门控专家混合层是一种前馈神经网络层，由多个专家网络和一个门控网络组成...\n"
+        "text": "                介绍\n                主题概述...\n\n                主要概念\n                核心思想的解释...\n\n                详细分析\n                分析的技术和方法...\n\n                小节：专业技术\n                专业技术的进一步细节...\n\n                未来方向\n                对即将到来的趋势的见解...\n\n                小节：研究的下一步\n                新研究领域的讨论...\n\n                结论\n                最后的评论和总结。\n                "
       },
       "output": {
-        "headlines": {
-          "1. Introduction and Related Work": [
-            "1.1 条件计算",
-            "1.2 我们的方法：稀疏门控专家混合层",
-            "1.3 专家混合的相关工作"
-          ],
-          "2. The Sparsely-Gated Mixture-of-Experts Layer": [
-            "2.1 架构"
-          ]
-        }
+        "headlines": [
+          "主要概念",
+          "详细分析",
+          "小节：专业技术",
+          "未来方向",
+          "小节：研究的下一步"
+        ]
       }
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/KeyphrasesExtractor/prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/KeyphrasesExtractor/prompt_chinese.json
deleted file mode 100644
index 6581764..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/KeyphrasesExtractor/prompt_chinese.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": 5758001203665521499,
-  "language": "chinese",
-  "instruction": "从给定的文本中提取前5个关键词。",
-  "examples": [
-    {
-      "input": {
-        "text": "人工智能\n\n人工智能通过自动化以前需要人类智能的任务，正在改变各个行业。从医疗保健到金融，人工智能被用于快速准确地分析大量数据。这项技术还在自动驾驶汽车和个性化推荐等领域推动创新。"
-      },
-      "output": {
-        "keyphrases": [
-          "人工智能",
-          "自动化任务",
-          "医疗保健",
-          "自动驾驶汽车",
-          "个性化推荐"
-        ]
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json
new file mode 100644
index 0000000..7e16f32
--- /dev/null
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json
@@ -0,0 +1,35 @@
+{
+  "ragas_version": "0.2.5",
+  "original_hash": -5063505674847369221,
+  "language": "chinese",
+  "instruction": "通过将来自至少两个不同列表的概念配对来形成组合。\n**说明：**\n- 查看每个节点的概念。\n- 确定可以逻辑连接或对比的概念。\n- 形成涉及来自不同节点的概念的组合。\n- 每个组合应至少包括来自两个或多个节点的一个概念。\n- 清晰简洁地列出组合。\n- 不要重复相同的组合。",
+  "examples": [
+    {
+      "input": {
+        "lists_of_concepts": [
+          [
+            "人工智能",
+            "自动化"
+          ],
+          [
+            "医疗保健",
+            "数据隐私"
+          ]
+        ],
+        "max_combinations": 2
+      },
+      "output": {
+        "combinations": [
+          [
+            "人工智能",
+            "医疗保健"
+          ],
+          [
+            "自动化",
+            "数据隐私"
+          ]
+        ]
+      }
+    }
+  ]
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json
new file mode 100644
index 0000000..0df823e
--- /dev/null
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json
@@ -0,0 +1,7 @@
+{
+  "ragas_version": "0.2.5",
+  "original_hash": -1698100170803872933,
+  "language": "chinese",
+  "instruction": "根据指定的条件（角色、主题、风格、长度）和提供的上下文生成查询和答案。确保答案完全忠实于上下文，仅使用直接来自提供节点的信息。### 指令：\n1. **生成查询**：根据上下文、角色、主题、风格和长度，创建一个符合角色视角并反映主题的问题。\n2. **生成答案**：仅使用提供的上下文内容，创建一个忠实且详细的答案。不要包含任何不在或无法从给定上下文中推断的信息。\n### 示例输出：\n\n",
+  "examples": []
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json
new file mode 100644
index 0000000..45a035d
--- /dev/null
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json
@@ -0,0 +1,39 @@
+{
+  "ragas_version": "0.2.5",
+  "original_hash": 2334929353739018813,
+  "language": "chinese",
+  "instruction": "给定一个主题和角色列表，根据角色描述将每个角色与相关主题关联起来。",
+  "examples": [
+    {
+      "input": {
+        "themes": [
+          "同理心",
+          "包容性",
+          "远程工作"
+        ],
+        "personas": [
+          {
+            "name": "人力资源经理",
+            "role_description": "专注于包容性和员工支持。"
+          },
+          {
+            "name": "远程团队负责人",
+            "role_description": "管理远程团队沟通。"
+          }
+        ]
+      },
+      "output": {
+        "mapping": {
+          "人力资源经理": [
+            "包容性",
+            "同理心"
+          ],
+          "远程团队负责人": [
+            "远程工作",
+            "同理心"
+          ]
+        }
+      }
+    }
+  ]
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json
new file mode 100644
index 0000000..7ed5a4d
--- /dev/null
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json
@@ -0,0 +1,7 @@
+{
+  "ragas_version": "0.2.5",
+  "original_hash": -1698100170803872933,
+  "language": "chinese",
+  "instruction": "根据指定的条件（角色、主题、风格、长度）和提供的上下文生成查询和答案。确保答案完全忠实于上下文，仅使用直接来自提供节点的信息。### 指令：\n1. **生成查询**：根据上下文、角色、主题、风格和长度，创建一个与角色视角一致并反映主题的问题。\n2. **生成答案**：仅使用提供的上下文内容，创建一个忠实且详细的答案。不要包含任何不在或无法从给定上下文中推断的信息。\n### 示例输出：\n\n",
+  "examples": []
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json
new file mode 100644
index 0000000..45a035d
--- /dev/null
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json
@@ -0,0 +1,39 @@
+{
+  "ragas_version": "0.2.5",
+  "original_hash": 2334929353739018813,
+  "language": "chinese",
+  "instruction": "给定一个主题和角色列表，根据角色描述将每个角色与相关主题关联起来。",
+  "examples": [
+    {
+      "input": {
+        "themes": [
+          "同理心",
+          "包容性",
+          "远程工作"
+        ],
+        "personas": [
+          {
+            "name": "人力资源经理",
+            "role_description": "专注于包容性和员工支持。"
+          },
+          {
+            "name": "远程团队负责人",
+            "role_description": "管理远程团队沟通。"
+          }
+        ]
+      },
+      "output": {
+        "mapping": {
+          "人力资源经理": [
+            "包容性",
+            "同理心"
+          ],
+          "远程团队负责人": [
+            "远程工作",
+            "同理心"
+          ]
+        }
+      }
+    }
+  ]
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json
index 942a1e7..92e1713 100644
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json
@@ -1,8 +1,8 @@
 {
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": 9029354505515513159,
+  "ragas_version": "0.2.5",
+  "original_hash": -2189588237940965149,
   "language": "chinese",
-  "instruction": "请判断给定的信息是否得到视觉和文本上下文信息的支持。您需要回答“是”或“否”。如果任何图像和文本上下文支持该信息，请回答“是”。",
+  "instruction": "请说明给定的信息是否得到视觉和文本上下文信息的支持。您需要回答“是”或“否”。如果任何图像和文本上下文支持该信息，请回答“是”。",
   "examples": [
     {
       "input": {
@@ -31,4 +31,4 @@
       }
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json
index 3bd989d..0233642 100644
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json
@@ -1,6 +1,6 @@
 {
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": -4024749674493077311,
+  "ragas_version": "0.2.5",
+  "original_hash": -7302860412443151372,
   "language": "chinese",
   "instruction": "\n您的任务是评估查询的响应是否与提供的图像和文本上下文信息一致。\n您有两个选项可以回答。要么是 True / False。\n如果查询的响应与上下文信息一致，则回答 - True，否则为 False。\n",
   "examples": [
@@ -9,7 +9,7 @@
         "user_input": "传统玛格丽塔披萨的主要成分是什么？",
         "response": "玛格丽塔披萨的主要成分是番茄、马苏里拉奶酪和新鲜罗勒。",
         "retrieved_contexts": [
-          "传统的玛格丽塔披萨由薄饼皮组成。",
+          "传统的玛格丽塔披萨由薄薄的饼皮组成。",
           "主要的配料包括番茄、马苏里拉奶酪、新鲜罗勒、盐和橄榄油。",
           "它是最简单和最经典的披萨类型之一。"
         ]
@@ -24,7 +24,7 @@
         "response": "2021年的最佳男演员奖由莱昂纳多·迪卡普里奥获得。",
         "retrieved_contexts": [
           "第93届奥斯卡颁奖典礼于2021年举行。",
-          "安东尼·霍普金斯凭借在《困在时间里的父亲》中的角色获得了最佳男演员奖。",
+          "安东尼·霍普金斯凭借在《困在时间里的父亲》中的角色赢得了最佳男演员奖。",
           "由于COVID-19的限制，这次活动具有独特性。"
         ]
       },
@@ -33,4 +33,4 @@
       }
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json
new file mode 100644
index 0000000..0c9ae2d
--- /dev/null
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json
@@ -0,0 +1,25 @@
+{
+  "ragas_version": "0.2.5",
+  "original_hash": -7036736759899743798,
+  "language": "chinese",
+  "instruction": "从给定文本中提取命名实体，限制输出为最重要的实体。确保实体数量不超过指定的最大值。",
+  "examples": [
+    {
+      "input": {
+        "text": "特斯拉和SpaceX的首席执行官埃隆·马斯克宣布计划将业务扩展到欧洲和亚洲的新地点。\n                此次扩展预计将创造数千个就业机会，特别是在柏林和上海等城市。",
+        "max_num": 10
+      },
+      "output": {
+        "entities": [
+          "埃隆·马斯克",
+          "特斯拉",
+          "SpaceX",
+          "欧洲",
+          "亚洲",
+          "柏林",
+          "上海"
+        ]
+      }
+    }
+  ]
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json
new file mode 100644
index 0000000..945e021
--- /dev/null
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json
@@ -0,0 +1,7 @@
+{
+  "ragas_version": "0.2.5",
+  "original_hash": -1422723613754983378,
+  "language": "chinese",
+  "instruction": "根据指定的条件（角色、术语、风格、长度）和提供的上下文生成查询和答案。确保答案完全忠实于上下文，仅使用直接来自提供上下文的信息。### 指令：\n1. **生成查询**：根据上下文、角色、术语、风格和长度，创建一个与角色视角一致并包含术语的问题。\n2. **生成答案**：仅使用提供的上下文中的内容，构建对查询的详细答案。不要添加上下文中未包含或无法推断的信息。\n### 示例输出：\n\n",
+  "examples": []
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json
new file mode 100644
index 0000000..5d52fa4
--- /dev/null
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json
@@ -0,0 +1,39 @@
+{
+  "ragas_version": "0.2.5",
+  "original_hash": 2334929353739018813,
+  "language": "chinese",
+  "instruction": "给定一个主题和角色列表，根据角色描述将每个角色与相关主题关联起来。",
+  "examples": [
+    {
+      "input": {
+        "themes": [
+          "同理心",
+          "包容性",
+          "远程工作"
+        ],
+        "personas": [
+          {
+            "name": "人力资源经理",
+            "role_description": "专注于包容性和员工支持。"
+          },
+          {
+            "name": "远程团队领导",
+            "role_description": "管理远程团队沟通。"
+          }
+        ]
+      },
+      "output": {
+        "mapping": {
+          "人力资源经理": [
+            "包容性",
+            "同理心"
+          ],
+          "远程团队领导": [
+            "远程工作",
+            "同理心"
+          ]
+        }
+      }
+    }
+  ]
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/SpecificQuerySynthesizer/critic_query_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/SpecificQuerySynthesizer/critic_query_prompt_chinese.json
deleted file mode 100644
index 4c79e87..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/SpecificQuerySynthesizer/critic_query_prompt_chinese.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": 6368494196383210761,
-  "language": "chinese",
-  "instruction": "根据以下评分标准批评合成生成的问题。为每个标准提供一个分数：独立性和明确意图。分数为低（0），中（1），或高（2）。",
-  "examples": [
-    {
-      "input": {
-        "text": "人工智能如何提高各个行业的效率和准确性？"
-      },
-      "output": {
-        "independence": 2,
-        "clear_intent": 2
-      }
-    },
-    {
-      "input": {
-        "text": "解释人工智能的好处。"
-      },
-      "output": {
-        "independence": 1,
-        "clear_intent": 1
-      }
-    },
-    {
-      "input": {
-        "text": "人工智能如何？"
-      },
-      "output": {
-        "independence": 0,
-        "clear_intent": 0
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/SpecificQuerySynthesizer/generate_query_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/SpecificQuerySynthesizer/generate_query_prompt_chinese.json
deleted file mode 100644
index 357eb8c..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/SpecificQuerySynthesizer/generate_query_prompt_chinese.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": 7962062912799755737,
-  "language": "chinese",
-  "instruction": "根据文本的标题和文本片段，以及片段中的关键词，生成与关键词相关的问题。\n\n",
-  "examples": [
-    {
-      "input": {
-        "title": "人工智能对现代医疗保健的影响",
-        "keyphrase": "个性化治疗方案",
-        "text": "人工智能（AI）通过提高诊断准确性并实现个性化治疗方案，正在革新医疗保健。AI算法分析大量医疗数据，以识别模式并预测患者结果，从而增强医疗专业人员的决策过程。"
-      },
-      "output": {
-        "text": "AI如何促进医疗保健中个性化治疗方案的发展？"
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/SpecificQuerySynthesizer/generate_reference_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/SpecificQuerySynthesizer/generate_reference_prompt_chinese.json
deleted file mode 100644
index 077c87b..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/SpecificQuerySynthesizer/generate_reference_prompt_chinese.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": -800989296329424606,
-  "language": "chinese",
-  "instruction": "根据给定文本中提供的信息回答以下问题。",
-  "examples": [
-    {
-      "input": {
-        "query": "人工智能如何提高不同行业的效率和准确性？",
-        "context": "人工智能的进步已经革新了许多行业。从医疗保健到金融，人工智能算法使流程更加高效和准确。机器学习模型被用于预测疾病、优化投资策略，甚至向用户推荐个性化内容。人工智能融入日常运营对于现代企业变得越来越不可或缺。"
-      },
-      "output": {
-        "text": "人工智能通过使流程更加高效和准确，从而提高不同行业的效率和准确性。"
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/SpecificQuerySynthesizer/query_modification_prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/SpecificQuerySynthesizer/query_modification_prompt_chinese.json
deleted file mode 100644
index 6e419c4..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/SpecificQuerySynthesizer/query_modification_prompt_chinese.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": 8176750950393070076,
-  "language": "chinese",
-  "instruction": "修改给定的问题以适应给定的风格和长度。",
-  "examples": []
-}
\ No newline at end of file
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json
index b9661ef..bb28889 100644
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json
@@ -1,16 +1,16 @@
 {
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": 8902721344697016091,
+  "ragas_version": "0.2.5",
+  "original_hash": -5467318232123540806,
   "language": "chinese",
-  "instruction": "将给定的文本总结为不到10句话。",
+  "instruction": "将给定文本总结为少于10个句子。",
   "examples": [
     {
       "input": {
-        "text": "人工智能\n\n人工智能通过自动化以前需要人类智能的任务，正在改变各个行业。从医疗保健到金融，人工智能被用于快速准确地分析大量数据。这项技术还在自动驾驶汽车和个性化推荐等领域推动创新。"
+        "text": "人工智能\n\n人工智能正在通过自动化以前需要人类智能的任务来改变各个行业。从医疗到金融，人工智能正在被用来快速准确地分析大量数据。这项技术还推动了自动驾驶汽车和个性化推荐等领域的创新。"
       },
       "output": {
-        "text": "人工智能通过自动化任务、分析数据以及推动自动驾驶汽车和个性化推荐等领域的创新，正在革新各个行业。"
+        "text": "人工智能通过自动化任务、分析数据以及推动自动驾驶汽车和个性化推荐等领域的创新，正在革新各个行业。"
       }
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json
new file mode 100644
index 0000000..50f05af
--- /dev/null
+++ b/evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json
@@ -0,0 +1,24 @@
+{
+  "ragas_version": "0.2.5",
+  "original_hash": 2452110859551524285,
+  "language": "chinese",
+  "instruction": "从给定的文本中提取主要主题和概念。",
+  "examples": [
+    {
+      "input": {
+        "text": "人工智能通过自动化需要人类智能的任务来改变行业。人工智能快速准确地分析大量数据，推动了自动驾驶汽车和个性化推荐等创新。",
+        "max_num": 10
+      },
+      "output": {
+        "output": [
+          "人工智能",
+          "自动化",
+          "数据分析",
+          "创新",
+          "自动驾驶汽车",
+          "个性化推荐"
+        ]
+      }
+    }
+  ]
+}
diff --git a/evalscope/backend/rag_eval/ragas/prompts/chinese/TitleExtractor/prompt_chinese.json b/evalscope/backend/rag_eval/ragas/prompts/chinese/TitleExtractor/prompt_chinese.json
deleted file mode 100644
index eeffd2a..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/chinese/TitleExtractor/prompt_chinese.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-  "ragas_version": "0.2.2.dev9+g8efe80d.d20241021",
-  "original_hash": -2292763252839833091,
-  "language": "chinese",
-  "instruction": "提取给定文档的标题。",
-  "examples": [
-    {
-      "input": {
-        "text": "深度学习在自然语言处理中的应用\n\n摘要\n\n深度学习已经彻底改变了自然语言处理（NLP）领域。本文探讨了各种深度学习模型及其在语言翻译、情感分析和文本生成等NLP任务中的应用。我们讨论了不同模型的优点和局限性，并提供了NLP当前最先进状态的全面概述。"
-      },
-      "output": {
-        "text": "深度学习在自然语言处理中的应用"
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/evalscope/backend/rag_eval/ragas/prompts/multi_modal_prompt.py b/evalscope/backend/rag_eval/ragas/prompts/multi_modal_prompt.py
deleted file mode 100644
index 871f243..0000000
--- a/evalscope/backend/rag_eval/ragas/prompts/multi_modal_prompt.py
+++ /dev/null
@@ -1,207 +0,0 @@
-import os
-import typing as t
-import base64
-import urllib.request
-from urllib.parse import urlparse
-from langchain_core.prompt_values import PromptValue, HumanMessage
-from pydantic import BaseModel
-from ragas.callbacks import ChainType, new_group
-from ragas.exceptions import RagasOutputParserException
-from ragas.prompt.pydantic_prompt import PydanticPrompt, RagasOutputParser
-from evalscope.utils.logger import get_logger
-import mimetypes
-from langchain_core.callbacks import Callbacks
-from ragas.llms.base import BaseRagasLLM
-
-
-# type variables for input and output models
-InputModel = t.TypeVar("InputModel", bound=BaseModel)
-OutputModel = t.TypeVar("OutputModel", bound=BaseModel)
-
-logger = get_logger()
-
-
-class ImageTextPrompt(PydanticPrompt, t.Generic[InputModel, OutputModel]):
-
-    def _generate_examples(self):
-        if self.examples:
-            example_strings = []
-            for e in self.examples:
-                input_data, output_data = e
-                example_strings.append(
-                    self.instruction
-                    + "\n"
-                    + "input: "
-                    + input_data.model_dump_json(indent=4)
-                    + "\n"
-                    + "output: "
-                    + output_data.model_dump_json(indent=4)
-                )
-
-            return (
-                "Some examples are provided below with only text context, but please do use any images for context if they are provided.\n"
-                + "\n\n".join(example_strings)
-            )
-        # if no examples are provided
-        else:
-            return ""
-
-    def to_prompt_value(self, data: t.Optional[InputModel] = None):
-        text = [
-            self._generate_instruction(),
-            self._generate_output_signature(),
-            self._generate_examples(),
-            "Now perform the above instruction with the following",
-        ] + data.to_string_list()
-        return ImageTextPromptValue(items=text)
-
-    async def generate_multiple(
-        self,
-        llm: BaseRagasLLM,
-        data: InputModel,
-        n: int = 1,
-        temperature: t.Optional[float] = None,
-        stop: t.Optional[t.List[str]] = None,
-        callbacks: t.Optional[Callbacks] = None,
-    ) -> t.List[OutputModel]:
-        """
-        Generate multiple outputs using the provided language model and input data.
-
-        Parameters
-        ----------
-        llm : BaseRagasLLM
-            The language model to use for generation.
-        data : InputModel
-            The input data for generation.
-        n : int, optional
-            The number of outputs to generate. Default is 1.
-        temperature : float, optional
-            The temperature parameter for controlling randomness in generation.
-        stop : List[str], optional
-            A list of stop sequences to end generation.
-        callbacks : Callbacks, optional
-            Callback functions to be called during the generation process.
-
-        Returns
-        -------
-        List[OutputModel]
-            A list of generated outputs.
-
-        Raises
-        ------
-        RagasOutputParserException
-            If there's an error parsing the output.
-        """
-        callbacks = callbacks or []
-        processed_data = self.process_input(data)
-        prompt_rm, prompt_cb = new_group(
-            name=self.name,
-            inputs={"data": processed_data},
-            callbacks=callbacks,
-            metadata={"type": ChainType.RAGAS_PROMPT},
-        )
-        prompt_value = self.to_prompt_value(processed_data)
-        resp = await llm.generate(
-            prompt_value,
-            n=n,
-            temperature=temperature,
-            stop=stop,
-            callbacks=prompt_cb,
-        )
-
-        output_models = []
-        parser = RagasOutputParser(pydantic_object=self.output_model)
-        for i in range(n):
-            output_string = resp.generations[0][i].text
-            try:
-                answer = await parser.parse_output_string(
-                    output_string=output_string,
-                    prompt_value=prompt_value,
-                    llm=llm,
-                    callbacks=prompt_cb,
-                    max_retries=3,
-                )
-                processed_output = self.process_output(answer, data)  # type: ignore
-                output_models.append(processed_output)
-            except RagasOutputParserException as e:
-                prompt_rm.on_chain_error(error=e)
-                logger.error("Prompt %s failed to parse output: %s", self.name, e)
-                raise e
-
-        prompt_rm.on_chain_end({"output": output_models})
-        return output_models
-
-
-class ImageTextPromptValue(PromptValue):
-    items: t.List[str]
-
-    def to_messages(self):
-        messages = []
-        for item in self.items:
-            if self.is_image(item):
-                messages.append(self.get_image(item))
-            else:
-                messages.append(self.get_text(item))
-        return [HumanMessage(content=messages)]
-
-    def get_text(self, item):
-        return {"type": "text", "text": item}
-
-    def get_image(self, item):
-        if self.is_base64(item):
-            encoded_image = item
-        elif self.is_valid_url(item):
-            encoded_image = self.download_and_encode_image(item)
-        else:
-            encoded_image = self.encode_image_to_base64(item)
-
-        return {
-            "type": "image_url",
-            "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
-        }
-
-    def to_string(self):
-        string_representation = ""
-        for item in self.items:
-            if self.is_image(item):
-                string_representation += "[Image]"
-            else:
-                string_representation += item
-            string_representation += " "
-        return string_representation.strip()
-
-    def is_base64(self, s):
-        try:
-            if isinstance(s, str):
-                # Try to decode the string
-                if base64.b64encode(base64.b64decode(s)).decode("utf-8") == s:
-                    return True
-            return False
-        except Exception:
-            return False
-
-    def is_valid_url(self, url):
-        try:
-            result = urlparse(url)
-            return all([result.scheme, result.netloc])
-        except ValueError:
-            return False
-
-    def encode_image_to_base64(self, file_path):
-        with open(file_path, "rb") as image_file:
-            return base64.b64encode(image_file.read()).decode("utf-8")
-
-    def download_and_encode_image(self, url):
-        with urllib.request.urlopen(url) as response:
-            return base64.b64encode(response.read()).decode("utf-8")
-
-    def is_image(self, item):
-        if self.is_base64(item):
-            return True
-        elif self.is_valid_url(item):
-            mime_type, _ = mimetypes.guess_type(item)
-            return mime_type and mime_type.startswith("image")
-        elif isinstance(item, str):
-            mime_type, _ = mimetypes.guess_type(item)
-            return mime_type and mime_type.startswith("image")
-        return False
diff --git a/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py b/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py
new file mode 100644
index 0000000..1287b07
--- /dev/null
+++ b/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py
@@ -0,0 +1,18 @@
+import typing as t
+
+from pydantic import BaseModel
+from ragas.prompt import PydanticPrompt, StringIO
+from ragas.testset.persona import Persona
+
+
+class PersonaGenerationPromptZH(PydanticPrompt[StringIO, Persona]):
+    instruction: str = ('使用提供的摘要，生成一个可能会与内容互动或从中受益的角色。包括一个独特的名字和一个简洁的角色描述。')
+    input_model: t.Type[StringIO] = StringIO
+    output_model: t.Type[Persona] = Persona
+    examples: t.List[t.Tuple[StringIO, Persona]] = [(
+        StringIO(text='《数字营销指南》解释了在各种在线平台上吸引受众的策略。'),
+        Persona(
+            name='数字营销专家',
+            role_description='专注于吸引受众并在线上提升品牌。',
+        ),
+    )]
diff --git a/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py b/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py
index c4bb756..8381c9c 100644
--- a/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py
+++ b/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py
@@ -1,15 +1,15 @@
-import os
 import asyncio
+import os
+
 import pandas as pd
-from tqdm import tqdm
-from ragas.llms import LangchainLLMWrapper
 from ragas.embeddings import LangchainEmbeddingsWrapper
-from .translate_prompt import translate_prompts
-from evalscope.utils.logger import get_logger
-from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments
-from evalscope.backend.rag_eval import EmbeddingModel, LLM, ChatOpenAI
+from ragas.llms import LangchainLLMWrapper
+from tqdm import tqdm
 
-os.environ['DO_NOT_TRACK'] = 'true'
+from evalscope.backend.rag_eval import LLM, ChatOpenAI, EmbeddingModel
+from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments
+from evalscope.utils.logger import get_logger
+from .translate_prompt import translate_prompts
 
 logger = get_logger()
 
@@ -17,116 +17,110 @@
 def get_transform(llm, embedding, language):
     """
     Creates and returns a default set of transforms for processing a knowledge graph.
-
-    This function defines a series of transformation steps to be applied to a
-    knowledge graph, including extracting summaries, keyphrases, titles,
-    headlines, and embeddings, as well as building similarity relationships
-    between nodes.
-
-    The transforms are applied in the following order:
-    1. Parallel extraction of summaries and headlines
-    2. Embedding of summaries for document nodes
-    3. Splitting of headlines
-    4. Parallel extraction of embeddings, keyphrases, and titles
-    5. Building cosine similarity relationships between nodes
-    6. Building cosine similarity relationships between summaries
-
-    Returns
-    -------
-    Transforms
-        A list of transformation steps to be applied to the knowledge graph.
-
     """
     from ragas.testset.transforms.engine import Parallel
     from ragas.testset.transforms.extractors import (
         EmbeddingExtractor,
         HeadlinesExtractor,
-        KeyphrasesExtractor,
         SummaryExtractor,
-        TitleExtractor,
     )
-    from ragas.testset.transforms.relationship_builders.cosine import (
+    from ragas.testset.transforms.extractors.llm_based import NERExtractor, ThemesExtractor
+    from ragas.testset.transforms.relationship_builders import (
         CosineSimilarityBuilder,
-        SummaryCosineSimilarityBuilder,
+        OverlapScoreBuilder,
     )
     from ragas.testset.transforms.splitters import HeadlineSplitter
+    from ragas.testset.transforms.filters import CustomNodeFilter
     from ragas.testset.graph import NodeType
+    from ragas.utils import num_tokens_from_string
+
+    def summary_filter(node):
+        return (node.type == NodeType.DOCUMENT and num_tokens_from_string(node.properties['page_content']) > 500)
 
-    # define the transforms
-    summary_extractor = SummaryExtractor(llm=llm)
-    keyphrase_extractor = KeyphrasesExtractor(llm=llm)
-    title_extractor = TitleExtractor(llm=llm)
+    summary_extractor = SummaryExtractor(llm=llm, filter_nodes=lambda node: summary_filter(node))
+    ner_extractor = NERExtractor(llm=llm, filter_nodes=lambda node: node.type == NodeType.CHUNK)
+    theme_extractor = ThemesExtractor(llm=llm)
     headline_extractor = HeadlinesExtractor(llm=llm)
 
     asyncio.run(
         translate_prompts(
             prompts=[
                 summary_extractor,
-                keyphrase_extractor,
-                title_extractor,
+                theme_extractor,
+                ner_extractor,
                 headline_extractor,
             ],
             target_lang=language,
             llm=llm,
             adapt_instruction=True,
-        )
-    )
+        ))
+
+    splitter = HeadlineSplitter(min_tokens=500)
 
-    embedding_extractor = EmbeddingExtractor(embedding_model=embedding)
-    headline_splitter = HeadlineSplitter()
-    cosine_sim_builder = CosineSimilarityBuilder(threshold=0.8)
-    summary_embedder = EmbeddingExtractor(
-        name='summary_embedder',
-        filter_nodes=lambda node: True if node.type == NodeType.DOCUMENT else False,
+    summary_emb_extractor = EmbeddingExtractor(
+        embedding_model=embedding,
         property_name='summary_embedding',
         embed_property_name='summary',
-        embedding_model=embedding,
+        filter_nodes=lambda node: summary_filter(node),
     )
-    summary_cosine_sim_builder = SummaryCosineSimilarityBuilder(threshold=0.6)
 
-    # specify the transforms and their order to be applied
+    cosine_sim_builder = CosineSimilarityBuilder(
+        property_name='summary_embedding',
+        new_property_name='summary_similarity',
+        threshold=0.7,
+        filter_nodes=lambda node: summary_filter(node),
+    )
+
+    ner_overlap_sim = OverlapScoreBuilder(threshold=0.01, filter_nodes=lambda node: node.type == NodeType.CHUNK)
+
+    node_filter = CustomNodeFilter(llm=llm, filter_nodes=lambda node: node.type == NodeType.CHUNK)
+
     transforms = [
-        Parallel(summary_extractor, headline_extractor),
-        summary_embedder,
-        headline_splitter,
-        Parallel(embedding_extractor, keyphrase_extractor, title_extractor),
-        cosine_sim_builder,
-        summary_cosine_sim_builder,
+        headline_extractor,
+        splitter,
+        summary_extractor,
+        node_filter,
+        Parallel(summary_emb_extractor, theme_extractor, ner_extractor),
+        Parallel(cosine_sim_builder, ner_overlap_sim),
     ]
+
     return transforms
 
 
 def get_distribution(llm, distribution, language):
-    from ragas.testset.synthesizers.abstract_query import (
-        AbstractQuerySynthesizer,
-        ComparativeAbstractQuerySynthesizer,
+    from ragas.testset.synthesizers.multi_hop import (
+        MultiHopAbstractQuerySynthesizer,
+        MultiHopSpecificQuerySynthesizer,
     )
-    from ragas.testset.synthesizers.specific_query import SpecificQuerySynthesizer
+    from ragas.testset.synthesizers.single_hop.specific import (
+        SingleHopSpecificQuerySynthesizer, )
 
-    abstract = AbstractQuerySynthesizer(llm=llm)
-    comparative = ComparativeAbstractQuerySynthesizer(llm=llm)
-    specific = SpecificQuerySynthesizer(llm=llm)
+    single_hop = SingleHopSpecificQuerySynthesizer(llm=llm)
+    multi_hop_abs = MultiHopAbstractQuerySynthesizer(llm=llm)
+    multi_hop_spec = MultiHopSpecificQuerySynthesizer(llm=llm)
 
     asyncio.run(
         translate_prompts(
             prompts=[
-                abstract,
-                comparative,
-                specific,
+                single_hop,
+                multi_hop_abs,
+                multi_hop_spec,
             ],
             target_lang=language,
             llm=llm,
             adapt_instruction=True,
-        )
-    )
-    return [
-        (abstract, distribution['simple']),
-        (comparative, distribution['multi_context']),
-        (specific, distribution['reasoning']),
-    ]
+        ))
+
+    mapping = {
+        'simple': single_hop,
+        'multi_context': multi_hop_abs,
+        'reasoning': multi_hop_spec,
+    }
+
+    return [(mapping[key], distribution[key]) for key in mapping if key in distribution]
 
 
-def get_knowledge_graph(documents, transforms, local_file):
+def get_knowledge_graph(documents, transforms, local_file, run_config):
     from ragas.testset.graph import KnowledgeGraph, Node, NodeType
     from ragas.testset.transforms import apply_transforms
 
@@ -148,7 +142,7 @@ def get_knowledge_graph(documents, transforms, local_file):
     kg = KnowledgeGraph(nodes=nodes)
 
     # apply transforms and update the knowledge graph
-    apply_transforms(kg, transforms)
+    apply_transforms(kg, transforms, run_config=run_config)
 
     # save the knowledge graph
     output_path = os.path.dirname(local_file)
@@ -158,6 +152,39 @@ def get_knowledge_graph(documents, transforms, local_file):
     return kg
 
 
+def get_persona(llm, kg, language):
+    """Generate personas from the knowledge graph with ragas' ``generate_personas_from_kg``.
+
+    Args:
+        llm: LLM forwarded unchanged to ``generate_personas_from_kg``.
+        kg: Knowledge graph forwarded unchanged to ``generate_personas_from_kg``.
+        language: ``'chinese'`` selects PersonaGenerationPromptZH; any other value uses PersonaGenerationPrompt.
+
+    Returns:
+        The result of ``generate_personas_from_kg`` called with ``num_personas=3``.
+    """
+    from evalscope.backend.rag_eval.ragas.prompts.persona_prompt import PersonaGenerationPromptZH
+    from ragas.testset.persona import generate_personas_from_kg, PersonaGenerationPrompt
+    from ragas.testset.graph import Node
+
+    def is_summarized_document(node: Node) -> bool:
+        return node.type.name == 'DOCUMENT' and node.properties.get('summary_embedding') is not None
+
+    # NOTE: translate_prompts can't adapt the persona prompt yet, so pick a prebuilt prompt by language.
+    if language == 'chinese':
+        persona_prompt = PersonaGenerationPromptZH()
+    else:
+        persona_prompt = PersonaGenerationPrompt()
+
+    return generate_personas_from_kg(
+        llm=llm,
+        kg=kg,
+        num_personas=3,
+        persona_generation_prompt=persona_prompt,
+        filter_fn=is_summarized_document,
+    )
+
+
 def load_data(file_path):
     from langchain_community.document_loaders import UnstructuredFileLoader
 
@@ -178,32 +205,31 @@ def generate_testset(args: TestsetGenerationArguments) -> None:
     generator_llm = LLM.load(**args.generator_llm)
     embeddings = EmbeddingModel.load(**args.embeddings)
 
+    wrapped_llm = LangchainLLMWrapper(generator_llm)
+    wrapped_embeddings = LangchainEmbeddingsWrapper(embeddings)
+
     # Change resulting question type distribution
-    distributions = get_distribution(
-        LangchainLLMWrapper(generator_llm), args.distribution, args.language
-    )
+    distributions = get_distribution(wrapped_llm, args.distribution, args.language)
 
+    run_config = RunConfig(timeout=600, max_retries=3, max_wait=120, max_workers=1, log_tenacity=True)
     # get transforms
     transforms = get_transform(
-        LangchainLLMWrapper(generator_llm),
-        LangchainEmbeddingsWrapper(embeddings),
+        wrapped_llm,
+        wrapped_embeddings,
         args.language,
     )
 
     # get knowledge graph
-    knowledge_graph = get_knowledge_graph(documents, transforms, args.knowledge_graph)
+    knowledge_graph = get_knowledge_graph(documents, transforms, args.knowledge_graph, run_config)
 
-    generator = TestsetGenerator.from_langchain(
-        generator_llm, embeddings, knowledge_graph
-    )
+    persona_list = get_persona(llm=wrapped_llm, kg=knowledge_graph, language=args.language)
+
+    generator = TestsetGenerator(llm=wrapped_llm, knowledge_graph=knowledge_graph, persona_list=persona_list)
 
-    runconfig = RunConfig(
-        timeout=600, max_retries=3, max_wait=120, max_workers=1, log_tenacity=True
-    )
     testset = generator.generate(
         testset_size=args.test_size,
         query_distribution=distributions,
-        run_config=runconfig,
+        run_config=run_config,
         with_debugging_logs=True,
         raise_exceptions=True,
     )
@@ -212,9 +238,7 @@ def generate_testset(args: TestsetGenerationArguments) -> None:
     testset_df = testset.to_pandas()
     output_path = os.path.dirname(args.output_file)
     os.makedirs(output_path, exist_ok=True)
-    testset_df.to_json(
-        args.output_file, indent=4, index=False, orient='records', force_ascii=False
-    )
+    testset_df.to_json(args.output_file, indent=4, index=False, orient='records', force_ascii=False)
 
     # get answer
     testset_with_answer = get_answer(testset_df, generator_llm, args.language)
@@ -243,21 +267,17 @@ def get_answer(testset_df, generator_llm, language: None):
         contexts = '\n'.join(row['reference_contexts'])
 
         # Combine question and contexts as input for the LLM
-        input_text = template.format(
-            language=language, question=question, contexts=contexts
-        )
+        input_text = template.format(language=language, question=question, contexts=contexts)
 
         # Generate the answer using the generator LLM
         answer = generator_llm.invoke(input_text)
         if isinstance(generator_llm, ChatOpenAI):
             answer = answer.content
-        items.append(
-            {
-                'user_input': question,
-                'retrieved_contexts': row['reference_contexts'],
-                'response': answer,
-                'reference': row['reference'],
-            }
-        )
+        items.append({
+            'user_input': question,
+            'retrieved_contexts': row['reference_contexts'],
+            'response': answer,
+            'reference': row['reference'],
+        })
 
     return pd.DataFrame.from_dict(items)
diff --git a/evalscope/backend/rag_eval/utils/__init__.py b/evalscope/backend/rag_eval/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/requirements/rag.txt b/requirements/rag.txt
index 8f5df71..cf668db 100644
--- a/requirements/rag.txt
+++ b/requirements/rag.txt
@@ -1,3 +1,3 @@
 mteb==1.19.4
-ragas==0.2.3
+ragas==0.2.5
 webdataset>0.2.0
diff --git a/setup.cfg b/setup.cfg
index 86a9f71..8bff0fa 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [isort]
-line_length = 79
+line_length = 120
 multi_line_output = 0
 known_standard_library = setuptools
 known_first_party = evalscope
@@ -8,6 +8,7 @@ no_lines_before = STDLIB,LOCALFOLDER
 default_section = THIRDPARTY
 
 [yapf]
+column_limit = 120
 BASED_ON_STYLE = pep8
 BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
 SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true