From d4edd29b48e32ed1a85a99d5f55bd9eef62f1b91 Mon Sep 17 00:00:00 2001
From: MikeACedric <72818458+MikeACedric@users.noreply.github.com>
Date: Wed, 17 Dec 2025 19:24:51 +0100
Subject: [PATCH 01/12] =?UTF-8?q?=F0=9F=93=9D=20Added=20prompts=20for=20ev?=
=?UTF-8?q?aluation=20rubrics:=20Innovation,=20Gap,=20Rigor,=20Depth=20and?=
=?UTF-8?q?=20Breadth?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
yescieval/rubric/breadth.py | 268 +++++++++++++++++++++++++++++++++
yescieval/rubric/depth.py | 161 ++++++++++++++++++++
yescieval/rubric/gap.py | 54 +++++++
yescieval/rubric/innovation.py | 109 ++++++++++++++
yescieval/rubric/rigor.py | 161 ++++++++++++++++++++
5 files changed, 753 insertions(+)
create mode 100644 yescieval/rubric/breadth.py
create mode 100644 yescieval/rubric/depth.py
create mode 100644 yescieval/rubric/gap.py
create mode 100644 yescieval/rubric/innovation.py
create mode 100644 yescieval/rubric/rigor.py
diff --git a/yescieval/rubric/breadth.py b/yescieval/rubric/breadth.py
new file mode 100644
index 0000000..2f3fc28
--- /dev/null
+++ b/yescieval/rubric/breadth.py
@@ -0,0 +1,268 @@
+from ..base import Rubric
+
+geographic_coverage_prompt = """
+Scientific synthesis generation involves creating a concise, coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires:
+
+- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources.
+- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question.
+- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information.
+- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question.
+- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability.
+- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on correctness characteristic, ensuring it effectively communicates the synthesized information.
+
+In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness.
+
+
+
+You are tasked as a scientific synthesis quality evaluator.
+
+
+
+A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim.
+
+
+
+1. Correctness: is the information in the answer a correct representation of the content of the provided abstracts?
+
+
+
+For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
+
+1. Correctness
+Rating 1. Very bad: The synthesis consistently misrepresents or inaccurately portrays the content of the provided abstracts, showing a significant deviation from the original sources.
+Rating 2. Bad: The synthesis contains several inaccuracies or misinterpretations of the source abstracts.
+Rating 3. Moderate: The synthesis accurately represents most of the content from the provided abstracts but may contain minor errors.
+Rating 4. Good: The synthesis provides an accurate representation of the content from the provided abstracts with minor exceptions.
+Rating 5. Very good: The information in the synthesis is an accurate and faithful representation of the content from the provided abstracts, without any factual errors or misinterpretations.
+
+
+
+For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating.
+Return your response in JSON format: {characteristic: {"rating": "", "rationale": ""}}
+
+
+{
+    "Correctness": {"rating": "4", "rationale": "The synthesis represents the content of the provided abstract, but with minor irrelevant information."}
+}
+
+
+
+
+Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material.
+"""
+class GeographicCoverage(Rubric):
+ system_prompt_template: str = geographic_coverage_prompt
+
+intervention_diversity_prompt = """
+Scientific synthesis generation involves creating a concise, coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires:
+
+- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources.
+- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question.
+- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information.
+- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question.
+- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability.
+- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on completeness characteristic, ensuring it effectively communicates the synthesized information.
+
+In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness.
+
+
+
+You are tasked as a scientific synthesis quality evaluator.
+
+
+
+A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim.
+
+
+
+1. Completeness: is the answer a comprehensive encapsulation of the relevant information in the provided abstracts?
+
+
+
+For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
+
+1. Completeness
+Rating 1. Very bad: The synthesis omits most of the relevant information, failing to capture the essential points or details from the provided abstracts.
+Rating 2. Bad: Significant portions of relevant information from the provided abstracts are missing.
+Rating 3. Moderate: The synthesis captures a fair amount of the relevant information, though it may overlook some details.
+Rating 4. Good: The synthesis includes almost all relevant information, missing only minor details.
+Rating 5. Very good: The synthesis comprehensively encapsulates all relevant information from the provided abstracts, leaving no pertinent details or points unaddressed.
+
+
+
+For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating.
+Return your response in JSON format: {characteristic: {"rating": "", "rationale": ""}}
+
+
+{
+ "Completeness": {"rating": "4", "rationale": "Only minor details are missing in the synthesis."}
+}
+
+
+
+
+Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material.
+"""
+class InterventionDiversity(Rubric):
+ system_prompt_template: str = intervention_diversity_prompt
+
+biodiversity_dimensions_prompt = """
+Scientific synthesis generation involves creating a concise, coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires:
+
+- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources.
+- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question.
+- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information.
+- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question.
+- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability.
+- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on informativeness characteristic, ensuring it effectively communicates the synthesized information.
+
+In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness.
+
+
+
+You are tasked as a scientific synthesis quality evaluator.
+
+
+
+A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim.
+
+
+
+1. Informativeness: is the answer a useful and informative reply to the question?
+
+
+
+For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
+
+1. Informativeness
+Rating 1. Very bad: The synthesis offers no valuable insights or useful information in response to the research question, lacking depth and utility.
+Rating 2. Bad: The answer provides limited new insights or useful information in response to the research question.
+Rating 3. Moderate: The answer is somewhat informative, offering insights or useful information but not in a comprehensive or detailed manner.
+Rating 4. Good: The answer is informative and insightful, providing comprehensive information in response to the research question.
+Rating 5. Very good: The synthesis is highly informative, providing valuable insights and detailed information that thoroughly addresses the research question.
+
+
+
+For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating.
+Return your response in JSON format: {characteristic: {"rating": "", "rationale": ""}}
+
+
+{
+ "Informativeness": {"rating": "4", "rationale": "Most information is informative for the research question."}
+}
+
+
+
+
+Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material.
+"""
+class BiodiversityDimensions(Rubric):
+ system_prompt_template: str = biodiversity_dimensions_prompt
+
+ecosystem_services_prompt = """
+Scientific synthesis generation involves creating a concise, coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires:
+
+- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources.
+- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question.
+- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information.
+- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question.
+- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability.
+- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on informativeness characteristic, ensuring it effectively communicates the synthesized information.
+
+In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness.
+
+
+
+You are tasked as a scientific synthesis quality evaluator.
+
+
+
+A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim.
+
+
+
+1. Informativeness: is the answer a useful and informative reply to the question?
+
+
+
+For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
+
+1. Informativeness
+Rating 1. Very bad: The synthesis offers no valuable insights or useful information in response to the research question, lacking depth and utility.
+Rating 2. Bad: The answer provides limited new insights or useful information in response to the research question.
+Rating 3. Moderate: The answer is somewhat informative, offering insights or useful information but not in a comprehensive or detailed manner.
+Rating 4. Good: The answer is informative and insightful, providing comprehensive information in response to the research question.
+Rating 5. Very good: The synthesis is highly informative, providing valuable insights and detailed information that thoroughly addresses the research question.
+
+
+
+For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating.
+Return your response in JSON format: {characteristic: {"rating": "", "rationale": ""}}
+
+
+{
+ "Informativeness": {"rating": "4", "rationale": "Most information is informative for the research question."}
+}
+
+
+
+
+Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material.
+"""
+class EcosystemServices(Rubric):
+ system_prompt_template: str = ecosystem_services_prompt
+
+spatial_scale_prompt = """
+Scientific synthesis generation involves creating a concise, coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires:
+
+- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources.
+- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question.
+- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information.
+- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question.
+- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability.
+- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on informativeness characteristic, ensuring it effectively communicates the synthesized information.
+
+In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness.
+
+
+
+You are tasked as a scientific synthesis quality evaluator.
+
+
+
+A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim.
+
+
+
+1. Informativeness: is the answer a useful and informative reply to the question?
+
+
+
+For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
+
+1. Informativeness
+Rating 1. Very bad: The synthesis offers no valuable insights or useful information in response to the research question, lacking depth and utility.
+Rating 2. Bad: The answer provides limited new insights or useful information in response to the research question.
+Rating 3. Moderate: The answer is somewhat informative, offering insights or useful information but not in a comprehensive or detailed manner.
+Rating 4. Good: The answer is informative and insightful, providing comprehensive information in response to the research question.
+Rating 5. Very good: The synthesis is highly informative, providing valuable insights and detailed information that thoroughly addresses the research question.
+
+
+
+For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating.
+Return your response in JSON format: {characteristic: {"rating": "", "rationale": ""}}
+
+
+{
+ "Informativeness": {"rating": "4", "rationale": "Most information is informative for the research question."}
+}
+
+
+
+
+Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material.
+"""
+class SpatialScale(Rubric):
+ system_prompt_template: str = spatial_scale_prompt
+
+
diff --git a/yescieval/rubric/depth.py b/yescieval/rubric/depth.py
new file mode 100644
index 0000000..9334572
--- /dev/null
+++ b/yescieval/rubric/depth.py
@@ -0,0 +1,161 @@
+from ..base import Rubric
+
+mechanistic_understanding_prompt = """
+Scientific synthesis generation involves creating a concise, coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires:
+
+- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources.
+- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question.
+- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information.
+- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question.
+- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability.
+- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on correctness characteristic, ensuring it effectively communicates the synthesized information.
+
+In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness.
+
+
+
+You are tasked as a scientific syntheses quality evaluator.
+
+
+
+A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim.
+
+
+
+1. Correctness: is the information in the answer a correct representation of the content of the provided abstracts?
+
+
+
+For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
+
+1. Correctness
+Rating 1. Very bad: The synthesis consistently misrepresents or inaccurately portrays the content of the provided abstracts, showing a significant deviation from the original sources.
+Rating 2. Bad: The synthesis contains several inaccuracies or misinterpretations of the source abstracts.
+Rating 3. Moderate: The synthesis accurately represents most of the content from the provided abstracts but may contain minor errors.
+Rating 4. Good: The synthesis provides an accurate representation of the content from the provided abstracts with minor exceptions.
+Rating 5. Very good: The information in the synthesis is an accurate and faithful representation of the content from the provided abstracts, without any factual errors or misinterpretations.
+
+
+
+For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating.
+Return your response in JSON format: {characteristic : {"rating" : "", "rationale" : ""}}
+
+
+{
+ "Correctness": {"rating": "4", "rationale": "The synthesis represents the content of the provided abstract, but with minor inrelevant information."}
+}
+
+
+
+
+Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material.
+"""
+class MechanisticUnderstanding(Rubric):
+ system_prompt_template: str = mechanistic_understanding_prompt
+
+causal_reasoning_prompt = """
+Scientific synthesis generation involves creating a concise, coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires:
+
+- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources.
+- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question.
+- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information.
+- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question.
+- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability.
+- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on completeness characteristic, ensuring it effectively communicates the synthesized information.
+
+In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness.
+
+
+
+You are tasked as a scientific syntheses quality evaluator.
+
+
+
+A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim.
+
+
+
+1. Completeness: is the answer a comprehensive encapsulation of the relevant information in the provided abstracts?
+
+
+
+For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
+
+1. Completeness
+Rating 1. Very bad: The synthesis omits most of the relevant information, failing to capture the essential points or details from the provided abstracts.
+Rating 2. Bad: Significant portions of relevant information from the provided abstracts are missing.
+Rating 3. Moderate: The synthesis captures a fair amount of the relevant information, though it may overlook some details.
+Rating 4. Good: The synthesis includes almost all relevant information, missing only minor details.
+Rating 5. Very good: The synthesis comprehensively encapsulates all relevant information from the provided abstracts, leaving no pertinent details or points unaddressed.
+
+
+
+For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating.
+Return your response in JSON format: {characteristic : {"rating" : "", "rationale" : ""}}
+
+
+{
+ "Completeness": {"rating": "4", "rationale": "Only minor details are missing in the synthesis."}
+}
+
+
+
+
+Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material.
+"""
+class CausalReasoning(Rubric):
+ system_prompt_template: str = causal_reasoning_prompt
+
+temporal_precision_prompt = """
+Scientific synthesis generation involves creating a concise, coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires:
+
+- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources.
+- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question.
+- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information.
+- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question.
+- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability.
+- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on informativeness characteristic, ensuring it effectively communicates the synthesized information.
+
+In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness.
+
+
+
+You are tasked as a scientific syntheses quality evaluator.
+
+
+
+A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim.
+
+
+
+1. Informativeness: is the answer a useful and informative reply to the question?
+
+
+
+For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
+
+1. Informativeness
+Rating 1. Very bad: The synthesis offers no valuable insights or useful information in response to the research question, lacking depth and utility.
+Rating 2. Bad: The answer provides limited new insights or useful information in response to the research question.
+Rating 3. Moderate: The answer is somewhat informative, offering insights or useful information but not in a comprehensive or detailed manner.
+Rating 4. Good: The answer is informative and insightful, providing comprehensive information in response to the research question.
+Rating 5. Very good: The synthesis is highly informative, providing valuable insights and detailed information that thoroughly addresses the research question.
+
+
+
+For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating.
+Return your response in JSON format: {characteristic : {"rating" : "", "rationale" : ""}}
+
+
+{
+ "Informativeness": {"rating": "4", "rationale": "Most information is informative for the research question."}
+}
+
+
+
+
+Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material.
+"""
+class TemporalPrecision(Rubric):
+ system_prompt_template: str = temporal_precision_prompt
+
diff --git a/yescieval/rubric/gap.py b/yescieval/rubric/gap.py
new file mode 100644
index 0000000..facdcbd
--- /dev/null
+++ b/yescieval/rubric/gap.py
@@ -0,0 +1,54 @@
+from ..base import Rubric
+
+gap_identification_prompt = """
+Scientific synthesis generation involves creating a concise, coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires:
+
+- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources.
+- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question.
+- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information.
+- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question.
+- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability.
+- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on informativeness characteristic, ensuring it effectively communicates the synthesized information.
+
+In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness.
+
+
+
+You are tasked as a scientific syntheses quality evaluator.
+
+
+
+A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim.
+
+
+
+1. gap_identification: To what extent does the answer explicitly identify research gaps or unanswered questions indicated by the provided abstracts?
+
+
+
+For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
+
+1. Gap Identification
+Rating 1. Very bad: The synthesis does not identify any research gaps or unanswered questions, or it introduces gaps that are contradictory to or unsupported by the provided abstracts.
+Rating 2. Bad: The synthesis refers to research gaps only in a vague or generic manner (e.g., “more research is needed”) without clearly specifying what is missing or grounding the claims in the provided abstracts.
+Rating 3. Moderate: The synthesis identifies potential research gaps, but the description is partially vague, weakly justified, or only loosely connected to the content of the provided abstracts.
+Rating 4. Good: The synthesis clearly identifies one or more research gaps that are supported by the provided abstracts, though the gaps may lack full operational detail or discussion of implications.
+Rating 5. Very good: The synthesis explicitly and clearly identifies well-defined research gaps or unanswered questions that are directly supported by the provided abstracts, specifying what is missing, where, and why it matters, without relying on vague or generic statements.
+
+
+
+For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating.
+Return your response in JSON format: {characteristic : {"rating" : "", "rationale" : ""}}
+
+
+{
+ "gap_identification": {"rating": "4", "rationale": "Identifies a relevant gap supported by the abstracts, with limited elaboration."}
+}
+
+
+
+
+Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material.
+"""
+class GapIdentification(Rubric):
+ system_prompt_template: str = gap_identification_prompt
diff --git a/yescieval/rubric/innovation.py b/yescieval/rubric/innovation.py
new file mode 100644
index 0000000..290405a
--- /dev/null
+++ b/yescieval/rubric/innovation.py
@@ -0,0 +1,109 @@
+from ..base import Rubric
+
+speculative_statements_prompt = """
+Scientific synthesis generation involves creating a concise, coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires:
+
+- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources.
+- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question.
+- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information.
+- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question.
+- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability.
+- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on correctness characteristic, ensuring it effectively communicates the synthesized information.
+
+In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness.
+
+
+
+You are tasked as a scientific syntheses quality evaluator.
+
+
+
+A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim.
+
+
+
+1. speculative_statement: Does the answer clearly distinguish speculation (e.g., “might,” “could”) from established findings in the provided abstracts?
+
+
+
+For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
+
+1. Speculative Statement
+Rating 1. Very bad: No innovation is present; the synthesis does not differ from prior work and may present speculation as fact.
+Rating 2. Bad: The synthesis shows little originality, relies on vague statements (e.g., “more research is needed”), and does not clearly distinguish from prior work.
+Rating 3. Moderate: The synthesis shows some originality, but the novel aspects are weak, underspecified, or not clearly differentiated from prior work.
+Rating 4. Good: The synthesis presents a clear novel angle or synthesis compared to prior work, with speculation appropriately flagged but limited in depth or specificity.
+Rating 5. Very good: The synthesis offers a genuinely novel synthesis or perspective, clearly distinguishes itself from prior work, appropriately bounds speculation, and proposes concrete, testable next steps.
+
+
+
+For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating.
+Return your response in JSON format: {characteristic : {"rating" : "", "rationale" : ""}}
+
+
+{
+ "speculative_statement": {"rating": "4", "rationale": "Uses hedging appropriately and clearly distinguishes speculation from established findings."}
+}
+
+
+
+
+Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material.
+"""
+class SpeculativeStatements(Rubric):
+ system_prompt_template: str = speculative_statements_prompt
+
+novelty_indicators_prompt = """
+Scientific synthesis generation involves creating a concise, coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires:
+
+- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources.
+- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question.
+- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information.
+- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question.
+- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability.
+- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on completeness characteristic, ensuring it effectively communicates the synthesized information.
+
+In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness.
+
+
+
+You are tasked as a scientific syntheses quality evaluator.
+
+
+
+A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim.
+
+
+
+1. novelty_indicators: Does the answer appropriately use self-declared innovation terms (e.g., “novel,” “pioneering,” “emerging”) and clearly indicate whether such claims are supported by the provided abstracts?
+
+
+
+For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
+
+1. Novelty Indicators
+Rating 1. Very bad: No novelty indicators are present, or novelty claims are incorrect or unsupported.
+Rating 2. Bad: Uses vague novelty claims (e.g., “more research is needed”) or presents speculation as fact, with no clear distinction from prior work.
+Rating 3. Moderate: Indicates some novelty, but the claims are weak, generic, or not clearly differentiated from prior work.
+Rating 4. Good: Shows a clear novel angle or synthesis compared to prior work, with speculation appropriately flagged but limited in detail.
+Rating 5. Very good: Clearly signals innovation with a distinct novel synthesis or perspective, properly bounds speculation, and proposes concrete, testable next steps.
+
+
+
+For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating.
+Return your response in JSON format: {characteristic : {"rating" : "", "rationale" : ""}}
+
+
+{
+ "novelty_indicators": {"rating": "4", "rationale": "Shows a clear novel angle, but lacks full detail."}
+}
+
+
+
+
+Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material.
+"""
+class NoveltyIndicators(Rubric):
+ system_prompt_template: str = novelty_indicators_prompt
+
+
diff --git a/yescieval/rubric/rigor.py b/yescieval/rubric/rigor.py
new file mode 100644
index 0000000..62c4aaf
--- /dev/null
+++ b/yescieval/rubric/rigor.py
@@ -0,0 +1,161 @@
+from ..base import Rubric
+
+statistical_sophistication_prompt = """
+Scientific synthesis generation involves creating a concise, coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires:
+
+- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources.
+- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question.
+- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information.
+- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question.
+- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability.
+- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on correctness characteristic, ensuring it effectively communicates the synthesized information.
+
+In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness.
+
+
+
+You are tasked as a scientific syntheses quality evaluator.
+
+
+
+A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim.
+
+
+
+1. statistical_sophistication: Does the answer reflect quantitative depth through the use of inferential statistics or analysis methods described in the abstracts?
+
+
+
+For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
+
+1. Statistical Sophistication
+Rating 1. Very bad: The synthesis includes claims made without any statistics or methods; no mention of uncertainty or limitations; no reproducibility signals.
+Rating 2. Bad: The synthesis contains very minimal methodological detail; very few statistics; limitations/uncertainty mostly ignored; reproducibility rarely addressed.
+Rating 3. Moderate: The synthesis includes some statistics or method details; mentions limitations or uncertainty in passing; limited reproducibility information.
+Rating 4. Good: The synthesis provides clear methods and statistics; acknowledges key limitations and uncertainties; provides some reproducibility signals (e.g., data, code, or baselines).
+Rating 5. Very good: The information in the synthesis includes detailed and transparent methodology; robust statistics; thoroughly discusses limitations and uncertainty; strong reproducibility signals provided.
+
+
+
+For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating.
+Return your response in JSON format: {characteristic: {"rating": "", "rationale": ""}}
+
+
+{
+ "statistical_sophistication": {"rating": "3", "rationale": "The synthesis provides some methodological details and basic statistics, but does not fully discuss limitations or reproducibility."}
+}
+
+
+
+
+Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material.
+"""
+class StatisticalSophistication(Rubric):
+ system_prompt_template: str = statistical_sophistication_prompt
+
+citation_practices_prompt = """
+Scientific synthesis generation involves creating a concise, coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires:
+
+- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources.
+- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question.
+- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information.
+- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question.
+- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability.
+- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on completeness characteristic, ensuring it effectively communicates the synthesized information.
+
+In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness.
+
+
+
+You are tasked as a scientific syntheses quality evaluator.
+
+
+
+A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim.
+
+
+
+1. citation_practices: is the answer supported by appropriate references, using parenthetical or narrative citations, for the relevant information in the provided abstracts?
+
+
+
+For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
+
+1. Citation Practices
+Rating 1. Very bad: The synthesis has no citations present; claims are unsupported.
+Rating 2. Bad: Very few citations; many claims lack references; citation style inconsistent.
+Rating 3. Moderate: The synthesis has some claims supported with citations; occasional missing or unclear references.
+Rating 4. Good: The synthesis has most claims that are supported by citations; proper use of parenthetical or narrative style.
+Rating 5. Very good: The synthesis includes all relevant claims supported by clear, correctly formatted citations; references fully cover the sources in the abstracts.
+
+
+
+For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating.
+Return your response in JSON format: {characteristic: {"rating": "", "rationale": ""}}
+
+
+{
+ "citation_practices": {"rating": "3", "rationale": "Some claims are supported with citations, but several important points lack references or use inconsistent citation style."}
+}
+
+
+
+
+Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material.
+"""
+class CitationPractices(Rubric):
+ system_prompt_template: str = citation_practices_prompt
+
+uncertainty_acknowledgement_prompt = """
+Scientific synthesis generation involves creating a concise, coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires:
+
+- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources.
+- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question.
+- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information.
+- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question.
+- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability.
+- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on informativeness characteristic, ensuring it effectively communicates the synthesized information.
+
+In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness.
+
+
+
+You are tasked as a scientific syntheses quality evaluator.
+
+
+
+A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim.
+
+
+
+1. uncertainty_acknowledgement: does the answer explicitly discuss limitations, uncertainty, or gaps in evidence (e.g., using terms like “unknown,” “limited evidence,” or “unclear”)?
+
+
+
+For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
+
+1. Uncertainty Acknowledgement
+Rating 1. Very bad: The synthesis offers no mention of uncertainty, limitations, or gaps in evidence.
+Rating 2. Bad: The answer provides very limited acknowledgement of uncertainty; most claims presented as certain.
+Rating 3. Moderate: The answer has some uncertainty or limitations mentioned, but coverage is incomplete or vague.
+Rating 4. Good: The answer has clear acknowledgement of key uncertainties, limitations, or potential biases.
+Rating 5. Very good: The synthesis has thorough discussion of uncertainty, limitations, and potential biases for all relevant claims; clearly signals gaps in evidence.
+
+
+
+For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating.
+Return your response in JSON format: {characteristic: {"rating": "", "rationale": ""}}
+
+
+{
+ "uncertainty_acknowledgement": {"rating": "4", "rationale": "The answer clearly acknowledges key uncertainties and limitations in the study."}
+}
+
+
+
+
+Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material.
+"""
+class UncertaintyAcknowledgment(Rubric):
+ system_prompt_template: str = uncertainty_acknowledgement_prompt
+
From 082abb679ce4c53c6f9b227cd7ee9cf84dfc65d1 Mon Sep 17 00:00:00 2001
From: Mike Ashley Cedric
Date: Thu, 18 Dec 2025 21:10:47 +0100
Subject: [PATCH 02/12] =?UTF-8?q?=F0=9F=93=9D=20Added=20imports=20for=20th?=
=?UTF-8?q?e=20rubrics?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
yescieval/__init__.py | 3 ++-
yescieval/rubric/__init__.py | 11 ++++++++++-
2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/yescieval/__init__.py b/yescieval/__init__.py
index 9e161b4..ec92eb8 100644
--- a/yescieval/__init__.py
+++ b/yescieval/__init__.py
@@ -3,7 +3,8 @@
from .base import Rubric, Parser
from .rubric import (Informativeness, Correctness, Completeness, Coherence, Relevancy,
- Integration, Cohesion, Readability, Conciseness)
+ Integration, Cohesion, Readability, Conciseness, GeographicCoverage,
+ InterventionDiversity, BiodiversityDimensions, EcosystemServices, SpatialScale, MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification, StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment, SpeculativeStatements, NoveltyIndicators)
from .judge import AutoJudge, AskAutoJudge, BioASQAutoJudge, CustomAutoJudge
from .parser import GPTParser
diff --git a/yescieval/rubric/__init__.py b/yescieval/rubric/__init__.py
index 79400d5..262818b 100644
--- a/yescieval/rubric/__init__.py
+++ b/yescieval/rubric/__init__.py
@@ -1,7 +1,16 @@
from .informativeness import Informativeness, Correctness, Completeness
from .structural import Coherence, Relevancy, Integration
from .stylistic import Cohesion, Readability, Conciseness
+from .breadth import GeographicCoverage, InterventionDiversity, BiodiversityDimensions, EcosystemServices, SpatialScale
+from .depth import MechanisticUnderstanding, CausalReasoning, TemporalPrecision
+from .gap import GapIdentification
+from .rigor import StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment
+from .innovation import SpeculativeStatements, NoveltyIndicators
__all__ = ["Informativeness", "Correctness", "Completeness",
"Coherence", "Relevancy", "Integration",
- "Cohesion", "Readability", "Conciseness"]
+ "Cohesion", "Readability", "Conciseness", "GeographicCoverage",
+ "InterventionDiversity", "BiodiversityDimensions", "EcosystemServices",
+ "SpatialScale", "MechanisticUnderstanding", "CausalReasoning", "TemporalPrecision",
+ "GapIdentification", "StatisticalSophistication", "CitationPractices",
+ "UncertaintyAcknowledgment", "SpeculativeStatements", "NoveltyIndicators"]
From 6b6d96a5ce794dadc0145a5da8f37e961f3e0e59 Mon Sep 17 00:00:00 2001
From: MikeACedric <72818458+MikeACedric@users.noreply.github.com>
Date: Fri, 19 Dec 2025 01:00:03 +0100
Subject: [PATCH 03/12] =?UTF-8?q?=F0=9F=93=9D=20Added=20Prompts=20for=20th?=
=?UTF-8?q?e=20Dimension=20-=20Depth?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
yescieval/rubric/depth.py | 49 +++++++++++++++++++--------------------
1 file changed, 24 insertions(+), 25 deletions(-)
diff --git a/yescieval/rubric/depth.py b/yescieval/rubric/depth.py
index 9334572..04aeb00 100644
--- a/yescieval/rubric/depth.py
+++ b/yescieval/rubric/depth.py
@@ -22,19 +22,18 @@
-1. Correctness: is the information in the answer a correct representation of the content of the provided abstracts?
+1. mechanistic_understanding: does the answer reflect understanding of ecological processes by explicitly mentioning recognized mechanisms such as feedbacks, nutrient cycling, or trophic cascades?
For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
-1. Correctness
-Rating 1. Very bad: The synthesis consistently misrepresents or inaccurately portrays the content of the provided abstracts, showing a significant deviation from the original sources.
-Rating 2. Bad: The synthesis contains several inaccuracies or misinterpretations of the source abstracts.
-Rating 3. Moderate: The synthesis accurately represents most of the content from the provided abstracts but may contain minor errors.
-Rating 4. Good: The synthesis provides an accurate representation of the content from the provided abstracts with minor exceptions.
-Rating 5. Very good: The information in the synthesis is an accurate and faithful representation of the content from the provided abstracts, without any factual errors or misinterpretations.
-
+1. Mechanistic Understanding
+Rating 1. Very bad: The synthesis contains only vague statements (e.g., “X affects Y”) with no explanation of how or why; no causal language or mechanisms.
+Rating 2. Bad: The synthesis mentions a relationship but remains single-step; mechanisms are implied but not described; no mediators or temporal aspects.
+Rating 3. Moderate: The synthesis identifies at least one mechanism or causal link but lacks depth; limited causal connectors and no explicit assumptions or timing.
+Rating 4. Good: The synthesis describes multi-step mechanisms (driver → mediator → outcome) using causal language; may include some temporal or conditional detail.
+Rating 5. Very good: The information in the synthesis provides detailed, explicit multi-step causal mechanisms with clear mediators, temporal specificity, and stated assumptions or boundary conditions.
For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating.
@@ -42,7 +41,7 @@
{
- "Correctness": {"rating": "4", "rationale": "The synthesis represents the content of the provided abstract, but with minor inrelevant information."}
+ "mechanistic_understanding": {"rating": "4", "rationale": "The answer explains a clear multi-step ecological mechanism using causal language, but some temporal or boundary details are only briefly addressed."}
}
@@ -75,18 +74,18 @@ class MechanisticUnderstanding(Rubric):
-1. Completeness: is the answer a comprehensive encapsulation of the relevant information in the provided abstracts?
+1. causal_reasoning: does the answer explicitly express cause–effect relationships using causal connectives (e.g., “because,” “due to”), result indicators (e.g., “results in,” “induces”), or mechanistic verbs (e.g., “drives,” “regulates”) when describing ecological processes?
For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
-1. Completeness
-Rating 1. Very bad: The synthesis omits most of the relevant information, failing to capture the essential points or details from the provided abstracts.
-Rating 2. Bad: Significant portions of relevant information from the provided abstracts are missing.
-Rating 3. Moderate: The synthesis captures a fair amount of the relevant information, though it may overlook some details.
-Rating 4. Good: The synthesis includes almost all relevant information, missing only minor details.
-Rating 5. Very good: The synthesis comprehensively encapsulates all relevant information from the provided abstracts, leaving no pertinent details or points unaddressed.
+1. Causal Reasoning
+Rating 1. Very bad: The synthesis uses vague statements (e.g., “X affects Y”) with no causal connectors.
+Rating 2. Bad: The synthesis identifies a cause–effect relationship but only as a single-step claim; causal language is minimal and mediators are ignored.
+Rating 3. Moderate: The synthesis includes explicit causal connectors or verbs and at least one cause–effect link, but remains shallow.
+Rating 4. Good: The synthesis describes multi-step causal chains (driver → mediator → outcome) using clear causal language.
+Rating 5. Very good: The synthesis presents detailed, explicit multi-step causal reasoning with clear mediators, temporal specificity, and stated assumptions or boundary conditions.
@@ -95,7 +94,7 @@ class MechanisticUnderstanding(Rubric):
{
- "Completeness": {"rating": "4", "rationale": "Only minor details are missing in the synthesis."}
+ "causal_reasoning": {"rating": "4", "rationale": "The answer uses clear causal connectors and describes a multi-step cause–effect relationship."}
}
@@ -128,18 +127,18 @@ class CausalReasoning(Rubric):
-1. Informativeness: is the answer a useful and informative reply to the question?
+1. temporal_precision: does the answer include specific and explicit temporal references, such as quantified time intervals or dated events, rather than vague or unspecific timing?
For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
-1. Informativeness
-Rating 1. Very bad: The synthesis offers no valuable insights or useful information in response to the research question, lacking depth and utility.
-Rating 2. Bad: The answer provides limited new insights or useful information in response to the research question.
-Rating 3. Moderate: The answer is somewhat informative, offering insights or useful information but not in a comprehensive or detailed manner.
-Rating 4. Good: The answer is informative and insightful, providing comprehensive information in response to the research question.
-Rating 5. Very good: The synthesis is highly informative, providing valuable insights and detailed information that thoroughly addresses the research question.
+1. Temporal Precision
+Rating 1. Very bad: The synthesis contains no temporal references; timing is entirely vague or absent.
+Rating 2. Bad: The answer includes implicit or generic timing (e.g., “over time,” “eventually”) but no specific intervals, dates, or durations.
+Rating 3. Moderate: The answer provides at least one explicit temporal reference (e.g., a rough duration or time window) but lacks consistency or clear linkage to effects.
+Rating 4. Good: The answer includes multiple specific temporal references (e.g., quantified intervals or dated events) that are clearly tied to described processes or outcomes.
+Rating 5. Very good: The answer demonstrates high temporal precision, with detailed and explicit timeframes (lags, durations, windows, or dates) systematically linked to multi-step processes and their effects.
@@ -148,7 +147,7 @@ class CausalReasoning(Rubric):
{
- "Informativeness": {"rating": "4", "rationale": "Most information is informative for the research question."}
+ "temporal_precision": {"rating": "4", "rationale": "The answer includes several specific timeframes or durations that are clearly linked to the described processes, though some timing details could be more precise."}
}
From 47cbe0fb54123d78b17cee4634c4a5c49c5a451e Mon Sep 17 00:00:00 2001
From: MikeACedric <72818458+MikeACedric@users.noreply.github.com>
Date: Fri, 19 Dec 2025 02:27:03 +0100
Subject: [PATCH 04/12] =?UTF-8?q?=F0=9F=93=9D=20Added=20Prompt=20for=20the?=
=?UTF-8?q?=20Dimension:=20Breadth,=20Updated=20documentation=20with=20the?=
=?UTF-8?q?=20addition=20of=20new=20rubrics.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
docs/source/rubrics.rst | 104 ++++++++++++++++++++++++++++++++++--
yescieval/__init__.py | 5 +-
yescieval/rubric/breadth.py | 80 +++++++++++++--------------
3 files changed, 144 insertions(+), 45 deletions(-)
diff --git a/docs/source/rubrics.rst b/docs/source/rubrics.rst
index 3e78f14..e264f21 100644
--- a/docs/source/rubrics.rst
+++ b/docs/source/rubrics.rst
@@ -2,7 +2,7 @@
Rubrics
===================
-A total of nine evaluation rubrics were defined as part of the YESciEval test framework.
+A total of twenty-three (23) evaluation rubrics were defined as part of the YESciEval test framework.
Linguistic & Stylistic Quality
---------------------------------
@@ -59,6 +59,99 @@ Following ``Content Accuracy & Informativeness`` ensures that the response is bo
* - **9. Informativeness:**
- Is the answer a useful and informative reply to the problem?
+Research Depth Assessment
+---------------------------------
+
+Following ``Research Depth Assessment`` quantifies the mechanistic and analytical sophistication of synthesis outputs.
+
+
+.. list-table::
+ :header-rows: 1
+ :widths: 20 80
+
+ * - Evaluation Rubric
+ - Description
+ * - **10. Mechanistic Understanding:**
+ - Does the answer show understanding of ecological processes, using indicators like “feedback,” “nutrient cycling,” or “trophic cascade”?
+ * - **11. Causal Reasoning:**
+ - Does the answer show clear cause-effect relationships using words like “because,” “results in,” or “drives”?
+ * - **12. Temporal Precision:**
+ - Does the answer include specific time references, like intervals (“within 6 months”) or dates (“1990–2020”)?
+
+Research Breadth Assessment
+---------------------------------
+
+Following ``Research Breadth Assessment`` evaluates the diversity of evidence across spatial, ecological, and methodological contexts.
+
+
+.. list-table::
+ :header-rows: 1
+ :widths: 20 80
+
+ * - Evaluation Rubric
+ - Description
+ * - **13. Geographic Coverage:**
+ - Does the answer cover multiple biogeographic zones, such as “Tropical” or “Boreal”?
+ * - **14. Intervention Diversity:**
+ - Does the answer include a variety of management practices?
+ * - **15. Biodiversity Dimensions:**
+ - Does the answer mention different aspects of biodiversity, like taxonomic, functional, phylogenetic, or spatial diversity?
+ * - **16. Ecosystem Services:**
+ - Does the answer include relevant ecosystem services, based on the Millennium Ecosystem Assessment vocabulary?
+ * - **17. Spatial Scale:**
+ - Does the answer specify the spatial scale, using terms like “local,” “regional,” or “continental” and area measures?
+
+Scientific Rigor Assessment
+---------------------------------
+
+Following ``Scientific Rigor Assessment`` assesses the evidentiary and methodological integrity of the synthesis.
+
+
+.. list-table::
+ :header-rows: 1
+ :widths: 20 80
+
+ * - Evaluation Rubric
+ - Description
+ * - **18. Statistical Sophistication:**
+ - Does the answer use statistical methods or analyses, showing quantitative rigor and depth?
+ * - **19. Citation Practices:**
+ - Does the answer properly cite sources, using parenthetical or narrative citations (e.g., “(Smith et al., 2021)”)?
+ * - **20. Uncertainty Acknowledgment:**
+ - Does the answer explicitly mention limitations or uncertainty, using terms like “unknown,” “limited evidence,” or “unclear”?
+
+Innovation Capacity Assessment
+---------------------------------
+
+Following ``Innovation Capacity Assessment`` evaluates the novelty of the synthesis.
+
+
+.. list-table::
+ :header-rows: 1
+ :widths: 20 80
+
+ * - Evaluation Rubric
+ - Description
+ * - **21. Speculative Statements:**
+ - Does the answer include cautious or hypothetical statements, using words like “might,” “could,” or “hypothetical”?
+ * - **22. Novelty Indicators:**
+ - Does the answer highlight innovation using terms like “novel,” “pioneering,” or “emerging”?
+
+
+Research Gap Assessment
+---------------------------------
+
+Following ``Research Gap Assessment`` detects explicit acknowledgment of unanswered questions or understudied areas in the synthesis.
+
+
+.. list-table::
+ :header-rows: 1
+ :widths: 20 80
+
+ * - Evaluation Rubric
+ - Description
+ * - **23. Gap Identification:**
+ - Does the answer point out unanswered questions or understudied areas, using terms like “research gap” or “understudied”?
Usage Example
@@ -68,9 +161,12 @@ Here is a simple example of how to import rubrics in your code:
.. code-block:: python
- from yescieval import Informativeness, Correctness, Completeness,
- Coherence, Relevancy, Integration,
- Cohesion, Readability, Conciseness
+ from yescieval import Informativeness, Correctness, Completeness, Coherence, Relevancy,
+ Integration, Cohesion, Readability, Conciseness, GeographicCoverage,
+ InterventionDiversity, BiodiversityDimensions, EcosystemServices, SpatialScale,
+ MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification,
+ StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment,
+ SpeculativeStatements, NoveltyIndicators
And to use rubrics:
diff --git a/yescieval/__init__.py b/yescieval/__init__.py
index ec92eb8..25974c8 100644
--- a/yescieval/__init__.py
+++ b/yescieval/__init__.py
@@ -4,7 +4,10 @@
from .base import Rubric, Parser
from .rubric import (Informativeness, Correctness, Completeness, Coherence, Relevancy,
Integration, Cohesion, Readability, Conciseness, GeographicCoverage,
- InterventionDiversity, BiodiversityDimensions, EcosystemServices, SpatialScale, MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification, StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment, SpeculativeStatements, NoveltyIndicators)
+ InterventionDiversity, BiodiversityDimensions, EcosystemServices, SpatialScale,
+ MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification,
+ StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment,
+ SpeculativeStatements, NoveltyIndicators)
from .judge import AutoJudge, AskAutoJudge, BioASQAutoJudge, CustomAutoJudge
from .parser import GPTParser
diff --git a/yescieval/rubric/breadth.py b/yescieval/rubric/breadth.py
index 2f3fc28..efaf0bc 100644
--- a/yescieval/rubric/breadth.py
+++ b/yescieval/rubric/breadth.py
@@ -22,18 +22,18 @@
-1. Correctness: is the information in the answer a correct representation of the content of the provided abstracts?
+1. geographic_coverage: is the information in the answer a correct representation of the spatial scope of the provided abstracts?
For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
-1. Correctness
-Rating 1. Very bad: The synthesis consistently misrepresents or inaccurately portrays the content of the provided abstracts, showing a significant deviation from the original sources.
-Rating 2. Bad: The synthesis contains several inaccuracies or misinterpretations of the source abstracts.
-Rating 3. Moderate: The synthesis accurately represents most of the content from the provided abstracts but may contain minor errors.
-Rating 4. Good: The synthesis provides an accurate representation of the content from the provided abstracts with minor exceptions.
-Rating 5. Very good: The information in the synthesis is an accurate and faithful representation of the content from the provided abstracts, without any factual errors or misinterpretations.
+1. Geographic Coverage
+Rating 1. Very bad: The synthesis consistently misrepresents or inaccurately portrays the geographic scope of the provided abstracts, covering only a single context or ignoring relevant regions.
+Rating 2. Bad: The synthesis represents some regions correctly but overlooks several important biogeographic zones or scales, showing limited breadth.
+Rating 3. Moderate: The synthesis captures most relevant regions and some scale diversity, but may miss minor zones or nuances in spatial coverage.
+Rating 4. Good: The synthesis provides a broad and accurate representation of multiple regions and scales, triangulating evidence across sources with minor omissions.
+Rating 5. Very good: The synthesis comprehensively represents all relevant regions, scales, and contexts from the provided abstracts, accurately covering the geographic breadth without omissions.
@@ -42,7 +42,7 @@
{
- "Correctness": {"rating": "4", "rationale": "The synthesis represents the content of the provided abstract, but with minor inrelevant information."}
+ "geographic_coverage": {"rating": "4", "rationale": "The synthesis accurately represents multiple regions and scales from the provided abstracts, with only minor omissions or irrelevant details."}
}
@@ -75,18 +75,18 @@ class GeographicCoverage(Rubric):
-1. Completeness: is the answer a comprehensive encapsulation of the relevant information in the provided abstracts?
+1. intervention_diversity: is the answer a comprehensive encapsulation of the relevant information in the provided abstracts, measured by the number of unique management practices?
For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
-1. Completeness
-Rating 1. Very bad: The synthesis omits most of the relevant information, failing to capture the essential points or details from the provided abstracts.
-Rating 2. Bad: Significant portions of relevant information from the provided abstracts are missing.
-Rating 3. Moderate: The synthesis captures a fair amount of the relevant information, though it may overlook some details.
-Rating 4. Good: The synthesis includes almost all relevant information, missing only minor details.
-Rating 5. Very good: The synthesis comprehensively encapsulates all relevant information from the provided abstracts, leaving no pertinent details or points unaddressed.
+1. Intervention Diversity
+Rating 1. Very bad: The synthesis omits most of the relevant interventions, capturing very few management practices from the provided abstracts.
+Rating 2. Bad: The synthesis misses several important interventions, representing only a limited subset of management practices.
+Rating 3. Moderate: The synthesis captures a fair number of interventions, but some relevant management practices are overlooked.
+Rating 4. Good: The synthesis includes nearly all relevant interventions, missing only minor management practices.
+Rating 5. Very good: The synthesis comprehensively captures all relevant interventions and management practices from the provided abstracts, without omissions.
@@ -95,7 +95,7 @@ class GeographicCoverage(Rubric):
{
- "Completeness": {"rating": "4", "rationale": "Only minor details are missing in the synthesis."}
+ "intervention_diversity": {"rating": "4", "rationale": "The answer includes almost all relevant interventions from the provided abstracts, with only minor details missing."}
}
@@ -128,18 +128,18 @@ class InterventionDiversity(Rubric):
-1. Informativeness: is the answer a useful and informative reply to the question?
+1. biodiversity_dimensions: is the answer a comprehensive representation of the relevant biodiversity information in the provided abstracts, measured by the presence of terms related to taxonomic, functional, phylogenetic, and spatial diversity?
For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
-1. Informativeness
-Rating 1. Very bad: The synthesis offers no valuable insights or useful information in response to the research question, lacking depth and utility.
-Rating 2. Bad: The answer provides limited new insights or useful information in response to the research question.
-Rating 3. Moderate: The answer is somewhat informative, offering insights or useful information but not in a comprehensive or detailed manner.
-Rating 4. Good: The answer is informative and insightful, providing comprehensive information in response to the research question.
-Rating 5. Very good: The synthesis is highly informative, providing valuable insights and detailed information that thoroughly addresses the research question.
+1. Biodiversity Dimensions
+Rating 1. Very bad: The synthesis omits most of the relevant biodiversity information, capturing very few or none of the taxonomic, functional, phylogenetic, or spatial diversity aspects.
+Rating 2. Bad: The synthesis covers some biodiversity dimensions but misses several key aspects or contexts.
+Rating 3. Moderate: The synthesis captures a fair number of biodiversity dimensions, but some relevant terms or contexts are overlooked.
+Rating 4. Good: The synthesis includes nearly all relevant biodiversity dimensions, touching multiple contexts and scales, with only minor omissions.
+Rating 5. Very good: The synthesis comprehensively captures all relevant biodiversity dimensions from the provided abstracts, accurately representing taxonomic, functional, phylogenetic, and spatial diversity without omissions.
@@ -148,7 +148,7 @@ class InterventionDiversity(Rubric):
{
- "Informativeness": {"rating": "4", "rationale": "Most information is informative for the research question."}
+ "biodiversity_dimensions": {"rating": "4", "rationale": "Most information is informative for the research question, capturing the key biodiversity dimensions with minor omissions."}
}
@@ -181,18 +181,18 @@ class BiodiversityDimensions(Rubric):
-1. Informativeness: is the answer a useful and informative reply to the question?
+1. ecosystem_services: is the answer a useful and informative reply to the question, measured by the presence of terms matched against a vocabulary aligned with the Millennium Ecosystem Assessment?
For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
-1. Informativeness
-Rating 1. Very bad: The synthesis offers no valuable insights or useful information in response to the research question, lacking depth and utility.
-Rating 2. Bad: The answer provides limited new insights or useful information in response to the research question.
-Rating 3. Moderate: The answer is somewhat informative, offering insights or useful information but not in a comprehensive or detailed manner.
-Rating 4. Good: The answer is informative and insightful, providing comprehensive information in response to the research question.
-Rating 5. Very good: The synthesis is highly informative, providing valuable insights and detailed information that thoroughly addresses the research question.
+1. Ecosystem Services
+Rating 1. Very bad: The synthesis omits most relevant ecosystem services, capturing very few or none of the terms from the Millennium Ecosystem Assessment vocabulary.
+Rating 2. Bad: The synthesis covers some ecosystem services but misses several key services or contexts.
+Rating 3. Moderate: The synthesis captures a fair number of ecosystem services, but some relevant terms or contexts are overlooked.
+Rating 4. Good: The synthesis includes nearly all relevant ecosystem services, touching multiple contexts and scales, with only minor omissions.
+Rating 5. Very good: The synthesis comprehensively captures all relevant ecosystem services from the provided abstracts, accurately representing terms aligned with the Millennium Ecosystem Assessment vocabulary without omissions.
@@ -201,7 +201,7 @@ class BiodiversityDimensions(Rubric):
{
- "Informativeness": {"rating": "4", "rationale": "Most information is informative for the research question."}
+ "ecosystem_services": {"rating": "4", "rationale": "The synthesis includes nearly all relevant ecosystem services from the provided abstracts, with only minor omissions."}
}
@@ -234,18 +234,18 @@ class EcosystemServices(Rubric):
-1. Informativeness: is the answer a useful and informative reply to the question?
+1. spatial_scale: is the answer a useful and informative reply to the question, measured by the presence of explicit scale terms (e.g., “local,” “regional,” “continental”) and area measures?
For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic.
-1. Informativeness
-Rating 1. Very bad: The synthesis offers no valuable insights or useful information in response to the research question, lacking depth and utility.
-Rating 2. Bad: The answer provides limited new insights or useful information in response to the research question.
-Rating 3. Moderate: The answer is somewhat informative, offering insights or useful information but not in a comprehensive or detailed manner.
-Rating 4. Good: The answer is informative and insightful, providing comprehensive information in response to the research question.
-Rating 5. Very good: The synthesis is highly informative, providing valuable insights and detailed information that thoroughly addresses the research question.
+1. Spatial Scale
+Rating 1. Very bad: The synthesis omits most relevant spatial scale information, capturing very few or none of the scale terms or area measures.
+Rating 2. Bad: The synthesis covers some scale information but misses several key scales or contexts.
+Rating 3. Moderate: The synthesis captures a fair amount of spatial scale information, but some relevant terms or area measures are overlooked.
+Rating 4. Good: The synthesis includes nearly all relevant spatial scale information, touching multiple scales and contexts, with only minor omissions.
+Rating 5. Very good: The synthesis comprehensively captures all relevant spatial scale information from the provided abstracts, accurately representing scale terms and area measures without omissions.
@@ -254,7 +254,7 @@ class EcosystemServices(Rubric):
{
- "Informativeness": {"rating": "4", "rationale": "Most information is informative for the research question."}
+ "spatial_scale": {"rating": "4", "rationale": "The synthesis includes nearly all relevant spatial scale information from the provided abstracts, with only minor omissions."}
}
From 57f77b23ed4b2056ef706cda10fa3f0ab31680c4 Mon Sep 17 00:00:00 2001
From: MikeACedric <72818458+MikeACedric@users.noreply.github.com>
Date: Fri, 19 Dec 2025 03:17:51 +0100
Subject: [PATCH 05/12] :pencil: Added an example run for Gap Identification
with Custom Judge
---
docs/source/quickstart.rst | 39 +++++++++++++++++++++++++++++++++++++-
1 file changed, 38 insertions(+), 1 deletion(-)
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
index f266e6d..af22303 100644
--- a/docs/source/quickstart.rst
+++ b/docs/source/quickstart.rst
@@ -1,7 +1,7 @@
Quickstart
=================
-YESciEval is a library designed to evaluate the quality of synthesized scientific answers using predefined rubrics and advanced LLM-based judgment models. This guide walks you through how to evaluate answers based on **informativeness** using a pretrained judge and parse LLM output into structured JSON.
+YESciEval is a library designed to evaluate the quality of synthesized scientific answers using predefined rubrics and advanced LLM-based judgment models. This guide walks you through how to evaluate answers based on **informativeness** & **gap identification** using a pretrained & a custom judge and parse LLM output into structured JSON.
**Example: Evaluating an Answer Using Informativeness + AskAutoJudge**
@@ -46,6 +46,43 @@ YESciEval is a library designed to evaluate the quality of synthesized scientifi
- Use the ``device="cuda"`` if running on GPU for better performance.
- Add more rubrics such as ``Informativeness``, ``Relevancy``, etc for multi-criteria evaluation.
+
+**Example: Evaluating an Answer Using GapIdentification + CustomAutoJudge**
+
+.. code-block:: python
+
+ from yescieval import GapIdentification, CustomAutoJudge, GPTParser
+
+ # Sample papers used in form of {"title": "abstract", ... }
+ papers = {
+ "A Study on AI": "This paper discusses recent advances in artificial intelligence, including deep learning.",
+ "Machine Learning Basics": "An overview of supervised learning methods such as decision trees and SVMs.",
+ "Neural Networks Explained": "Explains backpropagation and gradient descent for training networks.",
+ "Ethics in AI": "Explores ethical concerns in automated decision-making systems.",
+ "Applications of AI in Healthcare": "Details how AI improves diagnostics and personalized medicine."
+ }
+
+ # Input question and synthesized answer
+ question = "How is AI used in modern healthcare systems?"
+ answer = (
+ "AI is being used in healthcare for diagnosing diseases, predicting patient outcomes, "
+ "and assisting in treatment planning. It also supports personalized medicine and medical imaging."
+ )
+
+ # Step 1: Create a rubric
+ rubric = GapIdentification(papers=papers, question=question, answer=answer)
+ instruction_prompt = rubric.instruct()
+
+ # Step 2: Load the evaluation model (judge)
+ judge = CustomAutoJudge()
+ judge.from_pretrained(model_id="Qwen/Qwen3-8B", device="cpu", token="your_huggingface_token")
+
+ # Step 3: Evaluate the answer
+ result = judge.evaluate(rubric=rubric)
+
+ print("Raw Evaluation Output:")
+ print(result)
+
**Parsing Raw Output with GPTParser**
If the model outputs unstructured or loosely structured text, you can use GPTParser to parse it into valid JSON.
From 535be20241099c7e58e089e101f131d16504b5ef Mon Sep 17 00:00:00 2001
From: MikeACedric <72818458+MikeACedric@users.noreply.github.com>
Date: Fri, 19 Dec 2025 12:19:49 +0100
Subject: [PATCH 06/12] =?UTF-8?q?=F0=9F=93=9D=20updated=20the=20gitignore?=
=?UTF-8?q?=20file?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.gitignore | 1 +
1 file changed, 1 insertion(+)
diff --git a/.gitignore b/.gitignore
index 80419c2..2f94a8b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -133,6 +133,7 @@ celerybeat.pid
# Environments
.env
.venv
+.myenv
env/
venv/
ENV/
From 52932b968c974615a99233b1d15ebaefbae7d1c5 Mon Sep 17 00:00:00 2001
From: MikeACedric <72818458+MikeACedric@users.noreply.github.com>
Date: Fri, 19 Dec 2025 12:20:35 +0100
Subject: [PATCH 07/12] =?UTF-8?q?=F0=9F=93=9D=20updated=20the=20judge.py?=
=?UTF-8?q?=20file,=20added=20fix=20for=20tokenizer=20issue?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
yescieval/judge/judges.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/yescieval/judge/judges.py b/yescieval/judge/judges.py
index 2c436f0..00c736d 100644
--- a/yescieval/judge/judges.py
+++ b/yescieval/judge/judges.py
@@ -42,13 +42,13 @@ class AskAutoJudge(AutoJudge):
def from_pretrained(self, model_id:str="SciKnowOrg/YESciEval-ASK-Llama-3.1-8B",
device:str="auto",
token:str =""):
- return super()._from_pretrained(model_id=model_id, device=device, token=token)
+ self.model, self.tokenizer = super()._from_pretrained(model_id=model_id, device=device, token=token)
class BioASQAutoJudge(AutoJudge):
def from_pretrained(self, model_id: str = "SciKnowOrg/YESciEval-BioASQ-Llama-3.1-8B",
device: str = "auto",
token: str = ""):
- return super()._from_pretrained(model_id=model_id, device=device, token=token)
+ self.model, self.tokenizer = super()._from_pretrained(model_id=model_id, device=device, token=token)
From 89fda56a60f72790cc641fb223b6c55c79868004 Mon Sep 17 00:00:00 2001
From: MikeACedric <72818458+MikeACedric@users.noreply.github.com>
Date: Sat, 20 Dec 2025 00:37:48 +0100
Subject: [PATCH 08/12] updated gitignore file to remove venv from staged
changes
---
.gitignore | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.gitignore b/.gitignore
index 2f94a8b..446b470 100644
--- a/.gitignore
+++ b/.gitignore
@@ -133,7 +133,7 @@ celerybeat.pid
# Environments
.env
.venv
-.myenv
+myenv/
env/
venv/
ENV/
From f34e10c29437fadff8ee26bbb3dcdf9517b52824 Mon Sep 17 00:00:00 2001
From: MikeACedric <72818458+MikeACedric@users.noreply.github.com>
Date: Sat, 20 Dec 2025 00:44:02 +0100
Subject: [PATCH 09/12] =?UTF-8?q?=F0=9F=93=9D=20updated=20the=20example=20?=
=?UTF-8?q?run=20script=20in=20the=20documentation?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
docs/source/quickstart.rst | 21 ++-------------------
1 file changed, 2 insertions(+), 19 deletions(-)
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
index af22303..fdbb3e3 100644
--- a/docs/source/quickstart.rst
+++ b/docs/source/quickstart.rst
@@ -1,7 +1,7 @@
Quickstart
=================
-YESciEval is a library designed to evaluate the quality of synthesized scientific answers using predefined rubrics and advanced LLM-based judgment models. This guide walks you through how to evaluate answers based on **informativeness** & **gap identification** using a pretrained & a custom judge and parse LLM output into structured JSON.
+YESciEval is a library designed to evaluate the quality of synthesized scientific answers using predefined rubrics and advanced LLM-based judgment models. This guide walks you through how to evaluate answers based on **informativeness** and **gap identification** using a pretrained and a custom judge, and how to parse LLM output into structured JSON.
**Example: Evaluating an Answer Using Informativeness + AskAutoJudge**
@@ -51,23 +51,7 @@ YESciEval is a library designed to evaluate the quality of synthesized scientifi
.. code-block:: python
- from yescieval import GapIdentification, CustomAutoJudge, GPTParser
-
- # Sample papers used in form of {"title": "abstract", ... }
- papers = {
- "A Study on AI": "This paper discusses recent advances in artificial intelligence, including deep learning.",
- "Machine Learning Basics": "An overview of supervised learning methods such as decision trees and SVMs.",
- "Neural Networks Explained": "Explains backpropagation and gradient descent for training networks.",
- "Ethics in AI": "Explores ethical concerns in automated decision-making systems.",
- "Applications of AI in Healthcare": "Details how AI improves diagnostics and personalized medicine."
- }
-
- # Input question and synthesized answer
- question = "How is AI used in modern healthcare systems?"
- answer = (
- "AI is being used in healthcare for diagnosing diseases, predicting patient outcomes, "
- "and assisting in treatment planning. It also supports personalized medicine and medical imaging."
- )
+ from yescieval import GapIdentification, CustomAutoJudge
# Step 1: Create a rubric
rubric = GapIdentification(papers=papers, question=question, answer=answer)
@@ -79,7 +63,6 @@ YESciEval is a library designed to evaluate the quality of synthesized scientifi
# Step 3: Evaluate the answer
result = judge.evaluate(rubric=rubric)
-
print("Raw Evaluation Output:")
print(result)
From 7a07b20016897de07e1e7fc1a7e60139fc77038e Mon Sep 17 00:00:00 2001
From: Hamed Babaei Giglou
Date: Sat, 20 Dec 2025 15:42:15 +0100
Subject: [PATCH 10/12] :memo: update readme details
---
README.md | 46 +++++++++++++++++++++++++++++++++-------------
1 file changed, 33 insertions(+), 13 deletions(-)
diff --git a/README.md b/README.md
index 5dcf9a2..faba342 100644
--- a/README.md
+++ b/README.md
@@ -87,32 +87,52 @@ Judges within YESciEval are defined as follows:
| `AutoJudge` | Base class for loading and running evaluation models with PEFT adapters. |
| `AskAutoJudge` | Multidisciplinary judge tuned on the ORKGSyn dataset from the Open Research Knowledge Graph. |
| `BioASQAutoJudge` | Biomedical domain judge tuned on the BioASQ dataset from the BioASQ challenge. |
-| `CustomAutoJudge`| Custom LLM that can be used as a judge within YESciEval rubrics |
+| `CustomAutoJudge`| Custom LLM (open-source LLMs) that can be used as a judge within YESciEval rubrics |
-A total of nine evaluation rubrics were defined as part of the YESciEval test framework and can be used via ``yescieval``. Following simple example shows how to import rubrics in your code:
+A total of **23** evaluation rubrics were defined as part of the YESciEval test framework and can be used via ``yescieval``. The following simple example shows how to import rubrics in your code:
```python
-from yescieval import Informativeness, Correctness, Completeness,
- Coherence, Relevancy, Integration,
- Cohesion, Readability, Conciseness
+from yescieval import Informativeness, Correctness, Completeness, Coherence, Relevancy, \
+ Integration, Cohesion, Readability, Conciseness, GeographicCoverage, \
+ InterventionDiversity, BiodiversityDimensions, EcosystemServices, SpatialScale, \
+ MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification, \
+ StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment, \
+ SpeculativeStatements, NoveltyIndicators
+
```
A complete list of rubrics are available at YESciEval [📚 Rubrics](https://yescieval.readthedocs.io/rubrics.html) page.
## 💡 Acknowledgements
-If you use YESciEval in your research, please cite:
+If you find this repository helpful or use YESciEval in your work or research, feel free to cite our publication:
+
```bibtex
-@article{d2025yescieval,
- title={YESciEval: Robust LLM-as-a-Judge for Scientific Question Answering},
- author={D'Souza, Jennifer and Giglou, Hamed Babaei and M{\"u}nch, Quentin},
- journal={arXiv preprint arXiv:2505.14279},
- year={2025}
- }
+@inproceedings{dsouza-etal-2025-yescieval,
+ title = "{YES}ci{E}val: Robust {LLM}-as-a-Judge for Scientific Question Answering",
+ author = {D{'}Souza, Jennifer and
+ Babaei Giglou, Hamed and
+ M{\"u}nch, Quentin},
+ editor = "Che, Wanxiang and
+ Nabende, Joyce and
+ Shutova, Ekaterina and
+ Pilehvar, Mohammad Taher",
+ booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
+ month = jul,
+ year = "2025",
+ address = "Vienna, Austria",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/2025.acl-long.675/",
+ doi = "10.18653/v1/2025.acl-long.675",
+ pages = "13749--13783",
+ ISBN = "979-8-89176-251-0"
+}
```
+> For other types of citations, please refer to https://aclanthology.org/2025.acl-long.675/.
+
-This work is licensed under a [](https://opensource.org/licenses/MIT).
+This software is licensed under a [](https://opensource.org/licenses/MIT).
From ab40e8278d60ff4925de587fcc09c394746e9a4c Mon Sep 17 00:00:00 2001
From: Hamed Babaei Giglou
Date: Sat, 20 Dec 2025 15:42:50 +0100
Subject: [PATCH 11/12] :sparkles: add automated versioning
---
pyproject.toml | 13 +++++++++----
setup.py | 3 ++-
yescieval/VERSION | 1 +
yescieval/__init__.py | 3 ++-
4 files changed, 14 insertions(+), 6 deletions(-)
create mode 100644 yescieval/VERSION
diff --git a/pyproject.toml b/pyproject.toml
index f92d965..17cd54f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,8 +1,7 @@
[tool.poetry]
name = "YESciEval"
-version = "0.2.0"
-
+version = "0.0.0"
description = "YESciEval: Robust LLM-as-a-Judge for Scientific Question Answering."
authors = ["Hamed Babaei Giglou "]
license = "MIT License"
@@ -30,6 +29,12 @@ wheel = "*"
twine = "*"
pytest = "*"
+[tool.poetry-dynamic-versioning]
+enable = true
+style = "semver"
+source = "attr"
+attr = "yescieval.__version__"
+
[build-system]
-requires = ["poetry-core>=1.0.0"]
-build-backend = "poetry.core.masonry.api"
+requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.4.0"]
+build-backend = "poetry_dynamic_versioning.backend"
diff --git a/setup.py b/setup.py
index e74ca6c..101d10d 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,12 @@
from setuptools import setup, find_packages
+import os
with open("README.md", encoding="utf-8") as f:
long_description = f.read()
setup(
name="YESciEval",
- version="0.2.0",
+ version=open(os.path.join(os.path.dirname(__file__), 'yescieval/VERSION')).read().strip(),
author="Hamed Babaei Giglou",
author_email="hamedbabaeigiglou@gmail.com",
description="YESciEval: Robust LLM-as-a-Judge for Scientific Question Answering.",
diff --git a/yescieval/VERSION b/yescieval/VERSION
new file mode 100644
index 0000000..9325c3c
--- /dev/null
+++ b/yescieval/VERSION
@@ -0,0 +1 @@
+0.3.0
\ No newline at end of file
diff --git a/yescieval/__init__.py b/yescieval/__init__.py
index 25974c8..f8a37bb 100644
--- a/yescieval/__init__.py
+++ b/yescieval/__init__.py
@@ -1,5 +1,6 @@
+from pathlib import Path
-__version__ = "0.2.0"
+__version__ = (Path(__file__).parent / "VERSION").read_text().strip()
from .base import Rubric, Parser
from .rubric import (Informativeness, Correctness, Completeness, Coherence, Relevancy,
From 38c33939e5c57f8ef0765be0f2aac962feae628d Mon Sep 17 00:00:00 2001
From: Hamed Babaei Giglou
Date: Sat, 20 Dec 2025 15:43:08 +0100
Subject: [PATCH 12/12] :bookmark: v0.3.0
---
CHANGELOG.md | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 25a33f4..262c769 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
## Changelog
+### v0.3.0 (December 20, 2025)
+- Add more rubrics (PR #3)
+- Update documentation for new rubrics
+- Minor bug fixing
+- Update Readme
+
### v0.2.0 (May 30, 2025)
- Add custom judge module.
- Add documentation.