From 45810cf9f62a11777956fac71a57eb9981894140 Mon Sep 17 00:00:00 2001 From: MikeACedric <72818458+MikeACedric@users.noreply.github.com> Date: Tue, 6 Jan 2026 17:20:29 +0500 Subject: [PATCH 1/9] =?UTF-8?q?=F0=9F=93=9D=20Added=20CustomGPT=20Model=20?= =?UTF-8?q?Class?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 3 +- yescieval/__init__.py | 2 +- yescieval/judge/__init__.py | 5 +- yescieval/judge/judges.py | 96 +++++++++++++++++++++++++++++++++++++ 4 files changed, 102 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0478557..eb4a74f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ openai pandas numpy pydantic -pytest \ No newline at end of file +pytest +openai \ No newline at end of file diff --git a/yescieval/__init__.py b/yescieval/__init__.py index 25974c8..8f72e68 100644 --- a/yescieval/__init__.py +++ b/yescieval/__init__.py @@ -8,6 +8,6 @@ MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification, StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment, SpeculativeStatements, NoveltyIndicators) -from .judge import AutoJudge, AskAutoJudge, BioASQAutoJudge, CustomAutoJudge +from .judge import AutoJudge, AskAutoJudge, BioASQAutoJudge, CustomAutoJudge, GPTCustomAutoJudge from .parser import GPTParser diff --git a/yescieval/judge/__init__.py b/yescieval/judge/__init__.py index a3fe787..09ba18f 100644 --- a/yescieval/judge/__init__.py +++ b/yescieval/judge/__init__.py @@ -1,8 +1,9 @@ -from .judges import AutoJudge, AskAutoJudge, BioASQAutoJudge, CustomAutoJudge +from .judges import AutoJudge, AskAutoJudge, BioASQAutoJudge, CustomAutoJudge, GPTCustomAutoJudge __all__ = [ "AutoJudge", "AskAutoJudge", "BioASQAutoJudge", - "CustomAutoJudge" + "CustomAutoJudge", + "GPTCustomAutoJudge" ] \ No newline at end of file diff --git a/yescieval/judge/judges.py b/yescieval/judge/judges.py index 00c736d..0f4efc9 100644 --- a/yescieval/judge/judges.py +++ b/yescieval/judge/judges.py @@ -4,6 +4,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM from peft import PeftModel, PeftConfig import torch +from openai import OpenAI @@ -66,3 +67,98 @@ def _from_pretrained(self, model_id:str, device:str="auto", token:str =""): token=token ) return model, tokenizer + + +class GPTCustomAutoJudge(AutoJudge): + + def from_pretrained(self, model_id: str, api_key: str = None, base_url: str = None, **kwargs): + + self.model_name = model_id + + client_kwargs = {} + if api_key: + client_kwargs["api_key"] = api_key + if base_url: + client_kwargs["base_url"] = base_url + client_kwargs.update(kwargs) + + self.client = OpenAI(**client_kwargs) + return self + + def _is_reasoning_model(self) -> bool: + + model_lower = self.model_name.lower() + reasoning_prefixes = ("gpt-5", "o1", "o4" "o3", "o-1", "o-3") + return any(model_lower.startswith(prefix) for prefix in reasoning_prefixes) + + def evaluate( + self, + rubric: Rubric, + max_new_tokens: int = 300, + temperature: float = 0.0, + **kwargs + ) -> str: + + if self.client is None: + raise ValueError("Model not initialized. 
Call from_pretrained() first.") + + raw_messages = rubric.instruct() + messages = self._format_messages(raw_messages) + + params = { + "model": self.model_name, + "messages": messages, + } + + # Add model-specific parameters + if self._is_reasoning_model(): + params["max_completion_tokens"] = max_new_tokens + else: + params["max_tokens"] = max_new_tokens + params["temperature"] = temperature + + for key, value in kwargs.items(): + if key not in params: + params[key] = value + try: + response = self.client.chat.completions.create(**params) + except Exception as e: + raise RuntimeError(f"OpenAI API call failed: {str(e)}") + + if response.choices and len(response.choices) > 0: + content = response.choices[0].message.content + return content if content else "" + return "" + + def _format_messages(self, raw_messages) -> list: + + messages = [] + + # Handle string input + if isinstance(raw_messages, str): + messages.append({"role": "user", "content": raw_messages}) + + # Handle list input + elif isinstance(raw_messages, list): + for msg in raw_messages: + if isinstance(msg, str): + messages.append({"role": "user", "content": msg}) + elif isinstance(msg, dict): + if "role" in msg and "content" in msg: + messages.append(msg) + else: + raise ValueError(f"Message dict missing 'role' or 'content': {msg}") + else: + raise ValueError(f"Invalid message type in list: {type(msg)}") + + # Handle dict input (single message) + elif isinstance(raw_messages, dict): + if "role" in raw_messages and "content" in raw_messages: + messages.append(raw_messages) + else: + raise ValueError(f"Message dict missing 'role' or 'content': {raw_messages}") + + else: + raise ValueError(f"Unsupported rubric.instruct() output type: {type(raw_messages)}") + + return messages \ No newline at end of file From 93af2a4a464d52fc1b6cdfa1889c1dee7476588d Mon Sep 17 00:00:00 2001 From: MikeACedric <72818458+MikeACedric@users.noreply.github.com> Date: Tue, 6 Jan 2026 17:47:12 +0500 Subject: [PATCH 2/9] =?UTF-8?q?=F0=9F=93=9D=20Added=20documentation=20for?= =?UTF-8?q?=20Custom=20GPT=20Judge?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/judges.rst | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/docs/source/judges.rst b/docs/source/judges.rst index 0eb37af..bc60c27 100644 --- a/docs/source/judges.rst +++ b/docs/source/judges.rst @@ -89,3 +89,23 @@ For example, you can load a model and evaluate a rubric like this: print(result) This approach allows full control over which model is used for evaluation, supporting any LLM.. + +GPT Custom Judge +-------------------- + +The `GPTCustomAutoJudge` class provides a generic, flexible interface to evaluate scientific syntheses using OpenAI GPT models. + +You can use it to evaluate a rubric by providing your OpenAI API key and specifying the model ID: + +.. code-block:: python + + # Initialize and load a custom model by specifying its Hugging Face model ID + judge = GPTCustomAutoJudge() + judge.from_pretrained("gpt-5.2", api_key=OPEN_AI_API_KEY) + + # Evaluate the rubric using the loaded model + result = judge.evaluate(rubric=rubric) + + print(result) + +This allows you to leverage the capabilities of OpenAI's GPT models for scientific text evaluation. 
\ No newline at end of file From 677491815e5b1e8681dc7f3a2de2df13ba1dbbe5 Mon Sep 17 00:00:00 2001 From: MikeACedric <72818458+MikeACedric@users.noreply.github.com> Date: Mon, 12 Jan 2026 09:56:50 +0100 Subject: [PATCH 3/9] Fixed structured output parsing for GPT-4 models and updated README --- README.md | 13 +++- docs/source/judges.rst | 4 +- requirements.txt | 3 +- yescieval/judge/judges.py | 155 +++++++++++++++++++++++++------------- 4 files changed, 116 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index 5dcf9a2..3ed0573 100644 --- a/README.md +++ b/README.md @@ -88,13 +88,18 @@ Judges within YESciEval are defined as follows: | `AskAutoJudge` | Multidisciplinary judge tuned on the ORKGSyn dataset from the Open Research Knowledge Graph. | | `BioASQAutoJudge` | Biomedical domain judge tuned on the BioASQ dataset from the BioASQ challenge. | | `CustomAutoJudge`| Custom LLM that can be used as a judge within YESciEval rubrics | +| `GPTCustomAutoJudge`| Custom GPT-based LLM that can be used as a judge within YESciEval | -A total of nine evaluation rubrics were defined as part of the YESciEval test framework and can be used via ``yescieval``. Following simple example shows how to import rubrics in your code: + +A total of twenty three (23) evaluation rubrics were defined as part of the YESciEval test framework and can be used via ``yescieval``. Following simple example shows how to import rubrics in your code: ```python -from yescieval import Informativeness, Correctness, Completeness, - Coherence, Relevancy, Integration, - Cohesion, Readability, Conciseness + from yescieval import Informativeness, Correctness, Completeness, Coherence, Relevancy, + Integration, Cohesion, Readability, Conciseness, GeographicCoverage, + InterventionDiversity, BiodiversityDimensions, EcosystemServices, SpatialScale, + MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification, + StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment, + SpeculativeStatements, NoveltyIndicators ``` A complete list of rubrics are available at YESciEval [📚 Rubrics](https://yescieval.readthedocs.io/rubrics.html) page. 
diff --git a/docs/source/judges.rst b/docs/source/judges.rst index bc60c27..7c1a5ae 100644 --- a/docs/source/judges.rst +++ b/docs/source/judges.rst @@ -101,10 +101,10 @@ You can use it to evaluate a rubric by providing your OpenAI API key and specify # Initialize and load a custom model by specifying its Hugging Face model ID judge = GPTCustomAutoJudge() - judge.from_pretrained("gpt-5.2", api_key=OPEN_AI_API_KEY) + judge.from_pretrained("gpt-5.2", token=OPEN_AI_API_KEY) # Evaluate the rubric using the loaded model - result = judge.evaluate(rubric=rubric) + result = judge.judge(rubric=rubric) print(result) diff --git a/requirements.txt b/requirements.txt index eb4a74f..0478557 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,5 +6,4 @@ openai pandas numpy pydantic -pytest -openai \ No newline at end of file +pytest \ No newline at end of file diff --git a/yescieval/judge/judges.py b/yescieval/judge/judges.py index 0f4efc9..65067f8 100644 --- a/yescieval/judge/judges.py +++ b/yescieval/judge/judges.py @@ -4,6 +4,8 @@ from transformers import AutoTokenizer, AutoModelForCausalLM from peft import PeftModel, PeftConfig import torch +import json +import ast from openai import OpenAI @@ -69,96 +71,147 @@ def _from_pretrained(self, model_id:str, device:str="auto", token:str =""): return model, tokenizer -class GPTCustomAutoJudge(AutoJudge): - - def from_pretrained(self, model_id: str, api_key: str = None, base_url: str = None, **kwargs): +class GPTCustomAutoJudge(Judge): + def from_pretrained(self, model_id: str, token: str = ""): self.model_name = model_id - client_kwargs = {} - if api_key: - client_kwargs["api_key"] = api_key - if base_url: - client_kwargs["base_url"] = base_url - client_kwargs.update(kwargs) - - self.client = OpenAI(**client_kwargs) - return self + if token: + self.client = OpenAI(api_key=token) + else: + self.client = OpenAI() def _is_reasoning_model(self) -> bool: model_lower = self.model_name.lower() - reasoning_prefixes = ("gpt-5", "o1", "o4" "o3", "o-1", "o-3") + reasoning_prefixes = ("gpt-5", "o1", "o4", "o3", "o-1", "o-3") return any(model_lower.startswith(prefix) for prefix in reasoning_prefixes) - def evaluate( - self, - rubric: Rubric, - max_new_tokens: int = 300, - temperature: float = 0.0, - **kwargs - ) -> str: - - if self.client is None: + def _is_gpt4_family(self) -> bool: + model_lower = self.model_name.lower() + return model_lower.startswith(("gpt-4", "gpt-4o", "gpt-4.1")) + + def _model_family(self) -> str: + if self._is_gpt4_family(): + return "GPT-4 family" + elif self._is_reasoning_model(): + return "GPT-5 / O-series" + else: + return "Other" + + + def _judge_function_schema_single(self, rubric: Rubric) -> dict: + + rubric_id = rubric.__class__.__name__ + return { + "name": "submit_judgement", + "description": f"Return rating and rationale for rubric {rubric_id}", + "parameters": { + "type": "object", + "properties": { + rubric_id: { + "type": "object", + "properties": { + "rating": {"type": "string", "description": "Score for this rubric"}, + "rationale": {"type": "string", "description": "Explanation for the rating"} + }, + "required": ["rating", "rationale"] + } + }, + "required": [rubric_id] + } + } + + def _parse_json(self, text: str) -> dict: + + text = text.strip() + + if text.startswith("```"): + text = text.split("```")[1] + if text.startswith("json"): + text = text[4:] + + text = text.strip() + + try: + return json.loads(text) + except json.JSONDecodeError: + try: + return ast.literal_eval(text) + except: + return None + + def judge(self, 
rubric: Rubric, max_new_tokens: int = 150) -> Dict[str, Dict[str, str]]: + if not self.client: raise ValueError("Model not initialized. Call from_pretrained() first.") - raw_messages = rubric.instruct() - messages = self._format_messages(raw_messages) + messages = self._format_messages(rubric.instruct()) + rubric_id = rubric.__class__.__name__ + + if self._is_reasoning_model(): + actual_max_tokens = max_new_tokens * 10 + else: + actual_max_tokens = max_new_tokens params = { "model": self.model_name, "messages": messages, + "max_completion_tokens" if self._is_reasoning_model() else "max_tokens": actual_max_tokens } - # Add model-specific parameters - if self._is_reasoning_model(): - params["max_completion_tokens"] = max_new_tokens - else: - params["max_tokens"] = max_new_tokens - params["temperature"] = temperature + if self._is_gpt4_family(): + params["functions"] = [self._judge_function_schema_single(rubric)] + params["function_call"] = {"name": "submit_judgement"} - for key, value in kwargs.items(): - if key not in params: - params[key] = value try: response = self.client.chat.completions.create(**params) + message = response.choices[0].message except Exception as e: raise RuntimeError(f"OpenAI API call failed: {str(e)}") - if response.choices and len(response.choices) > 0: - content = response.choices[0].message.content - return content if content else "" - return "" + if self._is_gpt4_family(): + if hasattr(message, 'function_call') and message.function_call: + raw_args = message.function_call.arguments.strip() + content = self._parse_json(raw_args) or {} + else: + content = {rubric_id: {"rating": "N/A", "rationale": "No function call returned"}} + else: + raw_text = getattr(message, 'content', '') + + if raw_text: + parsed = self._parse_json(raw_text) + if parsed: + content = parsed + else: + content = {rubric_id: {"rating": "N/A", "rationale": raw_text}} + else: + if response.choices[0].finish_reason == 'length': + content = {rubric_id: {"rating": "N/A", "rationale": "Token limit reached. 
Increase max_new_tokens parameter."}} + else: + content = {rubric_id: {"rating": "N/A", "rationale": "Empty response"}} - def _format_messages(self, raw_messages) -> list: + return content + def _format_messages(self, raw_messages) -> list: + """Convert rubric.instruct() output to OpenAI message format.""" messages = [] - # Handle string input if isinstance(raw_messages, str): messages.append({"role": "user", "content": raw_messages}) - - # Handle list input elif isinstance(raw_messages, list): for msg in raw_messages: if isinstance(msg, str): messages.append({"role": "user", "content": msg}) - elif isinstance(msg, dict): - if "role" in msg and "content" in msg: - messages.append(msg) - else: - raise ValueError(f"Message dict missing 'role' or 'content': {msg}") + elif isinstance(msg, dict) and "role" in msg and "content" in msg: + messages.append(msg) else: - raise ValueError(f"Invalid message type in list: {type(msg)}") - - # Handle dict input (single message) + raise ValueError(f"Invalid message format: {msg}") elif isinstance(raw_messages, dict): if "role" in raw_messages and "content" in raw_messages: messages.append(raw_messages) else: raise ValueError(f"Message dict missing 'role' or 'content': {raw_messages}") - else: - raise ValueError(f"Unsupported rubric.instruct() output type: {type(raw_messages)}") + raise ValueError(f"Unsupported rubric.instruct() type: {type(raw_messages)}") return messages \ No newline at end of file From 6929377b6102597a50a8f6c2a10d41fe27fa422a Mon Sep 17 00:00:00 2001 From: MikeACedric <72818458+MikeACedric@users.noreply.github.com> Date: Tue, 13 Jan 2026 10:40:07 +0100 Subject: [PATCH 4/9] Added retry logic to judge API calls and clean up method signatures --- yescieval/base/rubric.py | 1 + yescieval/judge/judges.py | 148 +++++++++------------------- yescieval/rubric/breadth.py | 5 + yescieval/rubric/depth.py | 3 + yescieval/rubric/gap.py | 1 + yescieval/rubric/informativeness.py | 3 + yescieval/rubric/innovation.py | 2 + yescieval/rubric/rigor.py | 3 + yescieval/rubric/structural.py | 3 + yescieval/rubric/stylistic.py | 3 + 10 files changed, 71 insertions(+), 101 deletions(-) diff --git a/yescieval/base/rubric.py b/yescieval/base/rubric.py index 64c37e7..6ca06fe 100644 --- a/yescieval/base/rubric.py +++ b/yescieval/base/rubric.py @@ -10,6 +10,7 @@ class Rubric(BaseModel, ABC): Subclasses must implement `verbalize`. """ system_prompt_template: str + name: str = "Rubric" papers: Dict[str, str] question: str answer: str diff --git a/yescieval/judge/judges.py b/yescieval/judge/judges.py index 65067f8..edf3061 100644 --- a/yescieval/judge/judges.py +++ b/yescieval/judge/judges.py @@ -4,10 +4,11 @@ from transformers import AutoTokenizer, AutoModelForCausalLM from peft import PeftModel, PeftConfig import torch -import json -import ast +import time from openai import OpenAI +import logging +logger = logging.getLogger(__name__) class AutoJudge(Judge): @@ -72,17 +73,18 @@ def _from_pretrained(self, model_id:str, device:str="auto", token:str =""): class GPTCustomAutoJudge(Judge): + + def from_pretrained(self, model_id:str, device: str="auto", token:str =""): + if not token: + raise ValueError( + "OpenAI API token must be provided." 
+ ) - def from_pretrained(self, model_id: str, token: str = ""): self.model_name = model_id + self.client = OpenAI(api_key=token) - if token: - self.client = OpenAI(api_key=token) - else: - self.client = OpenAI() def _is_reasoning_model(self) -> bool: - model_lower = self.model_name.lower() reasoning_prefixes = ("gpt-5", "o1", "o4", "o3", "o-1", "o-3") return any(model_lower.startswith(prefix) for prefix in reasoning_prefixes) @@ -90,21 +92,10 @@ def _is_reasoning_model(self) -> bool: def _is_gpt4_family(self) -> bool: model_lower = self.model_name.lower() return model_lower.startswith(("gpt-4", "gpt-4o", "gpt-4.1")) - - def _model_family(self) -> str: - if self._is_gpt4_family(): - return "GPT-4 family" - elif self._is_reasoning_model(): - return "GPT-5 / O-series" - else: - return "Other" - - - def _judge_function_schema_single(self, rubric: Rubric) -> dict: - - rubric_id = rubric.__class__.__name__ + + def _build_rubric_evaluation_function_schema(self, rubric_id: str) -> dict: return { - "name": "submit_judgement", + "name": "evaluate_rubric", "description": f"Return rating and rationale for rubric {rubric_id}", "parameters": { "type": "object", @@ -122,96 +113,51 @@ def _judge_function_schema_single(self, rubric: Rubric) -> dict: } } - def _parse_json(self, text: str) -> dict: - - text = text.strip() - - if text.startswith("```"): - text = text.split("```")[1] - if text.startswith("json"): - text = text[4:] - - text = text.strip() - - try: - return json.loads(text) - except json.JSONDecodeError: - try: - return ast.literal_eval(text) - except: - return None - def judge(self, rubric: Rubric, max_new_tokens: int = 150) -> Dict[str, Dict[str, str]]: if not self.client: - raise ValueError("Model not initialized. Call from_pretrained() first.") - - messages = self._format_messages(rubric.instruct()) - rubric_id = rubric.__class__.__name__ + raise ValueError("Model not initialized.") - if self._is_reasoning_model(): - actual_max_tokens = max_new_tokens * 10 - else: - actual_max_tokens = max_new_tokens + messages = rubric.instruct() + rubric_id = rubric.name params = { "model": self.model_name, "messages": messages, - "max_completion_tokens" if self._is_reasoning_model() else "max_tokens": actual_max_tokens + "max_completion_tokens" if self._is_reasoning_model() else "max_tokens": max_new_tokens } if self._is_gpt4_family(): - params["functions"] = [self._judge_function_schema_single(rubric)] - params["function_call"] = {"name": "submit_judgement"} + params["functions"] = [self._build_rubric_evaluation_function_schema(rubric_id)] + params["function_call"] = {"name": "evaluate_rubric"} - try: - response = self.client.chat.completions.create(**params) - message = response.choices[0].message - except Exception as e: - raise RuntimeError(f"OpenAI API call failed: {str(e)}") - - if self._is_gpt4_family(): - if hasattr(message, 'function_call') and message.function_call: - raw_args = message.function_call.arguments.strip() - content = self._parse_json(raw_args) or {} - else: - content = {rubric_id: {"rating": "N/A", "rationale": "No function call returned"}} - else: - raw_text = getattr(message, 'content', '') - - if raw_text: - parsed = self._parse_json(raw_text) - if parsed: - content = parsed - else: - content = {rubric_id: {"rating": "N/A", "rationale": raw_text}} - else: - if response.choices[0].finish_reason == 'length': - content = {rubric_id: {"rating": "N/A", "rationale": "Token limit reached. 
Increase max_new_tokens parameter."}} + while True: + try: + response = self.client.chat.completions.create(**params) + message = response.choices[0].message + + if self._is_gpt4_family(): + if hasattr(message, 'function_call') and message.function_call: + raw_args = message.function_call.arguments.strip() + content = eval(raw_args) if raw_args else {} + else: + content = {rubric_id: {"rating": "N/A", "rationale": "No function call returned"}} else: - content = {rubric_id: {"rating": "N/A", "rationale": "Empty response"}} + raw_text = getattr(message, 'content', '') + if raw_text: + try: + content = eval(raw_text) + except Exception: + content = {rubric_id: {"rating": "N/A", "rationale": raw_text}} + else: + if response.choices[0].finish_reason == 'length': + content = {rubric_id: {"rating": "N/A", "rationale": "Token limit reached. Increase max_new_tokens parameter."}} + else: + content = {rubric_id: {"rating": "N/A", "rationale": "Empty response"}} + + break + + except Exception as e: + logger.warning(f"API call failed, retrying in 4 seconds: {e}") + time.sleep(4) return content - - def _format_messages(self, raw_messages) -> list: - """Convert rubric.instruct() output to OpenAI message format.""" - messages = [] - - if isinstance(raw_messages, str): - messages.append({"role": "user", "content": raw_messages}) - elif isinstance(raw_messages, list): - for msg in raw_messages: - if isinstance(msg, str): - messages.append({"role": "user", "content": msg}) - elif isinstance(msg, dict) and "role" in msg and "content" in msg: - messages.append(msg) - else: - raise ValueError(f"Invalid message format: {msg}") - elif isinstance(raw_messages, dict): - if "role" in raw_messages and "content" in raw_messages: - messages.append(raw_messages) - else: - raise ValueError(f"Message dict missing 'role' or 'content': {raw_messages}") - else: - raise ValueError(f"Unsupported rubric.instruct() type: {type(raw_messages)}") - - return messages \ No newline at end of file diff --git a/yescieval/rubric/breadth.py b/yescieval/rubric/breadth.py index efaf0bc..0ed9d2d 100644 --- a/yescieval/rubric/breadth.py +++ b/yescieval/rubric/breadth.py @@ -51,6 +51,7 @@ Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class GeographicCoverage(Rubric): + name: str = "Geographic Coverage" system_prompt_template: str = geographic_coverage_prompt intervention_diversity_prompt = """ @@ -104,6 +105,7 @@ class GeographicCoverage(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class InterventionDiversity(Rubric): + name: str = "Intervention Diversity" system_prompt_template: str = intervention_diversity_prompt biodiversity_dimensions_prompt = """ @@ -157,6 +159,7 @@ class InterventionDiversity(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class BiodiversityDimensions(Rubric): + name: str = "Biodiversity Dimensions" system_prompt_template: str = biodiversity_dimensions_prompt ecosystem_services_prompt = """ @@ -210,6 +213,7 @@ class BiodiversityDimensions(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. 
Ensure your rationale is objective and backed by specific examples from the provided material. """ class EcosystemServices(Rubric): + name: str = "Ecosystem Services" system_prompt_template: str = ecosystem_services_prompt spatial_scale_prompt = """ @@ -263,6 +267,7 @@ class EcosystemServices(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class SpatialScale(Rubric): + name: str = "Spatial Scale" system_prompt_template: str = spatial_scale_prompt diff --git a/yescieval/rubric/depth.py b/yescieval/rubric/depth.py index 04aeb00..984ddf5 100644 --- a/yescieval/rubric/depth.py +++ b/yescieval/rubric/depth.py @@ -50,6 +50,7 @@ Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class MechanisticUnderstanding(Rubric): + name: str = "Mechanistic Understanding" system_prompt_template: str = mechanistic_understanding_prompt causal_reasoning_prompt = """ @@ -103,6 +104,7 @@ class MechanisticUnderstanding(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class CausalReasoning(Rubric): + name: str = "Causal Reasoning" system_prompt_template: str = causal_reasoning_prompt temporal_precision_prompt = """ @@ -156,5 +158,6 @@ class CausalReasoning(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class TemporalPrecision(Rubric): + name: str = "Temporal Precision" system_prompt_template: str = temporal_precision_prompt diff --git a/yescieval/rubric/gap.py b/yescieval/rubric/gap.py index facdcbd..6cf6dcb 100644 --- a/yescieval/rubric/gap.py +++ b/yescieval/rubric/gap.py @@ -51,4 +51,5 @@ Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class GapIdentification(Rubric): + name: str = "Gap Identification" system_prompt_template: str = gap_identification_prompt diff --git a/yescieval/rubric/informativeness.py b/yescieval/rubric/informativeness.py index 9fd6788..bdfb448 100644 --- a/yescieval/rubric/informativeness.py +++ b/yescieval/rubric/informativeness.py @@ -51,6 +51,7 @@ Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class Correctness(Rubric): + name: str = "Correctness" system_prompt_template: str = correctness_prompt completeness_prompt = """ @@ -104,6 +105,7 @@ class Correctness(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class Completeness(Rubric): + name: str = "Completeness" system_prompt_template: str = completeness_prompt informativeness_prompt = """ @@ -157,5 +159,6 @@ class Completeness(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. 
Ensure your rationale is objective and backed by specific examples from the provided material. """ class Informativeness(Rubric): + name: str = "Informativeness" system_prompt_template: str = informativeness_prompt diff --git a/yescieval/rubric/innovation.py b/yescieval/rubric/innovation.py index 290405a..7a0bd80 100644 --- a/yescieval/rubric/innovation.py +++ b/yescieval/rubric/innovation.py @@ -51,6 +51,7 @@ Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class SpeculativeStatements(Rubric): + name: str = "Speculative Statements" system_prompt_template: str = speculative_statements_prompt novelty_indicators_prompt = """ @@ -104,6 +105,7 @@ class SpeculativeStatements(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class NoveltyIndicators(Rubric): + name: str = "Novelty Indicators" system_prompt_template: str = novelty_indicators_prompt diff --git a/yescieval/rubric/rigor.py b/yescieval/rubric/rigor.py index 62c4aaf..db2e7d3 100644 --- a/yescieval/rubric/rigor.py +++ b/yescieval/rubric/rigor.py @@ -51,6 +51,7 @@ Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class StatisticalSophistication(Rubric): + name: str = "Statistical Sophistication" system_prompt_template: str = statistical_sophistication_prompt citation_practices_prompt = """ @@ -104,6 +105,7 @@ class StatisticalSophistication(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class CitationPractices(Rubric): + name: str = "Citation Practices" system_prompt_template: str = citation_practices_prompt uncertainty_acknowledgement_prompt = """ @@ -157,5 +159,6 @@ class CitationPractices(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class UncertaintyAcknowledgment(Rubric): + name: str = "Uncertainty Acknowledgement" system_prompt_template: str = uncertainty_acknowledgement_prompt diff --git a/yescieval/rubric/structural.py b/yescieval/rubric/structural.py index a968642..6b83550 100644 --- a/yescieval/rubric/structural.py +++ b/yescieval/rubric/structural.py @@ -51,6 +51,7 @@ Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class Coherence(Rubric): + name: str = "Coherence" system_prompt_template: str = coherence_prompt integration_prompt = """ @@ -104,6 +105,7 @@ class Coherence(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class Integration(Rubric): + name: str = "Integration" system_prompt_template: str = integration_prompt relevancy_prompt = """ @@ -157,4 +159,5 @@ class Integration(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. 
Ensure your rationale is objective and backed by specific examples from the provided material. """ class Relevancy(Rubric): + name: str = "Relevancy" system_prompt_template: str = relevancy_prompt diff --git a/yescieval/rubric/stylistic.py b/yescieval/rubric/stylistic.py index b369fdf..0e92757 100644 --- a/yescieval/rubric/stylistic.py +++ b/yescieval/rubric/stylistic.py @@ -52,6 +52,7 @@ """ class Cohesion(Rubric): + name: str = "Cohesion" system_prompt_template: str = cohesion_prompt @@ -106,6 +107,7 @@ class Cohesion(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class Conciseness(Rubric): + name: str = "Conciseness" system_prompt_template: str = conciseness_prompt readability_prompt = """ @@ -159,5 +161,6 @@ class Conciseness(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class Readability(Rubric): + name: str = "Readability" system_prompt_template: str = readability_prompt From bc4f7a5ecde546053fa88a50e0e17c8ad925cb36 Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Tue, 13 Jan 2026 12:00:17 +0100 Subject: [PATCH 5/9] :bug: fix deep research prompts --- yescieval/rubric/breadth.py | 20 ++++++++++---------- yescieval/rubric/depth.py | 12 ++++++------ yescieval/rubric/gap.py | 4 ++-- yescieval/rubric/innovation.py | 8 ++++---- yescieval/rubric/rigor.py | 12 ++++++------ 5 files changed, 28 insertions(+), 28 deletions(-) diff --git a/yescieval/rubric/breadth.py b/yescieval/rubric/breadth.py index 0ed9d2d..dfcc99e 100644 --- a/yescieval/rubric/breadth.py +++ b/yescieval/rubric/breadth.py @@ -22,7 +22,7 @@ -1. geographic_coverage: is the information in the answer a correct representation of the spatial scope of the provided abstracts? +1. Geographic Coverage: is the information in the answer a correct representation of the spatial scope of the provided abstracts? @@ -42,7 +42,7 @@ { - "geographic_coverage": {"rating": "4", "rationale": "The synthesis accurately represents multiple regions and scales from the provided abstracts, with only minor omissions or irrelevant details."} + "Geographic Coverage": {"rating": "4", "rationale": "The synthesis accurately represents multiple regions and scales from the provided abstracts, with only minor omissions or irrelevant details."} } @@ -76,7 +76,7 @@ class GeographicCoverage(Rubric): -1. intervention_diversity: is the answer a comprehensive encapsulation of the relevant information in the provided abstracts, measured by the number of unique management practices? +1. Intervention Diversity: is the answer a comprehensive encapsulation of the relevant information in the provided abstracts, measured by the number of unique management practices? @@ -96,7 +96,7 @@ class GeographicCoverage(Rubric): { - "intervention_diversity": {"rating": "4", "rationale": "The answer includes almost all relevant interventions from the provided abstracts, with only minor details missing."} + "Intervention Diversity": {"rating": "4", "rationale": "The answer includes almost all relevant interventions from the provided abstracts, with only minor details missing."} } @@ -130,7 +130,7 @@ class InterventionDiversity(Rubric): -1. 
biodiversity_dimensions: is the answer a comprehensive representation of the relevant biodiversity information in the provided abstracts, measured by the presence of terms related to taxonomic, functional, phylogenetic, and spatial diversity? +1. Biodiversity Dimensions: is the answer a comprehensive representation of the relevant biodiversity information in the provided abstracts, measured by the presence of terms related to taxonomic, functional, phylogenetic, and spatial diversity? @@ -150,7 +150,7 @@ class InterventionDiversity(Rubric): { - "biodiversity_dimensions": {"rating": "4", "rationale": "Most information is informative for the research question, capturing the key biodiversity dimensions with minor omissions."} + "Biodiversity Dimensions": {"rating": "4", "rationale": "Most information is informative for the research question, capturing the key biodiversity dimensions with minor omissions."} } @@ -184,7 +184,7 @@ class BiodiversityDimensions(Rubric): -1. ecosystem_services: is the answer a useful and informative reply to the question, measured by the presence of terms matched against a vocabulary aligned with the Millennium Ecosystem Assessment? +1. Ecosystem Services: is the answer a useful and informative reply to the question, measured by the presence of terms matched against a vocabulary aligned with the Millennium Ecosystem Assessment? @@ -204,7 +204,7 @@ class BiodiversityDimensions(Rubric): { - "ecosystem_services": {"rating": "4", "rationale": "The synthesis includes nearly all relevant ecosystem services from the provided abstracts, with only minor omissions."} + "Ecosystem Services": {"rating": "4", "rationale": "The synthesis includes nearly all relevant ecosystem services from the provided abstracts, with only minor omissions."} } @@ -238,7 +238,7 @@ class EcosystemServices(Rubric): -1. spatial_scale: is the answer a useful and informative reply to the question, measured by the presence of explicit scale terms (e.g., “local,” “regional,” “continental”) and area measures? +1. Spatial Scale: is the answer a useful and informative reply to the question, measured by the presence of explicit scale terms (e.g., “local,” “regional,” “continental”) and area measures? @@ -258,7 +258,7 @@ class EcosystemServices(Rubric): { - "spatial_scale": {"rating": "4", "rationale": "The synthesis includes nearly all relevant spatial scale information from the provided abstracts, with only minor omissions."} + "Spatial Scale": {"rating": "4", "rationale": "The synthesis includes nearly all relevant spatial scale information from the provided abstracts, with only minor omissions."} } diff --git a/yescieval/rubric/depth.py b/yescieval/rubric/depth.py index 984ddf5..3e12dc3 100644 --- a/yescieval/rubric/depth.py +++ b/yescieval/rubric/depth.py @@ -22,7 +22,7 @@ -1. mechanistic_understanding: does the answer reflect understanding of ecological processes by explicitly mentioning recognized mechanisms such as feedbacks, nutrient cycling, or trophic cascades? +1. Mechanistic Understanding: does the answer reflect understanding of ecological processes by explicitly mentioning recognized mechanisms such as feedbacks, nutrient cycling, or trophic cascades? 
@@ -41,7 +41,7 @@ { - "mechanistic_understanding": {"rating": "4", "rationale": "The answer explains a clear multi-step ecological mechanism using causal language, but some temporal or boundary details are only briefly addressed."} + "Mechanistic Understanding": {"rating": "4", "rationale": "The answer explains a clear multi-step ecological mechanism using causal language, but some temporal or boundary details are only briefly addressed."} } @@ -75,7 +75,7 @@ class MechanisticUnderstanding(Rubric): -1. causal_reasoning: does the answer explicitly express cause–effect relationships using causal connectives (e.g., “because,” “due to”), result indicators (e.g., “results in,” “induces”), or mechanistic verbs (e.g., “drives,” “regulates”) when describing ecological processes? +1. Causal Reasoning: does the answer explicitly express cause–effect relationships using causal connectives (e.g., “because,” “due to”), result indicators (e.g., “results in,” “induces”), or mechanistic verbs (e.g., “drives,” “regulates”) when describing ecological processes? @@ -95,7 +95,7 @@ class MechanisticUnderstanding(Rubric): { - "causal_reasoning": {"rating": "4", "rationale": "The answer uses clear causal connectors and describes a multi-step cause–effect relationship."} + "Causal Reasoning": {"rating": "4", "rationale": "The answer uses clear causal connectors and describes a multi-step cause–effect relationship."} } @@ -129,7 +129,7 @@ class CausalReasoning(Rubric): -1. temporal_precision: does the answer include specific and explicit temporal references, such as quantified time intervals or dated events, rather than vague or unspecific timing? +1. Temporal Precision: does the answer include specific and explicit temporal references, such as quantified time intervals or dated events, rather than vague or unspecific timing? @@ -149,7 +149,7 @@ class CausalReasoning(Rubric): { - "temporal_precision": {"rating": "4", "rationale": "The answer includes several specific timeframes or durations that are clearly linked to the described processes, though some timing details could be more precise."} + "Temporal Precision": {"rating": "4", "rationale": "The answer includes several specific timeframes or durations that are clearly linked to the described processes, though some timing details could be more precise."} } diff --git a/yescieval/rubric/gap.py b/yescieval/rubric/gap.py index 6cf6dcb..8c6fa2a 100644 --- a/yescieval/rubric/gap.py +++ b/yescieval/rubric/gap.py @@ -22,7 +22,7 @@ -1. gap_identification: To what extent does the answer explicitly identify research gaps or unanswered questions indicated by the provided abstracts? +1. Gap Identification: To what extent does the answer explicitly identify research gaps or unanswered questions indicated by the provided abstracts? @@ -42,7 +42,7 @@ { - "gap_identification": {"rating": "4", "rationale": "Identifies a relevant gap supported by the abstracts, with limited elaboration."} + "Gap Identification": {"rating": "4", "rationale": "Identifies a relevant gap supported by the abstracts, with limited elaboration."} } diff --git a/yescieval/rubric/innovation.py b/yescieval/rubric/innovation.py index 7a0bd80..628fa5d 100644 --- a/yescieval/rubric/innovation.py +++ b/yescieval/rubric/innovation.py @@ -22,7 +22,7 @@ -1. speculative_statement: Does the answer clearly distinguish speculation (e.g., “might,” “could”) from established findings in the provided abstracts? +1. 
Speculative Statements: Does the answer clearly distinguish speculation (e.g., “might,” “could”) from established findings in the provided abstracts? @@ -42,7 +42,7 @@ { - "speculative_statement": {"rating": "4", "rationale": "Uses hedging appropriately and clearly distinguishes speculation from established findings."} + "Speculative Statements": {"rating": "4", "rationale": "Uses hedging appropriately and clearly distinguishes speculation from established findings."} } @@ -76,7 +76,7 @@ class SpeculativeStatements(Rubric): -1. novelty_indicators: Does the answer appropriately use self-declared innovation terms (e.g., “novel,” “pioneering,” “emerging”) and clearly indicate whether such claims are supported by the provided abstracts? +1. Novelty Indicators: Does the answer appropriately use self-declared innovation terms (e.g., “novel,” “pioneering,” “emerging”) and clearly indicate whether such claims are supported by the provided abstracts? @@ -96,7 +96,7 @@ class SpeculativeStatements(Rubric): { - "novelty_indicators": {"rating": "4", "rationale": "Shows a clear novel angle, but lacks full detail."} + "Novelty Indicators": {"rating": "4", "rationale": "Shows a clear novel angle, but lacks full detail."} } diff --git a/yescieval/rubric/rigor.py b/yescieval/rubric/rigor.py index db2e7d3..cd428a4 100644 --- a/yescieval/rubric/rigor.py +++ b/yescieval/rubric/rigor.py @@ -22,7 +22,7 @@ -1. statistical_sophistication: Does the answer reflect quantitative depth through the use of inferential statistics or analysis methods described in the abstracts? +1. Statistical Sophistication: Does the answer reflect quantitative depth through the use of inferential statistics or analysis methods described in the abstracts? @@ -42,7 +42,7 @@ { - "statistical_sophistication": {"rating": "3", "rationale": "The synthesis provides some methodological details and basic statistics, but does not fully discuss limitations or reproducibility.""} + "Statistical Sophistication": {"rating": "3", "rationale": "The synthesis provides some methodological details and basic statistics, but does not fully discuss limitations or reproducibility.""} } @@ -76,7 +76,7 @@ class StatisticalSophistication(Rubric): -1. citation_practices: is the answer supported by appropriate references, using parenthetical or narrative citations, for the relevant information in the provided abstracts? +1. Citation Practices: is the answer supported by appropriate references, using parenthetical or narrative citations, for the relevant information in the provided abstracts? @@ -96,7 +96,7 @@ class StatisticalSophistication(Rubric): { - "citation_practices": {"rating": "3", "rationale": "Some claims are supported with citations, but several important points lack references or use inconsistent citation style."} + "Citation Practices": {"rating": "3", "rationale": "Some claims are supported with citations, but several important points lack references or use inconsistent citation style."} } @@ -130,7 +130,7 @@ class CitationPractices(Rubric): -1. uncertainty_acknowledgement: does the answer explicitly discuss limitations, uncertainty, or gaps in evidence (e.g., using terms like “unknown,” “limited evidence,” or “unclear”)? +1. Uncertainty Acknowledgement: does the answer explicitly discuss limitations, uncertainty, or gaps in evidence (e.g., using terms like “unknown,” “limited evidence,” or “unclear”)? 
@@ -150,7 +150,7 @@ class CitationPractices(Rubric): { - "uncertainty_acknowledgement": {"rating": "4", "rationale": "The answer clearly acknowledges key uncertainties and limitations in the study."} + "Uncertainty Acknowledgement": {"rating": "4", "rationale": "The answer clearly acknowledges key uncertainties and limitations in the study."} } From e4940754734fe42581377297450001e618b5ac57 Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Tue, 13 Jan 2026 12:02:05 +0100 Subject: [PATCH 6/9] :sparkles: update judge io --- yescieval/base/judge.py | 4 ++-- yescieval/judge/judges.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/yescieval/base/judge.py b/yescieval/base/judge.py index 5ef75ed..3178d2b 100644 --- a/yescieval/base/judge.py +++ b/yescieval/base/judge.py @@ -1,6 +1,6 @@ from abc import ABC from typing import Dict, Any -from . import Parser, Rubric +from . import Rubric, RubricLikertScale class Judge(ABC): @@ -8,7 +8,7 @@ class Judge(ABC): def from_pretrained(self, model_id:str, device: str="auto", token:str =""): self.model, self.tokenizer = self._from_pretrained(model_id=model_id, device=device, token=token) - def judge(self, rubric: Rubric, max_new_tokens: int=150) -> Dict[str, Dict[str, str]]: + def judge(self, rubric: Rubric, max_new_tokens: int=150) -> Dict[str, Dict[str, str]] | str | RubricLikertScale: pass def _from_pretrained(self, model_id: str, device: str = "auto", token: str = "") -> [Any, Any]: diff --git a/yescieval/judge/judges.py b/yescieval/judge/judges.py index edf3061..0810805 100644 --- a/yescieval/judge/judges.py +++ b/yescieval/judge/judges.py @@ -29,7 +29,7 @@ def _from_pretrained(self, model_id:str, device:str="auto", token:str =""): model = PeftModel.from_pretrained(base_model, model_id) return model, tokenizer - def evaluate(self, rubric: Rubric, max_new_tokens: int=150) -> Dict[str, Dict[str, str]]: + def judge(self, rubric: Rubric, max_new_tokens: int=150) -> str: inputs = self.tokenizer.apply_chat_template(rubric.instruct(), add_generation_prompt=True, return_dict=True, From 71b22bd96ae90f7b5ca560e121614ccea67779ed Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Tue, 13 Jan 2026 12:02:31 +0100 Subject: [PATCH 7/9] :sparkles: add custom judges --- yescieval/judge/__init__.py | 3 +- yescieval/judge/custom.py | 97 +++++++++++++++++++++++++++++++ yescieval/judge/judges.py | 110 ------------------------------------ 3 files changed, 99 insertions(+), 111 deletions(-) create mode 100644 yescieval/judge/custom.py diff --git a/yescieval/judge/__init__.py b/yescieval/judge/__init__.py index 09ba18f..d0d69e3 100644 --- a/yescieval/judge/__init__.py +++ b/yescieval/judge/__init__.py @@ -1,4 +1,5 @@ -from .judges import AutoJudge, AskAutoJudge, BioASQAutoJudge, CustomAutoJudge, GPTCustomAutoJudge +from .judges import AutoJudge, AskAutoJudge, BioASQAutoJudge +from .custom import CustomAutoJudge, GPTCustomAutoJudge __all__ = [ "AutoJudge", diff --git a/yescieval/judge/custom.py b/yescieval/judge/custom.py new file mode 100644 index 0000000..c44d1cf --- /dev/null +++ b/yescieval/judge/custom.py @@ -0,0 +1,97 @@ +from ..base import Judge, Rubric, RubricLikertScale +from .judges import AutoJudge + +import time +from typing import Dict, List +from openai import OpenAI +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch +import logging + +logger = logging.getLogger(__name__) + +class CustomAutoJudge(AutoJudge): + + def _from_pretrained(self, model_id:str, device:str="auto", token:str =""): + tokenizer = 
AutoTokenizer.from_pretrained(model_id, + padding_side="left", + token=token) + tokenizer.pad_token = tokenizer.eos_token + model = AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.float32, + device_map=device, + token=token + ) + return model, tokenizer + + +class GPTCustomAutoJudge(Judge): + + def from_pretrained(self, model_id: str, device: str = "auto", token: str = ""): + if not token: + raise ValueError("OpenAI API token must be provided.") + self.model_name = model_id + self.client = OpenAI(api_key=token) + + def _supports_function_calling(self) -> bool: + gpt_4_prefixes = ( + "gpt-4", # gpt4 family including gpt-4o, gpt-4o-mini, gpt-4.1, ... + "GPT-3.5", # gpt-3.5 family + ) + return any(self.model_name.startswith(prefix) for prefix in gpt_4_prefixes) + + def _output_schema(self) -> List[Dict]: + return [ + { + "name": "response_format", + "description": f"Return the `rating` and `rationale` only as a response.", + "parameters": { + "type": "object", + "properties": { + 'rating': { + "type": "number", + "description": "A numerical rating assigned to the characteristic.", + "minimum": 1, + "maximum": 5 + }, + "rationale": { + "type": "string", + "description": "The explanation for the assigned rating." + }, + }, + "required": ["rating", "rationale"] + } + } + ] + + def judge(self, rubric: Rubric, max_new_tokens: int = 150) -> RubricLikertScale: + if not self.client: + raise ValueError("Model not initialized.") + messages = rubric.instruct() + params = { + "model": self.model_name, + "messages": messages + } + if self._supports_function_calling(): + params["functions"] = self._output_schema() + + try_counter = 0 + while True: + try: + try_counter += 1 + response = self.client.chat.completions.create(**params) + message = response.choices[0].message + if self._supports_function_calling(): + parsed_output = eval(message.function_call.arguments) + else: + parsed_output = eval(message.content)[rubric.name] + evaluation = RubricLikertScale(rating=parsed_output['rating'], rationale=parsed_output['rationale']) + return evaluation + + except Exception as e: + logger.error(f"{try_counter} times failed attempt!") + logger.warning(f"API call failed, retrying in 4 seconds: {e}") + time.sleep(5) + + diff --git a/yescieval/judge/judges.py b/yescieval/judge/judges.py index 0810805..c700b28 100644 --- a/yescieval/judge/judges.py +++ b/yescieval/judge/judges.py @@ -4,8 +4,6 @@ from transformers import AutoTokenizer, AutoModelForCausalLM from peft import PeftModel, PeftConfig import torch -import time -from openai import OpenAI import logging logger = logging.getLogger(__name__) @@ -53,111 +51,3 @@ def from_pretrained(self, model_id: str = "SciKnowOrg/YESciEval-BioASQ-Llama-3.1 device: str = "auto", token: str = ""): self.model, self.tokenizer = super()._from_pretrained(model_id=model_id, device=device, token=token) - - - -class CustomAutoJudge(AutoJudge): - - def _from_pretrained(self, model_id:str, device:str="auto", token:str =""): - tokenizer = AutoTokenizer.from_pretrained(model_id, - padding_side="left", - token=token) - tokenizer.pad_token = tokenizer.eos_token - model = AutoModelForCausalLM.from_pretrained( - model_id, - torch_dtype=torch.float32, - device_map=device, - token=token - ) - return model, tokenizer - - -class GPTCustomAutoJudge(Judge): - - def from_pretrained(self, model_id:str, device: str="auto", token:str =""): - if not token: - raise ValueError( - "OpenAI API token must be provided." 
- ) - - self.model_name = model_id - self.client = OpenAI(api_key=token) - - - def _is_reasoning_model(self) -> bool: - model_lower = self.model_name.lower() - reasoning_prefixes = ("gpt-5", "o1", "o4", "o3", "o-1", "o-3") - return any(model_lower.startswith(prefix) for prefix in reasoning_prefixes) - - def _is_gpt4_family(self) -> bool: - model_lower = self.model_name.lower() - return model_lower.startswith(("gpt-4", "gpt-4o", "gpt-4.1")) - - def _build_rubric_evaluation_function_schema(self, rubric_id: str) -> dict: - return { - "name": "evaluate_rubric", - "description": f"Return rating and rationale for rubric {rubric_id}", - "parameters": { - "type": "object", - "properties": { - rubric_id: { - "type": "object", - "properties": { - "rating": {"type": "string", "description": "Score for this rubric"}, - "rationale": {"type": "string", "description": "Explanation for the rating"} - }, - "required": ["rating", "rationale"] - } - }, - "required": [rubric_id] - } - } - - def judge(self, rubric: Rubric, max_new_tokens: int = 150) -> Dict[str, Dict[str, str]]: - if not self.client: - raise ValueError("Model not initialized.") - - messages = rubric.instruct() - rubric_id = rubric.name - - params = { - "model": self.model_name, - "messages": messages, - "max_completion_tokens" if self._is_reasoning_model() else "max_tokens": max_new_tokens - } - - if self._is_gpt4_family(): - params["functions"] = [self._build_rubric_evaluation_function_schema(rubric_id)] - params["function_call"] = {"name": "evaluate_rubric"} - - while True: - try: - response = self.client.chat.completions.create(**params) - message = response.choices[0].message - - if self._is_gpt4_family(): - if hasattr(message, 'function_call') and message.function_call: - raw_args = message.function_call.arguments.strip() - content = eval(raw_args) if raw_args else {} - else: - content = {rubric_id: {"rating": "N/A", "rationale": "No function call returned"}} - else: - raw_text = getattr(message, 'content', '') - if raw_text: - try: - content = eval(raw_text) - except Exception: - content = {rubric_id: {"rating": "N/A", "rationale": raw_text}} - else: - if response.choices[0].finish_reason == 'length': - content = {rubric_id: {"rating": "N/A", "rationale": "Token limit reached. Increase max_new_tokens parameter."}} - else: - content = {rubric_id: {"rating": "N/A", "rationale": "Empty response"}} - - break - - except Exception as e: - logger.warning(f"API call failed, retrying in 4 seconds: {e}") - time.sleep(4) - - return content From 071d15fd186cd026246ea4261a54193d92d10c9f Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Tue, 13 Jan 2026 12:02:48 +0100 Subject: [PATCH 8/9] :memo: update docs --- README.md | 16 ++++++++-------- docs/source/judges.rst | 17 +++++++++++++---- docs/source/quickstart.rst | 30 +++++++++++++++++++++++++++--- docs/source/rubrics.rst | 1 + 4 files changed, 49 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index ac6f324..f00a219 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ judge.from_pretrained( ) # Step 3: Evaluate the answer -result = judge.evaluate(rubric=rubric) +result = judge.judge(rubric=rubric) print("Raw Evaluation Output:") print(result) ``` @@ -91,15 +91,15 @@ Judges within YESciEval are defined as follows: | `GPTCustomAutoJudge`| Custom GPT-based LLM that can be used as a judge within YESciEval | -A total of twenty three (23) evaluation rubrics were defined as part of the YESciEval test framework and can be used via ``yescieval``. 
Following simple example shows how to import rubrics in your code:
+A total of **23** evaluation rubrics were defined as part of the YESciEval test framework and can be used via ``yescieval``. The following example shows how to import rubrics in your code:
 
 ```python
-    from yescieval import Informativeness, Correctness, Completeness, Coherence, Relevancy,
-                    Integration, Cohesion, Readability, Conciseness, GeographicCoverage,
-                    InterventionDiversity, BiodiversityDimensions, EcosystemServices, SpatialScale,
-                    MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification,
-                    StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment,
-                    SpeculativeStatements, NoveltyIndicators
+from yescieval import Informativeness, Correctness, Completeness, Coherence, Relevancy, \
+    Integration, Cohesion, Readability, Conciseness, GeographicCoverage, \
+    InterventionDiversity, BiodiversityDimensions, EcosystemServices, SpatialScale, \
+    MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification, \
+    StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment, \
+    SpeculativeStatements, NoveltyIndicators
 ```
 
 A complete list of rubrics are available at YESciEval [📚 Rubrics](https://yescieval.readthedocs.io/rubrics.html) page.
diff --git a/docs/source/judges.rst b/docs/source/judges.rst
index 7c1a5ae..14621a3 100644
--- a/docs/source/judges.rst
+++ b/docs/source/judges.rst
@@ -48,7 +48,7 @@ The following example demonstrates how to create an evaluation rubric, load a ju
       device="cpu")
 
     # Step 3: Evaluate the answer
-    result = judge.evaluate(rubric=rubric)
+    result = judge.judge(rubric=rubric)
 
     print("Raw Evaluation Output:")
     print(result)
@@ -84,7 +84,7 @@ For example, you can load a model and evaluate a rubric like this:
     judge.from_pretrained(model_id="Qwen/Qwen3-8B", device="cpu", token="your_huggingface_token")
 
     # Evaluate the rubric using the loaded model
-    result = judge.evaluate(rubric=rubric)
+    result = judge.judge(rubric=rubric)
 
     print(result)
 
@@ -101,11 +101,20 @@ You can use it to evaluate a rubric by providing your OpenAI API key and specify
 
     # Initialize and load a custom model by specifying its Hugging Face model ID
     judge = GPTCustomAutoJudge()
-    judge.from_pretrained("gpt-5.2", token=OPEN_AI_API_KEY)
+    judge.from_pretrained(model_id="gpt-5.2", token=OPEN_AI_API_KEY)
 
     # Evaluate the rubric using the loaded model
     result = judge.judge(rubric=rubric)
 
-    print(result)
+    print(result.model_dump())
+
+As a result, the output will be in the following format:
+
+.. code-block:: json
+
+    {
+      "rating": rating-value,
+      "rationale": "rationale-text"
+    }
 
 This allows you to leverage the capabilities of OpenAI's GPT models for scientific text evaluation.
\ No newline at end of file
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
index fdbb3e3..8f7fbdc 100644
--- a/docs/source/quickstart.rst
+++ b/docs/source/quickstart.rst
@@ -35,7 +35,7 @@ YESciEval is a library designed to evaluate the quality of synthesized scientifi
     judge.from_pretrained(token="your_huggingface_token", device="cpu")
 
     # Step 3: Evaluate the answer
-    result = judge.evaluate(rubric=rubric)
+    result = judge.judge(rubric=rubric)
 
     print("Raw Evaluation Output:")
     print(result)
@@ -62,7 +62,7 @@ YESciEval is a library designed to evaluate the quality of synthesized scientifi
     judge.from_pretrained(model_id="Qwen/Qwen3-8B", device="cpu", token="your_huggingface_token")
 
     # Step 3: Evaluate the answer
-    result = judge.evaluate(rubric=rubric)
+    result = judge.judge(rubric=rubric)
 
     print("Raw Evaluation Output:")
     print(result)
@@ -81,7 +81,7 @@ If the model outputs unstructured or loosely structured text, you can use GPTPar
     parsed = parser.parse(raw_output=raw_output)
 
     print("Parsed Output:")
-    print(parsed)
+    print(parsed.model_dump())
 
 **Expected Output Format**
 
@@ -92,6 +92,30 @@ If the model outputs unstructured or loosely structured text, you can use GPTPar
       "rationale": "The answer covers key aspects of how AI is applied in healthcare, such as diagnostics and personalized medicine."
     }
 
+If you prefer not to use ``.model_dump()``, the output follows the schema below, so you can use ``result.rating`` to access the rating value and ``result.rationale`` to access the textual explanation for the rating.
+
+.. code-block::
+
+    {
+        'properties': {
+            'rating': {
+                'description': 'Rating from 1 to 5',
+                'maximum': 5,
+                'minimum': 1,
+                'title': 'Rating',
+                'type': 'integer'
+            },
+            'rationale': {
+                'description': 'Textual explanation for the rating',
+                'title': 'Rationale',
+                'type': 'string'
+            }
+        },
+        'required': ['rating', 'rationale'],
+        'title': 'RubricLikertScale',
+        'type': 'object'
+    }
+
 .. hint:: Key Components
 
     +------------------+-------------------------------------------------------+
diff --git a/docs/source/rubrics.rst b/docs/source/rubrics.rst
index e264f21..b38498c 100644
--- a/docs/source/rubrics.rst
+++ b/docs/source/rubrics.rst
@@ -188,3 +188,4 @@ And to use rubrics:
 
     instruction = rubric.instruct()
     print(instruction)
+    print(rubric.name)

From 2ad66fb2789afdb847f19371317a9b1443f8fcdd Mon Sep 17 00:00:00 2001
From: Hamed Babaei Giglou
Date: Tue, 13 Jan 2026 12:09:10 +0100
Subject: [PATCH 9/9] :bookmark: v0.4.0

---
 CHANGELOG.md      | 6 ++++++
 yescieval/VERSION | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 262c769..1bb7366 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 ## Changelog
 
+### v0.4.0 (January 13, 2026)
+- Add a GPT custom judge (PR #5)
+- Update documentation
+- Minor bug fixes in deep research rubrics and judges
+- Update README
+
 ### v0.3.0 (December 20, 2025)
 - Add more rubrics (PR #3)
 - Update documentation for new rubrics
diff --git a/yescieval/VERSION b/yescieval/VERSION
index 9325c3c..60a2d3e 100644
--- a/yescieval/VERSION
+++ b/yescieval/VERSION
@@ -1 +1 @@
-0.3.0
\ No newline at end of file
+0.4.0
\ No newline at end of file
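
For reference, below is a minimal end-to-end sketch of how the `GPTCustomAutoJudge` introduced in this series is intended to be used, based only on the API visible in the patches above (`from_pretrained(model_id, token)`, `judge(rubric)`, and the `RubricLikertScale` result). The abstracts, question, answer, model ID, and API key are placeholder values and not part of the patch series.

```python
# Usage sketch for the GPTCustomAutoJudge added in yescieval v0.4.0.
# The papers, question, answer, model ID, and API key are illustrative placeholders.
from yescieval import Informativeness, GPTCustomAutoJudge

rubric = Informativeness(
    papers={
        "Paper A": "Abstract of the first source paper ...",
        "Paper B": "Abstract of the second source paper ...",
    },
    question="How is AI applied in healthcare?",
    answer="AI supports diagnostics and personalized medicine ...",
)

judge = GPTCustomAutoJudge()
judge.from_pretrained(model_id="gpt-4o", token="your_openai_api_key")

# judge() retries failed API calls and returns a RubricLikertScale (rating + rationale).
result = judge.judge(rubric=rubric)
print(result.model_dump())  # e.g. {"rating": 4, "rationale": "..."}
```

For GPT-4-family (and GPT-3.5) model IDs the judge requests structured output via function calling; for other models it parses the plain text completion into the same rating/rationale result.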