From 45810cf9f62a11777956fac71a57eb9981894140 Mon Sep 17 00:00:00 2001 From: MikeACedric <72818458+MikeACedric@users.noreply.github.com> Date: Tue, 6 Jan 2026 17:20:29 +0500 Subject: [PATCH 1/9] =?UTF-8?q?=F0=9F=93=9D=20Added=20CustomGPT=20Model=20?= =?UTF-8?q?Class?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 3 +- yescieval/__init__.py | 2 +- yescieval/judge/__init__.py | 5 +- yescieval/judge/judges.py | 96 +++++++++++++++++++++++++++++++++++++ 4 files changed, 102 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0478557..eb4a74f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ openai pandas numpy pydantic -pytest \ No newline at end of file +pytest +openai \ No newline at end of file diff --git a/yescieval/__init__.py b/yescieval/__init__.py index 25974c8..8f72e68 100644 --- a/yescieval/__init__.py +++ b/yescieval/__init__.py @@ -8,6 +8,6 @@ MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification, StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment, SpeculativeStatements, NoveltyIndicators) -from .judge import AutoJudge, AskAutoJudge, BioASQAutoJudge, CustomAutoJudge +from .judge import AutoJudge, AskAutoJudge, BioASQAutoJudge, CustomAutoJudge, GPTCustomAutoJudge from .parser import GPTParser diff --git a/yescieval/judge/__init__.py b/yescieval/judge/__init__.py index a3fe787..09ba18f 100644 --- a/yescieval/judge/__init__.py +++ b/yescieval/judge/__init__.py @@ -1,8 +1,9 @@ -from .judges import AutoJudge, AskAutoJudge, BioASQAutoJudge, CustomAutoJudge +from .judges import AutoJudge, AskAutoJudge, BioASQAutoJudge, CustomAutoJudge, GPTCustomAutoJudge __all__ = [ "AutoJudge", "AskAutoJudge", "BioASQAutoJudge", - "CustomAutoJudge" + "CustomAutoJudge", + "GPTCustomAutoJudge" ] \ No newline at end of file diff --git a/yescieval/judge/judges.py b/yescieval/judge/judges.py index 00c736d..0f4efc9 100644 --- a/yescieval/judge/judges.py +++ b/yescieval/judge/judges.py @@ -4,6 +4,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM from peft import PeftModel, PeftConfig import torch +from openai import OpenAI @@ -66,3 +67,98 @@ def _from_pretrained(self, model_id:str, device:str="auto", token:str =""): token=token ) return model, tokenizer + + +class GPTCustomAutoJudge(AutoJudge): + + def from_pretrained(self, model_id: str, api_key: str = None, base_url: str = None, **kwargs): + + self.model_name = model_id + + client_kwargs = {} + if api_key: + client_kwargs["api_key"] = api_key + if base_url: + client_kwargs["base_url"] = base_url + client_kwargs.update(kwargs) + + self.client = OpenAI(**client_kwargs) + return self + + def _is_reasoning_model(self) -> bool: + + model_lower = self.model_name.lower() + reasoning_prefixes = ("gpt-5", "o1", "o4" "o3", "o-1", "o-3") + return any(model_lower.startswith(prefix) for prefix in reasoning_prefixes) + + def evaluate( + self, + rubric: Rubric, + max_new_tokens: int = 300, + temperature: float = 0.0, + **kwargs + ) -> str: + + if self.client is None: + raise ValueError("Model not initialized. 
Call from_pretrained() first.") + + raw_messages = rubric.instruct() + messages = self._format_messages(raw_messages) + + params = { + "model": self.model_name, + "messages": messages, + } + + # Add model-specific parameters + if self._is_reasoning_model(): + params["max_completion_tokens"] = max_new_tokens + else: + params["max_tokens"] = max_new_tokens + params["temperature"] = temperature + + for key, value in kwargs.items(): + if key not in params: + params[key] = value + try: + response = self.client.chat.completions.create(**params) + except Exception as e: + raise RuntimeError(f"OpenAI API call failed: {str(e)}") + + if response.choices and len(response.choices) > 0: + content = response.choices[0].message.content + return content if content else "" + return "" + + def _format_messages(self, raw_messages) -> list: + + messages = [] + + # Handle string input + if isinstance(raw_messages, str): + messages.append({"role": "user", "content": raw_messages}) + + # Handle list input + elif isinstance(raw_messages, list): + for msg in raw_messages: + if isinstance(msg, str): + messages.append({"role": "user", "content": msg}) + elif isinstance(msg, dict): + if "role" in msg and "content" in msg: + messages.append(msg) + else: + raise ValueError(f"Message dict missing 'role' or 'content': {msg}") + else: + raise ValueError(f"Invalid message type in list: {type(msg)}") + + # Handle dict input (single message) + elif isinstance(raw_messages, dict): + if "role" in raw_messages and "content" in raw_messages: + messages.append(raw_messages) + else: + raise ValueError(f"Message dict missing 'role' or 'content': {raw_messages}") + + else: + raise ValueError(f"Unsupported rubric.instruct() output type: {type(raw_messages)}") + + return messages \ No newline at end of file From 93af2a4a464d52fc1b6cdfa1889c1dee7476588d Mon Sep 17 00:00:00 2001 From: MikeACedric <72818458+MikeACedric@users.noreply.github.com> Date: Tue, 6 Jan 2026 17:47:12 +0500 Subject: [PATCH 2/9] =?UTF-8?q?=F0=9F=93=9D=20Added=20documentation=20for?= =?UTF-8?q?=20Custom=20GPT=20Judge?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/judges.rst | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/docs/source/judges.rst b/docs/source/judges.rst index 0eb37af..bc60c27 100644 --- a/docs/source/judges.rst +++ b/docs/source/judges.rst @@ -89,3 +89,23 @@ For example, you can load a model and evaluate a rubric like this: print(result) This approach allows full control over which model is used for evaluation, supporting any LLM.. + +GPT Custom Judge +-------------------- + +The `GPTCustomAutoJudge` class provides a generic, flexible interface to evaluate scientific syntheses using OpenAI GPT models. + +You can use it to evaluate a rubric by providing your OpenAI API key and specifying the model ID: + +.. code-block:: python + + # Initialize and load a custom model by specifying its Hugging Face model ID + judge = GPTCustomAutoJudge() + judge.from_pretrained("gpt-5.2", api_key=OPEN_AI_API_KEY) + + # Evaluate the rubric using the loaded model + result = judge.evaluate(rubric=rubric) + + print(result) + +This allows you to leverage the capabilities of OpenAI's GPT models for scientific text evaluation. 
\ No newline at end of file From 677491815e5b1e8681dc7f3a2de2df13ba1dbbe5 Mon Sep 17 00:00:00 2001 From: MikeACedric <72818458+MikeACedric@users.noreply.github.com> Date: Mon, 12 Jan 2026 09:56:50 +0100 Subject: [PATCH 3/9] Fixed structured output parsing for GPT-4 models and updated README --- README.md | 13 +++- docs/source/judges.rst | 4 +- requirements.txt | 3 +- yescieval/judge/judges.py | 155 +++++++++++++++++++++++++------------- 4 files changed, 116 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index 5dcf9a2..3ed0573 100644 --- a/README.md +++ b/README.md @@ -88,13 +88,18 @@ Judges within YESciEval are defined as follows: | `AskAutoJudge` | Multidisciplinary judge tuned on the ORKGSyn dataset from the Open Research Knowledge Graph. | | `BioASQAutoJudge` | Biomedical domain judge tuned on the BioASQ dataset from the BioASQ challenge. | | `CustomAutoJudge`| Custom LLM that can be used as a judge within YESciEval rubrics | +| `GPTCustomAutoJudge`| Custom GPT-based LLM that can be used as a judge within YESciEval | -A total of nine evaluation rubrics were defined as part of the YESciEval test framework and can be used via ``yescieval``. Following simple example shows how to import rubrics in your code: + +A total of twenty three (23) evaluation rubrics were defined as part of the YESciEval test framework and can be used via ``yescieval``. Following simple example shows how to import rubrics in your code: ```python -from yescieval import Informativeness, Correctness, Completeness, - Coherence, Relevancy, Integration, - Cohesion, Readability, Conciseness + from yescieval import Informativeness, Correctness, Completeness, Coherence, Relevancy, + Integration, Cohesion, Readability, Conciseness, GeographicCoverage, + InterventionDiversity, BiodiversityDimensions, EcosystemServices, SpatialScale, + MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification, + StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment, + SpeculativeStatements, NoveltyIndicators ``` A complete list of rubrics are available at YESciEval [📚 Rubrics](https://yescieval.readthedocs.io/rubrics.html) page. 
diff --git a/docs/source/judges.rst b/docs/source/judges.rst index bc60c27..7c1a5ae 100644 --- a/docs/source/judges.rst +++ b/docs/source/judges.rst @@ -101,10 +101,10 @@ You can use it to evaluate a rubric by providing your OpenAI API key and specify # Initialize and load a custom model by specifying its Hugging Face model ID judge = GPTCustomAutoJudge() - judge.from_pretrained("gpt-5.2", api_key=OPEN_AI_API_KEY) + judge.from_pretrained("gpt-5.2", token=OPEN_AI_API_KEY) # Evaluate the rubric using the loaded model - result = judge.evaluate(rubric=rubric) + result = judge.judge(rubric=rubric) print(result) diff --git a/requirements.txt b/requirements.txt index eb4a74f..0478557 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,5 +6,4 @@ openai pandas numpy pydantic -pytest -openai \ No newline at end of file +pytest \ No newline at end of file diff --git a/yescieval/judge/judges.py b/yescieval/judge/judges.py index 0f4efc9..65067f8 100644 --- a/yescieval/judge/judges.py +++ b/yescieval/judge/judges.py @@ -4,6 +4,8 @@ from transformers import AutoTokenizer, AutoModelForCausalLM from peft import PeftModel, PeftConfig import torch +import json +import ast from openai import OpenAI @@ -69,96 +71,147 @@ def _from_pretrained(self, model_id:str, device:str="auto", token:str =""): return model, tokenizer -class GPTCustomAutoJudge(AutoJudge): - - def from_pretrained(self, model_id: str, api_key: str = None, base_url: str = None, **kwargs): +class GPTCustomAutoJudge(Judge): + def from_pretrained(self, model_id: str, token: str = ""): self.model_name = model_id - client_kwargs = {} - if api_key: - client_kwargs["api_key"] = api_key - if base_url: - client_kwargs["base_url"] = base_url - client_kwargs.update(kwargs) - - self.client = OpenAI(**client_kwargs) - return self + if token: + self.client = OpenAI(api_key=token) + else: + self.client = OpenAI() def _is_reasoning_model(self) -> bool: model_lower = self.model_name.lower() - reasoning_prefixes = ("gpt-5", "o1", "o4" "o3", "o-1", "o-3") + reasoning_prefixes = ("gpt-5", "o1", "o4", "o3", "o-1", "o-3") return any(model_lower.startswith(prefix) for prefix in reasoning_prefixes) - def evaluate( - self, - rubric: Rubric, - max_new_tokens: int = 300, - temperature: float = 0.0, - **kwargs - ) -> str: - - if self.client is None: + def _is_gpt4_family(self) -> bool: + model_lower = self.model_name.lower() + return model_lower.startswith(("gpt-4", "gpt-4o", "gpt-4.1")) + + def _model_family(self) -> str: + if self._is_gpt4_family(): + return "GPT-4 family" + elif self._is_reasoning_model(): + return "GPT-5 / O-series" + else: + return "Other" + + + def _judge_function_schema_single(self, rubric: Rubric) -> dict: + + rubric_id = rubric.__class__.__name__ + return { + "name": "submit_judgement", + "description": f"Return rating and rationale for rubric {rubric_id}", + "parameters": { + "type": "object", + "properties": { + rubric_id: { + "type": "object", + "properties": { + "rating": {"type": "string", "description": "Score for this rubric"}, + "rationale": {"type": "string", "description": "Explanation for the rating"} + }, + "required": ["rating", "rationale"] + } + }, + "required": [rubric_id] + } + } + + def _parse_json(self, text: str) -> dict: + + text = text.strip() + + if text.startswith("```"): + text = text.split("```")[1] + if text.startswith("json"): + text = text[4:] + + text = text.strip() + + try: + return json.loads(text) + except json.JSONDecodeError: + try: + return ast.literal_eval(text) + except: + return None + + def judge(self, 
rubric: Rubric, max_new_tokens: int = 150) -> Dict[str, Dict[str, str]]: + if not self.client: raise ValueError("Model not initialized. Call from_pretrained() first.") - raw_messages = rubric.instruct() - messages = self._format_messages(raw_messages) + messages = self._format_messages(rubric.instruct()) + rubric_id = rubric.__class__.__name__ + + if self._is_reasoning_model(): + actual_max_tokens = max_new_tokens * 10 + else: + actual_max_tokens = max_new_tokens params = { "model": self.model_name, "messages": messages, + "max_completion_tokens" if self._is_reasoning_model() else "max_tokens": actual_max_tokens } - # Add model-specific parameters - if self._is_reasoning_model(): - params["max_completion_tokens"] = max_new_tokens - else: - params["max_tokens"] = max_new_tokens - params["temperature"] = temperature + if self._is_gpt4_family(): + params["functions"] = [self._judge_function_schema_single(rubric)] + params["function_call"] = {"name": "submit_judgement"} - for key, value in kwargs.items(): - if key not in params: - params[key] = value try: response = self.client.chat.completions.create(**params) + message = response.choices[0].message except Exception as e: raise RuntimeError(f"OpenAI API call failed: {str(e)}") - if response.choices and len(response.choices) > 0: - content = response.choices[0].message.content - return content if content else "" - return "" + if self._is_gpt4_family(): + if hasattr(message, 'function_call') and message.function_call: + raw_args = message.function_call.arguments.strip() + content = self._parse_json(raw_args) or {} + else: + content = {rubric_id: {"rating": "N/A", "rationale": "No function call returned"}} + else: + raw_text = getattr(message, 'content', '') + + if raw_text: + parsed = self._parse_json(raw_text) + if parsed: + content = parsed + else: + content = {rubric_id: {"rating": "N/A", "rationale": raw_text}} + else: + if response.choices[0].finish_reason == 'length': + content = {rubric_id: {"rating": "N/A", "rationale": "Token limit reached. 
Increase max_new_tokens parameter."}} + else: + content = {rubric_id: {"rating": "N/A", "rationale": "Empty response"}} - def _format_messages(self, raw_messages) -> list: + return content + def _format_messages(self, raw_messages) -> list: + """Convert rubric.instruct() output to OpenAI message format.""" messages = [] - # Handle string input if isinstance(raw_messages, str): messages.append({"role": "user", "content": raw_messages}) - - # Handle list input elif isinstance(raw_messages, list): for msg in raw_messages: if isinstance(msg, str): messages.append({"role": "user", "content": msg}) - elif isinstance(msg, dict): - if "role" in msg and "content" in msg: - messages.append(msg) - else: - raise ValueError(f"Message dict missing 'role' or 'content': {msg}") + elif isinstance(msg, dict) and "role" in msg and "content" in msg: + messages.append(msg) else: - raise ValueError(f"Invalid message type in list: {type(msg)}") - - # Handle dict input (single message) + raise ValueError(f"Invalid message format: {msg}") elif isinstance(raw_messages, dict): if "role" in raw_messages and "content" in raw_messages: messages.append(raw_messages) else: raise ValueError(f"Message dict missing 'role' or 'content': {raw_messages}") - else: - raise ValueError(f"Unsupported rubric.instruct() output type: {type(raw_messages)}") + raise ValueError(f"Unsupported rubric.instruct() type: {type(raw_messages)}") return messages \ No newline at end of file From 6929377b6102597a50a8f6c2a10d41fe27fa422a Mon Sep 17 00:00:00 2001 From: MikeACedric <72818458+MikeACedric@users.noreply.github.com> Date: Tue, 13 Jan 2026 10:40:07 +0100 Subject: [PATCH 4/9] Added retry logic to judge API calls and clean up method signatures --- yescieval/base/rubric.py | 1 + yescieval/judge/judges.py | 148 +++++++++------------------- yescieval/rubric/breadth.py | 5 + yescieval/rubric/depth.py | 3 + yescieval/rubric/gap.py | 1 + yescieval/rubric/informativeness.py | 3 + yescieval/rubric/innovation.py | 2 + yescieval/rubric/rigor.py | 3 + yescieval/rubric/structural.py | 3 + yescieval/rubric/stylistic.py | 3 + 10 files changed, 71 insertions(+), 101 deletions(-) diff --git a/yescieval/base/rubric.py b/yescieval/base/rubric.py index 64c37e7..6ca06fe 100644 --- a/yescieval/base/rubric.py +++ b/yescieval/base/rubric.py @@ -10,6 +10,7 @@ class Rubric(BaseModel, ABC): Subclasses must implement `verbalize`. """ system_prompt_template: str + name: str = "Rubric" papers: Dict[str, str] question: str answer: str diff --git a/yescieval/judge/judges.py b/yescieval/judge/judges.py index 65067f8..edf3061 100644 --- a/yescieval/judge/judges.py +++ b/yescieval/judge/judges.py @@ -4,10 +4,11 @@ from transformers import AutoTokenizer, AutoModelForCausalLM from peft import PeftModel, PeftConfig import torch -import json -import ast +import time from openai import OpenAI +import logging +logger = logging.getLogger(__name__) class AutoJudge(Judge): @@ -72,17 +73,18 @@ def _from_pretrained(self, model_id:str, device:str="auto", token:str =""): class GPTCustomAutoJudge(Judge): + + def from_pretrained(self, model_id:str, device: str="auto", token:str =""): + if not token: + raise ValueError( + "OpenAI API token must be provided." 
+ ) - def from_pretrained(self, model_id: str, token: str = ""): self.model_name = model_id + self.client = OpenAI(api_key=token) - if token: - self.client = OpenAI(api_key=token) - else: - self.client = OpenAI() def _is_reasoning_model(self) -> bool: - model_lower = self.model_name.lower() reasoning_prefixes = ("gpt-5", "o1", "o4", "o3", "o-1", "o-3") return any(model_lower.startswith(prefix) for prefix in reasoning_prefixes) @@ -90,21 +92,10 @@ def _is_reasoning_model(self) -> bool: def _is_gpt4_family(self) -> bool: model_lower = self.model_name.lower() return model_lower.startswith(("gpt-4", "gpt-4o", "gpt-4.1")) - - def _model_family(self) -> str: - if self._is_gpt4_family(): - return "GPT-4 family" - elif self._is_reasoning_model(): - return "GPT-5 / O-series" - else: - return "Other" - - - def _judge_function_schema_single(self, rubric: Rubric) -> dict: - - rubric_id = rubric.__class__.__name__ + + def _build_rubric_evaluation_function_schema(self, rubric_id: str) -> dict: return { - "name": "submit_judgement", + "name": "evaluate_rubric", "description": f"Return rating and rationale for rubric {rubric_id}", "parameters": { "type": "object", @@ -122,96 +113,51 @@ def _judge_function_schema_single(self, rubric: Rubric) -> dict: } } - def _parse_json(self, text: str) -> dict: - - text = text.strip() - - if text.startswith("```"): - text = text.split("```")[1] - if text.startswith("json"): - text = text[4:] - - text = text.strip() - - try: - return json.loads(text) - except json.JSONDecodeError: - try: - return ast.literal_eval(text) - except: - return None - def judge(self, rubric: Rubric, max_new_tokens: int = 150) -> Dict[str, Dict[str, str]]: if not self.client: - raise ValueError("Model not initialized. Call from_pretrained() first.") - - messages = self._format_messages(rubric.instruct()) - rubric_id = rubric.__class__.__name__ + raise ValueError("Model not initialized.") - if self._is_reasoning_model(): - actual_max_tokens = max_new_tokens * 10 - else: - actual_max_tokens = max_new_tokens + messages = rubric.instruct() + rubric_id = rubric.name params = { "model": self.model_name, "messages": messages, - "max_completion_tokens" if self._is_reasoning_model() else "max_tokens": actual_max_tokens + "max_completion_tokens" if self._is_reasoning_model() else "max_tokens": max_new_tokens } if self._is_gpt4_family(): - params["functions"] = [self._judge_function_schema_single(rubric)] - params["function_call"] = {"name": "submit_judgement"} + params["functions"] = [self._build_rubric_evaluation_function_schema(rubric_id)] + params["function_call"] = {"name": "evaluate_rubric"} - try: - response = self.client.chat.completions.create(**params) - message = response.choices[0].message - except Exception as e: - raise RuntimeError(f"OpenAI API call failed: {str(e)}") - - if self._is_gpt4_family(): - if hasattr(message, 'function_call') and message.function_call: - raw_args = message.function_call.arguments.strip() - content = self._parse_json(raw_args) or {} - else: - content = {rubric_id: {"rating": "N/A", "rationale": "No function call returned"}} - else: - raw_text = getattr(message, 'content', '') - - if raw_text: - parsed = self._parse_json(raw_text) - if parsed: - content = parsed - else: - content = {rubric_id: {"rating": "N/A", "rationale": raw_text}} - else: - if response.choices[0].finish_reason == 'length': - content = {rubric_id: {"rating": "N/A", "rationale": "Token limit reached. 
Increase max_new_tokens parameter."}} + while True: + try: + response = self.client.chat.completions.create(**params) + message = response.choices[0].message + + if self._is_gpt4_family(): + if hasattr(message, 'function_call') and message.function_call: + raw_args = message.function_call.arguments.strip() + content = eval(raw_args) if raw_args else {} + else: + content = {rubric_id: {"rating": "N/A", "rationale": "No function call returned"}} else: - content = {rubric_id: {"rating": "N/A", "rationale": "Empty response"}} + raw_text = getattr(message, 'content', '') + if raw_text: + try: + content = eval(raw_text) + except Exception: + content = {rubric_id: {"rating": "N/A", "rationale": raw_text}} + else: + if response.choices[0].finish_reason == 'length': + content = {rubric_id: {"rating": "N/A", "rationale": "Token limit reached. Increase max_new_tokens parameter."}} + else: + content = {rubric_id: {"rating": "N/A", "rationale": "Empty response"}} + + break + + except Exception as e: + logger.warning(f"API call failed, retrying in 4 seconds: {e}") + time.sleep(4) return content - - def _format_messages(self, raw_messages) -> list: - """Convert rubric.instruct() output to OpenAI message format.""" - messages = [] - - if isinstance(raw_messages, str): - messages.append({"role": "user", "content": raw_messages}) - elif isinstance(raw_messages, list): - for msg in raw_messages: - if isinstance(msg, str): - messages.append({"role": "user", "content": msg}) - elif isinstance(msg, dict) and "role" in msg and "content" in msg: - messages.append(msg) - else: - raise ValueError(f"Invalid message format: {msg}") - elif isinstance(raw_messages, dict): - if "role" in raw_messages and "content" in raw_messages: - messages.append(raw_messages) - else: - raise ValueError(f"Message dict missing 'role' or 'content': {raw_messages}") - else: - raise ValueError(f"Unsupported rubric.instruct() type: {type(raw_messages)}") - - return messages \ No newline at end of file diff --git a/yescieval/rubric/breadth.py b/yescieval/rubric/breadth.py index efaf0bc..0ed9d2d 100644 --- a/yescieval/rubric/breadth.py +++ b/yescieval/rubric/breadth.py @@ -51,6 +51,7 @@ Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class GeographicCoverage(Rubric): + name: str = "Geographic Coverage" system_prompt_template: str = geographic_coverage_prompt intervention_diversity_prompt = """ @@ -104,6 +105,7 @@ class GeographicCoverage(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class InterventionDiversity(Rubric): + name: str = "Intervention Diversity" system_prompt_template: str = intervention_diversity_prompt biodiversity_dimensions_prompt = """ @@ -157,6 +159,7 @@ class InterventionDiversity(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class BiodiversityDimensions(Rubric): + name: str = "Biodiversity Dimensions" system_prompt_template: str = biodiversity_dimensions_prompt ecosystem_services_prompt = """ @@ -210,6 +213,7 @@ class BiodiversityDimensions(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. 
Ensure your rationale is objective and backed by specific examples from the provided material. """ class EcosystemServices(Rubric): + name: str = "Ecosystem Services" system_prompt_template: str = ecosystem_services_prompt spatial_scale_prompt = """ @@ -263,6 +267,7 @@ class EcosystemServices(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class SpatialScale(Rubric): + name: str = "Spatial Scale" system_prompt_template: str = spatial_scale_prompt diff --git a/yescieval/rubric/depth.py b/yescieval/rubric/depth.py index 04aeb00..984ddf5 100644 --- a/yescieval/rubric/depth.py +++ b/yescieval/rubric/depth.py @@ -50,6 +50,7 @@ Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class MechanisticUnderstanding(Rubric): + name: str = "Mechanistic Understanding" system_prompt_template: str = mechanistic_understanding_prompt causal_reasoning_prompt = """ @@ -103,6 +104,7 @@ class MechanisticUnderstanding(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class CausalReasoning(Rubric): + name: str = "Causal Reasoning" system_prompt_template: str = causal_reasoning_prompt temporal_precision_prompt = """ @@ -156,5 +158,6 @@ class CausalReasoning(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class TemporalPrecision(Rubric): + name: str = "Temporal Precision" system_prompt_template: str = temporal_precision_prompt diff --git a/yescieval/rubric/gap.py b/yescieval/rubric/gap.py index facdcbd..6cf6dcb 100644 --- a/yescieval/rubric/gap.py +++ b/yescieval/rubric/gap.py @@ -51,4 +51,5 @@ Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class GapIdentification(Rubric): + name: str = "Gap Identification" system_prompt_template: str = gap_identification_prompt diff --git a/yescieval/rubric/informativeness.py b/yescieval/rubric/informativeness.py index 9fd6788..bdfb448 100644 --- a/yescieval/rubric/informativeness.py +++ b/yescieval/rubric/informativeness.py @@ -51,6 +51,7 @@ Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class Correctness(Rubric): + name: str = "Correctness" system_prompt_template: str = correctness_prompt completeness_prompt = """ @@ -104,6 +105,7 @@ class Correctness(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class Completeness(Rubric): + name: str = "Completeness" system_prompt_template: str = completeness_prompt informativeness_prompt = """ @@ -157,5 +159,6 @@ class Completeness(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. 
Ensure your rationale is objective and backed by specific examples from the provided material. """ class Informativeness(Rubric): + name: str = "Informativeness" system_prompt_template: str = informativeness_prompt diff --git a/yescieval/rubric/innovation.py b/yescieval/rubric/innovation.py index 290405a..7a0bd80 100644 --- a/yescieval/rubric/innovation.py +++ b/yescieval/rubric/innovation.py @@ -51,6 +51,7 @@ Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class SpeculativeStatements(Rubric): + name: str = "Speculative Statements" system_prompt_template: str = speculative_statements_prompt novelty_indicators_prompt = """ @@ -104,6 +105,7 @@ class SpeculativeStatements(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class NoveltyIndicators(Rubric): + name: str = "Novelty Indicators" system_prompt_template: str = novelty_indicators_prompt diff --git a/yescieval/rubric/rigor.py b/yescieval/rubric/rigor.py index 62c4aaf..db2e7d3 100644 --- a/yescieval/rubric/rigor.py +++ b/yescieval/rubric/rigor.py @@ -51,6 +51,7 @@ Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class StatisticalSophistication(Rubric): + name: str = "Statistical Sophistication" system_prompt_template: str = statistical_sophistication_prompt citation_practices_prompt = """ @@ -104,6 +105,7 @@ class StatisticalSophistication(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class CitationPractices(Rubric): + name: str = "Citation Practices" system_prompt_template: str = citation_practices_prompt uncertainty_acknowledgement_prompt = """ @@ -157,5 +159,6 @@ class CitationPractices(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class UncertaintyAcknowledgment(Rubric): + name: str = "Uncertainty Acknowledgement" system_prompt_template: str = uncertainty_acknowledgement_prompt diff --git a/yescieval/rubric/structural.py b/yescieval/rubric/structural.py index a968642..6b83550 100644 --- a/yescieval/rubric/structural.py +++ b/yescieval/rubric/structural.py @@ -51,6 +51,7 @@ Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class Coherence(Rubric): + name: str = "Coherence" system_prompt_template: str = coherence_prompt integration_prompt = """ @@ -104,6 +105,7 @@ class Coherence(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class Integration(Rubric): + name: str = "Integration" system_prompt_template: str = integration_prompt relevancy_prompt = """ @@ -157,4 +159,5 @@ class Integration(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. 
Ensure your rationale is objective and backed by specific examples from the provided material. """ class Relevancy(Rubric): + name: str = "Relevancy" system_prompt_template: str = relevancy_prompt diff --git a/yescieval/rubric/stylistic.py b/yescieval/rubric/stylistic.py index b369fdf..0e92757 100644 --- a/yescieval/rubric/stylistic.py +++ b/yescieval/rubric/stylistic.py @@ -52,6 +52,7 @@ """ class Cohesion(Rubric): + name: str = "Cohesion" system_prompt_template: str = cohesion_prompt @@ -106,6 +107,7 @@ class Cohesion(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class Conciseness(Rubric): + name: str = "Conciseness" system_prompt_template: str = conciseness_prompt readability_prompt = """ @@ -159,5 +161,6 @@ class Conciseness(Rubric): Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. """ class Readability(Rubric): + name: str = "Readability" system_prompt_template: str = readability_prompt From bc4f7a5ecde546053fa88a50e0e17c8ad925cb36 Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Tue, 13 Jan 2026 12:00:17 +0100 Subject: [PATCH 5/9] :bug: fix deep research prompts --- yescieval/rubric/breadth.py | 20 ++++++++++---------- yescieval/rubric/depth.py | 12 ++++++------ yescieval/rubric/gap.py | 4 ++-- yescieval/rubric/innovation.py | 8 ++++---- yescieval/rubric/rigor.py | 12 ++++++------ 5 files changed, 28 insertions(+), 28 deletions(-) diff --git a/yescieval/rubric/breadth.py b/yescieval/rubric/breadth.py index 0ed9d2d..dfcc99e 100644 --- a/yescieval/rubric/breadth.py +++ b/yescieval/rubric/breadth.py @@ -22,7 +22,7 @@ -1. geographic_coverage: is the information in the answer a correct representation of the spatial scope of the provided abstracts? +1. Geographic Coverage: is the information in the answer a correct representation of the spatial scope of the provided abstracts? @@ -42,7 +42,7 @@ { - "geographic_coverage": {"rating": "4", "rationale": "The synthesis accurately represents multiple regions and scales from the provided abstracts, with only minor omissions or irrelevant details."} + "Geographic Coverage": {"rating": "4", "rationale": "The synthesis accurately represents multiple regions and scales from the provided abstracts, with only minor omissions or irrelevant details."} } @@ -76,7 +76,7 @@ class GeographicCoverage(Rubric): -1. intervention_diversity: is the answer a comprehensive encapsulation of the relevant information in the provided abstracts, measured by the number of unique management practices? +1. Intervention Diversity: is the answer a comprehensive encapsulation of the relevant information in the provided abstracts, measured by the number of unique management practices? @@ -96,7 +96,7 @@ class GeographicCoverage(Rubric): { - "intervention_diversity": {"rating": "4", "rationale": "The answer includes almost all relevant interventions from the provided abstracts, with only minor details missing."} + "Intervention Diversity": {"rating": "4", "rationale": "The answer includes almost all relevant interventions from the provided abstracts, with only minor details missing."} } @@ -130,7 +130,7 @@ class InterventionDiversity(Rubric): -1. 
biodiversity_dimensions: is the answer a comprehensive representation of the relevant biodiversity information in the provided abstracts, measured by the presence of terms related to taxonomic, functional, phylogenetic, and spatial diversity? +1. Biodiversity Dimensions: is the answer a comprehensive representation of the relevant biodiversity information in the provided abstracts, measured by the presence of terms related to taxonomic, functional, phylogenetic, and spatial diversity? @@ -150,7 +150,7 @@ class InterventionDiversity(Rubric): { - "biodiversity_dimensions": {"rating": "4", "rationale": "Most information is informative for the research question, capturing the key biodiversity dimensions with minor omissions."} + "Biodiversity Dimensions": {"rating": "4", "rationale": "Most information is informative for the research question, capturing the key biodiversity dimensions with minor omissions."} } @@ -184,7 +184,7 @@ class BiodiversityDimensions(Rubric): -1. ecosystem_services: is the answer a useful and informative reply to the question, measured by the presence of terms matched against a vocabulary aligned with the Millennium Ecosystem Assessment? +1. Ecosystem Services: is the answer a useful and informative reply to the question, measured by the presence of terms matched against a vocabulary aligned with the Millennium Ecosystem Assessment? @@ -204,7 +204,7 @@ class BiodiversityDimensions(Rubric): { - "ecosystem_services": {"rating": "4", "rationale": "The synthesis includes nearly all relevant ecosystem services from the provided abstracts, with only minor omissions."} + "Ecosystem Services": {"rating": "4", "rationale": "The synthesis includes nearly all relevant ecosystem services from the provided abstracts, with only minor omissions."} } @@ -238,7 +238,7 @@ class EcosystemServices(Rubric): -1. spatial_scale: is the answer a useful and informative reply to the question, measured by the presence of explicit scale terms (e.g., “local,” “regional,” “continental”) and area measures? +1. Spatial Scale: is the answer a useful and informative reply to the question, measured by the presence of explicit scale terms (e.g., “local,” “regional,” “continental”) and area measures? @@ -258,7 +258,7 @@ class EcosystemServices(Rubric): { - "spatial_scale": {"rating": "4", "rationale": "The synthesis includes nearly all relevant spatial scale information from the provided abstracts, with only minor omissions."} + "Spatial Scale": {"rating": "4", "rationale": "The synthesis includes nearly all relevant spatial scale information from the provided abstracts, with only minor omissions."} } diff --git a/yescieval/rubric/depth.py b/yescieval/rubric/depth.py index 984ddf5..3e12dc3 100644 --- a/yescieval/rubric/depth.py +++ b/yescieval/rubric/depth.py @@ -22,7 +22,7 @@ -1. mechanistic_understanding: does the answer reflect understanding of ecological processes by explicitly mentioning recognized mechanisms such as feedbacks, nutrient cycling, or trophic cascades? +1. Mechanistic Understanding: does the answer reflect understanding of ecological processes by explicitly mentioning recognized mechanisms such as feedbacks, nutrient cycling, or trophic cascades? 
@@ -41,7 +41,7 @@ { - "mechanistic_understanding": {"rating": "4", "rationale": "The answer explains a clear multi-step ecological mechanism using causal language, but some temporal or boundary details are only briefly addressed."} + "Mechanistic Understanding": {"rating": "4", "rationale": "The answer explains a clear multi-step ecological mechanism using causal language, but some temporal or boundary details are only briefly addressed."} } @@ -75,7 +75,7 @@ class MechanisticUnderstanding(Rubric): -1. causal_reasoning: does the answer explicitly express cause–effect relationships using causal connectives (e.g., “because,” “due to”), result indicators (e.g., “results in,” “induces”), or mechanistic verbs (e.g., “drives,” “regulates”) when describing ecological processes? +1. Causal Reasoning: does the answer explicitly express cause–effect relationships using causal connectives (e.g., “because,” “due to”), result indicators (e.g., “results in,” “induces”), or mechanistic verbs (e.g., “drives,” “regulates”) when describing ecological processes? @@ -95,7 +95,7 @@ class MechanisticUnderstanding(Rubric): { - "causal_reasoning": {"rating": "4", "rationale": "The answer uses clear causal connectors and describes a multi-step cause–effect relationship."} + "Causal Reasoning": {"rating": "4", "rationale": "The answer uses clear causal connectors and describes a multi-step cause–effect relationship."} } @@ -129,7 +129,7 @@ class CausalReasoning(Rubric): -1. temporal_precision: does the answer include specific and explicit temporal references, such as quantified time intervals or dated events, rather than vague or unspecific timing? +1. Temporal Precision: does the answer include specific and explicit temporal references, such as quantified time intervals or dated events, rather than vague or unspecific timing? @@ -149,7 +149,7 @@ class CausalReasoning(Rubric): { - "temporal_precision": {"rating": "4", "rationale": "The answer includes several specific timeframes or durations that are clearly linked to the described processes, though some timing details could be more precise."} + "Temporal Precision": {"rating": "4", "rationale": "The answer includes several specific timeframes or durations that are clearly linked to the described processes, though some timing details could be more precise."} } diff --git a/yescieval/rubric/gap.py b/yescieval/rubric/gap.py index 6cf6dcb..8c6fa2a 100644 --- a/yescieval/rubric/gap.py +++ b/yescieval/rubric/gap.py @@ -22,7 +22,7 @@ -1. gap_identification: To what extent does the answer explicitly identify research gaps or unanswered questions indicated by the provided abstracts? +1. Gap Identification: To what extent does the answer explicitly identify research gaps or unanswered questions indicated by the provided abstracts? @@ -42,7 +42,7 @@ { - "gap_identification": {"rating": "4", "rationale": "Identifies a relevant gap supported by the abstracts, with limited elaboration."} + "Gap Identification": {"rating": "4", "rationale": "Identifies a relevant gap supported by the abstracts, with limited elaboration."} } diff --git a/yescieval/rubric/innovation.py b/yescieval/rubric/innovation.py index 7a0bd80..628fa5d 100644 --- a/yescieval/rubric/innovation.py +++ b/yescieval/rubric/innovation.py @@ -22,7 +22,7 @@ -1. speculative_statement: Does the answer clearly distinguish speculation (e.g., “might,” “could”) from established findings in the provided abstracts? +1. 
Speculative Statements: Does the answer clearly distinguish speculation (e.g., “might,” “could”) from established findings in the provided abstracts? @@ -42,7 +42,7 @@ { - "speculative_statement": {"rating": "4", "rationale": "Uses hedging appropriately and clearly distinguishes speculation from established findings."} + "Speculative Statements": {"rating": "4", "rationale": "Uses hedging appropriately and clearly distinguishes speculation from established findings."} } @@ -76,7 +76,7 @@ class SpeculativeStatements(Rubric): -1. novelty_indicators: Does the answer appropriately use self-declared innovation terms (e.g., “novel,” “pioneering,” “emerging”) and clearly indicate whether such claims are supported by the provided abstracts? +1. Novelty Indicators: Does the answer appropriately use self-declared innovation terms (e.g., “novel,” “pioneering,” “emerging”) and clearly indicate whether such claims are supported by the provided abstracts? @@ -96,7 +96,7 @@ class SpeculativeStatements(Rubric): { - "novelty_indicators": {"rating": "4", "rationale": "Shows a clear novel angle, but lacks full detail."} + "Novelty Indicators": {"rating": "4", "rationale": "Shows a clear novel angle, but lacks full detail."} } diff --git a/yescieval/rubric/rigor.py b/yescieval/rubric/rigor.py index db2e7d3..cd428a4 100644 --- a/yescieval/rubric/rigor.py +++ b/yescieval/rubric/rigor.py @@ -22,7 +22,7 @@ -1. statistical_sophistication: Does the answer reflect quantitative depth through the use of inferential statistics or analysis methods described in the abstracts? +1. Statistical Sophistication: Does the answer reflect quantitative depth through the use of inferential statistics or analysis methods described in the abstracts? @@ -42,7 +42,7 @@ { - "statistical_sophistication": {"rating": "3", "rationale": "The synthesis provides some methodological details and basic statistics, but does not fully discuss limitations or reproducibility.""} + "Statistical Sophistication": {"rating": "3", "rationale": "The synthesis provides some methodological details and basic statistics, but does not fully discuss limitations or reproducibility.""} } @@ -76,7 +76,7 @@ class StatisticalSophistication(Rubric): -1. citation_practices: is the answer supported by appropriate references, using parenthetical or narrative citations, for the relevant information in the provided abstracts? +1. Citation Practices: is the answer supported by appropriate references, using parenthetical or narrative citations, for the relevant information in the provided abstracts? @@ -96,7 +96,7 @@ class StatisticalSophistication(Rubric): { - "citation_practices": {"rating": "3", "rationale": "Some claims are supported with citations, but several important points lack references or use inconsistent citation style."} + "Citation Practices": {"rating": "3", "rationale": "Some claims are supported with citations, but several important points lack references or use inconsistent citation style."} } @@ -130,7 +130,7 @@ class CitationPractices(Rubric): -1. uncertainty_acknowledgement: does the answer explicitly discuss limitations, uncertainty, or gaps in evidence (e.g., using terms like “unknown,” “limited evidence,” or “unclear”)? +1. Uncertainty Acknowledgement: does the answer explicitly discuss limitations, uncertainty, or gaps in evidence (e.g., using terms like “unknown,” “limited evidence,” or “unclear”)? 
@@ -150,7 +150,7 @@ class CitationPractices(Rubric): { - "uncertainty_acknowledgement": {"rating": "4", "rationale": "The answer clearly acknowledges key uncertainties and limitations in the study."} + "Uncertainty Acknowledgement": {"rating": "4", "rationale": "The answer clearly acknowledges key uncertainties and limitations in the study."} } From e4940754734fe42581377297450001e618b5ac57 Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Tue, 13 Jan 2026 12:02:05 +0100 Subject: [PATCH 6/9] :sparkles: update judge io --- yescieval/base/judge.py | 4 ++-- yescieval/judge/judges.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/yescieval/base/judge.py b/yescieval/base/judge.py index 5ef75ed..3178d2b 100644 --- a/yescieval/base/judge.py +++ b/yescieval/base/judge.py @@ -1,6 +1,6 @@ from abc import ABC from typing import Dict, Any -from . import Parser, Rubric +from . import Rubric, RubricLikertScale class Judge(ABC): @@ -8,7 +8,7 @@ class Judge(ABC): def from_pretrained(self, model_id:str, device: str="auto", token:str =""): self.model, self.tokenizer = self._from_pretrained(model_id=model_id, device=device, token=token) - def judge(self, rubric: Rubric, max_new_tokens: int=150) -> Dict[str, Dict[str, str]]: + def judge(self, rubric: Rubric, max_new_tokens: int=150) -> Dict[str, Dict[str, str]] | str | RubricLikertScale: pass def _from_pretrained(self, model_id: str, device: str = "auto", token: str = "") -> [Any, Any]: diff --git a/yescieval/judge/judges.py b/yescieval/judge/judges.py index edf3061..0810805 100644 --- a/yescieval/judge/judges.py +++ b/yescieval/judge/judges.py @@ -29,7 +29,7 @@ def _from_pretrained(self, model_id:str, device:str="auto", token:str =""): model = PeftModel.from_pretrained(base_model, model_id) return model, tokenizer - def evaluate(self, rubric: Rubric, max_new_tokens: int=150) -> Dict[str, Dict[str, str]]: + def judge(self, rubric: Rubric, max_new_tokens: int=150) -> str: inputs = self.tokenizer.apply_chat_template(rubric.instruct(), add_generation_prompt=True, return_dict=True, From 71b22bd96ae90f7b5ca560e121614ccea67779ed Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Tue, 13 Jan 2026 12:02:31 +0100 Subject: [PATCH 7/9] :sparkles: add custom judges --- yescieval/judge/__init__.py | 3 +- yescieval/judge/custom.py | 97 +++++++++++++++++++++++++++++++ yescieval/judge/judges.py | 110 ------------------------------------ 3 files changed, 99 insertions(+), 111 deletions(-) create mode 100644 yescieval/judge/custom.py diff --git a/yescieval/judge/__init__.py b/yescieval/judge/__init__.py index 09ba18f..d0d69e3 100644 --- a/yescieval/judge/__init__.py +++ b/yescieval/judge/__init__.py @@ -1,4 +1,5 @@ -from .judges import AutoJudge, AskAutoJudge, BioASQAutoJudge, CustomAutoJudge, GPTCustomAutoJudge +from .judges import AutoJudge, AskAutoJudge, BioASQAutoJudge +from .custom import CustomAutoJudge, GPTCustomAutoJudge __all__ = [ "AutoJudge", diff --git a/yescieval/judge/custom.py b/yescieval/judge/custom.py new file mode 100644 index 0000000..c44d1cf --- /dev/null +++ b/yescieval/judge/custom.py @@ -0,0 +1,97 @@ +from ..base import Judge, Rubric, RubricLikertScale +from .judges import AutoJudge + +import time +from typing import Dict, List +from openai import OpenAI +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch +import logging + +logger = logging.getLogger(__name__) + +class CustomAutoJudge(AutoJudge): + + def _from_pretrained(self, model_id:str, device:str="auto", token:str =""): + tokenizer = 
AutoTokenizer.from_pretrained(model_id, + padding_side="left", + token=token) + tokenizer.pad_token = tokenizer.eos_token + model = AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.float32, + device_map=device, + token=token + ) + return model, tokenizer + + +class GPTCustomAutoJudge(Judge): + + def from_pretrained(self, model_id: str, device: str = "auto", token: str = ""): + if not token: + raise ValueError("OpenAI API token must be provided.") + self.model_name = model_id + self.client = OpenAI(api_key=token) + + def _supports_function_calling(self) -> bool: + gpt_4_prefixes = ( + "gpt-4", # gpt4 family including gpt-4o, gpt-4o-mini, gpt-4.1, ... + "GPT-3.5", # gpt-3.5 family + ) + return any(self.model_name.startswith(prefix) for prefix in gpt_4_prefixes) + + def _output_schema(self) -> List[Dict]: + return [ + { + "name": "response_format", + "description": f"Return the `rating` and `rationale` only as a response.", + "parameters": { + "type": "object", + "properties": { + 'rating': { + "type": "number", + "description": "A numerical rating assigned to the characteristic.", + "minimum": 1, + "maximum": 5 + }, + "rationale": { + "type": "string", + "description": "The explanation for the assigned rating." + }, + }, + "required": ["rating", "rationale"] + } + } + ] + + def judge(self, rubric: Rubric, max_new_tokens: int = 150) -> RubricLikertScale: + if not self.client: + raise ValueError("Model not initialized.") + messages = rubric.instruct() + params = { + "model": self.model_name, + "messages": messages + } + if self._supports_function_calling(): + params["functions"] = self._output_schema() + + try_counter = 0 + while True: + try: + try_counter += 1 + response = self.client.chat.completions.create(**params) + message = response.choices[0].message + if self._supports_function_calling(): + parsed_output = eval(message.function_call.arguments) + else: + parsed_output = eval(message.content)[rubric.name] + evaluation = RubricLikertScale(rating=parsed_output['rating'], rationale=parsed_output['rationale']) + return evaluation + + except Exception as e: + logger.error(f"{try_counter} times failed attempt!") + logger.warning(f"API call failed, retrying in 4 seconds: {e}") + time.sleep(5) + + diff --git a/yescieval/judge/judges.py b/yescieval/judge/judges.py index 0810805..c700b28 100644 --- a/yescieval/judge/judges.py +++ b/yescieval/judge/judges.py @@ -4,8 +4,6 @@ from transformers import AutoTokenizer, AutoModelForCausalLM from peft import PeftModel, PeftConfig import torch -import time -from openai import OpenAI import logging logger = logging.getLogger(__name__) @@ -53,111 +51,3 @@ def from_pretrained(self, model_id: str = "SciKnowOrg/YESciEval-BioASQ-Llama-3.1 device: str = "auto", token: str = ""): self.model, self.tokenizer = super()._from_pretrained(model_id=model_id, device=device, token=token) - - - -class CustomAutoJudge(AutoJudge): - - def _from_pretrained(self, model_id:str, device:str="auto", token:str =""): - tokenizer = AutoTokenizer.from_pretrained(model_id, - padding_side="left", - token=token) - tokenizer.pad_token = tokenizer.eos_token - model = AutoModelForCausalLM.from_pretrained( - model_id, - torch_dtype=torch.float32, - device_map=device, - token=token - ) - return model, tokenizer - - -class GPTCustomAutoJudge(Judge): - - def from_pretrained(self, model_id:str, device: str="auto", token:str =""): - if not token: - raise ValueError( - "OpenAI API token must be provided." 
- ) - - self.model_name = model_id - self.client = OpenAI(api_key=token) - - - def _is_reasoning_model(self) -> bool: - model_lower = self.model_name.lower() - reasoning_prefixes = ("gpt-5", "o1", "o4", "o3", "o-1", "o-3") - return any(model_lower.startswith(prefix) for prefix in reasoning_prefixes) - - def _is_gpt4_family(self) -> bool: - model_lower = self.model_name.lower() - return model_lower.startswith(("gpt-4", "gpt-4o", "gpt-4.1")) - - def _build_rubric_evaluation_function_schema(self, rubric_id: str) -> dict: - return { - "name": "evaluate_rubric", - "description": f"Return rating and rationale for rubric {rubric_id}", - "parameters": { - "type": "object", - "properties": { - rubric_id: { - "type": "object", - "properties": { - "rating": {"type": "string", "description": "Score for this rubric"}, - "rationale": {"type": "string", "description": "Explanation for the rating"} - }, - "required": ["rating", "rationale"] - } - }, - "required": [rubric_id] - } - } - - def judge(self, rubric: Rubric, max_new_tokens: int = 150) -> Dict[str, Dict[str, str]]: - if not self.client: - raise ValueError("Model not initialized.") - - messages = rubric.instruct() - rubric_id = rubric.name - - params = { - "model": self.model_name, - "messages": messages, - "max_completion_tokens" if self._is_reasoning_model() else "max_tokens": max_new_tokens - } - - if self._is_gpt4_family(): - params["functions"] = [self._build_rubric_evaluation_function_schema(rubric_id)] - params["function_call"] = {"name": "evaluate_rubric"} - - while True: - try: - response = self.client.chat.completions.create(**params) - message = response.choices[0].message - - if self._is_gpt4_family(): - if hasattr(message, 'function_call') and message.function_call: - raw_args = message.function_call.arguments.strip() - content = eval(raw_args) if raw_args else {} - else: - content = {rubric_id: {"rating": "N/A", "rationale": "No function call returned"}} - else: - raw_text = getattr(message, 'content', '') - if raw_text: - try: - content = eval(raw_text) - except Exception: - content = {rubric_id: {"rating": "N/A", "rationale": raw_text}} - else: - if response.choices[0].finish_reason == 'length': - content = {rubric_id: {"rating": "N/A", "rationale": "Token limit reached. Increase max_new_tokens parameter."}} - else: - content = {rubric_id: {"rating": "N/A", "rationale": "Empty response"}} - - break - - except Exception as e: - logger.warning(f"API call failed, retrying in 4 seconds: {e}") - time.sleep(4) - - return content From 071d15fd186cd026246ea4261a54193d92d10c9f Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Tue, 13 Jan 2026 12:02:48 +0100 Subject: [PATCH 8/9] :memo: update docs --- README.md | 16 ++++++++-------- docs/source/judges.rst | 17 +++++++++++++---- docs/source/quickstart.rst | 30 +++++++++++++++++++++++++++--- docs/source/rubrics.rst | 1 + 4 files changed, 49 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index ac6f324..f00a219 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ judge.from_pretrained( ) # Step 3: Evaluate the answer -result = judge.evaluate(rubric=rubric) +result = judge.judge(rubric=rubric) print("Raw Evaluation Output:") print(result) ``` @@ -91,15 +91,15 @@ Judges within YESciEval are defined as follows: | `GPTCustomAutoJudge`| Custom GPT-based LLM that can be used as a judge within YESciEval | -A total of twenty three (23) evaluation rubrics were defined as part of the YESciEval test framework and can be used via ``yescieval``. 
Following simple example shows how to import rubrics in your code:
+A total of **23** evaluation rubrics were defined as part of the YESciEval test framework and can be used via ``yescieval``. The following example shows how to import rubrics in your code:
 
 ```python
-    from yescieval import Informativeness, Correctness, Completeness, Coherence, Relevancy,
-                    Integration, Cohesion, Readability, Conciseness, GeographicCoverage,
-                    InterventionDiversity, BiodiversityDimensions, EcosystemServices, SpatialScale,
-                    MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification,
-                    StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment,
-                    SpeculativeStatements, NoveltyIndicators
+from yescieval import Informativeness, Correctness, Completeness, Coherence, Relevancy, \
+    Integration, Cohesion, Readability, Conciseness, GeographicCoverage, \
+    InterventionDiversity, BiodiversityDimensions, EcosystemServices, SpatialScale, \
+    MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification, \
+    StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment, \
+    SpeculativeStatements, NoveltyIndicators
 ```
 
 A complete list of rubrics are available at YESciEval [📚 Rubrics](https://yescieval.readthedocs.io/rubrics.html) page.
diff --git a/docs/source/judges.rst b/docs/source/judges.rst
index 7c1a5ae..14621a3 100644
--- a/docs/source/judges.rst
+++ b/docs/source/judges.rst
@@ -48,7 +48,7 @@ The following example demonstrates how to create an evaluation rubric, load a ju
       device="cpu")
 
     # Step 3: Evaluate the answer
-    result = judge.evaluate(rubric=rubric)
+    result = judge.judge(rubric=rubric)
 
     print("Raw Evaluation Output:")
     print(result)
@@ -84,7 +84,7 @@ For example, you can load a model and evaluate a rubric like this:
     judge.from_pretrained(model_id="Qwen/Qwen3-8B", device="cpu", token="your_huggingface_token")
 
     # Evaluate the rubric using the loaded model
-    result = judge.evaluate(rubric=rubric)
+    result = judge.judge(rubric=rubric)
 
     print(result)
 
@@ -101,11 +101,20 @@ You can use it to evaluate a rubric by providing your OpenAI API key and specify
 
     # Initialize and load a custom model by specifying its Hugging Face model ID
     judge = GPTCustomAutoJudge()
-    judge.from_pretrained("gpt-5.2", token=OPEN_AI_API_KEY)
+    judge.from_pretrained(model_id="gpt-5.2", token=OPEN_AI_API_KEY)
 
     # Evaluate the rubric using the loaded model
     result = judge.judge(rubric=rubric)
 
-    print(result)
+    print(result.model_dump())
+
+As a result, the output will be in the following format:
+
+.. code-block:: json
+
+    {
+      "rating": rating-value,
+      "rationale": "rationale-text"
+    }
 
 This allows you to leverage the capabilities of OpenAI's GPT models for scientific text evaluation.
\ No newline at end of file
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
index fdbb3e3..8f7fbdc 100644
--- a/docs/source/quickstart.rst
+++ b/docs/source/quickstart.rst
@@ -35,7 +35,7 @@ YESciEval is a library designed to evaluate the quality of synthesized scientifi
     judge.from_pretrained(token="your_huggingface_token", device="cpu")
 
     # Step 3: Evaluate the answer
-    result = judge.evaluate(rubric=rubric)
+    result = judge.judge(rubric=rubric)
 
     print("Raw Evaluation Output:")
     print(result)
@@ -62,7 +62,7 @@ YESciEval is a library designed to evaluate the quality of synthesized scientifi
     judge.from_pretrained(model_id="Qwen/Qwen3-8B", device="cpu", token="your_huggingface_token")
 
     # Step 3: Evaluate the answer
-    result = judge.evaluate(rubric=rubric)
+    result = judge.judge(rubric=rubric)
 
     print("Raw Evaluation Output:")
     print(result)
@@ -81,7 +81,7 @@ If the model outputs unstructured or loosely structured text, you can use GPTPar
     parsed = parser.parse(raw_output=raw_output)
 
     print("Parsed Output:")
-    print(parsed)
+    print(parsed.model_dump())
 
 **Expected Output Format**
 
@@ -92,6 +92,30 @@ If the model outputs unstructured or loosely structured text, you can use GPTPar
       "rationale": "The answer covers key aspects of how AI is applied in healthcare, such as diagnostics and personalized medicine."
     }
 
+If you prefer not to use ``.model_dump()``, the output follows the schema below, so you can use ``result.rating`` to access the rating value and ``result.rationale`` to access the textual explanation for the rating.
+
+.. code-block::
+
+    {
+        'properties': {
+            'rating': {
+                'description': 'Rating from 1 to 5',
+                'maximum': 5,
+                'minimum': 1,
+                'title': 'Rating',
+                'type': 'integer'
+            },
+            'rationale': {
+                'description': 'Textual explanation for the rating',
+                'title': 'Rationale',
+                'type': 'string'
+            }
+        },
+        'required': ['rating', 'rationale'],
+        'title': 'RubricLikertScale',
+        'type': 'object'
+    }
+
 .. hint:: Key Components
 
     +------------------+-------------------------------------------------------+
diff --git a/docs/source/rubrics.rst b/docs/source/rubrics.rst
index e264f21..b38498c 100644
--- a/docs/source/rubrics.rst
+++ b/docs/source/rubrics.rst
@@ -188,3 +188,4 @@ And to use rubrics:
 
     instruction = rubric.instruct()
     print(instruction)
+    print(rubric.name)

From 2ad66fb2789afdb847f19371317a9b1443f8fcdd Mon Sep 17 00:00:00 2001
From: Hamed Babaei Giglou
Date: Tue, 13 Jan 2026 12:09:10 +0100
Subject: [PATCH 9/9] :bookmark: v0.4.0

---
 CHANGELOG.md      | 6 ++++++
 yescieval/VERSION | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 262c769..1bb7366 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 ## Changelog
 
+### v0.4.0 (January 13, 2026)
+- Add a GPT custom judge (PR #5)
+- Update documentation
+- Minor bug fixes in deep research rubrics and judges
+- Update README
+
 ### v0.3.0 (December 20, 2025)
 - Add more rubrics (PR #3)
 - Update documentation for new rubrics
diff --git a/yescieval/VERSION b/yescieval/VERSION
index 9325c3c..60a2d3e 100644
--- a/yescieval/VERSION
+++ b/yescieval/VERSION
@@ -1 +1 @@
-0.3.0
\ No newline at end of file
+0.4.0
\ No newline at end of file
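
For reference, below is a minimal end-to-end sketch of how the `GPTCustomAutoJudge` introduced in this series is intended to be used, based only on the API visible in the patches above (`from_pretrained(model_id, token)`, `judge(rubric)`, and the `RubricLikertScale` result). The abstracts, question, answer, model ID, and API key are placeholder values and not part of the patch series.

```python
# Usage sketch for the GPTCustomAutoJudge added in yescieval v0.4.0.
# The papers, question, answer, model ID, and API key are illustrative placeholders.
from yescieval import Informativeness, GPTCustomAutoJudge

rubric = Informativeness(
    papers={
        "Paper A": "Abstract of the first source paper ...",
        "Paper B": "Abstract of the second source paper ...",
    },
    question="How is AI applied in healthcare?",
    answer="AI supports diagnostics and personalized medicine ...",
)

judge = GPTCustomAutoJudge()
judge.from_pretrained(model_id="gpt-4o", token="your_openai_api_key")

# judge() retries failed API calls and returns a RubricLikertScale (rating + rationale).
result = judge.judge(rubric=rubric)
print(result.model_dump())  # e.g. {"rating": 4, "rationale": "..."}
```

For GPT-4-family (and GPT-3.5) model IDs the judge requests structured output via function calling; for other models it parses the plain text completion into the same rating/rationale result.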