From 1823c9bbb3eefcb5fe76ad5898bd4a9d5f30ccee Mon Sep 17 00:00:00 2001 From: MikeACedric <72818458+MikeACedric@users.noreply.github.com> Date: Wed, 21 Jan 2026 10:06:14 +0100 Subject: [PATCH 01/13] Add vocab and example injection into rubric prompts (#10) - Added functionality to include vocabulary terms and example responses directly into rubric prompts for improved prompt context. --- yescieval/__init__.py | 5 +- yescieval/base/__init__.py | 6 +- yescieval/base/example.py | 66 ++++++ yescieval/base/rubric.py | 61 ++++-- yescieval/base/vocab.py | 104 +++++++++ yescieval/data/examples/examples.json | 70 ++++++ .../data/vocab/ecology_dictionaries.json | 114 ++++++++++ yescieval/data/vocab/nlp_dictionaries.json | 118 ++++++++++ yescieval/rubric/__init__.py | 6 +- yescieval/rubric/depth.py | 202 +++++++++++------- 10 files changed, 648 insertions(+), 104 deletions(-) create mode 100644 yescieval/base/example.py create mode 100644 yescieval/base/vocab.py create mode 100644 yescieval/data/examples/examples.json create mode 100644 yescieval/data/vocab/ecology_dictionaries.json create mode 100644 yescieval/data/vocab/nlp_dictionaries.json diff --git a/yescieval/__init__.py b/yescieval/__init__.py index 28c7095..45f2d0e 100644 --- a/yescieval/__init__.py +++ b/yescieval/__init__.py @@ -2,10 +2,9 @@ __version__ = (Path(__file__).parent / "VERSION").read_text().strip() -from .base import Rubric, Parser +from .base import Rubric, Parser, Judge, VocabLoader, ExampleLoader from .rubric import (Informativeness, Correctness, Completeness, Coherence, Relevancy, - Integration, Cohesion, Readability, Conciseness, GeographicCoverage, - InterventionDiversity, BiodiversityDimensions, EcosystemServices, SpatialScale, + Integration, Cohesion, Readability, Conciseness, MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification, StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment, SpeculativeStatements, NoveltyIndicators) diff --git a/yescieval/base/__init__.py b/yescieval/base/__init__.py index 7c07516..838757f 100644 --- a/yescieval/base/__init__.py +++ b/yescieval/base/__init__.py @@ -1,10 +1,14 @@ from .rubric import Rubric from .parser import Parser, RubricLikertScale from .judge import Judge +from .vocab import VocabLoader +from .example import ExampleLoader __all__ = [ "Rubric", "Parser", "RubricLikertScale", - "Judge" + "Judge", + "VocabLoader", + "ExampleLoader" ] \ No newline at end of file diff --git a/yescieval/base/example.py b/yescieval/base/example.py new file mode 100644 index 0000000..94576e0 --- /dev/null +++ b/yescieval/base/example.py @@ -0,0 +1,66 @@ +import json +from typing import Dict, Any + + +class ExampleLoader: + """ + Loads rubric-specific example responses and injects them + into prompt templates based on domain and rubric name. 
+ """ + + DOMAIN_MAP = { + "nlp": "NLP", + "ecology": "Ecology", + } + + CATEGORIES = ("Depth", "Breadth") + PLACEHOLDER = "{EXAMPLE_RESPONSES}" + EMPTY_VALUE = "{}" + + def __init__(self, file_path: str): + self.data = self._load_examples(file_path) + + def _normalize_domain(self, domain: str) -> str: + return domain.strip().lower() + + def _load_examples(self, file_path: str) -> Dict[str, Any]: + with open(file_path, "r", encoding="utf-8") as f: + data = json.load(f) + + if not isinstance(data, dict): + raise ValueError("Example file must contain a JSON object at top level") + + return data + + def get_example_data(self, domain: str, rubric_name: str) -> Dict[str, Any]: + """ + Returns: + {rubric_name: } if found, + {} otherwise. + """ + domain = self._normalize_domain(domain) + topic_key = self.DOMAIN_MAP.get(domain) + + if not topic_key: + return {} + + topic_data = self.data.get(topic_key, {}) + + for category in self.CATEGORIES: + category_data = topic_data.get(category, {}) + if rubric_name in category_data: + return {rubric_name: category_data[rubric_name]} + + return {} + + def fill_prompt(self, template: str, domain: str, rubric_name: str) -> str: + """ + Injects example responses JSON into the template. + """ + example_data = self.get_example_data(domain, rubric_name) + + if not example_data: + return template.replace(self.PLACEHOLDER, self.EMPTY_VALUE) + + example_json = json.dumps(example_data, indent=4) + return template.replace(self.PLACEHOLDER, example_json) diff --git a/yescieval/base/rubric.py b/yescieval/base/rubric.py index 6ca06fe..d93d61e 100644 --- a/yescieval/base/rubric.py +++ b/yescieval/base/rubric.py @@ -1,8 +1,8 @@ from abc import ABC from pydantic import BaseModel -from typing import Dict, List - - +from typing import Dict, List, Optional +from .vocab import VocabLoader +from .example import ExampleLoader class Rubric(BaseModel, ABC): """ @@ -15,26 +15,55 @@ class Rubric(BaseModel, ABC): question: str answer: str user_prompt_template: str = ("Evaluate and rate the quality of the following scientific synthesis " - "according to the characteristics given in the system prompt.\n" - "\n{answer}\n" - "\n{question}\n" - "\n\n{content}\n\n###") + "according to the characteristics given in the system prompt.\n" + "\n{answer}\n" + "\n{question}\n" + "\n\n{content}\n\n###") + + domain: Optional[str] = None + vocab_manager: Optional[VocabLoader] = None + example_manager: Optional[ExampleLoader] = None + model_config = {"arbitrary_types_allowed": True} + def render_papers(self) -> str: paper_content = "" for idx, (title, abstract) in enumerate(self.papers.items()): - paper_content += f"{idx + 1}. " + title + "\n\n" + abstract + "\n\n" + paper_content += f"{idx + 1}. {title}\n\n{abstract}\n\n" return paper_content - def verbalize(self): - return self.user_prompt_template.format(answer=self.answer, - question=self.question, - content=self.render_papers()) + def preprocess_user_prompt(self, template: str) -> str: + """ + Fills vocabulary and example placeholders in the system prompt. + """ + filled = template + + if self.vocab_manager and self.domain: + filled = self.vocab_manager.fill_prompt(filled, self.domain) + + if self.example_manager and self.domain: + + filled = self.example_manager.fill_prompt( + template=filled, + domain=self.domain, + rubric_name=self.name + ) + + return filled + + def verbalize(self) -> str: + """ + Fill placeholders first, then format with answer, question, and papers. 
+ """ + filled_template = self.preprocess_user_prompt(self.user_prompt_template) + return filled_template.format( + answer=self.answer, + question=self.question, + content=self.render_papers() + ) def instruct(self) -> List[Dict[str, str]]: - message = [ - {"role": "system", "content": self.system_prompt_template}, + return [ + {"role": "system", "content": self.system_prompt_template}, {"role": "user", "content": self.verbalize()}, ] - return message - diff --git a/yescieval/base/vocab.py b/yescieval/base/vocab.py new file mode 100644 index 0000000..2b476c9 --- /dev/null +++ b/yescieval/base/vocab.py @@ -0,0 +1,104 @@ +import json +from pathlib import Path +from typing import Dict + + +class VocabLoader: + """ + Loads multiple vocabularies and fills placeholders in prompts + based on the selected domain. + """ + + PLACEHOLDERS = { + "{MECHANISTIC_VOCAB}": "mechanistic_vocab_block", + "{CAUSAL_VOCAB}": "causal_vocab_block", + "{TEMPORAL_VOCAB}": "temporal_vocab_block", + } + + def __init__(self, domain_to_file: Dict[str, str]): + """ + domain_to_file: {"nlp": "vocab/nlp_dictionary.json", "ecology": "vocab/ecology_dictionary.json"} + """ + self.domain_to_file = { + self._normalize_domain(k): v for k, v in domain_to_file.items() + } + self.vocabs: Dict[str, Dict] = {} + + for domain, file_path in self.domain_to_file.items(): + self.vocabs[domain] = self._load_vocab(file_path) + + def _normalize_domain(self, domain: str) -> str: + return domain.strip().lower() + + def _load_vocab(self, file_path: str) -> Dict: + file_path = Path(file_path) + if not file_path.exists(): + raise FileNotFoundError(f"Vocabulary file not found: {file_path}") + with open(file_path, "r", encoding="utf-8") as f: + data = json.load(f) + if not isinstance(data, dict): + raise ValueError(f"Invalid vocabulary format: {file_path}") + return data + + def _clean_terms(self, terms) -> list[str]: + + seen = set() + cleaned = [] + for t in terms: + if not isinstance(t, str): + continue + t = t.strip() + if not t or t in seen: + continue + seen.add(t) + cleaned.append(t) + return cleaned + + + def mechanistic_vocab_block(self, domain: str) -> str: + domain = self._normalize_domain(domain) + V = self.vocabs.get(domain, {}) + + if domain == "ecology": + terms = V.get("mechanistic_terms", []) + label = "Mechanistic terms (Ecology)" + elif domain == "nlp": + terms = ( + V.get("training_terms", []) + + V.get("arch_terms", []) + + V.get("ablation_terms", []) + ) + label = "Mechanistic terms (NLP)" + else: + terms = V.get("mechanistic_terms", []) + label = "Mechanistic terms" + + terms = self._clean_terms(terms) + return f"{label}: " + ", ".join(terms) + + def causal_vocab_block(self, domain: str) -> str: + domain = self._normalize_domain(domain) + V = self.vocabs.get(domain, {}) + terms = self._clean_terms(V.get("causal_terms", [])) + return "Causal connectives / triggers: " + ", ".join(terms) + + def temporal_vocab_block(self, domain: str) -> str: + domain = self._normalize_domain(domain) + V = self.vocabs.get(domain, {}) + terms = self._clean_terms(V.get("temporal_terms", [])) + return "Temporal expressions: " + ", ".join(terms) + + def fill_prompt(self, prompt_template: str, domain: str) -> str: + """ + Replaces known placeholders in the prompt with vocab blocks + based on the domain. 
+ """ + prompt = prompt_template + domain = self._normalize_domain(domain) + + for placeholder, method_name in self.PLACEHOLDERS.items(): + if placeholder in prompt: + block_fn = getattr(self, method_name) + prompt = prompt.replace(placeholder, block_fn(domain)) + + return prompt diff --git a/yescieval/data/examples/examples.json b/yescieval/data/examples/examples.json new file mode 100644 index 0000000..275abf2 --- /dev/null +++ b/yescieval/data/examples/examples.json @@ -0,0 +1,70 @@ +{ + "Ecology": { + "Depth": { + "MechanisticUnderstanding": [ + { + "rating": "1", + "rationale": "The response mainly describes outcomes or observations and does not explain the underlying mechanisms or processes driving them." + }, + { + "rating": "4", + "rationale": "The response explains ecological mechanisms by describing pathways and feedbacks such as nutrient cycling and energy flow, and how interactions like predation, competition, and mutualism influence ecosystem dynamics, resilience, and responses to disturbance." + } + ], + "CausalReasoning": [ + { + "rating": "1", + "rationale": "The response describes ecological patterns or outcomes but does not clearly explain why they occur or how one factor leads to another." + }, + { + "rating": "4", + "rationale": "The response presents clear cause-effect reasoning, explaining how changes in ecological factors drive outcomes, for example describing how disturbances lead to shifts in community structure, which consequently regulate ecosystem processes through specific mediating interactions." + } + ], + "TemporalPrecision": [ + { + "rating": "1", + "rationale": "The response refers to timing only in vague terms, such as long-term or historical trends, without specifying concrete dates, durations, or time intervals." + }, + { + "rating": "4", + "rationale": "The response uses specific and bounded temporal expressions, for example describing changes occurring within 2-5 years, after 3 months, or every 2 weeks, and referencing defined time periods such as 1998-2004 or June 2012." + } + ] + } + }, + "NLP": { + "Depth": { + "MechanisticUnderstanding": [ + { + "rating": "1", + "rationale": "The response reports results or model performance but does not explain how the model architecture or training process leads to those outcomes." + }, + { + "rating": "4", + "rationale": "The response provides a clear mechanistic explanation of how the model works, describing the role of transformer-based architectures, the effects of pretraining and fine-tuning, and insights from ablation studies that show how specific components contribute to performance." + } + ], + "CausalReasoning": [ + { + "rating": "1", + "rationale": "The response reports results or observations but does not provide explicit cause-effect explanations linking methods or design choices to outcomes." + }, + { + "rating": "4", + "rationale": "The response provides structured cause-effect reasoning, explaining how architectural or training choices lead to performance differences, for example noting that improvements occur because certain components modulate information flow, which consequently drives better generalization through specific training mechanisms." + } + ], + "TemporalPrecision": [ + { + "rating": "1", + "rationale": "The response mentions time only in broad or unspecific ways and does not provide clear dates, durations, or intervals relevant to the discussion." 
+ }, + { + "rating": "4", + "rationale": "The response includes precise temporal details, such as model behavior observed after 3 months of training, performance changes within 2-5 years of development, or evaluations conducted every 2 weeks, with references to specific time ranges like 1998-2004 or June 2012." + } + ] + } + } +} \ No newline at end of file diff --git a/yescieval/data/vocab/ecology_dictionaries.json b/yescieval/data/vocab/ecology_dictionaries.json new file mode 100644 index 0000000..2ef3a66 --- /dev/null +++ b/yescieval/data/vocab/ecology_dictionaries.json @@ -0,0 +1,114 @@ +{ + "regions": [ + "Europe", "North America", "South America", "Asia", "Africa", "Australia", + "Mediterranean", "Alpine", "Tropical", "Arctic", "Boreal", "Temperate", + "Subtropical", "Arid", "Wetland", "Coastal", "Marine", "Freshwater", + "Terrestrial", "Montane", "Savanna", "Tundra", "Desert", "Grassland", + "Rainforest", "Riparian", "Peatland", "Mangrove", "Coral reef" + ], + "interventions": [ + "fertilizer", "stocking", "mowing", "grazing", "irrigation", "organic", + "controlled burn", "prescribed burn", "restoration", "reforestation", "afforestation", + "rewilding", "habitat creation", "invasive species control", "predator control", + "captive breeding", "protected area", "translocation", "assisted migration", + "biochar", "liming", "mulching", "cover cropping", "selective logging", + "thinning", "buffer strips", "fencing", "corridor", "wetland creation" + ], + "mechanistic_terms": [ + "mechanism", "pathway", "feedback", "trophic", "nutrient cycling", "energy flow", + "predation", "competition", "mutualism", "facilitation", "inhibition", + "succession", "disturbance", "resilience", "adaptation", "selection pressure", + "gene flow", "decomposition", "mineralization", "nitrification", + "photosynthesis", "respiration", "herbivory", "allelopathy", + "keystone", "hysteresis", "tipping point" + ], + "diversity_dimensions": [ + "taxonomic", "functional", "phylogenetic", "alpha", "beta", "gamma", + "species richness", "evenness", "dominance", "endemism", "rarity", + "abundance", "biomass", "density", "coverage", "trait diversity", + "genetic diversity", "structural diversity", "shannon", "simpson", "hill numbers" + ], + "temporal_terms" :[ + "within 2–5 years", "lag of ~6 months", "after 3 months", "before 12 weeks", + "1998–2004", "June 2012", "every 2 weeks" + ], + "ecosystem_services": [ + "provisioning", "regulating", "supporting", "cultural", + "carbon sequestration", "pollination", "pest control", "water purification", + "soil formation", "nutrient retention", "climate regulation", "flood control", + "erosion control", "recreation", "aesthetic value", "food production", + "timber", "fiber", "fuel", "genetic resources", "biochemicals", "fresh water" + ], + "scale_terms": [ + "individual", "population", "community", "ecosystem", "landscape", + "patch", "local", "regional", "global" + ], + "causal_terms": [ + "because", "due to", "caused by", "results in", "leads to", "triggers", "induces", + "therefore", "consequently", "as a result", "hence", "thus", + "accordingly", "owing to", "through", "via", "by means of", "mediates", "modulates", "drives", "regulates" + ], + "innovation_terms": [ + "novel", "innovative", "breakthrough", "pioneering", "cutting-edge", + "emerging", "frontier", "state-of-the-art", "advanced", "experimental", + "proof-of-concept", "first", "unprecedented" + ], + "speculative_terms": [ + "speculative", "hypothetical", "flagged" + ], + "gap_terms": [ + "research gap", "knowledge 
gap", "data gap" + ], + "uncertainty_terms": [ + "uncertain", "unclear", "unknown" + ], + "stats_terms": [ + "mean", "median", "variance", "standard deviation", "standard error", + "confidence interval", "ci", "p-value", "significant", "regression", + "anova", "t-test", "chi-square", "effect size", "meta-analysis", + "model comparison", "r-squared" + ], + + "conservation_terms": [ + "endangered", "extinction", "habitat loss", "fragmentation", + "restoration", "landscape connectivity", "corridor", "buffer zone" + ], + "climate_terms": [ + "climate change", "global warming", "drought", "heatwave", "extreme weather", + "phenology", "range shift", "sea level rise", "ocean acidification", + "greenhouse gas", "carbon dioxide", "thermal stress", "precipitation" + ], + "complexity_terms": [ + "nonlinear", "emergent", "synergistic", "interconnected", "complex", "multifaceted" + ], + + "weights": { + "alpha": { + "depth": 0.31, + "breadth": 0.27, + "rigor": 0.17, + "innov": 0.17, + "gap": 0.08 + }, + "depth": { + "mech": 0.40, + "causal": 0.30, + "temp": 0.30 + }, + "breadth": { + "regions": 0.25, + "interventions": 0.25, + "biodiversity": 0.25, + "services": 0.15, + "scale": 0.10 + }, + "rigor": { + "stats": 0.67, + "uncert": 0.33 + }, + "innovation": { + "spec": 0.52, + "novel": 0.48 + } + } +} diff --git a/yescieval/data/vocab/nlp_dictionaries.json b/yescieval/data/vocab/nlp_dictionaries.json new file mode 100644 index 0000000..107001b --- /dev/null +++ b/yescieval/data/vocab/nlp_dictionaries.json @@ -0,0 +1,118 @@ +{ + "tasks": [ + "classification", "sentiment", "ner", "named entity recognition", "pos", "part of speech", + "parsing", "constituency parsing", "dependency parsing", + "qa", "question answering", "open-domain qa", "closed-book qa", + "summarization", "abstractive summarization", "extractive summarization", + "translation", "machine translation", "mt", + "retrieval", "dense retrieval", "bm25", "reranking", "re-ranking", + "dialogue", "dialog", "conversation", "chat", + "generation", "text generation", "story generation", "code generation", + "coreference", "coreference resolution", "slot filling", + "nli", "natural language inference", "sts", "semantic textual similarity", + "entailment" + ], + "datasets": [ + "glue", "superglue", "squad", "squad2", "mnli", "qqp", "qnli", "cola", "sst", "sst-2", "stsb", + "wmt", "cnn/daily mail", "cnn dm", "xsum", "gigaword", + "coqa", "hotpotqa", "msmarco", "triviaqa", + "belebele", "mmlu", "hellaswag", "truthfulqa", + "gsm8k", "humaneval", "arc", "piqa", "boolq", "openbookqa" + ], + "languages": [ + "english", "german", "deutsch", "french", "spanish", "italian", + "chinese", "japanese", "korean", "arabic", "hindi", + "multilingual", "cross-lingual", "low-resource" + ], + "temporal_terms" :[ + "within 2–5 years", "lag of ~6 months", "after 3 months", "before 12 weeks", + "1998–2004", "June 2012", "every 2 weeks" + ], + "eval_metrics": [ + "accuracy", "f1", "precision", "recall", + "bleu", "chrf", "rouge", "meteor", "bertscore", + "perplexity", "exact match", "em" + ], + "arch_terms": [ + "transformer", "encoder-decoder", "decoder-only", + "bert", "albert", "roberta", "t5", "gpt", "llama", "mistral", + "lstm", "gru", "cnn" + ], + "training_terms": [ + "pretraining", "fine-tuning", "instruction tuning", "rlhf", "dpo", + "lora", "qlora", "quantization", "distillation", "curriculum", + "data augmentation", "continual learning" + ], + "ablation_terms": [ + "ablation", "ablation study", "component analysis", "feature ablation", "module ablation" + ], + 
"compute_terms": [ + "gpu", "tpu", "flops", "parameters", "params", "billion parameters", + "inference time", "throughput", "latency", "memory footprint" + ], + "causal_terms": [ + "because", "due to", "caused by", "results in", "leads to", "triggers", "induces", + "therefore", "consequently", "as a result", "hence", "thus", + "via", "through", "mediates", "modulates", "drives", "regulates" + ], + "rigor_stats": [ + "p-value", "p<", "p >", "significant", "confidence interval", "ci", + "t-test", "anova", "regression", "bootstrap", "cross-validation", + "held-out", "standard deviation", "std", "mean", "median" + ], + "stats_terms": [ + "p-value", "confidence interval", "t-test", "anova", "regression", + "effect size", "variance", "standard deviation", "standard error", "r-squared" + ], + "uncertainty_terms": [ + "uncertain", "unclear", "unknown" + ], + "innovation_terms": [ + "novel", "innovative", "breakthrough", "pioneering", "cutting-edge", + "emerging", "frontier", "state-of-the-art", "advanced", "experimental", + "proof-of-concept", "first", "unprecedented" + ], + "speculative_terms": [ + "speculative", "hypothetical", "flagged" + ], + "gap_terms": [ + "research gap", "knowledge gap", "data gap" + ], + "repro_terms": [ + "open source", "code available", "github", "weights", "checkpoint", + "seed", "license", "hyperparameter", "learning rate", "batch size" + ], + "safety_terms": [ + "bias", "fairness", "toxicity", "privacy", "safety", "data leakage", "red teaming", "harmful content" + ], + + "weights": { + "alpha": { + "depth": 0.31, + "breadth": 0.27, + "rigor": 0.17, + "innov": 0.17, + "gap": 0.08 + }, + "depth": { + "mech": 0.40, + "causal": 0.30, + "temp": 0.30 + }, + "breadth": { + "tasks": 0.25, + "datasets": 0.25, + "metrics": 0.25, + "languages": 0.15, + "compute": 0.10 + }, + "rigor": { + "stats": 0.67, + "uncert": 0.33 + }, + "innovation": { + "spec": 0.52, + "novel": 0.48 + } + } +} diff --git a/yescieval/rubric/__init__.py b/yescieval/rubric/__init__.py index 262818b..2e6435f 100644 --- a/yescieval/rubric/__init__.py +++ b/yescieval/rubric/__init__.py @@ -1,7 +1,6 @@ from .informativeness import Informativeness, Correctness, Completeness from .structural import Coherence, Relevancy, Integration from .stylistic import Cohesion, Readability, Conciseness -from .breadth import GeographicCoverage, InterventionDiversity, BiodiversityDimensions, EcosystemServices, SpatialScale from .depth import MechanisticUnderstanding, CausalReasoning, TemporalPrecision from .gap import GapIdentification from .rigor import StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment @@ -9,8 +8,7 @@ __all__ = ["Informativeness", "Correctness", "Completeness", "Coherence", "Relevancy", "Integration", - "Cohesion", "Readability", "Conciseness", "GeographicCoverage", - "InterventionDiversity", "BiodiversityDimensions", "EcosystemServices", - "SpatialScale", "MechanisticUnderstanding", "CausalReasoning", "TemporalPrecision", + "Cohesion", "Readability", "Conciseness", + "MechanisticUnderstanding", "CausalReasoning", "TemporalPrecision", "GapIdentification", "StatisticalSophistication", "CitationPractices", "UncertaintyAcknowledgment", "SpeculativeStatements", "NoveltyIndicators"] diff --git a/yescieval/rubric/depth.py b/yescieval/rubric/depth.py index 3e12dc3..4580dee 100644 --- a/yescieval/rubric/depth.py +++ b/yescieval/rubric/depth.py @@ -1,163 +1,205 @@ from ..base import Rubric -mechanistic_understanding_prompt = """ -Scientific synthesis generation involves creating a concise, 
coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires: +from ..base import Rubric + +mechanistic_understanding_prompt = """ +Scientific question answering and synthesis often require more than listing findings: high-quality scientific writing explains not only what is believed to be true, but also how and why it may be true. This is commonly expressed through mechanistic understanding, where the text describes processes, interactions, intermediate steps, or pathways that connect conditions or components to outcomes. -- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources. -- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question. -- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information. -- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question. -- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability. -- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on correctness characteristic, ensuring it effectively communicates the synthesized information. +The response may be a single paragraph or a long-form report with multiple sections. There are no strict requirements on length or formatting; mechanistic explanation should be evaluated independently of presentation style. -In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness. +This rubric focuses exclusively on the presence and quality of mechanistic explanation within the provided text, emphasizing explanations of how and why phenomena occur rather than descriptions of what is observed. Other aspects of scientific quality (such as factual accuracy, evidential grounding, or completeness) are intentionally outside its scope and are assessed by separate evaluation criteria. 
-You are tasked as a scientific syntheses quality evaluator. +You are tasked as a scientific writing quality evaluator. -A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim. +A user will provide you with: +1) a research question, and +2) a written response intended to address that question. + +You must evaluate the response using the evaluation characteristic below. Focus on whether the response offers mechanistic understanding (how/why explanations) rather than only descriptive statements (what/that). Your judgment should be based solely on the provided question and response. -1. Mechanistic Understanding: does the answer reflect understanding of ecological processes by explicitly mentioning recognized mechanisms such as feedbacks, nutrient cycling, or trophic cascades? +MechanisticUnderstanding: Does the response explain mechanisms relevant to the research question by describing processes, interactions, intermediate steps, or pathways (i.e., “how/why”), rather than only stating observations or outcomes (“what”)? + +Below are domain-specific terms and phrases that often signal mechanistic discussion. They are examples only: their presence is not required, and their presence alone is not sufficient for a high score. + +{MECHANISTIC_VOCAB} + + -For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic. +For the characteristic above, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below. + +MechanisticUnderstanding +Rating 1. Very bad: The response is purely descriptive, listing facts or outcomes with no meaningful “how/why” explanation relevant to the research question. +Rating 2. Bad: The response contains occasional mechanistic terms or phrases, but explanations are superficial, generic, or weakly connected to the research question. +Rating 3. Moderate: The response provides some mechanistic explanation with partial detail, but important steps, interactions, or pathways are missing, unclear, or inconsistently developed. +Rating 4. Good: The response offers clear mechanistic explanations with multiple concrete steps, interactions, or pathways that are relevant to the research question; minor gaps or imprecision may remain. +Rating 5. Very good: The response provides a detailed, coherent mechanistic account tightly aligned with the research question, explicitly articulating multiple intermediate steps or process-level linkages and clearly distinguishing mechanistic explanation (“how/why”) from descriptive reporting (“what”). -1. Mechanistic Understanding -Rating 1. Very bad: The synthesis contains only vague statements (e.g., “X affects Y”) with no explanation of how or why; no causal language or mechanisms. -Rating 2. Bad: The synthesis mentions a relationship but remains single-step; mechanisms are implied but not described; no mediators or temporal aspects. -Rating 3. 
Moderate: The synthesis identifies at least one mechanism or causal link but lacks depth; limited causal connectors and no explicit assumptions or timing. -Rating 4. Good: The synthesis describes multi-step mechanisms (driver → mediator → outcome) using causal language; may include some temporal or conditional detail. -Rating 5. Very good: The information in the synthesis provides detailed, explicit multi-step causal mechanisms with clear mediators, temporal specificity, and stated assumptions or boundary conditions. + -For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating. -Return your response in JSON format: {characteristic : {‘rating’ : ‘’, ‘rationale’ : ‘’}} +Rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale that points to specific parts of the response demonstrating the presence or absence of mechanistic explanation relevant to the research question. - +Return your response in JSON format: { - "Mechanistic Understanding": {"rating": "4", "rationale": "The answer explains a clear multi-step ecological mechanism using causal language, but some temporal or boundary details are only briefly addressed."} + "MechanisticUnderstanding": {"rating": "", "rationale": ""} } - + + +{EXAMPLE_RESPONSES} + + + -Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. +Your evaluation must be based solely on the provided research question and response. Do not reward length by itself; reward mechanistic clarity, relevance to the question, and explanatory coherence. This rubric does not assess factual correctness, evidential grounding, or completeness. """ + class MechanisticUnderstanding(Rubric): - name: str = "Mechanistic Understanding" + name: str = "MechanisticUnderstanding" system_prompt_template: str = mechanistic_understanding_prompt -causal_reasoning_prompt = """ -Scientific synthesis generation involves creating a concise, coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires: +causal_reasoning_prompt = """ +Scientific question answering and synthesis often require more than listing findings: high-quality scientific writing explains not only what is believed to be true, but also how and why it may be true. One important aspect of this is causal reasoning, where the text articulates cause–effect relationships, conditions, mediators, moderators, and causal chains, rather than only describing associations or co-occurrences. -- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources. -- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question. 
-- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information. -- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question. -- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability. -- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on completeness characteristic, ensuring it effectively communicates the synthesized information. +The response may be a single paragraph or a long-form report with multiple sections. There are no strict requirements on length or formatting; causal reasoning should be evaluated independently of presentation style. -In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness. +This rubric focuses exclusively on the presence and quality of causal reasoning within the provided text, emphasizing language and structure that express why something happens (cause → effect) rather than only what is observed or correlated. Other aspects of scientific quality (such as factual accuracy, evidential grounding, or completeness) are intentionally outside its scope and are assessed by separate evaluation criteria. -You are tasked as a scientific syntheses quality evaluator. +You are tasked as a scientific writing quality evaluator. -A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim. +A user will provide you with: +1) a research question, and +2) a written response intended to address that question. + +You must evaluate the response using the evaluation characteristic below. Focus on whether the response expresses causal relationships relevant to the research question (cause–effect, mediators/moderators, conditions), rather than only descriptive or correlational statements. Your judgment should be based solely on the provided question and response. -1. 
Causal Reasoning: does the answer explicitly express cause–effect relationships using causal connectives (e.g., “because,” “due to”), result indicators (e.g., “results in,” “induces”), or mechanistic verbs (e.g., “drives,” “regulates”) when describing ecological processes? +CausalReasoning: Does the response demonstrate causal reasoning relevant to the research question by explicitly articulating cause–effect relationships (including causal chains, mediators, moderators, or conditional causal statements), rather than only reporting associations, trends, or co-occurrences? + +Below are examples of causal connectives and expressions that often signal causal reasoning (across domains). They are examples only: their presence is not required, and their presence alone is not sufficient for a high score. + +Causal connectives / triggers (examples): because, due to, therefore, thus, hence, leads to, results in, causes, contributes to, drives, produces, induces, triggers, promotes, suppresses, mediates, moderates, modulates, depends on, under conditions of, only if, unless. + +{CAUSAL_VOCAB} + + -For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic. - -1. Causal Reasoning -Rating 1. Very bad: The synthesis uses vague statements (e.g., “X affects Y”) with no causal connectors, -Rating 2. Bad: The synthesis identifies a cause–effect relationship but only as a single-step claim; causal language is minimal and mediators are ignored. -Rating 3. Moderate: The synthesis includes explicit causal connectors or verbs and at least one cause–effect link, but remains shallow. -Rating 4. Good: The synthesis describes multi-step causal chains (driver → mediator → outcome) using clear causal language. -Rating 5. Very good: The synthesis presents detailed, explicit multi-step causal reasoning with clear mediators, temporal specificity, and stated assumptions or boundary conditions. +For the characteristic above, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below. + +CausalReasoning +Rating 1. Very bad: The response is purely descriptive or correlational, offering no meaningful cause–effect statements relevant to the research question. +Rating 2. Bad: The response uses occasional causal words (e.g., “leads to”, “because”) but causal links are unclear, generic, or asserted without coherent cause–effect structure (often indistinguishable from correlation). +Rating 3. Moderate: The response includes some clear causal claims relevant to the question, but they are limited in number, shallow (single-step), inconsistently developed, or mixed with ambiguous association language. +Rating 4. Good: The response provides clear cause–effect reasoning with multiple relevant causal links and some structure (e.g., conditions, mediators/moderators, or short causal chains); minor ambiguity or gaps may remain. +Rating 5. Very good: The response demonstrates strong causal reasoning throughout, using explicit and coherent cause–effect structure aligned to the research question, including multiple well-articulated causal chains and/or conditional pathways (e.g., A → B → C; “A affects C via B”; “A increases B only under condition D”), and clearly distinguishes causation from association. + -For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating. 
-Return your response in JSON format: {characteristic : {‘rating’ : ‘’, ‘rationale’ : ‘’}} +Rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale that points to specific parts of the response demonstrating the presence or absence of causal reasoning relevant to the research question. - +Return your response in JSON format: { - "Causal Reasoning": {"rating": "4", "rationale": "The answer uses clear causal connectors and describes a multi-step cause–effect relationship."} + "CausalReasoning": {"rating": "", "rationale": ""} } - + + +{EXAMPLE_RESPONSES} + + + -Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. +Your evaluation must be based solely on the provided research question and response. Do not reward length by itself; reward clarity and coherence of causal structure, relevance to the question, and explicit differentiation between causation and association. This rubric does not assess factual correctness, evidential grounding, or completeness. """ + class CausalReasoning(Rubric): - name: str = "Causal Reasoning" + name: str = "CausalReasoning" system_prompt_template: str = causal_reasoning_prompt -temporal_precision_prompt = """ -Scientific synthesis generation involves creating a concise, coherent, and integrated summary from a collection of scientific texts (such as research paper titles and abstracts) that addresses a specific research question. Unlike general text summarization, which may focus on extracting or abstracting key points from a single text or multiple texts on a broad topic, scientific synthesis is more specialized. It requires: +temporal_precision_prompt = """ +Scientific question answering and synthesis often require more than listing findings: high-quality scientific writing is precise about time when time matters. Temporal precision refers to how clearly the text specifies when something occurs, over what duration, or across what interval. Precise temporal expressions include calendar dates, numeric durations, bounded year ranges, or clearly delimited intervals; vague temporal markers include expressions like “historically”, “recently”, “long-term”, or “soon” without further specification. -- Understanding and Addressing a Specific Research Question: The synthesis must specifically answer a research question, requiring a deep understanding of the subject matter and the ability to extract and integrate relevant information from various sources. -- Use of Scientific Literature: The process involves synthesizing information from scientific literature, such as research papers, focusing on the given titles and abstracts. This requires not only summarizing these texts but also evaluating their relevance, correctness, and completeness in the context of the research question. -- Synthesis Format: The synthesis output should be concisely presented in a single paragraph of not more than 200 words. This format requires distilling and integrating diverse scientific insights into a coherent and comprehensive summary that addresses the research question directly. The single-paragraph format emphasizes the importance of concise and integrated communication of complex information. -- Synthesize vs. Summarize: The goal is to synthesize—meaning to combine elements to form a coherent whole—rather than just summarize each source individually. 
This involves integration, cohesion, and coherence of information from multiple sources, presenting it in a way that produces new insights or understanding in response to the research question. -- Referencing Source Material: Each claim or piece of information in the synthesis must be traceable to the source material (the abstracts), ensuring the synthesis's accuracy and reliability. -- Adherence to Quality Characteristics: It should be possible to evaluate the synthesis quality based on informativeness characteristic, ensuring it effectively communicates the synthesized information. +The response may be a single paragraph or a long-form report with multiple sections. There are no strict requirements on length or formatting; temporal precision should be evaluated independently of presentation style. -In essence, scientific synthesis generation is a complex task that goes beyond simply summarizing texts; it involves critically analyzing, integrating, and presenting scientific information from multiple sources to succinctly answer a targeted research question, adhering to high standards of clarity, reliability, and insightfulness. +This rubric focuses exclusively on the presence and quality of temporal precision within the provided text, emphasizing specific and bounded time expressions (when/for how long/over what interval) rather than vague temporal language. Other aspects of scientific quality (such as factual accuracy, evidential grounding, or completeness) are intentionally outside its scope and are assessed by separate evaluation criteria. -You are tasked as a scientific syntheses quality evaluator. +You are tasked as a scientific writing quality evaluator. -A user will provide you with a synthesis which has been generated as an answer to a research question using the titles and abstracts of relevant research works. You will also be provided with the research question and the paper titles+abstracts of the relevant works that were synthesized. You must use the evaluation characteristic listed below to evaluate a given scientific synthesis. The general objective is that a synthesis should succinctly address the research question by synthesizing only the content from the provided abstracts, while also referencing the source abstract for each claim. +A user will provide you with: +1) a research question, and +2) a written response intended to address that question. + +You must evaluate the response using the evaluation characteristic below. Focus on whether the response uses specific, bounded temporal expressions when making temporally-relevant statements, rather than relying on vague time markers. Your judgment should be based solely on the provided question and response. -1. Temporal Precision: does the answer include specific and explicit temporal references, such as quantified time intervals or dated events, rather than vague or unspecific timing? +TemporalPrecision: Does the response use specific, bounded, and meaningful temporal expressions (e.g., dates, durations, intervals, year ranges) when discussing time-relevant aspects of the research question, rather than vague temporal markers? + +Below are examples of temporal expressions. They are examples only: their presence is not required, and their presence alone is not sufficient for a high score. 
+ +Specific temporal expressions (examples): in 2019; between 2010–2015; over 6 months; within 2–5 years; a 3-year follow-up; from March 2020 to June 2021; after 12 weeks; pre- vs post-intervention; before/after fine-tuning; during pretraining. + +Vague temporal markers (examples): historically; in the past; long-term; recently; soon; over time; nowadays; for some time; at times; in earlier work. + +{TEMPORAL_VOCAB} + + -For a given characteristic, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below for each rating per evaluation characteristic. - -1. Temporal Precision -Rating 1. Very bad: The synthesis contains no temporal references; timing is entirely vague or absent. -Rating 2. Bad: The answer includes implicit or generic timing (e.g., “over time,” “eventually”) but no specific intervals, dates, or durations. -Rating 3. Moderate: The answer provides at least one explicit temporal reference (e.g., a rough duration or time window) but lacks consistency or clear linkage to effects. -Rating 4. Good: The answer includes multiple specific temporal references (e.g., quantified intervals or dated events) that are clearly tied to described processes or outcomes. -Rating 5. Very good: The answer demonstrates high temporal precision, with detailed and explicit timeframes (lags, durations, windows, or dates) systematically linked to multi-step processes and their effects. +For the characteristic above, rate the quality from 1 (very bad) to 5 (very good). Follow the guidelines specified below. + +TemporalPrecision +Rating 1. Very bad: The response uses time-related language only vaguely (or not at all when time is relevant), relying on unspecific markers such as “historically” or “long-term” without any bounded dates, durations, or intervals. +Rating 2. Bad: The response includes a few temporal references, but they are mostly vague or inconsistently specified; precise dates/durations/intervals are rare and do not meaningfully clarify timing. +Rating 3. Moderate: The response provides some specific temporal expressions (dates, durations, ranges), but many temporal references remain vague, or precision is applied only in isolated parts of the response. +Rating 4. Good: The response frequently uses specific, bounded temporal expressions that help interpret timing and change (dates, durations, intervals, ranges), with only minor reliance on vague temporal markers. +Rating 5. Very good: The response is consistently temporally precise wherever time is relevant, using specific and bounded expressions (dates, numeric durations, delimited intervals/ranges) and minimizing vague markers; temporal comparisons and sequences are clearly specified (e.g., pre/post, before/after, within X–Y, from A to B). + -For each characteristic rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale for each rating. -Return your response in JSON format: {characteristic : {‘rating’ : ‘’, ‘rationale’ : ‘’}} +Rate the quality from 1 (very bad) to 5 (very good). Provide a short rationale that points to specific parts of the response demonstrating temporal specificity or vagueness. 
- +Return your response in JSON format: { - "Temporal Precision": {"rating": "4", "rationale": "The answer includes several specific timeframes or durations that are clearly linked to the described processes, though some timing details could be more precise."} + "TemporalPrecision": {"rating": "", "rationale": ""} } - + + +{EXAMPLE_RESPONSES} + + + -Your evaluation should be based solely on the content of the provided synthesis and abstracts. Ensure your rationale is objective and backed by specific examples from the provided material. +Your evaluation must be based solely on the provided research question and response. Do not reward length by itself; reward specificity of temporal expressions and clarity of temporal sequencing when time is relevant. This rubric does not assess factual correctness, evidential grounding, or completeness. """ + class TemporalPrecision(Rubric): - name: str = "Temporal Precision" + name: str = "TemporalPrecision" system_prompt_template: str = temporal_precision_prompt From 690bdfad70a35345a979febd4c25fb1f94682a18 Mon Sep 17 00:00:00 2001 From: MikeACedric <72818458+MikeACedric@users.noreply.github.com> Date: Thu, 22 Jan 2026 11:25:30 +0100 Subject: [PATCH 02/13] Removed weights from the vocabulary files --- .../data/vocab/ecology_dictionaries.json | 32 +------------------ yescieval/data/vocab/nlp_dictionaries.json | 32 +------------------ yescieval/rubric/depth.py | 2 -- 3 files changed, 2 insertions(+), 64 deletions(-) diff --git a/yescieval/data/vocab/ecology_dictionaries.json b/yescieval/data/vocab/ecology_dictionaries.json index 2ef3a66..bf341f7 100644 --- a/yescieval/data/vocab/ecology_dictionaries.json +++ b/yescieval/data/vocab/ecology_dictionaries.json @@ -80,35 +80,5 @@ ], "complexity_terms": [ "nonlinear", "emergent", "synergistic", "interconnected", "complex", "multifaceted" - ], - - "weights": { - "alpha": { - "depth": 0.31, - "breadth": 0.27, - "rigor": 0.17, - "innov": 0.17, - "gap": 0.08 - }, - "depth": { - "mech": 0.40, - "causal": 0.30, - "temp": 0.30 - }, - "breadth": { - "regions": 0.25, - "interventions": 0.25, - "biodiversity": 0.25, - "services": 0.15, - "scale": 0.10 - }, - "rigor": { - "stats": 0.67, - "uncert": 0.33 - }, - "innovation": { - "spec": 0.52, - "novel": 0.48 - } - } + ] } diff --git a/yescieval/data/vocab/nlp_dictionaries.json b/yescieval/data/vocab/nlp_dictionaries.json index 107001b..f997e36 100644 --- a/yescieval/data/vocab/nlp_dictionaries.json +++ b/yescieval/data/vocab/nlp_dictionaries.json @@ -84,35 +84,5 @@ ], "safety_terms": [ "bias", "fairness", "toxicity", "privacy", "safety", "data leakage", "red teaming", "harmful content" - ], - - "weights": { - "alpha": { - "depth": 0.31, - "breadth": 0.27, - "rigor": 0.17, - "innov": 0.17, - "gap": 0.08 - }, - "depth": { - "mech": 0.40, - "causal": 0.30, - "temp": 0.30 - }, - "breadth": { - "tasks": 0.25, - "datasets": 0.25, - "metrics": 0.25, - "languages": 0.15, - "compute": 0.10 - }, - "rigor": { - "stats": 0.67, - "uncert": 0.33 - }, - "innovation": { - "spec": 0.52, - "novel": 0.48 - } - } + ] } diff --git a/yescieval/rubric/depth.py b/yescieval/rubric/depth.py index 4580dee..94738da 100644 --- a/yescieval/rubric/depth.py +++ b/yescieval/rubric/depth.py @@ -1,7 +1,5 @@ from ..base import Rubric -from ..base import Rubric - mechanistic_understanding_prompt = """ Scientific question answering and synthesis often require more than listing findings: high-quality scientific writing explains not only what is believed to be true, but also how and why it may be true. 
This is commonly expressed through mechanistic understanding, where the text describes processes, interactions, intermediate steps, or pathways that connect conditions or components to outcomes. From 2c92b20c703469cd13bfbe25c0c7c2235fbd1982 Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Sun, 25 Jan 2026 19:03:11 +0100 Subject: [PATCH 03/13] :fire: delete files --- yescieval/data/examples/examples.json | 70 --------------- .../data/vocab/ecology_dictionaries.json | 84 ------------------ yescieval/data/vocab/nlp_dictionaries.json | 88 ------------------- 3 files changed, 242 deletions(-) delete mode 100644 yescieval/data/examples/examples.json delete mode 100644 yescieval/data/vocab/ecology_dictionaries.json delete mode 100644 yescieval/data/vocab/nlp_dictionaries.json diff --git a/yescieval/data/examples/examples.json b/yescieval/data/examples/examples.json deleted file mode 100644 index 275abf2..0000000 --- a/yescieval/data/examples/examples.json +++ /dev/null @@ -1,70 +0,0 @@ -{ - "Ecology": { - "Depth": { - "MechanisticUnderstanding": [ - { - "rating": "1", - "rationale": "The response mainly describes outcomes or observations and does not explain the underlying mechanisms or processes driving them." - }, - { - "rating": "4", - "rationale": "The response explains ecological mechanisms by describing pathways and feedbacks such as nutrient cycling and energy flow, and how interactions like predation, competition, and mutualism influence ecosystem dynamics, resilience, and responses to disturbance." - } - ], - "CausalReasoning": [ - { - "rating": "1", - "rationale": "The response describes ecological patterns or outcomes but does not clearly explain why they occur or how one factor leads to another." - }, - { - "rating": "4", - "rationale": "The response presents clear cause-effect reasoning, explaining how changes in ecological factors drive outcomes, for example describing how disturbances lead to shifts in community structure, which consequently regulate ecosystem processes through specific mediating interactions." - } - ], - "TemporalPrecision": [ - { - "rating": "1", - "rationale": "The response refers to timing only in vague terms, such as long-term or historical trends, without specifying concrete dates, durations, or time intervals." - }, - { - "rating": "4", - "rationale": "The response uses specific and bounded temporal expressions, for example describing changes occurring within 2-5 years, after 3 months, or every 2 weeks, and referencing defined time periods such as 1998-2004 or June 2012." - } - ] - } - }, - "NLP": { - "Depth": { - "MechanisticUnderstanding": [ - { - "rating": "1", - "rationale": "The response reports results or model performance but does not explain how the model architecture or training process leads to those outcomes." - }, - { - "rating": "4", - "rationale": "The response provides a clear mechanistic explanation of how the model works, describing the role of transformer-based architectures, the effects of pretraining and fine-tuning, and insights from ablation studies that show how specific components contribute to performance." - } - ], - "CausalReasoning": [ - { - "rating": "1", - "rationale": "The response reports results or observations but does not provide explicit cause-effect explanations linking methods or design choices to outcomes." 
- }, - { - "rating": "4", - "rationale": "The response provides structured cause-effect reasoning, explaining how architectural or training choices lead to performance differences, for example noting that improvements occur because certain components modulate information flow, which consequently drives better generalization through specific training mechanisms." - } - ], - "TemporalPrecision": [ - { - "rating": "1", - "rationale": "The response mentions time only in broad or unspecific ways and does not provide clear dates, durations, or intervals relevant to the discussion." - }, - { - "rating": "4", - "rationale": "The response includes precise temporal details, such as model behavior observed after 3 months of training, performance changes within 2-5 years of development, or evaluations conducted every 2 weeks, with references to specific time ranges like 1998-2004 or June 2012." - } - ] - } - } -} \ No newline at end of file diff --git a/yescieval/data/vocab/ecology_dictionaries.json b/yescieval/data/vocab/ecology_dictionaries.json deleted file mode 100644 index bf341f7..0000000 --- a/yescieval/data/vocab/ecology_dictionaries.json +++ /dev/null @@ -1,84 +0,0 @@ -{ - "regions": [ - "Europe", "North America", "South America", "Asia", "Africa", "Australia", - "Mediterranean", "Alpine", "Tropical", "Arctic", "Boreal", "Temperate", - "Subtropical", "Arid", "Wetland", "Coastal", "Marine", "Freshwater", - "Terrestrial", "Montane", "Savanna", "Tundra", "Desert", "Grassland", - "Rainforest", "Riparian", "Peatland", "Mangrove", "Coral reef" - ], - "interventions": [ - "fertilizer", "stocking", "mowing", "grazing", "irrigation", "organic", - "controlled burn", "prescribed burn", "restoration", "reforestation", "afforestation", - "rewilding", "habitat creation", "invasive species control", "predator control", - "captive breeding", "protected area", "translocation", "assisted migration", - "biochar", "liming", "mulching", "cover cropping", "selective logging", - "thinning", "buffer strips", "fencing", "corridor", "wetland creation" - ], - "mechanistic_terms": [ - "mechanism", "pathway", "feedback", "trophic", "nutrient cycling", "energy flow", - "predation", "competition", "mutualism", "facilitation", "inhibition", - "succession", "disturbance", "resilience", "adaptation", "selection pressure", - "gene flow", "decomposition", "mineralization", "nitrification", - "photosynthesis", "respiration", "herbivory", "allelopathy", - "keystone", "hysteresis", "tipping point" - ], - "diversity_dimensions": [ - "taxonomic", "functional", "phylogenetic", "alpha", "beta", "gamma", - "species richness", "evenness", "dominance", "endemism", "rarity", - "abundance", "biomass", "density", "coverage", "trait diversity", - "genetic diversity", "structural diversity", "shannon", "simpson", "hill numbers" - ], - "temporal_terms" :[ - "within 2–5 years", "lag of ~6 months", "after 3 months", "before 12 weeks", - "1998–2004", "June 2012", "every 2 weeks" - ], - "ecosystem_services": [ - "provisioning", "regulating", "supporting", "cultural", - "carbon sequestration", "pollination", "pest control", "water purification", - "soil formation", "nutrient retention", "climate regulation", "flood control", - "erosion control", "recreation", "aesthetic value", "food production", - "timber", "fiber", "fuel", "genetic resources", "biochemicals", "fresh water" - ], - "scale_terms": [ - "individual", "population", "community", "ecosystem", "landscape", - "patch", "local", "regional", "global" - ], - "causal_terms": [ - "because", "due 
to", "caused by", "results in", "leads to", "triggers", "induces", - "therefore", "consequently", "as a result", "hence", "thus", - "accordingly", "owing to", "through", "via", "by means of", "mediates", "modulates", "drives", "regulates" - ], - "innovation_terms": [ - "novel", "innovative", "breakthrough", "pioneering", "cutting-edge", - "emerging", "frontier", "state-of-the-art", "advanced", "experimental", - "proof-of-concept", "first", "unprecedented" - ], - "speculative_terms": [ - "speculative", "hypothetical", "flagged" - ], - "gap_terms": [ - "research gap", "knowledge gap", "data gap" - ], - "uncertainty_terms": [ - "uncertain", "unclear", "unknown" - ], - "stats_terms": [ - "mean", "median", "variance", "standard deviation", "standard error", - "confidence interval", "ci", "p-value", "significant", "regression", - "anova", "t-test", "chi-square", "effect size", "meta-analysis", - "model comparison", "r-squared" - ], - - "conservation_terms": [ - "endangered", "extinction", "habitat loss", "fragmentation", - "restoration", "landscape connectivity", "corridor", "buffer zone" - ], - "climate_terms": [ - "climate change", "global warming", "drought", "heatwave", "extreme weather", - "phenology", "range shift", "sea level rise", "ocean acidification", - "greenhouse gas", "carbon dioxide", "thermal stress", "precipitation" - ], - "complexity_terms": [ - "nonlinear", "emergent", "synergistic", "interconnected", "complex", "multifaceted" - ] -} diff --git a/yescieval/data/vocab/nlp_dictionaries.json b/yescieval/data/vocab/nlp_dictionaries.json deleted file mode 100644 index f997e36..0000000 --- a/yescieval/data/vocab/nlp_dictionaries.json +++ /dev/null @@ -1,88 +0,0 @@ -{ - "tasks": [ - "classification", "sentiment", "ner", "named entity recognition", "pos", "part of speech", - "parsing", "constituency parsing", "dependency parsing", - "qa", "question answering", "open-domain qa", "closed-book qa", - "summarization", "abstractive summarization", "extractive summarization", - "translation", "machine translation", "mt", - "retrieval", "dense retrieval", "bm25", "reranking", "re-ranking", - "dialogue", "dialog", "conversation", "chat", - "generation", "text generation", "story generation", "code generation", - "coreference", "coreference resolution", "slot filling", - "nli", "natural language inference", "sts", "semantic textual similarity", - "entailment" - ], - "datasets": [ - "glue", "superglue", "squad", "squad2", "mnli", "qqp", "qnli", "cola", "sst", "sst-2", "stsb", - "wmt", "cnn/daily mail", "cnn dm", "xsum", "gigaword", - "coqa", "hotpotqa", "msmarco", "triviaqa", - "belebele", "mmlu", "hellaswag", "truthfulqa", - "gsm8k", "humaneval", "arc", "piqa", "boolq", "openbookqa" - ], - "languages": [ - "english", "german", "deutsch", "french", "spanish", "italian", - "chinese", "japanese", "korean", "arabic", "hindi", - "multilingual", "cross-lingual", "low-resource" - ], - "temporal_terms" :[ - "within 2–5 years", "lag of ~6 months", "after 3 months", "before 12 weeks", - "1998–2004", "June 2012", "every 2 weeks" - ], - "eval_metrics": [ - "accuracy", "f1", "precision", "recall", - "bleu", "chrf", "rouge", "meteor", "bertscore", - "perplexity", "exact match", "em" - ], - "arch_terms": [ - "transformer", "encoder-decoder", "decoder-only", - "bert", "albert", "roberta", "t5", "gpt", "llama", "mistral", - "lstm", "gru", "cnn" - ], - "training_terms": [ - "pretraining", "fine-tuning", "instruction tuning", "rlhf", "dpo", - "lora", "qlora", "quantization", "distillation", "curriculum", - "data 
augmentation", "continual learning" - ], - "ablation_terms": [ - "ablation", "ablation study", "component analysis", "feature ablation", "module ablation" - ], - "compute_terms": [ - "gpu", "tpu", "flops", "parameters", "params", "billion parameters", - "inference time", "throughput", "latency", "memory footprint" - ], - "causal_terms": [ - "because", "due to", "caused by", "results in", "leads to", "triggers", "induces", - "therefore", "consequently", "as a result", "hence", "thus", - "via", "through", "mediates", "modulates", "drives", "regulates" - ], - "rigor_stats": [ - "p-value", "p<", "p >", "significant", "confidence interval", "ci", - "t-test", "anova", "regression", "bootstrap", "cross-validation", - "held-out", "standard deviation", "std", "mean", "median" - ], - "stats_terms": [ - "p-value", "confidence interval", "t-test", "anova", "regression", - "effect size", "variance", "standard deviation", "standard error", "r-squared" - ], - "uncertainty_terms": [ - "uncertain", "unclear", "unknown" - ], - "innovation_terms": [ - "novel", "innovative", "breakthrough", "pioneering", "cutting-edge", - "emerging", "frontier", "state-of-the-art", "advanced", "experimental", - "proof-of-concept", "first", "unprecedented" - ], - "speculative_terms": [ - "speculative", "hypothetical", "flagged" - ], - "gap_terms": [ - "research gap", "knowledge gap", "data gap" - ], - "repro_terms": [ - "open source", "code available", "github", "weights", "checkpoint", - "seed", "license", "hyperparameter", "learning rate", "batch size" - ], - "safety_terms": [ - "bias", "fairness", "toxicity", "privacy", "safety", "data leakage", "red teaming", "harmful content" - ] -} From daec9b41005993146a17fcdbfcd62281067bfa21 Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Sun, 25 Jan 2026 19:05:57 +0100 Subject: [PATCH 04/13] :sparkles: add domain concept --- yescieval/base/__init__.py | 6 +- yescieval/base/domain.py | 9 ++ yescieval/injector/domains/__init__.py | 11 +++ yescieval/injector/domains/ecology.py | 107 ++++++++++++++++++++++ yescieval/injector/domains/nlp.py | 118 +++++++++++++++++++++++++ 5 files changed, 247 insertions(+), 4 deletions(-) create mode 100644 yescieval/base/domain.py create mode 100644 yescieval/injector/domains/__init__.py create mode 100644 yescieval/injector/domains/ecology.py create mode 100644 yescieval/injector/domains/nlp.py diff --git a/yescieval/base/__init__.py b/yescieval/base/__init__.py index 838757f..d8b245a 100644 --- a/yescieval/base/__init__.py +++ b/yescieval/base/__init__.py @@ -1,14 +1,12 @@ from .rubric import Rubric +from .domain import Domain from .parser import Parser, RubricLikertScale from .judge import Judge -from .vocab import VocabLoader -from .example import ExampleLoader __all__ = [ "Rubric", "Parser", "RubricLikertScale", "Judge", - "VocabLoader", - "ExampleLoader" + "Domain", ] \ No newline at end of file diff --git a/yescieval/base/domain.py b/yescieval/base/domain.py new file mode 100644 index 0000000..679c555 --- /dev/null +++ b/yescieval/base/domain.py @@ -0,0 +1,9 @@ +from abc import ABC +from pydantic import BaseModel +from typing import Dict + +class Domain(BaseModel, ABC): + examples: Dict[str, Dict] = None + vocab: Dict[str, Dict] = None + ID: str = None + verbalized: str = None \ No newline at end of file diff --git a/yescieval/injector/domains/__init__.py b/yescieval/injector/domains/__init__.py new file mode 100644 index 0000000..610c316 --- /dev/null +++ b/yescieval/injector/domains/__init__.py @@ -0,0 +1,11 @@ +from typing import Dict 
+from .nlp import NLP +from .ecology import Ecology + +domains = [NLP(), Ecology()] + +vocabs: Dict[str, Dict] = {domain.ID: domain.vocab for domain in domains} + +example_responses: Dict[str, Dict] = {domain.ID: domain.examples for domain in domains} + +verbalized_domains: Dict[str, str] = {domain.ID: domain.verbalized for domain in domains} \ No newline at end of file diff --git a/yescieval/injector/domains/ecology.py b/yescieval/injector/domains/ecology.py new file mode 100644 index 0000000..2946f86 --- /dev/null +++ b/yescieval/injector/domains/ecology.py @@ -0,0 +1,107 @@ +from typing import Dict +from ...base.domain import Domain + +vocabulary = { + "regions": [ + "Europe", "North America", "South America", "Asia", "Africa", "Australia", "Mediterranean", "Alpine", + "Tropical", "Arctic", "Boreal", "Temperate", "Subtropical", "Arid", "Wetland", "Coastal", "Marine", + "Freshwater", "Terrestrial", "Montane", "Savanna", "Tundra", "Desert", "Grassland", "Rainforest", + "Riparian", "Peatland", "Mangrove", "Coral reef" + ], + "interventions": [ + "fertilizer", "stocking", "mowing", "grazing", "irrigation", "organic", "controlled burn", "prescribed burn", + "restoration", "reforestation", "afforestation", "rewilding", "habitat creation", "invasive species control", + "predator control", "captive breeding", "protected area", "translocation", "assisted migration", "biochar", + "liming", "mulching", "cover cropping", "selective logging", "thinning", "buffer strips", "fencing", + "corridor", "wetland creation" + ], + "mechanistic_terms": [ + "mechanism", "pathway", "feedback", "trophic", "nutrient cycling", "energy flow", "predation", "competition", + "mutualism", "facilitation", "inhibition", "succession", "disturbance", "resilience", "adaptation", + "selection pressure", "gene flow", "decomposition", "mineralization", "nitrification", "photosynthesis", + "respiration", "herbivory", "allelopathy", "keystone", "hysteresis", "tipping point" + ], + "diversity_dimensions": [ + "taxonomic", "functional", "phylogenetic", "alpha", "beta", "gamma", "species richness", "evenness", + "dominance", "endemism", "rarity", "abundance", "biomass", "density", "coverage", "trait diversity", + "genetic diversity", "structural diversity", "shannon", "simpson", "hill numbers" + ], + "temporal_terms" :[ + "within 2–5 years", "lag of ~6 months", "after 3 months", "before 12 weeks", "1998–2004", + "June 2012", "every 2 weeks" + ], + "ecosystem_services": [ + "provisioning", "regulating", "supporting", "cultural", "carbon sequestration", "pollination", "pest control", + "water purification", "soil formation", "nutrient retention", "climate regulation", "flood control", + "erosion control", "recreation", "aesthetic value", "food production", "timber", "fiber", "fuel", + "genetic resources", "biochemicals", "fresh water" + ], + "scale_terms": ["individual", "population", "community", "ecosystem", "landscape", "patch", "local", "regional", "global"], + "causal_terms": [ + "because", "due to", "caused by", "results in", "leads to", "triggers", "induces", "therefore", "consequently", + "as a result", "hence", "thus", "accordingly", "owing to", "through", "via", "by means of", + "mediates", "modulates", "drives", "regulates" + ], + "innovation_terms": [ + "novel", "innovative", "breakthrough", "pioneering", "cutting-edge", + "emerging", "frontier", "state-of-the-art", "advanced", "experimental", + "proof-of-concept", "first", "unprecedented" + ], + "speculative_terms": ["speculative", "hypothetical", "flagged"], + 
"gap_terms": ["research gap", "knowledge gap", "data gap"], + "uncertainty_terms": ["uncertain", "unclear", "unknown"], + "stats_terms": [ + "mean", "median", "variance", "standard deviation", "standard error", "confidence interval", "ci", + "p-value", "significant", "regression", "anova", "t-test", "chi-square", "effect size", "meta-analysis", + "model comparison", "r-squared" + ], + "conservation_terms": [ + "endangered", "extinction", "habitat loss", "fragmentation", "restoration", "landscape connectivity", "corridor", "buffer zone" + ], + "climate_terms": [ + "climate change", "global warming", "drought", "heatwave", "extreme weather", "phenology", "range shift", + "sea level rise", "ocean acidification", "greenhouse gas", "carbon dioxide", "thermal stress", "precipitation" + ], + "complexity_terms": ["nonlinear", "emergent", "synergistic", "interconnected", "complex", "multifaceted"] +} + +example_responses = { + "Depth": { + "MechanisticUnderstanding": [ + { + "rating": "1", + "rationale": "The response mainly describes outcomes or observations and does not explain the underlying mechanisms or processes driving them." + }, + { + "rating": "4", + "rationale": "The response explains ecological mechanisms by describing pathways and feedbacks such as nutrient cycling and energy flow, and how interactions like predation, competition, and mutualism influence ecosystem dynamics, resilience, and responses to disturbance." + } + ], + "CausalReasoning": [ + { + "rating": "1", + "rationale": "The response describes ecological patterns or outcomes but does not clearly explain why they occur or how one factor leads to another." + }, + { + "rating": "4", + "rationale": "The response presents clear cause-effect reasoning, explaining how changes in ecological factors drive outcomes, for example describing how disturbances lead to shifts in community structure, which consequently regulate ecosystem processes through specific mediating interactions." + } + ], + "TemporalPrecision": [ + { + "rating": "1", + "rationale": "The response refers to timing only in vague terms, such as long-term or historical trends, without specifying concrete dates, durations, or time intervals." + }, + { + "rating": "4", + "rationale": "The response uses specific and bounded temporal expressions, for example describing changes occurring within 2-5 years, after 3 months, or every 2 weeks, and referencing defined time periods such as 1998-2004 or June 2012." 
+ } + ] + } +} + +class Ecology(Domain): + examples: Dict[str, Dict] = example_responses + vocab: Dict[str, Dict] = vocabulary + ID: str = "ecology" + verbalized: str = "Ecology" \ No newline at end of file diff --git a/yescieval/injector/domains/nlp.py b/yescieval/injector/domains/nlp.py new file mode 100644 index 0000000..c487def --- /dev/null +++ b/yescieval/injector/domains/nlp.py @@ -0,0 +1,118 @@ +from typing import Dict +from ...base.domain import Domain + +vocabulary = { + "tasks": [ + "classification", "sentiment", "ner", "named entity recognition", "pos", "part of speech", "parsing", + "constituency parsing", "dependency parsing", "qa", "question answering", "open-domain qa", "closed-book qa", + "summarization", "abstractive summarization", "extractive summarization", "translation", "machine translation", + "mt", "retrieval", "dense retrieval", "bm25", "reranking", "re-ranking", "dialogue", "dialog", "conversation", + "chat", "generation", "text generation", "story generation", "code generation", "coreference", + "coreference resolution", "slot filling", "nli", "natural language inference", "sts", + "semantic textual similarity", "entailment" + ], + "datasets": [ + "glue", "superglue", "squad", "squad2", "mnli", "qqp", "qnli", "cola", "sst", "sst-2", "stsb", "wmt", + "cnn/daily mail", "cnn dm", "xsum", "gigaword", "coqa", "hotpotqa", "msmarco", "triviaqa", "belebele", "mmlu", + "hellaswag", "truthfulqa", "gsm8k", "humaneval", "arc", "piqa", "boolq", "openbookqa" + ], + "languages": [ + "english", "german", "deutsch", "french", "spanish", "italian", "chinese", "japanese", "korean", "arabic", "hindi", + "multilingual", "cross-lingual", "low-resource" + ], + "temporal_terms" :[ + "within 2–5 years", "lag of ~6 months", "after 3 months", "before 12 weeks", "1998–2004", "June 2012", "every 2 weeks" + ], + "eval_metrics": [ + "accuracy", "f1", "precision", "recall", "bleu", "chrf", "rouge", "meteor", "bertscore", "perplexity", + "exact match", "em" + ], + "arch_terms": [ + "transformer", "encoder-decoder", "decoder-only", "bert", "albert", "roberta", "t5", "gpt", "llama", "mistral", + "lstm", "gru", "cnn" + ], + "training_terms": [ + "pretraining", "fine-tuning", "instruction tuning", "rlhf", "dpo", "lora", "qlora", "quantization", + "distillation", "curriculum", "data augmentation", "continual learning" + ], + "ablation_terms": [ + "ablation", "ablation study", "component analysis", "feature ablation", "module ablation" + ], + "compute_terms": [ + "gpu", "tpu", "flops", "parameters", "params", "billion parameters", "inference time", "throughput", + "latency", "memory footprint" + ], + "causal_terms": [ + "because", "due to", "caused by", "results in", "leads to", "triggers", "induces", "therefore", "consequently", + "as a result", "hence", "thus", "via", "through", "mediates", "modulates", "drives", "regulates" + ], + "rigor_stats": [ + "p-value", "p<", "p >", "significant", "confidence interval", "ci", "t-test", "anova", "regression", + "bootstrap", "cross-validation", "held-out", "standard deviation", "std", "mean", "median" + ], + "stats_terms": [ + "p-value", "confidence interval", "t-test", "anova", "regression", "effect size", "variance", + "standard deviation", "standard error", "r-squared" + ], + "uncertainty_terms": [ + "uncertain", "unclear", "unknown" + ], + "innovation_terms": [ + "novel", "innovative", "breakthrough", "pioneering", "cutting-edge", "emerging", "frontier", "state-of-the-art", + "advanced", "experimental", "proof-of-concept", "first", "unprecedented" + ], + 
"speculative_terms": [ + "speculative", "hypothetical", "flagged" + ], + "gap_terms": [ + "research gap", "knowledge gap", "data gap" + ], + "repro_terms": [ + "open source", "code available", "github", "weights", "checkpoint", "seed", "license", "hyperparameter", + "learning rate", "batch size" + ], + "safety_terms": [ + "bias", "fairness", "toxicity", "privacy", "safety", "data leakage", "red teaming", "harmful content" + ] +} + +example_responses = { + "Depth": { + "MechanisticUnderstanding": [ + { + "rating": "1", + "rationale": "The response reports results or model performance but does not explain how the model architecture or training process leads to those outcomes." + }, + { + "rating": "4", + "rationale": "The response provides a clear mechanistic explanation of how the model works, describing the role of transformer-based architectures, the effects of pretraining and fine-tuning, and insights from ablation studies that show how specific components contribute to performance." + } + ], + "CausalReasoning": [ + { + "rating": "1", + "rationale": "The response reports results or observations but does not provide explicit cause-effect explanations linking methods or design choices to outcomes." + }, + { + "rating": "4", + "rationale": "The response provides structured cause-effect reasoning, explaining how architectural or training choices lead to performance differences, for example noting that improvements occur because certain components modulate information flow, which consequently drives better generalization through specific training mechanisms." + } + ], + "TemporalPrecision": [ + { + "rating": "1", + "rationale": "The response mentions time only in broad or unspecific ways and does not provide clear dates, durations, or intervals relevant to the discussion." + }, + { + "rating": "4", + "rationale": "The response includes precise temporal details, such as model behavior observed after 3 months of training, performance changes within 2-5 years of development, or evaluations conducted every 2 weeks, with references to specific time ranges like 1998-2004 or June 2012." 
+ } + ] + } +} + +class NLP(Domain): + examples: Dict[str, Dict] = example_responses + vocab: Dict[str, Dict] = vocabulary + ID: str = 'nlp' + verbalized: str = "NLP" \ No newline at end of file From 4c56b0fad8cc9e2c2a92ab42e4b41b5db3f02dad Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Sun, 25 Jan 2026 19:06:12 +0100 Subject: [PATCH 05/13] :sparkles: add injector concept --- yescieval/__init__.py | 4 +- yescieval/base/example.py | 66 --------------------- yescieval/base/rubric.py | 62 ++++++++------------ yescieval/base/vocab.py | 104 --------------------------------- yescieval/injector/__init__.py | 9 +++ yescieval/injector/example.py | 37 ++++++++++++ yescieval/injector/vocab.py | 58 ++++++++++++++++++ 7 files changed, 131 insertions(+), 209 deletions(-) delete mode 100644 yescieval/base/example.py delete mode 100644 yescieval/base/vocab.py create mode 100644 yescieval/injector/__init__.py create mode 100644 yescieval/injector/example.py create mode 100644 yescieval/injector/vocab.py diff --git a/yescieval/__init__.py b/yescieval/__init__.py index 45f2d0e..3dff49d 100644 --- a/yescieval/__init__.py +++ b/yescieval/__init__.py @@ -2,12 +2,12 @@ __version__ = (Path(__file__).parent / "VERSION").read_text().strip() -from .base import Rubric, Parser, Judge, VocabLoader, ExampleLoader +from .base import Rubric, Parser, Judge from .rubric import (Informativeness, Correctness, Completeness, Coherence, Relevancy, Integration, Cohesion, Readability, Conciseness, MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification, StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment, SpeculativeStatements, NoveltyIndicators) +from .injector import ExampleInjector, VocabularyInjector from .judge import AutoJudge, AskAutoJudge, BioASQAutoJudge, CustomAutoJudge, GPTCustomAutoJudge from .parser import GPTParser - diff --git a/yescieval/base/example.py b/yescieval/base/example.py deleted file mode 100644 index 94576e0..0000000 --- a/yescieval/base/example.py +++ /dev/null @@ -1,66 +0,0 @@ -import json -from typing import Dict, Any - - -class ExampleLoader: - """ - Loads rubric-specific example responses and injects them - into prompt templates based on domain and rubric name. - """ - - DOMAIN_MAP = { - "nlp": "NLP", - "ecology": "Ecology", - } - - CATEGORIES = ("Depth", "Breadth") - PLACEHOLDER = "{EXAMPLE_RESPONSES}" - EMPTY_VALUE = "{}" - - def __init__(self, file_path: str): - self.data = self._load_examples(file_path) - - def _normalize_domain(self, domain: str) -> str: - return domain.strip().lower() - - def _load_examples(self, file_path: str) -> Dict[str, Any]: - with open(file_path, "r", encoding="utf-8") as f: - data = json.load(f) - - if not isinstance(data, dict): - raise ValueError("Example file must contain a JSON object at top level") - - return data - - def get_example_data(self, domain: str, rubric_name: str) -> Dict[str, Any]: - """ - Returns: - {rubric_name: } if found, - {} otherwise. - """ - domain = self._normalize_domain(domain) - topic_key = self.DOMAIN_MAP.get(domain) - - if not topic_key: - return {} - - topic_data = self.data.get(topic_key, {}) - - for category in self.CATEGORIES: - category_data = topic_data.get(category, {}) - if rubric_name in category_data: - return {rubric_name: category_data[rubric_name]} - - return {} - - def fill_prompt(self, template: str, domain: str, rubric_name: str) -> str: - """ - Injects example responses JSON into the template. 
- """ - example_data = self.get_example_data(domain, rubric_name) - - if not example_data: - return template.replace(self.PLACEHOLDER, self.EMPTY_VALUE) - - example_json = json.dumps(example_data, indent=4) - return template.replace(self.PLACEHOLDER, example_json) diff --git a/yescieval/base/rubric.py b/yescieval/base/rubric.py index d93d61e..adb642e 100644 --- a/yescieval/base/rubric.py +++ b/yescieval/base/rubric.py @@ -1,8 +1,7 @@ from abc import ABC from pydantic import BaseModel from typing import Dict, List, Optional -from .vocab import VocabLoader -from .example import ExampleLoader +from ..injector import ExampleInjector, VocabularyInjector class Rubric(BaseModel, ABC): """ @@ -21,10 +20,11 @@ class Rubric(BaseModel, ABC): "\n\n{content}\n\n###") domain: Optional[str] = None - vocab_manager: Optional[VocabLoader] = None - example_manager: Optional[ExampleLoader] = None - model_config = {"arbitrary_types_allowed": True} + vocabulary: Optional[VocabularyInjector] = None + example: Optional[ExampleInjector] = None + model_config = {"arbitrary_types_allowed": True} # Not used in the class but unable to generate + # pydantic-core schema for vocab and example injectors def render_papers(self) -> str: paper_content = "" @@ -32,38 +32,26 @@ def render_papers(self) -> str: paper_content += f"{idx + 1}. {title}\n\n{abstract}\n\n" return paper_content - def preprocess_user_prompt(self, template: str) -> str: - """ - Fills vocabulary and example placeholders in the system prompt. - """ - filled = template - - if self.vocab_manager and self.domain: - filled = self.vocab_manager.fill_prompt(filled, self.domain) - - if self.example_manager and self.domain: - - filled = self.example_manager.fill_prompt( - template=filled, - domain=self.domain, - rubric_name=self.name - ) - - return filled - - def verbalize(self) -> str: - """ - Fill placeholders first, then format with answer, question, and papers. - """ - filled_template = self.preprocess_user_prompt(self.user_prompt_template) - return filled_template.format( - answer=self.answer, - question=self.question, - content=self.render_papers() - ) + def verbalize_user_prompt(self): + return self.user_prompt_template.format(answer=self.answer, + question=self.question, + content=self.render_papers()) + + def verbalize_system_prompt(self): + system_prompt_template = self.system_prompt_template + if self.domain: + if self.vocabulary: + system_prompt_template = self.vocabulary.format_prompt(prompt=system_prompt_template, domain=self.domain) + if self.example: + system_prompt_template = self.example.format_prompt(prompt=system_prompt_template, + domain=self.domain, + rubric_id=self.name) + return system_prompt_template def instruct(self) -> List[Dict[str, str]]: - return [ - {"role": "system", "content": self.system_prompt_template}, - {"role": "user", "content": self.verbalize()}, + message = [ + {"role": "system", "content": self.verbalize_system_prompt()}, + {"role": "user", "content": self.verbalize_user_prompt()}, ] + return message + diff --git a/yescieval/base/vocab.py b/yescieval/base/vocab.py deleted file mode 100644 index 2b476c9..0000000 --- a/yescieval/base/vocab.py +++ /dev/null @@ -1,104 +0,0 @@ -import json -from pathlib import Path -from typing import Dict - - -class VocabLoader: - """ - Loads multiple vocabularies and fills placeholders in prompts - based on the selected domain. 
- """ - - PLACEHOLDERS = { - "{MECHANISTIC_VOCAB}": "mechanistic_vocab_block", - "{CAUSAL_VOCAB}": "causal_vocab_block", - "{TEMPORAL_VOCAB}": "temporal_vocab_block", - } - - def __init__(self, domain_to_file: Dict[str, str]): - """ - domain_to_file: {"nlp": "vocab/nlp_dictionary.json", "ecology": "vocab/ecology_dictionary.json"} - """ - self.domain_to_file = { - self._normalize_domain(k): v for k, v in domain_to_file.items() - } - self.vocabs: Dict[str, Dict] = {} - - for domain, file_path in self.domain_to_file.items(): - self.vocabs[domain] = self._load_vocab(file_path) - - def _normalize_domain(self, domain: str) -> str: - return domain.strip().lower() - - def _load_vocab(self, file_path: str) -> Dict: - file_path = Path(file_path) - if not file_path.exists(): - raise FileNotFoundError(f"Vocabulary file not found: {file_path}") - with open(file_path, "r", encoding="utf-8") as f: - data = json.load(f) - if not isinstance(data, dict): - raise ValueError(f"Invalid vocabulary format: {file_path}") - return data - - def _clean_terms(self, terms) -> list[str]: - - seen = set() - cleaned = [] - for t in terms: - if not isinstance(t, str): - continue - t = t.strip() - if not t or t in seen: - continue - seen.add(t) - cleaned.append(t) - return cleaned - - - def mechanistic_vocab_block(self, domain: str) -> str: - domain = self._normalize_domain(domain) - V = self.vocabs.get(domain, {}) - - if domain == "ecology": - terms = V.get("mechanistic_terms", []) - label = "Mechanistic terms (Ecology)" - elif domain == "nlp": - terms = ( - V.get("training_terms", []) - + V.get("arch_terms", []) - + V.get("ablation_terms", []) - ) - label = "Mechanistic terms (NLP)" - else: - terms = V.get("mechanistic_terms", []) - label = "Mechanistic terms" - - terms = self._clean_terms(terms) - return f"{label}: " + ", ".join(terms) - - def causal_vocab_block(self, domain: str) -> str: - domain = self._normalize_domain(domain) - V = self.vocabs.get(domain, {}) - terms = self._clean_terms(V.get("causal_terms", [])) - return "Causal connectives / triggers: " + ", ".join(terms) - - def temporal_vocab_block(self, domain: str) -> str: - domain = self._normalize_domain(domain) - V = self.vocabs.get(domain, {}) - terms = self._clean_terms(V.get("temporal_terms", [])) - return "Temporal expressions: " + ", ".join(terms) - - def fill_prompt(self, prompt_template: str, domain: str) -> str: - """ - Replaces known placeholders in the prompt with vocab blocks - based on the domain. 
- """ - prompt = prompt_template - domain = self._normalize_domain(domain) - - for placeholder, method_name in self.PLACEHOLDERS.items(): - if placeholder in prompt: - block_fn = getattr(self, method_name) - prompt = prompt.replace(placeholder, block_fn(domain)) - - return prompt diff --git a/yescieval/injector/__init__.py b/yescieval/injector/__init__.py new file mode 100644 index 0000000..d50d946 --- /dev/null +++ b/yescieval/injector/__init__.py @@ -0,0 +1,9 @@ + +from .example import ExampleInjector +from .vocab import VocabularyInjector + + +__all__ = [ + "ExampleInjector", + "VocabularyInjector" +] \ No newline at end of file diff --git a/yescieval/injector/example.py b/yescieval/injector/example.py new file mode 100644 index 0000000..f6153e2 --- /dev/null +++ b/yescieval/injector/example.py @@ -0,0 +1,37 @@ +import json +from abc import ABC +from typing import Any + +from .domains import example_responses + +class ExampleInjector(ABC): + """ + Loads rubric-specific example responses and injects them + into prompt templates based on domain and rubric name. + """ + examples_placeholder = "{EXAMPLE_RESPONSES}" + empty_placeholder = "{}" + + def format_example(self, domain: str, rubric_id: str) -> Any: + """ + Returns: + {rubric_id: } if found, + {} otherwise. + """ + domain_id = domain.strip().lower() + domain_example_responses = example_responses.get(domain_id, None) + if domain_example_responses: + for _, rubrics in domain_example_responses.items(): + for rubric, example_response in rubrics.items(): + if rubric_id == rubric: + return json.dumps({rubric_id: example_response}, indent=4) + return None + + def format_prompt(self, prompt: str, domain: str, rubric_id: str) -> str: + """ + Injects example responses JSON into the template. + """ + examples = self.format_example(domain, rubric_id) + if examples: + return prompt.replace(self.examples_placeholder, examples) + return prompt.replace(self.examples_placeholder, self.empty_placeholder) diff --git a/yescieval/injector/vocab.py b/yescieval/injector/vocab.py new file mode 100644 index 0000000..28057c1 --- /dev/null +++ b/yescieval/injector/vocab.py @@ -0,0 +1,58 @@ +from abc import ABC +from typing import Dict, List +from .domains import vocabs, verbalized_domains + +class VocabularyInjector(ABC): + """ + Loads multiple vocabularies and fills placeholders in prompts + based on the selected domain. 
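+    Placeholders handled: {MECHANISTIC_VOCAB}, {CAUSAL_VOCAB}, and {TEMPORAL_VOCAB}.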
+ """ + placeholders: Dict[str, str] = { + "{MECHANISTIC_VOCAB}": "mechanistic_vocab_block", + "{CAUSAL_VOCAB}": "causal_vocab_block", + "{TEMPORAL_VOCAB}": "temporal_vocab_block", + } + + def _clean_terms(self, terms) -> List[str]: + seen_terms = set() + cleaned_terms = [] + for term in terms: + if not isinstance(term, str): + continue + term = term.strip() + if not term or term in seen_terms: + continue + seen_terms.add(term) + cleaned_terms.append(term) + return cleaned_terms + + def mechanistic_vocab_block(self, domain_id: str) -> str: + terms = vocabs[domain_id].get("mechanistic_terms") + label = "Mechanistic terms" + label += f" ({verbalized_domains.get(domain_id)})" if verbalized_domains.get(domain_id) else "" + if domain_id == "nlp": + terms = (vocabs[domain_id].get("training_terms") + + vocabs[domain_id].get("arch_terms") + + vocabs[domain_id].get("ablation_terms")) + terms = self._clean_terms(terms) + return f"{label}: " + ", ".join(terms) + + def causal_vocab_block(self, domain_id: str) -> str: + terms = self._clean_terms(vocabs[domain_id].get("causal_terms")) + return "Causal connectives / triggers: " + ", ".join(terms) + + def temporal_vocab_block(self, domain_id: str) -> str: + terms = self._clean_terms(vocabs[domain_id].get("temporal_terms")) + return "Temporal expressions: " + ", ".join(terms) + + def format_prompt(self, prompt: str, domain: str) -> str: + """ + Replaces known placeholders in the prompt with vocab blocks + based on the domain. + """ + domain_id = domain.strip().lower() + for placeholder, method in self.placeholders.items(): + if placeholder in prompt: + block_fn = getattr(self, method) + prompt = prompt.replace(placeholder, block_fn(domain_id)) + return prompt From 04818625aaf52c6367b575a32eecbff6229fdb50 Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Sun, 25 Jan 2026 19:14:20 +0100 Subject: [PATCH 06/13] :pencil2: revert unwanted change --- yescieval/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yescieval/__init__.py b/yescieval/__init__.py index 3dff49d..5cb35f5 100644 --- a/yescieval/__init__.py +++ b/yescieval/__init__.py @@ -2,7 +2,7 @@ __version__ = (Path(__file__).parent / "VERSION").read_text().strip() -from .base import Rubric, Parser, Judge +from .base import Rubric, Parser from .rubric import (Informativeness, Correctness, Completeness, Coherence, Relevancy, Integration, Cohesion, Readability, Conciseness, MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification, From 279aade909baf6c3792766e3a68e46be4ccac880 Mon Sep 17 00:00:00 2001 From: Mike Ashley Cedric Date: Mon, 26 Jan 2026 05:03:37 +0100 Subject: [PATCH 07/13] =?UTF-8?q?=F0=9F=90=9B=20Removed=20Additional=20Exa?= =?UTF-8?q?mples=20from=20Depth=20Prompt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- yescieval/rubric/depth.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/yescieval/rubric/depth.py b/yescieval/rubric/depth.py index 94738da..ef74a5c 100644 --- a/yescieval/rubric/depth.py +++ b/yescieval/rubric/depth.py @@ -92,8 +92,6 @@ class MechanisticUnderstanding(Rubric): Below are examples of causal connectives and expressions that often signal causal reasoning (across domains). They are examples only: their presence is not required, and their presence alone is not sufficient for a high score. 
-Causal connectives / triggers (examples): because, due to, therefore, thus, hence, leads to, results in, causes, contributes to, drives, produces, induces, triggers, promotes, suppresses, mediates, moderates, modulates, depends on, under conditions of, only if, unless. - {CAUSAL_VOCAB} @@ -159,10 +157,6 @@ class CausalReasoning(Rubric): Below are examples of temporal expressions. They are examples only: their presence is not required, and their presence alone is not sufficient for a high score. -Specific temporal expressions (examples): in 2019; between 2010–2015; over 6 months; within 2–5 years; a 3-year follow-up; from March 2020 to June 2021; after 12 weeks; pre- vs post-intervention; before/after fine-tuning; during pretraining. - -Vague temporal markers (examples): historically; in the past; long-term; recently; soon; over time; nowadays; for some time; at times; in earlier work. - {TEMPORAL_VOCAB} From 2801221eac2a891bbd60ab350b16789793ef4897 Mon Sep 17 00:00:00 2001 From: Mike Ashley Cedric Date: Mon, 26 Jan 2026 23:20:43 +0100 Subject: [PATCH 08/13] =?UTF-8?q?=F0=9F=93=9D=20updated=20the=20documentat?= =?UTF-8?q?ion,=20highlighting=20the=20code=20changes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/judges.rst | 4 +- docs/source/quickstart.rst | 12 +++--- docs/source/rubrics.rst | 86 ++------------------------------------ 3 files changed, 12 insertions(+), 90 deletions(-) diff --git a/docs/source/judges.rst b/docs/source/judges.rst index 14621a3..c675a67 100644 --- a/docs/source/judges.rst +++ b/docs/source/judges.rst @@ -20,7 +20,7 @@ The following example demonstrates how to create an evaluation rubric, load a ju .. code-block:: python - from yescieval import Readability, AutoJudge + from yescieval import Readability, AutoJudge, ExampleInjector, VocabularyInjector papers = { "A Study on AI": "This paper discusses recent advances in artificial intelligence, including deep learning.", @@ -38,7 +38,7 @@ The following example demonstrates how to create an evaluation rubric, load a ju ) # Step 1: Create a rubric - rubric = Readability(papers=papers, question=question, answer=answer) + rubric = Readability(papers=papers, question=question, answer=answer, domain="nlp", vocabulary=VocabularyInjector(), example=ExampleInjector()) instruction_prompt = rubric.instruct() # Step 2: Load the evaluation model (judge) diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index 8f7fbdc..f8b5da8 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -1,14 +1,14 @@ Quickstart ================= -YESciEval is a library designed to evaluate the quality of synthesized scientific answers using predefined rubrics and advanced LLM-based judgment models. This guide walks you through how to evaluate answers based on **informativeness** and **gap identification** using a pretrained & a custom judge and parse LLM output into structured JSON. +YESciEval is a library designed to evaluate the quality of synthesized scientific answers using predefined rubrics and advanced LLM-based judgment models. This guide walks you through how to evaluate answers based on **informativeness** and **mechanistic understanding** using a pretrained & a custom judge and parse LLM output into structured JSON. **Example: Evaluating an Answer Using Informativeness + AskAutoJudge** .. 
code-block:: python - from yescieval import Informativeness, AskAutoJudge, GPTParser + from yescieval import Informativeness, AskAutoJudge, GPTParser, ExampleInjector, VocabularyInjector # Sample papers used in form of {"title": "abstract", ... } papers = { @@ -27,7 +27,7 @@ YESciEval is a library designed to evaluate the quality of synthesized scientifi ) # Step 1: Create a rubric - rubric = Informativeness(papers=papers, question=question, answer=answer) + rubric = Informativeness(papers=papers, question=question, answer=answer, domain="nlp", vocabulary=VocabularyInjector(), example=ExampleInjector()) instruction_prompt = rubric.instruct() # Step 2: Load the evaluation model (judge) @@ -47,14 +47,14 @@ YESciEval is a library designed to evaluate the quality of synthesized scientifi - Add more rubrics such as ``Informativeness``, ``Relevancy``, etc for multi-criteria evaluation. -**Example: Evaluating an Answer Using GapIdentification + CustomAutoJudge** +**Example: Evaluating an Answer Using MechanisticUnderstanding + CustomAutoJudge** .. code-block:: python - from yescieval import GapIdentification, CustomAutoJudge + from yescieval import MechanisticUnderstanding, CustomAutoJudge, ExampleInjector, VocabularyInjector # Step 1: Create a rubric - rubric = GapIdentification(papers=papers, question=question, answer=answer) + rubric = MechanisticUnderstanding(papers=papers, question=question, answer=answer, domain="nlp", vocabulary=VocabularyInjector(), example=ExampleInjector()) instruction_prompt = rubric.instruct() # Step 2: Load the evaluation model (judge) diff --git a/docs/source/rubrics.rst b/docs/source/rubrics.rst index b38498c..7568210 100644 --- a/docs/source/rubrics.rst +++ b/docs/source/rubrics.rst @@ -2,7 +2,7 @@ Rubrics =================== -A total of twenty three (23) evaluation rubrics were defined as part of the YESciEval test framework. +A total of twelve (12) evaluation rubrics were defined as part of the YESciEval test framework. Linguistic & Stylistic Quality --------------------------------- @@ -78,81 +78,6 @@ Following ``Research Depth Assessment`` quantifies the mechanistic and analytica * - **12. Temporal Precision:** - Does the answer include specific time references, like intervals (“within 6 months”) or dates (“1990–2020”)? -Research Breadth Assessment ---------------------------------- - -Following ``Research Breadth Assessment`` evaluates the diversity of evidence across spatial, ecological, and methodological contexts. - - -.. list-table:: - :header-rows: 1 - :widths: 20 80 - - * - Evaluation Rubric - - Description - * - **13. Geographic Coverage:** - - Does the answer cover multiple biogeographic zones, such as “Tropical” or “Boreal”? - * - **14. Intervention Diversity:** - - Does the answer include a variety of management practices? - * - **15. Biodiversity Dimensions:** - - Does the answer mention different aspects of biodiversity, like taxonomic, functional, phylogenetic, or spatial diversity? - * - **16. Ecosystem Services:** - - Does the answer include relevant ecosystem services, based on the Millennium Ecosystem Assessment vocabulary? - * - **17. Spatial Scale:** - - Does the answer specify the spatial scale, using terms like “local,” “regional,” or “continental” and area measures? - -Scientific Rigor Assessment ---------------------------------- - -Following ``Scientific Rigor Assessment`` assesses the evidentiary and methodological integrity of the synthesis. - - -.. 
list-table:: - :header-rows: 1 - :widths: 20 80 - - * - Evaluation Rubric - - Description - * - **18. Statistical Sophistication:** - - Does the answer use statistical methods or analyses, showing quantitative rigor and depth? - * - **19. Citation Practices:** - - Does the answer properly cite sources, using parenthetical or narrative citations (e.g., “(Smith et al., 2021)”)? - * - **20. Uncertainty Acknowledgment:** - - Does the answer explicitly mention limitations or uncertainty, using terms like “unknown,” “limited evidence,” or “unclear”? - -Innovation Capacity Assessment ---------------------------------- - -Following ``Innovation Capacity Assessment`` evaluates the novelty of the synthesis. - - -.. list-table:: - :header-rows: 1 - :widths: 20 80 - - * - Evaluation Rubric - - Description - * - **21. Speculative Statements:** - - Does the answer include cautious or hypothetical statements, using words like “might,” “could,” or “hypothetical”? - * - **22. Novelty Indicators :** - - Does the answer highlight innovation using terms like “novel,” “pioneering,” or “emerging”? - - -Research Gap Assessment ---------------------------------- - -Following ``Research Gap Assessment`` detects explicit acknowledgment of unanswered questions or understudied areas in the synthesis. - - -.. list-table:: - :header-rows: 1 - :widths: 20 80 - - * - Evaluation Rubric - - Description - * - **23. Gap Identification:** - - Does the answer point out unanswered questions or understudied areas, using terms like “research gap” or “understudied”? - Usage Example -------------------------- @@ -162,11 +87,8 @@ Here is a simple example of how to import rubrics in your code: .. code-block:: python from yescieval import Informativeness, Correctness, Completeness, Coherence, Relevancy, - Integration, Cohesion, Readability, Conciseness, GeographicCoverage, - InterventionDiversity, BiodiversityDimensions, EcosystemServices, SpatialScale, - MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification, - StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment, - SpeculativeStatements, NoveltyIndicators + Integration, Cohesion, Readability, Conciseness, + MechanisticUnderstanding, CausalReasoning, TemporalPrecision And to use rubrics: @@ -184,7 +106,7 @@ And to use rubrics: answer = "The synthesis answer summarizing the papers." # Instantiate a rubric, e.g. Coherence - rubric = Coherence(papers=papers, question=question, answer=answer) + rubric = Coherence(papers=papers, question=question, answer=answer, domain="nlp", vocabulary=VocabularyInjector(), example=ExampleInjector()) instruction = rubric.instruct() print(instruction) From 1ebc0fa2ef25234517ce7d4f87573e31553f4b13 Mon Sep 17 00:00:00 2001 From: MikeACedric <72818458+MikeACedric@users.noreply.github.com> Date: Wed, 28 Jan 2026 10:13:33 +0100 Subject: [PATCH 09/13] :pencil: updated documentation for the rubrics, added section describing the example and vocabulary injectors --- docs/source/judges.rst | 4 +- docs/source/quickstart.rst | 53 +++++++++++++++++++++++- docs/source/rubrics.rst | 84 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 133 insertions(+), 8 deletions(-) diff --git a/docs/source/judges.rst b/docs/source/judges.rst index c675a67..14621a3 100644 --- a/docs/source/judges.rst +++ b/docs/source/judges.rst @@ -20,7 +20,7 @@ The following example demonstrates how to create an evaluation rubric, load a ju .. 
code-block:: python - from yescieval import Readability, AutoJudge, ExampleInjector, VocabularyInjector + from yescieval import Readability, AutoJudge papers = { "A Study on AI": "This paper discusses recent advances in artificial intelligence, including deep learning.", @@ -38,7 +38,7 @@ The following example demonstrates how to create an evaluation rubric, load a ju ) # Step 1: Create a rubric - rubric = Readability(papers=papers, question=question, answer=answer, domain="nlp", vocabulary=VocabularyInjector(), example=ExampleInjector()) + rubric = Readability(papers=papers, question=question, answer=answer) instruction_prompt = rubric.instruct() # Step 2: Load the evaluation model (judge) diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index f8b5da8..0c4629b 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -8,7 +8,7 @@ YESciEval is a library designed to evaluate the quality of synthesized scientifi .. code-block:: python - from yescieval import Informativeness, AskAutoJudge, GPTParser, ExampleInjector, VocabularyInjector + from yescieval import Informativeness, AskAutoJudge, GPTParser # Sample papers used in form of {"title": "abstract", ... } papers = { @@ -27,7 +27,7 @@ YESciEval is a library designed to evaluate the quality of synthesized scientifi ) # Step 1: Create a rubric - rubric = Informativeness(papers=papers, question=question, answer=answer, domain="nlp", vocabulary=VocabularyInjector(), example=ExampleInjector()) + rubric = Informativeness(papers=papers, question=question, answer=answer) instruction_prompt = rubric.instruct() # Step 2: Load the evaluation model (judge) @@ -47,6 +47,53 @@ YESciEval is a library designed to evaluate the quality of synthesized scientifi - Add more rubrics such as ``Informativeness``, ``Relevancy``, etc for multi-criteria evaluation. +Customizing Rubric Prompts with Injectors +----------------------------------------- + +Injectors allow you to augment rubric prompts with additional guidance, such as example responses or domain-specific vocabulary, to improve evaluation alignment. Each injector is rubric-specific, meaning different rubrics can receive different injected content. They are domain-dependent, so the examples and vocabulary injected are automatically selected based on the domain you specify (e.g., "nlp", "ecology"). Multiple injectors, such as examples and vocabulary, can be used together in a composable way. Available injectors are listed below: + +- **Example Injector**: Injects curated example responses for the chosen rubric and domain. +- **Vocabulary Injector**: Injects domain and rubric-specific terminology to guide model reasoning. + +**Usage Example** + +.. code-block:: python + + rubric = MechanisticUnderstanding( + papers=papers, + question=question, + answer=answer, + domain="nlp", + vocabulary=VocabularyInjector(), + example=ExampleInjector() + ) + +In this example, ``VocabularyInjector`` and ``ExampleInjector`` provide content aligned with the NLP domain for the *Mechanistic Understanding* rubric. + +**Example Injected Responses for Mechanistic Understanding** + +.. code-block:: json + + "MechanisticUnderstanding": [ + { + "rating": "1", + "rationale": "The response reports results or model performance but does not explain how the model architecture or training process leads to those outcomes." 
+ }, + { + "rating": "4", + "rationale": "The response provides a clear mechanistic explanation of how the model works, describing the role of transformer-based architectures, the effects of pretraining and fine-tuning, and insights from ablation studies that show how specific components contribute to performance." + } + ] + +**Example Injected Vocabulary for Mechanistic Understanding in the NLP Domain** + +.. code-block:: json + + "training_terms": [ + "pretraining", "fine-tuning", "instruction tuning", "rlhf", "dpo", "lora", "qlora", "quantization", + "distillation", "curriculum", "data augmentation", "continual learning" + ] + **Example: Evaluating an Answer Using MechanisticUnderstanding + CustomAutoJudge** .. code-block:: python @@ -66,6 +113,8 @@ YESciEval is a library designed to evaluate the quality of synthesized scientifi print("Raw Evaluation Output:") print(result) + + **Parsing Raw Output with GPTParser** If the model outputs unstructured or loosely structured text, you can use GPTParser to parse it into valid JSON. diff --git a/docs/source/rubrics.rst b/docs/source/rubrics.rst index 7568210..7b04edd 100644 --- a/docs/source/rubrics.rst +++ b/docs/source/rubrics.rst @@ -2,7 +2,7 @@ Rubrics =================== -A total of twelve (12) evaluation rubrics were defined as part of the YESciEval test framework. +A total of twenty one (21) evaluation rubrics were defined as part of the YESciEval test framework. Linguistic & Stylistic Quality --------------------------------- @@ -79,6 +79,78 @@ Following ``Research Depth Assessment`` quantifies the mechanistic and analytica - Does the answer include specific time references, like intervals (“within 6 months”) or dates (“1990–2020”)? +Research Breadth Assessment +--------------------------------- + +Following ``Research Breadth Assessment`` evaluates the diversity of evidence across dimensions, scope, and methodological contexts. + + +.. list-table:: + :header-rows: 1 + :widths: 20 80 + + * - Evaluation Rubric + - Description + * - **13. Context Coverage:** + - Does the answer demonstrate breadth by addressing several distinct and relevant contexts related to the research question? + * - **14. Method Coverage:** + - Does the answer address multiple distinct methods or interventions relevant to the research question? + * - **15. Dimension Coverage:** + - Does the answer distribute attention across multiple distinct descriptive or evaluative dimensions relevant to the research question? + * - **16. Scope Coverage:** + - Does the answer distribute attention across multiple distinct scopes of applicability or impact relevant to the research question? + * - **17. Scale Coverage:** + - Does the answer distribute attention across multiple distinct scales relevant to the research question? + +Scientific Rigor Assessment +--------------------------------- + +Following ``Scientific Rigor Assessment`` assesses the evidentiary and methodological integrity of the synthesis. + + +.. list-table:: + :header-rows: 1 + :widths: 20 80 + + * - Evaluation Rubric + - Description + * - **18. Quantitative Evidence And Uncertainty:** + - Does the answer appropriately handle quantitative evidence and uncertainty relevant to the research question? + * - **19. Epistemic Calibration:** + - Does the answer clearly align claim strength with evidential support by marking uncertainty, assumptions, and limitations where relevant? 
+ +Innovation Capacity Assessment +--------------------------------- + +Following ``Innovation Capacity Assessment`` evaluates the novelty of the synthesis. + + +.. list-table:: + :header-rows: 1 + :widths: 20 80 + + * - Evaluation Rubric + - Description + * - **20. State-Of-The-Art And Novelty :** + - Does the response identify and contextualize relevant state-of-the-art or novel contributions relative to prior work? + + +Research Gap Assessment +--------------------------------- + +Following ``Research Gap Assessment`` detects explicit acknowledgment of unanswered questions or understudied areas in the synthesis. + + +.. list-table:: + :header-rows: 1 + :widths: 20 80 + + * - Evaluation Rubric + - Description + * - **21. Gap Identification:** + - Does the answer point out unanswered questions or understudied areas, using terms like “research gap” or “understudied”? + + Usage Example -------------------------- @@ -87,8 +159,12 @@ Here is a simple example of how to import rubrics in your code: .. code-block:: python from yescieval import Informativeness, Correctness, Completeness, Coherence, Relevancy, - Integration, Cohesion, Readability, Conciseness, - MechanisticUnderstanding, CausalReasoning, TemporalPrecision + Integration, Cohesion, Readability, Conciseness, ContextCoverage, MethodCoverage, + DimensionCoverage, ScopeCoverage, ScaleCoverage, MechanisticUnderstanding, + CausalReasoning, TemporalPrecision, GapIdentification, + QuantitativeEvidenceAndUncertainty, EpistemicCalibration, + StateOfTheArtAndNovelty + And to use rubrics: @@ -106,7 +182,7 @@ And to use rubrics: answer = "The synthesis answer summarizing the papers." # Instantiate a rubric, e.g. Coherence - rubric = Coherence(papers=papers, question=question, answer=answer, domain="nlp", vocabulary=VocabularyInjector(), example=ExampleInjector()) + rubric = Coherence(papers=papers, question=question, answer=answer) instruction = rubric.instruct() print(instruction) From 1e4a075ac43d3831f68330889fd477fb7c21fb1d Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Thu, 29 Jan 2026 18:41:24 +0100 Subject: [PATCH 10/13] :memo: fix documentations --- docs/source/quickstart.rst | 102 ++++------------------ docs/source/rubrics.rst | 172 +++++++++++++++++++++++++++++-------- 2 files changed, 154 insertions(+), 120 deletions(-) diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index 0c4629b..133ff53 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -1,10 +1,11 @@ Quickstart ================= -YESciEval is a library designed to evaluate the quality of synthesized scientific answers using predefined rubrics and advanced LLM-based judgment models. This guide walks you through how to evaluate answers based on **informativeness** and **mechanistic understanding** using a pretrained & a custom judge and parse LLM output into structured JSON. +YESciEval is a library designed to evaluate the quality of synthesized scientific answers using predefined rubrics and advanced LLM-based judgment models. This guide walks you through how to evaluate answers based given rubrics (i.e. **informativeness**) using a pretrained or a custom judge and parse LLM output into structured JSON. -**Example: Evaluating an Answer Using Informativeness + AskAutoJudge** +The following example shows the how to run a ``AskAutoJudge`` on ``Informativeness`` rubric: + .. 
code-block:: python @@ -46,78 +47,7 @@ YESciEval is a library designed to evaluate the quality of synthesized scientifi - Use the ``device="cuda"`` if running on GPU for better performance. - Add more rubrics such as ``Informativeness``, ``Relevancy``, etc for multi-criteria evaluation. - -Customizing Rubric Prompts with Injectors ------------------------------------------ - -Injectors allow you to augment rubric prompts with additional guidance, such as example responses or domain-specific vocabulary, to improve evaluation alignment. Each injector is rubric-specific, meaning different rubrics can receive different injected content. They are domain-dependent, so the examples and vocabulary injected are automatically selected based on the domain you specify (e.g., "nlp", "ecology"). Multiple injectors, such as examples and vocabulary, can be used together in a composable way. Available injectors are listed below: - -- **Example Injector**: Injects curated example responses for the chosen rubric and domain. -- **Vocabulary Injector**: Injects domain and rubric-specific terminology to guide model reasoning. - -**Usage Example** - -.. code-block:: python - - rubric = MechanisticUnderstanding( - papers=papers, - question=question, - answer=answer, - domain="nlp", - vocabulary=VocabularyInjector(), - example=ExampleInjector() - ) - -In this example, ``VocabularyInjector`` and ``ExampleInjector`` provide content aligned with the NLP domain for the *Mechanistic Understanding* rubric. - -**Example Injected Responses for Mechanistic Understanding** - -.. code-block:: json - - "MechanisticUnderstanding": [ - { - "rating": "1", - "rationale": "The response reports results or model performance but does not explain how the model architecture or training process leads to those outcomes." - }, - { - "rating": "4", - "rationale": "The response provides a clear mechanistic explanation of how the model works, describing the role of transformer-based architectures, the effects of pretraining and fine-tuning, and insights from ablation studies that show how specific components contribute to performance." - } - ] - -**Example Injected Vocabulary for Mechanistic Understanding in the NLP Domain** - -.. code-block:: json - - "training_terms": [ - "pretraining", "fine-tuning", "instruction tuning", "rlhf", "dpo", "lora", "qlora", "quantization", - "distillation", "curriculum", "data augmentation", "continual learning" - ] - -**Example: Evaluating an Answer Using MechanisticUnderstanding + CustomAutoJudge** - -.. code-block:: python - - from yescieval import MechanisticUnderstanding, CustomAutoJudge, ExampleInjector, VocabularyInjector - - # Step 1: Create a rubric - rubric = MechanisticUnderstanding(papers=papers, question=question, answer=answer, domain="nlp", vocabulary=VocabularyInjector(), example=ExampleInjector()) - instruction_prompt = rubric.instruct() - - # Step 2: Load the evaluation model (judge) - judge = CustomAutoJudge() - judge.from_pretrained(model_id="Qwen/Qwen3-8B", device="cpu", token="your_huggingface_token") - - # Step 3: Evaluate the answer - result = judge.judge(rubric=rubric) - print("Raw Evaluation Output:") - print(result) - - - -**Parsing Raw Output with GPTParser** - -If the model outputs unstructured or loosely structured text, you can use GPTParser to parse it into valid JSON. +**Output Parser**: If the model outputs unstructured or loosely structured text, you can use GPTParser to parse it into valid JSON. .. 
code-block:: python
@@ -132,7 +62,7 @@ If the model outputs unstructured or loosely structured text, you can use GPTPar
     print("Parsed Output:")
     print(parsed.model_dump())
 
-**Expected Output Format**
+The expected output format is:
 
 .. code-block:: json
@@ -165,16 +95,20 @@ The output schema is as a following (if you do not prefer to use ``.model_dump()
 'type': 'object'
 }
 
+
 .. hint:: Key Components
 
-    +------------------+-------------------------------------------------------+
-    | Component        | Purpose                                               |
-    +==================+=======================================================+
-    | Informativeness  | Defines rubric to evaluate relevance to source papers |
-    +------------------+-------------------------------------------------------+
-    | AskAutoJudge     | Loads and uses a judgment model to evaluate answers   |
-    +------------------+-------------------------------------------------------+
-    | GPTParser        | Parses loosely formatted text from LLMs into JSON     |
-    +------------------+-------------------------------------------------------+
+    .. list-table::
+       :header-rows: 1
+       :widths: 30 60
+
+       * - Component
+         - Purpose
+       * - **Informativeness**
+         - Defines rubric to evaluate relevance to source papers
+       * - **AskAutoJudge**
+         - Loads and uses a judgment model to evaluate answers
+       * - **GPTParser**
+         - Parses loosely formatted text from LLMs into JSON
 
 
diff --git a/docs/source/rubrics.rst b/docs/source/rubrics.rst
index 7568210..08ff448 100644
--- a/docs/source/rubrics.rst
+++ b/docs/source/rubrics.rst
@@ -2,11 +2,31 @@
 Rubrics
 ===================
 
-A total of twenty one (21) evaluation rubrics were defined as part of the YESciEval test framework.
+A total of **21** evaluation rubrics were defined as part of the YESciEval test framework within two categories, presented as follows:
 
-Linguistic & Stylistic Quality
+.. hint::
+
+
+    Here is a simple example of how to import rubrics in your code:
+
+    .. code-block:: python
+
+        from yescieval import Informativeness, Correctness, Completeness, Coherence, Relevancy,\
+        Integration, Cohesion, Readability, Conciseness, ContextCoverage, MethodCoverage,\
+        DimensionCoverage, ScopeCoverage, ScaleCoverage, MechanisticUnderstanding,\
+        CausalReasoning, TemporalPrecision, GapIdentification,\
+        QuantitativeEvidenceAndUncertainty, EpistemicCalibration,\
+        StateOfTheArtAndNovelty
+
+    The rubrics are presented as follows:
+
+
+Question Answering
 ---------------------------------
 
+Linguistic & Stylistic Quality
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 Following ``Linguistic & Stylistic Quality`` concerns grammar, clarity, and adherence to academic writing conventions.
 
 
@@ -24,7 +44,7 @@ Following ``Linguistic & Stylistic Quality`` concerns grammar, clarity, and adhe
    - Does the answer follow appropriate style and structure conventions for academic writing, particularly for readability?
 
 Logical & Structural Integrity
----------------------------------
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Following ``Logical & Structural Integrity`` focuses on the reasoning and organization of information.
 
 .. list-table::
@@ -40,10 +60,10 @@ Following ``Logical & Structural Integrity`` focuses on the reasoning and organi
    * - **6. Relevancy:**
      - Is the information in the answer relevant to the problem?
 
-Content Accuracy & Informativeness
---------------------------------
+Evidence Fidelity
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Following ``Content Accuracy & Informativeness`` ensures that the response is both correct and useful.
+Following ``Evidence Fidelity`` ensures that the response is both correct and useful. .. list-table:: @@ -59,8 +79,36 @@ Following ``Content Accuracy & Informativeness`` ensures that the response is bo * - **9. Informativeness:** - Is the answer a useful and informative reply to the problem? +Usage +~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from yescieval import Coherence + + papers = { + "Paper 1 title": "abstract of paper 1 ...", + "Paper 2 title": "abstract of paper 2 ...", + "Paper 3 title": "abstract of paper 3 ...", + "Paper 4 title": "abstract of paper 4 ...", + "Paper 5 title": "abstract of paper 5 ..." + } + question = "What are the key findings on AI in these papers?" + answer = "The synthesis answer summarizing the papers." + + # Instantiate a rubric, e.g. Coherence + rubric = Coherence(papers=papers, question=question, answer=answer) + instruction = rubric.instruct() + + print(instruction) + print(rubric.name) + + +Deep Research +------------------- + Research Depth Assessment ---------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Following ``Research Depth Assessment`` quantifies the mechanistic and analytical sophistication of synthesis outputs. @@ -80,7 +128,7 @@ Following ``Research Depth Assessment`` quantifies the mechanistic and analytica Research Breadth Assessment ---------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Following ``Research Breadth Assessment`` evaluates the diversity of evidence across dimensions, scope, and methodological contexts. @@ -103,7 +151,7 @@ Following ``Research Breadth Assessment`` evaluates the diversity of evidence ac - Does the answer distribute attention across multiple distinct scales relevant to the research question? Scientific Rigor Assessment ---------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Following ``Scientific Rigor Assessment`` assesses the evidentiary and methodological integrity of the synthesis. @@ -120,7 +168,7 @@ Following ``Scientific Rigor Assessment`` assesses the evidentiary and methodolo - Does the answer clearly align claim strength with evidential support by marking uncertainty, assumptions, and limitations where relevant? Innovation Capacity Assessment ---------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Following ``Innovation Capacity Assessment`` evaluates the novelty of the synthesis. @@ -136,7 +184,7 @@ Following ``Innovation Capacity Assessment`` evaluates the novelty of the synthe Research Gap Assessment ---------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Following ``Research Gap Assessment`` detects explicit acknowledgment of unanswered questions or understudied areas in the synthesis. @@ -151,39 +199,91 @@ Following ``Research Gap Assessment`` detects explicit acknowledgment of unanswe - Does the answer point out unanswered questions or understudied areas, using terms like “research gap” or “understudied”? -Usage Example --------------------------- +Usage +~~~~~~~~~~~~~~ -Here is a simple example of how to import rubrics in your code: +Injectors allow you to augment rubric prompts with additional guidance, such as example responses or domain-specific vocabulary, to improve evaluation alignment. Each injector is rubric-specific, meaning different rubrics can receive different injected content. They are domain-dependent, so the examples and vocabulary injected are automatically selected based on the domain you specify (e.g., "nlp", "ecology"). 
Multiple injectors, such as examples and vocabulary, can be used together in a composable way. Available injectors are listed below:
+
+- **Example Injector**: Injects curated example responses for the chosen rubric and domain.
+- **Vocabulary Injector**: Injects domain and rubric-specific terminology to guide model reasoning.
+
+Here is how to define a deep research rubric with injectors:
 
 .. code-block:: python
 
-    from yescieval import Informativeness, Correctness, Completeness, Coherence, Relevancy,
-        Integration, Cohesion, Readability, Conciseness, ContextCoverage, MethodCoverage,
-        DimensionCoverage, ScopeCoverage, ScaleCoverage, MechanisticUnderstanding,
-        CausalReasoning, TemporalPrecision, GapIdentification,
-        QuantitativeEvidenceAndUncertainty, EpistemicCalibration,
-        StateOfTheArtAndNovelty
+    from yescieval import MechanisticUnderstanding, VocabularyInjector, ExampleInjector
+
+    rubric = MechanisticUnderstanding(
+        papers=papers,
+        question=question,
+        answer=answer,
+        domain="nlp",
+        vocabulary=VocabularyInjector(),
+        example=ExampleInjector()
+    )
+
+In this example, ``VocabularyInjector`` and ``ExampleInjector`` provide content aligned with the NLP domain for the *Mechanistic Understanding* rubric.
 
+.. tab:: Injected Responses
 
-And to use rubrics:
+    ::
+
+        "MechanisticUnderstanding": [
+            {
+                "rating": "1",
+                "rationale": "The response reports results or model performance but does not explain how the model architecture or training process leads to those outcomes."
+            },
+            {
+                "rating": "4",
+                "rationale": "The response provides a clear mechanistic explanation of how the model works, describing the role of transformer-based architectures, the effects of pretraining and fine-tuning, and insights from ablation studies that show how specific components contribute to performance."
+            }
+        ]
+
+.. tab:: Injected Vocabulary
+
+    ::
+
+        "training_terms": [
+            "pretraining", "fine-tuning", "instruction tuning", "rlhf", "dpo", "lora", "qlora", "quantization",
+            "distillation", "curriculum", "data augmentation", "continual learning"
+        ]
+
+Here is a complete example of how an evaluation can be done:
 
 .. code-block:: python
 
-    # Example inputs
-    papers = {
-        "Paper 1 title": "abstract of paper 1 ...",
-        "Paper 2 title": "abstract of paper 2 ...",
-        "Paper 3 title": "abstract of paper 3 ...",
-        "Paper 4 title": "abstract of paper 4 ...",
-        "Paper 5 title": "abstract of paper 5 ..."
-    }
-    question = "What are the key findings on AI in these papers?"
-    answer = "The synthesis answer summarizing the papers."
+    from yescieval import MechanisticUnderstanding, CustomAutoJudge, ExampleInjector, VocabularyInjector
 
-    # Instantiate a rubric, e.g. Coherence
-    rubric = Coherence(papers=papers, question=question, answer=answer)
-    instruction = rubric.instruct()
+    # Step 1: Create a rubric
+    rubric = MechanisticUnderstanding(papers=papers,
+                                      question=question,
+                                      answer=answer,
+                                      domain="nlp",
+                                      vocabulary=VocabularyInjector(),
+                                      example=ExampleInjector())
+    instruction_prompt = rubric.instruct()
 
-    print(instruction)
-    print(rubric.name)
+    # Step 2: Load the evaluation model (judge)
+    judge = CustomAutoJudge()
+    judge.from_pretrained(model_id="Qwen/Qwen3-8B", device="cpu", token="your_huggingface_token")
+
+    # Step 3: Evaluate the answer
+    result = judge.judge(rubric=rubric)
+    print("Raw Evaluation Output:")
+    print(result)
+
+
+.. hint::
+
+    The following domains are currently incorporated in YESciEval for the injectors; note that using injectors is optional.
+
+    .. 
list-table:: + :header-rows: 1 + :widths: 50 30 + + * - Domain + - ID + * - **Natural Language Processing** + - ``nlp`` + * - **Ecology** + - ``ecology`` \ No newline at end of file From 107df6afb4d7d61f13c4bfdc9f222ff017e7e98f Mon Sep 17 00:00:00 2001 From: Hamed Babaei Giglou Date: Thu, 29 Jan 2026 18:49:10 +0100 Subject: [PATCH 11/13] :memo: fix documentations --- README.md | 11 +++++------ docs/source/rubrics.rst | 9 ++++----- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index f00a219..7d5b199 100644 --- a/README.md +++ b/README.md @@ -94,12 +94,11 @@ Judges within YESciEval are defined as follows: A total of **23** evaluation rubrics were defined as part of the YESciEval test framework and can be used via ``yescieval``. Following simple example shows how to import rubrics in your code: ```python -from yescieval import Informativeness, Correctness, Completeness, Coherence, Relevancy, \ - Integration, Cohesion, Readability, Conciseness, GeographicCoverage, \ - InterventionDiversity, BiodiversityDimensions, EcosystemServices, SpatialScale, \ - MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification, \ - StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment, \ - SpeculativeStatements, NoveltyIndicators +from yescieval import Informativeness, Correctness, Completeness, Coherence, Relevancy,\ + Integration, Cohesion, Readability, Conciseness,\ + MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification,\ + StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment,\ + SpeculativeStatements, NoveltyIndicators ``` A complete list of rubrics are available at YESciEval [📚 Rubrics](https://yescieval.readthedocs.io/rubrics.html) page. diff --git a/docs/source/rubrics.rst b/docs/source/rubrics.rst index 08ff448..e730359 100644 --- a/docs/source/rubrics.rst +++ b/docs/source/rubrics.rst @@ -12,11 +12,10 @@ A total of **21** evaluation rubrics were defined as part of the YESciEval test .. 
code-block:: python
 
     from yescieval import Informativeness, Correctness, Completeness, Coherence, Relevancy,\
-        Integration, Cohesion, Readability, Conciseness, ContextCoverage, MethodCoverage,\
-        DimensionCoverage, ScopeCoverage, ScaleCoverage, MechanisticUnderstanding,\
-        CausalReasoning, TemporalPrecision, GapIdentification,\
-        QuantitativeEvidenceAndUncertainty, EpistemicCalibration,\
-        StateOfTheArtAndNovelty
+        Integration, Cohesion, Readability, Conciseness,\
+        MechanisticUnderstanding, CausalReasoning, TemporalPrecision, GapIdentification,\
+        StatisticalSophistication, CitationPractices, UncertaintyAcknowledgment,\
+        SpeculativeStatements, NoveltyIndicators
 
     The rubrics are presented as follows:
 
From d0286e2302c72d8dbec7e31c899868634837633d Mon Sep 17 00:00:00 2001
From: Hamed Babaei Giglou
Date: Thu, 29 Jan 2026 18:50:19 +0100
Subject: [PATCH 12/13] :recycle: convert `informativeness` -> `fidelity` (#11)

---
 yescieval/rubric/__init__.py                         | 2 +-
 yescieval/rubric/{informativeness.py => fidelity.py} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename yescieval/rubric/{informativeness.py => fidelity.py} (100%)

diff --git a/yescieval/rubric/__init__.py b/yescieval/rubric/__init__.py
index 2e6435f..72ce6d4 100644
--- a/yescieval/rubric/__init__.py
+++ b/yescieval/rubric/__init__.py
@@ -1,4 +1,4 @@
-from .informativeness import Informativeness, Correctness, Completeness
+from .fidelity import Informativeness, Correctness, Completeness
 from .structural import Coherence, Relevancy, Integration
 from .stylistic import Cohesion, Readability, Conciseness
 from .depth import MechanisticUnderstanding, CausalReasoning, TemporalPrecision
diff --git a/yescieval/rubric/informativeness.py b/yescieval/rubric/fidelity.py
similarity index 100%
rename from yescieval/rubric/informativeness.py
rename to yescieval/rubric/fidelity.py

From 806a00467b777f5f58ad91320f8ab5dd1e784981 Mon Sep 17 00:00:00 2001
From: Hamed Babaei Giglou
Date: Thu, 29 Jan 2026 18:57:49 +0100
Subject: [PATCH 13/13] :bookmark: v0.5.0

---
 CHANGELOG.md      | 5 +++++
 yescieval/VERSION | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1bb7366..d0a0329 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 ## Changelog
 
+### v0.5.0 (January 29, 2026)
+- Added Example and Vocab Injectors for Deep Research Rubric (PR #13)
+- Updated documentation (PR #13)
+- Minor refactoring (#11)
+
 ### v0.4.0 (January 13, 2026)
 - Add a GPT custom jude (PR #5)
 - Update documentation
diff --git a/yescieval/VERSION b/yescieval/VERSION
index 60a2d3e..79a2734 100644
--- a/yescieval/VERSION
+++ b/yescieval/VERSION
@@ -1 +1 @@
-0.4.0
\ No newline at end of file
+0.5.0
\ No newline at end of file