From a379a2b62b59cf72fc40ce9e67e3c66688766f3c Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 6 Nov 2025 13:48:31 +0100 Subject: [PATCH 1/3] profbench --- src/lighteval/tasks/tasks/profbench.py | 137 +++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 src/lighteval/tasks/tasks/profbench.py diff --git a/src/lighteval/tasks/tasks/profbench.py b/src/lighteval/tasks/tasks/profbench.py new file mode 100644 index 000000000..8bb1c0df3 --- /dev/null +++ b/src/lighteval/tasks/tasks/profbench.py @@ -0,0 +1,137 @@ +from inspect_ai import Task, task +from inspect_ai.dataset import Sample, hf_dataset +from inspect_ai.model import get_model +from inspect_ai.scorer import Score, accuracy, scorer +from inspect_ai.solver import generate + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +QUESTION_PROMPT_YES_NO = """Response: + +{response} + +Evaluate whether the response above satisfies this criterion: {criterion_description}. Only answer Yes or No.""" + +weight_to_scale = {"Critical": 4, "Major": 3, "Minor": 2, "Additional": 1} + + +async def evaluate_criterion_with_judge(response, criterion_description, domain, criterion_type): + """Evaluate a single criterion using LLM judge.""" + prompt = QUESTION_PROMPT_YES_NO.format(response=response, criterion_description=criterion_description) + + model = get_model() + result = await model.generate(prompt) + + judge_rating = result.completion.strip() + return judge_rating.startswith("Yes"), judge_rating + + +@scorer(metrics=[accuracy()]) +def profbench_weighted_scorer(): + """Scorer that evaluates all criteria and computes weighted ProfBench scores.""" + + async def score(state, target): + rubrics = state.metadata.get("rubrics", []) + response = state.output.completion + task_id = state.metadata.get("task_id", "") + domain = state.metadata.get("domain", "") + + # Evaluate each criterion + criterion_results = [] + total_weight = 0 + achieved_weight = 0 + + for rubric in rubrics: + criterion_description = rubric["criterion_description"] + criterion_weight = rubric["criterion_weight"] + criterion_type = rubric["criterion_type"] + + weight_scale = weight_to_scale.get(criterion_weight, 1) + total_weight += weight_scale + + # Evaluate criterion + fulfilled, judge_rating = await evaluate_criterion_with_judge( + response, criterion_description, domain, criterion_type + ) + + if fulfilled: + achieved_weight += weight_scale + + criterion_results.append( + { + "criterion_description": criterion_description, + "criterion_weight": criterion_weight, + "criterion_type": criterion_type, + "fulfilled": fulfilled, + "judge_rating": judge_rating, + } + ) + + # Calculate score for this task + task_score = (achieved_weight / total_weight) if total_weight > 0 else 0.0 + + return Score( + value=task_score, + metadata={ + "task_id": task_id, + "domain": domain, + "task_score": task_score, + "achieved_weight": achieved_weight, + "total_weight": total_weight, + "criterion_results": criterion_results, + "response": response, + }, + ) + + return score + + +def record_to_sample(record): + """Convert ProfBench dataset record to Inspect Sample.""" + return Sample( + input=record["prompt"], + target="", # No target for generation tasks + metadata={ + "task_id": record["task_id"], + "domain": record["domain"], + "rubrics": record["rubrics"], + "filepaths": record.get("filepaths", []), + }, + ) + + +@task +def _profbench(): + """ + ProfBench report generation task. 
+ """ + # Load dataset + dataset_obj = hf_dataset( + path="nvidia/ProfBench", + split="test", + sample_fields=record_to_sample, + ) + + return Task( + dataset=dataset_obj, + solver=[generate()], + scorer=profbench_weighted_scorer(), + ) + + +profbench = LightevalTaskConfig( + name="profbench", + prompt_function=lambda line, task_name: line["prompt"], + hf_repo="nvidia/ProfBench", + hf_subset="default", + evaluation_splits=["test"], + metrics=[Metrics.exact_match], + version=0, + sample_fields=record_to_sample, + solver=[generate(cache=True)], + scorer=profbench_weighted_scorer(), +) + +TASKS_TABLE = [profbench] From 08abb06b3e3623f3b6f388a29f155347ee9b513a Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 6 Nov 2025 13:52:04 +0100 Subject: [PATCH 2/3] profbench --- src/lighteval/tasks/tasks/profbench.py | 47 ++++++++++++++------------ 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/src/lighteval/tasks/tasks/profbench.py b/src/lighteval/tasks/tasks/profbench.py index 8bb1c0df3..8221a6a95 100644 --- a/src/lighteval/tasks/tasks/profbench.py +++ b/src/lighteval/tasks/tasks/profbench.py @@ -1,5 +1,29 @@ -from inspect_ai import Task, task -from inspect_ai.dataset import Sample, hf_dataset +""" +name: +ProfBench + +dataset: +nvidia/ProfBench + +abstract: +More than 3000 rubric criteria across 40 human-annotated tasks presenting +reports addressing professional tasks across PhD STEM (Chemistry, Physics) and +Professional Services (Financial Services, Management Consulting) domains. + +languages: +english + +tags: +reasoning, professional-reports + +paper: +https://arxiv.org/abs/2510.18941 + +starred: +true +""" + +from inspect_ai.dataset import Sample from inspect_ai.model import get_model from inspect_ai.scorer import Score, accuracy, scorer from inspect_ai.solver import generate @@ -102,25 +126,6 @@ def record_to_sample(record): ) -@task -def _profbench(): - """ - ProfBench report generation task. - """ - # Load dataset - dataset_obj = hf_dataset( - path="nvidia/ProfBench", - split="test", - sample_fields=record_to_sample, - ) - - return Task( - dataset=dataset_obj, - solver=[generate()], - scorer=profbench_weighted_scorer(), - ) - - profbench = LightevalTaskConfig( name="profbench", prompt_function=lambda line, task_name: line["prompt"], From b640ca5bfe556be0fe654f8482792751b89a2866 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 6 Nov 2025 14:31:43 +0100 Subject: [PATCH 3/3] profbench --- src/lighteval/tasks/tasks/profbench.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/lighteval/tasks/tasks/profbench.py b/src/lighteval/tasks/tasks/profbench.py index 8221a6a95..6cfa46b5c 100644 --- a/src/lighteval/tasks/tasks/profbench.py +++ b/src/lighteval/tasks/tasks/profbench.py @@ -45,7 +45,8 @@ async def evaluate_criterion_with_judge(response, criterion_description, domain, """Evaluate a single criterion using LLM judge.""" prompt = QUESTION_PROMPT_YES_NO.format(response=response, criterion_description=criterion_description) - model = get_model() + # The original code uses GPT-oss-120b as the judge model. 
+ model = get_model("hf-inference-providers/openai/gpt-oss-120b") result = await model.generate(prompt) judge_rating = result.completion.strip() @@ -126,9 +127,14 @@ def record_to_sample(record): ) +def profbench_prompt_function(line, task_name): + """Prompt function for ProfBench.""" + raise NotImplementedError("ProfBench not implemented yet for backends other than inspect-ai.") + + profbench = LightevalTaskConfig( name="profbench", - prompt_function=lambda line, task_name: line["prompt"], + prompt_function=profbench_prompt_function, hf_repo="nvidia/ProfBench", hf_subset="default", evaluation_splits=["test"],
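
Note on the scoring logic introduced above: the scorer reduces each generated report to a single weighted rubric score. Every criterion label is mapped to a numeric scale (Critical=4, Major=3, Minor=2, Additional=1), the judge's Yes/No verdicts decide which weights count, and the task score is achieved weight divided by total weight. The standalone sketch below mirrors that computation outside of inspect-ai for readers skimming the patch; the (label, fulfilled) pairs are hypothetical examples, not records from nvidia/ProfBench.

    # Minimal sketch of the weighted scoring used by profbench_weighted_scorer.
    # The criterion entries below are made-up examples for illustration only.
    weight_to_scale = {"Critical": 4, "Major": 3, "Minor": 2, "Additional": 1}

    def weighted_task_score(criterion_results):
        """criterion_results: list of (criterion_weight_label, fulfilled) pairs."""
        total = sum(weight_to_scale.get(label, 1) for label, _ in criterion_results)
        achieved = sum(
            weight_to_scale.get(label, 1)
            for label, fulfilled in criterion_results
            if fulfilled
        )
        return achieved / total if total > 0 else 0.0

    # One Critical criterion met, one Minor criterion missed -> 4 / 6 ~= 0.667
    print(weighted_task_score([("Critical", True), ("Minor", False)]))

In the task itself, this per-report score is returned as the Score value and aggregated across samples by the accuracy() metric declared on the scorer.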