From a379a2b62b59cf72fc40ce9e67e3c66688766f3c Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 6 Nov 2025 13:48:31 +0100 Subject: [PATCH 1/3] profbench --- src/lighteval/tasks/tasks/profbench.py | 137 +++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 src/lighteval/tasks/tasks/profbench.py diff --git a/src/lighteval/tasks/tasks/profbench.py b/src/lighteval/tasks/tasks/profbench.py new file mode 100644 index 000000000..8bb1c0df3 --- /dev/null +++ b/src/lighteval/tasks/tasks/profbench.py @@ -0,0 +1,137 @@ +from inspect_ai import Task, task +from inspect_ai.dataset import Sample, hf_dataset +from inspect_ai.model import get_model +from inspect_ai.scorer import Score, accuracy, scorer +from inspect_ai.solver import generate + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +QUESTION_PROMPT_YES_NO = """Response: + +{response} + +Evaluate whether the response above satisfies this criterion: {criterion_description}. Only answer Yes or No.""" + +weight_to_scale = {"Critical": 4, "Major": 3, "Minor": 2, "Additional": 1} + + +async def evaluate_criterion_with_judge(response, criterion_description, domain, criterion_type): + """Evaluate a single criterion using LLM judge.""" + prompt = QUESTION_PROMPT_YES_NO.format(response=response, criterion_description=criterion_description) + + model = get_model() + result = await model.generate(prompt) + + judge_rating = result.completion.strip() + return judge_rating.startswith("Yes"), judge_rating + + +@scorer(metrics=[accuracy()]) +def profbench_weighted_scorer(): + """Scorer that evaluates all criteria and computes weighted ProfBench scores.""" + + async def score(state, target): + rubrics = state.metadata.get("rubrics", []) + response = state.output.completion + task_id = state.metadata.get("task_id", "") + domain = state.metadata.get("domain", "") + + # Evaluate each criterion + criterion_results = [] + total_weight = 0 + achieved_weight = 0 + + for rubric in rubrics: + criterion_description = rubric["criterion_description"] + criterion_weight = rubric["criterion_weight"] + criterion_type = rubric["criterion_type"] + + weight_scale = weight_to_scale.get(criterion_weight, 1) + total_weight += weight_scale + + # Evaluate criterion + fulfilled, judge_rating = await evaluate_criterion_with_judge( + response, criterion_description, domain, criterion_type + ) + + if fulfilled: + achieved_weight += weight_scale + + criterion_results.append( + { + "criterion_description": criterion_description, + "criterion_weight": criterion_weight, + "criterion_type": criterion_type, + "fulfilled": fulfilled, + "judge_rating": judge_rating, + } + ) + + # Calculate score for this task + task_score = (achieved_weight / total_weight) if total_weight > 0 else 0.0 + + return Score( + value=task_score, + metadata={ + "task_id": task_id, + "domain": domain, + "task_score": task_score, + "achieved_weight": achieved_weight, + "total_weight": total_weight, + "criterion_results": criterion_results, + "response": response, + }, + ) + + return score + + +def record_to_sample(record): + """Convert ProfBench dataset record to Inspect Sample.""" + return Sample( + input=record["prompt"], + target="", # No target for generation tasks + metadata={ + "task_id": record["task_id"], + "domain": record["domain"], + "rubrics": record["rubrics"], + "filepaths": record.get("filepaths", []), + }, + ) + + +@task +def _profbench(): + """ + ProfBench report generation task. 
+ """ + # Load dataset + dataset_obj = hf_dataset( + path="nvidia/ProfBench", + split="test", + sample_fields=record_to_sample, + ) + + return Task( + dataset=dataset_obj, + solver=[generate()], + scorer=profbench_weighted_scorer(), + ) + + +profbench = LightevalTaskConfig( + name="profbench", + prompt_function=lambda line, task_name: line["prompt"], + hf_repo="nvidia/ProfBench", + hf_subset="default", + evaluation_splits=["test"], + metrics=[Metrics.exact_match], + version=0, + sample_fields=record_to_sample, + solver=[generate(cache=True)], + scorer=profbench_weighted_scorer(), +) + +TASKS_TABLE = [profbench] From 08abb06b3e3623f3b6f388a29f155347ee9b513a Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 6 Nov 2025 13:52:04 +0100 Subject: [PATCH 2/3] profbench --- src/lighteval/tasks/tasks/profbench.py | 47 ++++++++++++++------------ 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/src/lighteval/tasks/tasks/profbench.py b/src/lighteval/tasks/tasks/profbench.py index 8bb1c0df3..8221a6a95 100644 --- a/src/lighteval/tasks/tasks/profbench.py +++ b/src/lighteval/tasks/tasks/profbench.py @@ -1,5 +1,29 @@ -from inspect_ai import Task, task -from inspect_ai.dataset import Sample, hf_dataset +""" +name: +ProfBench + +dataset: +nvidia/ProfBench + +abstract: +More than 3000 rubric criteria across 40 human-annotated tasks presenting +reports addressing professional tasks across PhD STEM (Chemistry, Physics) and +Professional Services (Financial Services, Management Consulting) domains. + +languages: +english + +tags: +reasoning, professional-reports + +paper: +https://arxiv.org/abs/2510.18941 + +starred: +true +""" + +from inspect_ai.dataset import Sample from inspect_ai.model import get_model from inspect_ai.scorer import Score, accuracy, scorer from inspect_ai.solver import generate @@ -102,25 +126,6 @@ def record_to_sample(record): ) -@task -def _profbench(): - """ - ProfBench report generation task. - """ - # Load dataset - dataset_obj = hf_dataset( - path="nvidia/ProfBench", - split="test", - sample_fields=record_to_sample, - ) - - return Task( - dataset=dataset_obj, - solver=[generate()], - scorer=profbench_weighted_scorer(), - ) - - profbench = LightevalTaskConfig( name="profbench", prompt_function=lambda line, task_name: line["prompt"], From b640ca5bfe556be0fe654f8482792751b89a2866 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 6 Nov 2025 14:31:43 +0100 Subject: [PATCH 3/3] profbench --- src/lighteval/tasks/tasks/profbench.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/lighteval/tasks/tasks/profbench.py b/src/lighteval/tasks/tasks/profbench.py index 8221a6a95..6cfa46b5c 100644 --- a/src/lighteval/tasks/tasks/profbench.py +++ b/src/lighteval/tasks/tasks/profbench.py @@ -45,7 +45,8 @@ async def evaluate_criterion_with_judge(response, criterion_description, domain, """Evaluate a single criterion using LLM judge.""" prompt = QUESTION_PROMPT_YES_NO.format(response=response, criterion_description=criterion_description) - model = get_model() + # The original code uses GPT-oss-120b as the judge model. 
+ model = get_model("hf-inference-providers/openai/gpt-oss-120b") result = await model.generate(prompt) judge_rating = result.completion.strip() @@ -126,9 +127,14 @@ def record_to_sample(record): ) +def profbench_prompt_function(line, task_name): + """Prompt function for ProfBench.""" + raise NotImplementedError("ProfBench not implemented yet for backends other than inspect-ai.") + + profbench = LightevalTaskConfig( name="profbench", - prompt_function=lambda line, task_name: line["prompt"], + prompt_function=profbench_prompt_function, hf_repo="nvidia/ProfBench", hf_subset="default", evaluation_splits=["test"],
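
Note on the scoring logic introduced above: the scorer reduces each generated report to a single weighted rubric score. Every criterion label is mapped to a numeric scale (Critical=4, Major=3, Minor=2, Additional=1), the judge's Yes/No verdicts decide which weights count, and the task score is achieved weight divided by total weight. The standalone sketch below mirrors that computation outside of inspect-ai for readers skimming the patch; the (label, fulfilled) pairs are hypothetical examples, not records from nvidia/ProfBench.

    # Minimal sketch of the weighted scoring used by profbench_weighted_scorer.
    # The criterion entries below are made-up examples for illustration only.
    weight_to_scale = {"Critical": 4, "Major": 3, "Minor": 2, "Additional": 1}

    def weighted_task_score(criterion_results):
        """criterion_results: list of (criterion_weight_label, fulfilled) pairs."""
        total = sum(weight_to_scale.get(label, 1) for label, _ in criterion_results)
        achieved = sum(
            weight_to_scale.get(label, 1)
            for label, fulfilled in criterion_results
            if fulfilled
        )
        return achieved / total if total > 0 else 0.0

    # One Critical criterion met, one Minor criterion missed -> 4 / 6 ~= 0.667
    print(weighted_task_score([("Critical", True), ("Minor", False)]))

In the task itself, this per-report score is returned as the Score value and aggregated across samples by the accuracy() metric declared on the scorer.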