Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions .ee-bench/codegen/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# syntax=docker/dockerfile:1
###############################################
# BASE IMAGE
###############################################
FROM ubuntu:20.04

###############################################
# WORKING DIRECTORY
###############################################
# WORKDIR creates the directory if it does not exist; no separate mkdir needed.
WORKDIR /app

###############################################
# SYSTEM DEPENDENCIES
###############################################
# NOTE: DEBIAN_FRONTEND is deliberately persisted as ENV (not build-arg only)
# because the eval harness re-exports it from dockerfile_env_vars (see run.sh).
ENV DEBIAN_FRONTEND=noninteractive
# ca-certificates is listed explicitly so HTTPS (curl / git clone) keeps
# working with --no-install-recommends; packages sorted for diffability.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    curl \
    git \
    python-is-python3 \
    python3 \
    python3-pip \
    python3-setuptools \
    && rm -rf /var/lib/apt/lists/*

# Install Node.js 18 (the NodeSource setup script runs apt-get update itself);
# clean the apt lists in the same layer so they do not bloat the image.
RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \
    apt-get install -y nodejs && \
    rm -rf /var/lib/apt/lists/*

# Install Yarn
RUN npm install -g yarn

# CRITICAL: Verify pip is available after Python installation (needed for parsing.py dependencies)
RUN python -m pip --version && echo "✓ pip is available" || (echo "✗ pip not found" && exit 1)

###############################################
# REPO SETUP
###############################################
# Clone and pin to a fixed commit in a single layer so the checkout can never
# be separated from the clone by layer caching.
RUN git clone https://github.com/dpaia/webclients.git . && \
    git checkout 7366a9584597e6c06b10dee477e043cded019649

###############################################
# ENTRYPOINT / CMD
###############################################

# Write preprocess and build scripts
ENV PYTEST_ADDOPTS="--tb=short -v --continue-on-collection-errors --reruns=3"
ENV UV_HTTP_TIMEOUT=60

RUN cat <<'EOFPREP' > /preprocess.sh
#!/bin/bash

cd /app

# Reset the working tree to the evaluation base commit and drop all
# untracked/ignored files so the build starts from a clean state.
git reset --hard 9b35b414f77c6165550550fdda8b25bbc74aac7b
git clean -fdx
git checkout 9b35b414f77c6165550550fdda8b25bbc74aac7b

cd /

EOFPREP
RUN chmod +x /preprocess.sh
RUN /preprocess.sh

# The build script intentionally tolerates pip/postinstall failures (|| true)
# so that a partially working toolchain still produces a usable image.
RUN cat <<'EOFBUILD' > /build.sh
#!/bin/sh
pip install setuptools || true
sleep 3
pip install pytest-rerunfailures
export PYTEST_ADDOPTS="--tb=short -v --continue-on-collection-errors --reruns=3"

cd /app
set -e

python -m pip --version && echo "✓ pip is available" || (echo "✗ pip not found" && exit 1)

yarn install --no-frozen-lockfile

export NODE_OPTIONS="--max-old-space-size=4096"

echo "================= 0909 BUILD START 0909 ================="
yarn workspaces foreach -A run postinstall || true
echo "================= 0909 BUILD END 0909 ================="

EOFBUILD
RUN chmod +x /build.sh
RUN /build.sh

LABEL ee-bench.type="codegen"
LABEL ee-bench.version="1.0"
# Strip the benchmark scaffolding out of the final image; ignore if absent.
RUN rm -rf /app/.ee-bench/ 2>/dev/null || true
102 changes: 102 additions & 0 deletions .ee-bench/codegen/eval/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/env bash
# Evaluation driver: resets the repo to the base commit, applies the test
# patch and the candidate submission patch, runs the SWE-bench Pro test
# script, and emits an EE-bench JSON v2.0 result document on stdout.
set -euo pipefail

PROJECT_ROOT="${EE_BENCH_PROJECT_ROOT:-/app}"
EVAL_DIR="/ee-bench/eval"
SUBMISSION_DIR="/ee-bench/submission"

# --- Environment from Dockerfile (rendered from dockerfile_env_vars) ---
export DEBIAN_FRONTEND="noninteractive"
export PYTEST_ADDOPTS="--tb=short -v --continue-on-collection-errors --reruns=3"
export UV_HTTP_TIMEOUT="60"

# --- Reset to base commit ---
# 2>/dev/null only silences git's stderr; under `set -e` a failing git
# command here still aborts the whole script.
cd "$PROJECT_ROOT"
git reset --hard "9b35b414f77c6165550550fdda8b25bbc74aac7b" 2>/dev/null
git checkout "9b35b414f77c6165550550fdda8b25bbc74aac7b" 2>/dev/null
git clean -fd 2>/dev/null

# --- Fetch commits referenced by before_repo_set_cmd ---
# Best-effort (|| true): the commit may already be present from the image build.
git fetch origin 1501eb765873b2884b6f1944fd242ecfc9d6b103 2>/dev/null || true

# --- before_repo_set_cmd (from HF metadata, may be empty) ---
# Re-reset to the base commit, then pull the reference test file from the
# fix commit into the working tree.
git reset --hard 9b35b414f77c6165550550fdda8b25bbc74aac7b
git clean -fd
git checkout 9b35b414f77c6165550550fdda8b25bbc74aac7b
git checkout 1501eb765873b2884b6f1944fd242ecfc9d6b103 -- packages/components/components/smartBanner/SmartBanner.test.tsx

# --- Apply evaluation data (test patch) ---
# NOTE(review): `|| true` means a patch that fails to apply is silently
# ignored, yet the JSON emitted below always reports patch_applied as pass —
# confirm this is the intended harness behavior.
if [ -f "$EVAL_DIR/test_patch.diff" ]; then
git apply -v "$EVAL_DIR/test_patch.diff" 2>/dev/null || true
fi

# --- Apply candidate submission ---
if [ -f "$SUBMISSION_DIR/patch.diff" ]; then
git apply -v "$SUBMISSION_DIR/patch.diff" 2>/dev/null || true
fi

# --- Run tests via SWE-bench Pro run script ---
# The argument is the comma-separated list of test files to run; `|| true`
# keeps the driver alive so results can still be parsed after test failures.
bash "$EVAL_DIR/scripts/run_script.sh" "components/smartBanner/SmartBanner.test.ts,packages/components/components/smartBanner/SmartBanner.test.tsx" \
> /tmp/stdout.log 2> /tmp/stderr.log || true


# --- Parse results ---
# parser.py reads both logs and writes structured results to /tmp/output.json.
python3 "$EVAL_DIR/scripts/parser.py" /tmp/stdout.log /tmp/stderr.log /tmp/output.json

# --- Convert parser output to EE-bench JSON v2.0 format ---
# The inline Python below is one double-quoted shell string; it contains no
# `$`, so no shell expansion happens inside it.
python3 -c "
import json, sys, datetime
with open('/tmp/output.json') as f:
data = json.load(f)
stdout = open('/tmp/stdout.log').read()
stderr = open('/tmp/stderr.log').read()

passed = [t for t in data.get('tests', []) if t['status'] == 'PASSED']
failed = [t for t in data.get('tests', []) if t['status'] in ('FAILED', 'ERROR')]
skipped = [t for t in data.get('tests', []) if t['status'] == 'SKIPPED']

summary = {
'total': len(data.get('tests', [])),
'passed': len(passed),
'failed': len(failed),
'errors': 0,
'skipped': len(skipped),
}
passed_tests = [{'name': t['name']} for t in passed]
failed_tests = [{'name': t['name']} for t in failed]

result = {
'schema_version': '2.0',
'command': 'run',
'status': 'success',
'timestamp': datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
'criteria': [
{
'criterion': 'patch_applied',
'status': 'pass',
},
{
'criterion': 'compilation',
'status': 'pass',
},
{
'criterion': 'tests',
'status': 'pass' if not failed else 'fail',
'summary': summary,
'passed_tests': passed_tests,
'failed_tests': failed_tests,
},
],
'stdout': stdout,
'stderr': stderr,
# Deprecated v1.0 fields for backward compat
'patch_applied': True,
'compile_success': True,
'summary': summary,
'passed_tests': passed_tests,
'failed_tests': failed_tests,
}
print(json.dumps(result))
"
176 changes: 176 additions & 0 deletions .ee-bench/codegen/eval/scripts/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
"""
Test Results Parser

This script parses test execution outputs to extract structured test results.

Input:
- stdout_file: Path to the file containing standard output from test execution
- stderr_file: Path to the file containing standard error from test execution

Output:
- JSON file containing parsed test results with structure:
{
"tests": [
{
"name": "test_name",
"status": "PASSED|FAILED|SKIPPED|ERROR"
},
...
]
}
"""

import dataclasses
import json
import sys
import re
from enum import Enum
from pathlib import Path
from typing import List

class TestStatus(Enum):
    """Possible outcomes of a single parsed test.

    Values are explicit integers (rather than ``auto()``) so the ordering
    and any serialized values stay stable.
    """

    PASSED = 1   # Test ran and succeeded (✓ line in the Jest report).
    FAILED = 2   # Test ran and failed (✗/× line in the Jest report).
    SKIPPED = 3  # Test was skipped/todo (○ line in the Jest report).
    ERROR = 4    # Not produced by parse_test_output in this file; reserved.


@dataclasses.dataclass
class TestResult:
    """A single parsed test outcome (name plus status)."""

    # Fully-qualified test name, built by parse_test_output as
    # "<test file> | [<describe block> ]<test title>".
    name: str
    # Outcome of this test; see TestStatus.
    status: TestStatus



def parse_test_output(stdout_content: str, stderr_content: str) -> List[TestResult]:
    """
    Parse Jest test output content and extract test results.

    Jest writes its per-test report to stderr, so only ``stderr_content`` is
    scanned; ``stdout_content`` is accepted for interface compatibility.

    Jest output format includes:
    - PASS/FAIL indicators for test files
    - Individual test results prefixed with a pass (✓), fail (✕/✗/×) or
      skip (○) symbol, optionally suffixed with a duration "(N ms)"
    - Test names in nested describe/it blocks

    Args:
        stdout_content: Captured standard output (currently unused).
        stderr_content: Captured standard error containing the Jest report.

    Returns:
        TestResult entries named "<file> | [<describe> ]<title>".
    """
    results = []

    # A test-file header line, e.g. "PASS packages/foo/Bar.test.tsx".
    file_pattern = r'(PASS|FAIL)\s+(.+?\.(?:test|spec)\.[jt]sx?)'

    # An individual test-result line. Modern Jest renders failures as
    # ✕ (U+2715); ✗ and × are also accepted for other reporters/fonts.
    test_pattern = r'^\s*[✓✕✗×]\s+(.+?)(?:\s+\(\d+\s*ms\))?$'

    # A skipped/todo test line.
    skip_pattern = r'^\s*○\s+(.+?)(?:\s+\(\d+\s*ms\))?$'

    current_file = None
    current_describe_blocks = []

    lines = stderr_content.split('\n')

    for i, line in enumerate(lines):
        file_match = re.search(file_pattern, line)
        if file_match:
            current_file = file_match.group(2)
            current_describe_blocks = []
            continue

        # Ignore everything before the first test-file header.
        if not current_file:
            continue

        # Heuristic describe-block detection: a non-empty, non-summary line
        # that is followed within the next few lines by a test-result line is
        # taken as the current describe header. Only the most recent header
        # is kept, so nested describes collapse to a single level.
        if line.strip() and not re.match(r'^\s*[✓✕✗×○]', line) and not line.strip().startswith('Test Suites:') and not line.strip().startswith('Tests:'):
            stripped = line.strip()
            if stripped and not stripped.startswith('PASS') and not stripped.startswith('FAIL') and not stripped.startswith('Time:') and not stripped.startswith('(node:'):
                is_describe = False
                for j in range(i + 1, min(i + 10, len(lines))):
                    if re.match(r'^\s*[✓✕✗×○]', lines[j]):
                        is_describe = True
                        break
                    elif lines[j].strip().startswith('PASS') or lines[j].strip().startswith('FAIL'):
                        break

                if is_describe:
                    indent_level = len(line) - len(line.lstrip())
                    if indent_level >= 2:  # At least some indentation
                        current_describe_blocks = [stripped]

        # Individual pass/fail results.
        test_match = re.match(test_pattern, line)
        if test_match:
            test_name = test_match.group(1).strip()
            full_name = current_file
            if current_describe_blocks:
                full_name += f" | {current_describe_blocks[0]} {test_name}"
            else:
                full_name += f" | {test_name}"

            # Only ✓ marks a pass; any other accepted symbol is a failure.
            if line.strip().startswith('✓'):
                results.append(TestResult(full_name, TestStatus.PASSED))
            else:
                results.append(TestResult(full_name, TestStatus.FAILED))

        # Skipped tests.
        skip_match = re.match(skip_pattern, line)
        if skip_match:
            test_name = skip_match.group(1).strip()
            full_name = current_file
            if current_describe_blocks:
                full_name += f" | {current_describe_blocks[0]} {test_name}"
            else:
                full_name += f" | {test_name}"
            results.append(TestResult(full_name, TestStatus.SKIPPED))

    return results




def export_to_json(results: List[TestResult], output_path: Path) -> None:
    """
    Export the test results to a JSON file.

    Duplicate test names are collapsed; the last occurrence wins (relevant
    when tests are retried). Output shape: {"tests": [{"name", "status"}]}.

    Args:
        results: List of TestResult objects
        output_path: Path to the output JSON file
    """

    # Deduplicate by fully-qualified name, keeping the latest result.
    deduped = {}
    for item in results:
        deduped[item.name] = item

    payload = {
        'tests': [
            {'name': item.name, 'status': item.status.name}
            for item in deduped.values()
        ]
    }

    with open(output_path, 'w') as handle:
        handle.write(json.dumps(payload, indent=2))


def main(stdout_path: Path, stderr_path: Path, output_path: Path) -> None:
    """
    Read the captured test logs, parse them, and write structured results.

    Args:
        stdout_path: Path to the stdout file
        stderr_path: Path to the stderr file
        output_path: Path to the output JSON file
    """
    stdout_text = stdout_path.read_text()
    stderr_text = stderr_path.read_text()

    # Parsing happens on the stderr stream (where Jest reports); see
    # parse_test_output for details.
    parsed = parse_test_output(stdout_text, stderr_text)

    export_to_json(parsed, output_path)


if __name__ == '__main__':
    # Expect exactly three arguments: stdout log, stderr log, output JSON.
    if len(sys.argv) != 4:
        # Fixed: the usage line previously named the script "parsing.py",
        # but this file is parser.py. Errors go to stderr so they do not
        # pollute any captured stdout.
        print('Usage: python parser.py <stdout_file> <stderr_file> <output_json>', file=sys.stderr)
        sys.exit(1)

    main(Path(sys.argv[1]), Path(sys.argv[2]), Path(sys.argv[3]))
Loading