Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions .ee-bench/codegen/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# syntax=docker/dockerfile:1
###############################################
# BASE IMAGE
###############################################
FROM ubuntu:20.04

###############################################
# WORKING DIRECTORY
###############################################
# WORKDIR creates the directory if it does not exist; no separate mkdir needed.
WORKDIR /app

###############################################
# SYSTEM DEPENDENCIES
###############################################
# NOTE: DEBIAN_FRONTEND is deliberately persisted as ENV (not build-arg only)
# because the eval harness re-exports it from dockerfile_env_vars (see run.sh).
ENV DEBIAN_FRONTEND=noninteractive
# ca-certificates is listed explicitly so HTTPS (curl / git clone) keeps
# working with --no-install-recommends; packages sorted for diffability.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    curl \
    git \
    python-is-python3 \
    python3 \
    python3-pip \
    python3-setuptools \
    && rm -rf /var/lib/apt/lists/*

# Install Node.js 18 (the NodeSource setup script runs apt-get update itself);
# clean the apt lists in the same layer so they do not bloat the image.
RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - && \
    apt-get install -y nodejs && \
    rm -rf /var/lib/apt/lists/*

# Install Yarn
RUN npm install -g yarn

# CRITICAL: Verify pip is available after Python installation (needed for parsing.py dependencies)
RUN python -m pip --version && echo "✓ pip is available" || (echo "✗ pip not found" && exit 1)

###############################################
# REPO SETUP
###############################################
# Clone and pin to a fixed commit in a single layer so the checkout can never
# be separated from the clone by layer caching.
RUN git clone https://github.com/dpaia/webclients.git . && \
    git checkout 7366a9584597e6c06b10dee477e043cded019649

###############################################
# ENTRYPOINT / CMD
###############################################

# Write preprocess and build scripts
ENV PYTEST_ADDOPTS="--tb=short -v --continue-on-collection-errors --reruns=3"
ENV UV_HTTP_TIMEOUT=60

RUN cat <<'EOFPREP' > /preprocess.sh
#!/bin/bash

cd /app

# Reset the working tree to the evaluation base commit and drop all
# untracked/ignored files so the build starts from a clean state.
git reset --hard 9b35b414f77c6165550550fdda8b25bbc74aac7b
git clean -fdx
git checkout 9b35b414f77c6165550550fdda8b25bbc74aac7b

cd /

EOFPREP
RUN chmod +x /preprocess.sh
RUN /preprocess.sh

# The build script intentionally tolerates pip/postinstall failures (|| true)
# so that a partially working toolchain still produces a usable image.
RUN cat <<'EOFBUILD' > /build.sh
#!/bin/sh
pip install setuptools || true
sleep 3
pip install pytest-rerunfailures
export PYTEST_ADDOPTS="--tb=short -v --continue-on-collection-errors --reruns=3"

cd /app
set -e

python -m pip --version && echo "✓ pip is available" || (echo "✗ pip not found" && exit 1)

yarn install --no-frozen-lockfile

export NODE_OPTIONS="--max-old-space-size=4096"

echo "================= 0909 BUILD START 0909 ================="
yarn workspaces foreach -A run postinstall || true
echo "================= 0909 BUILD END 0909 ================="

EOFBUILD
RUN chmod +x /build.sh
RUN /build.sh

LABEL ee-bench.type="codegen"
LABEL ee-bench.version="1.0"
# Strip the benchmark scaffolding out of the final image; ignore if absent.
RUN rm -rf /app/.ee-bench/ 2>/dev/null || true
102 changes: 102 additions & 0 deletions .ee-bench/codegen/eval/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/env bash
# Evaluation driver: resets the repo to the base commit, applies the test
# patch and the candidate submission patch, runs the SWE-bench Pro test
# script, and emits an EE-bench JSON v2.0 result document on stdout.
set -euo pipefail

PROJECT_ROOT="${EE_BENCH_PROJECT_ROOT:-/app}"
EVAL_DIR="/ee-bench/eval"
SUBMISSION_DIR="/ee-bench/submission"

# --- Environment from Dockerfile (rendered from dockerfile_env_vars) ---
export DEBIAN_FRONTEND="noninteractive"
export PYTEST_ADDOPTS="--tb=short -v --continue-on-collection-errors --reruns=3"
export UV_HTTP_TIMEOUT="60"

# --- Reset to base commit ---
# 2>/dev/null only silences git's stderr; under `set -e` a failing git
# command here still aborts the whole script.
cd "$PROJECT_ROOT"
git reset --hard "9b35b414f77c6165550550fdda8b25bbc74aac7b" 2>/dev/null
git checkout "9b35b414f77c6165550550fdda8b25bbc74aac7b" 2>/dev/null
git clean -fd 2>/dev/null

# --- Fetch commits referenced by before_repo_set_cmd ---
# Best-effort (|| true): the commit may already be present from the image build.
git fetch origin 1501eb765873b2884b6f1944fd242ecfc9d6b103 2>/dev/null || true

# --- before_repo_set_cmd (from HF metadata, may be empty) ---
# Re-reset to the base commit, then pull the reference test file from the
# fix commit into the working tree.
git reset --hard 9b35b414f77c6165550550fdda8b25bbc74aac7b
git clean -fd
git checkout 9b35b414f77c6165550550fdda8b25bbc74aac7b
git checkout 1501eb765873b2884b6f1944fd242ecfc9d6b103 -- packages/components/components/smartBanner/SmartBanner.test.tsx

# --- Apply evaluation data (test patch) ---
# NOTE(review): `|| true` means a patch that fails to apply is silently
# ignored, yet the JSON emitted below always reports patch_applied as pass —
# confirm this is the intended harness behavior.
if [ -f "$EVAL_DIR/test_patch.diff" ]; then
git apply -v "$EVAL_DIR/test_patch.diff" 2>/dev/null || true
fi

# --- Apply candidate submission ---
if [ -f "$SUBMISSION_DIR/patch.diff" ]; then
git apply -v "$SUBMISSION_DIR/patch.diff" 2>/dev/null || true
fi

# --- Run tests via SWE-bench Pro run script ---
# The argument is the comma-separated list of test files to run; `|| true`
# keeps the driver alive so results can still be parsed after test failures.
bash "$EVAL_DIR/scripts/run_script.sh" "components/smartBanner/SmartBanner.test.ts,packages/components/components/smartBanner/SmartBanner.test.tsx" \
> /tmp/stdout.log 2> /tmp/stderr.log || true


# --- Parse results ---
# parser.py reads both logs and writes structured results to /tmp/output.json.
python3 "$EVAL_DIR/scripts/parser.py" /tmp/stdout.log /tmp/stderr.log /tmp/output.json

# --- Convert parser output to EE-bench JSON v2.0 format ---
# The inline Python below is one double-quoted shell string; it contains no
# `$`, so no shell expansion happens inside it.
python3 -c "
import json, sys, datetime
with open('/tmp/output.json') as f:
data = json.load(f)
stdout = open('/tmp/stdout.log').read()
stderr = open('/tmp/stderr.log').read()

passed = [t for t in data.get('tests', []) if t['status'] == 'PASSED']
failed = [t for t in data.get('tests', []) if t['status'] in ('FAILED', 'ERROR')]
skipped = [t for t in data.get('tests', []) if t['status'] == 'SKIPPED']

summary = {
'total': len(data.get('tests', [])),
'passed': len(passed),
'failed': len(failed),
'errors': 0,
'skipped': len(skipped),
}
passed_tests = [{'name': t['name']} for t in passed]
failed_tests = [{'name': t['name']} for t in failed]

result = {
'schema_version': '2.0',
'command': 'run',
'status': 'success',
'timestamp': datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
'criteria': [
{
'criterion': 'patch_applied',
'status': 'pass',
},
{
'criterion': 'compilation',
'status': 'pass',
},
{
'criterion': 'tests',
'status': 'pass' if not failed else 'fail',
'summary': summary,
'passed_tests': passed_tests,
'failed_tests': failed_tests,
},
],
'stdout': stdout,
'stderr': stderr,
# Deprecated v1.0 fields for backward compat
'patch_applied': True,
'compile_success': True,
'summary': summary,
'passed_tests': passed_tests,
'failed_tests': failed_tests,
}
print(json.dumps(result))
"
176 changes: 176 additions & 0 deletions .ee-bench/codegen/eval/scripts/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
"""
Test Results Parser

This script parses test execution outputs to extract structured test results.

Input:
- stdout_file: Path to the file containing standard output from test execution
- stderr_file: Path to the file containing standard error from test execution

Output:
- JSON file containing parsed test results with structure:
{
"tests": [
{
"name": "test_name",
"status": "PASSED|FAILED|SKIPPED|ERROR"
},
...
]
}
"""

import dataclasses
import json
import sys
import re
from enum import Enum
from pathlib import Path
from typing import List

class TestStatus(Enum):
    """Possible outcomes of a single parsed test.

    Values are explicit integers (rather than ``auto()``) so the ordering
    and any serialized values stay stable.
    """

    PASSED = 1   # Test ran and succeeded (✓ line in the Jest report).
    FAILED = 2   # Test ran and failed (✗/× line in the Jest report).
    SKIPPED = 3  # Test was skipped/todo (○ line in the Jest report).
    ERROR = 4    # Not produced by parse_test_output in this file; reserved.


@dataclasses.dataclass
class TestResult:
    """A single parsed test outcome (name plus status)."""

    # Fully-qualified test name, built by parse_test_output as
    # "<test file> | [<describe block> ]<test title>".
    name: str
    # Outcome of this test; see TestStatus.
    status: TestStatus



def parse_test_output(stdout_content: str, stderr_content: str) -> List[TestResult]:
    """
    Parse Jest test output content and extract test results.

    Jest writes its per-test report to stderr, so only ``stderr_content`` is
    scanned; ``stdout_content`` is accepted for interface compatibility.

    Jest output format includes:
    - PASS/FAIL indicators for test files
    - Individual test results prefixed with a pass (✓), fail (✕/✗/×) or
      skip (○) symbol, optionally suffixed with a duration "(N ms)"
    - Test names in nested describe/it blocks

    Args:
        stdout_content: Captured standard output (currently unused).
        stderr_content: Captured standard error containing the Jest report.

    Returns:
        TestResult entries named "<file> | [<describe> ]<title>".
    """
    results = []

    # A test-file header line, e.g. "PASS packages/foo/Bar.test.tsx".
    file_pattern = r'(PASS|FAIL)\s+(.+?\.(?:test|spec)\.[jt]sx?)'

    # An individual test-result line. Modern Jest renders failures as
    # ✕ (U+2715); ✗ and × are also accepted for other reporters/fonts.
    test_pattern = r'^\s*[✓✕✗×]\s+(.+?)(?:\s+\(\d+\s*ms\))?$'

    # A skipped/todo test line.
    skip_pattern = r'^\s*○\s+(.+?)(?:\s+\(\d+\s*ms\))?$'

    current_file = None
    current_describe_blocks = []

    lines = stderr_content.split('\n')

    for i, line in enumerate(lines):
        file_match = re.search(file_pattern, line)
        if file_match:
            current_file = file_match.group(2)
            current_describe_blocks = []
            continue

        # Ignore everything before the first test-file header.
        if not current_file:
            continue

        # Heuristic describe-block detection: a non-empty, non-summary line
        # that is followed within the next few lines by a test-result line is
        # taken as the current describe header. Only the most recent header
        # is kept, so nested describes collapse to a single level.
        if line.strip() and not re.match(r'^\s*[✓✕✗×○]', line) and not line.strip().startswith('Test Suites:') and not line.strip().startswith('Tests:'):
            stripped = line.strip()
            if stripped and not stripped.startswith('PASS') and not stripped.startswith('FAIL') and not stripped.startswith('Time:') and not stripped.startswith('(node:'):
                is_describe = False
                for j in range(i + 1, min(i + 10, len(lines))):
                    if re.match(r'^\s*[✓✕✗×○]', lines[j]):
                        is_describe = True
                        break
                    elif lines[j].strip().startswith('PASS') or lines[j].strip().startswith('FAIL'):
                        break

                if is_describe:
                    indent_level = len(line) - len(line.lstrip())
                    if indent_level >= 2:  # At least some indentation
                        current_describe_blocks = [stripped]

        # Individual pass/fail results.
        test_match = re.match(test_pattern, line)
        if test_match:
            test_name = test_match.group(1).strip()
            full_name = current_file
            if current_describe_blocks:
                full_name += f" | {current_describe_blocks[0]} {test_name}"
            else:
                full_name += f" | {test_name}"

            # Only ✓ marks a pass; any other accepted symbol is a failure.
            if line.strip().startswith('✓'):
                results.append(TestResult(full_name, TestStatus.PASSED))
            else:
                results.append(TestResult(full_name, TestStatus.FAILED))

        # Skipped tests.
        skip_match = re.match(skip_pattern, line)
        if skip_match:
            test_name = skip_match.group(1).strip()
            full_name = current_file
            if current_describe_blocks:
                full_name += f" | {current_describe_blocks[0]} {test_name}"
            else:
                full_name += f" | {test_name}"
            results.append(TestResult(full_name, TestStatus.SKIPPED))

    return results




def export_to_json(results: List[TestResult], output_path: Path) -> None:
    """
    Export the test results to a JSON file.

    Duplicate test names are collapsed; the last occurrence wins (relevant
    when tests are retried). Output shape: {"tests": [{"name", "status"}]}.

    Args:
        results: List of TestResult objects
        output_path: Path to the output JSON file
    """

    # Deduplicate by fully-qualified name, keeping the latest result.
    deduped = {}
    for item in results:
        deduped[item.name] = item

    payload = {
        'tests': [
            {'name': item.name, 'status': item.status.name}
            for item in deduped.values()
        ]
    }

    with open(output_path, 'w') as handle:
        handle.write(json.dumps(payload, indent=2))


def main(stdout_path: Path, stderr_path: Path, output_path: Path) -> None:
    """
    Read the captured test logs, parse them, and write structured results.

    Args:
        stdout_path: Path to the stdout file
        stderr_path: Path to the stderr file
        output_path: Path to the output JSON file
    """
    stdout_text = stdout_path.read_text()
    stderr_text = stderr_path.read_text()

    # Parsing happens on the stderr stream (where Jest reports); see
    # parse_test_output for details.
    parsed = parse_test_output(stdout_text, stderr_text)

    export_to_json(parsed, output_path)


if __name__ == '__main__':
    # Expect exactly three arguments: stdout log, stderr log, output JSON.
    if len(sys.argv) != 4:
        # Fixed: the usage line previously named the script "parsing.py",
        # but this file is parser.py. Errors go to stderr so they do not
        # pollute any captured stdout.
        print('Usage: python parser.py <stdout_file> <stderr_file> <output_json>', file=sys.stderr)
        sys.exit(1)

    main(Path(sys.argv[1]), Path(sys.argv[2]), Path(sys.argv[3]))
Loading