Run Integration Tests #432
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Run Integration Tests

# Triggers: a label being added to a pull request, a manual dispatch that
# carries a free-text reason, and a nightly cron schedule.
on:
  pull_request:
    types:
      - labeled
  workflow_dispatch:
    inputs:
      reason:
        description: "Reason for manual trigger"
        required: true
        default: ""
  schedule:
    # Runs at 10:30pm UTC every day
    - cron: "30 22 * * *"

env:
  # Global configuration for number of parallel processes for evaluation
  N_PROCESSES: 10
jobs:
  # Runs the integration-test evaluation for several agent/model pairs, exports
  # each generated report.md into the job environment, archives the outputs as
  # an artifact, and posts one summary comment on the PR (or fallback issue).
  run-integration-tests:
    # Run when the 'integration-test' label is added to a PR, on manual
    # dispatch, or on the nightly schedule.
    if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
    runs-on: ubuntu-latest
    permissions:
      contents: "read"
      id-token: "write"
      pull-requests: "write"
      issues: "write"
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install poetry via pipx
        run: pipx install poetry

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"

      - name: Comment on PR if 'integration-test' label is present
        if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.

      - name: Install Python dependencies using Poetry
        run: poetry install --without evaluation,llama-index

      - name: Configure config.toml for testing with Haiku
        env:
          LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          # Not referenced by this step's script.
          MAX_ITERATIONS: 10
        run: |
          # Write the whole [llm.eval] section in one go (overwrites any previous config).
          cat > config.toml <<TOML
          [llm.eval]
          model = "$LLM_MODEL"
          api_key = "$LLM_API_KEY"
          base_url = "$LLM_BASE_URL"
          temperature = 0.0
          TOML

      - name: Build environment
        run: make build

      - name: Run integration test evaluation for Haiku
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' 10 $N_PROCESSES '' 'haiku_run'

          # get integration tests report
          REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE_HAIKU"

          # Export the report as a multiline env var. A random delimiter keeps a
          # report line from terminating the value early; a placeholder is
          # written when no report was found so the final comment says so.
          DELIMITER="EOF_$(dd if=/dev/urandom bs=15 count=1 status=none | base64)"
          {
            echo "INTEGRATION_TEST_REPORT_HAIKU<<$DELIMITER"
            if [[ -f "$REPORT_FILE_HAIKU" ]]; then
              cat "$REPORT_FILE_HAIKU"
            else
              echo "_No report.md was found for this run._"
            fi
            echo
            echo "$DELIMITER"
          } >> "$GITHUB_ENV"

      - name: Wait a little bit
        run: sleep 10

      - name: Configure config.toml for testing with DeepSeek
        env:
          LLM_MODEL: "litellm_proxy/deepseek-chat"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          MAX_ITERATIONS: 10
        run: |
          cat > config.toml <<TOML
          [llm.eval]
          model = "$LLM_MODEL"
          api_key = "$LLM_API_KEY"
          base_url = "$LLM_BASE_URL"
          temperature = 0.0
          TOML

      - name: Run integration test evaluation for DeepSeek
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' 10 $N_PROCESSES '' 'deepseek_run'

          # get integration tests report
          REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"

          # Random delimiter + placeholder on missing report (see rationale in the step body).
          DELIMITER="EOF_$(dd if=/dev/urandom bs=15 count=1 status=none | base64)"
          {
            echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<$DELIMITER"
            if [[ -f "$REPORT_FILE_DEEPSEEK" ]]; then
              cat "$REPORT_FILE_DEEPSEEK"
            else
              echo "_No report.md was found for this run._"
            fi
            echo
            echo "$DELIMITER"
          } >> "$GITHUB_ENV"

      # -------------------------------------------------------------
      # Run DelegatorAgent tests for Haiku, limited to t01 and t02
      - name: Wait a little bit (again)
        run: sleep 5

      - name: Configure config.toml for testing DelegatorAgent (Haiku)
        env:
          LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          MAX_ITERATIONS: 30
        run: |
          cat > config.toml <<TOML
          [llm.eval]
          model = "$LLM_MODEL"
          api_key = "$LLM_API_KEY"
          base_url = "$LLM_BASE_URL"
          temperature = 0.0
          TOML

      - name: Run integration test evaluation for DelegatorAgent (Haiku)
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' 30 $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_haiku_run'

          # Find and export the delegator test results
          REPORT_FILE_DELEGATOR_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/*haiku*_maxiter_30_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE_DELEGATOR_HAIKU: $REPORT_FILE_DELEGATOR_HAIKU"

          # Random delimiter + placeholder on missing report.
          DELIMITER="EOF_$(dd if=/dev/urandom bs=15 count=1 status=none | base64)"
          {
            echo "INTEGRATION_TEST_REPORT_DELEGATOR_HAIKU<<$DELIMITER"
            if [[ -f "$REPORT_FILE_DELEGATOR_HAIKU" ]]; then
              cat "$REPORT_FILE_DELEGATOR_HAIKU"
            else
              echo "_No report.md was found for this run._"
            fi
            echo
            echo "$DELIMITER"
          } >> "$GITHUB_ENV"

      # -------------------------------------------------------------
      # Run DelegatorAgent tests for DeepSeek, limited to t01 and t02
      - name: Wait a little bit (again)
        run: sleep 5

      - name: Configure config.toml for testing DelegatorAgent (DeepSeek)
        env:
          LLM_MODEL: "litellm_proxy/deepseek-chat"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          MAX_ITERATIONS: 30
        run: |
          cat > config.toml <<TOML
          [llm.eval]
          model = "$LLM_MODEL"
          api_key = "$LLM_API_KEY"
          base_url = "$LLM_BASE_URL"
          temperature = 0.0
          TOML

      - name: Run integration test evaluation for DelegatorAgent (DeepSeek)
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' 30 $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_deepseek_run'

          # Find and export the delegator test results
          REPORT_FILE_DELEGATOR_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/deepseek*_maxiter_30_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE_DELEGATOR_DEEPSEEK: $REPORT_FILE_DELEGATOR_DEEPSEEK"

          # Random delimiter + placeholder on missing report.
          DELIMITER="EOF_$(dd if=/dev/urandom bs=15 count=1 status=none | base64)"
          {
            echo "INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK<<$DELIMITER"
            if [[ -f "$REPORT_FILE_DELEGATOR_DEEPSEEK" ]]; then
              cat "$REPORT_FILE_DELEGATOR_DEEPSEEK"
            else
              echo "_No report.md was found for this run._"
            fi
            echo
            echo "$DELIMITER"
          } >> "$GITHUB_ENV"

      # -------------------------------------------------------------
      # Run VisualBrowsingAgent tests for DeepSeek, limited to t05 and t06
      - name: Wait a little bit (again)
        run: sleep 5

      - name: Configure config.toml for testing VisualBrowsingAgent (DeepSeek)
        env:
          LLM_MODEL: "litellm_proxy/deepseek-chat"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          MAX_ITERATIONS: 15
        run: |
          cat > config.toml <<TOML
          [llm.eval]
          model = "$LLM_MODEL"
          api_key = "$LLM_API_KEY"
          base_url = "$LLM_BASE_URL"
          temperature = 0.0
          TOML

      - name: Run integration test evaluation for VisualBrowsingAgent (DeepSeek)
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          # NOTE(review): the t06 ID previously carried a ".py" suffix, unlike every
          # other ID passed to run_infer.sh in this workflow; it is written without
          # the extension here on the assumption that IDs are extension-less - confirm.
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD VisualBrowsingAgent '' 15 $N_PROCESSES "t05_simple_browsing,t06_github_pr_browsing" 'visualbrowsing_deepseek_run'

          # Find and export the visual browsing agent test results
          REPORT_FILE_VISUALBROWSING_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/VisualBrowsingAgent/deepseek*_maxiter_15_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE_VISUALBROWSING_DEEPSEEK: $REPORT_FILE_VISUALBROWSING_DEEPSEEK"

          # Random delimiter + placeholder on missing report.
          DELIMITER="EOF_$(dd if=/dev/urandom bs=15 count=1 status=none | base64)"
          {
            echo "INTEGRATION_TEST_REPORT_VISUALBROWSING_DEEPSEEK<<$DELIMITER"
            if [[ -f "$REPORT_FILE_VISUALBROWSING_DEEPSEEK" ]]; then
              cat "$REPORT_FILE_VISUALBROWSING_DEEPSEEK"
            else
              echo "_No report.md was found for this run._"
            fi
            echo
            echo "$DELIMITER"
          } >> "$GITHUB_ENV"

      - name: Create archive of evaluation outputs
        run: |
          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
          cd evaluation/evaluation_outputs/outputs  # Change to the outputs directory
          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* integration_tests/DelegatorAgent/* integration_tests/VisualBrowsingAgent/*  # Only include the actual result directories

      - name: Upload evaluation results as artifact
        uses: actions/upload-artifact@v4
        id: upload_results_artifact
        with:
          name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }}
          path: integration_tests_*.tar.gz

      - name: Get artifact URLs
        run: |
          echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> "$GITHUB_ENV"

      - name: Set timestamp and trigger reason
        env:
          # Event data is handed to the script through the environment instead of
          # being interpolated into it, so a crafted manual-trigger reason cannot
          # inject shell commands.
          EVENT_NAME: ${{ github.event_name }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          MANUAL_REASON: ${{ github.event.inputs.reason }}
        run: |
          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> "$GITHUB_ENV"
          if [[ "$EVENT_NAME" == "pull_request" ]]; then
            echo "TRIGGER_REASON=pr-$PR_NUMBER" >> "$GITHUB_ENV"
          elif [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
            # Flatten line breaks so the reason stays a single KEY=value entry
            # and cannot add extra variables to GITHUB_ENV.
            REASON_ONE_LINE=$(printf '%s' "$MANUAL_REASON" | tr '\r\n' '  ')
            echo "TRIGGER_REASON=manual-$REASON_ONE_LINE" >> "$GITHUB_ENV"
          else
            echo "TRIGGER_REASON=nightly-scheduled" >> "$GITHUB_ENV"
          fi

      - name: Comment with results and artifact link
        id: create_comment
        uses: KeisukeYamashita/create-comment@v1
        with:
          # if triggered by PR, use PR number, otherwise use 5318 as fallback issue number for manual triggers
          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5318 }}
          unique: false
          comment: |
            Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
            Commit: ${{ github.sha }}
            **Integration Tests Report (Haiku)**
            Haiku LLM Test Results:
            ${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
            ---
            **Integration Tests Report (DeepSeek)**
            DeepSeek LLM Test Results:
            ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
            ---
            **Integration Tests Report Delegator (Haiku)**
            ${{ env.INTEGRATION_TEST_REPORT_DELEGATOR_HAIKU }}
            ---
            **Integration Tests Report Delegator (DeepSeek)**
            ${{ env.INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK }}
            ---
            **Integration Tests Report VisualBrowsing (DeepSeek)**
            ${{ env.INTEGRATION_TEST_REPORT_VISUALBROWSING_DEEPSEEK }}
            ---
            Download testing outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})