.github/workflows/integration-runner.yml

name: Run Integration Tests

on:
  pull_request:
    types: [labeled]
  workflow_dispatch:
    inputs:
      reason:
        description: 'Reason for manual trigger'
        required: true
        default: ''
  schedule:
    - cron: '30 22 * * *'  # Runs at 10:30pm UTC every day

env:
  N_PROCESSES: 10 # Global configuration for number of parallel processes for evaluation

jobs:
  run-integration-tests:
    if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
    runs-on: ubuntu-latest
    permissions:
      contents: "read"
      id-token: "write"
      pull-requests: "write"
      issues: "write"
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install poetry via pipx
        run: pipx install poetry

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"

      - name: Comment on PR if 'integration-test' label is present
        if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.

      - name: Install Python dependencies using Poetry
        run: poetry install --without evaluation,llama-index

      - name: Configure config.toml for testing with Haiku
        env:
          LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          MAX_ITERATIONS: 10
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"$LLM_MODEL\"" >> config.toml
          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      - name: Build environment
        run: make build

      - name: Run integration test evaluation for Haiku
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' 10 $N_PROCESSES '' 'haiku_run'

          # get integration tests report
          REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE_HAIKU"
          echo "INTEGRATION_TEST_REPORT_HAIKU<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE_HAIKU >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Wait a little bit
        run: sleep 10

      - name: Configure config.toml for testing with DeepSeek
        env:
          LLM_MODEL: "litellm_proxy/deepseek-chat"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          MAX_ITERATIONS: 10
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"$LLM_MODEL\"" >> config.toml
          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      - name: Run integration test evaluation for DeepSeek
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' 10 $N_PROCESSES '' 'deepseek_run'

          # get integration tests report
          REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"
          echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE_DEEPSEEK >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      # -------------------------------------------------------------
      # Run DelegatorAgent tests for Haiku, limited to t01 and t02
      - name: Wait a little bit (again)
        run: sleep 5

      - name: Configure config.toml for testing DelegatorAgent (Haiku)
        env:
          LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          MAX_ITERATIONS: 30
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"$LLM_MODEL\"" >> config.toml
          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      - name: Run integration test evaluation for DelegatorAgent (Haiku)
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' 30 $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_haiku_run'

          # Find and export the delegator test results
          REPORT_FILE_DELEGATOR_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/*haiku*_maxiter_30_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE_DELEGATOR_HAIKU: $REPORT_FILE_DELEGATOR_HAIKU"
          echo "INTEGRATION_TEST_REPORT_DELEGATOR_HAIKU<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE_DELEGATOR_HAIKU >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      # -------------------------------------------------------------
      # Run DelegatorAgent tests for DeepSeek, limited to t01 and t02
      - name: Wait a little bit (again)
        run: sleep 5

      - name: Configure config.toml for testing DelegatorAgent (DeepSeek)
        env:
          LLM_MODEL: "litellm_proxy/deepseek-chat"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          MAX_ITERATIONS: 30
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"$LLM_MODEL\"" >> config.toml
          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
          echo "temperature = 0.0" >> config.toml
      - name: Run integration test evaluation for DelegatorAgent (DeepSeek)
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' 30 $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_deepseek_run'

          # Find and export the delegator test results
          REPORT_FILE_DELEGATOR_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/deepseek*_maxiter_30_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE_DELEGATOR_DEEPSEEK: $REPORT_FILE_DELEGATOR_DEEPSEEK"
          echo "INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE_DELEGATOR_DEEPSEEK >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
      # -------------------------------------------------------------
      # Run VisualBrowsingAgent tests for DeepSeek, limited to t05 and t06
      - name: Wait a little bit (again)
        run: sleep 5

      - name: Configure config.toml for testing VisualBrowsingAgent (DeepSeek)
        env:
          LLM_MODEL: "litellm_proxy/deepseek-chat"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
          MAX_ITERATIONS: 15
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"$LLM_MODEL\"" >> config.toml
          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
          echo "temperature = 0.0" >> config.toml
      - name: Run integration test evaluation for VisualBrowsingAgent (DeepSeek)
        env:
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD VisualBrowsingAgent '' 15 $N_PROCESSES "t05_simple_browsing,t06_github_pr_browsing.py" 'visualbrowsing_deepseek_run'

          # Find and export the visual browsing agent test results
          REPORT_FILE_VISUALBROWSING_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/VisualBrowsingAgent/deepseek*_maxiter_15_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE_VISUALBROWSING_DEEPSEEK: $REPORT_FILE_VISUALBROWSING_DEEPSEEK"
          echo "INTEGRATION_TEST_REPORT_VISUALBROWSING_DEEPSEEK<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE_VISUALBROWSING_DEEPSEEK >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Create archive of evaluation outputs
        run: |
          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
          cd evaluation/evaluation_outputs/outputs  # Change to the outputs directory
          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* integration_tests/DelegatorAgent/* integration_tests/VisualBrowsingAgent/* # Only include the actual result directories

      - name: Upload evaluation results as artifact
        uses: actions/upload-artifact@v4
        id: upload_results_artifact
        with:
          name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }}
          path: integration_tests_*.tar.gz

      - name: Get artifact URLs
        run: |
          echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV

      - name: Set timestamp and trigger reason
        run: |
          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
            echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV
          elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
            echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV
          else
            echo "TRIGGER_REASON=nightly-scheduled" >> $GITHUB_ENV
          fi

      - name: Comment with results and artifact link
        id: create_comment
        uses: KeisukeYamashita/create-comment@v1
        with:
          # if triggered by PR, use PR number, otherwise use 5318 as fallback issue number for manual triggers
          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5318 }}
          unique: false
          comment: |
              Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
              Commit: ${{ github.sha }}
              **Integration Tests Report (Haiku)**
              Haiku LLM Test Results:
              ${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
              ---
              **Integration Tests Report (DeepSeek)**
              DeepSeek LLM Test Results:
              ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
              ---
                **Integration Tests Report Delegator (Haiku)**
              ${{ env.INTEGRATION_TEST_REPORT_DELEGATOR_HAIKU }}
              ---
              **Integration Tests Report Delegator (DeepSeek)**
              ${{ env.INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK }}
              ---
              **Integration Tests Report VisualBrowsing (DeepSeek)**
              ${{ env.INTEGRATION_TEST_REPORT_VISUALBROWSING_DEEPSEEK }}
              ---
              Download testing outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})