# .github/workflows/v3-nightly.yml

name: TorchBench V3 nightly (A100)
on:
  # Nightly run at 6:00 PM UTC; K8s containers will roll out at 12PM EST.
  schedule:
    - cron: "0 18 * * *"
  # Manual runs are allowed as well.
  workflow_dispatch:

jobs:
  run-benchmark:
    # Skip the job unless the repository owner is 'pytorch'.
    if: github.repository_owner == 'pytorch'
    runs-on:
      - a100-runner
    environment: docker-s3-upload
    env:
      # Conda environments: CONDA_ENV is created as a clone of BASE_CONDA_ENV
      # by one of the steps below and removed again by the final step.
      BASE_CONDA_ENV: torchbench
      CONDA_ENV: torchbench-v3-nightly
      # Platform label passed to the regression detector and the upload scripts.
      PLATFORM_NAME: gcp_a100
      # Script sourced by the shell steps below before they call conda/python.
      SETUP_SCRIPT: /workspace/setup_instance.sh
      IS_GHA: "1"
      BUILD_ENVIRONMENT: benchmark-nightly
      # Credentials taken from repository/environment secrets.
      TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
    steps:
      - name: Checkout TorchBench v3.0 branch
        uses: actions/checkout@v3
        with:
          # Place the checkout in ./benchmark; later steps pushd into it.
          path: benchmark
          # Ref to check out (quoted so it is always read as a string).
          ref: "v3.0"
      - name: Tune Nvidia GPU
        run: |
          # Turn on persistence mode, then set the application clocks
          # (nvidia-smi takes the pair as memory,graphics).
          sudo nvidia-smi --persistence-mode=1
          sudo nvidia-smi --applications-clocks=1215,1410
          # Print the resulting GPU state into the job log.
          nvidia-smi
      - name: Clone and setup conda env
        run: |
          # Source the setup script with CONDA_ENV overridden to the base env name.
          # The prefix assignment is scoped to this one command in bash's default
          # (non-POSIX) mode, so the next line sees the job-level CONDA_ENV again.
          # NOTE(review): assumes the setup script does not itself reassign CONDA_ENV.
          CONDA_ENV="${BASE_CONDA_ENV}" . "${SETUP_SCRIPT}"
          # --yes: this shell is non-interactive, so any confirmation prompt (for
          # example when an env of the same name was left behind by an earlier run)
          # cannot be answered and would make the step fail.
          conda create --yes --name "${CONDA_ENV}" --clone "${BASE_CONDA_ENV}"
      - name: Install TorchBench
        run: |
          # Echo every command into the job log.
          set -o xtrace
          source "${SETUP_SCRIPT}"
          # install.py is run from inside the checked-out repository.
          pushd benchmark
          python install.py
      - name: Run the torch-nightly userbenchmark
        run: |
          source "${SETUP_SCRIPT}"
          # Drop results left over from an earlier run, both the staged copy in
          # the workspace and the .userbenchmark directory inside the checkout.
          [ ! -d benchmark-output ] || rm -rf benchmark-output
          pushd benchmark
          [ ! -d .userbenchmark ] || rm -rf .userbenchmark
          python run_benchmark.py torch-nightly -c v3-cuda-tests.yaml
          # Stage the fresh results next to the checkout for the following steps.
          cp -r ./.userbenchmark/torch-nightly ../benchmark-output
      - name: Detect potential regressions
        # A failure in this step does not fail the job.
        continue-on-error: true
        run: |
          . "${SETUP_SCRIPT}"
          pushd benchmark
          # Collect the metrics files in reverse-sorted path order.
          # -maxdepth comes before -name because GNU find warns when a global
          # option follows a test; mapfile plus quoting keeps every path as one
          # array element instead of word-splitting and glob-expanding it.
          mapfile -t RESULTS < <(find "${PWD}/../benchmark-output" -maxdepth 2 -name "metrics-*.json" | sort -r)
          # TODO: the following assumes only one metrics-*.json is found. It will keep
          # overwriting gh-issue.md if multiple are found.
          for r in "${RESULTS[@]}"; do
            python regression_detector.py --platform "${PLATFORM_NAME}" --treatment "${r}" --owner @xuzhao9 \
              --gh-issue-path gh-issue.md --errors-path errors.txt
          done
          # Re-stage the results directory from .userbenchmark.
          # NOTE(review): presumably this picks up files written during detection
          # (a later step searches benchmark-output for regression-*.yaml) - confirm.
          rm -r ../benchmark-output || true
          cp -r ./.userbenchmark/torch-nightly ../benchmark-output
      - name: Copy artifact and upload to scribe and Amazon S3
        run: |
          source "${SETUP_SCRIPT}"
          pushd benchmark
          # Pick the metrics file whose path comes first in reverse sort order.
          metrics_json="$(find ../benchmark-output/ -name "metrics-*.json" | sort -r | head -n 1)"
          echo "Benchmark result file: ${metrics_json}"
          # Send the result json to Scribe
          python ./scripts/userbenchmark/upload_scribe.py \
            --userbenchmark_json "${metrics_json}" \
            --userbenchmark_platform "${PLATFORM_NAME}"
          # Send the same result json to Amazon S3
          python ./scripts/userbenchmark/upload_s3.py \
            --upload-file "${metrics_json}" \
            --userbenchmark_platform "${PLATFORM_NAME}"
      - name: Copy regression results to Amazon S3 and kick off bisection
        # Runs only when TORCHBENCH_REGRESSION_DETECTED is set to a non-empty value.
        if: env.TORCHBENCH_REGRESSION_DETECTED
        env:
          # Hand both values to the script as environment variables instead of
          # expanding them into the script text with ${{ }}: this keeps the
          # secret out of the generated script file and prevents the values
          # from being interpreted as shell code.
          TORCHBENCH_ACCESS_TOKEN: ${{ secrets.TORCHBENCH_ACCESS_TOKEN }}
          REGRESSION_DATE: ${{ env.TORCHBENCH_REGRESSION_DETECTED }}
        run: |
          . "${SETUP_SCRIPT}"
          pushd benchmark
          LATEST_REGRESSION_RESULT=$(find ../benchmark-output/ -name "regression-*.yaml" | sort -r | head -1)
          # Upload the regression yaml to Amazon S3
          python ./scripts/userbenchmark/upload_s3.py --upload-file "${LATEST_REGRESSION_RESULT}" --userbenchmark_platform "${PLATFORM_NAME}"
          # Get the workflow ID from
          # https://api.github.com/repos/pytorch/benchmark/actions/workflows
          # And dispatch the bisection workflow
          payload="{\"ref\": \"main\", \"inputs\": {\"regression_date\": \"${REGRESSION_DATE}\"}}"
          curl -u "xuzhao9:${TORCHBENCH_ACCESS_TOKEN}" \
            -X POST \
            -H "Accept: application/vnd.github.v3+json" \
            https://api.github.com/repos/pytorch/benchmark/actions/workflows/57994037/dispatches \
            -d "${payload}"
      - name: Upload result to GH Actions Artifact
        # v3 of the artifact actions has been retired by GitHub, so use v4.
        # NOTE(review): v4 needs a runner recent enough to run Node 20 actions;
        # confirm the self-hosted runner is up to date.
        uses: actions/upload-artifact@v4
        with:
          name: TorchBench V3 result
          # Directory staged by the earlier benchmark steps.
          path: benchmark-output/
      - name: Clean up Conda env
        # Run even when an earlier step failed, so the cloned env is not left behind.
        if: always()
        run: |
          . "${SETUP_SCRIPT}"
          # NOTE(review): two deactivations presumably unwind a nested activation
          # performed by the setup script - confirm.
          conda deactivate && conda deactivate
          # --yes: the shell is non-interactive, so conda's confirmation prompt
          # cannot be answered; without it the environment may not be removed.
          conda remove --yes -n "${CONDA_ENV}" --all