#49: GitHub CI Benchmarking

MFlowCode · Dec 15, 2023 · 00cc18b · 00cc18b
1 parent 371c51a
commit 00cc18b
Show file tree

Hide file tree

Showing 13 changed files with 172 additions and 111 deletions.
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
@@ -0,0 +1,37 @@
+name: 'Benchmark'
+
+on:
+  push:
+    paths:  # only pushes touching sources, tooling, or CI configuration
+      - '**.f90'
+      - '**.fpp'
+      - '**.py'
+      - '**.yml'
+      - 'mfc.sh'
+      - 'CMakeLists.txt'
+      - 'requirements.txt'
+
+  pull_request:
+
+  workflow_dispatch:
+
+jobs:
+  self:
+    name: Georgia Tech | Phoenix (NVHPC)
+    if: github.repository == 'MFlowCode/MFC'  # only runs in the MFlowCode/MFC repository
+    strategy:
+      matrix:
+        device: ['cpu', 'gpu']  # one job per device; the value is passed to submit.sh
+    runs-on:
+      group:  phoenix
+      labels: self-hosted
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+
+      - name: Bench  # submit.sh builds the Slurm job script around bench.sh
+        run: bash .github/workflows/phoenix/submit.sh .github/workflows/phoenix/bench.sh ${{ matrix.device }}
+
+      - name: Print  # runs even if Bench failed; bench-<device>.out is the Slurm job's output file
+        if: always()
+        run: cat bench-${{ matrix.device }}.out
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     - name: Login to Docker Hub
       uses: docker/login-action@v2

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     # We build doxygen from source because of
     # https://github.com/doxygen/doxygen/issues/9016

diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+./mfc.sh bench "$job_slug.yaml" -j $(nproc) -b mpirun  # job_slug is defined by the job script that submit.sh generates
diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+usage() {
+    echo "Usage: $0 [script.sh] [cpu|gpu]"
+}
+
+if [ ! -z "$1" ]; then
+    sbatch_script_contents=`cat $1`
+else
+    usage
+    exit 1
+fi
+
+sbatch_cpu_opts="\
+#SBATCH --ntasks-per-node=12       # Number of cores per node required
+#SBATCH --mem-per-cpu=2G           # Memory per core\
+"
+
+sbatch_gpu_opts="\
+#SBATCH -CV100-16GB
+#SBATCH -G2\
+"
+
+if [ "$2" == "cpu" ]; then
+    sbatch_device_opts="$sbatch_cpu_opts"
+elif [ "$2" == "gpu" ]; then
+    sbatch_device_opts="$sbatch_gpu_opts"
+else
+    usage
+    exit 1
+fi
+
+job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
+
+cat <<EOT
+#!/bin/bash
+#SBATCH -Jshb-$job_slug            # Job name
+#SBATCH --account=gts-sbryngelson3 # charge account
+#SBATCH -N1                        # Number of nodes required
+$sbatch_device_opts
+#SBATCH -t 04:00:00                # Duration of the job (Ex: 15 mins)
+#SBATCH -q embers                  # QOS Name
+#SBATCH -o$job_slug.out            # Combined output and error messages file
+#SBATCH -W                         # Do not exit until the submitted job terminates.
+
+set -x
+
+cd "\$SLURM_SUBMIT_DIR"
+echo "Running in $(pwd):"
+
+job_slug="$job_slug"
+job_device="$2"
+
+. ./mfc.sh load -c p -m $2
+
+$sbatch_script_contents
+
+EOT
diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+if [ "$job_device" == "gpu" ]; then  # job_device is defined by the job script that submit.sh generates
+    gpu_count=$(nvidia-smi -L | wc -l)        # number of GPUs on node
+    gpu_ids=$(seq -s ' ' 0 $(($gpu_count-1))) # 0 1 2 ... gpu_count-1 (space-separated)
+    device_opts="--gpu -g $gpu_ids"
+fi
+
+./mfc.sh test -a -b mpirun -j $(nproc) $device_opts  # device_opts is left unquoted so it splits into separate arguments
diff --git a/.github/workflows/ci.yml → .github/workflows/test.yml b/.github/workflows/ci.yml → .github/workflows/test.yml
@@ -33,7 +33,7 @@ jobs:
     runs-on: ${{ matrix.os }}-latest
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Setup MacOS
         if:   matrix.os == 'macos'
@@ -95,7 +95,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - name: Clone
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
 
     - name: Test
       run:  sudo ./mfc.sh docker ./mfc.sh test -j $(nproc) -a
@@ -112,21 +112,16 @@ jobs:
       labels: self-hosted
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Build
         run:  |
           . ./mfc.sh load -c p -m gpu
           ./mfc.sh build -j 2 $(if [ '${{ matrix.device }}' == 'gpu' ]; then echo '--gpu'; fi)
 
       - name: Test
-        run: |
-          . ./mfc.sh load -c p -m gpu
-          mv misc/run-phoenix-release-${{ matrix.device }}.sh ./
-          sbatch run-phoenix-release-${{ matrix.device }}.sh
+        run: bash .github/workflows/phoenix/submit.sh .github/workflows/phoenix/test.sh ${{ matrix.device }}
 
       - name: Print
         if: always()
-        run: |
-          cat test.out
-
+        run: cat test-${{ matrix.device }}.out
diff --git a/misc/run-phoenix-release-cpu.sh b/misc/run-phoenix-release-cpu.sh
diff --git a/misc/run-phoenix-release-gpu.sh b/misc/run-phoenix-release-gpu.sh
diff --git a/toolchain/bench.yaml b/toolchain/bench.yaml
@@ -0,0 +1,7 @@
+- name: 1D_bubblescreen  # each entry's keys are passed as keyword arguments to BenchCase in toolchain/mfc/bench.py
+  path: examples/1D_bubblescreen/case.py  # case file that is built and then run
+  args: []  # extra arguments for `./mfc.sh run`; arguments forwarded on the command line are appended
+
+- name: 1D_kapilashocktube
+  path: examples/1D_kapilashocktube/case.py
+  args: []
diff --git a/toolchain/mfc/args.py b/toolchain/mfc/args.py
@@ -122,7 +122,9 @@ def add_common_arguments(p, mask = None):
     run.add_argument("--wait",                 action="store_true",                       default=False,      help="(Batch) Wait for the job to finish.")
 
     # === BENCH ===
-    add_common_arguments(bench, "t")
+    add_common_arguments(bench, "tjgn")
+    bench.add_argument("output", metavar="OUTPUT", default=None, type=str, help="Path to the YAML output file to write the results to.")
+    bench.add_argument(metavar="FORWARDED", default=[], dest='forwarded', nargs=argparse.REMAINDER, help="Arguments to forward to the ./mfc.sh run invocations.")
 
     # === COUNT ===
     add_common_arguments(count, "g")

diff --git a/toolchain/mfc/bench.py b/toolchain/mfc/bench.py
@@ -1,62 +1,53 @@
-import os, json, time, typing, datetime, subprocess
-
-import rich.table
+import sys, time, subprocess, dataclasses
 
 from .printer import cons
-from .state   import ARG
+from .state   import ARG, CFG
 from .build   import PRE_PROCESS, SIMULATION, build_targets
-from .common  import system, MFC_SUBDIR
+from .common  import system, MFC_BENCH_FILEPATH, file_load_yaml, file_dump_yaml
 from .        import sched
 
+
+@dataclasses.dataclass
+class BenchCase:  # one benchmark case, built from an entry of the file at MFC_BENCH_FILEPATH
+    name: str  # label printed in the progress line
+    path: str  # case file passed to `./mfc.sh build --input` and to `./mfc.sh run`
+    args: list[str]  # extra arguments appended to the `./mfc.sh run` command
+
+
 def bench():
-    build_targets([PRE_PROCESS, SIMULATION])
-
+    cons.print()
     cons.print("[bold]Benchmarking [magenta]simulation[/magenta]:[/bold]")
     cons.indent()
-
-    CASES   = ["1D_bubblescreen", "1D_exercise_WENO", "1D_kapilashocktube"]
-    RESULTS = []
-
-    table = rich.table.Table(show_lines=False, show_edge=False)
-    table.add_column("Case")
-    table.add_column("(Simulation) Runtime (s)")
-
-    def __worker(case: str, devices: typing.Set[int]):
-        nonlocal RESULTS
-
-        system(["./mfc.sh", "run", f"examples/{case}/case.py", "--no-build", "-t", "pre_process"], stdout=subprocess.DEVNULL)
-        start   = time.monotonic()
-        system(["./mfc.sh", "run", f"examples/{case}/case.py", "--no-build", "-t", "simulation"], stdout=subprocess.DEVNULL)
-        end     = time.monotonic()
-        runtime = datetime.timedelta(seconds=end - start).total_seconds()
-
-        RESULTS.append({
-            "name":  f"Simulation: {case}",
-            "unit":  "seconds",
-            "value": runtime
-        })
-
-        table.add_row(case, str(runtime))
-
-    tasks: typing.List[sched.Task] = [
-        sched.Task(1, __worker, [ case ], 1) for case in CASES
-    ]
-
     cons.print()
-    nThreads = min(ARG('jobs'), len(ARG('gpus'))) if ARG("gpu") else ARG('jobs')
-    if ARG('case_optimization'):
-        nThreads = 1
 
-    sched.sched(tasks, nThreads, ARG("gpus"))
-    cons.print()
-    cons.unindent()
-    cons.print("[bold]Benchmark Results:[/bold]")
-    cons.print()
-    cons.raw.print(table)
-    cons.print()
-
-    filepath = os.path.join(MFC_SUBDIR, "bench.json")
-    with open(filepath, "w") as f:
-        json.dump(RESULTS, f)
-
-    cons.print(f"[bold green]✓[/bold green] Saved results to [magenta]{filepath}[/magenta].")
+    CASES = [ BenchCase(**case) for case in file_load_yaml(MFC_BENCH_FILEPATH) ]
+
+    for case in CASES:
+        case.args = case.args + ARG("forwarded")
+
+    cons.print(f"Found [magenta]{len(CASES)}[/magenta] cases.")
+
+    results = {
+        "metadata": {
+            "invocation": sys.argv[1:],
+            "lock":       dataclasses.asdict(CFG())
+        },
+        "cases": [],
+    }
+
+    for i, case in enumerate(CASES):
+        cons.print(f"{str(i+1).zfill(len(CASES) // 10 + 1)}/{len(CASES)}: {case.name} @ [bold]{case.path}[/bold]")
+        system(["./mfc.sh", "build", "--targets", "pre_process", "simulation", "--case-optimization", "--input", case.path], stdout=subprocess.DEVNULL)
+
+        case_results = dataclasses.asdict(case)
+
+        for target in [PRE_PROCESS, SIMULATION]:
+            start = time.time()
+            system(["./mfc.sh", "run", case.path, "--targets", target.name, "--case-optimization", *case.args], stdout=subprocess.DEVNULL)
+            case_results[target.name] = time.time() - start
+
+        results["cases"].append(case_results)
+
+    file_dump_yaml(ARG("output"), results)
+
+    cons.unindent()
diff --git a/toolchain/mfc/common.py b/toolchain/mfc/common.py
@@ -7,12 +7,11 @@
 from os.path import abspath, normpath, dirname, realpath
 
 
-MFC_ROOTDIR       = normpath(f"{dirname(realpath(__file__))}/../..")
-MFC_TESTDIR       = abspath(f"{MFC_ROOTDIR}/tests")
-MFC_SUBDIR        = abspath(f"{MFC_ROOTDIR}/build")
-MFC_DEV_FILEPATH  = abspath(f"{MFC_ROOTDIR}/toolchain/mfc.dev.yaml")
-MFC_USER_FILEPATH = abspath(f"{MFC_ROOTDIR}/defaults.yaml")
-MFC_LOCK_FILEPATH = abspath(f"{MFC_SUBDIR}/lock.yaml")
+MFC_ROOTDIR        = normpath(f"{dirname(realpath(__file__))}/../..")
+MFC_TESTDIR        = abspath(f"{MFC_ROOTDIR}/tests")
+MFC_SUBDIR         = abspath(f"{MFC_ROOTDIR}/build")
+MFC_LOCK_FILEPATH  = abspath(f"{MFC_SUBDIR}/lock.yaml")
+MFC_BENCH_FILEPATH = abspath(f"{MFC_ROOTDIR}/toolchain/bench.yaml")
 
 MFC_LOGO = f"""
      .=++*:          -+*+=.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		#!/bin/bash

		./mfc.sh bench "$job_slug.yaml" -j $(nproc) -b mpirun