Skip to content

Commit

Permalink
Relentless fix (#605)
Browse files — browse the repository at this point in the history
  • Loading branch information
aricer123 authored Sep 5, 2024
1 parent 4ad7a41 commit bac18ff
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 22 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/frontier/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
gpus=`rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' '`
ngpus=`echo "$gpus" | tr -d '[:space:]' | wc -c`

- ./mfc.sh test -j $ngpus --sys-hdf5 --sys-fftw -- -c frontier
+ ./mfc.sh test --max-attempts 3 -j $ngpus --sys-hdf5 --sys-fftw -- -c frontier

3 changes: 2 additions & 1 deletion .github/workflows/phoenix/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@ if [ "$job_device" == "gpu" ]; then
n_test_threads=`expr $gpu_count \* 2`
fi

- ./mfc.sh test -a -j $n_test_threads $device_opts -- -c phoenix
+ ./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix


4 changes: 2 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ jobs:
- name: Test
run: |
if [ '${{ matrix.intel }}' == 'true' ]; then source /opt/intel/oneapi/setvars.sh; fi
- /bin/bash mfc.sh test -j $(nproc) $OPT1 $OPT2
+ /bin/bash mfc.sh test --max-attempts 3 -j $(nproc) $OPT1 $OPT2
env:
OPT1: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
OPT2: ${{ matrix.debug == 'debug' && '-% 20' || '' }}
Expand All @@ -109,7 +109,7 @@ jobs:
uses: actions/checkout@v4

- name: Test
- run: sudo ./mfc.sh docker ./mfc.sh test -j $(nproc) -a
+ run: sudo ./mfc.sh docker ./mfc.sh test --max-attempts 3 -j $(nproc) -a

self:
name: Self Hosted
Expand Down
4 changes: 1 addition & 3 deletions toolchain/mfc/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,11 @@ def add_common_arguments(p, mask = None):
test.add_argument("-f", "--from", default=test_cases[0].get_uuid(), type=str, help="First test UUID to run.")
test.add_argument("-t", "--to", default=test_cases[-1].get_uuid(), type=str, help="Last test UUID to run.")
test.add_argument("-o", "--only", nargs="+", type=str, default=[], metavar="L", help="Only run tests with specified properties.")
- test.add_argument("-r", "--relentless", action="store_true", default=False, help="Run all tests, even if multiple fail.")
test.add_argument("-a", "--test-all", action="store_true", default=False, help="Run the Post Process Tests too.")
test.add_argument("-%", "--percent", type=int, default=100, help="Percentage of tests to run.")
- test.add_argument("-m", "--max-attempts", type=int, default=3, help="Maximum number of attempts to run a test.")
+ test.add_argument("-m", "--max-attempts", type=int, default=1, help="Maximum number of attempts to run a test.")
test.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.")
test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")

test_meg = test.add_mutually_exclusive_group()
test_meg.add_argument("--generate", action="store_true", default=False, help="(Test Generation) Generate golden files.")
test_meg.add_argument("--add-new-variables", action="store_true", default=False, help="(Test Generation) If new variables are found in D/ when running tests, add them to the golden files.")
Expand Down
24 changes: 9 additions & 15 deletions toolchain/mfc/test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@


nFAIL = 0
+ nPASS = 0
+ nSKIP = 0

def __filter(cases_) -> typing.List[TestCase]:
cases = cases_[:]
Expand Down Expand Up @@ -60,7 +62,7 @@ def __filter(cases_) -> typing.List[TestCase]:

def test():
# pylint: disable=global-statement, global-variable-not-assigned
- global nFAIL
+ global nFAIL, nPASS, nSKIP

cases = [ _.to_case() for _ in list_cases() ]

Expand Down Expand Up @@ -123,12 +125,9 @@ def test():
ARG("jobs"), ARG("gpus"))

cons.print()
- if nFAIL == 0:
- cons.print("Tested Simulation [bold green]✓[/bold green]")
- else:
- raise MFCException(f"Testing: Encountered [bold red]{nFAIL}[/bold red] failure(s).")

cons.unindent()
+ cons.print(f"\nTest Summary: [bold green]{nPASS}[/bold green] passed, [bold red]{nFAIL}[/bold red] failed, [bold yellow]{nSKIP}[/bold yellow] skipped.")
+ exit(nFAIL)


# pylint: disable=too-many-locals, too-many-branches, too-many-statements
Expand Down Expand Up @@ -217,8 +216,8 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):


def handle_case(case: TestCase, devices: typing.Set[int]):
- # pylint: disable=global-statement
- global nFAIL
+ # pylint: disable=global-statement, global-variable-not-assigned
+ global nFAIL, nPASS, nSKIP

nAttempts = 0

Expand All @@ -227,18 +226,13 @@ def handle_case(case: TestCase, devices: typing.Set[int]):

try:
_handle_case(case, devices)
+ nPASS += 1
except Exception as exc:
if nAttempts < ARG("max_attempts"):
cons.print(f"[bold yellow] Attempt {nAttempts}: Failed test {case.get_uuid()}. Retrying...[/bold yellow]")
continue

nFAIL += 1

cons.print(f"[bold red]Failed test {case} after {nAttempts} attempt(s).[/bold red]")

- if ARG("relentless"):
- cons.print(f"{exc}")
- else:
- raise exc
+ cons.print(f"{exc}")

return

0 comments on commit bac18ff

Please sign in to comment.