-
Notifications
You must be signed in to change notification settings - Fork 68
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
GPU-Aware MPI on OLCF Frontier and Combined weak- & strong-scaling ca…
…se (#448)
- Loading branch information
1 parent
daa8e85
commit 31aed98
Showing
30 changed files
with
398 additions
and
169 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -193,6 +193,5 @@ | |
'Mono(1)%pulse' : 1, | ||
'Mono(1)%mag' : 1., | ||
'Mono(1)%length' : 0.2, | ||
'cu_mpi' : 'F', | ||
|
||
'rdma_mpi' : 'F', | ||
})) |
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# Strong- & Weak-scaling | ||
|
||
The [**Scaling**](case.py) case can exercise both weak- and strong-scaling. It | ||
adjusts itself depending on the number of requested ranks. | ||
|
||
This directory also contains a collection of scripts used to test strong-scaling | ||
on OLCF Frontier. They required modifying MFC to collect some metrics but are | ||
meant to serve as a reference to users wishing to run similar experiments. | ||
|
||
## Weak Scaling | ||
|
||
Pass `--scaling weak`. The `--memory` option controls (approximately) how much | ||
memory each rank should use, in Gigabytes. The number of cells in each dimension | ||
is then adjusted according to the number of requested ranks and an approximation | ||
for the relation between cell count and memory usage. The problem size increases | ||
linearly with the number of ranks. | ||
|
||
## Strong Scaling | ||
|
||
Pass `--scaling strong`. The `--memory` option controls (approximately) how much | ||
memory should be used in total during simulation, across all ranks, in Gigabytes. | ||
The problem size remains constant as the number of ranks increases. | ||
|
||
## Example | ||
|
||
For example, to run a weak-scaling test that uses ~4GB of GPU memory per rank | ||
on 8 nodes with 2 ranks each, with case optimization, one could run: | ||
|
||
```shell | ||
./mfc.sh run examples/scaling/case.py -t pre_process simulation \ | ||
-e batch -p mypartition -N 8 -n 2 -w "01:00:00" -# "MFC Weak Scaling" \ | ||
--case-optimization -j 32 -- --scaling weak --memory 4 | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#!/bin/bash | ||
|
||
# Build the pre_process and simulation targets with case optimization, using
# examples/scaling/case.py as the input case, with GPU and MPI support enabled
# and debug disabled.
# NOTE(review): the arguments after `--` are presumably forwarded to case.py;
# `-s strong -m 512` look like short forms of `--scaling strong --memory 512`
# (strong scaling, ~512 GB total) — confirm against case.py.
./mfc.sh build -t pre_process simulation --case-optimization -i examples/scaling/case.py \ | ||
-j 8 --gpu --mpi --no-debug -- -s strong -m 512 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
import re, os, csv, glob, statistics | ||
|
||
from dataclasses import dataclass, fields | ||
|
||
# Absolute path of the scaling example directory. The relative path
# "examples/scaling" is resolved against the current working directory, so the
# result depends on where the script is launched from.
CDIR = os.path.abspath(
    os.path.join("examples", "scaling")
)

# Directory (inside CDIR) that is searched for run logs.
LDIR = os.path.join(CDIR, "logs")
|
||
# Matches a decimal number written as digits, a dot, and digits, optionally
# followed by an upper-case exponent (e.g. "1.25E-03"). Plain integers such as
# "42" and lower-case exponents are intentionally left unmatched, exactly as in
# the original pattern.
_NUMBER_RE = re.compile(r"[0-9]+\.[0-9]+(?:E[-+][0-9]+)?")


def get_num(s: str) -> "float | None":
    """Return the first decimal number found in *s*, or ``None``.

    Args:
        s: Text to scan, typically a single log line.

    Returns:
        The first match of ``_NUMBER_RE`` converted to ``float``, or ``None``
        when *s* contains no such number or is not a string.
    """
    try:
        match = _NUMBER_RE.search(s)
    except TypeError:
        # Non-string input (e.g. None) is treated as "no number", as before,
        # but unrelated exceptions such as KeyboardInterrupt now propagate.
        return None

    return float(match.group(0)) if match else None


def get_nums(arr):
    """Return the set of numbers parsed from each string in *arr*.

    Each element is passed through ``get_num`` once; elements that yield no
    number are skipped. A parsed value of ``0.0`` is kept (it is a valid
    measurement, not a parse failure).

    NOTE(review): the result is a set, so identical samples collapse into one
    entry and a mean taken over it is a mean of *distinct* values — confirm
    that this is intended.
    """
    parsed = (get_num(item) for item in arr)
    return {value for value in parsed if value is not None}
|
||
@dataclass(order=True, frozen=True)
class Configuration:
    """Immutable, hashable and orderable description of one run.

    Instances compare and sort field by field in declaration order
    (``nodes``, then ``mem``, then ``rdma_mpi``).
    """

    # Number of nodes used by the run.
    nodes: int
    # Memory setting of the run (units are not established in this file).
    mem: int
    # Whether the run was made with the RDMA MPI option switched on.
    rdma_mpi: bool
|
||
@dataclass
class Result:
    """Timing figures collected for a single run."""

    # Mean of the values parsed from the log's " TS" lines.
    ts_avg: float
    # Mean of the values parsed from the log's " MPI" lines.
    mpi_avg: float
    # Number parsed from the log's "Init took" line.
    init_t: float
    # Number parsed from the log's "sim_duration" line.
    sim_t: float
|
||
# Parsed results, keyed by the Configuration encoded in each log file name.
runs = {}

for logpath in glob.glob(os.path.join(LDIR, "run-*-sim*")):
    # Read the whole log and release the file handle immediately.
    with open(logpath, "r") as logfile:
        logdata = logfile.read()

    # Per-line timing samples: lines starting with " TS " / " MPI ".
    tss = get_nums(re.findall(r'^ TS .+', logdata, re.MULTILINE))
    mpis = get_nums(re.findall(r'^ MPI .+', logdata, re.MULTILINE))

    # The " Performance:" line is optional; fall back to 'N/A' when absent.
    perf_lines = re.findall(r"^ Performance: .+", logdata, re.MULTILINE)
    perf = get_num(perf_lines[0]) if perf_lines else 'N/A'

    # Use -1.0 as a sentinel average when a log has no samples of a kind, so
    # statistics.mean() below never sees an empty collection.
    if not tss:
        tss = [-1.0]
    if not mpis:
        mpis = [-1.0]

    # File names are split on '-': element 1 is the node count, element 2 the
    # memory setting and element 3 the RDMA flag ('T' means enabled).
    pathels = os.path.relpath(logpath, LDIR).split('-')

    runs[Configuration(
        nodes=int(pathels[1]),
        mem=int(pathels[2]),
        rdma_mpi=pathels[3] == 'T',
    )] = Result(
        ts_avg=statistics.mean(tss),
        mpi_avg=statistics.mean(mpis),
        # Unlike the samples above there is no fallback for these two lines:
        # a log without them raises IndexError and aborts the script.
        init_t=get_num(re.findall(r"Init took .+", logdata, re.MULTILINE)[0]),
        sim_t=get_num(re.findall(r"sim_duration .+", logdata, re.MULTILINE)[0]),
    )
|
||
# Dump every parsed run to export.csv: one header row, then one row per
# configuration in sorted order. newline="" is what the csv module requires
# for files it writes, so no extra blank lines appear on platforms that
# translate line endings.
with open(os.path.join(CDIR, "export.csv"), "w", newline="") as f:
    writer = csv.writer(f, delimiter=',')

    # Header: Configuration field names followed by Result field names.
    writer.writerow([
        _.name for _ in fields(Configuration) + fields(Result)
    ])

    for cfg in sorted(runs):
        writer.writerow(
            [getattr(cfg, _.name) for _ in fields(Configuration)] +
            [getattr(runs[cfg], _.name) for _ in fields(Result)]
        )
|
||
# Write one strong-scaling table per RDMA setting: strong_scaling.csv for runs
# without it and strong_scaling-rdma_mpi.csv for runs with it.
for rdma_mpi in (False, True):
    # Node counts that have at least one run with this RDMA setting, ascending.
    # Computed once here instead of once per (nodes, mem) pair.
    node_counts = sorted({
        _.nodes for _ in runs if _.rdma_mpi == rdma_mpi
    })

    with open(
        os.path.join(CDIR, f"strong_scaling{'-rdma_mpi' if rdma_mpi else ''}.csv"),
        "w",
        newline="",  # required by the csv module for files it writes
    ) as f:
        writer = csv.writer(f, delimiter=',')

        for nodes in node_counts:
            # First column is nodes*8 — presumably the rank count at 8 ranks
            # per node; confirm against the job scripts.
            row = (nodes*8,)

            # Memory settings available at this node count, largest first.
            mems = sorted({
                _.mem for _ in runs if _.nodes == nodes and _.rdma_mpi == rdma_mpi
            }, reverse=True)

            for mem in mems:
                # Reference run: same memory setting at the smallest node
                # count. Raises KeyError if that combination was not run.
                ref = runs[Configuration(nodes=node_counts[0], mem=mem, rdma_mpi=rdma_mpi)]
                run = runs[Configuration(nodes=nodes, mem=mem, rdma_mpi=rdma_mpi)]

                # Append the measured time and the reference time divided by
                # the node count.
                # NOTE(review): ref.sim_t/nodes is an ideal-scaling time only
                # when the smallest node count is 1 — confirm.
                row = (*row, run.sim_t, ref.sim_t/nodes)

            writer.writerow(row)
Oops, something went wrong.