Skip to content

Commit 9b9eeb9

Browse files
Merge pull request #48 from RuiApostolo/20240909_hello_world
Changes to configurations and fixes to several tests
2 parents 518b122 + 0c5cae9 commit 9b9eeb9

File tree

8 files changed

+116
-88
lines changed

8 files changed

+116
-88
lines changed

configuration/archer2.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def command(self, job):
3939
"--qos=standard",
4040
],
4141
"environs": ["PrgEnv-gnu", "PrgEnv-cray", "PrgEnv-aocc"],
42-
"max_jobs": 16,
42+
"max_jobs": 64,
4343
"processor": {
4444
"num_cpus": 128,
4545
"num_cpus_per_socket": 64,
@@ -49,6 +49,7 @@ def command(self, job):
4949
{
5050
"name": "compute-gpu",
5151
"descr": "Compute nodes with AMD GPUs",
52+
"max_jobs": 2,
5253
"features": ["gpu"],
5354
"scheduler": "slurm",
5455
"launcher": "srun",
@@ -68,8 +69,9 @@ def command(self, job):
6869
},
6970
{
7071
"name": "compute-gpu-torch",
71-
"descr": "Compute nodes with AMD GPUs",
72-
"features": ["gpu"],
72+
"descr": "Compute nodes with AMD GPUs, and torch launcher",
73+
"max_jobs": 2,
74+
"features": ["torch"],
7375
"scheduler": "slurm",
7476
"launcher": "torchrun",
7577
"access": ["--partition=gpu"],
@@ -122,6 +124,7 @@ def command(self, job):
122124
"craype-accel-amd-gfx90a",
123125
"craype-x86-milan",
124126
],
127+
"features": ["gpu"],
125128
"cc": "cc",
126129
"cxx": "CC",
127130
"ftn": "ftn",
@@ -130,6 +133,7 @@ def command(self, job):
130133
{
131134
"name": "rocm-PrgEnv-cray",
132135
"modules": ["PrgEnv-cray"],
136+
"features": ["gpu"],
133137
"cc": "cc",
134138
"cxx": "CC",
135139
"ftn": "ftn",
@@ -138,6 +142,7 @@ def command(self, job):
138142
{
139143
"name": "rocm-PrgEnv-aocc",
140144
"modules": ["PrgEnv-aocc"],
145+
"features": ["gpu"],
141146
"cc": "cc",
142147
"cxx": "CC",
143148
"ftn": "ftn",

configuration/cirrus.py

Lines changed: 3 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
"--distribution=block:block",
5050
"--partition=highmem",
5151
],
52-
"max_jobs": 16,
52+
"max_jobs": 2,
5353
"environs": ["gcc", "intel"],
5454
"resources": [
5555
{
@@ -73,7 +73,7 @@
7373
"--partition=gpu",
7474
],
7575
"max_jobs": 4,
76-
"environs": ["nvidia-mpi"],
76+
"environs": ["Default", "nvidia-mpi"],
7777
"resources": [
7878
{"name": "qos", "options": ["--qos={qos}"]},
7979
{
@@ -88,26 +88,6 @@
8888
},
8989
"devices": [{"type": "gpu", "num_devices": 4}],
9090
},
91-
{
92-
"name": "compute-gpu-default",
93-
"descr": "Compute nodes with GPUs but doesn't load nvcc compilers or mpi",
94-
"scheduler": "slurm",
95-
"launcher": "srun",
96-
"access": [
97-
"--partition=gpu",
98-
],
99-
"max_jobs": 4,
100-
"environs": ["Default"],
101-
"resources": [
102-
{"name": "qos", "options": ["--qos={qos}"]},
103-
],
104-
"processor": {
105-
"num_cpus": 40,
106-
"num_cpus_per_socket": 20,
107-
"num_sockets": 2,
108-
},
109-
"devices": [{"type": "gpu", "num_devices": 4}],
110-
},
11191
],
11292
}
11393
],
@@ -139,6 +119,7 @@
139119
},
140120
{
141121
"name": "Default",
122+
"features": ["default"],
142123
"cc": "gcc",
143124
"ftn": "gfortran",
144125
"target_systems": ["cirrus"],

tests/apps/lammps/ethanol.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,14 +94,12 @@ def setup_nnodes(self):
9494
elif self.current_system.name in ["cirrus"]:
9595
self.executable_opts = LAMMPSBaseEthanol.executable_opts + ["-sf gpu -pk gpu 4"]
9696
self.extra_resources["qos"] = {"qos": "short"}
97-
self.num_tasks_per_node = 40
97+
# self.num_tasks_per_node = 40
9898

9999
@run_after("setup")
100100
def setup_gpu_options(self):
101101
"""sets up different resources for gpu systems"""
102-
self.env_vars["PARAMS"] = (
103-
f'"--exclusive --ntasks={self.num_tasks_per_node} --tasks-per-node={self.num_tasks_per_node}"'
104-
)
102+
self.env_vars["PARAMS"] = "--exclusive --ntasks=40 --tasks-per-node=40"
105103
# Cirrus Slurm demands it be done this way.
106104
# Trying to add $PARAMS directly to job.launcher.options fails.
107105
if self.current_system.name in ["cirrus"]:

tests/compile/hello/hello.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ def assert_finished(self):
2727
class HelloTestCPU(HelloTestBase):
2828
"""CPU systems test class"""
2929

30-
valid_systems = ["*"]
31-
valid_prog_environs = ["-gpu"]
30+
valid_systems = ["-gpu"]
31+
valid_prog_environs = ["-gpu -default"]
3232
extra_resources = {
3333
"qos": {"qos": "standard"},
3434
}
@@ -38,11 +38,17 @@ class HelloTestCPU(HelloTestBase):
3838
class HelloTestGPU(HelloTestBase):
3939
"""GPU systems test class"""
4040

41-
valid_systems = ["+gpu"]
41+
valid_systems = ["-torch"]
4242
valid_prog_environs = ["+gpu"]
4343
extra_resources = {
4444
"qos": {"qos": "gpu"},
4545
"gpu": {"num_gpus_per_node": "1"},
4646
}
4747
num_tasks = None
4848
num_cpus_per_task = None
49+
50+
@run_after("setup")
51+
def setup_gpu_options(self):
52+
"""Change qos for ARCHER2"""
53+
if self.current_system.name in ["archer2"]:
54+
self.extra_resources["qos"]["qos"] = "gpu-shd"

tests/mlperf/cosmoflow/gpu.py

Lines changed: 30 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -12,30 +12,20 @@ class CosmoFlowGPUBenchmark(CosmoFlowBaseCheck):
1212
"""Cosmoflow GPU benchmark"""
1313

1414
valid_prog_environs = ["Default", "rocm-PrgEnv-gnu"]
15-
valid_systems = ["cirrus:compute-gpu-default", "archer2:compute-gpu-torch"]
15+
valid_systems = ["cirrus:compute-gpu", "archer2:compute-gpu-torch"]
1616
descr = "CosmoFlow GPU Benchmark"
1717

1818
num_tasks = None
1919
num_gpus = parameter([4]) # parameter(1 << pow for pow in range(7))
20-
lbs = parameter([8])
20+
# Due to memory, Cirrus is limited to a lbs of 2
21+
lbs = parameter([2])
2122

2223
time_limit = "1h"
2324
num_nodes = 1
2425

2526
@run_after("init")
2627
def setup_systems(self):
2728
"""Setup environment"""
28-
self.executable_opts = [
29-
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
30-
"--config",
31-
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/archer2_config.yaml",
32-
"--device",
33-
"cuda",
34-
"-lbs",
35-
f"{self.lbs}",
36-
# "--t_subset_size", "2048",
37-
# "--v_subset_size", "512"
38-
]
3929
if self.current_system.name in ["archer2"]:
4030
self.executable = ""
4131
self.extra_resources = {
@@ -52,22 +42,46 @@ def setup_systems(self):
5242
"LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD",
5343
"HOME": "$PWD",
5444
}
45+
self.executable_opts = [
46+
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
47+
"--config",
48+
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/archer2_config.yaml",
49+
"--device",
50+
"cuda",
51+
"-lbs",
52+
"8",
53+
# "--t_subset_size", "2048",
54+
# "--v_subset_size", "512"
55+
]
5556

5657
elif self.current_system.name in ["cirrus"]:
5758
self.executable = "python"
5859
self.extra_resources = {
5960
"qos": {"qos": "gpu"},
6061
}
61-
self.modules = ["openmpi/4.1.5-cuda-11.6"]
62+
self.modules = ["openmpi/4.1.6-cuda-11.6"]
6263
self.prerun_cmds = [
63-
'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"',
64-
"conda activate mlperf_torch",
64+
'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"',
65+
"conda activate torch_mlperf",
6566
]
6667
self.env_vars = {
6768
"OMP_NUM_THREADS": "5",
6869
"SRUN_CPUS_PER_TASK": "5",
6970
"OMPI_MCA_mpi_warn_on_fork": "0",
7071
}
72+
self.executable_opts = [
73+
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
74+
"--config",
75+
"/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/cirrus_config.yaml",
76+
"--device",
77+
"cuda",
78+
"--data-dir",
79+
"/work/z04/shared/mlperf-hpc/cosmoflow/mini/cosmoUniverse_2019_05_4parE_tf_v2_mini ",
80+
"-lbs",
81+
"2",
82+
# "--t_subset_size", "2048",
83+
# "--v_subset_size", "512"
84+
]
7185

7286
@run_before("run")
7387
def set_task_distribution(self):

tests/mlperf/deepcam/gpu.py

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,30 +12,20 @@ class DeepCamGPUBenchmark(DeepCamBaseCheck):
1212
"""Class for deepcam tests on gpus"""
1313

1414
valid_prog_environs = ["Default", "rocm-PrgEnv-gnu"]
15-
valid_systems = ["cirrus:compute-gpu-default", "archer2:compute-gpu-torch"]
15+
valid_systems = ["cirrus:compute-gpu", "archer2:compute-gpu-torch"]
1616
descr = "Deepcam GPU Benchmark"
1717

1818
num_tasks = None
1919
num_gpus = parameter([4]) # parameter(1 << pow for pow in range(7))
20-
lbs = parameter([8])
20+
# Due to memory, Cirrus is limited to a lbs of 2
21+
# lbs = parameter([2])
2122

2223
time_limit = "1h"
2324
num_nodes = 1
2425

2526
@run_after("init")
2627
def setup_systems(self):
2728
"""Setup environment"""
28-
self.executable_opts = [
29-
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
30-
"--config",
31-
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/archer2benchmark_config.yaml",
32-
"--device",
33-
"cuda",
34-
"-lbs",
35-
f"{self.lbs}",
36-
# "--t_subset_size", "1024",
37-
# "--v_subset_size", "512"
38-
]
3929
if self.current_system.name in ["archer2"]:
4030
self.executable = ""
4131
self.extra_resources = {
@@ -52,22 +42,44 @@ def setup_systems(self):
5242
"LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD",
5343
"HOME": "$PWD",
5444
}
45+
self.executable_opts = [
46+
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
47+
"--config",
48+
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/archer2benchmark_config.yaml",
49+
"--device",
50+
"cuda",
51+
"-lbs",
52+
"8",
53+
# "--t_subset_size", "1024",
54+
# "--v_subset_size", "512"
55+
]
5556

5657
elif self.current_system.name in ["cirrus"]:
5758
self.executable = "python"
5859
self.extra_resources = {
5960
"qos": {"qos": "gpu"},
6061
}
61-
self.modules = ["openmpi/4.1.5-cuda-11.6"]
62+
self.modules = ["openmpi/4.1.6-cuda-11.6"]
6263
self.prerun_cmds = [
63-
'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"',
64-
"conda activate mlperf_torch",
64+
'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"',
65+
"conda activate torch_mlperf",
6566
]
6667
self.env_vars = {
6768
"OMP_NUM_THREADS": "5",
6869
"SRUN_CPUS_PER_TASK": "5",
6970
"OMPI_MCA_mpi_warn_on_fork": "0",
7071
}
72+
self.executable_opts = [
73+
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
74+
"--config",
75+
"/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/cirrusbenchmark_config.yaml",
76+
"--device",
77+
"cuda",
78+
"-lbs",
79+
"2",
80+
# "--t_subset_size", "1024",
81+
# "--v_subset_size", "512"
82+
]
7183

7284
@run_before("run")
7385
def set_task_distribution(self):
@@ -96,5 +108,3 @@ def setup_gpu_options(self):
96108
self.job.launcher.options.append(
97109
f"--ntasks={self.num_gpus} --tasks-per-node={self.num_gpus if self.num_gpus <= 4 else 4}"
98110
)
99-
100-
# ----------------------------------------------------------------------------

0 commit comments

Comments
 (0)