Merge pull request #48 from RuiApostolo/20240909_hello_world

eleanor-broadway · web-flow · commit 9b9eeb9ef51f · 2024-09-25T14:55:16.000+01:00
Changes to configurations and fixes to several tests
diff --git a/configuration/archer2.py b/configuration/archer2.py
@@ -39,7 +39,7 @@ def command(self, job):
                         "--qos=standard",
                     ],
                     "environs": ["PrgEnv-gnu", "PrgEnv-cray", "PrgEnv-aocc"],
-                    "max_jobs": 16,
+                    "max_jobs": 64,
                     "processor": {
                         "num_cpus": 128,
                         "num_cpus_per_socket": 64,
@@ -49,6 +49,7 @@ def command(self, job):
                 {
                     "name": "compute-gpu",
                     "descr": "Compute nodes with AMD GPUs",
+                    "max_jobs": 2,
                     "features": ["gpu"],
                     "scheduler": "slurm",
                     "launcher": "srun",
@@ -68,8 +69,9 @@ def command(self, job):
                 },
                 {
                     "name": "compute-gpu-torch",
-                    "descr": "Compute nodes with AMD GPUs",
-                    "features": ["gpu"],
+                    "descr": "Compute nodes with AMD GPUs, and torch launcher",
+                    "max_jobs": 2,
+                    "features": ["torch"],
                     "scheduler": "slurm",
                     "launcher": "torchrun",
                     "access": ["--partition=gpu"],
@@ -122,6 +124,7 @@ def command(self, job):
                 "craype-accel-amd-gfx90a",
                 "craype-x86-milan",
             ],
+            "features": ["gpu"],
             "cc": "cc",
             "cxx": "CC",
             "ftn": "ftn",
@@ -130,6 +133,7 @@ def command(self, job):
         {
             "name": "rocm-PrgEnv-cray",
             "modules": ["PrgEnv-cray"],
+            "features": ["gpu"],
             "cc": "cc",
             "cxx": "CC",
             "ftn": "ftn",
@@ -138,6 +142,7 @@ def command(self, job):
         {
             "name": "rocm-PrgEnv-aocc",
             "modules": ["PrgEnv-aocc"],
+            "features": ["gpu"],
             "cc": "cc",
             "cxx": "CC",
             "ftn": "ftn",
diff --git a/configuration/cirrus.py b/configuration/cirrus.py
@@ -49,7 +49,7 @@
                         "--distribution=block:block",
                         "--partition=highmem",
                     ],
-                    "max_jobs": 16,
+                    "max_jobs": 2,
                     "environs": ["gcc", "intel"],
                     "resources": [
                         {
@@ -73,7 +73,7 @@
                         "--partition=gpu",
                     ],
                     "max_jobs": 4,
-                    "environs": ["nvidia-mpi"],
+                    "environs": ["Default", "nvidia-mpi"],
                     "resources": [
                         {"name": "qos", "options": ["--qos={qos}"]},
                         {
@@ -88,26 +88,6 @@
                     },
                     "devices": [{"type": "gpu", "num_devices": 4}],
                 },
-                {
-                    "name": "compute-gpu-default",
-                    "descr": "Compute nodes with GPUs but doesn't load nvcc compilers or mpi",
-                    "scheduler": "slurm",
-                    "launcher": "srun",
-                    "access": [
-                        "--partition=gpu",
-                    ],
-                    "max_jobs": 4,
-                    "environs": ["Default"],
-                    "resources": [
-                        {"name": "qos", "options": ["--qos={qos}"]},
-                    ],
-                    "processor": {
-                        "num_cpus": 40,
-                        "num_cpus_per_socket": 20,
-                        "num_sockets": 2,
-                    },
-                    "devices": [{"type": "gpu", "num_devices": 4}],
-                },
             ],
         }
     ],
@@ -139,6 +119,7 @@
         },
         {
             "name": "Default",
+            "features": ["default"],
             "cc": "gcc",
             "ftn": "gfortran",
             "target_systems": ["cirrus"],
diff --git a/tests/apps/lammps/ethanol.py b/tests/apps/lammps/ethanol.py
@@ -94,14 +94,12 @@ def setup_nnodes(self):
         elif self.current_system.name in ["cirrus"]:
             self.executable_opts = LAMMPSBaseEthanol.executable_opts + ["-sf gpu -pk gpu 4"]
             self.extra_resources["qos"] = {"qos": "short"}
-            self.num_tasks_per_node = 40
+            #  self.num_tasks_per_node = 40
 
     @run_after("setup")
     def setup_gpu_options(self):
         """sets up different resources for gpu systems"""
-        self.env_vars["PARAMS"] = (
-            f'"--exclusive --ntasks={self.num_tasks_per_node} --tasks-per-node={self.num_tasks_per_node}"'
-        )
+        self.env_vars["PARAMS"] = "--exclusive --ntasks=40 --tasks-per-node=40"
         # Cirru slurm demands it be done this way.
         # Trying to add $PARAMS directly to job.launcher.options fails.
         if self.current_system.name in ["cirrus"]:
diff --git a/tests/compile/hello/hello.py b/tests/compile/hello/hello.py
@@ -27,8 +27,8 @@ def assert_finished(self):
 class HelloTestCPU(HelloTestBase):
     """CPU systems test class"""
 
-    valid_systems = ["*"]
-    valid_prog_environs = ["-gpu"]
+    valid_systems = ["-gpu"]
+    valid_prog_environs = ["-gpu -default"]
     extra_resources = {
         "qos": {"qos": "standard"},
     }
@@ -38,11 +38,17 @@ class HelloTestCPU(HelloTestBase):
 class HelloTestGPU(HelloTestBase):
     """GPU systems test class"""
 
-    valid_systems = ["+gpu"]
+    valid_systems = ["-torch"]
     valid_prog_environs = ["+gpu"]
     extra_resources = {
         "qos": {"qos": "gpu"},
         "gpu": {"num_gpus_per_node": "1"},
     }
     num_tasks = None
     num_cpus_per_task = None
+
+    @run_after("setup")
+    def setup_gpu_options(self):
+        """Change qos for ARCHER2"""
+        if self.current_system.name in ["archer2"]:
+            self.extra_resources["qos"]["qos"] = "gpu-shd"
diff --git a/tests/mlperf/cosmoflow/gpu.py b/tests/mlperf/cosmoflow/gpu.py
@@ -12,30 +12,20 @@ class CosmoFlowGPUBenchmark(CosmoFlowBaseCheck):
     """Cosmoflow GPU benchmark"""
 
     valid_prog_environs = ["Default", "rocm-PrgEnv-gnu"]
-    valid_systems = ["cirrus:compute-gpu-default", "archer2:compute-gpu-torch"]
+    valid_systems = ["cirrus:compute-gpu", "archer2:compute-gpu-torch"]
     descr = "CosmoFlow GPU Benchmark"
 
     num_tasks = None
     num_gpus = parameter([4])  # parameter(1 << pow for pow in range(7))
-    lbs = parameter([8])
+    # Due to memory, Cirrus is limited to an lbs of 2; ARCHER2 passes "-lbs 8" (see setup_systems)
+    lbs = parameter([2])
 
     time_limit = "1h"
     num_nodes = 1
 
     @run_after("init")
     def setup_systems(self):
         """Setup environment"""
-        self.executable_opts = [
-            "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
-            "--config",
-            "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/archer2_config.yaml",
-            "--device",
-            "cuda",
-            "-lbs",
-            f"{self.lbs}",
-            # "--t_subset_size", "2048",
-            # "--v_subset_size", "512"
-        ]
         if self.current_system.name in ["archer2"]:
             self.executable = ""
             self.extra_resources = {
@@ -52,22 +42,46 @@ def setup_systems(self):
                 "LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD",
                 "HOME": "$PWD",
             }
+            self.executable_opts = [
+                "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
+                "--config",
+                "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/archer2_config.yaml",
+                "--device",
+                "cuda",
+                "-lbs",
+                "8",
+                # "--t_subset_size", "2048",
+                # "--v_subset_size", "512"
+            ]
 
         elif self.current_system.name in ["cirrus"]:
             self.executable = "python"
             self.extra_resources = {
                 "qos": {"qos": "gpu"},
             }
-            self.modules = ["openmpi/4.1.5-cuda-11.6"]
+            self.modules = ["openmpi/4.1.6-cuda-11.6"]
             self.prerun_cmds = [
-                'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"',
-                "conda activate mlperf_torch",
+                'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"',
+                "conda activate torch_mlperf",
             ]
             self.env_vars = {
                 "OMP_NUM_THREADS": "5",
                 "SRUN_CPUS_PER_TASK": "5",
                 "OMPI_MCA_mpi_warn_on_fork": "0",
             }
+            self.executable_opts = [
+                "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/train.py",
+                "--config",
+                "/work/z043/shared/chris-ml-intern/ML_HPC/CosmoFlow/Torch/configs/cirrus_config.yaml",
+                "--device",
+                "cuda",
+                "--data-dir",
+                "/work/z04/shared/mlperf-hpc/cosmoflow/mini/cosmoUniverse_2019_05_4parE_tf_v2_mini ",
+                "-lbs",
+                "2",
+                # "--t_subset_size", "2048",
+                # "--v_subset_size", "512"
+            ]
 
     @run_before("run")
     def set_task_distribution(self):
diff --git a/tests/mlperf/deepcam/gpu.py b/tests/mlperf/deepcam/gpu.py
@@ -12,30 +12,20 @@ class DeepCamGPUBenchmark(DeepCamBaseCheck):
     """Class for deepcam tests on gpus"""
 
     valid_prog_environs = ["Default", "rocm-PrgEnv-gnu"]
-    valid_systems = ["cirrus:compute-gpu-default", "archer2:compute-gpu-torch"]
+    valid_systems = ["cirrus:compute-gpu", "archer2:compute-gpu-torch"]
     descr = "Deepcam GPU Benchmark"
 
     num_tasks = None
     num_gpus = parameter([4])  # parameter(1 << pow for pow in range(7))
-    lbs = parameter([8])
+    # Due to memory, Cirrus is limited to an lbs of 2; ARCHER2 passes "-lbs 8" (see setup_systems)
+    # lbs = parameter([2])
 
     time_limit = "1h"
     num_nodes = 1
 
     @run_after("init")
     def setup_systems(self):
         """Setup environment"""
-        self.executable_opts = [
-            "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
-            "--config",
-            "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/archer2benchmark_config.yaml",
-            "--device",
-            "cuda",
-            "-lbs",
-            f"{self.lbs}",
-            # "--t_subset_size", "1024",
-            # "--v_subset_size", "512"
-        ]
         if self.current_system.name in ["archer2"]:
             self.executable = ""
             self.extra_resources = {
@@ -52,22 +42,44 @@ def setup_systems(self):
                 "LD_PRELOAD": "$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so:$LD_PRELOAD",
                 "HOME": "$PWD",
             }
+            self.executable_opts = [
+                "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
+                "--config",
+                "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/archer2benchmark_config.yaml",
+                "--device",
+                "cuda",
+                "-lbs",
+                "8",
+                # "--t_subset_size", "1024",
+                # "--v_subset_size", "512"
+            ]
 
         elif self.current_system.name in ["cirrus"]:
             self.executable = "python"
             self.extra_resources = {
                 "qos": {"qos": "gpu"},
             }
-            self.modules = ["openmpi/4.1.5-cuda-11.6"]
+            self.modules = ["openmpi/4.1.6-cuda-11.6"]
             self.prerun_cmds = [
-                'eval "$(/work/z043/shared/miniconda3/bin/conda shell.bash hook)"',
-                "conda activate mlperf_torch",
+                'eval "$(/work/z04/shared/ebroadwa/miniconda3/bin/conda shell.bash hook)"',
+                "conda activate torch_mlperf",
             ]
             self.env_vars = {
                 "OMP_NUM_THREADS": "5",
                 "SRUN_CPUS_PER_TASK": "5",
                 "OMPI_MCA_mpi_warn_on_fork": "0",
             }
+            self.executable_opts = [
+                "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/train.py",
+                "--config",
+                "/work/z043/shared/chris-ml-intern/ML_HPC/DeepCAM/Torch/configs/cirrusbenchmark_config.yaml",
+                "--device",
+                "cuda",
+                "-lbs",
+                "2",
+                # "--t_subset_size", "1024",
+                # "--v_subset_size", "512"
+            ]
 
     @run_before("run")
     def set_task_distribution(self):
@@ -96,5 +108,3 @@ def setup_gpu_options(self):
             self.job.launcher.options.append(
                 f"--ntasks={self.num_gpus} --tasks-per-node={self.num_gpus if self.num_gpus <= 4 else 4}"
             )
-
-    # ----------------------------------------------------------------------------
diff --git a/tests/mlperf/resnet50/gpu.py b/tests/mlperf/resnet50/gpu.py
diff --git a/tests/mlperf/resnet50/graphcore.py b/tests/mlperf/resnet50/graphcore.py