From d665f05e83874551bb6cd0cf02faffc03db6d484 Mon Sep 17 00:00:00 2001 From: "Joshua A. Anderson" Date: Mon, 20 May 2024 17:36:54 -0400 Subject: [PATCH 1/7] Support Andes. --- doc/src/clusters/built-in.md | 7 +++++++ doc/src/guide/tutorial/submit.md | 1 + src/builtin.rs | 24 ++++++++++++++++++++++-- validate/validate.py | 9 +++++---- 4 files changed, 35 insertions(+), 6 deletions(-) diff --git a/doc/src/clusters/built-in.md b/doc/src/clusters/built-in.md index 06a3477..fd9a860 100644 --- a/doc/src/clusters/built-in.md +++ b/doc/src/clusters/built-in.md @@ -2,6 +2,13 @@ **Row** includes built-in support for the following clusters. +## Andes (OLCF) + +**Row** automatically selects from the following partitions on [Andes]: +* `batch` + +> Note: Andes has no shared partition. All jobs must use 32 CPUs per node. + ## Anvil (Purdue) **Row** automatically selects from the following partitions on [Anvil]: diff --git a/doc/src/guide/tutorial/submit.md b/doc/src/guide/tutorial/submit.md index 515fe55..d56453f 100644 --- a/doc/src/guide/tutorial/submit.md +++ b/doc/src/guide/tutorial/submit.md @@ -7,6 +7,7 @@ This section explains how to **submit** jobs to the **scheduler** with **row**. 
## Preliminary steps **Row** has built-in support for a number of [clusters](../../clusters/built-in.md): +* Andes (OLCF) * Anvil (Purdue) * Delta (NCSA) * Great Lakes (University of Michigan) diff --git a/src/builtin.rs b/src/builtin.rs index dbebbb0..39f3cc3 100644 --- a/src/builtin.rs +++ b/src/builtin.rs @@ -65,6 +65,26 @@ impl BuiltIn for launcher::Configuration { } } +fn andes() -> Cluster { + //////////////////////////////////////////////////////////////////////////////////////// + // OLCF Andes + Cluster { + name: "andes".into(), + identify: IdentificationMethod::ByEnvironment("LMOD_SYSTEM_NAME".into(), "andes".into()), + scheduler: SchedulerType::Slurm, + partition: vec![ + // Auto-detected partitions: batch + Partition { + name: "batch".into(), + maximum_gpus_per_job: Some(0), + require_cpus_multiple_of: Some(32), + cpus_per_node: Some(32), + ..Partition::default() + }, + ], + } +} + fn anvil() -> Cluster { //////////////////////////////////////////////////////////////////////////////////////// // Purdue Anvil @@ -250,7 +270,7 @@ fn greatlakes() -> Cluster { } } -// TODO: Add/test Frontier and Andes. +// TODO: Add/test Frontier. fn none() -> Cluster { // Fallback none cluster. 
@@ -267,7 +287,7 @@ fn none() -> Cluster { impl BuiltIn for cluster::Configuration { fn built_in() -> Self { - let cluster = vec![anvil(), delta(), greatlakes(), none()]; + let cluster = vec![andes(), anvil(), delta(), greatlakes(), none()]; cluster::Configuration { cluster } } diff --git a/validate/validate.py b/validate/validate.py index e20a38f..111a8bd 100644 --- a/validate/validate.py +++ b/validate/validate.py @@ -47,6 +47,7 @@ 'greatlakes': Cluster(cpus_per_node=36, gpus_per_node=2, gpu_arch='nvidia'), 'anvil': Cluster(cpus_per_node=128, gpus_per_node=0, gpu_arch='nvidia'), 'delta': Cluster(cpus_per_node=128, gpus_per_node=4, gpu_arch='nvidia'), + 'andes': Cluster(cpus_per_node=32, gpus_per_node=0, gpu_arch='none', no_shared=True), } N_THREADS = 4 @@ -132,7 +133,7 @@ def init(account, setup): """) ) - if cluster.cpus_per_node >= 1: + if cluster.cpus_per_node >= 1 and not cluster.get('no_shared', False): workflow.write( textwrap.dedent(""" [[action]] @@ -145,7 +146,7 @@ def init(account, setup): """) ) - if cluster.cpus_per_node >= N_THREADS: + if cluster.cpus_per_node >= N_THREADS and not cluster.get('no_shared', False): workflow.write( textwrap.dedent(f""" [[action]] @@ -159,7 +160,7 @@ def init(account, setup): """) ) - if cluster.cpus_per_node >= N_PROCESSES: + if cluster.cpus_per_node >= N_PROCESSES and not cluster.get('no_shared', False): workflow.write( textwrap.dedent(f""" [[action]] @@ -173,7 +174,7 @@ def init(account, setup): """) ) - if cluster.cpus_per_node >= N_PROCESSES * N_THREADS: + if cluster.cpus_per_node >= N_PROCESSES * N_THREADS and not cluster.get('no_shared', False): workflow.write( textwrap.dedent(f""" [[action]] From f41df59173674f98efaf3a48009886348fb81098 Mon Sep 17 00:00:00 2001 From: "Joshua A. Anderson" Date: Mon, 20 May 2024 18:02:59 -0400 Subject: [PATCH 2/7] Fix validate script on Andes. 
--- validate/validate.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/validate/validate.py b/validate/validate.py index 111a8bd..245198f 100644 --- a/validate/validate.py +++ b/validate/validate.py @@ -42,12 +42,12 @@ # Set the number of cpus and gpus per node in the *default* partitions that row selects. # Testing non-default partitions is beyond the scope of this script. Set to 0 to prevent # CPU and/or GPU jobs from executing. -Cluster = collections.namedtuple('Cluster', ['cpus_per_node', 'gpus_per_node', 'gpu_arch']) +Cluster = collections.namedtuple('Cluster', ('cpus_per_node', 'gpus_per_node', 'gpu_arch', 'has_shared'), defaults=(None, None, 'nvidia', True)) CLUSTERS = { 'greatlakes': Cluster(cpus_per_node=36, gpus_per_node=2, gpu_arch='nvidia'), 'anvil': Cluster(cpus_per_node=128, gpus_per_node=0, gpu_arch='nvidia'), 'delta': Cluster(cpus_per_node=128, gpus_per_node=4, gpu_arch='nvidia'), - 'andes': Cluster(cpus_per_node=32, gpus_per_node=0, gpu_arch='none', no_shared=True), + 'andes': Cluster(cpus_per_node=32, gpus_per_node=0, gpu_arch='none', has_shared=False), } N_THREADS = 4 @@ -115,7 +115,7 @@ def init(account, setup): [workspace] path = "{cluster_name}" - [submit_options.{cluster_name}] + [default.action.submit_options.{cluster_name}] """) ) @@ -133,7 +133,7 @@ def init(account, setup): """) ) - if cluster.cpus_per_node >= 1 and not cluster.get('no_shared', False): + if cluster.cpus_per_node >= 1 and cluster.has_shared: workflow.write( textwrap.dedent(""" [[action]] @@ -146,7 +146,7 @@ def init(account, setup): """) ) - if cluster.cpus_per_node >= N_THREADS and not cluster.get('no_shared', False): + if cluster.cpus_per_node >= N_THREADS and cluster.has_shared: workflow.write( textwrap.dedent(f""" [[action]] @@ -160,7 +160,7 @@ def init(account, setup): """) ) - if cluster.cpus_per_node >= N_PROCESSES and not cluster.get('no_shared', False): + if cluster.cpus_per_node >= N_PROCESSES and cluster.has_shared: 
workflow.write( textwrap.dedent(f""" [[action]] @@ -174,7 +174,7 @@ """) ) - if cluster.cpus_per_node >= N_PROCESSES * N_THREADS and not cluster.get('no_shared', False): + if cluster.cpus_per_node >= N_PROCESSES * N_THREADS and cluster.has_shared: workflow.write( textwrap.dedent(f""" [[action]] From a1da112d373f5f80172b840904474d5f3a4a46b4 Mon Sep 17 00:00:00 2001 From: "Joshua A. Anderson" Date: Tue, 21 May 2024 08:37:16 -0400 Subject: [PATCH 3/7] Add Frontier support. --- doc/src/clusters/built-in.md | 12 ++++++ doc/src/guide/tutorial/submit.md | 1 + src/builtin.rs | 22 ++++++++++- validate/validate.py | 66 +++++++++++++++++++++++++++++--- 4 files changed, 94 insertions(+), 7 deletions(-) diff --git a/doc/src/clusters/built-in.md b/doc/src/clusters/built-in.md index fd9a860..8ebd3e1 100644 --- a/doc/src/clusters/built-in.md +++ b/doc/src/clusters/built-in.md @@ -9,6 +9,8 @@ > Note: Andes has no shared partition. All jobs must use 32 CPUs per node. +[Andes]: https://docs.olcf.ornl.gov/systems/andes_user_guide.html + ## Anvil (Purdue) **Row** automatically selects from the following partitions on [Anvil]: @@ -40,6 +42,16 @@ allows full-node jobs and does not incur extra charges. [Delta]: https://docs.ncsa.illinois.edu/systems/delta +## Frontier (OLCF) + +**Row** automatically selects from the following partitions on [Frontier]: +* `batch` + +> Note: Frontier has no shared partition. All jobs must use 8 GPUs per node. + +[Frontier]: https://docs.olcf.ornl.gov/systems/frontier_user_guide.html# + + ## Great Lakes (University of Michigan) **Row** automatically selects from the following partitions on [Great Lakes]: diff --git a/doc/src/guide/tutorial/submit.md b/doc/src/guide/tutorial/submit.md index d56453f..12eb4a0 100644 --- a/doc/src/guide/tutorial/submit.md +++ b/doc/src/guide/tutorial/submit.md @@ -10,6 +10,7 @@ This section explains how to **submit** jobs to the **scheduler** with **row**. 
* Andes (OLCF) * Anvil (Purdue) * Delta (NCSA) +* Frontier (OLCF) * Great Lakes (University of Michigan) You can skip to the [next heading](#checking-your-job-script) if you are using one of diff --git a/src/builtin.rs b/src/builtin.rs index 39f3cc3..0c6ed30 100644 --- a/src/builtin.rs +++ b/src/builtin.rs @@ -199,6 +199,26 @@ fn delta() -> Cluster { } } +fn frontier() -> Cluster { + //////////////////////////////////////////////////////////////////////////////////////// + // OLCF Frontier + Cluster { + name: "frontier".into(), + identify: IdentificationMethod::ByEnvironment("LMOD_SYSTEM_NAME".into(), "frontier".into()), + scheduler: SchedulerType::Slurm, + partition: vec![ + // Auto-detected partitions: batch + Partition { + name: "batch".into(), + minimum_gpus_per_job: Some(8), + require_gpus_multiple_of: Some(8), + gpus_per_node: Some(8), + ..Partition::default() + }, + ], + } +} + fn greatlakes() -> Cluster { //////////////////////////////////////////////////////////////////////////////////////// // Great Lakes @@ -287,7 +307,7 @@ fn none() -> Cluster { impl BuiltIn for cluster::Configuration { fn built_in() -> Self { - let cluster = vec![andes(), anvil(), delta(), greatlakes(), none()]; + let cluster = vec![andes(), anvil(), delta(), frontier(), greatlakes(), none()]; cluster::Configuration { cluster } } diff --git a/validate/validate.py b/validate/validate.py index 245198f..1d25070 100644 --- a/validate/validate.py +++ b/validate/validate.py @@ -42,12 +42,17 @@ # Set the number of cpus and gpus per node in the *default* partitions that row selects. # Testing non-default partitions is beyond the scope of this script. Set to 0 to prevent # CPU and/or GPU jobs from executing. 
-Cluster = collections.namedtuple('Cluster', ('cpus_per_node', 'gpus_per_node', 'gpu_arch', 'has_shared'), defaults=(None, None, 'nvidia', True)) +Cluster = collections.namedtuple( + 'Cluster', + ('cpus_per_node', 'gpus_per_node', 'gpu_arch', 'has_shared'), + defaults=(None, None, 'nvidia', True), +) CLUSTERS = { - 'greatlakes': Cluster(cpus_per_node=36, gpus_per_node=2, gpu_arch='nvidia'), + 'andes': Cluster(cpus_per_node=32, gpus_per_node=0, gpu_arch='none', has_shared=False), 'anvil': Cluster(cpus_per_node=128, gpus_per_node=0, gpu_arch='nvidia'), 'delta': Cluster(cpus_per_node=128, gpus_per_node=4, gpu_arch='nvidia'), - 'andes': Cluster(cpus_per_node=32, gpus_per_node=0, gpu_arch='none', has_shared=False), + 'frontier': Cluster(cpus_per_node=0, gpus_per_node=8, gpu_arch='amd', has_shared=False), + 'greatlakes': Cluster(cpus_per_node=36, gpus_per_node=2, gpu_arch='nvidia'), } N_THREADS = 4 @@ -94,6 +99,24 @@ def get_nvidia_gpus(): return gpus +def get_amd_gpus(): + """Get the assigned AMD GPUs.""" + result = subprocess.run( + ['rocm-smi', '--showuniqueid'], capture_output=True, check=True, text=True + ) + + gpus = [] + pattern = re.compile(r'.*\(Unique ID: (.*)$') + + # TODO: Do we need to parse ROCR_VISIBLE_DEVICES and match GPU[id] lines? 
+ for line in result.stdout.splitlines(): + match = pattern.match(line) + + gpus.append(match.group(1)) + + return gpus + + def init(account, setup): """Initialize the project.""" cluster_name = get_cluster_name() @@ -218,7 +241,7 @@ def init(account, setup): """) ) - if cluster.gpus_per_node >= 1 and cluster.gpu_arch == 'nvidia': + if cluster.gpus_per_node >= 1 and cluster.gpu_arch == 'nvidia' and cluster.has_shared: workflow.write( textwrap.dedent(""" [[action]] @@ -232,7 +255,7 @@ def init(account, setup): """) ) - if cluster.gpus_per_node >= N_GPUS and cluster.gpu_arch == 'nvidia': + if cluster.gpus_per_node >= N_GPUS and cluster.gpu_arch == 'nvidia' and cluster.has_shared: workflow.write( textwrap.dedent(f""" [[action]] @@ -246,7 +269,7 @@ def init(account, setup): """) ) - if cluster.gpus_per_node >= 1 and cluster.gpu_arch == 'nvidia': + if cluster.gpus_per_node >= 1 and cluster.gpu_arch == 'nvidia' and cluster.has_shared: workflow.write( textwrap.dedent(f""" [[action]] @@ -261,6 +284,20 @@ def init(account, setup): """) ) + if cluster.gpus_per_node >= 1 and cluster.gpu_arch == 'amd' and : + workflow.write( + textwrap.dedent(f""" + [[action]] + name = "mpi_wholenode_amd_gpus" + command = "python validate.py execute mpi_wholenode_amd_gpus {{directory}}" + products = ["mpi_wholenode_amd_gpus.out"] + launchers = ["mpi"] + [action.resources] + processes.per_submission = {cluster.gpus_per_node} + walltime.per_submission = "00:05:00" + """) + ) + def serial(directory): """Validate serial jobs.""" @@ -327,6 +364,8 @@ def check_mpi(directory, n_processes, n_threads, n_hosts, name, n_gpus=0, gpu_ar gpus = [] if n_gpus > 0 and gpu_arch == 'nvidia': gpus = comm.gather(get_nvidia_gpus(), root=0) + if n_gpus > 0 and gpu_arch == 'amd': + gpus = comm.gather(get_amd_gpus(), root=0) if comm.Get_rank() == 0: cpuset_sizes = [len(s) for s in cpusets] @@ -463,6 +502,21 @@ def nvidia_gpus(directory): """Validate multi-GPU jobs.""" check_nvidia_gpu(directory, n_gpus=N_GPUS, 
name='nvidia_gpus') +def mpi_wholenode_amd_gpus(directory): + """Check that MPI allocates processes correctly to all AMD GPUs on one node.""" + cluster_name = get_cluster_name() + cluster = CLUSTERS.get(cluster_name) + + check_mpi( + directory, + n_processes=cluster.gpus_per_node * N_NODES, + n_threads=1, + n_hosts=1, + name='mpi_wholenode_amd_gpus', + n_gpus=1, + gpu_arch='amd', + ) + if __name__ == '__main__': # Parse the command line arguments: From fb5dc20158faf727fbe28243b1b6d6b34fbb48eb Mon Sep 17 00:00:00 2001 From: "Joshua A. Anderson" Date: Tue, 21 May 2024 09:24:46 -0400 Subject: [PATCH 4/7] Update frontier tests. --- validate/validate.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/validate/validate.py b/validate/validate.py index 1d25070..dadcd54 100644 --- a/validate/validate.py +++ b/validate/validate.py @@ -106,13 +106,15 @@ def get_amd_gpus(): ) gpus = [] - pattern = re.compile(r'.*\(Unique ID: (.*)$') + pattern = re.compile(r'.*Unique ID: (.*)$') # TODO: Do we need to parse ROCR_VISIBLE_DEVICES and match GPU[id] lines? for line in result.stdout.splitlines(): + print(line) match = pattern.match(line) - gpus.append(match.group(1)) + if match: + gpus.append(match.group(1)) return gpus @@ -284,7 +286,7 @@ def init(account, setup): """) ) - if cluster.gpus_per_node >= 1 and cluster.gpu_arch == 'amd' and : + if cluster.gpus_per_node >= 1 and cluster.gpu_arch == 'amd': workflow.write( textwrap.dedent(f""" [[action]] @@ -294,6 +296,7 @@ def init(account, setup): launchers = ["mpi"] [action.resources] processes.per_submission = {cluster.gpus_per_node} + gpus_per_process = 1 walltime.per_submission = "00:05:00" """) ) From 3575fe03aca9de48104260c23b79682d36e39072 Mon Sep 17 00:00:00 2001 From: "Joshua A. Anderson" Date: Tue, 21 May 2024 10:12:05 -0400 Subject: [PATCH 5/7] Fix frontier validation. 
--- validate/validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validate/validate.py b/validate/validate.py index dadcd54..447880c 100644 --- a/validate/validate.py +++ b/validate/validate.py @@ -512,7 +512,7 @@ def mpi_wholenode_amd_gpus(directory): check_mpi( directory, - n_processes=cluster.gpus_per_node * N_NODES, + n_processes=cluster.gpus_per_node, n_threads=1, n_hosts=1, name='mpi_wholenode_amd_gpus', From c3b841958fffe9f81c4e36ea326a619befd56929 Mon Sep 17 00:00:00 2001 From: "Joshua A. Anderson" Date: Tue, 21 May 2024 10:13:29 -0400 Subject: [PATCH 6/7] Remove TODO. --- src/builtin.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/builtin.rs b/src/builtin.rs index 0c6ed30..5235a90 100644 --- a/src/builtin.rs +++ b/src/builtin.rs @@ -290,8 +290,6 @@ fn greatlakes() -> Cluster { } } -// TODO: Add/test Frontier. - fn none() -> Cluster { // Fallback none cluster. Cluster { From 3951f07496a131fc1286ac47cc6b258ea71010e6 Mon Sep 17 00:00:00 2001 From: "Joshua A. Anderson" Date: Tue, 21 May 2024 14:49:43 -0400 Subject: [PATCH 7/7] Run pre-commit. --- validate/validate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/validate/validate.py b/validate/validate.py index 447880c..da6db40 100644 --- a/validate/validate.py +++ b/validate/validate.py @@ -505,6 +505,7 @@ def nvidia_gpus(directory): """Validate multi-GPU jobs.""" check_nvidia_gpu(directory, n_gpus=N_GPUS, name='nvidia_gpus') + def mpi_wholenode_amd_gpus(directory): """Check that MPI allocates processes correctly to all AMD GPUs on one node.""" cluster_name = get_cluster_name()