diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
index b1c6e7fc6..0dfe240db 100644
--- a/.github/workflows/cloud-ci.yml
+++ b/.github/workflows/cloud-ci.yml
@@ -14,11 +14,12 @@ jobs:
   cloud-tests:
     strategy:
       fail-fast: true
+      max-parallel: 1
       matrix:
+        system: [1x_gpu, 2x_gpu, 2x_node]
         include:
           - arch: cuda
             exclude: "no-cuda"
-            run_on: azure__a100

           # - arch: rocm
           #   exclude : "no-rocm"
@@ -27,7 +28,7 @@ jobs:

     # Cancel previous jobs if a new version was pushed
     concurrency:
-      group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.run_on }}"
+      group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.system }}"
       cancel-in-progress: true

     defaults:
@@ -36,13 +37,15 @@ jobs:

     env:
       MILABENCH_CONFIG: "config/standard.yaml"
-      MILABENCH_SYSTEM: "config/cloud-system.yaml"
+      MILABENCH_SYSTEM: "config/cloud-multinodes-system.yaml"
       MILABENCH_BASE: "output"
       MILABENCH_ARGS: ""
       MILABENCH_DASH: "no"
       ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}"
       ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}"
       AZURE_CORE_OUTPUT: none
+      _MULTI_GPUS: "diffusion-gpus,dinov2-giant-gpus,lightning-gpus,resnet152-ddp-gpus,llm-lora-ddp-gpus,llm-lora-mp-gpus,llm-full-mp-gpus"
+      _MULTI_NODES: "diffusion-nodes,dinov2-giant-nodes,llm-lora-ddp-nodes,llm-full-mp-nodes"

     steps:
       - uses: actions/checkout@v3
@@ -90,23 +93,51 @@ jobs:

       - name: setup cloud
         run: |
+          case "${{ matrix.system }}" in
+            "1x_gpu")
+              export MILABENCH_SYSTEM="config/cloud-system.yaml"
+              export RUN_ON="azure__a100"
+              export SELECT=
+              export EXCLUDES="--exclude $_MULTI_GPUS,$_MULTI_NODES,llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
+              ;;
+            "2x_gpu")
+              export MILABENCH_SYSTEM="config/cloud-system.yaml"
+              export RUN_ON="azure__a100_x2"
+              export SELECT="--select $_MULTI_GPUS"
+              export EXCLUDES="--exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
+              ;;
+            "2x_node")
+              export MILABENCH_SYSTEM="config/cloud-multinodes-system.yaml"
+              export RUN_ON="azure__a100"
+              export SELECT="--select $_MULTI_NODES"
+              export EXCLUDES="--exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
+              ;;
+            *)
+              exit 1
+              ;;
+          esac
+
           poetry run milabench cloud \
             --setup \
-            --run-on ${{ matrix.run_on }} \
-            --system "$MILABENCH_SYSTEM" >$MILABENCH_SYSTEM.${{ matrix.run_on }}
-          echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.${{ matrix.run_on }}" >>$GITHUB_ENV
+            --run-on $RUN_ON \
+            --system "$MILABENCH_SYSTEM" >$MILABENCH_SYSTEM.$RUN_ON
+
+          echo "RUN_ON=$RUN_ON" >>$GITHUB_ENV
+          echo "SELECT=$SELECT" >>$GITHUB_ENV
+          echo "EXCLUDES=$EXCLUDES" >>$GITHUB_ENV
+          echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.$RUN_ON" >>$GITHUB_ENV

       - name: install benchmarks
         run: |
-          poetry run milabench install --variant ${{ matrix.arch }}
+          poetry run milabench install --variant ${{ matrix.arch }} $SELECT $EXCLUDES

       - name: prepare benchmarks
         run: |
-          poetry run milabench prepare
+          poetry run milabench prepare $SELECT $EXCLUDES

       - name: run benchmarks
         run: |
-          poetry run milabench run
+          poetry run milabench run $SELECT $EXCLUDES

       - name: Summary
         run: |
@@ -118,6 +149,11 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ github.token }}

+      - name: DEBUG state file
+        if: always()
+        run: |
+          cat /tmp/milabench/covalent_venv/lib/python*/site-packages/covalent_azure_plugin/infra/*.tfstate
+
       - name: teardown cloud
         if: always()
         run: |
@@ -127,10 +163,10 @@ jobs:
           fi
           poetry run milabench cloud \
             --teardown \
-            --run-on ${{ matrix.run_on }} \
+            --run-on $RUN_ON \
             --all

-      - name: debug logs
+      - name: DEBUG logs
         if: always()
         run: |
           cat ~/.cache/covalent/covalent_ui.log
diff --git a/benchmarks/diffusion/main.py b/benchmarks/diffusion/main.py
old mode 100644
new mode 100755
index bd6668dab..8afdddae0
--- a/benchmarks/diffusion/main.py
+++ b/benchmarks/diffusion/main.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 from dataclasses import dataclass

 from accelerate import Accelerator
diff --git a/config/cloud-multinodes-system.yaml b/config/cloud-multinodes-system.yaml
index e5dc14f2b..b6d4d6bee 100644
--- a/config/cloud-multinodes-system.yaml
+++ b/config/cloud-multinodes-system.yaml
@@ -5,6 +5,7 @@ system:
     - name: manager
       # Use 1.1.1.1 as an ip placeholder
       ip: 1.1.1.1
+      port: 5000
       # Use this node as the master node or not
       main: true
       # User to use in remote milabench operations
@@ -21,11 +22,14 @@ system:
       username: ubuntu
       size: Standard_NC24ads_A100_v4
       location: eastus2
+      disk_size: 512
     azure__a100_x2:
       username: ubuntu
       size: Standard_NC48ads_A100_v4
       location: eastus2
+      disk_size: 512
    azure__a10_x2:
      username: ubuntu
      size: Standard_NV72ads_A10_v5
      location: eastus2
+      disk_size: 512
diff --git a/config/cloud-system.yaml b/config/cloud-system.yaml
index 2d1a049ad..9a3a27285 100644
--- a/config/cloud-system.yaml
+++ b/config/cloud-system.yaml
@@ -5,6 +5,7 @@ system:
     - name: manager
       # Use 1.1.1.1 as an ip placeholder
       ip: 1.1.1.1
+      port: 5000
       # Use this node as the master node or not
       main: true
       # User to use in remote milabench operations
@@ -16,11 +17,19 @@ system:
       username: ubuntu
       size: Standard_NC24ads_A100_v4
       location: eastus2
+      disk_size: 512
     azure__a100_x2:
       username: ubuntu
       size: Standard_NC48ads_A100_v4
       location: eastus2
+      disk_size: 512
+    azure__a10:
+      username: ubuntu
+      size: Standard_NV36ads_A10_v5
+      location: eastus2
+      disk_size: 512
     azure__a10_x2:
       username: ubuntu
       size: Standard_NV72ads_A10_v5
       location: eastus2
+      disk_size: 512
diff --git a/config/examples/test.yaml b/config/examples/test.yaml
index 6e155a0bf..4f74ac33b 100644
--- a/config/examples/test.yaml
+++ b/config/examples/test.yaml
@@ -7,18 +7,18 @@ _defaults:

 test:
   inherits: _defaults
-  group: test_remote
-  install_group: test_remote
-  definition: ../../benchmarks/_template
+  group: simple
+  install_group: test
+  definition: ../../benchmarks/_templates/simple
   plan:
     method: njobs
     n: 1

 testing:
   inherits: _defaults
-  definition: ../../benchmarks/_template
-  group: test_remote_2
-  install_group: test_remote_2
+  definition: ../../benchmarks/_templates/stdout
+  group: stdout
+  install_group: test
   plan:
     method: njobs
     n: 1
diff --git a/milabench/cli/run.py b/milabench/cli/run.py
index f5e75b702..b5e8e7f7c 100644
--- a/milabench/cli/run.py
+++ b/milabench/cli/run.py
@@ -3,6 +3,7 @@

 from coleo import Option, tooled

+from milabench.remote import is_remote
 from milabench.utils import validation_layers

 from ..common import (
@@ -63,14 +64,21 @@ def arguments():
     return Arguments(run_name, repeat, fulltrace, report, dash, noterm, validations)

-
 def _fetch_arch(mp):
     try:
         arch = next(iter(mp.packs.values())).config["system"]["arch"]
     except StopIteration:
         print("no selected bench")
         return None
-
+
+
+def _fetch_first_pack(mp):
+    try:
+        return next(iter(mp.packs.values()))
+    except StopIteration:
+        print("no selected bench")
+        return None
+


 @tooled
 def cli_run(args=None):
@@ -78,8 +86,6 @@ def cli_run(args=None):
     if args is None:
         args = arguments()

-    layers = validation_names(args.validations)
-
     dash_class = {
         "short": ShortDashFormatter,
         "long": LongDashFormatter,
@@ -87,8 +93,14 @@ def cli_run(args=None):
     }.get(args.dash, None)

     mp = get_multipack(run_name=args.run_name)
+    first_pack = _fetch_first_pack(mp)
     arch = _fetch_arch(mp)

+    layers = validation_names(args.validations)
+    if is_remote(first_pack):
+        # Remote execution will never send back rates
+        layers.remove("ensure_rate")
+
     # Initialize the backend here so we can retrieve GPU stats
     init_arch(arch)

diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py
index 0de00f756..875497159 100644
--- a/milabench/commands/__init__.py
+++ b/milabench/commands/__init__.py
@@ -674,7 +674,16 @@ def __init__(self, executor: Command, **kwargs) -> None:
         main = self.nodes[0]

         # node[port] is for SSH
-        main_host = main["ip"]
+        # Find local ip such that workers can connect to the port
+        for main_host in main["ipaddrlist"]:
+            if ":" in main_host or main_host == "127.0.0.1":
+                continue
+            if all(str.isnumeric(n) for n in main_host.split(".")):
+                break
+        else:
+            main_host = main["ip"]
+        if len(self.nodes) == 1:
+            main_host = "localhost"
         # add them as option so we could tweak them if necessary
         main_port = option("torchrun.port", int, default=29400)
         backend = option("torchrun.backend", str, default="c10d")
@@ -939,6 +948,15 @@ def _get_main_and_workers(self):

     def _argv(self, **_) -> List:
         manager, nodes = self._get_main_and_workers()
+        # Find local ip such that workers can connect to the port
+        for manager_ip in manager["ipaddrlist"]:
+            if ":" in manager_ip or manager_ip == "127.0.0.1":
+                continue
+            if all(str.isnumeric(n) for n in manager_ip.split(".")):
+                break
+        else:
+            manager_ip = manager['ip']
+
         num_machines = max(1, len(nodes) + 1)

         # Cant do that maybe this run is constrained
@@ -978,7 +996,7 @@ def _argv(self, **_) -> List:
             *deepspeed_argv,
             f"--gradient_accumulation_steps={self.pack.config.get('gradient_accumulation_steps', 1)}",
             f"--num_cpu_threads_per_process={cpu_per_process}",
-            f"--main_process_ip={manager['ip']}",
+            f"--main_process_ip={manager_ip}",
             f"--main_process_port={manager['port']}",
             f"--num_processes={nproc}",
             *self.accelerate_argv,
diff --git a/milabench/common.py b/milabench/common.py
index 03bc08028..95a1ab7bc 100644
--- a/milabench/common.py
+++ b/milabench/common.py
@@ -314,18 +314,18 @@ def _read_reports(*runs):
     return all_data


-def _filter_reports(*reports):
-    all_reports = []
+def _filter_reports(**reports):
+    _reports = {}

-    for report in reports:
+    for k, report in reports.items():
         config = next(iter(e for e in report if e["event"] == "config"), None)
         if config is None:
             continue

         if config["data"]["name"] != "remote":
-            all_reports.append(report)
+            _reports[k] = report

-    return all_reports
+    return _reports


 def _push_reports(reports_repo, runs):
@@ -356,8 +356,8 @@ def _push_reports(reports_repo, runs):
     device_reports = {}

     for run in runs:
-        reports = list(_read_reports(run).values())
-        reports = _filter_reports(*reports)
+        reports = _read_reports(run)
+        reports = list(_filter_reports(**reports).values())

         if not reports:
             continue
@@ -392,7 +392,7 @@ def _push_reports(reports_repo, runs):
     for (device, build), reports in device_reports.items():
         reports_dir = XPath(reports_repo.working_tree_dir) / build
         reports = _read_reports(*reports)
-        reports = _filter_reports(*reports.values())
+        reports = _filter_reports(**reports)

         summary = make_summary(reports)
         successes = [s["successes"] for s in summary.values()]
diff --git a/milabench/config.py b/milabench/config.py
index 726f9f540..4936054dc 100644
--- a/milabench/config.py
+++ b/milabench/config.py
@@ -150,6 +150,7 @@ def build_config(*config_files):
     for layer in _config_layers(config_files):
         all_configs = merge(all_configs, layer)

+    all_configs.setdefault("*", {})
     all_configs["*"]["hash"] = compute_config_hash(all_configs)

     all_configs = build_matrix_bench(all_configs)
diff --git a/milabench/remote.py b/milabench/remote.py
index b657f98c5..2b9c2119a 100644
--- a/milabench/remote.py
+++ b/milabench/remote.py
@@ -2,16 +2,11 @@
 import os
 import sys

-import yaml
-
-from milabench.fs import XPath
-
 from . import ROOT_FOLDER
 from .commands import (
     CmdCommand,
     Command,
     ListCommand,
-    SCPCommand,
     SequenceCommand,
     SSHCommand,
     VoidCommand,
@@ -291,6 +286,6 @@ def milabench_remote_run(pack) -> Command:
     argv = sys.argv[2:]

     return SequenceCommand(
-        milabench_remote_command(pack, "run", *argv, run_for="main"),
+        milabench_remote_command(pack, "run", *argv, "--run-name", pack.config["run_name"], run_for="main"),
         milabench_remote_fetch_reports_plan(pack, run_for="main"),
     )
diff --git a/milabench/scripts/covalent/__main__.py b/milabench/scripts/covalent/__main__.py
index 995cc856f..e5bedc11a 100644
--- a/milabench/scripts/covalent/__main__.py
+++ b/milabench/scripts/covalent/__main__.py
@@ -89,15 +89,14 @@ def _popen(cmd, *args, _env=None, **kwargs):
     return_code = 0
     try:
         if args.setup:
-            dispatch_id = ct.dispatch(
-                ct.lattice(executor.get_connection_attributes), disable_run=False
-            )()
-
-            result = ct.get_result(dispatch_id=dispatch_id, wait=True).result
+            result = ct.dispatch_sync(
+                ct.lattice(executor.get_connection_attributes)
+            )().result

             assert result and result[0]

             all_connection_attributes, _ = result
+            master_host:str = None
             for hostname, connection_attributes in all_connection_attributes.items():
                 print(f"hostname::>{hostname}")
                 for attribute,value in connection_attributes.items():
@@ -105,17 +104,65 @@ def _popen(cmd, *args, _env=None, **kwargs):
                         continue
                     print(f"{attribute}::>{value}")

-            if argv:
-                dispatch_id = ct.dispatch(
-                    ct.lattice(
-                        lambda:ct.electron(_popen, executor=executor)(argv)
-                    ),
-                    disable_run=False
-                )()
+                master_host = master_host or hostname
+
+            if len(all_connection_attributes) > 1:
+                # Add master node to known host to avoid unknown host error
+                # The authenticity of host '[hostname] ([IP address])' can't be established.
+                new_host = subprocess.run(
+                    ["ssh-keyscan", master_host],
+                    stdout=subprocess.PIPE,
+                    check=True
+                ).stdout.decode("utf8")
+                known_hosts = pathlib.Path("~/.ssh/known_hosts").expanduser()
+                with known_hosts.open("at") as _f:
+                    _f.write(new_host)
+
+                # Add ssh file to master node to allow connections to worker
+                # nodes
+                ssh_key_file = all_connection_attributes[master_host]["ssh_key_file"]
+                fn = pathlib.Path(ssh_key_file)
+                result = ct.dispatch_sync(
+                    ct.lattice(executor.cp_to_remote)
+                )(f".ssh/{fn.name.split('.')[0]}", str(fn))
+
+                assert result.status == ct.status.COMPLETED

-            result = ct.get_result(dispatch_id=dispatch_id, wait=True).result
+        if argv:
+            result = ct.dispatch_sync(
+                ct.lattice(executor.list_running_instances)
+            )().result
+
+            assert result
+
+            dispatch_ids = set()
+            for connection_attributes in result.get(
+                (executor.state_prefix, executor.state_id),
+                {"env": None}
+            ).values():
+                kwargs = {
+                    **_get_executor_kwargs(args),
+                    **connection_attributes
+                }
+                del kwargs["env"]
+
+                _executor:ct.executor.BaseExecutor = executor_cls(**kwargs)
+
+                dispatch_ids.add(
+                    ct.dispatch(
+                        ct.lattice(
+                            lambda:ct.electron(_popen, executor=_executor)(argv)
+                        ),
+                        disable_run=False
+                    )()
+                )
+
+            for dispatch_id in dispatch_ids:
+                result = ct.get_result(dispatch_id=dispatch_id, wait=True).result
+
+                _return_code, _, _ = result if result is not None else (1, "", "")
+                return_code = return_code or _return_code

-            return_code, _, _ = result if result is not None else (1, "", "")
     finally:
         if args.teardown:
             result = executor.stop_cloud_instance().result
diff --git a/milabench/system.py b/milabench/system.py
index 8b9711514..8d137d642 100644
--- a/milabench/system.py
+++ b/milabench/system.py
@@ -258,6 +258,10 @@ def _resolve_ip(ip):
     if not offline:
         # Resolve the IP
         try:
+            # Workaround error with `gethostbyaddr` on azure DNS (like
+            # `inmako.eastus2.cloudapp.azure.com`). A proper fix might be a
+            # correct network config in terraform.
+            # socket.herror: [Errno 1] Unknown host
             hostname, aliaslist, ipaddrlist = socket.gethostbyname_ex(ip)

             lazy_raise = None
diff --git a/poetry.lock b/poetry.lock
index ec0f16753..b910db129 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.

 [[package]]
 name = "alabaster"
@@ -2190,4 +2190,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<4.0"
-content-hash = "59901f6d97314b2a67cac2cf9c4300cb5bde2feba01b0198b20c8ac477adae05"
+content-hash = "e8817803c68c0acc023e37a954027d5870b08d0e29cf46e8dd673df7e9d6994d"
diff --git a/pyproject.toml b/pyproject.toml
index 6a1693bf6..e7f784793 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,9 @@ blessed = "^1.19.1"
 pathspec = "^0.9.0"
 cp-template = "^0.3.0"
 pandas = ">=1.4.2"
-numpy = ">=1.23.0,<2.0.0"
+# Work around for compatibility issue between numpy 2.0.0 and pandas
+# https://github.com/numpy/numpy/issues/26710
+numpy = "^1.23.0"
 pynvml = "^11.4.1"
 tqdm = "^4.64.1"
 pip-tools = "^7.4.1"