diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
index b1c6e7fc6..0dfe240db 100644
--- a/.github/workflows/cloud-ci.yml
+++ b/.github/workflows/cloud-ci.yml
@@ -14,11 +14,12 @@ jobs:
   cloud-tests:
     strategy:
       fail-fast: true
+      max-parallel: 1
       matrix:
+        system: [1x_gpu, 2x_gpu, 2x_node]
         include:
           - arch: cuda
             exclude: "no-cuda"
-            run_on: azure__a100

           # - arch: rocm
           #   exclude : "no-rocm"
@@ -27,7 +28,7 @@ jobs:

     # Cancel previous jobs if a new version was pushed
     concurrency:
-      group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.run_on }}"
+      group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.system }}"
       cancel-in-progress: true

     defaults:
@@ -36,13 +37,15 @@ jobs:

     env:
       MILABENCH_CONFIG: "config/standard.yaml"
-      MILABENCH_SYSTEM: "config/cloud-system.yaml"
+      MILABENCH_SYSTEM: "config/cloud-multinodes-system.yaml"
       MILABENCH_BASE: "output"
       MILABENCH_ARGS: ""
       MILABENCH_DASH: "no"
       ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}"
       ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}"
       AZURE_CORE_OUTPUT: none
+      _MULTI_GPUS: "diffusion-gpus,dinov2-giant-gpus,lightning-gpus,resnet152-ddp-gpus,llm-lora-ddp-gpus,llm-lora-mp-gpus,llm-full-mp-gpus"
+      _MULTI_NODES: "diffusion-nodes,dinov2-giant-nodes,llm-lora-ddp-nodes,llm-full-mp-nodes"

     steps:
       - uses: actions/checkout@v3
@@ -90,23 +93,51 @@ jobs:

       - name: setup cloud
         run: |
+          case "${{ matrix.system }}" in
+            "1x_gpu")
+              export MILABENCH_SYSTEM="config/cloud-system.yaml"
+              export RUN_ON="azure__a100"
+              export SELECT=
+              export EXCLUDES="--exclude $_MULTI_GPUS,$_MULTI_NODES,llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
+              ;;
+            "2x_gpu")
+              export MILABENCH_SYSTEM="config/cloud-system.yaml"
+              export RUN_ON="azure__a100_x2"
+              export SELECT="--select $_MULTI_GPUS"
+              export EXCLUDES="--exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
+              ;;
+            "2x_node")
+              export MILABENCH_SYSTEM="config/cloud-multinodes-system.yaml"
+              export RUN_ON="azure__a100"
+              export SELECT="--select $_MULTI_NODES"
+              export EXCLUDES="--exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
+              ;;
+            *)
+              exit 1
+              ;;
+          esac
+
           poetry run milabench cloud \
             --setup \
-            --run-on ${{ matrix.run_on }} \
-            --system "$MILABENCH_SYSTEM" >$MILABENCH_SYSTEM.${{ matrix.run_on }}
-          echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.${{ matrix.run_on }}" >>$GITHUB_ENV
+            --run-on $RUN_ON \
+            --system "$MILABENCH_SYSTEM" >$MILABENCH_SYSTEM.$RUN_ON
+
+          echo "RUN_ON=$RUN_ON" >>$GITHUB_ENV
+          echo "SELECT=$SELECT" >>$GITHUB_ENV
+          echo "EXCLUDES=$EXCLUDES" >>$GITHUB_ENV
+          echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.$RUN_ON" >>$GITHUB_ENV

       - name: install benchmarks
         run: |
-          poetry run milabench install --variant ${{ matrix.arch }}
+          poetry run milabench install --variant ${{ matrix.arch }} $SELECT $EXCLUDES

       - name: prepare benchmarks
         run: |
-          poetry run milabench prepare
+          poetry run milabench prepare $SELECT $EXCLUDES

       - name: run benchmarks
         run: |
-          poetry run milabench run
+          poetry run milabench run $SELECT $EXCLUDES

       - name: Summary
         run: |
@@ -118,6 +149,11 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ github.token }}

+      - name: DEBUG state file
+        if: always()
+        run: |
+          cat /tmp/milabench/covalent_venv/lib/python*/site-packages/covalent_azure_plugin/infra/*.tfstate
+
       - name: teardown cloud
         if: always()
         run: |
@@ -127,10 +163,10 @@ jobs:
           fi
           poetry run milabench cloud \
             --teardown \
-            --run-on ${{ matrix.run_on }} \
+            --run-on $RUN_ON \
             --all

-      - name: debug logs
+      - name: DEBUG logs
         if: always()
         run: |
           cat ~/.cache/covalent/covalent_ui.log
diff --git a/benchmarks/diffusion/main.py b/benchmarks/diffusion/main.py
old mode 100644
new mode 100755
index bd6668dab..8afdddae0
--- a/benchmarks/diffusion/main.py
+++ b/benchmarks/diffusion/main.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 from dataclasses import dataclass

 from accelerate import Accelerator
diff --git a/config/cloud-multinodes-system.yaml b/config/cloud-multinodes-system.yaml
index e5dc14f2b..b6d4d6bee 100644
--- a/config/cloud-multinodes-system.yaml
+++ b/config/cloud-multinodes-system.yaml
@@ -5,6 +5,7 @@ system:
     - name: manager
       # Use 1.1.1.1 as an ip placeholder
       ip: 1.1.1.1
+      port: 5000
       # Use this node as the master node or not
       main: true
       # User to use in remote milabench operations
@@ -21,11 +22,14 @@ system:
       username: ubuntu
       size: Standard_NC24ads_A100_v4
       location: eastus2
+      disk_size: 512
     azure__a100_x2:
       username: ubuntu
       size: Standard_NC48ads_A100_v4
       location: eastus2
+      disk_size: 512
    azure__a10_x2:
      username: ubuntu
      size: Standard_NV72ads_A10_v5
      location: eastus2
+      disk_size: 512
diff --git a/config/cloud-system.yaml b/config/cloud-system.yaml
index 2d1a049ad..9a3a27285 100644
--- a/config/cloud-system.yaml
+++ b/config/cloud-system.yaml
@@ -5,6 +5,7 @@ system:
     - name: manager
       # Use 1.1.1.1 as an ip placeholder
       ip: 1.1.1.1
+      port: 5000
       # Use this node as the master node or not
       main: true
       # User to use in remote milabench operations
@@ -16,11 +17,19 @@ system:
       username: ubuntu
       size: Standard_NC24ads_A100_v4
       location: eastus2
+      disk_size: 512
     azure__a100_x2:
       username: ubuntu
       size: Standard_NC48ads_A100_v4
       location: eastus2
+      disk_size: 512
+    azure__a10:
+      username: ubuntu
+      size: Standard_NV36ads_A10_v5
+      location: eastus2
+      disk_size: 512
     azure__a10_x2:
       username: ubuntu
       size: Standard_NV72ads_A10_v5
       location: eastus2
+      disk_size: 512
diff --git a/config/examples/test.yaml b/config/examples/test.yaml
index 6e155a0bf..4f74ac33b 100644
--- a/config/examples/test.yaml
+++ b/config/examples/test.yaml
@@ -7,18 +7,18 @@ _defaults:

 test:
   inherits: _defaults
-  group: test_remote
-  install_group: test_remote
-  definition: ../../benchmarks/_template
+  group: simple
+  install_group: test
+  definition: ../../benchmarks/_templates/simple
   plan:
     method: njobs
     n: 1

 testing:
   inherits: _defaults
-  definition: ../../benchmarks/_template
-  group: test_remote_2
-  install_group: test_remote_2
+  definition: ../../benchmarks/_templates/stdout
+  group: stdout
+  install_group: test
   plan:
     method: njobs
     n: 1
diff --git a/milabench/cli/run.py b/milabench/cli/run.py
index f5e75b702..b5e8e7f7c 100644
--- a/milabench/cli/run.py
+++ b/milabench/cli/run.py
@@ -3,6 +3,7 @@

 from coleo import Option, tooled

+from milabench.remote import is_remote
 from milabench.utils import validation_layers

 from ..common import (
@@ -63,14 +64,21 @@ def arguments():
     return Arguments(run_name, repeat, fulltrace, report, dash, noterm, validations)

-
 def _fetch_arch(mp):
     try:
         arch = next(iter(mp.packs.values())).config["system"]["arch"]
     except StopIteration:
         print("no selected bench")
         return None
-
+
+
+def _fetch_first_pack(mp):
+    try:
+        return next(iter(mp.packs.values()))
+    except StopIteration:
+        print("no selected bench")
+        return None
+


 @tooled
 def cli_run(args=None):
@@ -78,8 +86,6 @@ def cli_run(args=None):
     if args is None:
         args = arguments()

-    layers = validation_names(args.validations)
-
     dash_class = {
         "short": ShortDashFormatter,
         "long": LongDashFormatter,
@@ -87,8 +93,14 @@ def cli_run(args=None):
     }.get(args.dash, None)

     mp = get_multipack(run_name=args.run_name)
+    first_pack = _fetch_first_pack(mp)
     arch = _fetch_arch(mp)

+    layers = validation_names(args.validations)
+    if is_remote(first_pack):
+        # Remote execution will never send back rates
+        layers.remove("ensure_rate")
+
     # Initialize the backend here so we can retrieve GPU stats
     init_arch(arch)

diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py
index 0de00f756..875497159 100644
--- a/milabench/commands/__init__.py
+++ b/milabench/commands/__init__.py
@@ -674,7 +674,16 @@ def __init__(self, executor: Command, **kwargs) -> None:
         main = self.nodes[0]

         # node[port] is for SSH
-        main_host = main["ip"]
+        # Find local ip such that workers can connect to the port
+        for main_host in main["ipaddrlist"]:
+            if ":" in main_host or main_host == "127.0.0.1":
+                continue
+            if all(str.isnumeric(n) for n in main_host.split(".")):
+                break
+        else:
+            main_host = main["ip"]
+        if len(self.nodes) == 1:
+            main_host = "localhost"
         # add them as option so we could tweak them if necessary
         main_port = option("torchrun.port", int, default=29400)
         backend = option("torchrun.backend", str, default="c10d")
@@ -939,6 +948,15 @@ def _get_main_and_workers(self):

     def _argv(self, **_) -> List:
         manager, nodes = self._get_main_and_workers()
+        # Find local ip such that workers can connect to the port
+        for manager_ip in manager["ipaddrlist"]:
+            if ":" in manager_ip or manager_ip == "127.0.0.1":
+                continue
+            if all(str.isnumeric(n) for n in manager_ip.split(".")):
+                break
+        else:
+            manager_ip = manager['ip']
+
         num_machines = max(1, len(nodes) + 1)

         # Cant do that maybe this run is constrained
@@ -978,7 +996,7 @@ def _argv(self, **_) -> List:
             *deepspeed_argv,
             f"--gradient_accumulation_steps={self.pack.config.get('gradient_accumulation_steps', 1)}",
             f"--num_cpu_threads_per_process={cpu_per_process}",
-            f"--main_process_ip={manager['ip']}",
+            f"--main_process_ip={manager_ip}",
             f"--main_process_port={manager['port']}",
             f"--num_processes={nproc}",
             *self.accelerate_argv,
diff --git a/milabench/common.py b/milabench/common.py
index 03bc08028..95a1ab7bc 100644
--- a/milabench/common.py
+++ b/milabench/common.py
@@ -314,18 +314,18 @@ def _read_reports(*runs):
     return all_data


-def _filter_reports(*reports):
-    all_reports = []
+def _filter_reports(**reports):
+    _reports = {}

-    for report in reports:
+    for k, report in reports.items():
         config = next(iter(e for e in report if e["event"] == "config"), None)
         if config is None:
             continue

         if config["data"]["name"] != "remote":
-            all_reports.append(report)
+            _reports[k] = report

-    return all_reports
+    return _reports


 def _push_reports(reports_repo, runs):
@@ -356,8 +356,8 @@ def _push_reports(reports_repo, runs):
     device_reports = {}

     for run in runs:
-        reports = list(_read_reports(run).values())
-        reports = _filter_reports(*reports)
+        reports = _read_reports(run)
+        reports = list(_filter_reports(**reports).values())

         if not reports:
             continue
@@ -392,7 +392,7 @@ def _push_reports(reports_repo, runs):
     for (device, build), reports in device_reports.items():
         reports_dir = XPath(reports_repo.working_tree_dir) / build
         reports = _read_reports(*reports)
-        reports = _filter_reports(*reports.values())
+        reports = _filter_reports(**reports)

         summary = make_summary(reports)
         successes = [s["successes"] for s in summary.values()]
diff --git a/milabench/config.py b/milabench/config.py
index 726f9f540..4936054dc 100644
--- a/milabench/config.py
+++ b/milabench/config.py
@@ -150,6 +150,7 @@ def build_config(*config_files):
     for layer in _config_layers(config_files):
         all_configs = merge(all_configs, layer)

+    all_configs.setdefault("*", {})
     all_configs["*"]["hash"] = compute_config_hash(all_configs)

     all_configs = build_matrix_bench(all_configs)
diff --git a/milabench/remote.py b/milabench/remote.py
index b657f98c5..2b9c2119a 100644
--- a/milabench/remote.py
+++ b/milabench/remote.py
@@ -2,16 +2,11 @@
 import os
 import sys

-import yaml
-
-from milabench.fs import XPath
-
 from . import ROOT_FOLDER
 from .commands import (
     CmdCommand,
     Command,
     ListCommand,
-    SCPCommand,
     SequenceCommand,
     SSHCommand,
     VoidCommand,
@@ -291,6 +286,6 @@ def milabench_remote_run(pack) -> Command:
     argv = sys.argv[2:]

     return SequenceCommand(
-        milabench_remote_command(pack, "run", *argv, run_for="main"),
+        milabench_remote_command(pack, "run", *argv, "--run-name", pack.config["run_name"], run_for="main"),
         milabench_remote_fetch_reports_plan(pack, run_for="main"),
     )
diff --git a/milabench/scripts/covalent/__main__.py b/milabench/scripts/covalent/__main__.py
index 995cc856f..e5bedc11a 100644
--- a/milabench/scripts/covalent/__main__.py
+++ b/milabench/scripts/covalent/__main__.py
@@ -89,15 +89,14 @@ def _popen(cmd, *args, _env=None, **kwargs):
     return_code = 0
     try:
         if args.setup:
-            dispatch_id = ct.dispatch(
-                ct.lattice(executor.get_connection_attributes), disable_run=False
-            )()
-
-            result = ct.get_result(dispatch_id=dispatch_id, wait=True).result
+            result = ct.dispatch_sync(
+                ct.lattice(executor.get_connection_attributes)
+            )().result

             assert result and result[0]

             all_connection_attributes, _ = result
+            master_host:str = None
             for hostname, connection_attributes in all_connection_attributes.items():
                 print(f"hostname::>{hostname}")
                 for attribute,value in connection_attributes.items():
@@ -105,17 +104,65 @@ def _popen(cmd, *args, _env=None, **kwargs):
                         continue
                     print(f"{attribute}::>{value}")

-            if argv:
-                dispatch_id = ct.dispatch(
-                    ct.lattice(
-                        lambda:ct.electron(_popen, executor=executor)(argv)
-                    ),
-                    disable_run=False
-                )()
+                master_host = master_host or hostname
+
+            if len(all_connection_attributes) > 1:
+                # Add master node to known host to avoid unknown host error
+                # The authenticity of host '[hostname] ([IP address])' can't be established.
+                new_host = subprocess.run(
+                    ["ssh-keyscan", master_host],
+                    stdout=subprocess.PIPE,
+                    check=True
+                ).stdout.decode("utf8")
+                known_hosts = pathlib.Path("~/.ssh/known_hosts").expanduser()
+                with known_hosts.open("at") as _f:
+                    _f.write(new_host)
+
+                # Add ssh file to master node to allow connections to worker
+                # nodes
+                ssh_key_file = all_connection_attributes[master_host]["ssh_key_file"]
+                fn = pathlib.Path(ssh_key_file)
+                result = ct.dispatch_sync(
+                    ct.lattice(executor.cp_to_remote)
+                )(f".ssh/{fn.name.split('.')[0]}", str(fn))
+
+                assert result.status == ct.status.COMPLETED

-            result = ct.get_result(dispatch_id=dispatch_id, wait=True).result
+        if argv:
+            result = ct.dispatch_sync(
+                ct.lattice(executor.list_running_instances)
+            )().result
+
+            assert result
+
+            dispatch_ids = set()
+            for connection_attributes in result.get(
+                (executor.state_prefix, executor.state_id),
+                {"env": None}
+            ).values():
+                kwargs = {
+                    **_get_executor_kwargs(args),
+                    **connection_attributes
+                }
+                del kwargs["env"]
+
+                _executor:ct.executor.BaseExecutor = executor_cls(**kwargs)
+
+                dispatch_ids.add(
+                    ct.dispatch(
+                        ct.lattice(
+                            lambda:ct.electron(_popen, executor=_executor)(argv)
+                        ),
+                        disable_run=False
+                    )()
+                )
+
+            for dispatch_id in dispatch_ids:
+                result = ct.get_result(dispatch_id=dispatch_id, wait=True).result
+
+                _return_code, _, _ = result if result is not None else (1, "", "")
+                return_code = return_code or _return_code

-            return_code, _, _ = result if result is not None else (1, "", "")
     finally:
         if args.teardown:
             result = executor.stop_cloud_instance().result
diff --git a/milabench/system.py b/milabench/system.py
index 8b9711514..8d137d642 100644
--- a/milabench/system.py
+++ b/milabench/system.py
@@ -258,6 +258,10 @@ def _resolve_ip(ip):
     if not offline:
         # Resolve the IP
         try:
+            # Workaround error with `gethostbyaddr` on azure DNS (like
+            # `inmako.eastus2.cloudapp.azure.com`). A proper fix might be a
+            # correct network config in terraform.
+            # socket.herror: [Errno 1] Unknown host
             hostname, aliaslist, ipaddrlist = socket.gethostbyname_ex(ip)

             lazy_raise = None
diff --git a/poetry.lock b/poetry.lock
index ec0f16753..b910db129 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.

 [[package]]
 name = "alabaster"
@@ -2190,4 +2190,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<4.0"
-content-hash = "59901f6d97314b2a67cac2cf9c4300cb5bde2feba01b0198b20c8ac477adae05"
+content-hash = "e8817803c68c0acc023e37a954027d5870b08d0e29cf46e8dd673df7e9d6994d"
diff --git a/pyproject.toml b/pyproject.toml
index 6a1693bf6..e7f784793 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,9 @@ blessed = "^1.19.1"
 pathspec = "^0.9.0"
 cp-template = "^0.3.0"
 pandas = ">=1.4.2"
-numpy = ">=1.23.0,<2.0.0"
+# Work around for compatibility issue between numpy 2.0.0 and pandas
+# https://github.com/numpy/numpy/issues/26710
+numpy = "^1.23.0"
 pynvml = "^11.4.1"
 tqdm = "^4.64.1"
 pip-tools = "^7.4.1"