From a94fb7ac964dd2aea8582ee8965ee571e7ab1eff Mon Sep 17 00:00:00 2001 From: Setepenre Date: Fri, 1 Mar 2024 14:11:34 -0500 Subject: [PATCH] Update Dockerfile-cuda & avoid capacity fetch on prepare & install (#207) --- .github/workflows/tests.yml | 4 ++-- config/base.yaml | 2 +- docker/Dockerfile-cuda | 26 +++++++++----------------- milabench/cli/publish.py | 17 ++++++++--------- milabench/common.py | 9 +++++---- milabench/config.py | 28 +++++++++++++++++++++------- milabench/log.py | 12 ++++++------ milabench/merge.py | 1 - milabench/scripts/vcs.py | 1 + milabench/sizer.py | 14 +++++--------- 10 files changed, 58 insertions(+), 56 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1192bd0d7..7d456f9bb 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -26,9 +26,9 @@ jobs: matrix: include: - arch: cuda - exclude : "no-cuda" + exclude : "unsupported-cuda" # - arch: rocm - # exclude : "no-rocm" + # exclude : "unsupported-rocm" runs-on: [self-hosted, "${{ matrix.arch }}"] diff --git a/config/base.yaml b/config/base.yaml index daa358f77..d848f7179 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -522,7 +522,7 @@ rwkv: tags: - llm - rnn - - no-rocm + - unsupported-rocm plan: method: per_gpu argv: diff --git a/docker/Dockerfile-cuda b/docker/Dockerfile-cuda index ebbc4a460..459ddb4dd 100644 --- a/docker/Dockerfile-cuda +++ b/docker/Dockerfile-cuda @@ -1,15 +1,17 @@ -FROM ubuntu:22.04 +# FROM ubuntu:22.04 # For cuda-gdb -# FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 +FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 # Arguments # --------- +# Use ofed_info -s to get your local version +ARG MOFED_VERSION=5.4-3.4.0.0 +ARG CONFIG=standard.yaml ARG ARCH=cuda -ENV MILABENCH_GPU_ARCH=$ARCH -ARG CONFIG=standard.yaml +ENV MILABENCH_GPU_ARCH=$ARCH ENV MILABENCH_CONFIG_NAME=$CONFIG ENV MILABENCH_DOCKER=1 @@ -36,24 +38,18 @@ COPY . 
/milabench/milabench/ # rustc: used by BERT models inside https://pypi.org/project/tokenizers/ # build-essential: for rust -# Use ofed_info -s to get your local version -ARG MOFED_VERSION=5.4-3.4.0.0 ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update -y &&\ - apt-get install -y git build-essential curl python3 python-is-python3 python3-pip &&\ + apt-get install -y --no-install-recommends git build-essential curl python3 python-is-python3 python3-pip &&\ curl -o /etc/apt/trusted.gpg.d/mellanox.asc https://content.mellanox.com/ofed/RPM-GPG-KEY-Mellanox &&\ curl -o /etc/apt/sources.list.d/mellanox.list https://linux.mellanox.com/public/repo/mlnx_ofed/${MOFED_VERSION}/ubuntu22.04/mellanox_mlnx_ofed.list &&\ - curl -o cuda-keyring_1.1-1_all.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb &&\ - dpkg -i cuda-keyring_1.1-1_all.deb &&\ apt-get update -y &&\ - apt-get install -y libibverbs1 nvidia-compute-utils-535 nvidia-utils-535 cuda-11-8 &&\ + apt-get install -y --no-install-recommends libibverbs1 &&\ apt-get clean &&\ - rm -rf /var/lib/apt/lists/* &&\ - rm cuda-keyring_1.1-1_all.deb + rm -rf /var/lib/apt/lists/* # Install Rust - RUN curl https://sh.rustup.rs -sSf | sh -s -- -y ENV PATH="/root/.cargo/bin:${PATH}" ENV CUDA_HOME=/usr/local/cuda-11.8 @@ -78,8 +74,4 @@ RUN milabench install --config $MILABENCH_CONFIG --base $MILABENCH_BASE $MILABEN milabench prepare --config $MILABENCH_CONFIG --base $MILABENCH_BASE $MILABENCH_ARGS &&\ python -m pip cache purge -# Patch for https://github.com/pytorch/pytorch/issues/97041 -RUN cd /milabench/envs/venv/torch/lib/python3.10/site-packages/torch/lib &&\ - ln -sfn libnvrtc-672ee683.so.11.2 libnvrtc.so - CMD milabench run diff --git a/milabench/cli/publish.py b/milabench/cli/publish.py index cb60812d3..077cda9cd 100644 --- a/milabench/cli/publish.py +++ b/milabench/cli/publish.py @@ -1,19 +1,18 @@ -import re import json -import subprocess -from contextlib import 
contextmanager import multiprocessing -from dataclasses import dataclass -from urllib.parse import urlparse, ParseResult -import time -import threading -import signal import os +import re +import signal +import subprocess import sys +import threading +import time +from contextlib import contextmanager +from dataclasses import dataclass +from urllib.parse import ParseResult, urlparse from coleo import Option, tooled - SLEEP = 0.01 _INIT = 0 _READY = 1 diff --git a/milabench/common.py b/milabench/common.py index 70789b212..35f9cf125 100644 --- a/milabench/common.py +++ b/milabench/common.py @@ -75,7 +75,7 @@ def arguments(): # Define capabilities capabilities: Option = "" - + return CommonArguments( config, system, @@ -91,7 +91,7 @@ def arguments(): def get_multipack(args = None, run_name=None, overrides={}): if args is None: args = arguments() - + override = [ o if re.match(pattern=r"[.\w]+=", string=o) else f"={o}" for o in args.override ] @@ -225,13 +225,14 @@ def _get_multipack( arch = deduce_arch() base_defaults = get_base_defaults( - base=args.base, - arch=arch, + base=args.base, + arch=arch, run_name=run_name ) system_config = build_system_config( args.system, defaults={"system": base_defaults["_defaults"]["system"]}, + gpu=True ) overrides = merge({"*": system_config}, overrides) diff --git a/milabench/config.py b/milabench/config.py index 14c8bf46e..bfee806e7 100644 --- a/milabench/config.py +++ b/milabench/config.py @@ -1,4 +1,5 @@ import contextvars +import os import socket import psutil @@ -174,16 +175,28 @@ def resolve_addresses(nodes): return self -def get_gpu_capacity(): - capacity = float(0) +def get_gpu_capacity(strict=False): + try: + capacity = 0 + + for k, v in get_gpu_info()["gpus"].items(): + capacity = min(v["memory"]["total"], capacity) - for k, v in get_gpu_info()["gpus"].items(): - capacity = min(v["memory"]["total"], capacity) + return capacity + except: + print("GPU not available, defaulting to 0 MiB") + if strict: + raise + return 0 - 
return capacity +def is_autoscale_enabled(): + return ( + os.getenv("MILABENCH_SIZER_AUTO", False) + or os.getenv("MILABENCH_SIZER_MULTIPLE") is not None + ) -def build_system_config(config_file, defaults=None): +def build_system_config(config_file, defaults=None, gpu=True): """Load the system configuration, verify its validity and resolve ip addresses Notes @@ -204,7 +217,8 @@ def build_system_config(config_file, defaults=None): system = config.get("system", {}) - if "gpu" not in system: + # capacity is only required if batch resizer is enabled + if (gpu or is_autoscale_enabled()) and "gpu" not in system: system["gpu"] = {"capacity": f"{int(get_gpu_capacity())} MiB"} if system.get("sshkey") is not None: diff --git a/milabench/log.py b/milabench/log.py index 5826d309b..a6f7388a9 100644 --- a/milabench/log.py +++ b/milabench/log.py @@ -300,9 +300,9 @@ def on_data(self, entry, data, row): load = int(data.get("load", 0) * 100) currm, totalm = data.get("memory", [0, 0]) temp = int(data.get("temperature", 0)) - row[ - f"gpu:{gpuid}" - ] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C" + row[f"gpu:{gpuid}"] = ( + f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C" + ) row["gpu_load"] = f"{load}%" row["gpu_mem"] = f"{currm:.0f}/{totalm:.0f} MB" row["gpu_temp"] = f"{temp}C" @@ -376,9 +376,9 @@ def on_data(self, entry, data, row): load = int(data.get("load", 0) * 100) currm, totalm = data.get("memory", [0, 0]) temp = int(data.get("temperature", 0)) - row[ - f"gpu:{gpuid}" - ] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C" + row[f"gpu:{gpuid}"] = ( + f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C" + ) else: task = data.pop("task", "") units = data.pop("units", "") diff --git a/milabench/merge.py b/milabench/merge.py index e5010c629..a9efa4cec 100644 --- a/milabench/merge.py +++ b/milabench/merge.py @@ -1,6 +1,5 @@ """Utilities to merge dictionaries and other data structures.""" - from collections import deque from functools import reduce 
from typing import Union diff --git a/milabench/scripts/vcs.py b/milabench/scripts/vcs.py index f1a8c4ddf..0f895f886 100644 --- a/milabench/scripts/vcs.py +++ b/milabench/scripts/vcs.py @@ -1,5 +1,6 @@ """Use to retrieve GIT version info, this file cannot import milabench modules as it is executed as part of the installation process""" + import os import subprocess import warnings diff --git a/milabench/sizer.py b/milabench/sizer.py index a2aa8b87b..4ce2a3f22 100644 --- a/milabench/sizer.py +++ b/milabench/sizer.py @@ -6,7 +6,7 @@ import numpy as np import yaml -from .config import system_global +from .config import is_autoscale_enabled, system_global from .validation.validation import ValidationLayer ROOT = os.path.dirname(__file__) @@ -14,13 +14,6 @@ default_scaling_config = os.path.join(ROOT, "..", "config", "scaling.yaml") -def is_autoscale_enabled(): - return ( - os.getenv("MILABENCH_SIZER_AUTO", False) - or os.getenv("MILABENCH_SIZER_MULTIPLE") is not None - ) - - def getenv(name, type): value = os.getenv(name) @@ -109,6 +102,9 @@ def get_capacity(self, capacity): def auto_size(self, benchmark, capacity): capacity = self.get_capacity(capacity) + if capacity is None: + return None + config = self.benchscaling(benchmark) data = list(sorted(config["model"].items(), key=lambda x: x[0])) @@ -182,7 +178,7 @@ def scale_argv(pack, argv): sizer = sizer_global.get() system = system_global.get() - capacity = system["gpu"]["capacity"] + capacity = system.get("gpu", dict()).get("capacity") return sizer.argv(pack, capacity, argv)