From bda550c7d4fae2d191d0ea58136c1ae82480fb13 Mon Sep 17 00:00:00 2001 From: Xiangru Lian Date: Thu, 17 Jun 2021 15:45:10 +0800 Subject: [PATCH] fix: merge bagua_install_library and setup.py, remove nccl<=2.6 support --- MANIFEST.in | 1 - bagua_install_library/__init__.py | 0 bagua_install_library/install_library.py | 161 ----------------------- setup.py | 138 +++++++++++++++++-- 4 files changed, 128 insertions(+), 172 deletions(-) delete mode 100644 bagua_install_library/__init__.py delete mode 100644 bagua_install_library/install_library.py diff --git a/MANIFEST.in b/MANIFEST.in index 15cefa8..2d5a52d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -11,5 +11,4 @@ recursive-include bagua-core-py/src * include bagua-core-c/Cargo.toml include bagua-core-c/build.rs recursive-include bagua-core-c/src * -recursive-include bagua_install_library * recursive-include python * diff --git a/bagua_install_library/__init__.py b/bagua_install_library/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bagua_install_library/install_library.py b/bagua_install_library/install_library.py deleted file mode 100644 index 114c902..0000000 --- a/bagua_install_library/install_library.py +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env python - -""" -Installs the latest library for bagua. Modifed from https://github.com/kmaehashi/cupy/blob/8f431ca7791deb0893fe2e79850705fc63d23266/cupyx/tools/install_library.py -""" - -import argparse -import os -import platform -import shutil -import sys -import tempfile -import urllib.request -from tqdm import tqdm - - -_nccl_records = [] -library_records = {} - - -class DownloadProgressBar(tqdm): - def update_to(self, b=1, bsize=1, tsize=None): - if tsize is not None: - self.total = tsize - self.update(b * bsize - self.n) - - -def download_url(url, output_path): - with DownloadProgressBar(unit='B', unit_scale=True, - miniters=1, desc=url.split('/')[-1]) as t: - urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to) - - -def _make_nccl_url(public_version, filename): - # https://developer.download.nvidia.com/compute/redist/nccl/v2.8/nccl_2.8.4-1+cuda11.2_x86_64.txz - return ( - "https://developer.download.nvidia.com/compute/redist/nccl/" - + "v{}/{}".format(public_version, filename) - ) - - -def _make_nccl_record(cuda_version, full_version, public_version, filename_linux): - return { - "cuda": cuda_version, - "nccl": full_version, - "assets": { - "Linux": { - "url": _make_nccl_url(public_version, filename_linux), - "filename": "libnccl.so.{}".format(full_version), - }, - }, - } - - -_nccl_records.append( - _make_nccl_record("11.3", "2.9.8", "2.9", "nccl_2.9.8-1+cuda11.3_x86_64.txz") -) -_nccl_records.append( - _make_nccl_record("11.2", "2.8.4", "2.8", "nccl_2.8.4-1+cuda11.2_x86_64.txz") -) -_nccl_records.append( - _make_nccl_record("11.1", "2.8.4", "2.8", "nccl_2.8.4-1+cuda11.1_x86_64.txz") -) -_nccl_records.append( - _make_nccl_record("11.0", "2.9.8", "2.9", "nccl_2.9.8-1+cuda11.0_x86_64.txz") -) -_nccl_records.append( - _make_nccl_record("10.2", "2.9.8", "2.9", "nccl_2.9.8-1+cuda10.2_x86_64.txz") -) -_nccl_records.append( - _make_nccl_record("10.1", "2.8.3", "2.8", "nccl_2.8.3-1+cuda10.1_x86_64.txz") -) -_nccl_records.append( - _make_nccl_record("10.0", "2.6.4", "2.6", "nccl_2.6.4-1+cuda10.0_x86_64.txz") -) -_nccl_records.append( - _make_nccl_record("9.2", "2.4.8", "2.4", "nccl_2.4.8-1+cuda9.2_x86_64.txz") -) -library_records["nccl"] = _nccl_records - - -def install_lib(cuda, prefix, library): - record = None - lib_records = library_records - for record in lib_records[library]: - if record["cuda"] == cuda: - break - else: - raise RuntimeError( - """ -The CUDA version({}) specified is not supported. -Should be one of {}.""".format( - cuda, str([x["cuda"] for x in lib_records[library]]) - ) - ) - if prefix is None: - prefix = os.path.expanduser("~/.bagua_core/cuda_lib") - destination = calculate_destination(prefix, cuda, library, record[library]) - - if os.path.exists(destination): - print("The destination directory {} already exists.".format(destination)) - shutil.rmtree(destination) - - target_platform = platform.system() - asset = record["assets"].get(target_platform, None) - if asset is None: - raise RuntimeError( - """ -The current platform ({}) is not supported.""".format( - target_platform - ) - ) - - print( - "Installing {} {} for CUDA {} to: {}".format( - library, record[library], record["cuda"], destination - ) - ) - - url = asset["url"] - print("Downloading {}...".format(url)) - with tempfile.TemporaryDirectory() as tmpdir: - filename = os.path.join(tmpdir, os.path.basename(url)) - download_url(url, filename) - print("Extracting...") - outdir = os.path.join(tmpdir, "extract") - shutil.unpack_archive(filename, outdir) - print("Installing...") - if library == "nccl": - subdir = os.listdir(outdir) - assert len(subdir) == 1 - shutil.move(os.path.join(outdir, subdir[0]), destination) - else: - assert False - print("Cleaning up...") - print("Done!") - - -def calculate_destination(prefix, cuda, lib, lib_ver): - """Calculates the installation directory.""" - return os.path.join(prefix, ".data") - - -def main(args): - parser = argparse.ArgumentParser() - - parser.add_argument( - "--library", choices=["nccl"], required=True, help="Library to install" - ) - parser.add_argument("--cuda", type=str, required=True, help="CUDA version") - parser.add_argument("--prefix", type=str, default=None, help="Install destination") - params = parser.parse_args(args) - - if params.prefix is not None: - params.prefix = os.path.abspath(params.prefix) - - install_lib(params.cuda, params.prefix, params.library) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/setup.py b/setup.py index 9c32a6a..404c46a 100644 --- a/setup.py +++ b/setup.py @@ -5,6 +5,133 @@ from setuptools import setup, find_packages from setuptools_rust import Binding, RustExtension import sys +import platform +import shutil +import sys +import tempfile +import urllib.request +from tqdm import tqdm + + +_nccl_records = [] +library_records = {} + + +class DownloadProgressBar(tqdm): + def update_to(self, b=1, bsize=1, tsize=None): + if tsize is not None: + self.total = tsize + self.update(b * bsize - self.n) + + +def download_url(url, output_path): + with DownloadProgressBar(unit='B', unit_scale=True, + miniters=1, desc=url.split('/')[-1]) as t: + urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to) + + +def _make_nccl_url(public_version, filename): + # https://developer.download.nvidia.com/compute/redist/nccl/v2.8/nccl_2.8.4-1+cuda11.2_x86_64.txz + return ( + "https://developer.download.nvidia.com/compute/redist/nccl/" + + "v{}/{}".format(public_version, filename) + ) + + +def _make_nccl_record(cuda_version, full_version, public_version, filename_linux): + return { + "cuda": cuda_version, + "nccl": full_version, + "assets": { + "Linux": { + "url": _make_nccl_url(public_version, filename_linux), + "filename": "libnccl.so.{}".format(full_version), + }, + }, + } + + +_nccl_records.append( + _make_nccl_record("11.3", "2.9.8", "2.9", "nccl_2.9.8-1+cuda11.3_x86_64.txz") +) +_nccl_records.append( + _make_nccl_record("11.2", "2.8.4", "2.8", "nccl_2.8.4-1+cuda11.2_x86_64.txz") +) +_nccl_records.append( + _make_nccl_record("11.1", "2.8.4", "2.8", "nccl_2.8.4-1+cuda11.1_x86_64.txz") +) +_nccl_records.append( + _make_nccl_record("11.0", "2.9.8", "2.9", "nccl_2.9.8-1+cuda11.0_x86_64.txz") +) +_nccl_records.append( + _make_nccl_record("10.2", "2.9.8", "2.9", "nccl_2.9.8-1+cuda10.2_x86_64.txz") +) +_nccl_records.append( + _make_nccl_record("10.1", "2.8.3", "2.8", "nccl_2.8.3-1+cuda10.1_x86_64.txz") +) +library_records["nccl"] = _nccl_records + + +def install_lib(cuda, prefix, library): + record = None + lib_records = library_records + for record in lib_records[library]: + if record["cuda"] == cuda: + break + else: + raise RuntimeError( + """ +The CUDA version({}) specified is not supported. +Should be one of {}.""".format( + cuda, str([x["cuda"] for x in lib_records[library]]) + ) + ) + if prefix is None: + prefix = os.path.expanduser("~/.bagua_core/cuda_lib") + destination = calculate_destination(prefix, cuda, library, record[library]) + + if os.path.exists(destination): + print("The destination directory {} already exists.".format(destination)) + shutil.rmtree(destination) + + target_platform = platform.system() + asset = record["assets"].get(target_platform, None) + if asset is None: + raise RuntimeError( + """ +The current platform ({}) is not supported.""".format( + target_platform + ) + ) + + print( + "Installing {} {} for CUDA {} to: {}".format( + library, record[library], record["cuda"], destination + ) + ) + + url = asset["url"] + print("Downloading {}...".format(url)) + with tempfile.TemporaryDirectory() as tmpdir: + filename = os.path.join(tmpdir, os.path.basename(url)) + download_url(url, filename) + print("Extracting...") + outdir = os.path.join(tmpdir, "extract") + shutil.unpack_archive(filename, outdir) + print("Installing...") + if library == "nccl": + subdir = os.listdir(outdir) + assert len(subdir) == 1 + shutil.move(os.path.join(outdir, subdir[0]), destination) + else: + assert False + print("Cleaning up...") + print("Done!") + + +def calculate_destination(prefix, cuda, lib, lib_ver): + """Calculates the installation directory.""" + return os.path.join(prefix, ".data") def check_torch_version(): @@ -23,7 +150,6 @@ def check_torch_version(): def install_dependency_library(): - from bagua_install_library import install_library nvcc_version = ( os.popen( "nvcc --version | grep release | sed 's/.*release //' | sed 's/,.*//'" @@ -32,15 +158,7 @@ def install_dependency_library(): .strip() ) print("nvcc_version: ", nvcc_version) - args = [ - "--library", - "nccl", - "--cuda", - nvcc_version, - "--prefix", - os.path.join(cwd, "python/bagua_core"), - ] - install_library.main(args) + install_lib(nvcc_version, os.path.join(cwd, "python/bagua_core"), "nccl") if __name__ == "__main__":