Skip to content

Commit

Permalink
Merge branch 'main' into from-cudnnex/per_device_handle
Browse files Browse the repository at this point in the history
  • Loading branch information
vedaanta authored Apr 1, 2024
2 parents b048b59 + 56bc01b commit c076acb
Show file tree
Hide file tree
Showing 14 changed files with 122 additions and 32 deletions.
4 changes: 3 additions & 1 deletion .azure/docker-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ jobs:
# CUDA 12.1
'cuda 12.1 | torch 2.2 | cudnn FE v1.2':
{CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.2.1', TRITON_VERSION: '2.2.0', CUDNN_FRONTEND: "1.2.1"}
'cuda 12.1 | torch 2.3 /nightly | cudnn FE v1.2':
'cuda 12.1 | torch 2.3 /test | cudnn FE v1.2':
{CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.3.0', TRITON_VERSION: '2.2.0', TORCH_INSTALL: 'test', CUDNN_FRONTEND: "1.2.1"}
'cuda 12.1 | torch 2.4 /nightly | cudnn FE v1.2':
{CUDA_VERSION: '12.1.1', TORCH_VERSION: 'main', TORCH_INSTALL: 'source', CUDNN_FRONTEND: "1.2.1"}
#'cuda 12.1': # this version - '8.9.5.29-1+cuda12.1' for 'libcudnn8' was not found
# how much time to give 'run always even if cancelled tasks' before stopping them
Expand Down
13 changes: 11 additions & 2 deletions .azure/gpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,20 @@ jobs:
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex'
CUDA_VERSION_MM: '121'
testing: 'distributed'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | regular':
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.3.0-apex'
CUDA_VERSION_MM: '121'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | distributed':
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.3.0-apex'
CUDA_VERSION_MM: '121'
testing: 'distributed'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular':
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex'
CUDA_VERSION_MM: '121'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | distributed':
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex'
CUDA_VERSION_MM: '121'
testing: 'distributed'
# how long to run the job before automatically cancelling
timeoutInMinutes: "35"
# how much time to give 'run always even if cancelled tasks' before stopping them
cancelTimeoutInMinutes: "2"
pool: "lit-rtx-3090"
Expand Down Expand Up @@ -89,6 +94,7 @@ jobs:
--timeout=240 \
--random-order-seed=42 \
--durations=250 \
--timeout=240 \
--numprocesses=9 \
--ignore=thunder/tests/distributed --ignore=thunder/tests/test_networks.py
# compile coverage results
Expand All @@ -98,6 +104,7 @@ jobs:
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure
condition: ne(variables['testing'], 'distributed')
timeoutInMinutes: "30"
displayName: 'Testing: regular'
- bash: |
Expand All @@ -117,6 +124,7 @@ jobs:
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,networks --name="GPU-coverage" --env=linux,azure
condition: ne(variables['testing'], 'distributed')
timeoutInMinutes: "15"
displayName: 'Testing: networks'
#- bash: |
Expand All @@ -138,6 +146,7 @@ jobs:
# ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
# --flags=gpu,pytest,distributed --name="GPU-coverage" --env=linux,azure
condition: eq(variables['testing'], 'distributed')
timeoutInMinutes: "20"
displayName: 'Testing: distributed'
# todo (mruberry): decide whether this should be here or in another workflow
Expand Down
7 changes: 5 additions & 2 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,8 @@
* @mruberry @lantiga @robieta @t-vi @carmocca

# CI/CD and configs
/.github/ @mruberry @lantiga @t-vi @carmocca
*.yml @mruberry @lantiga @t-vi @carmocca
/.azure/ @borda @lantiga @t-vi @carmocca
/.github/ @borda @lantiga @t-vi @carmocca
/dockers/ @borda @lantiga @t-vi @carmocca
Makefile @borda @lantiga @t-vi @carmocca
*.yml @borda @lantiga @t-vi @carmocca
12 changes: 0 additions & 12 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,9 @@ updates:
- package-ecosystem: "pip"
# Look for a `requirements` in the `root` directory
directory: "/"
# Check for updates once a week
schedule:
interval: "monthly"
# Labels on pull requests for version updates only
labels:
- "ci / tests"
pull-request-branch-name:
# Separate sections of the branch name with a hyphen
# for example, `dependabot-npm_and_yarn-next_js-acorn-6.4.1`
separator: "-"
# Allow up to 5 open pull requests for pip dependencies
open-pull-requests-limit: 5
Expand All @@ -25,15 +19,9 @@ updates:
# Enable version updates for GitHub Actions
- package-ecosystem: "github-actions"
directory: "/"
# Check for updates once a week
schedule:
interval: "monthly"
# Labels on pull requests for version updates only
labels:
- "ci / tests"
pull-request-branch-name:
# Separate sections of the branch name with a hyphen
# for example, `dependabot-npm_and_yarn-next_js-acorn-6.4.1`
separator: "-"
# Allow up to 5 open pull requests for GitHub Actions
open-pull-requests-limit: 5
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/ci-checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,19 @@ concurrency:

jobs:
precommit-run:
uses: Lightning-AI/utilities/.github/workflows/check-precommit.yml@v0.10.1
uses: Lightning-AI/utilities/.github/workflows/check-precommit.yml@v0.11.2
with:
python-version: "3.10"

check-schema:
uses: Lightning-AI/utilities/.github/workflows/check-schema.yml@v0.11.0
uses: Lightning-AI/utilities/.github/workflows/check-schema.yml@v0.11.2
with:
azure-dir: ".azure"

check-package:
uses: Lightning-AI/utilities/.github/workflows/check-package.yml@v0.11.0
uses: Lightning-AI/utilities/.github/workflows/check-package.yml@v0.11.2
with:
actions-ref: v0.11.0
actions-ref: v0.11.2
import-name: "thunder"
artifact-name: dist-packages-${{ github.sha }}
testing-matrix: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci-testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ jobs:
coverage xml
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4
with:
token: ${{ secrets.CODECOV_TOKEN }}
file: ./coverage.xml
Expand Down
47 changes: 47 additions & 0 deletions .github/workflows/release-nightly.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
name: Nightly packages

on:
pull_request: # on pull requests, only the steps before publishing are exercised
branches: [main, "release/*"]
types: [opened, reopened, ready_for_review, synchronize]
paths:
- ".github/workflows/release-nightly.yml"
schedule:
- cron: "0 0 * * 0" # on Sundays
workflow_dispatch: {}

defaults:
run:
shell: bash

jobs:
releasing-nightly:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.10"

- name: Install dependencies
run: python -m pip install --user --upgrade setuptools wheel
- name: Build
env:
CONVERT_VERSION2NIGHTLY: "1"
run: python setup.py sdist bdist_wheel

# We do this, since failures on test.pypi aren't that bad
- name: Publish to Test PyPI
if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release'
uses: pypa/gh-action-pypi-publish@v1.8.14
with:
user: __token__
password: ${{ secrets.test_pypi_password }}
repository_url: https://test.pypi.org/legacy/

- name: Publish distribution 📦 to PyPI
if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release'
uses: pypa/gh-action-pypi-publish@v1.8.14
with:
user: __token__
password: ${{ secrets.pypi_password }}
3 changes: 1 addition & 2 deletions .github/workflows/release-pypi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ on: # Trigger the workflow on push or pull request, but only for the main branc
# based on https://github.com/pypa/gh-action-pypi-publish

jobs:
build:
releasing-pypi:
runs-on: ubuntu-22.04

steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
Expand Down
9 changes: 8 additions & 1 deletion dockers/ubuntu-cuda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,14 @@ RUN \
cd .. && \
rm -rf Fuser ; \
elif [ "${TORCH_INSTALL}" == "test" ]; then \
echo "Not supported option" ; \
# building nvFuser from source
git clone https://github.com/NVIDIA/Fuser.git && \
cd Fuser && \
git submodule update --init --recursive && \
pip install -r requirements.txt && \
python setup.py install --no-test --no-benchmark && \
cd .. && \
rm -rf Fuser ; \
else \
# installing pytorch from wheels \
CUDA_VERSION_MM=${CUDA_VERSION%.*} && \
Expand Down
2 changes: 1 addition & 1 deletion requirements/docs.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
sphinx ==5.3.0
myst-parser ==1.0.0
nbsphinx ==0.9.3
ipython[all] ==8.22.2
ipython[all] ==8.23.0
pandoc ==2.3
docutils >=0.16
sphinxcontrib-fulltoc ==1.2.0
Expand Down
2 changes: 1 addition & 1 deletion requirements/notebooks.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
ipython[all] ==8.22.2
ipython[all] ==8.23.0

litgpt @ git+https://github.com/Lightning-AI/lit-gpt@940ffc96f7214bca24aa77479bc7c33900aaef28
8 changes: 4 additions & 4 deletions requirements/test.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
coverage ==7.4.3
pytest ==8.0.2
coverage ==7.4.4
pytest ==8.1.1
pytest-timeout ==2.2.0
pytest-cov ==4.1.0
pytest-xdist ==3.5.0
pytest-random-order ==1.1.1
pytest-timestamper ==0.0.9
pytest-timestamper ==0.0.10
graphviz ==0.20.1
fdm ==0.4.1
expecttest ==0.2.1 # for test_ddp.py
hypothesis ==6.99.10 # for test_ddp.py
hypothesis ==6.100.0 # for test_ddp.py
numpy # for test_ops.py
einops # for test_einops.py
litgpt @ git+https://github.com/Lightning-AI/lit-gpt@940ffc96f7214bca24aa77479bc7c33900aaef28
Expand Down
35 changes: 35 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python
import glob
import os
import re
from importlib.util import module_from_spec, spec_from_file_location
from pathlib import Path

Expand All @@ -10,6 +11,8 @@

_PATH_ROOT = os.path.dirname(__file__)
_PATH_REQUIRES = os.path.join(_PATH_ROOT, "requirements")
# check if os env. variable is set to convert version to nightly
_CONVERT_VERSION = int(os.environ.get("CONVERT_VERSION2NIGHTLY", 0))


def _load_py_module(fname, pkg="thunder"):
Expand All @@ -19,6 +22,35 @@ def _load_py_module(fname, pkg="thunder"):
return py


def convert_version2nightly(about_file: str = "thunder/__about__.py") -> None:
    """Rewrite ``__version__`` in ``about_file`` as a nightly (dev) version.

    The leading ``X.Y.Z`` part of the current version is kept, any suffix
    (e.g. ``dev`` or ``rc1``) is dropped, and ``.devYYYYMMDD`` built from the
    current local date is appended. The file is updated in place; all other
    lines are written back unchanged.

    Args:
        about_file: Path to the file that assigns ``__version__``.

    Raises:
        ValueError: If no line starts with ``__version__``, or if the assigned
            value does not start with an ``X.Y.Z`` version.
    """
    from datetime import datetime

    # load the about file
    with open(about_file, encoding="utf-8") as fo:
        lines = fo.readlines()
    # find the first line with the version assignment
    idx = next((i for i, ln in enumerate(lines) if ln.startswith("__version__")), None)
    if idx is None:
        raise ValueError("The version is not found in the `__about__.py` file.")
    # take everything right of the first `=`; accept either quote style
    version = lines[idx].partition("=")[2].strip().strip("\"'")
    # parse X.Y.Z and ignore any suffix
    vers = re.match(r"(\d+)\.(\d+)\.(\d+)", version)
    if vers is None:
        # fail with a clear message instead of an AttributeError on `None`
        raise ValueError(f"Cannot parse an X.Y.Z version from {lines[idx].strip()!r}.")
    # create timestamp YYYYMMDD
    timestamp = datetime.now().strftime("%Y%m%d")
    version = f"{'.'.join(vers.groups())}.dev{timestamp}"
    # replace the version line with the nightly version
    lines[idx] = f'__version__ = "{version}"\n'
    # dump updated lines
    with open(about_file, "w", encoding="utf-8") as fo:
        fo.writelines(lines)


def _load_requirements(path_dir: str, file_name: str = "requirements.txt") -> list:
    """Return the requirement strings listed in ``path_dir/file_name``.

    Each parsed requirement is converted to ``str``; entries whose string
    form contains ``@`` (e.g. ``pkg @ git+https://...``) are left out.

    Args:
        path_dir: Directory that holds the requirements file.
        file_name: Name of the requirements file inside ``path_dir``.

    Returns:
        List of requirement strings without the ``@`` entries.
    """
    # read through a context manager so the file handle is closed promptly
    with open(os.path.join(path_dir, file_name)) as fo:
        lines = fo.readlines()
    return [r for r in map(str, parse_requirements(lines)) if "@" not in r]
Expand Down Expand Up @@ -56,6 +88,9 @@ def _load_readme_description(path_dir: str, homepage: str, version: str) -> str:
return text


if _CONVERT_VERSION:
convert_version2nightly()

about = _load_py_module("__about__.py")

# https://packaging.python.org/discussions/install-requires-vs-requirements /
Expand Down
2 changes: 1 addition & 1 deletion thunder/__about__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.1.0"
__version__ = "0.2.0dev"
__author__ = "Lightning-AI et al"
__author_email__ = "community@lightning.ai"
__license__ = "Apache 2.0"
Expand Down

0 comments on commit c076acb

Please sign in to comment.