Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into enable_nvfuser_cat
Browse files Browse the repository at this point in the history
  • Loading branch information
jacobhinkle committed Mar 21, 2024
2 parents 1e8e7d2 + f0e57ed commit a93b35e
Show file tree
Hide file tree
Showing 126 changed files with 7,719 additions and 10,762 deletions.
20 changes: 13 additions & 7 deletions .azure/docker-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,21 +40,24 @@ jobs:
#maxParallel: "3"
matrix:
# CUDA 12.1
'cuda 12.1 | torch 2.2':
{CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.2.1', TRITON_VERSION: '2.2.0'}
'cuda 12.1 | torch 2.3 /nightly':
{CUDA_VERSION: '12.1.1', TORCH_VERSION: 'main', TORCH_INSTALL: 'source'}
'cuda 12.1 | torch 2.2 | cudnn FE v1.1': # todo: drop updating this image when CI transitions to a newer FE version
{CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.2.1', TRITON_VERSION: '2.2.0', CUDNN_FRONTEND: "1.1.0"}
'cuda 12.1 | torch 2.2 | cudnn FE v1.2':
{CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.2.1', TRITON_VERSION: '2.2.0', CUDNN_FRONTEND: "1.2.0"}
'cuda 12.1 | torch 2.3 /nightly | cudnn FE v1.1': # todo: drop updating this image when CI transitions to a newer FE version
{CUDA_VERSION: '12.1.1', TORCH_VERSION: 'main', TORCH_INSTALL: 'source', CUDNN_FRONTEND: "1.1.0"}
'cuda 12.1 | torch 2.3 /nightly | cudnn FE v1.2':
{CUDA_VERSION: '12.1.1', TORCH_VERSION: 'main', TORCH_INSTALL: 'source', CUDNN_FRONTEND: "1.2.0"}
#'cuda 12.1': # this version - '8.9.5.29-1+cuda12.1' for 'libcudnn8' was not found
# how long to run the job before automatically cancelling
timeoutInMinutes: "95"
# how much time to give 'run always even if cancelled tasks' before stopping them
cancelTimeoutInMinutes: "2"
timeoutInMinutes: "95"
variables:
UBUNTU_VERSION: '22.04'
PYTHON_VERSION: '3.10'
imageRepository: 'pytorchlightning/lightning-thunder'
dockerfilePath: 'dockers/ubuntu-cuda/Dockerfile'
imageTag: 'ubuntu$(UBUNTU_VERSION)-cuda$(CUDA_VERSION)-py$(PYTHON_VERSION)-pt_${TORCH_VERSION/v/}'
imageTag: 'ubuntu$(UBUNTU_VERSION)-cuda$(CUDA_VERSION)-cudnn-fe$(CUDNN_FRONTEND)-py$(PYTHON_VERSION)-pt_${TORCH_VERSION/v/}'
pool: 'lit-rtx-3090'
workspace:
clean: all
Expand All @@ -74,11 +77,13 @@ jobs:
-f $(dockerfilePath) \
--build-arg UBUNTU_VERSION="$(UBUNTU_VERSION)" \
--build-arg CUDA_VERSION="$(CUDA_VERSION)" \
--build-arg CUDNN_FRONTEND_CHECKOUT="v$(CUDNN_FRONTEND)" \
--build-arg PYTHON_VERSION="$(PYTHON_VERSION)" \
--build-arg TORCH_VERSION="$(TORCH_VERSION)" \
--build-arg TRITON_VERSION="$(TRITON_VERSION)" \
--build-arg TORCH_INSTALL="$(TORCH_INSTALL)" \
. --no-cache
timeoutInMinutes: "95"
displayName: 'Build base image'
- bash: |
Expand All @@ -98,6 +103,7 @@ jobs:
echo $(DOCKERHUB_PAT) | docker login --username $(DOCKERHUB_USER) --password-stdin
docker push $(imageRepository):$(imageTag)
condition: ne(variables['Build.Reason'], 'PullRequest')
timeoutInMinutes: "35"
displayName: 'Push base image'
#- task: Docker@1
Expand Down
10 changes: 5 additions & 5 deletions .azure/gpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,17 @@ jobs:
matrix:
# CUDA 12.1
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | regular':
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-py3.10-pt_2.2.1'
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-cudnn-fe1.1.0-py3.10-pt_2.2.1'
CUDA_VERSION_MM: '121'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | distributed':
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-py3.10-pt_2.2.1'
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-cudnn-fe1.1.0-py3.10-pt_2.2.1'
CUDA_VERSION_MM: '121'
testing: 'distributed'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular':
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-py3.10-pt_main'
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-cudnn-fe1.2.0-py3.10-pt_main'
CUDA_VERSION_MM: '121'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | distributed':
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-py3.10-pt_main'
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-cudnn-fe1.2.0-py3.10-pt_main'
CUDA_VERSION_MM: '121'
testing: 'distributed'
# how long to run the job before automatically cancelling
Expand Down Expand Up @@ -111,7 +111,7 @@ jobs:
condition: eq(variables['testing'], 'distributed')
displayName: 'Testing: distributed'
# todo for Mike as he promised some time ago already... or shall it be another workflow to keep time low?
# todo (mruberry): decide whether this should be here or in another workflow
#- bash: |
# python benchmarks/ops_benchmark.py nanogpt-gelu
# python benchmarks/nvfuser_benchmarks.py nanogpt-mlp -x thunder
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/ci-checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@ jobs:
# actions-ref: main

check-schema:
uses: Lightning-AI/utilities/.github/workflows/check-schema.yml@v0.10.1
uses: Lightning-AI/utilities/.github/workflows/check-schema.yml@v0.11.0
with:
azure-dir: ".azure"

check-package:
uses: Lightning-AI/utilities/.github/workflows/check-package.yml@v0.10.1
uses: Lightning-AI/utilities/.github/workflows/check-package.yml@v0.11.0
with:
actions-ref: v0.10.1
actions-ref: v0.11.0
import-name: "thunder"
artifact-name: dist-packages-${{ github.sha }}
testing-matrix: |
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/ci-testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ jobs:
- name: Install package & dependencies
run: |
pip --version
pip install -e '.[test]' -U \
pip install -e . -U \
-r requirements/test.txt \
--find-links=${TORCH_URL} ${PIP_EXTRA_FLAG}
pip list
shell: bash
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/docs-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ defaults:

jobs:
build-docs:
uses: Lightning-AI/utilities/.github/workflows/check-docs.yml@v0.10.1
uses: Lightning-AI/utilities/.github/workflows/check-docs.yml@v0.11.0
with:
python-version: "3.10"
requirements-file: "requirements/docs.txt"
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/release-pypi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,15 @@ jobs:
# We do this, since failures on test.pypi aren't that bad
- name: Publish to Test PyPI
if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release'
uses: pypa/gh-action-pypi-publish@v1.8.12
uses: pypa/gh-action-pypi-publish@v1.8.14
with:
user: __token__
password: ${{ secrets.test_pypi_password }}
repository_url: https://test.pypi.org/legacy/

- name: Publish distribution 📦 to PyPI
if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release'
uses: pypa/gh-action-pypi-publish@v1.8.12
uses: pypa/gh-action-pypi-publish@v1.8.14
with:
user: __token__
password: ${{ secrets.pypi_password }}
116 changes: 87 additions & 29 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,31 +1,94 @@
<div align="center">
<img alt="Thunder" src="docs/source/_static/images/lightning_thunder_lightmode_nobyline.png" width="400px" style="max-width: 100%;">
<br/>
<br/>

**Make PyTorch models Lightning fast.**

______________________________________________________________________

<p align="center">
<a href="https://lightning.ai/">Lightning.ai</a> •
<a href="#performance">Performance</a> •
<a href="#get-started">Get started</a> •
<a href="#install-thunder">Install</a> •
<a href="#hello-world">Examples</a> •
<a href="#features">Features</a> •
<a href="#documentation">Documentation</a> •
</p>

[![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/Lightning-AI/lightning-thunder/blob/main/LICENSE)
[![CI testing](https://github.com/Lightning-AI/lightning-thunder/actions/workflows/ci-testing.yml/badge.svg?event=push)](https://github.com/Lightning-AI/lightning-thunder/actions/workflows/ci-testing.yml)
[![General checks](https://github.com/Lightning-AI/lightning-thunder/actions/workflows/ci-checks.yml/badge.svg?event=push)](https://github.com/Lightning-AI/lightning-thunder/actions/workflows/ci-checks.yml)
[![Documentation Status](https://readthedocs.org/projects/lightning-thunder/badge/?version=latest)](https://lightning-thunder.readthedocs.io/en/latest/?badge=latest)
[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/Lightning-AI/lightning-thunder/main.svg)](https://results.pre-commit.ci/latest/github/Lightning-AI/lightning-thunder/main)

</div>

# Welcome to ⚡ Lightning Thunder

Lightning Thunder is a deep learning compiler for PyTorch. It makes PyTorch programs faster both on single accelerators and in distributed settings.
**Thunder makes PyTorch models Lightning fast.**

Thunder is a source-to-source compiler for PyTorch. It makes PyTorch programs faster by combining and using different hardware executors at once (e.g., nvFuser, torch.compile, cuDNN, and TransformerEngine FP8).

Works on single accelerators and in multi-GPU settings.
Thunder aims to be usable, understandable, and extensible.

## Performance

Thunder can achieve significant speedups over standard PyTorch eager code, through the compounding effects of optimizations and the use of best-in-class executors. Here is an example of the pretraining throughput for Llama 2 7B as implemented in [LitGPT](https://github.com/Lightning-AI/litgpt).

<div align="center">
<img alt="Thunder" src="docs/source/_static/images/training_throughput_single.png" width="800px" style="max-width: 100%;">
</div>

Thunder achieves a 40% speedup in training throughput compared to eager code on H100 using a combination of executors including nvFuser, torch.compile, cuDNN, and TransformerEngine FP8.

Thunder supports distributed strategies like DDP and FSDP (ZeRO2 and ZeRO3). Here is the normalized throughput measured for Llama 2 7B (this time without FP8 mixed precision; support for FSDP is underway).

The main goal for Lightning Thunder is to allow optimizing user programs in the most extensible and expressive way possible.
<div align="center">
<img alt="Thunder" src="docs/source/_static/images/normalized_training_throughput_zero2.png" width="800px" style="max-width: 100%;">
</div>

**NOTE: Lightning Thunder is alpha and not ready for production runs.** Feel free to get involved, but expect a few bumps along the way.
**NOTE: Lightning Thunder is alpha.** Feel free to get involved, but expect a few bumps along the way.

## Get started

Try Thunder without installing by using our [Zero to Thunder Tutorial Studio](https://lightning.ai/lightning-ai/studios/zero-to-thunder-tutorial).

## Install Thunder

Install the nvFuser nightly, which will also install the matching PyTorch nightly:
Install the [nvFuser](https://github.com/NVIDIA/Fuser) nightly and Thunder together:

```bash
pip install --pre "nvfuser-cu121[torch]" --extra-index-url https://pypi.nvidia.com
# install nvFuser which installs the matching nightly PyTorch
pip install --pre 'nvfuser-cu121[torch]' --extra-index-url https://pypi.nvidia.com

# install thunder
pip install lightning-thunder
```

Install Thunder:
<details>
<summary>Advanced install options</summary>
<!-- following section will be skipped from PyPI description -->

### Install from main

```bash
pip install git+https://github.com/Lightning-AI/lightning-thunder.git
```

or install from the local repo:
### Install to tinker and contribute

Install this way to tinker with the internals and contribute:

```bash
pip install .
pip install -e .
```

</details>
<!-- end skipping PyPI description -->

## Hello World

Here is a simple example of how Thunder lets you compile and run PyTorch code:
Expand Down Expand Up @@ -56,11 +119,11 @@ print(result)

The compiled function `jfoo` takes and returns PyTorch tensors, just like the original function, so modules and functions compiled by Thunder can be used as part of larger PyTorch programs.

## Running training
## Train models

Thunder is in its early stages, it should not be used for production runs yet.
Thunder is in its early stages and should not be used for production runs yet.

However, it can already deliver outstanding performance on models supported by [LitGPT](https://github.com/Lightning-AI/lit-gpt), such as Mistral, Llama2, Gemma, Falcon, and derivatives.
However, it can already deliver outstanding performance on LLM models supported by [LitGPT](https://github.com/Lightning-AI/lit-gpt), such as Mistral, Llama 2, Gemma, Falcon, and others.

Run training loop for Llama, single-GPU:

Expand All @@ -76,25 +139,25 @@ python examples/lit-gpt/train_fsdp.py

See [README.md](examples/lit-gpt/README.md) for details on running LitGPT with Thunder.

## What's in the box
## Features

Given a program, Thunder can generate an optimized program that:
Given a Python callable or PyTorch module, Thunder can generate an optimized program that:

- computes its forward and backward passes
- coalesces operations into efficient fusion regions
- dispatches computations to optimized kernels
- distributes computations optimally across machines
- Computes its forward and backward passes
- Coalesces operations into efficient fusion regions
- Dispatches computations to optimized kernels
- Distributes computations optimally across machines

To do so, Thunder ships with:

- a JIT for acquiring Python programs targeting PyTorch and custom operations
- a multi-level IR to represent them as a trace of a reduced op-set
- an extensible set of transformations on the trace, such as `grad`, fusions, distributed (like `ddp`, `fsdp`), functional (like `vmap`, `vjp`, `jvp`)
- a way to dispatch operations to an extensible collection of executors
- A JIT for acquiring Python programs targeting PyTorch and custom operations
- A multi-level IR to represent operations as a trace of a reduced op-set
- An extensible set of transformations on the trace, such as `grad`, fusions, distributed (like `ddp`, `fsdp`), functional (like `vmap`, `vjp`, `jvp`)
- A way to dispatch operations to an extensible collection of executors

Thunder is written entirely in Python. Even its trace is represented as valid Python at all stages of transformation. This allows unprecedented levels of introspection and extensibility.

Thunder doesn't generate device code. It acquires and transforms user programs so that it's possible to optimally select or generate device code using fast executors like:
Thunder doesn't generate code for accelerators directly. It acquires and transforms user programs so that it's possible to optimally select or generate device code using fast executors like:

- [torch.compile](https://pytorch.org/get-started/pytorch-2.0/)
- [nvFuser](https://github.com/NVIDIA/Fuser)
Expand All @@ -106,7 +169,7 @@ Thunder doesn't generate device code. It acquires and transforms user programs s

Modules and functions compiled with Thunder fully interoperate with vanilla PyTorch and support PyTorch's autograd. Also, Thunder works alongside torch.compile to leverage its state-of-the-art optimizations.

## Build the documentation
## Documentation

Docs are currently not hosted publicly. However, you can build them locally really quickly:

Expand Down Expand Up @@ -141,9 +204,4 @@ Thunder is very thoroughly tested, so expect this to take a while.
## License

Lightning Thunder is released under the [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) license.
See LICENSE file for details.

[![CI testing](https://github.com/Lightning-AI/lightning-thunder/actions/workflows/ci-testing.yml/badge.svg?event=push)](https://github.com/Lightning-AI/lightning-thunder/actions/workflows/ci-testing.yml)
[![General checks](https://github.com/Lightning-AI/lightning-thunder/actions/workflows/ci-checks.yml/badge.svg?event=push)](https://github.com/Lightning-AI/lightning-thunder/actions/workflows/ci-checks.yml)
[![Documentation Status](https://readthedocs.org/projects/lightning-thunder/badge/?version=latest)](https://lightning-thunder.readthedocs.io/en/latest/?badge=latest)
[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/Lightning-AI/lightning-thunder/main.svg?badge_token=mqheL1-cTn-280Vx4cJUdg)](https://results.pre-commit.ci/latest/github/Lightning-AI/lightning-thunder/main?badge_token=mqheL1-cTn-280Vx4cJUdg)
See the [LICENSE](LICENSE) file for details.
2 changes: 1 addition & 1 deletion dockers/ubuntu-cuda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ RUN \
RUN \
echo "CUDA_VERSION=$CUDA_VERSION ; CUDNN_VERSION=$CUDNN_VERSION " && \
CUDA_VERSION_MM=${CUDA_VERSION%.*} && \
# there is missing cudnn for 12.1 so use 12.2 instead
# There are some test failures from cuDNN 12.1, so 'upgrade' requests for 12.1 to 12.2.
CUDA_VERSION_MM="${CUDA_VERSION_MM/12.1/12.2}" && \
CUDNN_BASE_VER=${CUDNN_VERSION%%.*} && \
CUDNN_PACKAGE_VER="${CUDNN_VERSION}+cuda${CUDA_VERSION_MM}" && \
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 3 additions & 3 deletions docs/source/advanced/inside_thunder.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ Bytecode interpretation

Thunder's interpreter works by:

1. disassembling the PyTorch module or function into CPython bytecode
2. interpreting the bytecode using an extended Python interpreter
3. generating a sequential trace of operations on tensors and numbers
1. Disassembling the PyTorch module or function into CPython bytecode
2. Interpreting the bytecode using an extended Python interpreter
3. Generating a sequential trace of operations on tensors and numbers

Representing Operations
=======================
Expand Down
3 changes: 2 additions & 1 deletion docs/source/basic/mlp_mnist.rst
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ Here's the code::
# The training model has both "forward" and "backward" traces, corresponding
# to its forward and backward computations.
# The evaluation model has only one set of traces.
fwd_traces, bwd_traces = thunder.last_traces(jitted_train_model)
fwd_traces = thunder.last_traces(jitted_train_model)
bwd_traces = thunder.last_backward_traces(jitted_train_model)
eval_traces = thunder.last_traces(jitted_eval_model)

print("This is the trace that thunder executed for training's forward computation:")
Expand Down
Loading

0 comments on commit a93b35e

Please sign in to comment.