Merge branch 'main' into ci/torch-2.3
Borda authored Mar 27, 2024
2 parents 996181c + 1222864 commit 5275c6a
Showing 10 changed files with 77 additions and 29 deletions.
6 changes: 4 additions & 2 deletions .azure/docker-build.yml
@@ -53,9 +53,10 @@ jobs:
variables:
UBUNTU_VERSION: '22.04'
PYTHON_VERSION: '3.10'
APEX_CHECKOUT: 'master'
imageRepository: 'pytorchlightning/lightning-thunder'
dockerfilePath: 'dockers/ubuntu-cuda/Dockerfile'
imageTag: 'ubuntu$(UBUNTU_VERSION)-cuda$(CUDA_VERSION)-cudnn-fe$(CUDNN_FRONTEND)-py$(PYTHON_VERSION)-pt_${TORCH_VERSION/v/}'
imageTag: 'ubuntu$(UBUNTU_VERSION)-cuda$(CUDA_VERSION)-cudnn-fe$(CUDNN_FRONTEND)-py$(PYTHON_VERSION)-pt_${TORCH_VERSION/v/}-apex'
pool: 'lit-rtx-3090'
workspace:
clean: all
@@ -78,8 +79,9 @@ jobs:
--build-arg CUDNN_FRONTEND_CHECKOUT="v$(CUDNN_FRONTEND)" \
--build-arg PYTHON_VERSION="$(PYTHON_VERSION)" \
--build-arg TORCH_VERSION="$(TORCH_VERSION)" \
--build-arg TRITON_VERSION="$(TRITON_VERSION)" \
--build-arg TORCH_INSTALL="$(TORCH_INSTALL)" \
--build-arg TRITON_VERSION="$(TRITON_VERSION)" \
--build-arg APEX_CHECKOUT="$(APEX_CHECKOUT)" \
. --no-cache
timeoutInMinutes: "95"
displayName: 'Build base image'
15 changes: 9 additions & 6 deletions .azure/gpu-tests.yml
@@ -17,10 +17,10 @@ jobs:
matrix:
# CUDA 12.1
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | regular':
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1'
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex'
CUDA_VERSION_MM: '121'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | distributed':
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1'
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex'
CUDA_VERSION_MM: '121'
testing: 'distributed'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | regular':
@@ -31,10 +31,10 @@
CUDA_VERSION_MM: '121'
testing: 'distributed'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular':
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main'
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex'
CUDA_VERSION_MM: '121'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | distributed':
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main'
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex'
CUDA_VERSION_MM: '121'
testing: 'distributed'
# how much time to give 'run always even if cancelled tasks' before stopping them
@@ -47,7 +47,7 @@
PYTHONHASHSEED: "0"
CI: "true"
container:
image: "$(docker-image)"
image: "pytorchlightning/lightning-thunder:$(docker-image)"
options: "--gpus=all --shm-size=16g -v /var/tmp:/var/tmp"
workspace:
clean: all
@@ -90,6 +90,7 @@ jobs:
pytest thunder/tests/ \
-m "not standalone" \
-v --datefmt="%Y%m%d-%H:%M:%S.%f" \
--timeout=240 \
--random-order-seed=42 \
--durations=250 \
--timeout=240 \
@@ -111,7 +112,9 @@ jobs:
pytest \
thunder/tests/test_networks.py \
-m "not standalone" \
-v --random-order-seed=42 --durations=0 --numprocesses=3
-v --durations=0 \
--random-order-seed=42 \
--numprocesses=3
# compile coverage results
python -m coverage report
python -m coverage xml
6 changes: 3 additions & 3 deletions .azure/notebook-runs.yml
@@ -16,10 +16,10 @@ jobs:
strategy:
matrix:
'ubuntu22.04 | cuda 12.1 | torch 2.2':
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-py3.10-pt_2.2.1'
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.1.0-py3.10-pt_2.2.1-apex'
CUDA_VERSION_MM: '121'
'ubuntu22.04 | cuda 12.1 | torch-nightly':
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-py3.10-pt_main'
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.0-py3.10-pt_main-apex'
CUDA_VERSION_MM: '121'
# how long to run the job before automatically cancelling
timeoutInMinutes: "45"
@@ -31,7 +31,7 @@
TORCH_HOME: "/var/tmp/torch"
PIP_CACHE_DIR: "/var/tmp/pip"
container:
image: "$(docker-image)"
image: "pytorchlightning/lightning-thunder:$(docker-image)"
options: "--gpus=all --shm-size=16g -v /var/tmp:/var/tmp"
workspace:
clean: all
18 changes: 18 additions & 0 deletions dockers/ubuntu-cuda/Dockerfile
@@ -25,6 +25,7 @@ ARG PYTHON_VERSION="3.10"
ARG TORCH_VERSION="2.2.1"
ARG TRITON_VERSION="2.2.0"
ARG TORCH_INSTALL="stable"
ARG APEX_CHECKOUT="master"

SHELL ["/bin/bash", "-c"]
# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/
@@ -98,6 +99,7 @@ ENV \
CUDA_SELECT_NVCC_ARCH_FLAGS="8.0"

ARG TORCH_INSTALL
ENV TORCH_USE_CUDA_DSA=1

RUN \
if [ "${TORCH_INSTALL}" == "source" ]; then \
@@ -151,6 +153,22 @@ RUN \
pip install -U "nvfuser-cu${CUDA_VERSION_MM/./}-torch${TORCH_VERSION_MM/./}" ; \
fi

RUN \
# building Apex from source
pip install "pip>=23.1" packaging && \
git clone https://github.com/NVIDIA/apex && \
cd apex && \
git checkout ${APEX_CHECKOUT} && \
# https://github.com/NVIDIA/apex#linux
pip install -v \
--disable-pip-version-check \
--no-cache-dir \
--no-build-isolation \
--config-settings "--build-option=--xentropy" \
. && \
cd .. && \
rm -rf apex

RUN \
ls -lh requirements/ && \
CUDA_VERSION_MM=${CUDA_VERSION%.*} && \
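The new RUN block above builds Apex from source with the `--xentropy` build option. As a quick sanity check that the extension actually compiled, a minimal sketch in Python — this assumes the `--xentropy` option produces the `xentropy_cuda` module that thunder's Apex executor imports, and it would be run inside the built image:

    # check_apex.py -- hypothetical sanity check for the Apex build above;
    # assumes --xentropy produced the xentropy_cuda extension module
    import xentropy_cuda  # raises ImportError if the Apex build step failed

    print("Apex cross-entropy extension is available")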
2 changes: 0 additions & 2 deletions notebooks/.ignore.ci
@@ -1,2 +0,0 @@
adding_custom_operator_backward.ipynb
dev_tutorials/extend.ipynb
14 changes: 10 additions & 4 deletions thunder/core/jit_ext.py
@@ -912,12 +912,18 @@ def is_from_torch(fn):
# Torch functions have __name__ defined
fn_name = f"{fn.__module__}.{fn.__name__}"

# For now, only torch-like opaque functions are sharp edges
return _general_jit_sharp_edge(
f"Trying to call function {fn_name}, but it's unsupported. Please file an issue requesting support.",
None,
# Probably merge with sharp edges
calling_opaque_torch_msg = (
f"Trying to call function {fn_name}, but it is not yet supported. "
"Please file an issue requesting support. "
"To find out which operations are not yet recongnized by `thunder.jit`, "
"please run `examine` as per:\n\n"
"from thunder.examine import examine\n"
"examine(<your thunder.jit callable argument>, ...)\n"
)

return do_raise(NotImplementedError(calling_opaque_torch_msg))

return lookaside


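The replacement message steers users toward `examine` rather than a generic sharp-edge error. A minimal sketch of the suggested workflow, assuming a callable that hits an op `thunder.jit` does not yet recognize (here `torch.rand_like`, per the test change further below):

    import torch
    from thunder.examine import examine

    def fn(x):
        # rand_like is not yet covered by thunder.jit, so examine should flag it
        return torch.rand_like(x)

    # reports which operations in fn are not yet recognized by thunder
    examine(fn, torch.rand(2, 2))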
5 changes: 3 additions & 2 deletions thunder/distributed/__init__.py
@@ -3,7 +3,8 @@
from contextlib import contextmanager
from contextvars import ContextVar, Token
from enum import auto, Enum
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any
from collections.abc import Generator
from functools import partial


@@ -58,7 +59,7 @@ def get_skip_data_parallel_grad_sync() -> bool:


@contextmanager
def skip_data_parallel_grad_sync() -> None:
def skip_data_parallel_grad_sync() -> Generator[Any, Any, Any]:
"""A context manager to skip data parallel grad sync."""
token = set_skip_data_parallel_grad_sync(True)
try:
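The change here only tightens the return annotation; behavior is unchanged. For context, a sketch of how the context manager is typically used — a hypothetical gradient-accumulation loop in which `model` and `batches` are placeholders:

    from thunder.distributed import skip_data_parallel_grad_sync

    # hypothetical accumulation loop: suppress the data-parallel grad sync on
    # all but the last micro-batch, then let the final step synchronize
    for i, batch in enumerate(batches):
        if i < len(batches) - 1:
            with skip_data_parallel_grad_sync():
                model(batch).sum().backward()
        else:
            model(batch).sum().backward()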
18 changes: 18 additions & 0 deletions thunder/tests/distributed/test_ddp.py
@@ -693,6 +693,24 @@ def test_fsdp_grad_parity_with_without_bucketing(
self.assertEqual(loss, orig_loss)
self.assertEqual(tuple(p.grad for p in cm.parameters() if p.grad is not None), gradients)

# Make sure that at least one of "pack" takes multiple tensors.
from thunder.executors.torchex import pack_for_fsdp_prim_impl
from thunder.distributed.prims import PrimIDs as DistPrimIDs

for ex_trace in (thunder.last_traces(cm)[-1], thunder.last_backward_traces(cm)[-1]):
pack_bsyms = list(
filter(
lambda bsym: bsym.sym.id in {DistPrimIDs.PACK_FOR_FSDP, pack_for_fsdp_prim_impl.id},
ex_trace.bound_symbols,
)
)
has_pack_multiple_tensors = False
for bsym in pack_bsyms:
first_arg = bsym.args[0]
self.assertIsInstance(first_arg, list)
has_pack_multiple_tensors |= len(first_arg) > 1
self.assertTrue(has_pack_multiple_tensors, msg=f"{[bsym.args[0] for bsym in pack_bsyms]=}")

@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 devices")
def test_fsdp_shard_unshard(self):
from thunder.distributed import _shard_params, _unshard_params
2 changes: 2 additions & 0 deletions thunder/tests/test_apex_executor.py
@@ -84,6 +84,7 @@ def fn(*args, **kwargs):
raise ValueError("No supported inputs were generated by the OpInfo")


@pytest.mark.xfail(reason="not yet tested; should be fixed")  # todo
@pytest.mark.parametrize(
"dtype", [torch.float16, torch.bfloat16, torch.float32], ids=("float16", "bfloat16", "float32")
)
@@ -129,6 +130,7 @@ def test(logits, labels):
assert not is_any_bw


@pytest.mark.xfail(reason="not yet tested; should be fixed")  # todo
@pytest.mark.parametrize(
"dtype", [torch.float16, torch.bfloat16, torch.float32], ids=("float16", "bfloat16", "float32")
)
20 changes: 10 additions & 10 deletions thunder/tests/test_jit_general.py
@@ -50,23 +50,23 @@ def skipif_not_pytorch_2_1(f):
)(f)


def test_jitting_through_opaque_torch_symbols_sharp_edge():
def no_sharp_edge(x):
def test_jitting_through_opaque_torch_symbols_error():
def no_error(x):
# randn_like is in ltorch
return torch.randn_like(x)

def sharp_edge(x):
def should_error(x):
# rand_like is not yet in ltorch
return torch.rand_like(x)

x = torch.rand(1)

jno_sharp_edge = thunder.jit(no_sharp_edge, sharp_edges="error")
jno_sharp_edge(x)
jno_error = thunder.jit(no_error)
jno_error(x)

jsharp_edge = thunder.jit(sharp_edge, sharp_edges="error")
with pytest.raises(JITSharpEdgeError):
jsharp_edge(x)
jshould_error = thunder.jit(should_error)
with pytest.raises(NotImplementedError):
jshould_error(x)


def test_binary_add_tensors():
@@ -613,7 +613,7 @@ def test_nanogpt():
"falcon-7b-like",
"falcon-40b-like",
"codellama2-like",
pytest.param("mixtral-like", marks=pytest.mark.xfail(raises=TypeError, reason="topk", strict=True)),
pytest.param("mixtral-like", marks=pytest.mark.xfail(raises=NotImplementedError, reason="topk", strict=True)),
),
)
@pytest.mark.parametrize(
@@ -662,7 +662,7 @@ def test_litgpt_variants(name, device):
"falcon-7b-like",
"falcon-40b-like",
"codellama2-like",
pytest.param("mixtral-like", marks=pytest.mark.xfail(raises=TypeError, reason="topk", strict=True)),
pytest.param("mixtral-like", marks=pytest.mark.xfail(raises=NotImplementedError, reason="topk", strict=True)),
),
)
@pytest.mark.parametrize(
