
Commit

Merge branch 'main' into docker/apex
Borda committed Mar 26, 2024
2 parents 9390e7d + 1857436 commit 3dc554b
Showing 11 changed files with 161 additions and 28 deletions.
6 changes: 2 additions & 4 deletions .azure/docker-build.yml
@@ -40,12 +40,10 @@ jobs:
  #maxParallel: "3"
  matrix:
    # CUDA 12.1
-   'cuda 12.1 | torch 2.2 | cudnn FE v1.1': # todo: drop updating this image when CI transition to newer FE version
-     {CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.2.1', TRITON_VERSION: '2.2.0', CUDNN_FRONTEND: "1.1.0"}
    'cuda 12.1 | torch 2.2 | cudnn FE v1.2':
-     {CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.2.1', TRITON_VERSION: '2.2.0', CUDNN_FRONTEND: "1.2.0"}
+     {CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.2.1', TRITON_VERSION: '2.2.0', CUDNN_FRONTEND: "1.2.1"}
    'cuda 12.1 | torch 2.3 /nightly | cudnn FE v1.2':
-     {CUDA_VERSION: '12.1.1', TORCH_VERSION: 'main', TORCH_INSTALL: 'source', CUDNN_FRONTEND: "1.2.0"}
+     {CUDA_VERSION: '12.1.1', TORCH_VERSION: 'main', TORCH_INSTALL: 'source', CUDNN_FRONTEND: "1.2.1"}
    #'cuda 12.1': # this version - '8.9.5.29-1+cuda12.1' for 'libcudnn8' was not found
  # how much time to give 'run always even if cancelled tasks' before stopping them
  cancelTimeoutInMinutes: "2"
9 changes: 4 additions & 5 deletions .azure/gpu-tests.yml
@@ -17,17 +17,17 @@ jobs:
  matrix:
    # CUDA 12.1
    'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | regular':
-     docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.1.0-py3.10-pt_2.2.1-apex'
+     docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex'
      CUDA_VERSION_MM: '121'
    'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | distributed':
-     docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.1.0-py3.10-pt_2.2.1-apex'
+     docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex'
      CUDA_VERSION_MM: '121'
      testing: 'distributed'
    'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular':
-     docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.0-py3.10-pt_main-apex'
+     docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex'
      CUDA_VERSION_MM: '121'
    'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | distributed':
-     docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.0-py3.10-pt_main-apex'
+     docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex'
      CUDA_VERSION_MM: '121'
      testing: 'distributed'
  # how long to run the job before automatically cancelling
@@ -87,7 +87,6 @@ jobs:
  -v --datefmt="%Y%m%d-%H:%M:%S.%f" \
  --random-order-seed=42 \
  --durations=250 \
- # --numprocesses=9 \ # todo: trying to debug Out of Memory error
  --ignore=thunder/tests/distributed --ignore=thunder/tests/test_networks.py
  # compile coverage results
  python -m coverage report
2 changes: 1 addition & 1 deletion .github/lightning-probot.yml
@@ -1 +1 @@
- tracking_issue: 1464
+ tracking_issue: 72
3 changes: 2 additions & 1 deletion README.md
@@ -154,7 +154,8 @@ Thunder doesn't generate code for accelerators directly. It acquires and transfo
  - [Apex](https://github.com/NVIDIA/apex)
  - [TransformerEngine](https://github.com/NVIDIA/TransformerEngine)
  - [PyTorch eager](https://github.com/pytorch/pytorch)
- - custom kernels, including those written with [OpenAI Triton](https://github.com/openai/triton)
+ - Custom CUDA kernels through [PyCUDA](https://documen.tician.de/pycuda/tutorial.html#interoperability-with-other-libraries-using-the-cuda-array-interface), [Numba](https://numba.readthedocs.io/en/stable/cuda/kernels.html), [CuPy](https://docs.cupy.dev/en/stable/user_guide/kernel.html)
+ - Custom kernels written in [OpenAI Triton](https://github.com/openai/triton)

  Modules and functions compiled with Thunder fully interoperate with vanilla PyTorch and support PyTorch's autograd. Also, Thunder works alongside torch.compile to leverage its state-of-the-art optimizations.
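For readers coming to this commit from the README, the interop described above amounts to wrapping a callable with Thunder and calling the result like ordinary PyTorch. A minimal sketch follows (illustrative only, not part of this commit; it assumes the `thunder.jit` entry point documented elsewhere in the README):

    # Minimal sketch: compile a function with Thunder and call it like regular PyTorch.
    import torch
    import thunder

    def foo(a, b):
        return a + b

    jfoo = thunder.jit(foo)  # compiled callable; work is dispatched to Thunder's executors

    a = torch.ones(2, 2)
    b = torch.randn(2, 2)
    out = jfoo(a, b)
    assert torch.allclose(out, foo(a, b))  # same result as eager PyTorch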
2 changes: 1 addition & 1 deletion dockers/ubuntu-cuda/Dockerfile
@@ -20,7 +20,7 @@ ARG IMAGE_TYPE="devel"
  FROM nvidia/cuda:${CUDA_VERSION}-${IMAGE_TYPE}-ubuntu${UBUNTU_VERSION}

  ARG CUDNN_VERSION="8.9.7.29-1"
- ARG CUDNN_FRONTEND_CHECKOUT="v1.1.0"
+ ARG CUDNN_FRONTEND_CHECKOUT="v1.2.1"
  ARG PYTHON_VERSION="3.10"
  ARG TORCH_VERSION="2.2.1"
  ARG TRITON_VERSION="2.2.0"
2 changes: 1 addition & 1 deletion docs/source/fundamentals/installation.rst
@@ -39,7 +39,7 @@ Thunder can use NVIDIA's cuDNN Python frontend bindings to accelerate some PyTor
  export CUDNN_PATH=/usr/local/lib/python3.10/dist-packages/nvidia/cudnn/
  for file in $CUDNN_PATH/lib/*.so.[0-9]; do filename_without_version="${file%??}"; ln -s $file $filename_without_version; done

- git clone -b v1.1.0 https://github.com/NVIDIA/cudnn-frontend.git
+ git clone -b v1.2.1 https://github.com/NVIDIA/cudnn-frontend.git
  export CUDAToolkit_ROOT=/path/to/cuda
  CMAKE_BUILD_PARALLEL_LEVEL=16 pip install cudnn_frontend/ -v

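After building the frontend as shown in the updated instructions, a quick sanity check can confirm the bindings import. This is an illustrative snippet, not from the docs; it assumes the frontend installs as the `cudnn` Python module exposing `backend_version()`, so adjust to your environment:

    # Sanity check (assumed API): import the cuDNN frontend bindings and report
    # the backend version they were built against.
    import cudnn

    print("cuDNN backend version:", cudnn.backend_version())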
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -56,7 +56,8 @@ quiet-level = 3
  # https://github.com/codespell-project/codespell/issues/2839#issuecomment-1731601603
  # also adding links until they ignored by its: nature
  # https://github.com/codespell-project/codespell/issues/2243#issuecomment-1732019960
- #ignore-words-list = ""
+ # documen is used in an url in README
+ ignore-words-list = "documen"


  [tool.black]
4 changes: 2 additions & 2 deletions thunder/benchmarks/targets.py
@@ -873,8 +873,8 @@ def test_llama2_7b_rmsnorm_grad(benchmark, executor: Callable):
      ids=(
          "torch",
          "torch.compile",
-         "thunder-fwd-bwd",
-         "thunder+nvfuser+torch.compile-fwd-bwd",
+         "thunder",
+         "thunder+nvfuser+torch.compile",
          "torch+apex",
          "torch.compile+apex",
      ),
94 changes: 91 additions & 3 deletions thunder/core/interpreter.py
@@ -404,6 +404,10 @@ def __init__(

          self._uncacheable_classes = uncacheable_classes

+     @property
+     def with_provenance_tracking(self):
+         return self._with_provenance_tracking
+
      def interpret(self, inst: dis.Instruction, /, **interpreter_state) -> None | int | INTERPRETER_SIGNALS:
          return self._opcode_interpreter(inst, **interpreter_state)

@@ -887,6 +891,7 @@ class PseudoInst(str, enum.Enum):
      BINARY_SUBSCR = "BINARY_SUBSCR"
      BUILD_DICT = "BUILD_DICT"
      BUILD_TUPLE = "BUILD_TUPLE"
+     BUILD_NAMEDTUPLE = "BUILD_NAMEDTUPLE"
      CONSTANT = "CONSTANT"
      EXCEPTION_HANDLER = "EXCEPTION_HANDLER"
      INPUT_ARGS = "INPUT_ARGS"
@@ -2589,6 +2594,55 @@ def impl(self, other):
      return _interpret_call(impl, self, other)


+ def _collections_namedtuple_lookaside(
+     typename: str,
+     field_names: Iterable[str],
+     *,
+     rename: bool = False,
+     defaults: None | Iterable[Any] = None,
+     module: None | str = None,
+ ):
+     # Type checks {
+     assert wrapped_isinstance(typename, str)
+     assert wrapped_isinstance(field_names, Iterable)
+     assert wrapped_isinstance(rename, bool)
+     if defaults is not None:
+         assert wrapped_isinstance(defaults, Iterable)
+     if module is not None:
+         assert wrapped_isinstance(module, str)
+     # }
+
+     # Wrap defaults {
+     if not isinstance(rename, WrappedValue):
+         rename = wrap_const(rename)
+
+     if defaults is None:
+         defaults = wrap_const(defaults)
+
+     if module is None:
+         # To prevent taking module from the direct caller,
+         # we use the module's name from the active frame
+         curr_frame = get_interpreterruntimectx().frame_stack[-1]
+         module = unwrap(curr_frame.globals).get("__name__", None)
+         module = wrap_const(module)
+     # }
+
+     # Run opaque namedtuple {
+     @interpreter_needs_wrap
+     def create_namedtuple(typename: str, field_names: str, **kwargs):
+         namedtuple_type = collections.namedtuple(typename, field_names, **kwargs)
+         return namedtuple_type
+
+     namedtuple_type = create_namedtuple(typename, field_names, rename=rename, defaults=defaults, module=module)
+     if namedtuple_type is INTERPRETER_SIGNALS.EXCEPTION_RAISED:
+         return namedtuple_type
+
+     assert wrapped_isinstance(namedtuple_type, type)
+     # }
+
+     return namedtuple_type
+
+
  _default_lookaside_map: dict[Callable, Callable] = {
      # Jit lookasides
      is_jitting: _is_jitting_lookaside,
@@ -2612,16 +2666,19 @@ def impl(self, other):
      isinstance: _isinstance_lookaside,
      functools.reduce: _functools_reduce_lookaside,
      operator.getitem: _getitem_lookaside,
+     collections.namedtuple: _collections_namedtuple_lookaside,
  }


  # While mutable sequences (lists) are created empty in __new__ and populated in __init__,
  # immutable sequences (tuples) are created with contents in __new__ and __init__ is a nop
  # (object.__init__, actually).
  def _tuple_new_provenance_tracking_lookaside(cls, iterable=(), /):
+     new_tuple_type = cls.value
+     assert issubclass(new_tuple_type, tuple)
+
      if iterable == ():
          iterable = wrap_const(())
-     assert cls.value is tuple

      if isinstance(iterable.value, (list, tuple)):
          # special case to avoid infinite recursion
@@ -2648,8 +2705,39 @@ def _tuple_new_provenance_tracking_lookaside(cls, iterable=(), /):
      else:
          item_wrappers.append(wv)

-     ures = tuple(w.value for w in item_wrappers)
-     pr = ProvenanceRecord(PseudoInst.BUILD_TUPLE, inputs=[w.provenance for w in item_wrappers])
+     def is_likely_from_collections_namedtuple(tuple_type):
+         from collections import namedtuple
+
+         # Check if tuple_type code object is coming from namedtuple
+         return (
+             hasattr(tuple_type, "__repr__")
+             and hasattr(tuple_type.__repr__, "__code__")
+             and tuple_type.__repr__.__code__ in namedtuple.__code__.co_consts
+         )
+
+     # Construction of namedtuples may raise
+     try:
+         ures = tuple(w.value for w in item_wrappers)
+         # Named tuples expect varargs, not iterables at new/init
+         if is_likely_from_collections_namedtuple(new_tuple_type):
+             if hasattr(new_tuple_type, "__bases__") and new_tuple_type.__bases__ == (tuple,):
+                 ures = new_tuple_type(*ures)
+                 build_inst = PseudoInst.BUILD_NAMEDTUPLE
+             else:
+                 return do_raise(
+                     NotImplementedError(
+                         f"The type {new_tuple_type} is likely a subclassed named tuple. "
+                         "Subclassing the types returned by `collections.namedtuple` "
+                         "is currently not supported! Please, file an issue requesting this support."
+                     )
+                 )
+         else:
+             ures = new_tuple_type(ures)
+             build_inst = PseudoInst.BUILD_TUPLE
+     except Exception as e:
+         return do_raise(e)
+
+     pr = ProvenanceRecord(build_inst, inputs=[w.provenance for w in item_wrappers])
      res = wrap(ures, provenance=pr)
      res.item_wrappers = item_wrappers

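The `is_likely_from_collections_namedtuple` check added above rests on a CPython detail: classes produced by `collections.namedtuple` all share the `__repr__` function defined inside `namedtuple` itself, so that function's code object appears in `namedtuple.__code__.co_consts`, while hand-written `tuple` subclasses either inherit the C-level `tuple.__repr__` (which has no `__code__`) or define their own. A standalone sketch of the idea, outside Thunder's interpreter:

    # Standalone illustration of the heuristic (plain CPython, no Thunder needed).
    from collections import namedtuple

    Point = namedtuple("Point", ("x", "y"))

    class PlainTuple(tuple):
        pass

    def is_likely_from_collections_namedtuple(tuple_type):
        # Classes built by namedtuple reuse the __repr__ defined inside namedtuple,
        # so its code object is one of namedtuple's own code constants.
        return (
            hasattr(tuple_type, "__repr__")
            and hasattr(tuple_type.__repr__, "__code__")
            and tuple_type.__repr__.__code__ in namedtuple.__code__.co_consts
        )

    assert is_likely_from_collections_namedtuple(Point)
    assert not is_likely_from_collections_namedtuple(PlainTuple)

    # Named tuples take field values as varargs, while plain tuples take one iterable,
    # which is why the lookaside branches on this check before constructing the result.
    assert Point(1, 2) == (1, 2)
    assert PlainTuple((1, 2)) == (1, 2)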
14 changes: 5 additions & 9 deletions thunder/executors/transformer_engineex.py
@@ -20,6 +20,7 @@
  from thunder.core.proxies import TensorProxy, CollectionProxy
  from thunder.core.symbol import Symbol
  from thunder.extend import OperatorExecutor, register_executor
+ from thunder.core.langctxs import langctx, Languages

  __all__ = [
      "transformer_engine_ex",
@@ -369,6 +370,10 @@ def bind_postprocess(bsym: BoundSymbol) -> None:
  #
  # Registers transformer_engine_ex as an executor for torch.nn.functional.linear
  #
+
+
+ # NOTE: We need langctx so that we can resolve `view` on TensorProxy.
+ @langctx(Languages.TORCH)
  def _linear_checker(
      a: TensorProxy,
      w: TensorProxy,
@@ -398,15 +403,6 @@ def linear_forwad_rule(a, w, bias):
      return primal, saved_for_backward


- def linear_forward_rule_checker(a: TensorProxy, w: TensorProxy, bias: None | TensorProxy) -> bool:
-     from thunder.core.compile_data import get_compile_data
-
-     cd = get_compile_data()
-     if transformer_engine_ex in cd.executors_list:
-         return _linear_checker(a, w, bias)
-     return False
-
-
  def linear_backward_rule(a_shape, w_shape, b_shape, ctx_idx, grad):
      return te_functional_linear_backward(grad, a_shape, w_shape, b_shape, ctx_idx)

50 changes: 50 additions & 0 deletions thunder/tests/test_interpreter.py
@@ -1030,6 +1030,56 @@ def add(x, y):
      assert jfoo((1, 2, 3), jadd) == 6


+ def test_namedtuple_lookaside(jit):
+     from collections import namedtuple
+
+     typename = "MyNamedTuple"
+     field_names = ("a", "b", "c")
+
+     # Test returning just the type {
+     def f():
+         return namedtuple(typename, field_names)
+
+     jf = jit(f)
+
+     jtype = jf()
+     assert isinstance(jtype, type)
+     assert jtype.__name__ == typename
+     assert all(hasattr(jtype, field) for field in field_names)
+
+     # Check module name
+     import inspect
+
+     assert jtype.__module__ == inspect.currentframe().f_globals["__name__"]
+     # }
+
+     # Test accessing elements {
+     a = torch.rand(1)
+     b = torch.rand(1)
+     c = torch.rand(1)
+
+     def f(a, b, c):
+         nt = namedtuple(typename, field_names)
+         obj = nt(a, b, c)
+         return obj[0]
+
+     jf = jit(f)
+
+     assert f(a, b, c) is a
+     assert jf(a, b, c) is a
+
+     def f(a, b, c):
+         nt = namedtuple(typename, field_names)
+         obj = nt(a, b, c)
+         return obj.a
+
+     jf = jit(f)
+
+     assert f(a, b, c) is a
+     assert jf(a, b, c) is a
+     # }
+
+
  def test_calling_methods(jit):
      jitting = False

