Merge branch 'main' into ci/torch-2.3
Borda authored Mar 27, 2024
2 parents 996181c + 1222864 commit 5275c6a
Showing 10 changed files with 77 additions and 29 deletions.
6 changes: 4 additions & 2 deletions .azure/docker-build.yml
@@ -53,9 +53,10 @@ jobs:
variables:
UBUNTU_VERSION: '22.04'
PYTHON_VERSION: '3.10'
APEX_CHECKOUT: 'master'
imageRepository: 'pytorchlightning/lightning-thunder'
dockerfilePath: 'dockers/ubuntu-cuda/Dockerfile'
imageTag: 'ubuntu$(UBUNTU_VERSION)-cuda$(CUDA_VERSION)-cudnn-fe$(CUDNN_FRONTEND)-py$(PYTHON_VERSION)-pt_${TORCH_VERSION/v/}'
imageTag: 'ubuntu$(UBUNTU_VERSION)-cuda$(CUDA_VERSION)-cudnn-fe$(CUDNN_FRONTEND)-py$(PYTHON_VERSION)-pt_${TORCH_VERSION/v/}-apex'
pool: 'lit-rtx-3090'
workspace:
clean: all
@@ -78,8 +79,9 @@ jobs:
--build-arg CUDNN_FRONTEND_CHECKOUT="v$(CUDNN_FRONTEND)" \
--build-arg PYTHON_VERSION="$(PYTHON_VERSION)" \
--build-arg TORCH_VERSION="$(TORCH_VERSION)" \
--build-arg TRITON_VERSION="$(TRITON_VERSION)" \
--build-arg TORCH_INSTALL="$(TORCH_INSTALL)" \
--build-arg TRITON_VERSION="$(TRITON_VERSION)" \
--build-arg APEX_CHECKOUT="$(APEX_CHECKOUT)" \
. --no-cache
timeoutInMinutes: "95"
displayName: 'Build base image'
15 changes: 9 additions & 6 deletions .azure/gpu-tests.yml
@@ -17,10 +17,10 @@ jobs:
matrix:
# CUDA 12.1
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | regular':
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1'
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex'
CUDA_VERSION_MM: '121'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | distributed':
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1'
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex'
CUDA_VERSION_MM: '121'
testing: 'distributed'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | regular':
@@ -31,10 +31,10 @@
CUDA_VERSION_MM: '121'
testing: 'distributed'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular':
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main'
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex'
CUDA_VERSION_MM: '121'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | distributed':
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main'
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex'
CUDA_VERSION_MM: '121'
testing: 'distributed'
# how much time to give 'run always even if cancelled tasks' before stopping them
@@ -47,7 +47,7 @@
PYTHONHASHSEED: "0"
CI: "true"
container:
image: "$(docker-image)"
image: "pytorchlightning/lightning-thunder:$(docker-image)"
options: "--gpus=all --shm-size=16g -v /var/tmp:/var/tmp"
workspace:
clean: all
@@ -90,6 +90,7 @@ jobs:
pytest thunder/tests/ \
-m "not standalone" \
-v --datefmt="%Y%m%d-%H:%M:%S.%f" \
--timeout=240 \
--random-order-seed=42 \
--durations=250 \
--timeout=240 \
@@ -111,7 +112,9 @@ jobs:
pytest \
thunder/tests/test_networks.py \
-m "not standalone" \
-v --random-order-seed=42 --durations=0 --numprocesses=3
-v --durations=0 \
--random-order-seed=42 \
--numprocesses=3
# compile coverage results
python -m coverage report
python -m coverage xml
6 changes: 3 additions & 3 deletions .azure/notebook-runs.yml
@@ -16,10 +16,10 @@ jobs:
strategy:
matrix:
'ubuntu22.04 | cuda 12.1 | torch 2.2':
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-py3.10-pt_2.2.1'
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.1.0-py3.10-pt_2.2.1-apex'
CUDA_VERSION_MM: '121'
'ubuntu22.04 | cuda 12.1 | torch-nightly':
docker-image: 'pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-py3.10-pt_main'
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.0-py3.10-pt_main-apex'
CUDA_VERSION_MM: '121'
# how long to run the job before automatically cancelling
timeoutInMinutes: "45"
@@ -31,7 +31,7 @@
TORCH_HOME: "/var/tmp/torch"
PIP_CACHE_DIR: "/var/tmp/pip"
container:
image: "$(docker-image)"
image: "pytorchlightning/lightning-thunder:$(docker-image)"
options: "--gpus=all --shm-size=16g -v /var/tmp:/var/tmp"
workspace:
clean: all
18 changes: 18 additions & 0 deletions dockers/ubuntu-cuda/Dockerfile
@@ -25,6 +25,7 @@ ARG PYTHON_VERSION="3.10"
ARG TORCH_VERSION="2.2.1"
ARG TRITON_VERSION="2.2.0"
ARG TORCH_INSTALL="stable"
ARG APEX_CHECKOUT="master"

SHELL ["/bin/bash", "-c"]
# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/
@@ -98,6 +99,7 @@ ENV \
CUDA_SELECT_NVCC_ARCH_FLAGS="8.0"

ARG TORCH_INSTALL
ENV TORCH_USE_CUDA_DSA=1

RUN \
if [ "${TORCH_INSTALL}" == "source" ]; then \
@@ -151,6 +153,22 @@ RUN \
pip install -U "nvfuser-cu${CUDA_VERSION_MM/./}-torch${TORCH_VERSION_MM/./}" ; \
fi

RUN \
# building Apex from source
pip install "pip>=23.1" packaging && \
git clone https://github.com/NVIDIA/apex && \
cd apex && \
git checkout ${APEX_CHECKOUT} && \
# https://github.com/NVIDIA/apex#linux
pip install -v \
--disable-pip-version-check \
--no-cache-dir \
--no-build-isolation \
--config-settings "--build-option=--xentropy" \
. && \
cd .. && \
rm -rf apex

RUN \
ls -lh requirements/ && \
CUDA_VERSION_MM=${CUDA_VERSION%.*} && \
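The new RUN block above builds Apex from source with the `--xentropy` build option. As a quick sanity check that the extension actually compiled, a minimal sketch in Python — this assumes the `--xentropy` option produces the `xentropy_cuda` module that thunder's Apex executor imports, and it would be run inside the built image:

    # check_apex.py -- hypothetical sanity check for the Apex build above;
    # assumes --xentropy produced the xentropy_cuda extension module
    import xentropy_cuda  # raises ImportError if the Apex build step failed

    print("Apex cross-entropy extension is available")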
2 changes: 0 additions & 2 deletions notebooks/.ignore.ci
@@ -1,2 +0,0 @@
adding_custom_operator_backward.ipynb
dev_tutorials/extend.ipynb
14 changes: 10 additions & 4 deletions thunder/core/jit_ext.py
@@ -912,12 +912,18 @@ def is_from_torch(fn):
# Torch functions have __name__ defined
fn_name = f"{fn.__module__}.{fn.__name__}"

# For now, only torch-like opaque functions are sharp edges
return _general_jit_sharp_edge(
f"Trying to call function {fn_name}, but it's unsupported. Please file an issue requesting support.",
None,
# Probably merge with sharp edges
calling_opaque_torch_msg = (
f"Trying to call function {fn_name}, but it is not yet supported. "
"Please file an issue requesting support. "
"To find out which operations are not yet recongnized by `thunder.jit`, "
"please run `examine` as per:\n\n"
"from thunder.examine import examine\n"
"examine(<your thunder.jit callable argument>, ...)\n"
)

return do_raise(NotImplementedError(calling_opaque_torch_msg))

return lookaside


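The replacement message steers users toward `examine` rather than a generic sharp-edge error. A minimal sketch of the suggested workflow, assuming a callable that hits an op `thunder.jit` does not yet recognize (here `torch.rand_like`, per the test change further below):

    import torch
    from thunder.examine import examine

    def fn(x):
        # rand_like is not yet covered by thunder.jit, so examine should flag it
        return torch.rand_like(x)

    # reports which operations in fn are not yet recognized by thunder
    examine(fn, torch.rand(2, 2))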
5 changes: 3 additions & 2 deletions thunder/distributed/__init__.py
@@ -3,7 +3,8 @@
from contextlib import contextmanager
from contextvars import ContextVar, Token
from enum import auto, Enum
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any
from collections.abc import Generator
from functools import partial


@@ -58,7 +59,7 @@ def get_skip_data_parallel_grad_sync() -> bool:


@contextmanager
def skip_data_parallel_grad_sync() -> None:
def skip_data_parallel_grad_sync() -> Generator[Any, Any, Any]:
"""A context manager to skip data parallel grad sync."""
token = set_skip_data_parallel_grad_sync(True)
try:
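The change here only tightens the return annotation; behavior is unchanged. For context, a sketch of how the context manager is typically used — a hypothetical gradient-accumulation loop in which `model` and `batches` are placeholders:

    from thunder.distributed import skip_data_parallel_grad_sync

    # hypothetical accumulation loop: suppress the data-parallel grad sync on
    # all but the last micro-batch, then let the final step synchronize
    for i, batch in enumerate(batches):
        if i < len(batches) - 1:
            with skip_data_parallel_grad_sync():
                model(batch).sum().backward()
        else:
            model(batch).sum().backward()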
18 changes: 18 additions & 0 deletions thunder/tests/distributed/test_ddp.py
@@ -693,6 +693,24 @@ def test_fsdp_grad_parity_with_without_bucketing(
self.assertEqual(loss, orig_loss)
self.assertEqual(tuple(p.grad for p in cm.parameters() if p.grad is not None), gradients)

# Make sure that at least one of "pack" takes multiple tensors.
from thunder.executors.torchex import pack_for_fsdp_prim_impl
from thunder.distributed.prims import PrimIDs as DistPrimIDs

for ex_trace in (thunder.last_traces(cm)[-1], thunder.last_backward_traces(cm)[-1]):
pack_bsyms = list(
filter(
lambda bsym: bsym.sym.id in {DistPrimIDs.PACK_FOR_FSDP, pack_for_fsdp_prim_impl.id},
ex_trace.bound_symbols,
)
)
has_pack_multiple_tensors = False
for bsym in pack_bsyms:
first_arg = bsym.args[0]
self.assertIsInstance(first_arg, list)
has_pack_multiple_tensors |= len(first_arg) > 1
self.assertTrue(has_pack_multiple_tensors, msg=f"{[bsym.args[0] for bsym in pack_bsyms]=}")

@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 devices")
def test_fsdp_shard_unshard(self):
from thunder.distributed import _shard_params, _unshard_params
2 changes: 2 additions & 0 deletions thunder/tests/test_apex_executor.py
@@ -84,6 +84,7 @@ def fn(*args, **kwargs):
raise ValueError("No supported inputs were generated by the OpInfo")


@pytest.mark.xfail(reason="not yet tested; should be fixed")  # todo
@pytest.mark.parametrize(
"dtype", [torch.float16, torch.bfloat16, torch.float32], ids=("float16", "bfloat16", "float32")
)
@@ -129,6 +130,7 @@ def test(logits, labels):
assert not is_any_bw


@pytest.mark.xfail(reason="not yet tested; should be fixed")  # todo
@pytest.mark.parametrize(
"dtype", [torch.float16, torch.bfloat16, torch.float32], ids=("float16", "bfloat16", "float32")
)
20 changes: 10 additions & 10 deletions thunder/tests/test_jit_general.py
@@ -50,23 +50,23 @@ def skipif_not_pytorch_2_1(f):
)(f)


def test_jitting_through_opaque_torch_symbols_sharp_edge():
def no_sharp_edge(x):
def test_jitting_through_opaque_torch_symbols_error():
def no_error(x):
# randn_like is in ltorch
return torch.randn_like(x)

def sharp_edge(x):
def should_error(x):
# rand_like is not yet in ltorch
return torch.rand_like(x)

x = torch.rand(1)

jno_sharp_edge = thunder.jit(no_sharp_edge, sharp_edges="error")
jno_sharp_edge(x)
jno_error = thunder.jit(no_error)
jno_error(x)

jsharp_edge = thunder.jit(sharp_edge, sharp_edges="error")
with pytest.raises(JITSharpEdgeError):
jsharp_edge(x)
jshould_error = thunder.jit(should_error)
with pytest.raises(NotImplementedError):
jshould_error(x)


def test_binary_add_tensors():
@@ -613,7 +613,7 @@ def test_nanogpt():
"falcon-7b-like",
"falcon-40b-like",
"codellama2-like",
pytest.param("mixtral-like", marks=pytest.mark.xfail(raises=TypeError, reason="topk", strict=True)),
pytest.param("mixtral-like", marks=pytest.mark.xfail(raises=NotImplementedError, reason="topk", strict=True)),
),
)
@pytest.mark.parametrize(
@@ -662,7 +662,7 @@ def test_litgpt_variants(name, device):
"falcon-7b-like",
"falcon-40b-like",
"codellama2-like",
pytest.param("mixtral-like", marks=pytest.mark.xfail(raises=TypeError, reason="topk", strict=True)),
pytest.param("mixtral-like", marks=pytest.mark.xfail(raises=NotImplementedError, reason="topk", strict=True)),
),
)
@pytest.mark.parametrize(
