Merge branch 'main' into pre-commit-ci-update-config
t-vi authored Oct 18, 2024
2 parents 58e1d34 + ec50c73 commit 1ba9abf
Showing 35 changed files with 873 additions and 195 deletions.
3 changes: 1 addition & 2 deletions .azure/gpu-tests.yml
@@ -84,7 +84,6 @@ jobs:
pytest thunder/tests/ \
-m "not standalone" \
-v --datefmt="%Y%m%d-%H:%M:%S.%f" \
--timeout=240 \
--random-order-seed=42 \
--durations=250 \
--timeout=240 \
@@ -97,7 +96,7 @@ jobs:
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure
condition: ne(variables['testing'], 'distributed')
timeoutInMinutes: "30"
timeoutInMinutes: "40"
displayName: "Testing: regular"
- bash: |
2 changes: 1 addition & 1 deletion notebooks/zero_to_thunder.ipynb
@@ -180,7 +180,7 @@
"source": [
"So what has changed? Quite a bit!\n",
"\n",
"When we call the Thunder module, it do the computation in a single function without control flow. And what's more, it applies optimizations, such as creating fusions for NVFuser to execute. We can see all this by showing the last computation trace:"
"When we call the Thunder module, it does the computation in a single function without control flow. And what's more, it applies optimizations, such as creating fusions for NVFuser to execute. We can see all this by showing the last computation trace:"
]
},
{
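The notebook cell above mentions inspecting the last computation trace. As a minimal sketch of that workflow (the toy module and tensor shapes here are illustrative, not taken from the notebook), assuming a recent lightning-thunder build:

import torch
import torch.nn as nn

import thunder

# Illustrative stand-in module; the notebook uses its own model.
model = nn.Sequential(nn.Linear(16, 16), nn.GELU())
jmodel = thunder.jit(model)           # wrap the module with Thunder
out = jmodel(torch.randn(4, 16))      # the first call records and compiles the trace

# The final trace is a single flat function without control flow; on CUDA,
# eligible regions are grouped into fusions for executors such as nvFuser.
print(thunder.last_traces(jmodel)[-1])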
3 changes: 1 addition & 2 deletions thunder/__init__.py
@@ -348,7 +348,6 @@ def jit(
sharp_edges=sharp_edges,
using_jit=True,
disable_torch_autograd_support=disable_torch_autograd,
use_rematerialization=False,
only_execute_prims=False,
disable_preprocessing=True,
compile_options=compile_options,
@@ -614,7 +613,7 @@ def get_computation_and_inputs(*args, **kwargs):
use_del_last_used=False,
)
prologue_trc = prologue_traces[-1]
pro = prologue_trc.python_callable()
pro = prologue_trc.python_callable(include_decorators=False)

if epilogue_trc is not None:
epilogue = epilogue_trc.python_callable()
79 changes: 48 additions & 31 deletions thunder/benchmarks/__init__.py
@@ -18,6 +18,7 @@
from torch.testing import make_tensor

import thunder
import thunder.dynamo
import thunder.core.devices as Devices
import thunder.core.dtypes as dtypes
import thunder.executors as executors
@@ -30,8 +31,7 @@
from thunder.executors.sdpaex import sdpa_ex
from thunder.executors.torch_compile import torch_compile_cat_ex, torch_compile_ex
from thunder.transforms.cudagraph import CUDAGraphTransform
from thunder.tests import nanogpt_model, hf_bart_self_attn, litgpt_model
from thunder.tests.litgpt_model import Config as LitGPTConfig
from thunder.tests import nanogpt_model, hf_bart_self_attn
from thunder.tests.make_tensor import make_tensor, make_tensor_like

# List of all benchmarks
@@ -707,6 +707,13 @@ def torch_compile_executor(fn: Callable) -> Callable:
return torch.compile(fn)


def thunderfx_executor(fn: Callable) -> Callable:
torch.backends.cuda.matmul.allow_tf32 = True
backend = thunder.dynamo.ThunderCompiler()
torch._dynamo.reset()
return torch.compile(fn, backend=backend)


def thunder_torch_executor(fn: Callable) -> Callable:
torch.backends.cuda.matmul.allow_tf32 = True
return thunder.jit(fn, executors=[thunder.pytorch_executor])
@@ -781,9 +788,6 @@ def func(fn: Callable) -> Callable:

@dataclass(frozen=True)
class get_default_torch_fsdp_executor:
from torch.distributed.fsdp import ShardingStrategy

sharding_strategy: ShardingStrategy
apply_torch_compile: bool
auto_wrap_policy: Any | None

@@ -1213,12 +1217,14 @@ def args(cls) -> tuple[BenchmarkArg, ...]:

def __init__(
self,
config: str | LitGPTConfig,
config: str,
batchdims: Sequence[int],
device: str,
dtype: dtypes.dtype,
requires_grad: bool,
) -> None:
from litgpt.config import Config as LitGPTConfig

super().__init__()

self.config = LitGPTConfig.from_name(config) if not isinstance(config, LitGPTConfig) else config
@@ -1282,13 +1288,15 @@ def args(cls) -> tuple[BenchmarkArg, ...]:

def __init__(
self,
config: str | LitGPTConfig,
config: str,
batchdims: Sequence[int],
device: str,
dtype: dtypes.dtype,
requires_grad: bool,
use_liger: bool = False,
) -> None:
from litgpt.config import Config as LitGPTConfig

super().__init__()

self.config = LitGPTConfig.from_name(config) if not isinstance(config, LitGPTConfig) else config
@@ -1941,12 +1949,14 @@ def args(cls) -> tuple[BenchmarkArg, ...]:

def __init__(
self,
config: str | LitGPTConfig = "Llama-2-7b-hf",
config: str = "Llama-2-7b-hf",
batchdims: Sequence[int] = (16,),
device: str = "cuda",
dtype: dtypes.dtype = thunder.bfloat16,
requires_grad: bool = True,
) -> None:
from litgpt.config import Config as LitGPTConfig

super().__init__()

self.config = LitGPTConfig.from_name(config) if not isinstance(config, LitGPTConfig) else config
@@ -1968,11 +1978,9 @@ def make_batch(self) -> tuple[list, dict]:
return (make(shape),), {}

def fn(self) -> Callable:
module = (
litgpt_model.LLaMAMLP(self.config)
.to(device=self.device, dtype=self.tdtype)
.requires_grad_(self.requires_grad)
)
from litgpt.model import LLaMAMLP

module = LLaMAMLP(self.config).to(device=self.device, dtype=self.tdtype).requires_grad_(self.requires_grad)
return module


@@ -2012,12 +2020,14 @@ def args(cls) -> tuple[BenchmarkArg, ...]:

def __init__(
self,
config: str | LitGPTConfig = "Llama-2-7b-hf",
config: str = "Llama-2-7b-hf",
batchdims: Sequence[int] = (16,),
device: str = "cuda",
dtype: dtypes.dtype = thunder.bfloat16,
requires_grad: bool = True,
) -> None:
from litgpt.config import Config as LitGPTConfig

super().__init__()

self.config = LitGPTConfig.from_name(config) if not isinstance(config, LitGPTConfig) else config
@@ -2038,8 +2048,10 @@ def make_batch(self) -> tuple[list, dict]:
return (x, cos, sin, mask, input_pos), {}

def fn(self) -> Callable:
from litgpt.model import CausalSelfAttention

module = (
litgpt_model.CausalSelfAttention(self.config)
CausalSelfAttention(self.config)
.to(device=self.device, dtype=self.tdtype)
.requires_grad_(self.requires_grad)
)
@@ -2119,8 +2131,10 @@ def make_batch(self) -> tuple[list, dict]:
return (make(shape),), {}

def fn(self) -> Callable:
from litgpt.model import RMSNorm

module = (
litgpt_model.RMSNorm(self.size, self.dim, self.eps)
RMSNorm(self.size, self.dim, self.eps)
.to(device=self.device, dtype=self.tdtype)
.requires_grad_(self.requires_grad)
)
@@ -2168,7 +2182,7 @@ def args(cls) -> tuple[BenchmarkArg, ...]:

def __init__(
self,
config: LitGPTConfig,
config,
batchdims: Sequence[int] = (8,),
indices_dtype: dtypes.dtype = thunder.int64,
device: str = "cuda",
@@ -2201,11 +2215,9 @@ def make_batch(self) -> tuple[list, dict]:
return (x,), {}

def fn(self) -> Callable:
gpt = (
litgpt_model.GPT(self.config)
.to(device=self.device, dtype=self.model_tdtype)
.requires_grad_(self.requires_grad)
)
from litgpt.model import GPT

gpt = GPT(self.config).to(device=self.device, dtype=self.model_tdtype).requires_grad_(self.requires_grad)
return gpt

def postprocess_for_backward(self, output: torch.Tensor) -> torch.Tensor | None:
@@ -2222,6 +2234,8 @@ def postprocess_for_backward(self, output: torch.Tensor) -> torch.Tensor | None:
# "scaled_dot_product_attention" call.
class QKVSplitRope(nn.Module):
def __init__(self, config, use_apex) -> None:
from litgpt.model import apply_rope

self.fused_apply_rotary_pos_emb_cached = None
if use_apex:
try:
@@ -2233,7 +2247,7 @@ def __init__(self, config, use_apex) -> None:

super().__init__()
self.config = config
self.apply_rope = litgpt_model.apply_rope
self.apply_rope = apply_rope
self.use_apex = use_apex

def forward(
@@ -2324,13 +2338,15 @@ def args(cls) -> tuple[BenchmarkArg, ...]:

def __init__(
self,
config: str | LitGPTConfig = "Llama-2-7b-hf",
config: str = "Llama-2-7b-hf",
batchdims: Sequence[int] = (16,),
device: str = "cuda",
dtype: dtypes.dtype = thunder.bfloat16,
requires_grad: bool = True,
use_apex: bool = False,
) -> None:
from litgpt.config import Config as LitGPTConfig

super().__init__()

self.config = LitGPTConfig.from_name(config) if not isinstance(config, LitGPTConfig) else config
@@ -2635,7 +2651,7 @@ def description(cls) -> str:

def __init__(
self,
config: str | LitGPTConfig = "Llama-2-7b-hf",
config: str = "Llama-2-7b-hf",
batchdims: Sequence[int] = (16,),
device: str = "cuda",
dtype: dtypes.dtype = thunder.bfloat16,
@@ -2824,6 +2840,9 @@ def __init__(
dtype: thunder.dtypes.dtype | torch.dtype | str = thunder.bfloat16,
requires_grad: bool = True,
) -> None:
from litgpt.model import build_rope_cache
from litgpt.config import Config as LitGPTConfig

super().__init__()

self.config = LitGPTConfig.from_name(config)
@@ -2839,9 +2858,7 @@ def __init__(
# Sets required benchmark parameters
self.devices: list[str] = [device]

self.cos, self.sin = litgpt_model.build_rope_cache(
seq_len=seq_length, n_elem=self.config.rope_n_elem, device=self.device
)
self.cos, self.sin = build_rope_cache(seq_len=seq_length, n_elem=self.config.rope_n_elem, device=self.device)

def make_batch(self) -> tuple[list, dict]:
make = partial(make_tensor, device=self.device, dtype=self.tdtype, requires_grad=self.requires_grad)
@@ -2851,9 +2868,9 @@ def make_batch(self) -> tuple[list, dict]:
return (a, self.cos, self.sin), {}

def fn(self) -> Callable:
model = (
litgpt_model.Block(self.config).to(device=self.device, dtype=self.tdtype).requires_grad_(self.requires_grad)
)
from litgpt.model import Block

model = Block(self.config).to(device=self.device, dtype=self.tdtype).requires_grad_(self.requires_grad)
return model


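For context on the new thunderfx_executor above: ThunderCompiler from thunder.dynamo is used as a torch.compile backend. A minimal sketch of that pattern with an illustrative toy function (not part of the benchmark suite):

import torch
import thunder.dynamo

def f(x):
    return torch.nn.functional.gelu(x) * 2.0

backend = thunder.dynamo.ThunderCompiler()   # Thunder as a Dynamo backend
torch._dynamo.reset()                        # drop any previously cached graphs
compiled = torch.compile(f, backend=backend)
y = compiled(torch.randn(8, 8))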
24 changes: 14 additions & 10 deletions thunder/benchmarks/targets.py
@@ -33,6 +33,7 @@
thunder_cudnn_executor,
thunder_cudnn_nvfuser_executor,
thunder_executor,
thunderfx_executor,
thunder_sdpa_torch_compile_nvfuser_executor,
torch_compile_executor,
torch_executor,
@@ -55,6 +56,7 @@
"phi-2",
]
RUN_ALL_CONFIGS = os.environ.get("THUNDER_BENCH_RUN_ALL_CONFIGS", "0") == "1"
MAX_ALLOCATED_MEMORY_KEYWORD = "max_allocated_memory_MB"


class ComputeType(Enum):
@@ -112,7 +114,7 @@ def deco(old_timer):
@functools.wraps(old_timer)
def timer():
ret = old_timer()
benchmark.extra_info["max_allocated_memory_MB"] = torch.cuda.max_memory_allocated() / (1024 * 1024.0)
benchmark.extra_info[MAX_ALLOCATED_MEMORY_KEYWORD] = torch.cuda.max_memory_allocated() / (1024 * 1024.0)
torch.cuda.reset_peak_memory_stats()
return ret

@@ -151,17 +153,19 @@ def interpreter_fwd(module: Callable):
return fn_


executors = (
torch_executor,
torch_compile_executor,
thunder_executor,
)
executors = (torch_executor, torch_compile_executor, thunder_executor)
executors_ids = (
"torch",
"torch.compile",
"thunder",
)

torchbench_executors = (*executors, thunderfx_executor)
torchbench_executors_ids = (
*executors_ids,
"thunderfx",
)

apex_executors = (thunder_apex_executor, thunder_apex_nvfuser_executor)
apex_executors_ids = ("thunder+apex-grad", "thunder+apex+nvfuser-grad")

@@ -841,8 +845,8 @@ def test_resnet50(benchmark, executor: Callable, compute_type: ComputeType):
)
@pytest.mark.parametrize(
"executor,",
executors,
ids=executors_ids,
torchbench_executors,
ids=torchbench_executors_ids,
)
@parametrize_compute_type
def test_torchbench(benchmark, module_name, executor, compute_type: ComputeType):
@@ -867,8 +871,8 @@ def test_torchbench(benchmark, module_name, executor, compute_type: ComputeType)
)
@pytest.mark.parametrize(
"executor,",
executors,
ids=executors_ids,
torchbench_executors,
ids=torchbench_executors_ids,
)
@parametrize_compute_type
def test_torchbench_canary(benchmark, module_name, executor, compute_type: ComputeType):
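The targets.py hunk above moves the "max_allocated_memory_MB" key into the MAX_ALLOCATED_MEMORY_KEYWORD constant used when recording peak GPU memory into pytest-benchmark's extra_info. A minimal sketch of that reporting pattern, assuming CUDA is available (the test body is illustrative, not from the suite):

import torch

MAX_ALLOCATED_MEMORY_KEYWORD = "max_allocated_memory_MB"

def test_matmul_peak_memory(benchmark):
    a = torch.randn(1024, 1024, device="cuda")

    torch.cuda.reset_peak_memory_stats()
    benchmark(torch.matmul, a, a)

    # Attach peak GPU memory (in MB) to the benchmark's reported metadata.
    mem_mb = torch.cuda.max_memory_allocated() / (1024 * 1024.0)
    benchmark.extra_info[MAX_ALLOCATED_MEMORY_KEYWORD] = mem_mb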