
Commit 381916d

Merge branch 'main' into add_celu

2 parents: ad978cb + 72345cc

File tree: 6 files changed (+108, -54 lines)

thunder/__init__.py

Lines changed: 68 additions & 34 deletions
@@ -458,15 +458,10 @@ def get_computation_and_inputs(*args, **kwargs):
                 _vanilla_args,
             ) = cache_entry
             try:
-                cs.last_prologue_execution_start = time.perf_counter_ns()
                 inps, pro_to_epi = pro(*args, **kwargs)
-                cs.last_prologue_execution_stop = time.perf_counter_ns()
             except Exception as _:
                 continue

-            cs.last_trace_host_tracing_start = time.perf_counter_ns()
-            cs.last_trace_host_tracing_stop = time.perf_counter_ns()
-
             # Updates cache statistics
             cs.cache_hits += 1
             cs.last_traces = comp_traces
@@ -495,12 +490,7 @@ def get_computation_and_inputs(*args, **kwargs):
                 backward_traces,
             ) = cache_entry

-            cs.last_prologue_execution_start = time.perf_counter_ns()
             inps, pro_to_epi = pro(*args, **kwargs)
-            cs.last_prologue_execution_stop = time.perf_counter_ns()
-
-            cs.last_trace_host_tracing_start = time.perf_counter_ns()
-            cs.last_trace_host_tracing_stop = time.perf_counter_ns()

             # Updates cache statistics
             cs.cache_hits += 1
@@ -622,6 +612,7 @@ def get_computation_and_inputs(*args, **kwargs):
         )
         prologue_trc = prologue_traces[-1]
         pro = prologue_trc.python_callable(include_decorators=False)
+        pro = prologue_execution_timer(pro)

         if epilogue_trc is not None:
             epilogue = epilogue_trc.python_callable()
@@ -637,9 +628,7 @@ def get_computation_and_inputs(*args, **kwargs):
         cs.last_interpreter_log = last_interpreter_log
         cs.last_interpreted_instructions = (i for i in last_interpreter_log if isinstance(i, dis.Instruction))

-        cs.last_prologue_execution_start = time.perf_counter_ns()
         inps, pro_to_epi = pro(*args, **kwargs)
-        cs.last_prologue_execution_stop = time.perf_counter_ns()

         computation_trc = dce(computation_trc)
         computation_traces.append(computation_trc)
@@ -729,23 +718,55 @@ def get_computation_and_inputs(*args, **kwargs):

         return cache_entry, inps, pro_to_epi

-    cd.get_computation_and_inputs = get_computation_and_inputs
+    def host_execution_timer(fn):
+        def wrapped(*args, **kwargs):
+            cs.last_trace_host_execution_start = time.perf_counter_ns()
+            try:
+                return fn(*args, **kwargs)
+            finally:
+                cs.last_trace_host_execution_stop = time.perf_counter_ns()

-    @wraps(fn)
-    def fn_(*args, **kwargs) -> Any:
-        if is_tracing():
-            _recursive_jit_call_warning()
-            return fn(*args, **kwargs)
+        return wrapped

-        # Updats call statistics
-        cs.last_trace_host_start = time.perf_counter_ns()
-        cs.calls += 1
+    def prologue_execution_timer(fn):
+        def wrapped(*args, **kwargs):
+            cs.last_prologue_execution_start = time.perf_counter_ns()
+            try:
+                return fn(*args, **kwargs)
+            finally:
+                cs.last_prologue_execution_stop = time.perf_counter_ns()

-        cache_entry, inps, pro_to_epi = get_computation_and_inputs(*args, **kwargs)
-        cs.last_trace_host_execution_start = time.perf_counter_ns()
+        return wrapped
+
+    def decorate_computation_function(get_computation_and_inputs_fn, *decorators):
+        def wrapped(*args, **kwargs):
+            cache_entry, inps, pro_to_epi = get_computation_and_inputs_fn(*args, **kwargs)
+            decorated_computation_fn = cache_entry.computation_fn
+            for decorator in decorators:
+                decorated_computation_fn = decorator(decorated_computation_fn)
+            if decorators:
+                cache_entry = cache_entry._replace(computation_fn=decorated_computation_fn)
+            return cache_entry, inps, pro_to_epi
+
+        return wrapped
+
+    get_computation_and_inputs = decorate_computation_function(get_computation_and_inputs, host_execution_timer)
+    cd.get_computation_and_inputs = get_computation_and_inputs
+
+    def update_call_statistics(fn):
+        def wrapped(*args, **kwargs):
+            cs.calls += 1
+            cs.last_trace_host_start = time.perf_counter_ns()
+            try:
+                return fn(*args, **kwargs)
+            finally:
+                cs.last_trace_host_stop = time.perf_counter_ns()

+        return wrapped
+
+    def check_storage_aliases(cache_entry, args):
         if cache_entry.vanilla_tensor_args:
-            if alias_tensor_indices_str := _alias_tensor_of_args_kwargs(*inps):
+            if alias_tensor_indices_str := _alias_tensor_of_args_kwargs(*args):
                 alias_tensor_indices = alias_tensor_indices_str
                 alias_tensor_indices = {int(i) for i in alias_tensor_indices_str.split(",")}
                 vanilla_tensor_args = cache_entry.vanilla_tensor_args
@@ -755,13 +776,12 @@ def fn_(*args, **kwargs) -> Any:
                     NotImplementedError,
                 )

-        result = cache_entry.computation_fn(*inps)
-
+    def maybe_connect_to_autograd(cache_entry, result):
         if cache_entry.backward_fn:
-            # Run the compiled forward function
+            # If the backward function is available, we need to connect the
+            # resulting tensors to PyTorch's Autograd graph using the
+            # ThunderFunction (which is a torch.autograd.Function subclass)
             data_for_autograd, (saved_tensors, saved_other) = result
-
-            # Connect produced tensors with PyTorch's autograd graph
             ThunderFunction.apply(
                 cache_entry.return_none_instead_of_grads,
                 cache_entry.backward_fn,
@@ -772,17 +792,31 @@ def fn_(*args, **kwargs) -> Any:
             )
             result = data_for_autograd["output"]

+        return result
+
+    def maybe_call_epilogue(cache_entry, result, pro_to_epi):
         if cache_entry.epilogue_fn:
             result, comp_to_epi = result
             cache_entry.epilogue_fn(*pro_to_epi, *comp_to_epi)

-        cs.last_trace_host_execution_stop = time.perf_counter_ns()
-        cs.last_computation_execution_stop = cs.last_trace_host_execution_stop
+        return result

-        cs.last_executed = cache_entry.computation_fn
-        cs.last_trace_cache_stop = time.perf_counter_ns()
-        cs.last_trace_host_stop = time.perf_counter_ns()
+    @wraps(fn)
+    @update_call_statistics
+    def fn_(*args, **kwargs) -> Any:
+        if is_tracing():
+            _recursive_jit_call_warning()
+            return fn(*args, **kwargs)
+
+        cache_entry, inps, pro_to_epi = get_computation_and_inputs(*args, **kwargs)
+
+        check_storage_aliases(cache_entry, inps)
+
+        result = cache_entry.computation_fn(*inps)
+        result = maybe_connect_to_autograd(cache_entry, result)
+        result = maybe_call_epilogue(cache_entry, result, pro_to_epi)

+        cs.last_computation = cache_entry.computation_fn
         return result

     if isinstance(fn, pytorch.nn.Module):

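The main change to thunder/__init__.py is structural: the scattered time.perf_counter_ns() bookkeeping moves into small wrapper factories (prologue_execution_timer, host_execution_timer, update_call_statistics) that record start/stop timestamps on the CompileStats object inside try/finally, so the stop timestamp is written even when the wrapped call raises, and fn_ shrinks to a sequence of named steps. Below is a minimal, self-contained sketch of that timing pattern; SimpleStats and unpack_inputs are illustrative stand-ins, and the real wrappers close over the enclosing cs object instead of taking the stats as an argument.

import time
from functools import wraps


class SimpleStats:
    # Illustrative stand-in for the timestamp fields on thunder's CompileStats.
    def __init__(self):
        self.last_prologue_execution_start = 0
        self.last_prologue_execution_stop = 0


def prologue_execution_timer(stats, fn):
    # Record wall-clock timestamps around fn; the finally block mirrors the diff,
    # so the stop timestamp is written even if fn raises.
    @wraps(fn)
    def wrapped(*args, **kwargs):
        stats.last_prologue_execution_start = time.perf_counter_ns()
        try:
            return fn(*args, **kwargs)
        finally:
            stats.last_prologue_execution_stop = time.perf_counter_ns()

    return wrapped


def unpack_inputs(x, y):
    # Stand-in for a generated prologue callable returning (inputs, pro_to_epi).
    return (x, y), None


stats = SimpleStats()
timed_prologue = prologue_execution_timer(stats, unpack_inputs)
inps, pro_to_epi = timed_prologue(1, 2)
print("prologue ns:", stats.last_prologue_execution_stop - stats.last_prologue_execution_start)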
thunder/benchmarks/benchmark_litgpt.py

Lines changed: 21 additions & 15 deletions
@@ -532,7 +532,7 @@ def setup_distributed(self, model):
         return model

     def setup_activation_checkpointing(self):
-        if "thunder" in self.compile:
+        if "thunder" in self.compile and "dynamo" not in self.compile:
             # checkpointing is an option to thunder.jit
             return

@@ -571,25 +571,23 @@ def setup_compile(self, model):

             executors.insert(0, transformer_engine_ex)

-        jit_options = {
-            "enable_saved_for_backward_recomputation": self.checkpoint_activations,
-            "recomputation_policy": None,
-        }
-
         if "dynamo" in self.compile:
             if self.distributed_mode == "fsdp2":
                 print("Resetting cache size for when fsdp2 and using thunder as backend torch.compile")
                 import torch._dynamo.config as dynamo_config

                 dynamo_config.cache_size_limit = 64

-            backend = ThunderCompiler(executors=executors, **jit_options)
+            self.backend = ThunderCompiler(executors=executors)
             # Because Lightning Fabric is imported in this script it monkey patches the torch.compile function
             # https://github.com/Lightning-AI/pytorch-lightning/blob/828fd998961f6a60f92c35254bb94d6e049ad069/src/lightning/fabric/wrappers.py#L421
             # using __wrapped__ to access the original torch.compile function did not work
             # so we are using the lower level torch._dynamo.optimize function
-            model = torch._dynamo.optimize(backend=backend)(model)
+            model = torch._dynamo.optimize(backend=self.backend)(model)
         else:
+            jit_options = {
+                "enable_saved_for_backward_recomputation": self.checkpoint_activations,
+            }
             jit_options["fp8_shard_intermediate_activation"] = self.fp8_shard_intermediate_activation
             model = thunder.jit(model, executors=executors, **jit_options)

@@ -844,16 +842,24 @@ def benchmark_main(return_metrics_as_json=False, json_path="", **kwargs) -> None
             for jitted in benchmark.thunder_as_torch_compile_backend.gm_to_thunder.values():
                 fwd_traces.append(thunder.last_traces(jitted))
                 bwd_traces.append(thunder.last_backward_traces(jitted))
-        else:
+        elif "dynamo" not in benchmark.compile:
             fwd_traces = [thunder.last_traces(benchmark.model)]
             bwd_traces = [thunder.last_backward_traces(benchmark.model)]

-        for i, f_traces in enumerate(fwd_traces, start=1):
-            print(f"##########\n#{i}-th ThunderModule\n##########")
-            print(f_traces[-1])
-        for i, b_traces in enumerate(bwd_traces, start=1):
-            print(f"##########\n#{i}-th ThunderModule\n##########")
-            print(b_traces[-1])
+        if "dynamo" in benchmark.compile:
+            for gid, infos in enumerate(benchmark.backend.subgraph_infos):
+                for subgid, thunder_fn in enumerate(infos.thunder_compiled_fns):
+                    print(f"##########\n#Graph{gid}-ThunderFn{subgid} last forward trace\n##########")
+                    print(thunder.last_traces(thunder_fn)[-1])
+                    print(f"##########\n#Graph{gid}-ThunderFn{subgid} last backward trace\n##########")
+                    print(thunder.last_backward_traces(thunder_fn)[-1])
+        else:
+            for i, f_traces in enumerate(fwd_traces, start=1):
+                print(f"##########\n#{i}-th ThunderModule\n##########")
+                print(f_traces[-1])
+            for i, b_traces in enumerate(bwd_traces, start=1):
+                print(f"##########\n#{i}-th ThunderModule\n##########")
+                print(b_traces[-1])

     if global_rank in [0, None]:
         if return_metrics_as_json:

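For the dynamo path, the benchmark now keeps the ThunderCompiler instance on self.backend so that benchmark_main can later walk backend.subgraph_infos and print the last forward and backward trace of every thunder-compiled FX subgraph. A hedged sketch of that flow follows, assuming the thunder.dynamo.ThunderCompiler import path and the subgraph_infos / thunder_compiled_fns fields used in the diff; the toy model and batch are illustrative.

import torch
import thunder
from thunder.dynamo import ThunderCompiler  # import path assumed from recent lightning-thunder releases

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2))
batch = torch.randn(4, 8)

backend = ThunderCompiler()  # the benchmark also passes executors=[...]
compiled = torch._dynamo.optimize(backend=backend)(model)
compiled(batch).sum().backward()  # run once so dynamo hands its subgraphs to thunder

# One SubgraphInfo per dynamo graph; each may hold several thunder-compiled functions.
for gid, info in enumerate(backend.subgraph_infos):
    for subgid, thunder_fn in enumerate(info.thunder_compiled_fns):
        print(f"Graph{gid}-ThunderFn{subgid} last forward trace")
        print(thunder.last_traces(thunder_fn)[-1])
        print(f"Graph{gid}-ThunderFn{subgid} last backward trace")
        print(thunder.last_backward_traces(thunder_fn)[-1])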
thunder/common.py

Lines changed: 2 additions & 3 deletions
@@ -64,7 +64,6 @@


 # Holds statistics and caches for a compiled function
-# TODO RC1 Update last_executed to last_computation
 # TODO RC1 Review how autograd traces are presented
 class CompileStats:
     """A class holding statistics and caches for a compiled function.
@@ -76,7 +75,7 @@ class CompileStats:
     See :mod:`thunder` for more of such utility functions.

     Attributes:
-        last_executed:
+        last_computation (Callable):
         last_traces (Sequence[TraceCtx]):
         last_prologue (TraceCtx):
         last_prologue_traces (Sequence[TraceCtx]):
@@ -107,7 +106,7 @@ class CompileStats:

     def __init__(self):
         # Callables and traces
-        self.last_executed = None
+        self.last_computation = None
         self.last_traces = None
         self.last_prologue = None
         self.last_prologue_traces = None

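The docstring and attribute rename (last_executed to last_computation) matches the assignment cs.last_computation = cache_entry.computation_fn in thunder/__init__.py above. A small hedged example of reading the renamed field back after a call, assuming thunder.compile_stats is the accessor that returns a jitted callable's CompileStats:

import torch
import thunder


def square(x):
    return x * x


jsquare = thunder.jit(square)
jsquare(torch.randn(3))

stats = thunder.compile_stats(jsquare)  # assumed accessor returning the CompileStats instance
print(stats.calls)                      # number of calls recorded by update_call_statistics
print(stats.last_computation)           # compiled computation callable (previously last_executed)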
thunder/core/jit_ext.py

Lines changed: 1 addition & 1 deletion
@@ -1512,7 +1512,7 @@ def is_variableified_tensorproxy(v: Variable | Proxy) -> Proxy:
        return not isinstance(p, TensorProxy)

    # TODO: This is just a WAR to get things working. We'll revisit this when
-    # we deal with cosntraints in prologue trace.
+    # we deal with constraints in prologue trace.
    #
    # We sort variables to before `unpack` to put TensorProxy before others.
    # Because we could have TensorProxy.shape be part of `pro_to_xxx` along with

thunder/executors/apex_fused_rms_norm_impl.py

Lines changed: 6 additions & 0 deletions
@@ -14,6 +14,12 @@

 APEX_FUSED_NORMS_AVAILABLE = True
 try:
+    # Fused layer norm is only importable if torch.distributed is available
+    # https://github.com/NVIDIA/apex/issues/1853
+    from torch.distributed import is_available
+
+    if not is_available():
+        raise ImportError
     import fused_layer_norm_cuda
     from apex.normalization.fused_layer_norm import FusedRMSNormAffineMixedDtypesFunction
 except ImportError:

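The guard above folds "torch.distributed is present but unusable" into the same ImportError branch that already covers "apex is not installed", because apex's fused layer norm imports torch.distributed internally (NVIDIA/apex issue 1853). The pattern generalizes to any optional extension; in this sketch SOME_EXTENSION_AVAILABLE and some_optional_extension are illustrative names, not thunder identifiers.

# Probe a prerequisite first and raise ImportError manually so that both
# "dependency missing" and "dependency unusable" take the same fallback path.
SOME_EXTENSION_AVAILABLE = True
try:
    from torch.distributed import is_available

    if not is_available():
        raise ImportError
    import some_optional_extension  # illustrative optional dependency
except ImportError:
    SOME_EXTENSION_AVAILABLE = False

print("extension available:", SOME_EXTENSION_AVAILABLE)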
thunder/tests/test_apex_fused_norms.py

Lines changed: 10 additions & 1 deletion
@@ -3,14 +3,19 @@
 from torch.testing import assert_close

 fused_layer_norm_cuda = pytest.importorskip("fused_layer_norm_cuda")
-from apex.normalization.fused_layer_norm import FusedRMSNormAffineMixedDtypesFunction
+
+from torch.distributed import is_available
 from thunder.executors.apexex import apex_ex
 import thunder


+# See https://github.com/NVIDIA/apex/issues/1853
+@pytest.mark.skipif(not is_available(), reason="torch.distributed is not available")
 @pytest.mark.parametrize("requires_grad", [True, False])
 @pytest.mark.parametrize("memory_efficient", [True, False])
 def test_apex_fused_rms_norm(requires_grad, memory_efficient):
+    from apex.normalization.fused_layer_norm import FusedRMSNormAffineMixedDtypesFunction
+
     def fn(x, weight, normalized_shape, eps):
         return FusedRMSNormAffineMixedDtypesFunction.apply(x, weight, normalized_shape, eps, memory_efficient)

@@ -34,9 +39,13 @@ def fn(x, weight, normalized_shape, eps):
     assert_close(actual_grad, expected_grad)


+# See https://github.com/NVIDIA/apex/issues/1853
+@pytest.mark.skipif(not is_available(), reason="torch.distributed is not available")
 @pytest.mark.parametrize("requires_grad", [True, False])
 @pytest.mark.parametrize("memory_efficient", [True, False])
 def test_apex_fused_rms_norm_autoregister(requires_grad, memory_efficient):
+    from apex.normalization.fused_layer_norm import FusedRMSNormAffineMixedDtypesFunction
+
     def fn(x, weight, normalized_shape, eps):
         return FusedRMSNormAffineMixedDtypesFunction.apply(x, weight, normalized_shape, eps, memory_efficient)
