Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ThunderFX: Save the reproducer script into files #1380

Merged
merged 28 commits into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
8628bf4
copy Tom's code and use it in ThunderCompiler (#1082)
kiya00 Oct 23, 2024
de955c4
Add env information
kiya00 Oct 24, 2024
5e090b1
record the input information in SubgraphInfo and save reproducer afte…
kiya00 Oct 28, 2024
58c1b70
add comments
kiya00 Oct 29, 2024
d9a07a8
fix test
kiya00 Oct 31, 2024
d94ad53
fix bug: _get_example_inputs_from_placeholder
kiya00 Oct 31, 2024
414c270
Add comments
kiya00 Oct 31, 2024
c0a7b7c
support torch.types.py_sym_types
kiya00 Nov 11, 2024
d685fde
Add torch.fx.Graph in comments
kiya00 Nov 11, 2024
ffa5d22
fix rebase conflict
kiya00 Nov 11, 2024
df22f08
fix test
kiya00 Nov 12, 2024
a4f4567
Use the original thunder options as default
kiya00 Nov 14, 2024
4abc092
fix test: no example_value in node.meta
kiya00 Nov 14, 2024
43f8b16
use torch.testing.make_tensor
kiya00 Nov 14, 2024
4423dce
fix
kiya00 Nov 14, 2024
8ac866e
fix test
kiya00 Nov 14, 2024
67f65cf
use torch.full instead of make_tensor when low==high, add save_dynamo…
kiya00 Nov 15, 2024
a257896
fix: symInt in stride
kiya00 Nov 15, 2024
db6f040
fix
kiya00 Nov 15, 2024
d807902
modify repro script
kiya00 Nov 19, 2024
34c0c43
update test
kiya00 Nov 19, 2024
e0e0fe7
fix: collect peak memory only when cuda is available
kiya00 Nov 19, 2024
4554dcf
use the original GraphModule
kiya00 Nov 19, 2024
d457c19
fix: aminmax throws error if input is empty tensor
kiya00 Nov 19, 2024
83c231e
follow comments
kiya00 Nov 20, 2024
e55623d
follow comments
kiya00 Nov 26, 2024
d302b63
Merge branch 'main' into dump_reproducer
kiya00 Nov 26, 2024
1752509
fix tests, splitter indexing from 0
kiya00 Nov 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions thunder/benchmarks/benchmark_litgpt.py
kiya00 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ def __init__(
use_torchao_fp8_allgather: bool = False,
use_torchao_fp8_precompute_scale_for_fsdp: bool = False,
fp8_shard_intermediate_activation: bool = False,
save_dynamo_repro: str | None = None,
):
seed = 1337
torch.manual_seed(seed)
Expand Down Expand Up @@ -275,6 +276,11 @@ def __init__(
self.dump_thunder_traces = dump_thunder_traces
self.dump_memory_snapshot = dump_memory_snapshot
self.fp8_shard_intermediate_activation = fp8_shard_intermediate_activation
if save_dynamo_repro is not None:
assert (
"dynamo" in self.compile and "thunder" in self.compile
), "save_dynamo_repro can only be used if --compile=thunder+dynamo"
self.save_dynamo_repro = save_dynamo_repro

if use_torchao_fp8_linear:

Expand Down Expand Up @@ -892,6 +898,9 @@ def benchmark_main(return_metrics_as_json=False, json_path="", **kwargs) -> None
print(f"##########\n#{i}-th ThunderModule\n##########")
print(b_traces[-1])

if benchmark.save_dynamo_repro:
benchmark.backend.save_reproducer_to_folder(benchmark.save_dynamo_repro)

if global_rank in [0, None]:
if return_metrics_as_json:
benchmark.add_model_info_to_metrics()
Expand Down
3 changes: 3 additions & 0 deletions thunder/core/transform_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,9 @@ def reverse_transform_state_dict_for_submodule(
) -> dict[str, Any]:
return state_dict

def __repr__(self) -> str:
    """Return a module-qualified representation, e.g. ``thunder.core.transform_common.Transform()``."""
    klass = self.__class__
    return f"{klass.__module__}.{klass.__name__}()"

IvanYashchuk marked this conversation as resolved.
Show resolved Hide resolved

def order_proxies(bsyms: Sequence[BoundSymbol]) -> dict[str, int]:
"""computes a canonical ordering of proxies in the bound symbols based on the order of appearance
Expand Down
48 changes: 47 additions & 1 deletion thunder/dynamo/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
import torch

from thunder.core.baseutils import run_once
from thunder.dynamo.utils import recompile_graph, remove_empty_autocast
from thunder.core.utils import safe_zip
from thunder.dynamo.utils import recompile_graph, remove_empty_autocast, reproducer, CompilerType
from thunder.dynamo.splitter import _splitter

if TYPE_CHECKING:
from thunder.dynamo.utils import SubgraphInfo
from os import PathLike


@run_once
Expand Down Expand Up @@ -83,3 +85,47 @@ def __call__(self, gm: torch.fx.GraphModule, sample_args: list[torch.SymInt, tor
split_module, subgraph_info = _splitter(gm, self._thunder_jit, self._torch_compile, sample_args)
self.subgraph_infos.append(subgraph_info)
return split_module

def save_reproducer_to_folder(self, reproducer_folder: str | PathLike, use_pytest_benchmark: bool = False):
    """
    Save the reproducer script for the GraphModule executed by Thunder to the specified ``reproducer_folder``.
    Each saved script is named as "graph[graph_id]_thunder_[module_id]", where:

    - `graph_id` indexes the graph generated by Dynamo, which is then passed to Thunder.
    - `module_id` indexes the submodule split by the :func:`thunder.dynamo.utils._splitter`.

    Both `graph_id` and `module_id` start from 1.

    Args:
        reproducer_folder (str | PathLike): The folder where the reproducer code will be written.
        use_pytest_benchmark (bool): Determines the type of script to create:

            - If use_pytest_benchmark=False: Creates a reproducer script.
            - If use_pytest_benchmark=True: Creates a benchmark script to compare the reproducer's performance with other backends, including Torch eager, torch.compile, and torch.compile with `backend="eager"`.

    Raises:
        TypeError: If the backend has not been invoked yet (no subgraph information recorded).
    """
    if not self.subgraph_infos:
        raise TypeError(f"{self} doesn't seem to have been called yet.")

    for graph_idx, subgraph_info in enumerate(self.subgraph_infos):
        # Names of the Thunder-compiled submodules, in their call order within the split graph.
        thunder_module_names = [
            node.target
            for node in subgraph_info.split_graph_module.graph.nodes
            if isinstance(node.target, str) and node.target.startswith("thunder_")
        ]
        # Original (uncompiled) fx submodules that the splitter routed to Thunder.
        original_thunder_modules = (
            m
            for m, compiled_m in subgraph_info.submodule_to_compiled_functions.items()
            if compiled_m.compiler == CompilerType.THUNDER
        )
        example_inputs = subgraph_info.thunder_compiled_fns_example_inputs
        # safe_zip raises if the three sequences disagree in length, guarding against
        # a mismatch between split modules, compiled functions, and recorded inputs.
        for cur_module, example_input, cur_name in safe_zip(
            original_thunder_modules, example_inputs, thunder_module_names
        ):
            reproducer(
                cur_module,
                self.thunder_options,
                example_input,
                reproducer_folder,
                f"graph{graph_idx + 1}_{cur_name}",
                use_pytest_benchmark,
            )
23 changes: 14 additions & 9 deletions thunder/dynamo/compiler_graph_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,19 +103,24 @@ def run_bench(self, gm: torch.fx.GraphModule, name: str, *sample_args):
if self.post_graph:
compiled_fn = self.post_graph(compiled_fn, sample_args)

with record_peak_allocated_memory(self.bench):
if torch.cuda.is_available():
kiya00 marked this conversation as resolved.
Show resolved Hide resolved
with record_peak_allocated_memory(self.bench):
self.bench(compiled_fn, *sample_args)
else:
self.bench(compiled_fn, *sample_args)
# BenchmarkFixture.stats is created each time bench is called (ref: https://github.com/pybenchmark/pytest-benchmark/blob/8c9a5faa1dd178b53ab7b2a66f5364a77e903d74/src/pytest_benchmark/fixture.py#L150)
# Adds the graph number, split module name and executor suffix to the name string
gid_key, module_name_key, ex_key = GRAPH_BY_GRAPH_BENCHMARK_PARAMS_KEYS
self.bench.stats.name += f"-{gid_key}[{self.graph_idx+1}]-{module_name_key}[{name}]-{ex_key}[{ex_name}]"
assert MAX_ALLOCATED_MEMORY_KEYWORD in self.bench.extra_info
assert f"{self.bench.stats.name}_{MAX_ALLOCATED_MEMORY_KEYWORD}" not in self.bench.extra_info
# NOTE: A benchmark can include multiple stats, but only one extra_info field is allowed per benchmark.
# Therefore, we use the current stats name as a prefix to distinguish memory usage for each stats.
self.bench.extra_info[f"{self.bench.stats.name}_{MAX_ALLOCATED_MEMORY_KEYWORD}"] = (
self.bench.extra_info.pop(MAX_ALLOCATED_MEMORY_KEYWORD)
)

if torch.cuda.is_available():
assert MAX_ALLOCATED_MEMORY_KEYWORD in self.bench.extra_info
assert f"{self.bench.stats.name}_{MAX_ALLOCATED_MEMORY_KEYWORD}" not in self.bench.extra_info
# NOTE: A benchmark can include multiple stats, but only one extra_info field is allowed per benchmark.
# Therefore, we use the current stats name as a prefix to distinguish memory usage for each stats.
self.bench.extra_info[f"{self.bench.stats.name}_{MAX_ALLOCATED_MEMORY_KEYWORD}"] = (
self.bench.extra_info.pop(MAX_ALLOCATED_MEMORY_KEYWORD)
)

# when the graph is segmented, the self.bench run multiple times, pybenchmark throws an error:
# `FixtureAlreadyUsed("Fixture can only be used once. Previously it was used in %s mode." % self._mode)`
Expand Down Expand Up @@ -158,7 +163,7 @@ def has_checkpoint_node(g):
cur_nodes = cur_module.graph.nodes
# Creates random input values for the current module based on the faketensor 'example_value' of the placeholder node
placeholders = list(n for n in cur_nodes if n.op == "placeholder")
args = chain(*map(_get_example_inputs_from_placeholder, placeholders))
args = list(map(_get_example_inputs_from_placeholder, placeholders))
IvanYashchuk marked this conversation as resolved.
Show resolved Hide resolved
# Runs the benchmark on the original module with the generated random inputs
self.run_bench(compiled_functions_to_submodule[cur_module], target, *args)
self.graph_idx += 1
Expand Down
10 changes: 10 additions & 0 deletions thunder/dynamo/splitter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations
from typing import TYPE_CHECKING
import copy
from functools import partial

import torch
from torch.fx.passes.split_module import split_module
Expand All @@ -16,6 +17,7 @@
update_node_and_submodule,
recompile_graph,
checkpoint_converter,
_get_example_inputs_from_placeholder,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -142,11 +144,18 @@ def is_thunder_supported_partition(node: torch.fx.Node) -> bool:

# Call compile on the split region/s.
thunder_compiled_fns = []
example_input_metadatas = []
submodule_to_compiled_fns = {}
for node in split_gm.graph.nodes:
node_name = node.name
if is_thunder_supported_partition(node):
graph_module = getattr(split_gm, node.name)
# Record the input tensor metadata of the current module based on the faketensor 'example_value' of the placeholder node
kiya00 marked this conversation as resolved.
Show resolved Hide resolved
placeholders = list(n for n in graph_module.graph.nodes if n.op == "placeholder")
example_input_metadata = map(
partial(_get_example_inputs_from_placeholder, only_metadata=True), placeholders
)
example_input_metadatas.append(list(example_input_metadata))
# Replace PyTorch operators within the checkpointed function with the corresponding Thunder operators
checkpoint_converter(split_gm, graph_module)
jit_fn = thunder_jit(graph_module)
Expand Down Expand Up @@ -176,6 +185,7 @@ def is_thunder_supported_partition(node: torch.fx.Node) -> bool:
original_split_gm,
split_gm,
thunder_compiled_fns,
example_input_metadatas,
submodule_to_compiled_fns,
split_reasons,
)
Loading
Loading