A converter for FXGraph with Torch calls -> FXGraph with Thunder calls #1261

Merged Nov 4, 2024 (25 commits; the diff below shows changes from 19 commits)

Commits:
e0d394d - Add fxgraph with torch function to fxgraph with thunder symbol converter (kiya00, Oct 4, 2024)
b554a4f - fix: torch.ops.higher_order.tag_activation_checkpoint cause the graph… (kiya00, Oct 4, 2024)
b08fe09 - add test (kiya00, Oct 4, 2024)
88c3669 - fix (kiya00, Oct 7, 2024)
30af37c - move the converter into dynamo utils (kiya00, Oct 9, 2024)
93d4492 - make the converter work with splitter (kiya00, Oct 11, 2024)
a6833ca - Keep the original split_module since the converter changes the checkp… (kiya00, Oct 11, 2024)
84df6b6 - mv test to test_dynamo.py (kiya00, Oct 11, 2024)
184abfe - clean up (kiya00, Oct 11, 2024)
84257dc - add test (kiya00, Oct 11, 2024)
6d8d8a1 - follow comments (kiya00, Oct 14, 2024)
6305828 - rm the original_split_module (kiya00, Oct 15, 2024)
d0347b8 - rm original split module graph (kiya00, Oct 15, 2024)
951dd8b - [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Oct 15, 2024)
8cc83e6 - ignore futurewarning (kiya00, Oct 17, 2024)
ecac46f - Apply suggestions from code review (kiya00, Oct 18, 2024)
e826faa - use pytest.mark.filterwarnings (kiya00, Oct 18, 2024)
4ff8580 - fix rebase functional-autograd-checkpoint (kiya00, Oct 18, 2024)
856141c - follow comments (kiya00, Oct 18, 2024)
5408478 - Apply suggestions from code review (kiya00, Oct 29, 2024)
4f453cb - Merge branch 'main' into basedon-functional-autograd-checkpoint (kiya00, Oct 29, 2024)
f417269 - fix CI: torch nightly has changed the structure of split_gm (kiya00, Oct 29, 2024)
f25cd0d - Merge branch 'main' into basedon-functional-autograd-checkpoint (kiya00, Oct 30, 2024)
1371079 - Merge branch 'main' into basedon-functional-autograd-checkpoint (kiya00, Oct 31, 2024)
2489292 - Merge branch 'main' into basedon-functional-autograd-checkpoint (IvanYashchuk, Nov 4, 2024)
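For context, here is a minimal end-to-end sketch of the workflow this PR enables, mirroring the tests added to thunder/tests/test_dynamo.py further below. The module shape, the names, and the use_reentrant flag are illustrative choices, and a CUDA device is assumed as in the tests.

# Illustrative sketch, not part of the diff: compile a checkpointed model with the ThunderCompiler backend.
import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint

from thunder.dynamo import ThunderCompiler

class CheckpointedModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(10, 10)

    def forward(self, x):
        # The checkpointed call shows up as a torch.ops.higher_order.tag_activation_checkpoint
        # node in the dynamo FX graph; this PR converts the wrapped submodule to Thunder symbols
        # before the partition is passed to thunder_jit.
        return checkpoint.checkpoint(self.layer, x, use_reentrant=False)

backend = ThunderCompiler()
model = CheckpointedModel().cuda()
compiled = torch.compile(model, backend=backend)
out = compiled(torch.randn(5, 10, device="cuda"))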
3 changes: 3 additions & 0 deletions thunder/dynamo/splitter.py
@@ -14,6 +14,7 @@
get_nodes_in_unsupported_ctx_regions,
update_node_and_submodule,
recompile_graph,
checkpoint_converter,
)

if TYPE_CHECKING:
@@ -143,6 +144,8 @@ def is_thunder_supported_partition(node: torch.fx.Node) -> bool:
for node in split_gm.graph.nodes:
if is_thunder_supported_partition(node):
graph_module = getattr(split_gm, node.name)
# Replace the torch operators within the function called by activation checkpoint with the corresponding Thunder symbols
checkpoint_converter(split_gm, graph_module)
jit_fn = thunder_jit(graph_module)
# Update the node name from "submod_*" to "thunder_*" for more user-friendly names
update_node_and_submodule(split_gm, node, node.name.replace("submod", "thunder"), jit_fn)
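As a reading aid for the splitter change above, the split module produced by the backend can be inspected after one call of the compiled model, following the pattern used in test_checkpoint_converter_submodule further below. The backend object is assumed to be a ThunderCompiler instance that has already been used with torch.compile.

# Assumes `backend = ThunderCompiler()` was passed to torch.compile and the compiled model was called once.
subgraph_info = backend.subgraph_infos[0]
split_gm = subgraph_info.split_graph_module

# Thunder-supported partitions are renamed from "submod_*" to "thunder_*" by update_node_and_submodule,
# and the body of a checkpointed region lives in a "wrap_body_*" submodule of the split module.
for name, submodule in split_gm.named_children():
    print(name, type(submodule))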
84 changes: 83 additions & 1 deletion thunder/dynamo/utils.py
@@ -5,14 +5,15 @@
import dataclasses
import inspect
import itertools
import warnings
import copy

import torch

from thunder.torch.default_torch_ops import torch_auto_registered_ops
from thunder.torch import _torch_to_thunder_function_map
from thunder.torch.langctx import torchctx
from thunder.core.utils import check
from thunder.core.pytree import tree_map

if TYPE_CHECKING:
from thunder.core.symbol import Symbol
@@ -259,6 +260,27 @@ def is_no_grad_ctx_exit(node):
return nodes_in_unsupported_ctx_regions


def is_graphmodule_supported_by_thunder(gm):
nodes_in_unsupported_ctx_regions = get_nodes_in_unsupported_ctx_regions(gm)
for node in gm.graph.nodes:
if node.op in (
"placeholder",
"get_attr",
"output",
):
continue
if node in nodes_in_unsupported_ctx_regions:
split_reason = SplitReason(
SplitReasonType.UNSUPPORTED_NODE,
info=f"node with name: {node.name} and target: {node.target} is not supported probably because it is in unsupported context.",
)
return False, split_reason
is_thunder_supported, split_reason = is_node_supported_by_thunder(node)
if not is_thunder_supported:
return False, split_reason
return True, None


def is_node_supported_by_thunder(node: torch.fx.Node) -> tuple[bool, SplitReason | None]:
"""
Determine whether thunder can execute the operation described by this node.
@@ -306,6 +328,14 @@ def is_node_supported_by_thunder(node: torch.fx.Node) -> tuple[bool, SplitReason | None]:
)
return False, split_reason

# If the operation is the higher-order operator used for activation checkpointing, check whether the checkpointed submodule is supported by Thunder
if target is torch.ops.higher_order.tag_activation_checkpoint:
m = node.graph.owning_module
assert hasattr(m, node.args[0].name)
checkpointed_fn = getattr(m, node.args[0].name)
is_module_supported, split_reason = is_graphmodule_supported_by_thunder(checkpointed_fn)
return is_module_supported, split_reason

# If thunder has a mapping for this operation, try executing the meta function and see.
# We have a symbol for `torch.where`, but we don't support one overload of it.
# So, we try and execute the meta to get a real signal.
@@ -418,3 +448,55 @@ def _get_example_inputs_from_placeholder(node) -> tuple[torch.Tensor]:
raise TypeError(
"The 'example_value' in the placeholder node is expected to be either a Tensor or a Tuple of Tensors."
)


def _checkpoint_function_converter(gm: torch.fx.GraphModule):
"""
Replace, in place, the Torch operators in the GraphModule called by the activation checkpoint operator with the corresponding Thunder symbols
Args:
gm: The GraphModule of the checkpointed function, which is modified in place
"""
new_graph = copy.deepcopy(gm.graph)
for n in new_graph.nodes:
# replace the torch operator in "call_function" node
if n.op == "call_function":
assert isinstance(n.target, Callable)
if n.target.__module__ in ("_operator", "builtins"):
continue
check(
n.target in _torch_to_thunder_function_map, lambda: f"Unexpected {n.target}, not registered in Thunder"
)
with new_graph.inserting_before(n):
thunder_node = new_graph.call_function(
_torch_to_thunder_function_map[n.target], args=n.args, kwargs=n.kwargs
)
n.replace_all_uses_with(thunder_node)
new_graph.erase_node(n)
else:
if n.op == "call_module":
raise RuntimeError(
"Unexpected call_module detected inside a checkpoint. This should have been inlined in dynamo graphs"
)
new_graph.lint()
gm.graph = new_graph
recompile_graph(gm)


def checkpoint_converter(gm: torch.fx.GraphModule, sub_gm: torch.fx.GraphModule):
"""
Utility function to convert the GraphModule that uses activation checkpointing into a Thunder-traceable GraphModule.

Args:
gm: The parent GraphModule containing the submodule (sub_gm), as well as the GraphModule of the checkpointed function.
sub_gm: the GraphModule containing the checkpoint operator

Note:
The GraphModule of the checkpointed function is updated in place
"""
for n in sub_gm.graph.nodes:
if n.op == "call_function":
if n.target in (torch.ops.higher_order.tag_activation_checkpoint,):
name = n.args[0].name
assert hasattr(gm, name)
function_module = getattr(gm, name)
_checkpoint_function_converter(function_module)
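The conversion above relies on _torch_to_thunder_function_map: every call_function node whose target is a Torch callable (outside of _operator and builtins) is replaced by its Thunder counterpart, which is a thunder.core.symbol.Symbol. Below is a small illustrative check, assuming a working Thunder installation; torch.sin is used only as an example entry of the map.

# Illustrative lookup, not part of the diff: the map used by _checkpoint_function_converter
# associates Torch callables with Thunder symbols.
import torch
from thunder.core.symbol import Symbol
from thunder.torch import _torch_to_thunder_function_map

thunder_sin = _torch_to_thunder_function_map[torch.sin]
assert isinstance(thunder_sin, Symbol)  # the same check done in test_checkpoint_converter_submodule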
86 changes: 86 additions & 0 deletions thunder/tests/test_dynamo.py
@@ -1,11 +1,15 @@
import pytest
import warnings
import torch
import torch.fx
import torch.nn as nn
import torch.nn.functional as F

from thunder import dtypes
from thunder.dynamo import ThunderCompiler
from thunder.dynamo.compiler_graph_benchmark import ThunderCompilerGraphBenchmarking
from thunder import last_traces
from thunder.core.symbol import Symbol
from thunder.tests.bf16 import device_supports_bf16
from thunder.tests.framework import (
instantiate,
@@ -535,3 +539,85 @@ def f(x):
)
compiled = torch.compile(backend=backend)(f)
compiled(x)


@requiresCUDA
@pytest.mark.filterwarnings(r"ignore:`torch\.cpu\.amp\.autocast\((.*?)\)` is deprecated.*:FutureWarning")
def test_checkpoint_converter():
import torch.utils.checkpoint as checkpoint

class SimpleModel(nn.Module):
def __init__(self):
super().__init__()
self.layer1 = nn.Linear(10, 20)
self.layer2 = nn.Linear(20, 20)

def forward(self, x):
x = torch.sin(x)
x = checkpoint.checkpoint(self.layer1, x)
x = checkpoint.checkpoint(self.layer2, x)
x = F.relu(x)
return x

# Input tensor
x = torch.randn(5, 10).cuda().requires_grad_()
x_ref = x.detach().requires_grad_()

model = SimpleModel().cuda().train()
ref_model = SimpleModel().cuda().train()
ref_model.load_state_dict(model.state_dict())

backend = ThunderCompiler()
jf = torch.compile(backend=backend)(model)

ref_out = ref_model(x_ref)
out = jf(x)
torch.testing.assert_close(ref_out, out)

g = torch.randn_like(out)
out.backward(g)

ref_g = g.clone()
ref_out.backward(ref_g)
torch.testing.assert_close(x.grad, x_ref.grad)
torch.testing.assert_close(tuple(model.parameters()), tuple(ref_model.parameters()))


@requiresCUDA
def test_checkpoint_converter_submodule():
import torch.utils.checkpoint as checkpoint

class SubModule(torch.nn.Module):
def __init__(self) -> None:
super().__init__()
self.lin = nn.Sequential(nn.ReLU(), nn.Linear(10, 10))

def forward(self, x):
return self.lin(x)

class SimpleModel(torch.nn.Module):
def __init__(self) -> None:
super().__init__()
self.sub_mod = SubModule()

def forward(self, x):
x = torch.sin(x)
x = checkpoint.checkpoint(self.sub_mod, x)
return x

x = torch.randn(5, 10).cuda().requires_grad_()
model = SimpleModel().cuda().train()
backend = ThunderCompiler()
jf = torch.compile(backend=backend)(model)
out = jf(x)

subgraph_info = backend.subgraph_infos[0]
split_m = subgraph_info.split_graph_module
submodule_name = "wrap_body_0"
assert hasattr(split_m, submodule_name)

submodule = getattr(split_m, submodule_name)

for n in submodule.graph.nodes:
if n.op == "call_function":
assert isinstance(n.target, Symbol)
4 changes: 2 additions & 2 deletions thunder/torch/__init__.py
@@ -5262,8 +5262,8 @@ def _backward_checkpoint(
) -> tuple[None | TensorLike, ...]:
from thunder.core.transforms import vjp

result = vjp(function)(args, grad_outputs, **kwargs)
return result
result, grads = vjp(function)(args, grad_outputs, **kwargs)
return grads


#