pytorch
diff --git a/‎functorch/_src/aot_autograd.py‎
Lines changed: 107 additions & 34 deletions b/‎functorch/_src/aot_autograd.py‎
Lines changed: 107 additions & 34 deletions
diff --git a/‎functorch/_src/partitioners.py‎
Lines changed: 46 additions & 2 deletions b/‎functorch/_src/partitioners.py‎
Lines changed: 46 additions & 2 deletions
@@ -1,14 +1,14 @@
 import torch
 import torch.nn as nn
-from torch import Tensor
+from torch import Tensor, is_grad_enabled
 from functorch import make_fx
 from torch.fx import immutable_collections
 import torch.utils._pytree as pytree
 import torch.utils.dlpack
 from torch.nn.utils import _stateless
 from functorch._C import CompileCache
 from .decompositions import register_decomposition
-from .partitioners import default_partition
+from .partitioners import default_partition, _get_saved_values, _extract_fwd_bwd_modules, _extract_fwd_bwd_modules_db
 from .named_members_polyfill import _named_parameters, _named_buffers
 from typing import Callable, List, Dict, Any, Tuple, Optional
 from functools import wraps
@@ -54,7 +54,7 @@ def _dict_unflatten(values: List[Any], context: Context) -> Dict[Any, Any]:
 
 def create_joint_forward_backward(fn):
     def joint_forward_backward(
-        primals: List[Any], tangents: List[Any]
+        primals: List[Any], cotangents: List[Any]
     ) -> Tuple[List[Any], List[Any]]:
         # Call the forward pass
         outs = fn(*primals)
@@ -68,21 +68,21 @@ def joint_forward_backward(
                 grad_primals.append(p)
 
         # Get the outputs that need gradients
-        assert len(tangents) == len(outs)
+        assert len(cotangents) == len(outs)
         needed_outs = []
-        needed_tangents = []
-        for out, tangent in zip(outs, tangents):
+        needed_cotangents = []
+        for out, cotangent in zip(outs, cotangents):
             if isinstance(out, Tensor) and out.requires_grad:
                 needed_outs.append(out)
-                needed_tangents.append(tangent)
+                needed_cotangents.append(cotangent)
         backward_out = []
         # Call the backwards pass
         if grad_primals:
             backward_out = torch.autograd.grad(
                 needed_outs,
                 grad_primals,
-                grad_outputs=needed_tangents,
-                allow_unused=True,
+                grad_outputs=needed_cotangents,
+                allow_unused=True
             )
         backward_out_iter = iter(backward_out)
         return outs, [
@@ -138,14 +138,18 @@ def create_aot_autograd_function(
     joint_forward_backward = create_joint_forward_backward(flat_fn)
 
     compiled_fw = None
-    compiled_bw = None
+    bw_modules = []
+    fw_module = None
     num_outs = None
+    saved_value_names = None
+    aot_decompositions = {**aot_autograd_decompositions, **decompositions}
 
     class CompiledFunction(torch.autograd.Function):
         @staticmethod
         @disable_torchdynamo
         def forward(ctx, *flat_tensor_args):
-            nonlocal compiled_fw, compiled_bw, num_outs
+            ctx.set_materialize_grads(False)
+            nonlocal compiled_fw, num_outs, saved_value_names, fw_module
             if compiled_fw is None:
                 with torch.set_grad_enabled(grad_state):
                     out = flat_fn(*flat_tensor_args)
@@ -159,34 +163,101 @@ def forward(ctx, *flat_tensor_args):
                     num_outs = 1
 
                 joint_inputs = (flat_tensor_args, out)
-                aot_decompositions = {**aot_autograd_decompositions, **decompositions}
+                # Need it because autograd.Function disables grad in forward
                 with torch.set_grad_enabled(grad_state):
                     fx_g = make_fx(joint_forward_backward, aot_decompositions)(
                         *joint_inputs
                     )
-                fw_module, bw_module = partition_fn(fx_g, joint_inputs)
-                # print(fw_module.code, bw_module.code)
-
+                # This means the forward and backward graphs are created based on the input fn
+                # However we need to take in grad_out for the saved intermediates as well.
+                fw_module, bw_module, saved_value_nodes = partition_fn(fx_g, joint_inputs)
+                saved_value_names = [node.name for node in saved_value_nodes]
                 compiled_fw = fw_compiler(fw_module, flat_tensor_args)
                 fw_outs = normalize_as_list(compiled_fw(*flat_tensor_args))
-
-                bw_args = fw_outs[num_outs:] + fw_outs[0:num_outs]
-                compiled_bw = bw_compiler(bw_module, bw_args)
             else:
                 fw_outs = normalize_as_list(compiled_fw(*flat_tensor_args))
-            ctx.save_for_backward(*fw_outs[num_outs:])
-            return tuple(fw_outs[0:num_outs])
+
+            # print(fw_module.code)
+            ctx.num_intermediate = len(fw_outs[num_outs:])
+            ctx.num_inputs = len(flat_tensor_args)
+            to_be_saved = fw_outs[num_outs:] + list(flat_tensor_args) + fw_outs[0:num_outs]
+            ctx.save_for_backward(*to_be_saved)
+            return tuple(fw_outs)
 
         @staticmethod
         @disable_torchdynamo
-        def backward(ctx, *flat_args):
-            contiguous_args = [t.contiguous() for t in flat_args]
-            # contiguous_args = [t for t in flat_args]
-            out = normalize_as_list(compiled_bw(*ctx.saved_tensors, *contiguous_args))
-            return tuple(out)
-
-    return CompiledFunction
-
+        def backward(ctx, *flat_grad_outs):
+            nonlocal bw_modules, saved_value_names, fw_module, num_outs
+            intermediates = ctx.saved_tensors[:ctx.num_intermediate]
+            inputs = ctx.saved_tensors[ctx.num_intermediate:ctx.num_intermediate+ctx.num_inputs]
+            is_grad_enabled = torch.is_grad_enabled()
+            if not is_grad_enabled:
+                input_flat_grad_outs = []
+                for grad in flat_grad_outs:
+                    if grad is not None:
+                        input_flat_grad_outs.append(grad)
+                with torch.set_grad_enabled(grad_state):
+                    fx_g_b = make_fx(joint_forward_backward, aot_decompositions)(inputs, input_flat_grad_outs)
+                saved_value_nodes = _get_saved_values(fx_g_b, saved_value_names)
+                assert len(saved_value_nodes) <= len(saved_value_names)
+                fw_module_b, bw_module_b, saved_values_new = _extract_fwd_bwd_modules(fx_g_b, saved_value_nodes)
+                if len(saved_values_new) != len(saved_value_names):
+                    new_intermediates = []
+                    # Forward saves more intermediates than needed
+                    assert len(saved_values_new) < len(saved_value_names)
+                    j = 0
+                    for node in saved_values_new:
+                        while node.name != saved_value_names[j]:
+                            j+=1
+                        new_intermediates.append(intermediates[j])
+                        j+=1
+                    intermediates = new_intermediates
+            # else:
+            #     input_flat_grad_outs = flat_grad_outs
+            #     # create_joint_forward_backward takes inputs and cotangents as inps
+            #     # inps: inputs, cotangents: flat_grad_outs
+            #     j_b = create_joint_forward_backward(ctx.fw_module)
+            #     # setting grad is not needed
+            #     with torch.set_grad_enabled(grad_state):
+            #         fx_g_b = make_fx(j_b, aot_decompositions)(inputs, input_flat_grad_outs)
+            #     saved_value_nodes = _get_saved_values(fx_g_b, saved_value_names)
+            #     # print(saved_value_nodes)
+            #     # print(saved_value_names)
+            #     # assert len(saved_value_nodes) == len(saved_value_names)
+            #     fw_module_b, bw_module_b, saved_values_new = _extract_fwd_bwd_modules_db(fx_g_b, saved_value_nodes)
+            #     # print(fx_g_b.code, ctx.fw_module.code, fw_module_b.code, bw_module_b.code)
+            #     # assert fw_module_b.code == fw_module.code
+            #     # print(len(sew), len(saved_value_names))
+            #     if len(saved_values_new) != len(saved_value_names):
+            #         new_intermediates = []
+            #         # Forward saves more intermediates than needed
+            #         assert len(saved_values_new) < len(saved_value_names)
+            #         for node in saved_values_new:
+            #             j = 0
+            #             while node.name != saved_value_names[j]:
+            #                 j+=1
+            #             new_intermediates.append(intermediates[j])
+            #             j+=1
+            #         intermediates = new_intermediates
+
+            # This is needed because aot function caching uses function id right now
+            bw_module_fn = None
+            for elem in bw_modules:
+                if elem.code == bw_module_b.code:
+                    bw_module_fn = elem
+                    break
+            if bw_module_fn is None:
+                bw_modules.append(bw_module_b)
+                bw_module_fn = bw_module_b
+
+            f = aot_function(bw_module_fn, bw_compiler, bw_compiler, partition_fn, aot_decompositions)
+            out = f(*intermediates, *input_flat_grad_outs)
+            return tuple(normalize_as_list(out))
+
+    def return_fn(*args, **kwargs):
+        out = CompiledFunction.apply(*args, **kwargs)
+        return out[0:num_outs]
+    return return_fn
 
 class _CompileCache(CompileCache):
     pass
@@ -275,7 +346,7 @@ def rearrange(tensor_args, static_args, static_argnums):
     return args
 
 
-KNOWN_TYPES = [torch.Tensor, int, str, float, bool]
+KNOWN_TYPES = [torch.Tensor, int, str, float, bool, None]
 
 
 def aot_function(
@@ -411,7 +482,9 @@ def returned_function(*args, **kwargs):
             hasher_type,
             *flat_args_for_cache,
         )
-
+        # print("fn_id: ", fn_id)
+        # print("size: ", compile_cache.size())
+        # print("num_tensor_args: ", num_tensor_args)
         # Compile the function and save it in the cache
         if cached_res is None:
             # Save the args_spec for flat_tensor_args to unflatten while tracing
@@ -436,7 +509,7 @@ def flat_fn(*flat_tensor_args):
                 for i in flat_out:
                     is_known_type = False
                     for j in KNOWN_TYPES:
-                        if isinstance(i, j):
+                        if j is None or isinstance(i, j):
                             is_known_type = True
                             break
                     if not is_known_type:
@@ -458,7 +531,7 @@ def flat_fn(*flat_tensor_args):
                 partition_fn,
                 decompositions,
                 grad_state=torch.is_grad_enabled(),
-            ).apply
+            )
             cached_res = (compiled_fn, out_spec)
 
             # Save the compiled_fn in the cache
@@ -598,7 +671,7 @@ def aot_function_simplified(
             partition_fn,
             decompositions,
             grad_state=torch.is_grad_enabled(),
-        ).apply
+        )
 
         return compiled_fn
 
@@ -620,4 +693,4 @@ def forward(self, *args, **kwargs):
 
 
 compiled_function = aot_function
-compiled_module = aot_module
+compiled_module = aot_module
@@ -108,8 +108,52 @@ def _extract_fwd_bwd_modules(joint_module: fx.GraphModule, saved_values):
 
     fwd_module = fx.GraphModule(joint_module, fwd_graph)
     bwd_module = fx.GraphModule(joint_module, bwd_graph)
-    return fwd_module, bwd_module
+    return fwd_module, bwd_module, saved_values
 
+def _extract_fwd_bwd_modules_db(joint_module: fx.GraphModule, saved_values):
+    fwd_outputs, bwd_outputs = _extract_fwd_bwd_outputs(joint_module)
+    print("FWD OUTS: ", fwd_outputs)
+    print("BWD OUTS: ", bwd_outputs)
+    primal_inputs = list(filter(_is_primal, joint_module.graph.nodes))
+    tangent_inputs = list(filter(_is_tangent, joint_module.graph.nodes))
+    print("primal_inputs: ", primal_inputs)
+    print("tangent_inputs: ", tangent_inputs)
+    # Construct the forward module
+    fwd_graph = _extract_graph_with_inputs_outputs(joint_module.graph, primal_inputs, fwd_outputs)
+    bwd_graph = _extract_graph_with_inputs_outputs(joint_module.graph, saved_values + tangent_inputs, bwd_outputs)
+
+    # This is to filter out saved values that don't actually end up being used by the backwards pass
+    for node in bwd_graph.nodes:
+        if node.op == 'placeholder' and not node.users:
+            for saved_value in saved_values:
+                if saved_value.name == node.name:
+                    saved_values.remove(saved_value)
+                    break
+
+    # Now, we re-generate the fwd/bwd graphs.
+    # NB: This might increase compilation time, but I doubt it matters
+    fwd_graph = _extract_graph_with_inputs_outputs(joint_module.graph, primal_inputs, fwd_outputs)
+    bwd_graph = _extract_graph_with_inputs_outputs(joint_module.graph, saved_values + tangent_inputs, bwd_outputs)
+
+    fwd_module = fx.GraphModule(joint_module, fwd_graph)
+    bwd_module = fx.GraphModule(joint_module, bwd_graph)
+    return fwd_module, bwd_module, saved_values
+
+def _get_saved_values(new_module: fx.GraphModule, saved_value_names):
+    saved_values = []
+    for node in new_module.graph.nodes:
+        if node.name in saved_value_names:
+            if 'tensor_meta' not in node.meta and node.op == 'call_function':
+                users = node.users
+                assert all(user.target == operator.getitem for user in users)
+                for user in users:
+                    saved_values.append(user)
+            else:
+                saved_values.append(node)
+
+    saved_values = list(saved_values)
+
+    return saved_values
 
 def default_partition(
     joint_module: fx.GraphModule, _joint_inputs
@@ -153,8 +197,8 @@ def default_partition(
                 saved_values.append(user)
         else:
             saved_values.append(node)
-    saved_values = list(set(saved_values))
 
+    saved_values = list(saved_values)
     return _extract_fwd_bwd_modules(joint_module, saved_values)