Functionalize in-place ops #584

Merged · 38 commits · Jun 20, 2024

Commits
c1a13fa
init: some binary ops
crcrpar Jun 12, 2024
05afe9d
replace in-place with out-of-place
crcrpar Jun 12, 2024
351149a
correctly use `prims.copy_` outputs in trace.output
crcrpar Jun 12, 2024
c1d5234
fix proxy swaps.
crcrpar Jun 12, 2024
0044f6d
preserve the change from upstream
crcrpar Jun 12, 2024
6355d34
docstring
crcrpar Jun 12, 2024
e348150
relax
crcrpar Jun 12, 2024
d7dca4d
cover more in-place ops (#590)
crcrpar Jun 13, 2024
6093ffe
store original and intermediate traces
crcrpar Jun 13, 2024
532abbb
import cleanup
crcrpar Jun 13, 2024
82da20c
a bit simpler
crcrpar Jun 13, 2024
cf814f1
guard copy if dst is an arg or a kwarg
crcrpar Jun 15, 2024
ad6d84c
add `OpTags.IN_PLACE`
crcrpar Jun 15, 2024
acdab60
remove redundant checks
crcrpar Jun 15, 2024
b7ad1e2
remove workaround for in-place add of `num_batches_tracked`
crcrpar Jun 15, 2024
203c396
clena up
crcrpar Jun 15, 2024
54f9fa4
remove redundant copy bsym check
crcrpar Jun 16, 2024
8a04a5e
docstring
crcrpar Jun 16, 2024
e2a5c12
revert outdated diff
crcrpar Jun 17, 2024
c7d13d0
remove comment-out
crcrpar Jun 17, 2024
5afcbbe
`is_inplace` -> `is_functionalizable`
crcrpar Jun 17, 2024
367711f
cleanup imports
crcrpar Jun 17, 2024
1447071
Update thunder/core/transform_common.py
crcrpar Jun 17, 2024
8f6f23b
maintain `_inplace_to_out_of_place` map and give in-place tag to the …
crcrpar Jun 18, 2024
08479ef
support `inplace` args
crcrpar Jun 18, 2024
04808b5
register `relu_`
crcrpar Jun 18, 2024
05605f7
add inplace functionalization test
crcrpar Jun 18, 2024
f88e6d4
fix the bugs exposed by tests
crcrpar Jun 18, 2024
ac5abe0
remove debug prints
crcrpar Jun 18, 2024
494e08f
test ops with inplace args
crcrpar Jun 18, 2024
efea02d
comments
crcrpar Jun 18, 2024
828ba29
another comment
crcrpar Jun 18, 2024
2040d18
comments
crcrpar Jun 18, 2024
acd7655
`relu_` calls `relu(a, True)`
crcrpar Jun 18, 2024
fa9756f
Revert "`relu_` calls `relu(a, True)`"
crcrpar Jun 18, 2024
bae091e
silu sig update
crcrpar Jun 18, 2024
3f03f74
remove unused `TYPE_CHECKING`
crcrpar Jun 19, 2024
d4402b0
`return t` not `return torch_func(...)` for clarity
crcrpar Jun 19, 2024
11 changes: 10 additions & 1 deletion thunder/__init__.py
@@ -33,7 +33,13 @@
import thunder.core.prims as prims
import thunder.core.dtypes as dtypes
import thunder.core.devices as devices
from thunder.core.transform_common import dce, EarlyTransform, AdditionalTransform, PostOptimizationTransform
from thunder.core.transform_common import (
dce,
EarlyTransform,
AdditionalTransform,
PostOptimizationTransform,
functionalize_inplace_ops,
)
from thunder.common import (
CompileData,
CompileStats,
@@ -503,6 +509,9 @@ def get_computation_and_inputs(*args, **kwargs):

prologue_traces = [prologue_trc]
computation_traces = [computation_trc]
if not compile_options.get("skip_inplace_functionalization", False):
Review comment (Collaborator): Longer term, I wonder if we should have a set of default transformations and this be one of them, but for now it is OK.
computation_traces.extend(functionalize_inplace_ops(computation_trace=computation_trc))
computation_trc = computation_traces[-1]
Review comment (Collaborator): Longer term, I wonder if this should be a "default transform", but maybe it is important that this goes first, so the timing makes it tricky.

if epilogue_trc is not None:
epilogue_traces = [epilogue_trc]
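The hunk above gates the new pass behind a `skip_inplace_functionalization` compile option. A minimal, hedged sketch of opting out, assuming compile options are forwarded through `thunder.jit`'s keyword arguments (everything except the option name read from the diff is illustrative):

import torch
import thunder

def fn(x):
    y = x.sin()
    y.add_(1.0)  # in-place op that the pass would normally rewrite to ltorch.add
    return y

# Keep the in-place representation (out-of-place op + prims.copy_) by skipping the pass.
jfn = thunder.jit(fn, skip_inplace_functionalization=True)
print(jfn(torch.randn(3)))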
2 changes: 1 addition & 1 deletion thunder/core/langctxs.py
@@ -72,7 +72,7 @@ def resolve_method(id: Any, *args, **kwargs) -> None | Callable:
# ctx.get_method throws an AttributeError when the context does not have the requested attribute, except
# for the prims language context, which always throws a ValueError
method: Callable = ctx.get_method(id, *args, **kwargs)
except (AttributeError, ValueError) as e:
except (AttributeError, ValueError):
Review comment (Collaborator): Great catch!
Review comment (Collaborator): Great catch
return None
return method

1 change: 1 addition & 0 deletions thunder/core/prims.py
@@ -275,6 +275,7 @@ class OpTags(Enum):
DEVICE_SYNC_OP = auto()
# Labels operations that should not be removed by the dead code elimination (DCE) pass
DONT_DCE = auto()
IN_PLACE = auto()


# TODO RC1 Document this function and describe the parts of a primitive
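A hedged sketch of how the new tag can be queried; `has_tags` is the helper that `transform_common.py` imports in this PR, but this exact usage is an assumption rather than something taken from the diff:

import thunder.core.prims as prims
from thunder.core.symbol import BoundSymbol, has_tags

def is_tagged_in_place(bsym: BoundSymbol) -> bool:
    # True if the bound symbol carries the new IN_PLACE tag
    # (the precise matching semantics of has_tags are assumed here).
    return has_tags(bsym, {prims.OpTags.IN_PLACE})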
35 changes: 35 additions & 0 deletions thunder/core/proxies.py
@@ -1391,6 +1391,13 @@ def __add__(self, other):
method = resolve_method("add", self, other)
return method(self, other)

def __iadd__(self, other):
return self.add_(other)

def add_(self, other):
method = resolve_method("add_", self, other)
return method(self, other)

def __radd__(self, other):
method = resolve_method("add", other, self)
return method(other, self)
@@ -1427,6 +1434,13 @@ def __mul__(self, other):
method = resolve_method("mul", self, other)
return method(self, other)

def __imul__(self, other):
return self.mul_(other)

def mul_(self, other):
method = resolve_method("mul_", self, other)
return method(self, other)

def __rmul__(self, other):
method = resolve_method("mul", other, self)
return method(other, self)
@@ -1435,6 +1449,13 @@ def __pow__(self, other):
method = resolve_method("pow", self, other)
return method(self, other)

def __ipow__(self, other):
return self.pow_(other)

def pow_(self, other):
method = resolve_method("pow_", self, other)
return method(self, other)

def __rpow__(self, other):
method = resolve_method("pow", other, self)
return method(other, self)
@@ -1443,6 +1464,13 @@ def __sub__(self, other):
method = resolve_method("sub", self, other)
return method(self, other)

def __isub__(self, other):
return self.sub_(other)

def sub_(self, other):
method = resolve_method("sub_", self, other)
return method(self, other)

def __rsub__(self, other):
method = resolve_method("sub", other, self)
return method(other, self)
@@ -1455,6 +1483,13 @@ def __rtruediv__(self, other):
method = resolve_method("true_divide", other, self)
return method(other, self)

def __itruediv__(self, other):
return self.div_(other)

def div_(self, other, *, rounding_mode: str | None = None):
method = resolve_method("div_", self, other, rounding_mode=rounding_mode)
return method(self, other)

#
# Logical operations
#
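With these dunder methods in place, augmented assignments on tensor proxies dispatch to the matching in-place methods (`__iadd__` to `add_`, `__imul__` to `mul_`, and so on), which resolve through `resolve_method`. A hedged usage sketch; the traced function and the trace-inspection call are illustrative, not taken from the PR:

import torch
import thunder

def f(x, y):
    z = x.sin()   # z is an intermediate tensor proxy in the trace
    z += y        # __iadd__ -> add_ -> resolve_method("add_", ...)
    return z

jf = thunder.jit(f)
out = jf(torch.randn(4), torch.randn(4))
# After functionalization, the final trace should record ltorch.add rather than ltorch.add_.
print(thunder.last_traces(jf)[-1])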
115 changes: 112 additions & 3 deletions thunder/core/transform_common.py
@@ -1,18 +1,23 @@
from __future__ import annotations
import time
from typing import Any
from typing import TYPE_CHECKING
from abc import ABC, abstractmethod
from collections.abc import Sequence
from itertools import filterfalse
from functools import partial

import thunder.core.prims as prims
from thunder.core.baseutils import BoundSymbolInterface
from thunder.core.proxies import Proxy, variableify, Variable
from thunder.core.pytree import tree_flatten, tree_map
from thunder.core.proxies import Proxy, variableify, Variable, TensorProxy
from thunder.core.pytree import tree_flatten, tree_map, tree_unflatten
from thunder.core.symbol import BoundSymbol, BoundSymbolRHS, has_tags
from thunder.core.trace import from_trace, TraceProvenance, TraceCtx as Trace
from thunder.core.utils import ProxyDict, producers, check

if TYPE_CHECKING:
from thunder.core.proxies import ProxyInterface
from thunder.core.symbol import Symbol, VariableInterface


#
# Common optimization and transform passes
@@ -363,3 +368,107 @@ class PostOptimizationTransform(Transform, ABC):
@abstractmethod
def transform_trace(self, computation_trace: Trace, **kwargs):
pass


def functionalize_inplace_ops(computation_trace: Trace) -> list[Trace]:
"""Functionalize in-place ops in ``computation_trace``.

In thunder, an in-place op is an out-of-place (functional) op followed by :func:`~thunder.core.prims.copy_`.
This function replaces such in-place ops with out-of-place ops.
Review comment (Collaborator): "... only if the in-place argument is intermediate to the trace", right?
Review comment (Collaborator): Oh, I see later: "functionalization is not applied, if any of an in-place op's arguments are computation_trace.args or computation_trace.kwargs."
Review comment (Collaborator): Should we error / warn in that case?
Author reply (crcrpar): It seems that BatchNorm's num_batches_tracked tensor update is expressed as ltorch.add_(num_batches_tracked, 1) and that tensor is an arg, so this makes sense to me. Also, if one or more of the args and kwargs are updated in place, then I guess there is some intention behind it, so I'm not inclined to ban such cases.

Note that functionalization is not applied, if any of an in-place op's arguments are
``computation_trace.args`` or ``computation_trace.kwargs``.
Review comment (Collaborator): I wonder what we should do in these cases, though: warn or error?


For example, :func:`thunder.torch.add_` is represented as a :class:`thunder.core.symbol.BoundSymbol`
whose `subsymbols` are :func:`thunder.torch.add` and :func:`thunder.core.prims.copy_`. This function
replaces it with a :class:`~thunder.core.symbol.BoundSymbol` of :func:`~thunder.torch.add`.
"""
import thunder.torch

def is_functionalizable(bsym: BoundSymbol) -> bool:
"""Has `OpTags.IN_PLACE` and its args are NOT ``computation_trace.args`` nor ``computation_trace.kwargs``."""
Review comment on lines +387 to +388 (nikitaved, Jun 20, 2024): nit: is IN_PLACE actually used here? EDIT: yes, implicitly through being added to the map.
Review comment (Contributor): Is it also true that the trace args/kwargs are also being checked implicitly somewhere outside?
return (
Review comment (jjsjann123, Jun 17, 2024): I don't think we care about having the IN_PLACE tag or not here, since the replacement logic below doesn't take it into account. I feel the logic here should just check for torch.xxx_ and see if there is a torch.xxx. If we want to move forward with the in-place tag here, maybe we should maintain a map from in-place to out-of-place functions, instead of relying on the trailing underscore.
Author reply (crcrpar): Now there is _inplace_to_out_of_place.

bsym.sym in thunder.torch._inplace_to_out_of_place
and bsym.subsymbols
and bsym.subsymbols[-1].sym.id == prims.PrimIDs.COPY_
Review comment (Collaborator): We certainly should drop the subsymbols check. This is irrelevant to how this PR is handling functionalization.
Author reply (crcrpar): Why is it irrelevant? Currently in-place bsyms have the out-of-place op and copy as their subsymbols, so I think it's fair to check that the last sub bound symbol is a copy.
Author reply (crcrpar): How can we tell an appropriate output tensor proxy if a bsym doesn't have a copy_ as its last sub-bsym, while avoiding introducing a lot of new tensor proxy names in a trace?
Review comment (Collaborator): Oops, sorry, I read your implementation wrong earlier. I thought we were doing a blind torch.xxx_ to torch.xxx replacement, but that's not the case. You are only looking at the last subsymbol and replacing that one entry.
Review comment (Collaborator): That feels a bit restricted... but a first step is still better than nothing, and I'll stop nitpicking on that.
Author reply (crcrpar): How would it be more restricted than a blind torch.foo_ to torch.foo replacement?
Review comment (Collaborator): No, that's not what I meant. By "that feels a bit restricted", I'm referring to the alternative of functionalizing directly at the subsymbol prims.copy_ level. But again, we don't have to do that in this PR.
)

if not any(is_functionalizable(bsym) for bsym in computation_trace.bound_symbols):
return []

# Step 1: where possible, have the trace use the tensors returned from `prims.copy_` rather than the copy destinations, for clarity.
bsym: BoundSymbol
swap_map: dict[VariableInterface, ProxyInterface] = {}
bsyms: list[BoundSymbol] = []
for bsym in computation_trace.bound_symbols:
new_bsym = bsym.from_bsym_swap_proxies(swap_map)

# in-place functionalizable ops have `prims.copy_` as their last subsymbol.
if not is_functionalizable(new_bsym):
bsyms.append(new_bsym)
continue

copy_bsym = bsym.subsymbols[-1]
copy_out = copy_bsym.flat_proxy_outs[0]
copy_dst = copy_bsym.flat_proxy_args[1]
swap_map[variableify(copy_dst)] = copy_out
# make sure an in-place bsym returns `prims.copy_` output
new_bsym = new_bsym.from_bsym_swap_proxies(swap_map, skip_inputs=True, skip_subsymbols=True)
bsyms.append(new_bsym)

intermediate_trace = from_trace(computation_trace)
intermediate_trace.bound_symbols = bsyms[:]
Review comment (Collaborator): Nit: do we need to copy if we del below?
Review comment (Collaborator): Nit: I don't think we strictly need the copy here.
intermediate_trace.set_provenance(TraceProvenance("Intermediate trace of `functionalize_inplace_ops`"))
del bsyms
Review comment on lines +418 to +421 (nikitaved): nit: can't we just do intermediate_trace.bound_symbols = bsyms?
Author reply (crcrpar): Yes. I didn't want to reuse bsyms.

# Step 2: Remove `prims.copy_` if it's the last one of `bsym.subsymbols`,
# unless `copy_to` is among `computation_trace.args` or `computation_trace.kwargs`
trace_args_set = ProxyDict()
for a in filter(
lambda a: isinstance(a, TensorProxy), tree_flatten((computation_trace.args, computation_trace.kwargs))[0]
):
trace_args_set[a] = a
bsym_inplace_to_functional = {}
swap_map.clear()
new_bsyms: list[BoundSymbol] = []
for bsym in intermediate_trace.bound_symbols:
new_bsym = bsym.from_bsym_swap_proxies(swap_map)

if not is_functionalizable(new_bsym):
new_bsyms.append(new_bsym)
continue
copy_bsym = bsym.subsymbols[-1]
copy_return = copy_bsym.flat_proxy_outs[0]
copy_from = copy_bsym.flat_proxy_args[0]
copy_to = copy_bsym.flat_proxy_args[1]
if copy_to in trace_args_set:
new_bsyms.append(new_bsym)
else:
swap_map[variableify(copy_return)] = copy_from
new_bsym.subsymbols = new_bsym.subsymbols[:-1]
new_bsym = new_bsym.from_bsym_swap_proxies(swap_map)
Review comment on lines +439 to +448 (nikitaved, Jun 20, 2024): This logic looks similar to what step 1 is doing. Couldn't they be merged? It seems like the whole thing could be done in a single pass?

functional_sym: Symbol
optional_inplace_arg_index: int
functional_sym, optional_inplace_arg_index = thunder.torch._inplace_to_out_of_place[new_bsym.sym]

flat_args, flat_args_spec = tree_flatten((new_bsym.args, new_bsym.kwargs))
if optional_inplace_arg_index > -1:
flat_args[optional_inplace_arg_index] = False
Review comment on lines +455 to +456 (Contributor): This probably needs a comment.
args, kwargs = tree_unflatten(flat_args, flat_args_spec)
new_functional_bsym = functional_sym.bind(
*args,
**kwargs,
output=new_bsym.output,
subsymbols=new_bsym.subsymbols,
_call_ctx=new_bsym._call_ctx,
)
new_bsyms.append(new_functional_bsym)
bsym_inplace_to_functional[new_bsym] = new_functional_bsym
Review comment (Contributor): bsym_inplace_to_functional is never read from?
Author reply (crcrpar): You're right, but I once tried to register this as an attribute of the provenance at L473.

functionalized_computation_trace = from_trace(computation_trace)
functionalized_computation_trace.bound_symbols = new_bsyms
functionalized_computation_trace.set_provenance(TraceProvenance("Functionalize in-place ops"))
# note(crcrpar): I kind of want to do the following two.
# functionalized_computation_trace._provenance.swap_map = swap_map
# functionalized_computation_trace._provenance.bsym_inplace_to_functional = bsym_inplace_to_functional
return [intermediate_trace, functionalized_computation_trace]
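Pulling the two steps together, here is a hedged end-to-end sketch of the pass's observable effect. The function, shapes, and expected trace contents are illustrative assumptions; only the general before/after behavior follows from the code above:

import torch
import thunder

def f(x):
    y = x.sin()   # intermediate tensor: its in-place update is functionalizable
    y.mul_(2)     # traced as ltorch.mul + prims.copy_; step 2 drops the copy_ and binds ltorch.mul
    x.add_(1)     # x is a trace arg, so its copy_ is kept to preserve the visible side effect
    return y

jf = thunder.jit(f)
x = torch.randn(4)
out = jf(x)

# Expectation under the pass: the last computation trace contains ltorch.mul without a trailing
# prims.copy_ for the intermediate, while the argument update keeps its prims.copy_.
print(thunder.last_traces(jf)[-1])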
30 changes: 0 additions & 30 deletions thunder/tests/opinfos.py
@@ -1278,16 +1278,10 @@ def _abs_torch(x: torch.Tensor | Number):
elementwise_unary_ops.append(signbit_opinfo)


def silu_error_generator(op, device, dtype=torch.float32, **kwargs):
a = make_tensor((), dtype=dtype, device=device)
yield (SampleInput(a, inplace=True), NotImplementedError, "Thunder only supports silu with inplace=False")


silu_opinfo = OpInfo(
ltorch.silu,
dtypes=(datatypes.floating,),
sample_input_generator=partial(elementwise_unary_generator, supports_numbers=False),
error_input_generator=silu_error_generator,
torch_reference=_elementwise_unary_torch(torch.nn.functional.silu),
test_directives=(
DecorateInfo(
@@ -1623,20 +1617,9 @@ def silu_error_generator(op, device, dtype=torch.float32, **kwargs):
elementwise_unary_ops.append(reciprocal_opinfo)


def relu_error_generator(op, device, dtype=torch.float32, **kwargs):
a = make_tensor((), dtype=dtype, device=device)
yield (SampleInput(a, inplace=True), NotImplementedError, "relu only supports inplace=False")


def relu6_error_generator(op, device, dtype=torch.float32, **kwargs):
a = make_tensor((), dtype=dtype, device=device)
yield (SampleInput(a, inplace=True), NotImplementedError, "relu6 only supports inplace=False")


relu_opinfo = OpInfo(
ltorch.relu,
sample_input_generator=elementwise_unary_generator,
error_input_generator=relu_error_generator,
torch_reference=_elementwise_unary_torch(torch.relu),
test_directives=(
# PyTorch does not support bool and complex types
@@ -1665,7 +1648,6 @@ def relu6_error_generator(op, device, dtype=torch.float32, **kwargs):
relu6_opinfo = OpInfo(
ltorch.relu6,
sample_input_generator=elementwise_unary_generator,
error_input_generator=relu6_error_generator,
torch_reference=_elementwise_unary_torch(torch.nn.functional.relu6),
test_directives=(
# PyTorch does not support bool for both CPU and CUDA relu6
@@ -1684,15 +1666,9 @@ def relu6_error_generator(op, device, dtype=torch.float32, **kwargs):
elementwise_unary_ops.append(relu6_opinfo)


def hardswish_error_generator(op, device, dtype=torch.float32, **kwargs):
a = make_tensor((), dtype=dtype, device=device)
yield (SampleInput(a, inplace=True), NotImplementedError, "hardswish only supports inplace=False")


hardswish_opinfo = OpInfo(
ltorch.hardswish,
sample_input_generator=elementwise_unary_generator,
error_input_generator=hardswish_error_generator,
torch_reference=_elementwise_unary_torch(torch.nn.functional.hardswish),
dtypes=(datatypes.floating,),
test_directives=(
@@ -1713,16 +1689,10 @@ def hardswish_error_generator(op, device, dtype=torch.float32, **kwargs):
elementwise_unary_ops.append(hardswish_opinfo)


def selu_error_generator(op, device, dtype=torch.float32, **kwargs):
a = make_tensor((), dtype=dtype, device=device)
yield (SampleInput(a, inplace=True), NotImplementedError, "selu only supports inplace=False")


selu_opinfo = OpInfo(
ltorch.selu,
dtypes=(datatypes.floating,),
sample_input_generator=elementwise_unary_generator,
error_input_generator=selu_error_generator,
torch_reference=_elementwise_unary_torch(torch.selu),
test_directives=(
# Some versions of PyTorch do not support CPU float16 selu
3 changes: 2 additions & 1 deletion thunder/tests/test_core.py
@@ -2121,7 +2121,8 @@ def test_xor(s, o):

for t in tests:
cfn = thunder.jit(t)
with pytest.raises(RuntimeError, match="not supported"):
# Some ops in `tests` now support in-place, so they fail with a broadcast error instead of "not supported"
with pytest.raises(RuntimeError, match="not supported|Attempting"):
cfn(t1, t2)
# Note: Python maps inplace operations on (immutuables) to
# out of place operations, NumberProxy does this, too.
5 changes: 3 additions & 2 deletions thunder/tests/test_inplace_copy.py
@@ -89,8 +89,6 @@ class Net(nn.Module):
def __init__(self):
super().__init__()
self.dense1_bn = nn.BatchNorm3d(2, track_running_stats=True)
# To address the failure, use a workaround since `add_` is utilized in `nn.BatchNorm3d` when `num_batches_tracked` is not None.
self.dense1_bn.num_batches_tracked = None

def forward(self, x):
x = self.dense1_bn(x)
@@ -112,6 +110,9 @@ def forward(self, x):
assert_close(thunder_out, torch_out)
assert_close(net.state_dict()["dense1_bn.running_mean"], torch_net.state_dict()["dense1_bn.running_mean"])
assert_close(net.state_dict()["dense1_bn.running_var"], torch_net.state_dict()["dense1_bn.running_var"])
assert_close(
net.state_dict()["dense1_bn.num_batches_tracked"], torch_net.state_dict()["dense1_bn.num_batches_tracked"]
)
assert_close(x.grad, x1.grad)


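The restored `num_batches_tracked` assertion ties back to the arg-exclusion rule in `functionalize_inplace_ops`: BatchNorm bumps that buffer with an in-place add on a traced argument, so the `prims.copy_` has to survive functionalization. A hedged, self-contained illustration (the module configuration mirrors the test above; the expected value assumes a single forward pass in training mode):

import torch
import torch.nn as nn
import thunder

bn = nn.BatchNorm3d(2, track_running_stats=True)  # training mode by default
jbn = thunder.jit(bn)

x = torch.randn(3, 2, 4, 4, 4)
_ = jbn(x)

# The buffer is updated in place on a traced argument, so its copy_ is preserved
# and the counter should advance.
print(bn.num_batches_tracked)  # expected: tensor(1)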