Commit ca40e14

Merge branch 'main' into cudnn/default

2 parents fdf1248 + 2fa5cab

22 files changed: +384 additions, -57 deletions

notebooks/zero_to_thunder.ipynb

Lines changed: 2 additions & 1 deletion
@@ -3389,7 +3389,8 @@
 }
 ],
 "source": [
-"!torchrun --nproc_per_node=2 zero_to_thunder_fsdp_simple_example.py"
+"# commented out for CI limitations, see https://github.com/Lightning-AI/lightning-thunder/issues/465\n",
+"# !torchrun --standalone --nnodes=1 --nproc_per_node=2 zero_to_thunder_fsdp_simple_example.py"
 ]
 },
 {

thunder/__init__.py

Lines changed: 18 additions & 1 deletion
@@ -92,6 +92,10 @@
     "int32",
     "int64",
     "bfloat16",
+    "float8_e5m2",
+    "float8_e5m2fnuz",
+    "float8_e4m3fn",
+    "float8_e4m3fnuz",
     "float16",
     "float32",
     "float64",
@@ -130,6 +134,10 @@ def __version__():
 int32 = dtypes.int32
 int64 = dtypes.int64
 bfloat16 = dtypes.bfloat16
+float8_e5m2 = dtypes.float8_e5m2
+float8_e5m2fnuz = dtypes.float8_e5m2fnuz
+float8_e4m3fn = dtypes.float8_e4m3fn
+float8_e4m3fnuz = dtypes.float8_e4m3fnuz
 float16 = dtypes.float16
 float32 = dtypes.float32
 float64 = dtypes.float64
@@ -328,14 +336,17 @@ def jit(
     assert type(record_history) is bool

     # TODO RC1 Refine the compile data option to remove unused options
+    # TODO: refine options
+    # NOTE(fixme): use_cudagraphs is being absorbed into compile_options
+    use_cudagraphs = compile_options.get("use_cudagraphs", False)
     cd = CompileData(
         fn=fn,
         langctx=langctx,
         executors_list=executors,
         cache_option=cache,
         sharp_edges=sharp_edges,
         using_jit=True,
-        use_cudagraphs=False,
+        use_cudagraphs=use_cudagraphs,
         disable_torch_autograd_support=disable_torch_autograd,
         use_rematerialization=False,
         only_execute_prims=False,
@@ -587,6 +598,12 @@ def get_computation_and_inputs(*args, **kwargs):
         else:
             backward_fn = None

+        # TODO: using vanilla CUDAGraphExecutor is not safe unless the graph is always static!
+        # (fixme): inspect torch.cuda.make_graph_callables and/or use it instead!
+        # See https://github.com/Lightning-AI/lightning-thunder/issues/433
+        if cd.use_cudagraphs:
+            comp = CUDAGraphExecutor(comp)
+
         # TODO RC1 Update the cache
         cache_entry = CacheEntry(
             pro,
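
With this change, `thunder.jit` reads `use_cudagraphs` from its compile options instead of hard-coding `False`, and wraps the computation in `CUDAGraphExecutor` when requested. A minimal sketch of how a caller might opt in (hypothetical toy function; assumes extra keyword arguments to `jit` are collected into `compile_options` and that a CUDA device is available; per the in-code note, the vanilla executor is only safe when the captured graph is static):

    import torch
    import thunder

    def fn(x):
        return torch.sin(x) * 2.0

    # opts the generated computation into CUDA graph execution
    jfn = thunder.jit(fn, use_cudagraphs=True)
    out = jfn(torch.randn(8, device="cuda"))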

thunder/clang/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1948,7 +1948,7 @@ def argmin(a: TensorProxy, /, dim: int | None = None, keepdim: bool | None = Fal
 @clangop()
 def topk(
     a: TensorLike, /, k: int, dim: int | None = None, largest: bool = True, sorted: bool = True, *, out=None
-) -> (TensorProxy, TensorProxy):
+) -> tuple[TensorProxy, TensorProxy]:
     if dim is None:
         dim = a.ndim - 1 if a.ndim > 0 else 0
     dim = utils.canonicalize_dim(a.ndim, dim)
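
The annotation fix matters because `(TensorProxy, TensorProxy)` is evaluated as a plain tuple of classes, which is not a type; `tuple[TensorProxy, TensorProxy]` is the PEP 585 generic that type checkers understand. A minimal illustration:

    def bad() -> (int, int):  # evaluates to the tuple object (int, int); not a type
        ...

    def good() -> tuple[int, int]:  # a real annotation: "returns a pair of ints"
        ...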

thunder/core/baseutils.py

Lines changed: 4 additions & 0 deletions
@@ -304,6 +304,10 @@ def indent(level):
     torch.int32: "torch.int32",
     torch.int64: "torch.int64",
     torch.bfloat16: "torch.bfloat16",
+    torch.float8_e4m3fn: "torch.float8_e4m3fn",
+    torch.float8_e4m3fnuz: "torch.float8_e4m3fnuz",
+    torch.float8_e5m2: "torch.float8_e5m2",
+    torch.float8_e5m2fnuz: "torch.float8_e5m2fnuz",
     torch.float16: "torch.float16",
     torch.float32: "torch.float32",
     torch.float64: "torch.float64",

thunder/core/dtypes.py

Lines changed: 56 additions & 12 deletions
@@ -59,9 +59,10 @@ def __new__(cls, *args, **kwargs):

         return object.__new__(cls)

-    def __init__(self, *, python_type, name, shortname, bytes, is_weak):
+    def __init__(self, *, python_type, name, shortname, bytes, is_weak, variant=None):
         self._python_type = python_type
         self._name = name
+        self._variant = variant
         self._shortname = shortname
         self._bytes = bytes
         self._is_weak = is_weak
@@ -80,23 +81,30 @@ def is_weak(self):
         return self._is_weak

     def shortname(self):
-        return f"{self._shortname}{8 * self._bytes}"
+        return f"{self._shortname}{8 * self._bytes}{f'_{self._variant}' if self._variant else ''}"

     # TODO Fix name printing
     def __repr__(self):
-        return f"{self._name}{8 * self._bytes}{'_' if self._is_weak else ''}"
+        return (
+            f"{self._name}{8 * self._bytes}{f'_{self._variant}' if self._variant else ''}{'_' if self._is_weak else ''}"
+        )

     def __str__(self):
         return self.__repr__()

     def __hash__(self) -> int:
-        return hash((self._name, self._bytes, self._is_weak))
+        return hash((self._name, self._bytes, self._is_weak, f"{self._variant if self._variant else ''}"))

     def __eq__(self, other) -> bool:
         if not isinstance(other, dtype):
             return False

-        return self._name == other._name and self._bytes == other._bytes and self._is_weak == other._is_weak
+        return (
+            self._name == other._name
+            and self._bytes == other._bytes
+            and self._is_weak == other._is_weak
+            and self._variant == other._variant
+        )


 class exact(dtype):
@@ -152,14 +160,24 @@ class inexact(dtype):


 class floating(inexact):
-    """Base class for the floating dtypes: bfloat16, float16, float32, float64."""
+    """Base class for the floating dtypes: float8, bfloat16, float16, float32, float64."""

-    def __init__(self, name, shortname, *, bytes, is_weak):
-        super().__init__(python_type=float, name=name, shortname=shortname, bytes=bytes, is_weak=is_weak)
+    def __init__(self, name, shortname, *, bytes, is_weak, variant=None):
+        super().__init__(
+            python_type=float, name=name, shortname=shortname, bytes=bytes, is_weak=is_weak, variant=variant
+        )


 bfloat16 = floating("bfloat", "bf", bytes=2, is_weak=False)
 bfloat16_ = floating("bfloat", "bf", bytes=2, is_weak=True)
+float8_e5m2 = floating("float", "f", bytes=1, is_weak=False, variant="e5m2")
+float8_e5m2_ = floating("float", "f", bytes=1, is_weak=True, variant="e5m2")
+float8_e5m2fnuz = floating("float", "f", bytes=1, is_weak=False, variant="e5m2fnuz")
+float8_e5m2fnuz_ = floating("float", "f", bytes=1, is_weak=True, variant="e5m2fnuz")
+float8_e4m3fn = floating("float", "f", bytes=1, is_weak=False, variant="e4m3fn")
+float8_e4m3fn_ = floating("float", "f", bytes=1, is_weak=True, variant="e4m3fn")
+float8_e4m3fnuz = floating("float", "f", bytes=1, is_weak=False, variant="e4m3fnuz")
+float8_e4m3fnuz_ = floating("float", "f", bytes=1, is_weak=True, variant="e4m3fnuz")
 float16 = floating("float", "f", bytes=2, is_weak=False)
 float16_ = floating("float", "f", bytes=2, is_weak=True)
 float32 = floating("float", "f", bytes=4, is_weak=False)
@@ -200,6 +218,14 @@ def __init__(self, name, shortname, *, bytes, is_weak):
     int64_,
     bfloat16,
     bfloat16_,
+    float8_e5m2,
+    float8_e5m2_,
+    float8_e5m2fnuz,
+    float8_e5m2fnuz_,
+    float8_e4m3fn,
+    float8_e4m3fn_,
+    float8_e4m3fnuz,
+    float8_e4m3fnuz_,
     float16,
     float16_,
     float32,
@@ -242,6 +268,10 @@ def __init__(self, name, shortname, *, bytes, is_weak):

 float_dtypes = {d for d in all_dtypes if isinstance(d, floating)} | {float}

+float_math_dtypes = {d for d in all_dtypes if isinstance(d, floating) and d.bytes >= 2}
+
+float_8bit_dtypes = {d for d in all_dtypes if (isinstance(d, floating) and d.bytes == 1)}
+
 complex_dtypes = {d for d in all_dtypes if isinstance(d, complexfloating)} | {complex}

 inexact_dtypes = float_dtypes | complex_dtypes
@@ -306,11 +336,12 @@ def has_subdtype(x, cls):


 # Translates a sequence of dtypes and dtype classes into a concrete set of corresponding (strong) dtypes
-def resolve_dtypes(args):
+def resolve_dtypes(args: Iterable) -> set[dtype]:
     dtypes = set()
     for arg in args:
         if isinstance(arg, dtype):
-            dtypes.add(arg)
+            if not arg.is_weak:
+                dtypes.add(arg)
             continue

         if isinstance(arg, Iterable):
@@ -320,7 +351,8 @@ def resolve_dtypes(args):
                    lambda: f"Iterables passed to resolve_dtypes must only contain dtypes, but found an Iterable with {a}",
                    exception_type=NotImplementedError,
                )
-                if not isinstance(a, dtype) else dtypes.add(a)
+                if not a.is_weak:
+                    dtypes.add(a)

        baseutils.check(
            arg in (dtype, exact, signedinteger, unsignedinteger, bool_, inexact, floating, complexfloating),
@@ -373,6 +405,10 @@ def corresponding_complex_dtype(dtype):
     int32: int32_,
     int64: int64_,
     bfloat16: bfloat16_,
+    float8_e5m2: float8_e5m2_,
+    float8_e5m2fnuz: float8_e5m2fnuz_,
+    float8_e4m3fn: float8_e4m3fn_,
+    float8_e4m3fnuz: float8_e4m3fnuz_,
     float16: float16_,
     float32: float32_,
     float64: float64_,
@@ -520,6 +556,14 @@ def are_same_dtypes(a, b, *, weak_and_strong_are_equivalent=True):
     int64: torch.int64,
     bfloat16_: torch.bfloat16,
     bfloat16: torch.bfloat16,
+    float8_e5m2: torch.float8_e5m2,
+    float8_e5m2_: torch.float8_e5m2,
+    float8_e5m2fnuz: torch.float8_e5m2fnuz,
+    float8_e5m2fnuz_: torch.float8_e5m2fnuz,
+    float8_e4m3fn: torch.float8_e4m3fn,
+    float8_e4m3fn_: torch.float8_e4m3fn,
+    float8_e4m3fnuz: torch.float8_e4m3fnuz,
+    float8_e4m3fnuz_: torch.float8_e4m3fnuz,
     float16_: torch.float16,
     float16: torch.float16,
     float32_: torch.float32,
@@ -551,7 +595,7 @@ def to_torch_dtype(x: None | torch.dtype | dtype) -> None | torch.dtype:

 # Converts NumPy dtypes to and from thunder dtypes

-# NOTE NumPy does not support the bfloat16 or complexhalf (complex32) datatypes
+# NOTE NumPy does not support the bfloat16, complexhalf (complex32) or float8 datatypes
 _thunder_to_numpy_dtype_map = {
     bool: np.bool_,
     int: np.int_,
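
Taken together, the new `variant` field keeps the four float8 formats distinct in printing, hashing, and equality, and the torch map round-trips them. A quick sketch of the expected behavior (assuming this revision of thunder and a PyTorch build with float8 support are importable):

    import torch
    from thunder.core import dtypes

    print(dtypes.float8_e5m2)                # float8_e5m2 (the variant appears in the repr)
    print(dtypes.float8_e4m3fn.shortname())  # f8_e4m3fn

    # same name and byte width, but different variants compare unequal
    assert dtypes.float8_e5m2 != dtypes.float8_e4m3fn

    # weak dtypes are now dropped when resolving to concrete (strong) dtypes
    assert dtypes.resolve_dtypes([dtypes.float8_e5m2, dtypes.float8_e5m2_]) == {dtypes.float8_e5m2}

    # thunder float8 dtypes map onto the corresponding torch dtypes
    assert dtypes.to_torch_dtype(dtypes.float8_e5m2fnuz) is torch.float8_e5m2fnuz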

thunder/core/module.py

Lines changed: 42 additions & 7 deletions
@@ -30,24 +30,22 @@ def __init__(self, model, compiled_model_call):
         # we populate these here for performance reasons (same as module cache),
         # a single dict lookup is cheaper than traversing the module
         # hierarchy, see https://github.com/Lightning-AI/lightning-thunder/issues/396#issuecomment-2113231498
-        self._overrides = {
-            k: v for k, v in itertools.chain(self._model.named_parameters(), self._model.named_buffers())
-        }
+        self._overrides_parameters = dict(self._model.named_parameters())
+        self._overrides_buffers = dict(self._model.named_buffers())
         self._module_cache = {k: v for k, v in self._model.named_modules()}
-
         self._null = object()

     def get_buffer(self, name):
-        p = self._overrides.get(name, self._null)
+        p = self._overrides_buffers.get(name, self._null)
         if p is not self._null:
             return p
         return self._model.get_buffer(name)

     def set_buffer(self, name, value):
-        p = self._overrides[name] = value
+        p = self._overrides_buffers[name] = value

     def get_parameter(self, name):
-        p = self._overrides.get(name, self._null)
+        p = self._overrides_parameters.get(name, self._null)
         if p is not self._null:
             return p
         return self._model.get_parameter(name)
@@ -62,6 +60,43 @@ def forward(self, *args, **kwargs):
         res = self._forward_fn(*args, **kwargs)
         return res

+    def _named_parameters_or_buffers(self, overrides, orig_iter, prefix="", recurse=True, remove_duplicate=True):
+        seen_ids = set()
+        seen_names = set()
+        for k, v in itertools.chain(overrides.items(), orig_iter(remove_duplicate=remove_duplicate)):
+            if remove_duplicate:
+                id_v = id(v)
+                if id_v in seen_ids:
+                    continue
+                seen_ids.add(id_v)
+
+            mod, _, base_param = k.rpartition(".")
+            if recurse or not mod:
+                if k not in seen_names:
+                    seen_names.add(k)
+                    if prefix:
+                        yield (f"{prefix}.{k}", v)
+                    else:
+                        yield (k, v)
+
+    def named_parameters(self, prefix="", recurse=True, remove_duplicate=True):
+        yield from self._named_parameters_or_buffers(
+            self._overrides_parameters,
+            self._model.named_parameters,
+            prefix=prefix,
+            recurse=recurse,
+            remove_duplicate=remove_duplicate,
+        )
+
+    def named_buffers(self, prefix="", recurse=True, remove_duplicate=True):
+        yield from self._named_parameters_or_buffers(
+            self._overrides_buffers,
+            self._model.named_buffers,
+            prefix=prefix,
+            recurse=recurse,
+            remove_duplicate=remove_duplicate,
+        )
+
     @contextmanager
     def no_sync(self):
         r"""Context manager to disable gradient synchronization in data parallel mode.

thunder/core/proxies.py

Lines changed: 16 additions & 6 deletions
@@ -17,7 +17,7 @@
 from thunder.core.trace import VariableInterface, get_tracectx, TraceCtx
 from thunder.core.baseutils import ProxyInterface, NumberProxyInterface, TensorProxyInterface
 import thunder.core.baseutils as baseutils
-from thunder.core.langctxs import resolve_method
+from thunder.core.langctxs import resolve_method, get_langctx
 import thunder.core.devices as devices
 import thunder.core.dtypes as dtypes

@@ -592,13 +592,18 @@ def known_value(self) -> bool:
     # fn is the function to call if executing outside a language context
     @staticmethod
     def _elementwise_unary_helper(a, name, fn, type_promotion_kind=None):
-        trace: None | TraceCtx = get_tracectx()

         vala = pyval(a)

-        if trace is None:
-            # Outside of a trace context, operations on NumberProxies are executed by the
-            # Python interpreter
+        trace: None | TraceCtx = get_tracectx()
+        lang: None | LangCtx = None
+        try:
+            lang = get_langctx()
+        except LookupError:
+            pass
+        if trace is None or lang is None:
+            # Outside of a trace or language context, operations on NumberProxies are
+            # executed by the Python interpreter
             baseutils.check(
                 vala is not None,
                 lambda: f"Trying to {name} a number with an unknown value",
@@ -649,7 +654,12 @@ def _elementwise_binary_helper(a, b, name, fn, type_promotion_kind=None):
         valb = pyval(b) if isinstance(b, NumberProxy) else b

         trace: None | TraceCtx = get_tracectx()
-        if trace is None:
+        lang: None | LangCtx = None
+        try:
+            lang = get_langctx()
+        except LookupError:
+            pass
+        if trace is None or lang is None:
             # Outside of a trace or language context, binary operations on NumberProxies are
             # executed by the Python interpreter
             baseutils.check(

thunder/core/rematerialization.py

Lines changed: 3 additions & 1 deletion
@@ -12,7 +12,7 @@
 from thunder.core import prims, utils
 from thunder.core.baseutils import BoundSymbolInterface, ProxyInterface
 from thunder.core.prims import PrimIDs
-from thunder.core.proxies import TensorProxy, variableify
+from thunder.core.proxies import TensorProxy, variableify, NumberProxy
 from thunder.core.pytree import tree_flatten, tree_unflatten
 from thunder.core.symbol import has_tags
 from thunder.core.trace import from_trace, TraceCtx, TraceProvenance
@@ -332,6 +332,8 @@ def add_edge(src, dst, capacity):
     def get_weight(var):
         if isinstance(var, TensorProxy):
             return WEIGHT * var.dtype.bytes
+        elif isinstance(var, NumberProxy):
+            return 0.0
         return WEIGHT

     def add_edges(var):
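
For context: `get_weight` supplies edge capacities for the min-cut that decides which intermediates the joint forward/backward trace saves versus recomputes, so giving `NumberProxy` a weight of `0.0` means scalar values never force a cut. An annotated restatement of the rule (`WEIGHT` is the module's existing base cost per value):

    def get_weight(var):
        if isinstance(var, TensorProxy):
            return WEIGHT * var.dtype.bytes  # tensors cost memory in proportion to element size
        elif isinstance(var, NumberProxy):
            return 0.0  # scalars are effectively free to save; never worth cutting around
        return WEIGHT  # default cost for anything else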
