diff --git a/thunder/__init__.py b/thunder/__init__.py
index eceacc4cad..bbfa8e1808 100644
--- a/thunder/__init__.py
+++ b/thunder/__init__.py
@@ -12,12 +12,14 @@ from looseversion import LooseVersion
 
 from thunder.core.module import ThunderModule
+import thunder.core.profile
 from thunder.core.interpreter import InterpreterLogItem
 from thunder.core.options import (
     CACHE_OPTIONS,
     SHARP_EDGES_OPTIONS,
     DebugOptions,
 )
+from thunder.core.profile import annotate_for_profile
 from thunder.core.trace import (
     TraceResults,
     TraceCtx,
@@ -277,6 +279,7 @@ def compile(fn: Callable, recipe: Recipe | None):
 
 # This function will replace compile() (below) before RC1
 # TODO RC1 Consider renaming compile_options to additional_compile_options
+@thunder.core.profile.annotate_for_profile("jit")
 def jit(
     fn: Callable,
     /,
@@ -378,6 +381,7 @@ def _alias_tensor_of_args_kwargs_dict(*args, **kwargs) -> dict[int, list[int]]:
             tensor_group_index_to_tensor_indices[tgi].append(idx)
     return tensor_group_index_to_tensor_indices
 
+@thunder.core.profile.annotate_for_profile("_alias_tensor_of_args_kwargs")
 def _alias_tensor_of_args_kwargs(*args, **kwargs) -> str:
     """If no aliases found, empty string, otherwise, aliases are comma separated, groups are hyphen separated."""
@@ -392,6 +396,7 @@ def _alias_tensor_of_args_kwargs(*args, **kwargs) -> str:
 
     @langctxs.langctx(cd.langctx)
     @_with_cache_info_ctx
+    @thunder.core.profile.annotate_for_profile("get_computation_and_inputs")
     def get_computation_and_inputs(*args, **kwargs):
         # set up a record of things in the current environment that impact caching / prologues
         # this could be replaced by the respective querying in the prologues
@@ -441,7 +446,8 @@ def get_computation_and_inputs(*args, **kwargs):
             # alaises wouldn't matter, thus it'd be better to nullify this entry in such cases.
             # It however would require the functionalized computation trace to interact with `cache_info`,
             # which seems to break the consistency of cache_info, leading to a failure in cache_info check.
-            cache_info["alias_tensor_indices"] = _alias_tensor_of_args_kwargs(*args, **kwargs)
+            with thunder.core.profile.annotate_for_profile("_alias_tensor_of_args_kwargs"):
+                cache_info["alias_tensor_indices"] = _alias_tensor_of_args_kwargs(*args, **kwargs)
 
         # Store the `is_grad_enabled` state of PyTorch. This is used by vjp transform
         # to treat certain Symbols as constant.
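annotate_for_profile is used above both as a decorator (@thunder.core.profile.annotate_for_profile("jit")) and as a context manager (the with block around the alias computation). A helper with that dual shape could look like the sketch below; this is an illustration built on torch.profiler.record_function, not the actual thunder.core.profile implementation:

    from contextlib import ContextDecorator

    import torch

    class annotate_for_profile(ContextDecorator):
        """Tags a region with a named range that profilers can display."""

        def __init__(self, name: str):
            self._name = name
            self._record = None

        def __enter__(self):
            # record_function emits a named event visible to torch.profiler (and,
            # through its NVTX hooks, to external profilers such as Nsight Systems).
            self._record = torch.profiler.record_function(self._name)
            self._record.__enter__()
            return self

        def __exit__(self, *exc):
            self._record.__exit__(*exc)
            return False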
@@ -451,7 +457,6 @@ def get_computation_and_inputs(*args, **kwargs):
         # TODO RC1 Add module and function checks to prologue (make it a compile option)
 
         # Checks cache
-        cs.last_trace_cache_start = time.perf_counter_ns()
         if (cd.cache_option is CACHE_OPTIONS.CONSTANT_VALUES) or (cd.cache_option is CACHE_OPTIONS.SYMBOLIC_VALUES):
             for cache_entry in reversed(cs.interpreter_cache):
                 with compile_data_and_stats(cd, cs):
@@ -467,7 +472,9 @@ def get_computation_and_inputs(*args, **kwargs):
                         _return_none_instead_of_grads,
                     ) = cache_entry
                     try:
-                        inps, pro_to_epi = pro(*args, **kwargs)
+                        from thunder.core.profile import annotate_for_profile
+
+                        inps, pro_to_epi = annotate_for_profile("prologue")(pro)(*args, **kwargs)
                     except Exception as _:
                         continue
@@ -478,10 +485,6 @@ def get_computation_and_inputs(*args, **kwargs):
                     cs.last_interpreter_log = None
                     cs.last_prologue_traces = pro_traces
                     cs.last_prologue = pro
-                    cs.last_prologue_transformation_start = 0
-                    cs.last_prologue_transformation_stop = 0
-                    cs.last_computation_transformation_start = 0
-                    cs.last_computation_transformation_stop = 0
 
                     return cache_entry, inps, pro_to_epi
@@ -499,7 +502,7 @@ def get_computation_and_inputs(*args, **kwargs):
                     backward_traces,
                 ) = cache_entry
 
-                inps, pro_to_epi = pro(*args, **kwargs)
+                inps, pro_to_epi = annotate_for_profile("prologue")(pro)(*args, **kwargs)
 
                 # Updates cache statistics
                 cs.cache_hits += 1
@@ -512,7 +515,6 @@ def get_computation_and_inputs(*args, **kwargs):
                 return cache_entry, inps, pro_to_epi
 
         cs.cache_misses += 1
-        cs.last_trace_cache_stop = time.perf_counter_ns()
 
         # Resets use of compile flags
         cs.last_compile_reasons = defaultdict(list)
@@ -520,8 +522,6 @@ def get_computation_and_inputs(*args, **kwargs):
         with compile_data_and_stats(cd, cs):
             # Acquires the trace OR inlines the trace into an existing trace and
             # returns the (proxied) result of the operation
-            cs.last_trace_tracing_start = time.perf_counter_ns()
-
             prologue_trc: TraceCtx
             computation_trc: TraceCtx
             jit_results: TraceResults = thunder_general_jit(
@@ -563,11 +563,7 @@ def get_computation_and_inputs(*args, **kwargs):
 
                 epilogue_traces = [epilogue_trc]
 
-            cs.last_trace_tracing_stop = time.perf_counter_ns()
-
             # Makes the prologue callable
-            cs.last_prologue_transformation_start = time.perf_counter_ns()
-
             transform: Transform
             for transform in transforms:
                 thunder.core.utils.check_type(transform, Transform)
@@ -596,13 +592,15 @@ def get_computation_and_inputs(*args, **kwargs):
                 use_del_last_used=False,
             )
             prologue_trc = prologue_traces[-1]
-            pro = prologue_trc.python_callable(include_decorators=False)
-            pro = prologue_execution_timer(pro)
+            pro = thunder.core.profile.annotate_for_profile("prologue python_callable")(
+                prologue_trc.python_callable(include_decorators=False)
+            )
 
             epilogue_trc = transform_to_torch_types(epilogue_trc)
-            epilogue = epilogue_trc.python_callable()
+            epilogue = thunder.core.profile.annotate_for_profile("epilogue python_callable")(
+                epilogue_trc.python_callable()
+            )
 
-            cs.last_prologue_transformation_stop = time.perf_counter_ns()
             cs.last_prologue_traces = prologue_traces
             cs.last_prologue = pro
             cs.last_traces = computation_traces
@@ -672,7 +670,9 @@ def get_computation_and_inputs(*args, **kwargs):
                 backward_traces.append(backward_trc)
 
             if backward_trc is not None:
-                backward_fn = backward_trc.python_callable()
+                backward_fn = thunder.core.profile.annotate_for_profile("backward python_callable")(
+                    backward_trc.python_callable()
+                )
             else:
                 backward_fn = None
                 # We do not have to return auxiliary tensors, which will only be useful in backward pass
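One subtlety in the prologue call sites above: annotate_for_profile("prologue") returns a decorator, so it must be applied to the prologue callable before the call. Applying it to the call's result would record nothing and break the tuple unpacking. In isolation:

    pro = annotate_for_profile("prologue")(pro)  # decorate the callable once
    inps, pro_to_epi = pro(*args, **kwargs)      # the call is now recorded under "prologue"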
@@ -701,33 +701,21 @@ def get_computation_and_inputs(*args, **kwargs):
 
     def host_execution_timer(fn):
         def wrapped(*args, **kwargs):
-            cs.last_trace_host_execution_start = time.perf_counter_ns()
-            try:
+            with thunder.core.profile.annotate_for_profile("computation"):
                 return fn(*args, **kwargs)
-            finally:
-                cs.last_trace_host_execution_stop = time.perf_counter_ns()
-
-        return wrapped
-
-    def prologue_execution_timer(fn):
-        def wrapped(*args, **kwargs):
-            cs.last_prologue_execution_start = time.perf_counter_ns()
-            try:
-                return fn(*args, **kwargs)
-            finally:
-                cs.last_prologue_execution_stop = time.perf_counter_ns()
 
         return wrapped
 
     def decorate_computation_function(get_computation_and_inputs_fn, *decorators):
         def wrapped(*args, **kwargs):
-            cache_entry, inps, pro_to_epi = get_computation_and_inputs_fn(*args, **kwargs)
-            decorated_computation_fn = cache_entry.computation_fn
-            for decorator in decorators:
-                decorated_computation_fn = decorator(decorated_computation_fn)
-            if decorators:
-                cache_entry = cache_entry._replace(computation_fn=decorated_computation_fn)
-            return cache_entry, inps, pro_to_epi
+            with thunder.core.profile.annotate_for_profile("get_computation_and_inputs"):
+                cache_entry, inps, pro_to_epi = get_computation_and_inputs_fn(*args, **kwargs)
+                decorated_computation_fn = cache_entry.computation_fn
+                for decorator in decorators:
+                    decorated_computation_fn = decorator(decorated_computation_fn)
+                if decorators:
+                    cache_entry = cache_entry._replace(computation_fn=decorated_computation_fn)
+                return cache_entry, inps, pro_to_epi
 
         return wrapped
 
@@ -737,14 +725,11 @@ def wrapped(*args, **kwargs):
     def update_call_statistics(fn):
         def wrapped(*args, **kwargs):
             cs.calls += 1
-            cs.last_trace_host_start = time.perf_counter_ns()
-            try:
-                return fn(*args, **kwargs)
-            finally:
-                cs.last_trace_host_stop = time.perf_counter_ns()
+            return fn(*args, **kwargs)
 
         return wrapped
 
+    @thunder.core.profile.annotate_for_profile("maybe_connect_to_autograd")
     def maybe_connect_to_autograd(cache_entry, result):
         if cache_entry.backward_fn:
             # If the backward function is available, we need to connect the
@@ -769,6 +754,7 @@ def call_epilogue(cache_entry, comp_result, pro_to_epi):
 
     @wraps(fn)
     @update_call_statistics
+    @thunder.core.profile.annotate_for_profile("fn_")
     def fn_(*args, **kwargs) -> Any:
         if is_tracing():
             _recursive_jit_call_warning()
@@ -990,6 +976,7 @@ def get_auto_registered_torch_op_names(fn: Callable, /) -> set[str] | None:
 
 # TODO (mruberry) Update this
+@thunder.core.profile.annotate_for_profile("_grad_transform")
 def _grad_transform(trace):
     grad_fwd_trace = from_trace(trace)
     trace_tok = set_tracectx(grad_fwd_trace)
@@ -1043,10 +1030,12 @@ def _grad_transform(trace):
 
 # TODO Test nesting of grad and grad and grad and grad
 # TODO Test nesting of a regular function + calling grad
+@thunder.core.profile.annotate_for_profile("grad")
 def grad(fn):
     cfn = compile(fn)
 
     @wraps(cfn)
+    @thunder.core.profile.annotate_for_profile("_fn")
     def _fn(*args, **kwargs):
         original_result, original_trace = cfn(*args, **kwargs)
         original_trace = last_traces(cfn)
diff --git a/thunder/benchmarks/__init__.py b/thunder/benchmarks/__init__.py
index e4e3ca232e..d28a749585 100644
--- a/thunder/benchmarks/__init__.py
+++ b/thunder/benchmarks/__init__.py
@@ -232,15 +232,6 @@ class BenchmarkRunStatistics:
     stop_time: int
     host_stop_time: int
     called_backward: bool
-    has_extended_stats: bool = False
-    last_trace_host_start: int = -1
-    last_trace_host_stop: int = -1
-    last_trace_cache_start: int = -1
-    last_trace_cache_stop: int = -1
-    last_trace_tracing_start: int = -1
-    last_trace_tracing_stop: int = -1
-    last_trace_host_execution_start: int = -1
-    last_trace_host_execution_stop: int = -1
 
 
 # A timing helper
@@ -294,18 +285,6 @@ def _benchmark(
             called_backward=called_backward,
         )
 
-        # TODO Ensure the compile data statistics are always populated
-        if cs is not None and cs.last_trace_host_start > 0:
-            stat.has_extended_stats = True
-            stat.last_trace_host_start = cs.last_trace_host_start
-            stat.last_trace_host_stop = cs.last_trace_host_stop
-            stat.last_trace_cache_start = cs.last_trace_cache_start
-            stat.last_trace_cache_stop = cs.last_trace_cache_stop
-            stat.last_trace_tracing_start = cs.last_trace_tracing_start
-            stat.last_trace_tracing_stop = cs.last_trace_tracing_stop
-            stat.last_trace_host_execution_start = cs.last_trace_host_execution_start
-            stat.last_trace_host_execution_stop = cs.last_trace_host_execution_stop
-
         stats.append(stat)
 
     return stats
@@ -417,51 +396,9 @@ def _prettyprint_stats(
         for rank, (memory_allocated, memory_reserved) in rank_mem_info.items():
             short_printout += f"\n    rank-{rank} - peak allocated memory {memory_allocated/1024/1024:.2f}MB, peak reserved: {memory_reserved/1024/1024:.2f}MB"
         short_printout += "\n"
-    if median_benchmark_stat.has_extended_stats:
-        # NOTE At this point in the program extended statistics are available
-        trace_time_ns = median_benchmark_stat.last_trace_host_stop - median_benchmark_stat.last_trace_host_start
-        cache_time_ns = median_benchmark_stat.last_trace_cache_stop - median_benchmark_stat.last_trace_cache_start
-        tracing_time_ns = median_benchmark_stat.last_trace_tracing_stop - median_benchmark_stat.last_trace_tracing_start
-        trace_execution_time_ns = (
-            median_benchmark_stat.last_trace_host_execution_stop - median_benchmark_stat.last_trace_host_execution_start
-        )
-
-        trace_time_us: str = ns_to_us(trace_time_ns)
-        cache_time_us: str = ns_to_us(cache_time_ns)
-        tracing_time_us: str = ns_to_us(tracing_time_ns)
-        trace_execution_time_us: str = ns_to_us(trace_execution_time_ns)
-
-        trace_time_percentage: str = f"{round(trace_time_ns / median_benchmark_stat.total_time * 100)}%"
-        cache_time_percentage: str = f"{round(cache_time_ns / median_benchmark_stat.total_time * 100)}%"
-        tracing_time_percentage: str = f"{round(tracing_time_ns / median_benchmark_stat.total_time * 100)}%"
-        trace_execution_time_percentage: str = (
-            f"{round(trace_execution_time_ns / median_benchmark_stat.total_time * 100)}%"
-        )
-
-        before_trace_time_ns = median_benchmark_stat.last_trace_host_start - median_benchmark_stat.start_time
-        accelerator_wait_time_ns = median_benchmark_stat.stop_time - median_benchmark_stat.last_trace_host_stop
-
-        before_trace_time_us: str = ns_to_us(before_trace_time_ns)
-        accelerator_wait_time_us: str = ns_to_us(accelerator_wait_time_ns)
-
-        before_trace_time_percentage: str = f"{round(before_trace_time_ns / median_benchmark_stat.total_time * 100)}%"
-        accelerator_wait_time_percentage: str = (
-            f"{round(accelerator_wait_time_ns / median_benchmark_stat.total_time * 100)}%"
-        )
-
-        extension = f"""\
-    The median benchmark took {before_trace_time_us} to get into the tracing logic, {before_trace_time_percentage} of the total time.
-    The median benchmark took {accelerator_wait_time_us} waiting for the accelerator's computation to finish, {accelerator_wait_time_percentage} of the total time.
-    The median benchmark run's total time in tracing logic is {trace_time_us}, {trace_time_percentage} of the total time.
-    The median benchmark run's cache lookup time is {cache_time_us}, {cache_time_percentage} of the total time.
-    The median benchmark run's time spent tracing is {tracing_time_us}, {tracing_time_percentage} of the total time.
-    The median benchmark run's time to request the traced program be executed is {trace_execution_time_us}, {trace_execution_time_percentage} of the total time.
-    """
-    else:
-        extension = ""
-
-    output = textwrap.dedent(preamble) + textwrap.indent(textwrap.dedent(extension), " " * 4)
-    print(output)
+    print(textwrap.dedent(preamble))
 
 
 def print_rank_0(message):
diff --git a/thunder/common.py b/thunder/common.py
index c5101b8c27..e6858bad6e 100644
--- a/thunder/common.py
+++ b/thunder/common.py
@@ -83,20 +83,6 @@ class CompileStats:
         last_interpreted_instructions (Generator[dist.Instruction, None, None] | None):
         last_interpreter_log (list[InterpreterLogItem] | None):
         last_backward_traces (Sequence[TraceCtx]):
-        last_trace_host_start (int):
-        last_trace_host_stop (int):
-        last_trace_cache_start (int):
-        last_trace_cache_stop (int):
-        last_trace_tracing_start (int):
-        last_trace_tracing_stop (int):
-        last_trace_host_execution_start (int):
-        last_trace_host_execution_stop (int):
-        last_prologue_transformation_start (int):
-        last_prologue_transformation_stop (int):
-        last_prologue_execution_start (int):
-        last_prologue_execution_stop (int):
-        last_computation_execution_start (int):
-        last_computation_execution_stop (int):
         cache (dict):
         interpreter_cache (list):
         calls (int):
@@ -117,23 +103,6 @@ def __init__(self):
         # torch.autograd.Function specific data
         self.last_backward_traces = None
 
-        # Timing stats
-        self.last_trace_host_start: int = -1
-        self.last_trace_host_stop: int = -1
-        self.last_trace_cache_start: int = -1
-        self.last_trace_cache_stop: int = -1
-        self.last_trace_tracing_start: int = -1
-        self.last_trace_tracing_stop: int = -1
-        self.last_trace_host_execution_start: int = -1
-        self.last_trace_host_execution_stop: int = -1
-
-        self.last_prologue_transformation_start: int = -1
-        self.last_prologue_transformation_stop: int = -1
-        self.last_prologue_execution_start: int = -1
-        self.last_prologue_execution_stop: int = -1
-        self.last_computation_execution_start: int = -1
-        self.last_computation_execution_stop: int = -1
-
         # Cache stats
         self.cache = {}
         self.interpreter_cache: list = []
@@ -151,31 +120,6 @@ def _time_template(self, start: int, stop: int, desc: str, /) -> int:
             raise AssertionError(f"The {desc} times {start=} and {stop=} were not recorded correctly")
         return stop - start
 
-    def last_cache_lookup_time(self, /) -> int:
-        start: int = self.last_trace_cache_start
-        stop: int = self.last_trace_cache_stop
-        return self._time_template(start, stop, "cache lookup")
-
-    def last_trace_construction_time(self, /) -> int:
-        start: int = self.last_trace_host_start
-        stop: int = self.last_trace_host_stop
-        return self._time_template(start, stop, "trace construction")
-
-    def last_prologue_transformation_time(self, /) -> int:
-        start: int = self.last_prologue_transformation_start
-        stop: int = self.last_prologue_transformation_stop
-        return self._time_template(start, stop, "prologue construction")
-
-    def last_prologue_execution_time(self, /) -> int:
-        start: int = self.last_prologue_execution_start
-        stop: int = self.last_prologue_execution_stop
-        return self._time_template(start, stop, "prologue execution")
-
-    def last_computation_execution_time(self, /) -> int:
-        start: int = self.last_computation_execution_start
-        stop: int = self.last_computation_execution_stop
-        return self._time_template(start, stop, "computation execution")
-
 
 # A class that holds data about the compiled object, including statistics about how it's been called
 # TODO Better document the module-related data the preprocessing harvests,
@@ -638,6 +582,7 @@ def wait_for_future(f: FutureTensorProxy) -> TensorProxy:
 # TODO Consider making this faster by reusing more data
 # TODO Create a general mechanism for running traces that produces reproducible provenance and the
 # appropriate error checks
+@thunder.core.profile.annotate_for_profile("transform_for_execution")
 def transform_for_execution(
     trace: TraceCtx,
     executors_list: Sequence[Executor],
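With the CompileStats wall-clock fields (last_trace_host_start, last_cache_lookup_time(), and friends) removed, compile-time breakdowns are expected to come from a profiler session instead. A sketch, assuming annotate_for_profile emits torch.profiler-visible events; fn here is a stand-in example:

    import torch
    import thunder

    def fn(x):
        return x * 2

    jfn = thunder.jit(fn)
    x = torch.randn(4)

    # The first call traces, transforms, and compiles, so the annotated regions
    # ("jit", "get_computation_and_inputs", "prologue", "computation", ...)
    # appear as named ranges in the profile.
    with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU]) as prof:
        jfn(x)

    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))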
""" - start_time_ns = time.perf_counter_ns() - static_consumer_info = utils.consumers(trace) # Find all the producers and consumers @@ -568,11 +568,8 @@ def rematerialize(trace: TraceCtx) -> TraceCtx: rematerialized_trace = from_trace(trace) rematerialized_trace.bound_symbols = tuple(new_bsyms.get(bsym, bsym) for bsym in trace.bound_symbols) - end_time_ns = time.perf_counter_ns() - elapsed_time_ns = end_time_ns - start_time_ns - elapsed_time_millis = elapsed_time_ns // 1000000 + rematerialized_trace.set_provenance(TraceProvenance(f"Rematerialization")) - rematerialized_trace.set_provenance(TraceProvenance(f"Rematerialization (took {elapsed_time_millis} milliseconds)")) return rematerialized_trace @@ -663,9 +660,9 @@ def joint_fn(args, kwargs, cotangents): return new_fw_trace, new_bw_trace +@thunder.core.profile.annotate_for_profile("replace_uniform") def replace_uniform(trace: TraceCtx) -> TraceCtx: """For better rematerialization, replace the uniform operator with the stateless uniform_philox operator and manually update the RNG state.""" - start_time_ns = time.perf_counter_ns() from thunder.core.trace import VariableInterface from thunder.core.proxies import Proxy from thunder.core.devices import Device @@ -696,11 +693,6 @@ def visit_(bsym: BoundSymbolInterface) -> VISIT_TYPE: bound_symbols.append(nbsym) new_trace.bound_symbols = bound_symbols + new_trace.set_provenance(TraceProvenance("Transform for replace uniform")) - end_time_ns = time.perf_counter_ns() - elapsed_time_ns = end_time_ns - start_time_ns - elapsed_time_millis = elapsed_time_ns // 1000000 - new_trace.set_provenance( - TraceProvenance(f"Transform for replace uniform (took {elapsed_time_millis} milliseconds)") - ) return new_trace diff --git a/thunder/core/trace.py b/thunder/core/trace.py index b8670212aa..c71795f587 100644 --- a/thunder/core/trace.py +++ b/thunder/core/trace.py @@ -14,6 +14,7 @@ import thunder import thunder.core.codeutils as codeutils import thunder.core.baseutils as baseutils +import thunder.core.profile from thunder.core.baseutils import ProxyInterface, BoundSymbolInterface, TagBase import thunder.core.devices as devices from thunder.core.pytree import tree_flatten, tree_unflatten @@ -479,6 +480,7 @@ def keyfn(class_or_module: type | ModuleType) -> str: # Returns a Python callable that executes the trace # TODO issue "Create a mechanism for freezing TraceCtx objects" # Create a mechanism for freezing traces and cache the compilation + @thunder.core.profile.annotate_for_profile("TraceCtx.python_callable") def python_callable(self, *, global_dicts: None | dict = None, **kwargs: Any) -> Callable: python_str: str diff --git a/thunder/core/transform_common.py b/thunder/core/transform_common.py index bfe4123dc6..44ad485311 100644 --- a/thunder/core/transform_common.py +++ b/thunder/core/transform_common.py @@ -9,6 +9,7 @@ import thunder import thunder.core.prims as prims +import thunder.core.profile from thunder.core.baseutils import BoundSymbolInterface, NumberProxyInterface from thunder.core.proxies import Proxy, variableify, Variable, TensorProxy, unvariableify from thunder.core.pytree import tree_flatten, tree_iter, tree_map, tree_unflatten @@ -141,9 +142,8 @@ def keep_or_swap(p): # that only produce non-proxy objects # NOTE needed_proxies is an in/out argument, it takes an initial set of Variables you want to keep, and return # all the needed proxies of the input trace +@thunder.core.profile.annotate_for_profile("dce") def dce(trace: Trace, needed_proxies: None | set[Variable] = None) -> Trace: - 
start_time_ns = time.perf_counter_ns() - producer_map: ProxyDict = producers(trace) flat_trace_outputs, _ = tree_flatten(trace.output) @@ -203,10 +203,7 @@ def _helper(x): dced_bound_symbols = remove_duplicate_number_proxies(dced_bound_symbols) dcetrace.bound_symbols = dced_bound_symbols - end_time_ns = time.perf_counter_ns() - elapsed_time_ns = end_time_ns - start_time_ns - elapsed_time_millis = elapsed_time_ns // 1000000 - dcetrace.set_provenance(TraceProvenance(f"Dead Code Elimination (took {elapsed_time_millis} milliseconds)")) + dcetrace.set_provenance(TraceProvenance(f"Dead Code Elimination")) return dcetrace @@ -300,6 +297,7 @@ def cse_single_bsym( # TODO Update the replacement of redundant proxies to use a visitor pattern # when that architecture is added in the future +@thunder.core.profile.annotate_for_profile("cse") def cse(trace: Trace) -> Trace: """Remove bound symbols whose right hand side is common expression. @@ -355,8 +353,6 @@ def thunder_140410131706304(x, y): Returns: :class:`TraceCtx` with common subexpression eliminated. """ - start_time_ns = time.perf_counter_ns() - cse_trace = from_trace(trace) cse_trace_bound_symbols = [] @@ -374,12 +370,7 @@ def thunder_140410131706304(x, y): new_bsyms = replace_redundant_inputs(redundant_map, cse_trace_bound_symbols) cse_trace.bound_symbols = new_bsyms - end_time_ns = time.perf_counter_ns() - elapsed_time_ns = end_time_ns - start_time_ns - elapsed_time_millis = elapsed_time_ns // 1000000 - cse_trace.set_provenance( - TraceProvenance(f"Common Subexpression Elimination (took {elapsed_time_millis} milliseconds)") - ) + cse_trace.set_provenance(TraceProvenance("Common Subexpression Elimination")) return cse_trace @@ -491,6 +482,7 @@ def process_bound_symbols(src_bound_symbols, target_bound_symbols): return output +@thunder.core.profile.annotate_for_profile("wrap_return_value_together_with_arguments") def wrap_return_value_together_with_arguments(trace: Trace) -> Trace: last = trace.bound_symbols[-1] assert last.sym.id == prims.PrimIDs.RETURN @@ -515,6 +507,7 @@ def unwrap_return_value(trace: Trace) -> Trace: return new_trace +@thunder.core.profile.annotate_for_profile("remove_context_manager_prims_from_trace") def remove_context_manager_prims_from_trace(trace: Trace) -> Trace: def is_context_manager_prim(bsym): # context manager prims would/should be explicitly tagged. diff --git a/thunder/core/transforms.py b/thunder/core/transforms.py index 3cb34a623e..cf37d7f170 100644 --- a/thunder/core/transforms.py +++ b/thunder/core/transforms.py @@ -24,6 +24,7 @@ interpret_trace_to_trace, trace_interpreter_skip_list, ) +from thunder.core.profile import annotate_for_profile from thunder.core.proxies import ( CollectionProxy, NumberProxy, @@ -484,10 +485,10 @@ def add_transform( # The no-op transform. A trivial composable transform, only useful as an example. 
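The provenance strings above still identify each pass in a trace's header comment; only the "(took N milliseconds)" suffix is gone. For example (fn is a stand-in):

    import torch
    import thunder

    def fn(x):
        return x.sin() + x.cos()

    jfn = thunder.jit(fn)
    jfn(torch.randn(4))

    # Each entry in the transform history prints with a header comment such as
    # "# Constructed by Dead Code Elimination" -- now without a timing suffix.
    for trc in thunder.last_traces(jfn):
        print(trc)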
diff --git a/thunder/core/transforms.py b/thunder/core/transforms.py
index 3cb34a623e..cf37d7f170 100644
--- a/thunder/core/transforms.py
+++ b/thunder/core/transforms.py
@@ -24,6 +24,7 @@
     interpret_trace_to_trace,
     trace_interpreter_skip_list,
 )
+from thunder.core.profile import annotate_for_profile
 from thunder.core.proxies import (
     CollectionProxy,
     NumberProxy,
@@ -484,10 +485,10 @@ def add_transform(
 
 # The no-op transform. A trivial composable transform, only useful as an example.
 class _NoopTransform(Transform):
+    @thunder.core.profile.annotate_for_profile("NoopTransform.transform_trace_pre_prologue")
     def transform_trace_pre_prologue(
         self, prologue_trace: Trace, computation_trace: Trace, epilogue_trace: Trace | None, **kwargs
     ) -> Trace:
-        start_time_ns = time.perf_counter_ns()
         noop_trace = from_trace(computation_trace)
 
         tracectx_tok: Any
@@ -499,10 +500,7 @@ def transform_trace_pre_prologue(
 
         noop_trace.bound_symbols.extend(computation_trace.bound_symbols)
 
-        end_time_ns = time.perf_counter_ns()
-        elapsed_time_ns = end_time_ns - start_time_ns
-        elapsed_time_millis = elapsed_time_ns // 1000000
-        noop_trace.set_provenance(TraceProvenance(f"No-op Transform (took {elapsed_time_millis} milliseconds)"))
+        noop_trace.set_provenance(TraceProvenance("No-op Transform"))
 
         return prologue_trace, noop_trace, computation_trace
 
@@ -515,8 +513,8 @@ def noop(cfn: Callable) -> Callable:
 # The comment fusions transform. Just adds a comment before and after each fusion.
 # This is an example of a post-optimization transform.
 class _CommentFusionsTransform(Transform):
+    @thunder.core.profile.annotate_for_profile("CommentFusionsTransform.transform_trace_post_optimization")
     def transform_trace_post_optimization(self, trace: Trace, **kwargs) -> Trace:
-        start_time_ns = time.perf_counter_ns()
         commented_trace = from_trace(trace)
 
         nbsyms: list[BoundSymbol] = []
@@ -531,11 +529,8 @@ def transform_trace_post_optimization(self, trace: Trace, **kwargs) -> Trace:
                 nbsyms.append(bsym)
 
         commented_trace.bound_symbols = nbsyms
-        end_time_ns = time.perf_counter_ns()
-        elapsed_time_ns = end_time_ns - start_time_ns
-        elapsed_time_millis = elapsed_time_ns // 1000000
 
-        commented_trace.set_provenance(TraceProvenance(f"Comment Fusions (took {elapsed_time_millis} milliseconds)"))
+        commented_trace.set_provenance(TraceProvenance("Comment Fusions"))
 
         return commented_trace
 
@@ -1513,6 +1508,7 @@ def transform_traces_pre_prologue(
         computation_trc = dce(computation_trc)
 
         @wraps(computation_trc.python_callable())
+        @annotate_for_profile("_GradTransform.python_callable")
         def python_callable(*args, **kwargs):
             return eval_trace(computation_trc, *args, **kwargs)["output"]
@@ -3162,6 +3158,7 @@ def backward_fn(saved_for_backward, cotangents):
     return ForwardBackwardTraces(forward_trace, backward_trace)
 
+@thunder.core.profile.annotate_for_profile("recompute_saved_for_backward")
 def recompute_saved_for_backward(fwd_trace: Trace, bwd_trace: Trace) -> tuple[Trace, Trace]:
     """Generates the pair of traces with rematerializaion of the saved-for-backward tensors.
     Args:
         fwd_trace (Trace): forward trace.
         bwd_trace (Trace): backward trace.
@@ -3171,9 +3168,6 @@ def recompute_saved_for_backward(fwd_trace: Trace, bwd_trace: Trace) -> tuple[Tr
     Returns:
         tuple[Trace, Trace]: A tuple containing the new forward and backward traces.
     """
-
-    start_time_ns = time.perf_counter_ns()
-
     saved_for_bw = get_saved_for_backward_tensors(fwd_trace)
     fwd_trace_args = {variableify(j) for j in fwd_trace.args}
     old_saved_for_bwd = {variableify(j) for j in saved_for_bw}
@@ -3240,12 +3234,7 @@ def recompute_saved_for_backward(fwd_trace: Trace, bwd_trace: Trace) -> tuple[Tr
 
     new_bwd_trace.args = [(new_saved_for_backward, fwd_trace.output[1][1]), *bwd_trace.args[1:]]
 
-    elapsed_time_ns = time.perf_counter_ns() - start_time_ns
-    new_bwd_trace.set_provenance(
-        TraceProvenance(f"Saved for backward remat trace (took {elapsed_time_ns * 1e-6:.2f} milliseconds)")
-    )
-    new_fwd_trace.set_provenance(
-        TraceProvenance(f"Saved for backward remat trace (took {elapsed_time_ns * 1e-6:.2f} milliseconds)")
-    )
+    new_bwd_trace.set_provenance(TraceProvenance("Saved for backward remat trace"))
+    new_fwd_trace.set_provenance(TraceProvenance("Saved for backward remat trace"))
 
     return new_fwd_trace, new_bwd_trace
diff --git a/thunder/dev_utils/debug_transform.py b/thunder/dev_utils/debug_transform.py
index 53d12aef8c..48fac67562 100644
--- a/thunder/dev_utils/debug_transform.py
+++ b/thunder/dev_utils/debug_transform.py
@@ -27,8 +27,8 @@ def __init__(
         self.pre_callback = pre_callback
         self.post_callback = post_callback
 
+    @thunder.core.profile.annotate_for_profile("DebugTransform.transform_trace_post_optimization")
     def transform_trace_post_optimization(self, trace: TraceCtx, **kwargs) -> TraceCtx:
-        start_time_ns = time.perf_counter_ns()
         debug_trace = from_trace(trace)
 
         debug_counter = 1
@@ -67,9 +67,8 @@ def _post_call_ctx(post_debug_bsym, bsym, *args, **kwargs):
             debug_counter += 1
 
         debug_trace.bound_symbols = new_bsyms
 
-        elapsed_time_ns = time.perf_counter_ns() - start_time_ns
-        debug_trace.set_provenance(TraceProvenance(f"Debug trace (took {elapsed_time_ns * 1e-6:.2f} milliseconds)"))
+        debug_trace.set_provenance(TraceProvenance("Debug trace"))
 
         return debug_trace
diff --git a/thunder/dev_utils/nvtx_profile_transform.py b/thunder/dev_utils/nvtx_profile_transform.py
index 6839d047c4..ba6f2ce78b 100644
--- a/thunder/dev_utils/nvtx_profile_transform.py
+++ b/thunder/dev_utils/nvtx_profile_transform.py
@@ -1,3 +1,4 @@
+from thunder.core.profile import annotate_for_profile
 from thunder.core.trace import TraceCtx as Trace, from_trace, TraceProvenance
 from thunder.dev_utils.utils import NON_COMPUTATION_PRIMS
 from thunder.extend import OperatorExecutor
@@ -40,23 +41,19 @@ def nvtx_pop_impl():
 
 
 class NvtxProfileTransform(thunder.core.transforms.Transform):
+    @annotate_for_profile("NvtxProfileTransform.transform_trace_post_optimization")
     def transform_trace_post_optimization(self, trace: Trace, **kwargs) -> Trace:
-        with Timer() as timer:
-            profile_trace = from_trace(trace)
-
-            for bound_symbol in trace.bound_symbols:
-                if bound_symbol.sym.id in NON_COMPUTATION_PRIMS:
-                    profile_trace.bound_symbols.append(bound_symbol)
-                    continue
-
-                # Add nvtx range for the symbol.
-                profile_trace.bound_symbols.append(
-                    nvtx_push.bind(f"{''.join(bound_symbol.python(indent=0))}", output=None)
-                )
-                profile_trace.bound_symbols.append(bound_symbol)
-                profile_trace.bound_symbols.append(nvtx_pop.bind(output=None))
+        profile_trace = from_trace(trace)
+
+        for bound_symbol in trace.bound_symbols:
+            if bound_symbol.sym.id in NON_COMPUTATION_PRIMS:
+                profile_trace.bound_symbols.append(bound_symbol)
+                continue
+
+            # Add nvtx range for the symbol.
+            profile_trace.bound_symbols.append(nvtx_push.bind(f"{''.join(bound_symbol.python(indent=0))}", output=None))
+            profile_trace.bound_symbols.append(bound_symbol)
+            profile_trace.bound_symbols.append(nvtx_pop.bind(output=None))
 
-        profile_trace.set_provenance(
-            TraceProvenance(f"NVTX Profile Transform (took {timer.get_elapsed_time_in_ms()} milliseconds)")
-        )
+        profile_trace.set_provenance(TraceProvenance("NVTX Profile Transform"))
 
         return profile_trace
diff --git a/thunder/executors/nvfuserex_impl.py b/thunder/executors/nvfuserex_impl.py
index 9abab32c79..ca9e549fc4 100644
--- a/thunder/executors/nvfuserex_impl.py
+++ b/thunder/executors/nvfuserex_impl.py
@@ -18,6 +18,7 @@
 import thunder.torch as ltorch
 from thunder.torch import TensorLike
 
+import thunder.core.profile
 from thunder.core import prims, utils
 from thunder.core.baseutils import BoundSymbolInterface
 from thunder.core.prims import PrimIDs
@@ -743,6 +744,7 @@ def fuse(self, region: Region, fusion_counter: int) -> BoundSymbol:
 
     # TODO Update the replacement of redundant proxies to use a visitor pattern
     # when that architecture is added in the future
+    @thunder.core.profile.annotate_for_profile("nvFuserExecutor.cse")
     def cse(self, trace: TraceCtx) -> TraceCtx:
         """Remove bound symbols whose right hand side is common expression.
         Nvfuser specific CSE pass.
@@ -753,9 +755,6 @@ def cse(self, trace: TraceCtx) -> TraceCtx:
         Returns:
             :class:`TraceCtx` with common subexpression eliminated.
         """
-
-        start_time_ns = time.perf_counter_ns()
-
         cse_trace = from_trace(trace)
 
         # The trace_rhs_to_bsym_map is used for CSE on trace outside of nvFusion region.
@@ -826,18 +825,12 @@ def map_redundant(x: Any) -> Any:
             trace_output = tree_map(map_redundant, return_bsym.args)
             cse_trace.bound_symbols[-1] = prims.python_return.bind(*trace_output, output=None)
 
-        end_time_ns = time.perf_counter_ns()
-        elapsed_time_ns = end_time_ns - start_time_ns
-        elapsed_time_millis = elapsed_time_ns // 1000000
-
-        cse_trace.set_provenance(
-            TraceProvenance(f"Nvfuser Common Subexpression Elimination (took {elapsed_time_millis} milliseconds)")
-        )
+        cse_trace.set_provenance(TraceProvenance("Nvfuser Common Subexpression Elimination"))
         return cse_trace
 
     # TODO Restore fusion logic here -- this just replaces supported operations in isolation at the moment
+    @thunder.core.profile.annotate_for_profile("nvFuserExecutor.fusion_pass")
     def fusion_pass(self, trace: TraceCtx) -> TraceCtx:
-        start_time_ns: int = time.perf_counter_ns()
-
         # Replace uniform with uniform_philox and rng state operators for better rematerialization
         from thunder.core.rematerialization import replace_uniform
@@ -947,11 +940,7 @@ def _can_fuse_node(n: Node):
 
         fusedtrace = dce(fusedtrace)
         fusedtrace = update_fusion_call_ctx(fusedtrace)
-
-        end_time_ns: int = time.perf_counter_ns()
-        elapsed_time_ns: int = end_time_ns - start_time_ns
-        elapsed_time_millis: int = elapsed_time_ns // 1000000
-        fusedtrace.set_provenance(TraceProvenance(f"Fusion (took {elapsed_time_millis} milliseconds)"))
+        fusedtrace.set_provenance(TraceProvenance("nvFuser Fusion"))
 
         return fusedtrace
 
@@ -2129,9 +2118,8 @@ def copy_(
 # NOTE This only handles conversions performed by CONVERT_ELEMENT_TYPE, and not conversions caused
 # by other Symbols, like torch.to, which may be unflattened
 # TODO This could be extended to non-float conversions, like complex -> complex conversions
+@thunder.core.profile.annotate_for_profile("remove_redundant_casts")
 def remove_redundant_casts(trace: TraceCtx) -> tuple[TraceCtx, list[TraceCtx]]:
-    start_time_ns = time.perf_counter_ns()
-
     rrctrace = from_trace(trace)
 
     # Returns a tuple (is proxy float->float conversion?, object to convert, dtype to convert to)
@@ -2278,10 +2266,8 @@ def map_inside_replacement(x: Any) -> None:
 
     rrctrace.bound_symbols = nbsyms
 
-    end_time_ns = time.perf_counter_ns()
-    elapsed_time_ns = end_time_ns - start_time_ns
-    elapsed_time_millis = elapsed_time_ns // 1000000
-    rrctrace.set_provenance(TraceProvenance(f"Remove redundant casts (took {elapsed_time_millis} milliseconds)"))
+    rrctrace.set_provenance(TraceProvenance("Remove redundant casts"))
+
     return rrctrace
diff --git a/thunder/executors/passes.py b/thunder/executors/passes.py
index 2d5b9bd1f0..1516427d8c 100644
--- a/thunder/executors/passes.py
+++ b/thunder/executors/passes.py
@@ -10,6 +10,7 @@
 from thunder.core.trace import TraceCtx, from_trace, TraceProvenance, VariableInterface
 import thunder.core.dtypes as dtypes
 import thunder.core.utils as cutils
+import thunder.core.profile
 from thunder.core.utils import ProxyDict, check, safe_map_flat
 from thunder.core.symbol import BoundSymbol
 from thunder.core.pytree import tree_flatten, tree_unflatten, tree_map
@@ -28,9 +29,8 @@
 
 # Transforms a trace by determining which execution transforms to call given the list of executors in priority order
 # This pass tries to preserve the original trace and proxies.
+@thunder.core.profile.annotate_for_profile("_transform_for_operator_executor_execution")
 def _transform_for_operator_executor_execution(trace: TraceCtx, executors_list: Sequence[Executor]) -> TraceCtx:
-    start_time_ns = time.perf_counter_ns()
-
     swapmap: dict[Variable, Proxy] = {}
 
     def update_swapmap(o: Any, no: Any) -> None:
@@ -106,20 +106,15 @@ def process_bsym(self, bsym):
 
     extrace, _ = OpExProcessor(trace)()
 
-    end_time_ns = time.perf_counter_ns()
-    elapsed_time_ns = end_time_ns - start_time_ns
-    elapsed_time_millis = elapsed_time_ns // 1000000
-    extrace.set_provenance(
-        TraceProvenance(f"Transform for operator executor execution (took {elapsed_time_millis} milliseconds)")
-    )
+    extrace.set_provenance(TraceProvenance("Transform for operator executor execution"))
+
     return extrace
 
+@thunder.core.profile.annotate_for_profile("transform_for_execution")
 def transform_for_execution(trace: TraceCtx, executors_list: Sequence[Executor]) -> TraceCtx:
     import torch
 
-    start_time_ns = time.perf_counter_ns()
-
     if torch.distributed.is_available():
         # Apply AllReduce bucketing if possible & needed
         from thunder.distributed.transforms.ddp import apply_bucketing_to_grad_allreduce
@@ -148,11 +143,7 @@ def transform_for_execution(trace: TraceCtx, executors_list: Sequence[Executor])
     # NOTE This occurs if a fusion executor declines to execute a symbol after running its fusion pass
     extrace = _transform_for_operator_executor_execution(extrace, get_always_executors())
 
-    end_time_ns = time.perf_counter_ns()
-    elapsed_time_ns = end_time_ns - start_time_ns
-    elapsed_time_millis = elapsed_time_ns // 1000000
-
-    extrace.set_provenance(TraceProvenance(f"Transform for execution (took {elapsed_time_millis} milliseconds)"))
+    extrace.set_provenance(TraceProvenance("Transform for execution"))
 
     return extrace
 
@@ -193,6 +184,7 @@ def fusion_bsym_to_region(bsym: BoundSymbol):
     return bsym.sym.executor.fuse(fusion_bsym_to_region(bsym), counter)
 
+@thunder.core.profile.annotate_for_profile("update_fusion_call_ctx")
 def update_fusion_call_ctx(trace: TraceCtx) -> TraceCtx:
     """Updates the call context of the trace to be the current call context.
@@ -205,7 +197,6 @@ def update_fusion_call_ctx(trace: TraceCtx) -> TraceCtx:
     Returns:
         (TraceCtx): transformed trace
     """
-    start_time_ns = time.perf_counter_ns()
 
     new_trace = from_trace(trace)
     new_trace.bound_symbols = []
@@ -215,10 +206,7 @@ def update_fusion_call_ctx(trace: TraceCtx) -> TraceCtx:
         else:
             new_trace.bound_symbols.append(bsym)
 
-    end_time_ns = time.perf_counter_ns()
-    elapsed_time_ns = end_time_ns - start_time_ns
-    elapsed_time_millis = elapsed_time_ns // 1000000
-    new_trace.set_provenance(TraceProvenance(f"Update Call Context (took {elapsed_time_millis} milliseconds)"))
+    new_trace.set_provenance(TraceProvenance("Update Call Context"))
 
     return new_trace
 
@@ -268,6 +256,7 @@ def _del_last_used(bound_symbols, flattened_final_output, *, clear_mutable_colle
 
 # TODO Review deleting non-proxies
+@thunder.core.profile.annotate_for_profile("del_last_used")
 def del_last_used(trace: TraceCtx, *, clear_mutable_collections=False) -> TraceCtx:
     """Mark last used intermediates to be deleted. This lets the Python garbage collector free
         unused tensor memory.
@@ -278,8 +267,6 @@ def del_last_used(trace: TraceCtx, *, clear_mutable_collections=False) -> TraceC
     Returns:
         list: transformed trace
     """
-    start_time_ns = time.perf_counter_ns()
-
     del_trace = from_trace(trace)
     outs = cutils.sequencify(trace.output)
 
@@ -289,10 +276,5 @@ def del_last_used(trace: TraceCtx, *, clear_mutable_collections=False) -> TraceC
         trace.bound_symbols, flat_outs, clear_mutable_collections=clear_mutable_collections
     )
 
-    end_time_ns = time.perf_counter_ns()
-    elapsed_time_ns = end_time_ns - start_time_ns
-    elapsed_time_millis = elapsed_time_ns // 1000000
-
-    del_trace.set_provenance(TraceProvenance(f"Delete Last Used (took {elapsed_time_millis} milliseconds)"))
-
+    del_trace.set_provenance(TraceProvenance("Delete Last Used"))
     return del_trace
diff --git a/thunder/executors/torch_compile.py b/thunder/executors/torch_compile.py
index ce95b91bf8..afab3380ff 100644
--- a/thunder/executors/torch_compile.py
+++ b/thunder/executors/torch_compile.py
@@ -6,6 +6,7 @@
 import torch
 from lightning_utilities import compare_version
 
+import thunder.core.profile
 from thunder.core import prims, utils
 from thunder.core.proxies import Proxy, TensorProxy, unvariableify, Variable
 from thunder.core.rematerialization import rematerialize
@@ -154,9 +155,8 @@ def keyfn(x: Variable) -> str:
 
         return fusion_bsym
 
+    @thunder.core.profile.annotate_for_profile("TorchCompileExecutor.fusion_pass")
     def fusion_pass(self, trace: TraceCtx) -> TraceCtx:
-        start_time_ns: int = time.perf_counter_ns()
-
         fusedtrace: TraceCtx = from_trace(trace)
 
         producers, consumers = utils.producers_and_consumers(trace)
@@ -198,10 +198,7 @@ def _can_fuse_node(n: Node):
         fusedtrace = dce(fusedtrace)
         fusedtrace = update_fusion_call_ctx(fusedtrace)
 
-        end_time_ns: int = time.perf_counter_ns()
-        elapsed_time_ns: int = end_time_ns - start_time_ns
-        elapsed_time_millis: int = elapsed_time_ns // 1000000
-        fusedtrace.set_provenance(TraceProvenance(f"Fusion (took {elapsed_time_millis} milliseconds)"))
+        fusedtrace.set_provenance(TraceProvenance("torch.compile Fusion"))
 
         return fusedtrace
diff --git a/thunder/tests/test_core.py b/thunder/tests/test_core.py
index bf6e9bd7d3..34a2c4ede4 100644
--- a/thunder/tests/test_core.py
+++ b/thunder/tests/test_core.py
@@ -13,7 +13,7 @@
 from types import FunctionType
 
 import thunder
-from thunder import last_traces, cache_option, cache_hits, cache_misses
+from thunder import last_traces, cache_option
 import thunder.examine as examine
 import thunder.clang as clang
 import thunder.core.profile
@@ -270,55 +270,6 @@ def grad_second(a):
     assert_close(actual_second, expected)
 
-@instantiate(dtypes=(thunder.float32,))
-def test_grad_no_recompile(executor, device, dtype):
-    # Checks that having .grad or not does not cause recompile
-
-    def foo(a):
-        return a * 2
-
-    cfoo = executor.make_callable(foo)
-
-    tdtype = ltorch.to_torch_dtype(dtype)
-    a = make_tensor((2, 2), device=device, dtype=tdtype, requires_grad=True)
-    a.grad = make_tensor((2, 2), device=device, dtype=tdtype)
-    cfoo(a)
-    assert thunder.cache_misses(cfoo) == 1
-
-    a.grad = None
-    cfoo(a)
-    assert thunder.cache_misses(cfoo) == 1
-
-    b = make_tensor((3, 3), device=device, dtype=tdtype, requires_grad=True)
-    cfoo(b)
-    assert thunder.cache_misses(cfoo) == 2
-
-    b.grad = make_tensor((3, 3), device=device, dtype=tdtype)
-    cfoo(b)
-    assert thunder.cache_misses(cfoo) == 2
-
-
-@instantiate(dtypes=(thunder.float32,))
-def test_grad_recompile(executor, device, dtype):
-    # Checks that change in the metadata of a.grad causes recompile
-
-    def foo(a):
-        return a.grad * 2
-
-    cfoo = executor.make_callable(foo)
-
-    tdtype = ltorch.to_torch_dtype(dtype)
-    a = make_tensor((2, 2), device=device, dtype=tdtype, requires_grad=True)
-    a.grad = make_tensor((2, 2), device=device, dtype=tdtype)
-    cfoo(a)
-    assert thunder.cache_misses(cfoo) == 1
-
-    b = make_tensor((3, 3), device=device, dtype=tdtype, requires_grad=True)
-    b.grad = make_tensor((3, 3), device=device, dtype=tdtype)
-    cfoo(b)
-    assert thunder.cache_misses(cfoo) == 2
-
-
 @instantiate(dtypes=(thunder.float32,))
 def test_optimizer_unpack(executor, device, dtype):
     class Optimizer(torch.optim.Optimizer):
@@ -816,62 +767,42 @@ def foo(a, b):
 
     # Tensor x tensor
     result = cfoo(a, b)
-    assert cache_misses(cfoo) == 1
-    assert cache_hits(cfoo) == 0
     assert_close(result, a + b)
 
     # Same tensors -- cache hit
     result = cfoo(a, b)
-    assert cache_misses(cfoo) == 1
-    assert cache_hits(cfoo) == 1
     assert_close(result, a + b)
 
     # Different tensor, same metadata -- cache hit
     result = cfoo(a, c)
-    assert cache_misses(cfoo) == 1
-    assert cache_hits(cfoo) == 2
     assert_close(result, a + c)
 
     # Different tensor, different shape -- cache miss
     result = cfoo(a, d)
-    assert cache_misses(cfoo) == 2
-    assert cache_hits(cfoo) == 2
     assert_close(result, a + d)
 
     # Different tensor, different dtype -- cache miss
    result = cfoo(a, e)
-    assert cache_misses(cfoo) == 3
-    assert cache_hits(cfoo) == 2
     assert_close(result, a + e)
 
     # Tensor x float number -- cache miss
     result = cfoo(a, 1.0)
-    assert cache_misses(cfoo) == 4
-    assert cache_hits(cfoo) == 2
     assert_close(result, a + 1.0)
 
     # Tensor x float number, different tensor data -- cache hit
     result = cfoo(b, 1.0)
-    assert cache_misses(cfoo) == 4
-    assert cache_hits(cfoo) == 3
     assert_close(result, b + 1.0)
 
     # Tensor x float number, different number value -- cache miss
     result = cfoo(b, 3.0)
-    assert cache_misses(cfoo) == 5
-    assert cache_hits(cfoo) == 3
     assert_close(result, b + 3.0)
 
     # Tensor x int number, different number type -- cache miss
     result = cfoo(b, 3)
-    assert cache_misses(cfoo) == 6
-    assert cache_hits(cfoo) == 3
     assert_close(result, b + 3)
 
     # Tensor x int number -- cache hit
     result = cfoo(b, 3)
-    assert cache_misses(cfoo) == 6
-    assert cache_hits(cfoo) == 4
     assert_close(result, b + 3)
 
     def bar(a, b):
@@ -884,30 +815,20 @@ def bar(a, b):
 
     # String x string -- cache miss
     cbar(astr, bstr)
-    assert cache_misses(cbar) == 1
-    assert cache_hits(cbar) == 0
 
     # Same strings -- cache hit
     cbar(astr, bstr)
-    assert cache_misses(cbar) == 1
-    assert cache_hits(cbar) == 1
 
     # Same string values -- different strings
     bother_str = "b"
     cbar(astr, bother_str)
-    assert cache_misses(cbar) == 1
-    assert cache_hits(cbar) == 2
 
     # Object x string -- cache miss
     cbar(object(), bother_str)
-    assert cache_misses(cbar) == 2
-    assert cache_hits(cbar) == 2
 
     # TODO: test objects in prologues
     # object() != object() -- cache miss
     # cbar(object(), bother_str)
-    # assert cache_misses(cbar) == 3
-    # assert cache_hits(cbar) == 2
 
     # Module tests
     m = torch.nn.Linear(5, 5, device=device, dtype=torch_dtype)
@@ -920,9 +841,6 @@ def bar(a, b):
 
     assert_close(result, torch_result)
 
-    assert cache_misses(cm) == 1
-    assert cache_hits(cm) == 0
-
     # Same input -- cache hit
     result = cm(inp)
 
@@ -930,9 +848,6 @@ def bar(a, b):
 
     assert_close(result, torch_result)
 
-    assert cache_misses(cm) == 1
-    assert cache_hits(cm) == 1
-
     # Different input, same metadata -- cache hit
     inp = make_tensor((5, 5), device=device, dtype=torch_dtype)
     result = cm(inp)
@@ -940,9 +855,6 @@ def bar(a, b):
 
     assert_close(result, torch_result)
 
-    assert cache_misses(cm) == 1
-    assert cache_hits(cm) == 2
-
     # Different input, different metadata -- cache miss
     inp = make_tensor((6, 5), device=device, dtype=torch_dtype)
     result = cm(inp)
@@ -950,9 +862,6 @@ def bar(a, b):
 
     assert_close(result, torch_result)
 
-    assert cache_misses(cm) == 2
-    assert cache_hits(cm) == 2
-
     #
     # Sequence tests
     #
@@ -970,9 +879,6 @@ def caz(tup):
 
     assert_close(thunder_result, torch_result)
 
-    assert cache_misses(ccaz) == 1
-    assert cache_hits(ccaz) == 0
-
     # List with different values -- cache miss
     inp1 = [6, 3, 7]
     thunder_result = ccaz(inp1)
@@ -980,9 +886,6 @@ def caz(tup):
 
     assert_close(thunder_result, torch_result)
 
-    assert cache_misses(ccaz) == 2
-    assert cache_hits(ccaz) == 0
-
     # List with same values -- cache hit
     inp2 = [5, 3, 7]
     thunder_result = ccaz(inp2)
@@ -990,9 +893,6 @@ def caz(tup):
 
     assert_close(thunder_result, torch_result)
 
-    assert cache_misses(ccaz) == 2
-    assert cache_hits(ccaz) == 1
-
     # List with same values but different types -- cache miss
     inp3 = [5.0, 3, 7]
     thunder_result = ccaz(inp3)
@@ -1000,9 +900,6 @@ def caz(tup):
 
     assert_close(thunder_result, torch_result)
 
-    assert cache_misses(ccaz) == 3
-    assert cache_hits(ccaz) == 1
-
     #
     # Kwarg tests
     #
@@ -1017,9 +914,6 @@ def daz(*, a, b):
 
     assert_close(thunder_result, torch_result)
 
-    assert cache_misses(cdaz) == 1
-    assert cache_hits(cdaz) == 0
-
     # Same keys and tensor metadata the same -- cache hit
     inp1 = {"a": b, "b": a}
     thunder_result = cdaz(**inp1)
@@ -1027,9 +921,6 @@ def daz(*, a, b):
 
     assert_close(thunder_result, torch_result)
 
-    assert cache_misses(cdaz) == 1
-    assert cache_hits(cdaz) == 1
-
     # Same keys but different tensor metadata -- cache hit
     inp2 = {"a": b, "b": e}
     thunder_result = cdaz(**inp2)
@@ -1037,9 +928,6 @@ def daz(*, a, b):
 
     assert_close(thunder_result, torch_result)
 
-    assert cache_misses(cdaz) == 2
-    assert cache_hits(cdaz) == 1
-
 
 #
 # Tests related to trace manipulation and transformation
@@ -2738,8 +2626,6 @@ def fn(x):
     actual_dtype = jfn(x)
     assert actual_dtype == torch.float16
 
-    assert thunder.cache_misses(jfn) == 2
-
 
 def test_change_default_dtype_in_jitted_fn():
     default_dtype = torch.get_default_dtype()
@@ -2786,8 +2672,6 @@ def fn(x):
     except Exception:
         pass
 
-    assert thunder.cache_misses(jfn) == 2
-
 
 @requiresCUDA
 def test_change_default_device_in_jitted_fn():
@@ -3019,20 +2903,15 @@ def fn():
 
     jfn = thunder.jit(fn)
     assert jfn() == 1
-    assert thunder.cache_misses(jfn) == 1  # Due to first compilation.
 
     # Call jfn with no changes
     # this should be cache hit.
     assert jfn() == 1
-    assert thunder.cache_hits(jfn) == 1
-    assert thunder.cache_misses(jfn) == 1
 
     # Change the value of the captured dict.
-    # This should be a cache miss, verify that.
+    # This should be a cache miss
     d[h] = 2
     assert jfn() == 2  # Verify that jfn now returns 2
-    assert thunder.cache_hits(jfn) == 1
-    assert thunder.cache_misses(jfn) == 2
 
 
 def test_profiling_decorator():
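test_profiling_decorator (kept above) remains the coverage for thunder.core.profile. A minimal check in the same spirit -- a sketch with a hypothetical test name, assuming the annotation is transparent to the wrapped function's behavior:

    import thunder.core.profile

    def test_profiling_decorator_is_transparent():
        @thunder.core.profile.annotate_for_profile("square")
        def square(x):
            return x * x

        # The annotation only adds a profiler range; results are unchanged.
        assert square(3) == 9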
diff --git a/thunder/tests/test_inplace_functionalization.py b/thunder/tests/test_inplace_functionalization.py
index 6f88f1f8eb..d9cb60cef1 100644
--- a/thunder/tests/test_inplace_functionalization.py
+++ b/thunder/tests/test_inplace_functionalization.py
@@ -653,7 +653,6 @@ def f(a, b):
 
     res_of_a, a_out = jitted_f(a, a)
     ref_res_of_a, ref_a_out = f(a_ref, a_ref)
-    assert (thunder.cache_hits(jitted_f), thunder.cache_misses(jitted_f)) == (0, 1)
     torch.testing.assert_close(res_of_a, ref_res_of_a)
     torch.testing.assert_close(a, a_ref)
     assert a_out.data_ptr() == a.data_ptr()
@@ -662,12 +661,10 @@ def f(a, b):
     a_ref = a.clone().detach()
     res_of_a_and_b, _ = jitted_f(a, b)
     ref_res_of_a_and_b, _ = f(a_ref, b_ref)
-    assert (thunder.cache_hits(jitted_f), thunder.cache_misses(jitted_f)) == (0, 2)
     torch.testing.assert_close(res_of_a_and_b, ref_res_of_a_and_b)
 
     res_of_b, _ = jitted_f(b, b)
     ref_res_of_b, _ = f(b_ref, b_ref)
-    assert (thunder.cache_hits(jitted_f), thunder.cache_misses(jitted_f)) == (1, 2)
     torch.testing.assert_close(res_of_b, ref_res_of_b)
     torch.testing.assert_close(b, b_ref)
 
@@ -675,7 +672,6 @@ def f(a, b):
     b_ref = b.clone().detach()
     res_of_b_and_a, _ = jitted_f(b, a)
     ref_res_of_b_and_a, _ = f(b_ref, a_ref)
-    assert (thunder.cache_hits(jitted_f), thunder.cache_misses(jitted_f)) == (2, 2)
     torch.testing.assert_close(res_of_b_and_a, ref_res_of_b_and_a)
 
     # TODO(crcrpar): The message should be from the check of in-place to aliases of different shapes.
@@ -687,13 +683,9 @@ def f(a, b):
     jitted_f = executor.make_callable(f)
 
     jitted_f(a, a)
-    assert (thunder.cache_hits(jitted_f), thunder.cache_misses(jitted_f)) == (0, 1)
     jitted_f(a, b)
-    assert (thunder.cache_hits(jitted_f), thunder.cache_misses(jitted_f)) == (0, 2)
     jitted_f(b, a)
-    assert (thunder.cache_hits(jitted_f), thunder.cache_misses(jitted_f)) == (1, 2)
     jitted_f(b, b)
-    assert (thunder.cache_hits(jitted_f), thunder.cache_misses(jitted_f)) == (2, 2)
 
     def f(a, b, c):
         d = a.exp_()
diff --git a/thunder/tests/test_jit_general.py b/thunder/tests/test_jit_general.py
index 0d578cbea0..c6a95eb8ff 100644
--- a/thunder/tests/test_jit_general.py
+++ b/thunder/tests/test_jit_general.py
@@ -172,75 +172,6 @@ def foo(a, b):
     assert_close(actual, expected)
 
-def test_cache_basic():
-    def foo(a, b):
-        return a + b
-
-    jfoo = thunder.jit(foo)
-
-    a = torch.randn((2, 2), device="cpu")
-    b = torch.randn((2, 2), device="cpu")
-
-    expected = foo(a, b)
-    actual = jfoo(a, b)
-    assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 0
-
-    expected = foo(a, b)
-    actual = jfoo(a, b)
-    assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 1
-
-    # Tests rank changing
-    a = torch.randn((2), device="cpu")
-
-    expected = foo(a, b)
-    actual = jfoo(a, b)
-    assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 2
-    assert thunder.cache_hits(jfoo) == 1
-
-    expected = foo(a, b)
-    actual = jfoo(a, b)
-    assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 2
-    assert thunder.cache_hits(jfoo) == 2
-
-    # Tests dtype changing
-    a = torch.randn((2, 2), device="cpu", dtype=torch.bfloat16)
-    b = torch.randn((2, 2), device="cpu", dtype=torch.bfloat16)
-
-    expected = foo(a, b)
-    actual = jfoo(a, b)
-    assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 3
-    assert thunder.cache_hits(jfoo) == 2
-
-    expected = foo(a, b)
-    actual = jfoo(a, b)
-    assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 3
-    assert thunder.cache_hits(jfoo) == 3
-
-    # Tests shape changing
-    a = torch.randn((2, 1), device="cpu", dtype=torch.bfloat16)
-    b = torch.randn((2, 1), device="cpu", dtype=torch.bfloat16)
-
-    expected = foo(a, b)
-    actual = jfoo(a, b)
-    assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 4
-    assert thunder.cache_hits(jfoo) == 3
-
-    expected = foo(a, b)
-    actual = jfoo(a, b)
-    assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 4
-    assert thunder.cache_hits(jfoo) == 4
-
-
 def test_cache_always_trace():
     def foo(a, b):
         return a + b
@@ -256,8 +187,6 @@ def foo(a, b):
     actual = jfoo(a, b)
     actual = jfoo(a, b)
     assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 4
-    assert thunder.cache_hits(jfoo) == 0
 
 def test_cache_equality_constraint():
@@ -274,15 +203,6 @@ def fn(b):
     assert_close(fn(True), jfn(True))
     assert_close(fn(False), jfn(False))
 
-    assert thunder.cache_misses(jfn) == 2
-    assert thunder.cache_hits(jfn) == 0
-
-    assert_close(fn(True), jfn(True))
-    assert_close(fn(False), jfn(False))
-
-    assert thunder.cache_misses(jfn) == 2
-    assert thunder.cache_hits(jfn) == 2
-
 
 def test_nn_parameter():
     a = torch.nn.Parameter(torch.randn(2, 3))
@@ -577,9 +497,6 @@ def test_litgpt():
     result = jfn(*args, **kwargs)
     assert_close(result, module(*args, **kwargs))
 
-    assert thunder.cache_misses(jfn) == 1
-    assert thunder.cache_hits(jfn) == 1
-
 
 def test_nanogpt_block():
     from thunder.benchmarks import NanoGPTBlockBenchmark, NanoGPTConfig, _nanogpt_configs
@@ -853,8 +770,6 @@ def foo(a, scalar):
     expected = foo(a, b)
 
     assert_close(actual, expected)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 0
 
     b = 2
 
@@ -862,8 +777,6 @@ def foo(a, scalar):
     expected = foo(a, b)
 
     assert_close(actual, expected)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 1
 
 def test_post_optimization_transform():
@@ -947,30 +860,22 @@ def foo(scalar):
 
     actual = jfoo(1.5)
     assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 0
 
     expected = foo(2.0)
     actual = jfoo(2.0)
     assert_close(expected, actual)
     # even though we should be able to re-use the cache, we cannot do it now. Because constraints are propagated to inputs being static number.
-    assert thunder.cache_misses(jfoo) == 2
-    assert thunder.cache_hits(jfoo) == 0
 
     expected = foo(1.5)
     actual = jfoo(1.5)
     assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 2
-    assert thunder.cache_hits(jfoo) == 1
 
     expected = foo(-0.3)
     actual = jfoo(-0.3)
     assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 3
-    assert thunder.cache_hits(jfoo) == 1
 
     def bar(t):
         if t[0].item() > 5:
@@ -1125,15 +1030,11 @@ def foo(t, batch_size):
 
     actual = jfoo(a, 32)
     assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 0
 
     expected = foo(a, 16)
     actual = jfoo(a, 16)
     assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 1
 
 
 @pytest.mark.filterwarnings("ignore:Please use `torch.vmap`")
@@ -1415,8 +1316,6 @@ def foo(a):
     expected = foo(a)
 
     assert_close(actual, expected)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 0
 
     a = torch.randn((3, 4, 5), device="cpu")
 
@@ -1424,8 +1323,6 @@ def foo(a):
     expected = foo(a)
 
     assert_close(actual, expected)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 1
 
 def test_cache_symbolic_values_reshape_numel():
diff --git a/thunder/transforms/cudagraph.py b/thunder/transforms/cudagraph.py
index 7776db882e..d7d8fabb87 100644
--- a/thunder/transforms/cudagraph.py
+++ b/thunder/transforms/cudagraph.py
@@ -5,6 +5,7 @@
 
 import torch
 
+from thunder.core.profile import annotate_for_profile
 from thunder.core.transform_common import Transform
 from thunder.core import utils, prims
 from thunder.core.proxies import Proxy, ProxyTag, unvariableify
@@ -268,9 +269,8 @@ def can_fuse(self, bsym: BoundSymbol):
 
         return True
 
+    @annotate_for_profile("CUDAGraphTransform.transform_trace_post_optimization")
     def transform_trace_post_optimization(self, trace: TraceCtx, **kwargs) -> TraceCtx:
-        start_time_ns: int = time.perf_counter_ns()
-
         def _should_fuse(a: Node, b: Node):
             # TODO: modify the logic to be able to potentially better handle
            # islands around data-dependent ops once these are supported by Thunder.
@@ -315,9 +315,5 @@ def _can_fuse_node(n: Node):
             delattr(fused_trace, "clear_collection_names")
         reset_tracectx(fused_trace_tok)
 
-        end_time_ns = time.perf_counter_ns()
-        elapsed_time_ns = end_time_ns - start_time_ns
-        elapsed_time_ms = elapsed_time_ns // 1000000
-        fused_trace.set_provenance(TraceProvenance(f"CUDAGraph fusion (took {elapsed_time_ms} milliseconds)"))
-
+        fused_trace.set_provenance(TraceProvenance("CUDAGraph fusion"))
         return fused_trace
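End-to-end, the renamed provenance ("CUDAGraph fusion") composes with the usual transform plumbing. A sketch, assuming the class in thunder/transforms/cudagraph.py is CUDAGraphTransform (the diff shows only its methods) and a CUDA device is available:

    import torch
    import thunder
    from thunder.transforms.cudagraph import CUDAGraphTransform

    def fn(x):
        return x.sin() + 1

    jfn = thunder.jit(fn, transforms=[CUDAGraphTransform()])
    jfn(torch.randn(8, device="cuda"))

    # The fused trace's header now reads "# Constructed by CUDAGraph fusion";
    # how long the fusion pass took is a question for the profiler, not the string.
    print(thunder.last_traces(jfn)[-1])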