diff --git a/thunder/__init__.py b/thunder/__init__.py
index eceacc4cad..06ba26fcc2 100644
--- a/thunder/__init__.py
+++ b/thunder/__init__.py
@@ -12,12 +12,14 @@
 from looseversion import LooseVersion
 
 from thunder.core.module import ThunderModule
+import thunder.core.profile
 from thunder.core.interpreter import InterpreterLogItem
 from thunder.core.options import (
     CACHE_OPTIONS,
     SHARP_EDGES_OPTIONS,
     DebugOptions,
 )
+from thunder.core.profile import annotate_for_profile
 from thunder.core.trace import (
     TraceResults,
     TraceCtx,
@@ -277,6 +279,7 @@ def compile(fn: Callable, recipe: Recipe | None):
 
 # This function will replace compile() (below) before RC1
 # TODO RC1 Consider renaming compile_options to additional_compile_options
+@thunder.core.profile.annotate_for_profile("jit")
 def jit(
     fn: Callable,
     /,
@@ -378,6 +381,7 @@ def _alias_tensor_of_args_kwargs_dict(*args, **kwargs) -> dict[int, list[int]]:
                 tensor_group_index_to_tensor_indices[tgi].append(idx)
         return tensor_group_index_to_tensor_indices
 
+    @thunder.core.profile.annotate_for_profile("_alias_tensor_of_args_kwargs")
     def _alias_tensor_of_args_kwargs(*args, **kwargs) -> str:
         """If no aliases found, empty string, otherwise, aliases are comma separated, groups are hyphen separated."""
 
@@ -392,6 +396,7 @@ def _alias_tensor_of_args_kwargs(*args, **kwargs) -> str:
 
     @langctxs.langctx(cd.langctx)
     @_with_cache_info_ctx
+    @thunder.core.profile.annotate_for_profile("get_computation_and_inputs")
     def get_computation_and_inputs(*args, **kwargs):
         # set up a record of things in the current environment that impact caching / prologues
         # this could be replaced by the respective querying in the prologues
@@ -441,7 +446,8 @@ def get_computation_and_inputs(*args, **kwargs):
         # alaises wouldn't matter, thus it'd be better to nullify this entry in such cases.
         # It however would require the functionalized computation trace to interact with `cache_info`,
         # which seems to break the consistency of cache_info, leading to a failure in cache_info check.
-        cache_info["alias_tensor_indices"] = _alias_tensor_of_args_kwargs(*args, **kwargs)
+        with thunder.core.profile.annotate_for_profile("Alias_tensor_of_args_kwarsgs"):
+            cache_info["alias_tensor_indices"] = _alias_tensor_of_args_kwargs(*args, **kwargs)
 
         # Store the `is_grad_enabled` state of PyTorch. This is used by vjp transform
         # to treat certain Symbols as constant.
@@ -451,7 +457,6 @@ def get_computation_and_inputs(*args, **kwargs):
         # TODO RC1 Add module and function checks to prologue (make it a compile option)
 
         # Checks cache
-        cs.last_trace_cache_start = time.perf_counter_ns()
         if (cd.cache_option is CACHE_OPTIONS.CONSTANT_VALUES) or (cd.cache_option is CACHE_OPTIONS.SYMBOLIC_VALUES):
             for cache_entry in reversed(cs.interpreter_cache):
                 with compile_data_and_stats(cd, cs):
@@ -467,7 +472,8 @@ def get_computation_and_inputs(*args, **kwargs):
                         _return_none_instead_of_grads,
                     ) = cache_entry
                     try:
-                        inps, pro_to_epi = pro(*args, **kwargs)
+                        from thunder.core.profile import annotate_for_profile
+                        inps, pro_to_epi = annotate_for_profile("prologue")(pro(*args, **kwargs))
                     except Exception as _:
                         continue
 
@@ -478,10 +484,6 @@ def get_computation_and_inputs(*args, **kwargs):
                     cs.last_interpreter_log = None
                     cs.last_prologue_traces = pro_traces
                     cs.last_prologue = pro
-                    cs.last_prologue_transformation_start = 0
-                    cs.last_prologue_transformation_stop = 0
-                    cs.last_computation_transformation_start = 0
-                    cs.last_computation_transformation_stop = 0
 
                     return cache_entry, inps, pro_to_epi
 
@@ -499,7 +501,7 @@ def get_computation_and_inputs(*args, **kwargs):
                     backward_traces,
                 ) = cache_entry
 
-                inps, pro_to_epi = pro(*args, **kwargs)
+                inps, pro_to_epi = annotate_for_profile("Prologue")(pro(*args, **kwargs))
 
                 # Updates cache statistics
                 cs.cache_hits += 1
@@ -512,7 +514,6 @@ def get_computation_and_inputs(*args, **kwargs):
                 return cache_entry, inps, pro_to_epi
 
         cs.cache_misses += 1
-        cs.last_trace_cache_stop = time.perf_counter_ns()
 
         # Resets use of compile flags
         cs.last_compile_reasons = defaultdict(list)
@@ -520,8 +521,6 @@ def get_computation_and_inputs(*args, **kwargs):
         with compile_data_and_stats(cd, cs):
             # Acquires the trace OR inlines the trace into an existing trace and
             #   returns the (proxied) result of the operation
-            cs.last_trace_tracing_start = time.perf_counter_ns()
-
             prologue_trc: TraceCtx
             computation_trc: TraceCtx
             jit_results: TraceResults = thunder_general_jit(
@@ -563,11 +562,7 @@ def get_computation_and_inputs(*args, **kwargs):
 
             epilogue_traces = [epilogue_trc]
 
-            cs.last_trace_tracing_stop = time.perf_counter_ns()
-
             # Makes the prologue callable
-            cs.last_prologue_transformation_start = time.perf_counter_ns()
-
             transform: Transform
             for transform in transforms:
                 thunder.core.utils.check_type(transform, Transform)
@@ -596,13 +591,16 @@ def get_computation_and_inputs(*args, **kwargs):
                 use_del_last_used=False,
             )
             prologue_trc = prologue_traces[-1]
-            pro = prologue_trc.python_callable(include_decorators=False)
-            pro = prologue_execution_timer(pro)
+            pro = thunder.core.profile.annotate_for_profile("prologue python_callable")(
+                prologue_trc.python_callable(include_decorators=False)
+            )
+            #pro = thunder.core.profile.annotate_for_profile("prologue")(pro)
 
             epilogue_trc = transform_to_torch_types(epilogue_trc)
-            epilogue = epilogue_trc.python_callable()
+            epilogue = thunder.core.profile.annotate_for_profile("epilogue python_callable")(
+              epilogue_trc.python_callable()
+            )
 
-            cs.last_prologue_transformation_stop = time.perf_counter_ns()
             cs.last_prologue_traces = prologue_traces
             cs.last_prologue = pro
             cs.last_traces = computation_traces
@@ -672,7 +670,9 @@ def get_computation_and_inputs(*args, **kwargs):
                         backward_traces.append(backward_trc)
 
             if backward_trc is not None:
-                backward_fn = backward_trc.python_callable()
+                backward_fn = thunder.core.profile.annotate_for_profile("backward python_callable")(
+                    backward_trc.python_callable()
+                )
             else:
                 backward_fn = None
                 # We do not have to return auxiliary tensors, which will only be useful in backward pass
@@ -701,33 +701,20 @@ def get_computation_and_inputs(*args, **kwargs):
 
     def host_execution_timer(fn):
         def wrapped(*args, **kwargs):
-            cs.last_trace_host_execution_start = time.perf_counter_ns()
-            try:
+            with thunder.core.profile.annotate_for_profile("computation"):
                 return fn(*args, **kwargs)
-            finally:
-                cs.last_trace_host_execution_stop = time.perf_counter_ns()
-
-        return wrapped
-
-    def prologue_execution_timer(fn):
-        def wrapped(*args, **kwargs):
-            cs.last_prologue_execution_start = time.perf_counter_ns()
-            try:
-                return fn(*args, **kwargs)
-            finally:
-                cs.last_prologue_execution_stop = time.perf_counter_ns()
-
         return wrapped
 
     def decorate_computation_function(get_computation_and_inputs_fn, *decorators):
         def wrapped(*args, **kwargs):
-            cache_entry, inps, pro_to_epi = get_computation_and_inputs_fn(*args, **kwargs)
-            decorated_computation_fn = cache_entry.computation_fn
-            for decorator in decorators:
-                decorated_computation_fn = decorator(decorated_computation_fn)
-            if decorators:
-                cache_entry = cache_entry._replace(computation_fn=decorated_computation_fn)
-            return cache_entry, inps, pro_to_epi
+            with thunder.core.profile.annotate_for_profile("get_computation_and_inputs"):
+                cache_entry, inps, pro_to_epi = get_computation_and_inputs_fn(*args, **kwargs)
+                decorated_computation_fn = cache_entry.computation_fn
+                for decorator in decorators:
+                    decorated_computation_fn = decorator(decorated_computation_fn)
+                if decorators:
+                    cache_entry = cache_entry._replace(computation_fn=decorated_computation_fn)
+                return cache_entry, inps, pro_to_epi
 
         return wrapped
 
@@ -737,14 +724,11 @@ def wrapped(*args, **kwargs):
     def update_call_statistics(fn):
         def wrapped(*args, **kwargs):
             cs.calls += 1
-            cs.last_trace_host_start = time.perf_counter_ns()
-            try:
-                return fn(*args, **kwargs)
-            finally:
-                cs.last_trace_host_stop = time.perf_counter_ns()
-
+            return fn(*args, **kwargs)
         return wrapped
 
+
+    @thunder.core.profile.annotate_for_profile("maybe_connect_to_autograd")
     def maybe_connect_to_autograd(cache_entry, result):
         if cache_entry.backward_fn:
             # If the backward function is available, we need to connect the
@@ -769,6 +753,7 @@ def call_epilogue(cache_entry, comp_result, pro_to_epi):
 
     @wraps(fn)
     @update_call_statistics
+    @thunder.core.profile.annotate_for_profile("fn_")
     def fn_(*args, **kwargs) -> Any:
         if is_tracing():
             _recursive_jit_call_warning()
@@ -990,6 +975,7 @@ def get_auto_registered_torch_op_names(fn: Callable, /) -> set[str] | None:
 
 
 # TODO (mruberry) Update this
+@thunder.core.profile.annotate_for_profile("_grad_transform")
 def _grad_transform(trace):
     grad_fwd_trace = from_trace(trace)
     trace_tok = set_tracectx(grad_fwd_trace)
@@ -1043,10 +1029,12 @@ def _grad_transform(trace):
 
 # TODO Test nesting of grad and grad and grad and grad
 # TODO Test nesting of a regular function + calling grad
+@thunder.core.profile.annotate_for_profile("grad")
 def grad(fn):
     cfn = compile(fn)
 
     @wraps(cfn)
+    @thunder.core.profile.annotate_for_profile("_fn")
     def _fn(*args, **kwargs):
         original_result, original_trace = cfn(*args, **kwargs)
         original_trace = last_traces(cfn)
diff --git a/thunder/benchmarks/__init__.py b/thunder/benchmarks/__init__.py
index e4e3ca232e..d28a749585 100644
--- a/thunder/benchmarks/__init__.py
+++ b/thunder/benchmarks/__init__.py
@@ -232,15 +232,6 @@ class BenchmarkRunStatistics:
     stop_time: int
     host_stop_time: int
     called_backward: bool
-    has_extended_stats: bool = False
-    last_trace_host_start: int = -1
-    last_trace_host_stop: int = -1
-    last_trace_cache_start: int = -1
-    last_trace_cache_stop: int = -1
-    last_trace_tracing_start: int = -1
-    last_trace_tracing_stop: int = -1
-    last_trace_host_execution_start: int = -1
-    last_trace_host_execution_stop: int = -1
 
 
 # A timing helper
@@ -294,18 +285,6 @@ def _benchmark(
             called_backward=called_backward,
         )
 
-        # TODO Ensure the compile data statistics are always populated
-        if cs is not None and cs.last_trace_host_start > 0:
-            stat.has_extended_stats = True
-            stat.last_trace_host_start = cs.last_trace_host_start
-            stat.last_trace_host_stop = cs.last_trace_host_stop
-            stat.last_trace_cache_start = cs.last_trace_cache_start
-            stat.last_trace_cache_stop = cs.last_trace_cache_stop
-            stat.last_trace_tracing_start = cs.last_trace_tracing_start
-            stat.last_trace_tracing_stop = cs.last_trace_tracing_stop
-            stat.last_trace_host_execution_start = cs.last_trace_host_execution_start
-            stat.last_trace_host_execution_stop = cs.last_trace_host_execution_stop
-
         stats.append(stat)
 
     return stats
@@ -417,51 +396,9 @@ def _prettyprint_stats(
         for rank, (memory_allocated, memory_reserved) in rank_mem_info.items():
             short_printout += f"\n    rank-{rank} - peak allocated memory {memory_allocated/1024/1024:.2f}MB, peak reserved: {memory_reserved/1024/1024:.2f}MB"
         short_printout += "\n"
-    if median_benchmark_stat.has_extended_stats:
-        # NOTE At this point in the program extended statistics are available
-        trace_time_ns = median_benchmark_stat.last_trace_host_stop - median_benchmark_stat.last_trace_host_start
-        cache_time_ns = median_benchmark_stat.last_trace_cache_stop - median_benchmark_stat.last_trace_cache_start
-        tracing_time_ns = median_benchmark_stat.last_trace_tracing_stop - median_benchmark_stat.last_trace_tracing_start
-        trace_execution_time_ns = (
-            median_benchmark_stat.last_trace_host_execution_stop - median_benchmark_stat.last_trace_host_execution_start
-        )
-
-        trace_time_us: str = ns_to_us(trace_time_ns)
-        cache_time_us: str = ns_to_us(cache_time_ns)
-        tracing_time_us: str = ns_to_us(tracing_time_ns)
-        trace_execution_time_us: str = ns_to_us(trace_execution_time_ns)
-
-        trace_time_percentage: str = f"{round(trace_time_ns / median_benchmark_stat.total_time * 100)}%"
-        cache_time_percentage: str = f"{round(cache_time_ns / median_benchmark_stat.total_time * 100)}%"
-        tracing_time_percentage: str = f"{round(tracing_time_ns / median_benchmark_stat.total_time * 100)}%"
-        trace_execution_time_percentage: str = (
-            f"{round(trace_execution_time_ns / median_benchmark_stat.total_time * 100)}%"
-        )
-
-        before_trace_time_ns = median_benchmark_stat.last_trace_host_start - median_benchmark_stat.start_time
-        accelerator_wait_time_ns = median_benchmark_stat.stop_time - median_benchmark_stat.last_trace_host_stop
-
-        before_trace_time_us: str = ns_to_us(before_trace_time_ns)
-        accelerator_wait_time_us: str = ns_to_us(accelerator_wait_time_ns)
-
-        before_trace_time_percentage: str = f"{round(before_trace_time_ns / median_benchmark_stat.total_time * 100)}%"
-        accelerator_wait_time_percentage: str = (
-            f"{round(accelerator_wait_time_ns / median_benchmark_stat.total_time * 100)}%"
-        )
-
-        extension = f"""\
-            The median benchmark took {before_trace_time_us} to get into the tracing logic, {before_trace_time_percentage} of the total time.
-            The median benchmark took {accelerator_wait_time_us} waiting for the accelerator's computation to finish, {accelerator_wait_time_percentage} of the total time.
-            The median benchmark run's total time in tracing logic is {trace_time_us}, {trace_time_percentage} of the total time.
-            The median benchmark run's cache lookup time is {cache_time_us}, {cache_time_percentage} of the total time.
-            The median benchmark run's time spent tracing is {tracing_time_us}, {tracing_time_percentage} of the total time.
-            The median benchmark run's time to request the traced program be executed is {trace_execution_time_us}, {trace_execution_time_percentage} of the total time.
-        """
-    else:
         extension = ""
 
-    output = textwrap.dedent(preamble) + textwrap.indent(textwrap.dedent(extension), " " * 4)
-    print(output)
+    print(textwrap.dedent(preamble))
 
 
 def print_rank_0(message):
diff --git a/thunder/common.py b/thunder/common.py
index c5101b8c27..454e71d8c8 100644
--- a/thunder/common.py
+++ b/thunder/common.py
@@ -83,20 +83,6 @@ class CompileStats:
         last_interpreted_instructions (Generator[dist.Instruction, None, None] | None):
         last_interpreter_log (list[InterpreterLogItem] | None):
         last_backward_traces (Sequence[TraceCtx]):
-        last_trace_host_start (int):
-        last_trace_host_stop (int):
-        last_trace_cache_start (int):
-        last_trace_cache_stop (int):
-        last_trace_tracing_start (int):
-        last_trace_tracing_stop (int):
-        last_trace_host_execution_start (int):
-        last_trace_host_execution_stop (int):
-        last_prologue_transformation_start (int):
-        last_prologue_transformation_stop (int):
-        last_prologue_execution_start (int):
-        last_prologue_execution_stop (int):
-        last_computation_execution_start (int):
-        last_computation_execution_stop (int):
         cache (dict):
         interpreter_cache (list):
         calls (int):
@@ -117,23 +103,6 @@ def __init__(self):
         # torch.autograd.Function specific data
         self.last_backward_traces = None
 
-        # Timing stats
-        self.last_trace_host_start: int = -1
-        self.last_trace_host_stop: int = -1
-        self.last_trace_cache_start: int = -1
-        self.last_trace_cache_stop: int = -1
-        self.last_trace_tracing_start: int = -1
-        self.last_trace_tracing_stop: int = -1
-        self.last_trace_host_execution_start: int = -1
-        self.last_trace_host_execution_stop: int = -1
-
-        self.last_prologue_transformation_start: int = -1
-        self.last_prologue_transformation_stop: int = -1
-        self.last_prologue_execution_start: int = -1
-        self.last_prologue_execution_stop: int = -1
-        self.last_computation_execution_start: int = -1
-        self.last_computation_execution_stop: int = -1
-
         # Cache stats
         self.cache = {}
         self.interpreter_cache: list = []
@@ -152,29 +121,24 @@ def _time_template(self, start: int, stop: int, desc: str, /) -> int:
         return stop - start
 
     def last_cache_lookup_time(self, /) -> int:
-        start: int = self.last_trace_cache_start
-        stop: int = self.last_trace_cache_stop
-        return self._time_template(start, stop, "cache lookup")
+        warnings.warn(self.last_times_msg)
+        return -1
 
     def last_trace_construction_time(self, /) -> int:
-        start: int = self.last_trace_host_start
-        stop: int = self.last_trace_host_stop
-        return self._time_template(start, stop, "trace construction")
+        warnings.warn(self.last_times_msg)
+        return -1
 
     def last_prologue_transformation_time(self, /) -> int:
-        start: int = self.last_prologue_transformation_start
-        stop: int = self.last_prologue_transformation_stop
-        return self._time_template(start, stop, "prologue construction")
+        warnings.warn(self.last_times_msg)
+        return -1
 
     def last_prologue_execution_time(self, /) -> int:
-        start: int = self.last_prologue_execution_start
-        stop: int = self.last_prologue_execution_stop
-        return self._time_template(start, stop, "prologue execution")
+        warnings.warn(self.last_times_msg)
+        return -1
 
     def last_computation_execution_time(self, /) -> int:
-        start: int = self.last_computation_execution_start
-        stop: int = self.last_computation_execution_stop
-        return self._time_template(start, stop, "computation execution")
+        warnings.warn(self.last_times_msg)
+        return -1
 
 
 # A class that holds data about the compiled object, including statistics about how it's been called
@@ -638,6 +602,7 @@ def wait_for_future(f: FutureTensorProxy) -> TensorProxy:
 # TODO Consider making this faster by reusing more data
 # TODO Create a general mechanism for running traces that produces reproducible provenance and the
 #   appropriate error checks
+@thunder.core.profile.annotate_for_profile("transform_for_execution")
 def transform_for_execution(
     trace: TraceCtx,
     executors_list: Sequence[Executor],
diff --git a/thunder/core/functionalization.py b/thunder/core/functionalization.py
index 86859e6428..d7b3dfabdd 100644
--- a/thunder/core/functionalization.py
+++ b/thunder/core/functionalization.py
@@ -3,6 +3,7 @@
 from typing import TYPE_CHECKING
 
 import thunder.core.prims as prims
+import thunder.core.profile
 from thunder.core.proxies import variableify, TensorProxy, unvariableify, ProxyInterface
 from thunder.core.pytree import tree_flatten, tree_unflatten
 from thunder.core.symbol import BoundSymbol
@@ -39,6 +40,7 @@ def bsym_of_to_return_self(bsym: BoundSymbol):
     return result_is_self
 
 
+@thunder.core.profile.annotate_for_profile("check_inplace_to_views")
 def check_inplace_to_views(computation_trace: Trace) -> dict[VariableInterface, TensorProxy]:
     """Error out if in-place op that outputs of different number of elements from the input and the input has other consumers."""
     import thunder.torch as ltorch
@@ -613,6 +615,7 @@ def _reshape_bsym_ctor(src: TensorProxy, dst: TensorProxy, trace: Trace) -> tupl
     return functionalized_computation_trace
 
 
+@thunder.core.profile.annotate_for_profile("functionalize_inplace_ops")
 def functionalize_inplace_ops(
     computation_trace: Trace,
     orig_to_view_swap_map: dict[VariableInterface, TensorProxy],
diff --git a/thunder/core/jit_ext.py b/thunder/core/jit_ext.py
index 726353983a..d4586d7419 100644
--- a/thunder/core/jit_ext.py
+++ b/thunder/core/jit_ext.py
@@ -1784,6 +1784,7 @@ def update_tags(proxy_swapmap: dict[Variable, Proxy]) -> None:
 )
 
 
+@thunder.core.profile.annotate_for_profile("thunder_general_jit")
 def thunder_general_jit(
     fn: Callable,
     args: tuple[Any, ...],
diff --git a/thunder/core/trace.py b/thunder/core/trace.py
index b8670212aa..c71795f587 100644
--- a/thunder/core/trace.py
+++ b/thunder/core/trace.py
@@ -14,6 +14,7 @@
 import thunder
 import thunder.core.codeutils as codeutils
 import thunder.core.baseutils as baseutils
+import thunder.core.profile
 from thunder.core.baseutils import ProxyInterface, BoundSymbolInterface, TagBase
 import thunder.core.devices as devices
 from thunder.core.pytree import tree_flatten, tree_unflatten
@@ -479,6 +480,7 @@ def keyfn(class_or_module: type | ModuleType) -> str:
     # Returns a Python callable that executes the trace
     # TODO issue "Create a mechanism for freezing TraceCtx objects"
     #   Create a mechanism for freezing traces and cache the compilation
+    @thunder.core.profile.annotate_for_profile("TraceCtx.python_callable")
     def python_callable(self, *, global_dicts: None | dict = None, **kwargs: Any) -> Callable:
         python_str: str
 
diff --git a/thunder/core/transform_common.py b/thunder/core/transform_common.py
index bfe4123dc6..ed05a0cd1a 100644
--- a/thunder/core/transform_common.py
+++ b/thunder/core/transform_common.py
@@ -141,6 +141,7 @@ def keep_or_swap(p):
 #   that only produce non-proxy objects
 # NOTE needed_proxies is an in/out argument, it takes an initial set of Variables you want to keep, and return
 #   all the needed proxies of the input trace
+@thunder.core.profile.annotate_for_profile("dce")
 def dce(trace: Trace, needed_proxies: None | set[Variable] = None) -> Trace:
     start_time_ns = time.perf_counter_ns()
 
@@ -491,6 +492,7 @@ def process_bound_symbols(src_bound_symbols, target_bound_symbols):
     return output
 
 
+@thunder.core.profile.annotate_for_profile("wrap_return_value_together_with_arguments")
 def wrap_return_value_together_with_arguments(trace: Trace) -> Trace:
     last = trace.bound_symbols[-1]
     assert last.sym.id == prims.PrimIDs.RETURN
@@ -515,6 +517,7 @@ def unwrap_return_value(trace: Trace) -> Trace:
     return new_trace
 
 
+@thunder.core.profile.annotate_for_profile("remove_context_manager_prims_from_trace")
 def remove_context_manager_prims_from_trace(trace: Trace) -> Trace:
     def is_context_manager_prim(bsym):
         # context manager prims would/should be explicitly tagged.
diff --git a/thunder/core/transforms.py b/thunder/core/transforms.py
index 3cb34a623e..4902d69be6 100644
--- a/thunder/core/transforms.py
+++ b/thunder/core/transforms.py
@@ -24,6 +24,7 @@
     interpret_trace_to_trace,
     trace_interpreter_skip_list,
 )
+from thunder.core.profile import annotate_for_profile
 from thunder.core.proxies import (
     CollectionProxy,
     NumberProxy,
@@ -1513,6 +1514,7 @@ def transform_traces_pre_prologue(
             computation_trc = dce(computation_trc)
 
             @wraps(computation_trc.python_callable())
+            @annotate_for_profile("_GradTransform.python_callable")
             def python_callable(*args, **kwargs):
                 return eval_trace(computation_trc, *args, **kwargs)["output"]
 
diff --git a/thunder/executors/passes.py b/thunder/executors/passes.py
index 2d5b9bd1f0..f18454f291 100644
--- a/thunder/executors/passes.py
+++ b/thunder/executors/passes.py
@@ -10,6 +10,7 @@
 from thunder.core.trace import TraceCtx, from_trace, TraceProvenance, VariableInterface
 import thunder.core.dtypes as dtypes
 import thunder.core.utils as cutils
+import thunder.core.profile
 from thunder.core.utils import ProxyDict, check, safe_map_flat
 from thunder.core.symbol import BoundSymbol
 from thunder.core.pytree import tree_flatten, tree_unflatten, tree_map
@@ -28,6 +29,7 @@
 
 # Transforms a trace by determining which execution transforms to call given the list of executors in priority order
 # This pass tries to preserve the original trace and proxies.
+@thunder.core.profile.annotate_for_profile("_transform_for_operator_execution")
 def _transform_for_operator_executor_execution(trace: TraceCtx, executors_list: Sequence[Executor]) -> TraceCtx:
     start_time_ns = time.perf_counter_ns()
 
@@ -115,6 +117,7 @@ def process_bsym(self, bsym):
     return extrace
 
 
+@thunder.core.profile.annotate_for_profile("transform_for_execution")
 def transform_for_execution(trace: TraceCtx, executors_list: Sequence[Executor]) -> TraceCtx:
     import torch
 
@@ -193,6 +196,7 @@ def fusion_bsym_to_region(bsym: BoundSymbol):
     return bsym.sym.executor.fuse(fusion_bsym_to_region(bsym), counter)
 
 
+@thunder.core.profile.annotate_for_profile("update_fusion_call_ctx")
 def update_fusion_call_ctx(trace: TraceCtx) -> TraceCtx:
     """Updates the call context of the trace to be the current call context.
 
@@ -268,6 +272,7 @@ def _del_last_used(bound_symbols, flattened_final_output, *, clear_mutable_colle
 
 
 # TODO Review deleting non-proxies
+@thunder.core.profile.annotate_for_profile("del_last_used")
 def del_last_used(trace: TraceCtx, *, clear_mutable_collections=False) -> TraceCtx:
     """Mark last used intermediates to be deleted. This lets the Python garbage collector free
         unused tensor memory.
diff --git a/thunder/tests/test_core.py b/thunder/tests/test_core.py
index bf6e9bd7d3..7f9d96f08e 100644
--- a/thunder/tests/test_core.py
+++ b/thunder/tests/test_core.py
@@ -13,7 +13,7 @@
 from types import FunctionType
 
 import thunder
-from thunder import last_traces, cache_option, cache_hits, cache_misses
+from thunder import last_traces, cache_option
 import thunder.examine as examine
 import thunder.clang as clang
 import thunder.core.profile
@@ -270,54 +270,6 @@ def grad_second(a):
     assert_close(actual_second, expected)
 
 
-@instantiate(dtypes=(thunder.float32,))
-def test_grad_no_recompile(executor, device, dtype):
-    # Checks that having .grad or not does not cause recompile
-
-    def foo(a):
-        return a * 2
-
-    cfoo = executor.make_callable(foo)
-
-    tdtype = ltorch.to_torch_dtype(dtype)
-    a = make_tensor((2, 2), device=device, dtype=tdtype, requires_grad=True)
-    a.grad = make_tensor((2, 2), device=device, dtype=tdtype)
-    cfoo(a)
-    assert thunder.cache_misses(cfoo) == 1
-
-    a.grad = None
-    cfoo(a)
-    assert thunder.cache_misses(cfoo) == 1
-
-    b = make_tensor((3, 3), device=device, dtype=tdtype, requires_grad=True)
-    cfoo(b)
-    assert thunder.cache_misses(cfoo) == 2
-
-    b.grad = make_tensor((3, 3), device=device, dtype=tdtype)
-    cfoo(b)
-    assert thunder.cache_misses(cfoo) == 2
-
-
-@instantiate(dtypes=(thunder.float32,))
-def test_grad_recompile(executor, device, dtype):
-    # Checks that change in the metadata of a.grad causes recompile
-
-    def foo(a):
-        return a.grad * 2
-
-    cfoo = executor.make_callable(foo)
-
-    tdtype = ltorch.to_torch_dtype(dtype)
-    a = make_tensor((2, 2), device=device, dtype=tdtype, requires_grad=True)
-    a.grad = make_tensor((2, 2), device=device, dtype=tdtype)
-    cfoo(a)
-    assert thunder.cache_misses(cfoo) == 1
-
-    b = make_tensor((3, 3), device=device, dtype=tdtype, requires_grad=True)
-    b.grad = make_tensor((3, 3), device=device, dtype=tdtype)
-    cfoo(b)
-    assert thunder.cache_misses(cfoo) == 2
-
 
 @instantiate(dtypes=(thunder.float32,))
 def test_optimizer_unpack(executor, device, dtype):
@@ -816,62 +768,42 @@ def foo(a, b):
 
     # Tensor x tensor
     result = cfoo(a, b)
-    assert cache_misses(cfoo) == 1
-    assert cache_hits(cfoo) == 0
     assert_close(result, a + b)
 
     # Same tensors -- cache hit
     result = cfoo(a, b)
-    assert cache_misses(cfoo) == 1
-    assert cache_hits(cfoo) == 1
     assert_close(result, a + b)
 
     # Different tensor, same metadata -- cache hit
     result = cfoo(a, c)
-    assert cache_misses(cfoo) == 1
-    assert cache_hits(cfoo) == 2
     assert_close(result, a + c)
 
     # Different tensor, different shape -- cache miss
     result = cfoo(a, d)
-    assert cache_misses(cfoo) == 2
-    assert cache_hits(cfoo) == 2
     assert_close(result, a + d)
 
     # Different tensor, different dtype -- cache miss
     result = cfoo(a, e)
-    assert cache_misses(cfoo) == 3
-    assert cache_hits(cfoo) == 2
     assert_close(result, a + e)
 
     # Tensor x float number -- cache miss
     result = cfoo(a, 1.0)
-    assert cache_misses(cfoo) == 4
-    assert cache_hits(cfoo) == 2
     assert_close(result, a + 1.0)
 
     # Tensor x float number, different tensor data -- cache hit
     result = cfoo(b, 1.0)
-    assert cache_misses(cfoo) == 4
-    assert cache_hits(cfoo) == 3
     assert_close(result, b + 1.0)
 
     # Tensor x float number, different number value -- cache miss
     result = cfoo(b, 3.0)
-    assert cache_misses(cfoo) == 5
-    assert cache_hits(cfoo) == 3
     assert_close(result, b + 3.0)
 
     # Tensor x int number, different number type -- cache miss
     result = cfoo(b, 3)
-    assert cache_misses(cfoo) == 6
-    assert cache_hits(cfoo) == 3
     assert_close(result, b + 3)
 
     # Tensor x int number -- cache hit
     result = cfoo(b, 3)
-    assert cache_misses(cfoo) == 6
-    assert cache_hits(cfoo) == 4
     assert_close(result, b + 3)
 
     def bar(a, b):
@@ -884,30 +816,20 @@ def bar(a, b):
 
     # String x string -- cache miss
     cbar(astr, bstr)
-    assert cache_misses(cbar) == 1
-    assert cache_hits(cbar) == 0
 
     # Same strings -- cache hit
     cbar(astr, bstr)
-    assert cache_misses(cbar) == 1
-    assert cache_hits(cbar) == 1
 
     # Same string values -- different strings
     bother_str = "b"
     cbar(astr, bother_str)
-    assert cache_misses(cbar) == 1
-    assert cache_hits(cbar) == 2
 
     # Object x string -- cache miss
     cbar(object(), bother_str)
-    assert cache_misses(cbar) == 2
-    assert cache_hits(cbar) == 2
 
     # TODO: test objects in prologues
     # object() != object() -- cache miss
     # cbar(object(), bother_str)
-    # assert cache_misses(cbar) == 3
-    # assert cache_hits(cbar) == 2
 
     # Module tests
     m = torch.nn.Linear(5, 5, device=device, dtype=torch_dtype)
@@ -920,9 +842,6 @@ def bar(a, b):
 
     assert_close(result, torch_result)
 
-    assert cache_misses(cm) == 1
-    assert cache_hits(cm) == 0
-
     # Same input -- cache hit
 
     result = cm(inp)
@@ -930,9 +849,6 @@ def bar(a, b):
 
     assert_close(result, torch_result)
 
-    assert cache_misses(cm) == 1
-    assert cache_hits(cm) == 1
-
     # Different input, same metadata -- cache hit
     inp = make_tensor((5, 5), device=device, dtype=torch_dtype)
     result = cm(inp)
@@ -940,9 +856,6 @@ def bar(a, b):
 
     assert_close(result, torch_result)
 
-    assert cache_misses(cm) == 1
-    assert cache_hits(cm) == 2
-
     # Different input, different metadata -- cache miss
     inp = make_tensor((6, 5), device=device, dtype=torch_dtype)
     result = cm(inp)
@@ -950,9 +863,6 @@ def bar(a, b):
 
     assert_close(result, torch_result)
 
-    assert cache_misses(cm) == 2
-    assert cache_hits(cm) == 2
-
     #
     # Sequence tests
     #
@@ -970,9 +880,6 @@ def caz(tup):
 
     assert_close(thunder_result, torch_result)
 
-    assert cache_misses(ccaz) == 1
-    assert cache_hits(ccaz) == 0
-
     # List with different values -- cache miss
     inp1 = [6, 3, 7]
     thunder_result = ccaz(inp1)
@@ -980,9 +887,6 @@ def caz(tup):
 
     assert_close(thunder_result, torch_result)
 
-    assert cache_misses(ccaz) == 2
-    assert cache_hits(ccaz) == 0
-
     # List with same values -- cache hit
     inp2 = [5, 3, 7]
     thunder_result = ccaz(inp2)
@@ -990,9 +894,6 @@ def caz(tup):
 
     assert_close(thunder_result, torch_result)
 
-    assert cache_misses(ccaz) == 2
-    assert cache_hits(ccaz) == 1
-
     # List with same values but different types -- cache miss
     inp3 = [5.0, 3, 7]
     thunder_result = ccaz(inp3)
@@ -1000,9 +901,6 @@ def caz(tup):
 
     assert_close(thunder_result, torch_result)
 
-    assert cache_misses(ccaz) == 3
-    assert cache_hits(ccaz) == 1
-
     #
     # Kwarg tests
     #
@@ -1017,9 +915,6 @@ def daz(*, a, b):
 
     assert_close(thunder_result, torch_result)
 
-    assert cache_misses(cdaz) == 1
-    assert cache_hits(cdaz) == 0
-
     # Same keys and tensor metadata the same -- cache hit
     inp1 = {"a": b, "b": a}
     thunder_result = cdaz(**inp1)
@@ -1027,9 +922,6 @@ def daz(*, a, b):
 
     assert_close(thunder_result, torch_result)
 
-    assert cache_misses(cdaz) == 1
-    assert cache_hits(cdaz) == 1
-
     # Same keys but different tensor metadata -- cache hit
     inp2 = {"a": b, "b": e}
     thunder_result = cdaz(**inp2)
@@ -1037,9 +929,6 @@ def daz(*, a, b):
 
     assert_close(thunder_result, torch_result)
 
-    assert cache_misses(cdaz) == 2
-    assert cache_hits(cdaz) == 1
-
 
 #
 # Tests related to trace manipulation and transformation
@@ -2738,8 +2627,6 @@ def fn(x):
         actual_dtype = jfn(x)
         assert actual_dtype == torch.float16
 
-    assert thunder.cache_misses(jfn) == 2
-
 
 def test_change_default_dtype_in_jitted_fn():
     default_dtype = torch.get_default_dtype()
@@ -2786,8 +2673,6 @@ def fn(x):
         except Exception:
             pass
 
-    assert thunder.cache_misses(jfn) == 2
-
 
 @requiresCUDA
 def test_change_default_device_in_jitted_fn():
@@ -3019,20 +2904,15 @@ def fn():
 
     jfn = thunder.jit(fn)
     assert jfn() == 1
-    assert thunder.cache_misses(jfn) == 1  # Due to first compilation.
 
     # Call jfn with no changes
     # this should be cache hit.
     assert jfn() == 1
-    assert thunder.cache_hits(jfn) == 1
-    assert thunder.cache_misses(jfn) == 1
 
     # Change the value of the captured dict.
-    # This should be a cache miss, verify that.
+    # This should be a cache miss
     d[h] = 2
     assert jfn() == 2  # Verify that jfn now returns 2
-    assert thunder.cache_hits(jfn) == 1
-    assert thunder.cache_misses(jfn) == 2
 
 
 def test_profiling_decorator():
diff --git a/thunder/tests/test_inplace_functionalization.py b/thunder/tests/test_inplace_functionalization.py
index 6f88f1f8eb..d9cb60cef1 100644
--- a/thunder/tests/test_inplace_functionalization.py
+++ b/thunder/tests/test_inplace_functionalization.py
@@ -653,7 +653,6 @@ def f(a, b):
 
     res_of_a, a_out = jitted_f(a, a)
     ref_res_of_a, ref_a_out = f(a_ref, a_ref)
-    assert (thunder.cache_hits(jitted_f), thunder.cache_misses(jitted_f)) == (0, 1)
     torch.testing.assert_close(res_of_a, ref_res_of_a)
     torch.testing.assert_close(a, a_ref)
     assert a_out.data_ptr() == a.data_ptr()
@@ -662,12 +661,10 @@ def f(a, b):
     a_ref = a.clone().detach()
     res_of_a_and_b, _ = jitted_f(a, b)
     ref_res_of_a_and_b, _ = f(a_ref, b_ref)
-    assert (thunder.cache_hits(jitted_f), thunder.cache_misses(jitted_f)) == (0, 2)
     torch.testing.assert_close(res_of_a_and_b, ref_res_of_a_and_b)
 
     res_of_b, _ = jitted_f(b, b)
     ref_res_of_b, _ = f(b_ref, b_ref)
-    assert (thunder.cache_hits(jitted_f), thunder.cache_misses(jitted_f)) == (1, 2)
     torch.testing.assert_close(res_of_b, ref_res_of_b)
     torch.testing.assert_close(b, b_ref)
 
@@ -675,7 +672,6 @@ def f(a, b):
     b_ref = b.clone().detach()
     res_of_b_and_a, _ = jitted_f(b, a)
     ref_res_of_b_and_a, _ = f(b_ref, a_ref)
-    assert (thunder.cache_hits(jitted_f), thunder.cache_misses(jitted_f)) == (2, 2)
     torch.testing.assert_close(res_of_b_and_a, ref_res_of_b_and_a)
 
     # TODO(crcrpar): The message should be from the check of in-place to aliases of different shapes.
@@ -687,13 +683,9 @@ def f(a, b):
 
     jitted_f = executor.make_callable(f)
     jitted_f(a, a)
-    assert (thunder.cache_hits(jitted_f), thunder.cache_misses(jitted_f)) == (0, 1)
     jitted_f(a, b)
-    assert (thunder.cache_hits(jitted_f), thunder.cache_misses(jitted_f)) == (0, 2)
     jitted_f(b, a)
-    assert (thunder.cache_hits(jitted_f), thunder.cache_misses(jitted_f)) == (1, 2)
     jitted_f(b, b)
-    assert (thunder.cache_hits(jitted_f), thunder.cache_misses(jitted_f)) == (2, 2)
 
     def f(a, b, c):
         d = a.exp_()
diff --git a/thunder/tests/test_jit_general.py b/thunder/tests/test_jit_general.py
index 0d578cbea0..c6a95eb8ff 100644
--- a/thunder/tests/test_jit_general.py
+++ b/thunder/tests/test_jit_general.py
@@ -172,75 +172,6 @@ def foo(a, b):
     assert_close(actual, expected)
 
 
-def test_cache_basic():
-    def foo(a, b):
-        return a + b
-
-    jfoo = thunder.jit(foo)
-
-    a = torch.randn((2, 2), device="cpu")
-    b = torch.randn((2, 2), device="cpu")
-
-    expected = foo(a, b)
-    actual = jfoo(a, b)
-    assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 0
-
-    expected = foo(a, b)
-    actual = jfoo(a, b)
-    assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 1
-
-    # Tests rank changing
-    a = torch.randn((2), device="cpu")
-
-    expected = foo(a, b)
-    actual = jfoo(a, b)
-    assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 2
-    assert thunder.cache_hits(jfoo) == 1
-
-    expected = foo(a, b)
-    actual = jfoo(a, b)
-    assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 2
-    assert thunder.cache_hits(jfoo) == 2
-
-    # Tests dtype changing
-    a = torch.randn((2, 2), device="cpu", dtype=torch.bfloat16)
-    b = torch.randn((2, 2), device="cpu", dtype=torch.bfloat16)
-
-    expected = foo(a, b)
-    actual = jfoo(a, b)
-    assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 3
-    assert thunder.cache_hits(jfoo) == 2
-
-    expected = foo(a, b)
-    actual = jfoo(a, b)
-    assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 3
-    assert thunder.cache_hits(jfoo) == 3
-
-    # Tests shape changing
-    a = torch.randn((2, 1), device="cpu", dtype=torch.bfloat16)
-    b = torch.randn((2, 1), device="cpu", dtype=torch.bfloat16)
-
-    expected = foo(a, b)
-    actual = jfoo(a, b)
-    assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 4
-    assert thunder.cache_hits(jfoo) == 3
-
-    expected = foo(a, b)
-    actual = jfoo(a, b)
-    assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 4
-    assert thunder.cache_hits(jfoo) == 4
-
-
 def test_cache_always_trace():
     def foo(a, b):
         return a + b
@@ -256,8 +187,6 @@ def foo(a, b):
     actual = jfoo(a, b)
     actual = jfoo(a, b)
     assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 4
-    assert thunder.cache_hits(jfoo) == 0
 
 
 def test_cache_equality_constraint():
@@ -274,15 +203,6 @@ def fn(b):
     assert_close(fn(True), jfn(True))
     assert_close(fn(False), jfn(False))
 
-    assert thunder.cache_misses(jfn) == 2
-    assert thunder.cache_hits(jfn) == 0
-
-    assert_close(fn(True), jfn(True))
-    assert_close(fn(False), jfn(False))
-
-    assert thunder.cache_misses(jfn) == 2
-    assert thunder.cache_hits(jfn) == 2
-
 
 def test_nn_parameter():
     a = torch.nn.Parameter(torch.randn(2, 3))
@@ -577,9 +497,6 @@ def test_litgpt():
     result = jfn(*args, **kwargs)
     assert_close(result, module(*args, **kwargs))
 
-    assert thunder.cache_misses(jfn) == 1
-    assert thunder.cache_hits(jfn) == 1
-
 
 def test_nanogpt_block():
     from thunder.benchmarks import NanoGPTBlockBenchmark, NanoGPTConfig, _nanogpt_configs
@@ -853,8 +770,6 @@ def foo(a, scalar):
     expected = foo(a, b)
 
     assert_close(actual, expected)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 0
 
     b = 2
 
@@ -862,8 +777,6 @@ def foo(a, scalar):
     expected = foo(a, b)
 
     assert_close(actual, expected)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 1
 
 
 def test_post_optimization_transform():
@@ -947,30 +860,22 @@ def foo(scalar):
     actual = jfoo(1.5)
 
     assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 0
 
     expected = foo(2.0)
     actual = jfoo(2.0)
 
     assert_close(expected, actual)
     # even though we should be able to re-use the cache, we cannot do it now. Because constraints are propagated to inputs being static number.
-    assert thunder.cache_misses(jfoo) == 2
-    assert thunder.cache_hits(jfoo) == 0
 
     expected = foo(1.5)
     actual = jfoo(1.5)
 
     assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 2
-    assert thunder.cache_hits(jfoo) == 1
 
     expected = foo(-0.3)
     actual = jfoo(-0.3)
 
     assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 3
-    assert thunder.cache_hits(jfoo) == 1
 
     def bar(t):
         if t[0].item() > 5:
@@ -1125,15 +1030,11 @@ def foo(t, batch_size):
     actual = jfoo(a, 32)
 
     assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 0
 
     expected = foo(a, 16)
     actual = jfoo(a, 16)
 
     assert_close(expected, actual)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 1
 
 
 @pytest.mark.filterwarnings("ignore:Please use `torch.vmap`")
@@ -1415,8 +1316,6 @@ def foo(a):
     expected = foo(a)
 
     assert_close(actual, expected)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 0
 
     a = torch.randn((3, 4, 5), device="cpu")
 
@@ -1424,8 +1323,6 @@ def foo(a):
     expected = foo(a)
 
     assert_close(actual, expected)
-    assert thunder.cache_misses(jfoo) == 1
-    assert thunder.cache_hits(jfoo) == 1
 
 
 def test_cache_symbolic_values_reshape_numel():