Skip to content

Commit

Permalink
address review
Browse files Browse the repository at this point in the history
  • Loading branch information
kshitij12345 committed Aug 21, 2024
1 parent 091aa9b commit 72ca3d9
Showing 1 changed file with 15 additions and 2 deletions.
17 changes: 15 additions & 2 deletions notebooks/writing_a_trace_transform_cpu_offloading.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@
"# Create a new executor.\n",
"offload_ex = OperatorExecutor(\"offload_ex\")\n",
"\n",
"# NOTE: We create the offloaded CPU tensor in pinned memory and load the tensor back onto GPU with `to(non_blocking=True)`.\n",
"# These allow for better memory transfer speeds.\n",
"# Read the following tutorial for detailed explanation - https://pytorch.org/tutorials/intermediate/pinmem_nonblock.html\n",
"\n",
"# Offload the GPU tensor to a pinned CPU tensor.\n",
"def offload_to_cpu_impl(t):\n",
Expand Down Expand Up @@ -148,7 +151,18 @@
" return {sym: idx for idx, sym in enumerate(symbols)}\n",
"\n",
"\n",
"def move_closer_to_consumer(execution_trace):\n",
"def move_closer_to_consumer(execution_trace: TraceCtx) -> TraceCtx:\n",
" '''\n",
"    This function takes the trace and reorders its operations so that an operation producing a value\n",
"    is placed closer to the operation that consumes it.\n",
"\n",
"    This is required as in the backward trace, the first consumer of a saved_for_backward tensor may be\n",
" a reshape or permute op and the actual computation occurs 50-100 (or more) lines later.\n",
"    Because of this, we eagerly load more tensors than required (thus decreasing the memory gains from CPU Offloading).\n",
"\n",
" Args:\n",
" execution_trace (TraceCtx): Trace to be re-ordered.\n",
" '''\n",
" order_in_trace = {bsym: i for i, bsym in enumerate(execution_trace.bound_symbols)}\n",
"\n",
" def prefer_ops_closer_to_consumer(eligible_nodes: list[Node]) -> int:\n",
Expand Down Expand Up @@ -385,7 +399,6 @@
" # We need this because, in the unmodified backward trace, the first consumer of saved_for_backward may be\n",
" # a reshape or permute op and the actual computation occurs 50-100 (or more) lines later.\n",
" # Because of this, we eagerly load more tensors than required (thus decreasing the memory gains from CPU Offloading).\n",
" # This function is currently tailored to pattern observed in Llama-2\n",
" # Eg. on line 92\n",
" # # Created by CPU Offloading Transform\n",
" # t1319 = load_to_gpu(offloaded_t1319, 'cuda:0') # t1319: \"cuda:0 f32[8, 1024, 11008]\"\n",
Expand Down

0 comments on commit 72ca3d9

Please sign in to comment.