Skip to content

Commit

Permalink
address review
Browse files Browse the repository at this point in the history
  • Loading branch information
kshitij12345 committed Aug 21, 2024
1 parent 091aa9b commit 72ca3d9
Showing 1 changed file with 15 additions and 2 deletions.
17 changes: 15 additions & 2 deletions notebooks/writing_a_trace_transform_cpu_offloading.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@
"# Create a new executor.\n",
"offload_ex = OperatorExecutor(\"offload_ex\")\n",
"\n",
"# NOTE: We create the offloaded CPU tensor in pinned memory and load the tensor back onto GPU with `to(non_blocking=True)`.\n",
"# These allow for better memory transfer speeds.\n",
"# Read the following tutorial for detailed explanation - https://pytorch.org/tutorials/intermediate/pinmem_nonblock.html\n",
"\n",
"# Offload the GPU tensor to a pinned CPU tensor.\n",
"def offload_to_cpu_impl(t):\n",
Expand Down Expand Up @@ -148,7 +151,18 @@
" return {sym: idx for idx, sym in enumerate(symbols)}\n",
"\n",
"\n",
"def move_closer_to_consumer(execution_trace):\n",
"def move_closer_to_consumer(execution_trace: TraceCtx) -> TraceCtx:\n",
" '''\n",
"    This function takes the trace and reorders its operations so that an operation producing a value\n",
"    is placed closer to the operation that consumes it.\n",
"\n",
"    This is required as in the backward trace, the first consumer of a saved_for_backward tensor may be\n",
" a reshape or permute op and the actual computation occurs 50-100 (or more) lines later.\n",
"    Because of this, we eagerly load more tensors than required (thus decreasing the memory gains from CPU Offloading).\n",
"\n",
" Args:\n",
" execution_trace (TraceCtx): Trace to be re-ordered.\n",
" '''\n",
" order_in_trace = {bsym: i for i, bsym in enumerate(execution_trace.bound_symbols)}\n",
"\n",
" def prefer_ops_closer_to_consumer(eligible_nodes: list[Node]) -> int:\n",
Expand Down Expand Up @@ -385,7 +399,6 @@
" # We need this because, in the unmodified backward trace, the first consumer of saved_for_backward may be\n",
" # a reshape or permute op and the actual computation occurs 50-100 (or more) lines later.\n",
" # Because of this, we eagerly load more tensors than required (thus decreasing the memory gains from CPU Offloading).\n",
" # This function is currently tailored to pattern observed in Llama-2\n",
" # Eg. on line 92\n",
" # # Created by CPU Offloading Transform\n",
" # t1319 = load_to_gpu(offloaded_t1319, 'cuda:0') # t1319: \"cuda:0 f32[8, 1024, 11008]\"\n",
Expand Down

0 comments on commit 72ca3d9

Please sign in to comment.