From 14eedf9f8573ca49493bc49ea38b150b43b925a0 Mon Sep 17 00:00:00 2001
From: Yan Wang
Date: Tue, 10 Dec 2024 11:19:55 +0100
Subject: [PATCH] hello world thunderfx

---
 notebooks/hello_world_thunderfx.ipynb         | 279 ++++++++++++++++
 ...er_as_torch_compile_backend_tutorial.ipynb | 308 ------------------
 2 files changed, 279 insertions(+), 308 deletions(-)
 create mode 100644 notebooks/hello_world_thunderfx.ipynb
 delete mode 100644 notebooks/thunder_as_torch_compile_backend_tutorial.ipynb

diff --git a/notebooks/hello_world_thunderfx.ipynb b/notebooks/hello_world_thunderfx.ipynb
new file mode 100644
index 0000000000..7a2e5eae96
--- /dev/null
+++ b/notebooks/hello_world_thunderfx.ipynb
@@ -0,0 +1,279 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## \"Hello, World!\" ThunderFX\n",
+    "\n",
+    "In this tutorial, we’ll explore how to use ThunderFX to accelerate PyTorch programs.\n",
+    "\n",
+    "We’ll cover the basics of ThunderFX, how to apply it to PyTorch functions and models, and how to evaluate its performance in both inference and gradient calculations."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Getting Started with ThunderFX\n",
+    "\n",
+    "Let's see an example of using ThunderFX on a PyTorch function. ThunderFX optimizes the given callable and returns a compiled version of the function. You can then use the compiled function just like you would the original one."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from thunder.dynamo import thunderfx\n",
+    "\n",
+    "def foo(x, y):\n",
+    "    return torch.sin(x) + torch.cos(y)\n",
+    "\n",
+    "# Compiles foo with ThunderFX\n",
+    "compiled_foo = thunderfx(foo)\n",
+    "\n",
+    "# Creates inputs\n",
+    "inputs = [torch.randn(4, 4), torch.randn(4, 4)]\n",
+    "\n",
+    "eager_results = foo(*inputs)\n",
+    "# Runs the compiled function\n",
+    "thunderfx_results = compiled_foo(*inputs)\n",
+    "\n",
+    "torch.testing.assert_close(eager_results, thunderfx_results)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "ThunderFX supports both CPU and CUDA tensors. However, its primary focus is optimizing CUDA calculations. The following example demonstrates ThunderFX with CUDA tensors:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "\n",
+    "# Checks if CUDA is available\n",
+    "if not torch.cuda.is_available():\n",
+    "    print(\"No suitable GPU detected. Unable to proceed with the tutorial. Cell execution has been stopped.\")\n",
+    "    sys.exit()\n",
+    "\n",
+    "\n",
+    "# Creates inputs\n",
+    "inputs = [torch.randn(4, 4, device=\"cuda\"), torch.randn(4, 4, device=\"cuda\")]\n",
+    "\n",
+    "eager_result = foo(*inputs)\n",
+    "thunderfx_result = compiled_foo(*inputs)\n",
+    "\n",
+    "torch.testing.assert_close(eager_result, thunderfx_result)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Performance Optimization with ThunderFX\n",
+    "\n",
+    "Next, let’s evaluate how ThunderFX improves performance on a real-world model. We'll use the Llama3 model as an example and compare the execution time for both inference and gradient calculations.\n",
+    "\n",
+    "We begin by loading and configuring a lightweight version of the Llama3 model:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "GPT(\n",
+       "  (lm_head): Linear(in_features=4096, out_features=128256, bias=False)\n",
+       "  (transformer): ModuleDict(\n",
+       "    (wte): Embedding(128256, 4096)\n",
+       "    (h): ModuleList(\n",
+       "      (0-1): 2 x Block(\n",
+       "        (norm_1): RMSNorm()\n",
+       "        (attn): CausalSelfAttention(\n",
+       "          (attn): Linear(in_features=4096, out_features=6144, bias=False)\n",
+       "          (proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
+       "        )\n",
+       "        (post_attention_norm): Identity()\n",
+       "        (norm_2): RMSNorm()\n",
+       "        (mlp): LLaMAMLP(\n",
+       "          (fc_1): Linear(in_features=4096, out_features=14336, bias=False)\n",
+       "          (fc_2): Linear(in_features=4096, out_features=14336, bias=False)\n",
+       "          (proj): Linear(in_features=14336, out_features=4096, bias=False)\n",
+       "        )\n",
+       "        (post_mlp_norm): Identity()\n",
+       "      )\n",
+       "    )\n",
+       "    (ln_f): RMSNorm()\n",
+       "  )\n",
+       ")"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from litgpt import Config, GPT\n",
+    "from functools import partial\n",
+    "from torch.testing import make_tensor\n",
+    "from thunder.dynamo import thunderfx\n",
+    "\n",
+    "cfg = Config.from_name(\"Llama-3-8B\")\n",
+    "\n",
+    "# Uses a reduced configuration for this tutorial\n",
+    "cfg.n_layer = 2\n",
+    "cfg.block_size = 1024\n",
+    "batch_dim = 8\n",
+    "\n",
+    "torch.set_default_dtype(torch.bfloat16)\n",
+    "make = partial(make_tensor, low=0, high=255, device='cuda', dtype=torch.int64, requires_grad=False)\n",
+    "\n",
+    "with torch.device('cuda'):\n",
+    "    model = GPT(cfg)\n",
+    "    shape = (batch_dim, cfg.block_size)\n",
+    "    x = make(shape)\n",
+    "\n",
+    "model "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Again, we first compile the model and compare its output with the eager result:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "deviation: 0.015625\n"
+     ]
+    }
+   ],
+   "source": [
+    "compiled_model = thunderfx(model)\n",
+    "thunderfx_result = compiled_model(x)\n",
+    "eager_result = model(x)\n",
+    "print(\"deviation:\", (thunderfx_result - eager_result).abs().max().item())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note: ThunderFX compiles the model into optimized kernels as it executes. This means the first run may take longer due to the compilation process, but subsequent runs will benefit from significant speedups.\n",
+    "\n",
+    "To evaluate ThunderFX’s inference performance, we compare the execution time of the compiled model versus the standard PyTorch model:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ThunderFX Inference Time:\n",
+      "142 ms ± 1.47 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n",
+      "Torch Eager Inference Time:\n",
+      "159 ms ± 1.49 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "# Clears data to free some memory.\n", + "del thunderfx_result, eager_result\n", + "import gc\n", + "gc.collect()\n", + "torch.cuda.empty_cache()\n", + "\n", + "# Measures inference time\n", + "print(\"ThunderFX Inference Time:\")\n", + "%timeit r = compiled_model(x); torch.cuda.synchronize()\n", + "print(\"Torch Eager Inference Time:\")\n", + "%timeit r = model(x); torch.cuda.synchronize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Similarly, let’s measure the performance improvement for gradient calculations:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ThunderFX Gradient Calculation Time:\n", + "441 ms ± 10.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "Torch Eager Gradient Calculation Time:\n", + "480 ms ± 2.94 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "print(\"ThunderFX Gradient Calculation Time:\")\n", + "%timeit r = compiled_model(x); torch.autograd.grad(r.sum(), model.parameters()); torch.cuda.synchronize()\n", + "print(\"Torch Eager Gradient Calculation Time:\")\n", + "%timeit r = model(x); torch.autograd.grad(r.sum(), model.parameters()); torch.cuda.synchronize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Conclusion\n", + "\n", + "ThunderFX provides an efficient way to accelerate PyTorch programs, particularly for GPU workloads. By compiling functions and models, it reduces runtime for both inference and gradient computations. This tutorial demonstrated its usage and performance benefits using both simple functions and a real-world model." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/thunder_as_torch_compile_backend_tutorial.ipynb b/notebooks/thunder_as_torch_compile_backend_tutorial.ipynb deleted file mode 100644 index 25c5be02cc..0000000000 --- a/notebooks/thunder_as_torch_compile_backend_tutorial.ipynb +++ /dev/null @@ -1,308 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Thunder as torch.compile backend (ThunderFX) Tutorial\n", - "\n", - "In this tutorial, we’ll explore how to use Thunder as a backend for `torch.compile`, and demonstrate the tools to inspect the compiling process." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Introduction\n", - "\n", - "Starting with PyTorch2.0, the `torch.compile` feature introduces a powerful way to optimize and accelerate the PyTorch models. As its core, `torch.compile` relies on the following key components:\n", - "1. TorchDynamo - A Python-level tracing tool that transforms Python function calls into an intermediate representation(IR)\n", - "2. Backends - Systems that further process the IR, optimizing and executing the computational graph for better performance.\n", - "\n", - "While PyTorch provides several built-in backends such as \"inductor\" and \"cudagraphs\", it also supports custom backends that allow users to define their own optimization strategies. 
Thunder as a deep learning compiler can either be used on its own to accelerate model performance (see the [Thunder overview](https://lightning-thunder.readthedocs.io/en/latest/basic/overview.html) and other tutorials for more details) or also integrate with `torch.compile` as a backend. This is possible because TorchDynamo transforms the original Python code into new, optimized Python code that represents the same computation, which Thunder can directly process.\n", - "\n", - "For more information on `torch.compile`, we recommend reading PyTorch documentation and tutorials:\n", - "\n", - "1. Introduction to torch.compile - [Link](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html)\n", - "2. Docs of torch.compile - [Link](https://pytorch.org/docs/stable/generated/torch.compile.html)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Example Usage\n", - "\n", - "By simply specifying the `backend` argument as `ThunderCompiler`, we can seamlessly use `torch.compile` with Thunder as the backend." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/wayan/lightning-thunder/thunder/dynamo/compiler.py:21: UserWarning: The ThunderCompiler is in active development and may not work as expected. Please report any issues you encounter to the Lightning Thunder team.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([[ 2.0454, 0.9373, 1.3031, 1.6171],\n", - " [ 2.0270, 1.5227, 1.4768, 0.2534],\n", - " [-0.4196, 1.7928, 1.9140, 0.5584],\n", - " [ 1.9205, -0.8348, -0.0268, 2.0556]], device='cuda:0',\n", - " grad_fn=)\n" - ] - } - ], - "source": [ - "import torch\n", - "from thunder.dynamo import ThunderCompiler\n", - "\n", - "def foo(x, y):\n", - " a = torch.sin(x)\n", - " return a + torch.sinc(a) + torch.cos(y)\n", - "\n", - "# Create the ThunderCompiler backend\n", - "backend = ThunderCompiler()\n", - "# Pass the ThunderCompiler backend to torch.compile by using the backend argument.\n", - "opt_foo1 = torch.compile(foo, backend=backend)\n", - "# Run the compiled model as you normally would\n", - "print(opt_foo1(torch.randn(4, 4, requires_grad=True, device=\"cuda\"), torch.randn(4, 4, requires_grad=True, device=\"cuda\")))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Implementation Details and Debugging\n", - "\n", - "Now Let’s dive into the [FX graphs](https://pytorch.org/docs/stable/fx.html#torch.fx.Graph) generated by TorchDynamo and explore how Thunder processes them.\n", - "\n", - "##### Exploring FX Graphs Generated by TorchDynamo\n", - "\n", - "TorchDynamo transforms Python functions into FX graphs. It can segment computations into smaller subgraphs to handle dynamic behavior or unsupported operations, allowing parts of the code to fall back to native execution while optimizing supported segments. 
\n", - "\n", - "In our example, all operators in the `foo` function are supported, resulting in a single FX graph.\n", - "\n", - "**NOTE**: For more information about TorchDynamo, refer to the official [Dynamo overview](https://pytorch.org/docs/stable/torch.compiler_dynamo_overview.html)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TorchDynamo extracts 1 FX graphs\n", - "Graph 0:\n", - "GraphModule()\n", - "\n", - "\n", - "\n", - "def forward(self, L_x_ : torch.Tensor, L_y_ : torch.Tensor):\n", - " l_x_ = L_x_\n", - " l_y_ = L_y_\n", - " a = torch.sin(l_x_); l_x_ = None\n", - " sinc = torch.sinc(a)\n", - " add = a + sinc; a = sinc = None\n", - " cos = torch.cos(l_y_); l_y_ = None\n", - " add_1 = add + cos; add = cos = None\n", - " return (add_1,)\n", - " \n", - "# To see more debug info, please use `graph_module.print_readable()`\n", - "\n" - ] - } - ], - "source": [ - "subgraph_infos = backend.subgraph_infos\n", - "print(f\"TorchDynamo extracts {len(subgraph_infos)} FX graphs\")\n", - "for graph_id, subgraph_info in enumerate(subgraph_infos):\n", - " print(f\"Graph {graph_id}:\\n{subgraph_info.original_graph_module}\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### How `ThunderCompiler` Handles FX Graphs\n", - "\n", - "The `ThunderCompiler` serves as the backend for torch.compile, processing the FX graph generated by TorchDynamo. If the graph contains regions unsupported by Thunder, ThunderCompiler splits the FX graph into smaller subgraphs. To achieve this, it leverages the [split module pass](https://github.com/pytorch/pytorch/blob/main/torch/fx/passes/split_module.py) provided by `torch.fx` to customize the rules of how to split the FX graph. `ThunderCompiler` implements its own [callback function](https://github.com/Lightning-AI/lightning-thunder/blob/75ba590708178bfe61b7ec2ed2d579d9edb7daa9/thunder/dynamo/splitter.py#L101-L135) to:\n", - "1. Split the FX graph into supported subgraph that is compiled and executed by Thunder\n", - "2. Send unsupported subgraphs to alternative execution path -- PyTorch’s Inductor.\n", - "\n", - "Some common causes for graph splitting include:\n", - "1. Unsupported operators: when encounter operators that are not supported by Thunder\n", - "2. Compilation Errors: when exceptions occur while attempting to compile operators using Thunder. \n", - "\n", - "You can inspect the split reasons and review how the FX graph was split by accessing the `TorchCompiler.subgraph_infos` attribute.\n", - "\n", - "Note that ThunderCompiler accepts `thunder.jit` options as keyword arguments to customize the compilation of subgraphs executed by Thunder. Similarly, `torch_inductor_options` options can be specified for subgraphs executed by Inductor.\n", - "\n", - "In this example, the `sinc` operator is not yet supported by Thunder. As a result, the original FX graph is split into three parts. The first and third part is executed by Thunder. The second part contains the unsupported `sinc` operator and is executed by Inductor." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Thunder spliter splits the graph into 3 subgraphs, in which 2 subgraphs are run by Thunder\n", - "The structure of the split graph:\n", - "\n", - "GraphModule(\n", - " (thunder_0): ThunderModule(\n", - " (_model): GraphModule()\n", - " )\n", - " (inductor_1): OptimizedModule(\n", - " (_orig_mod): GraphModule()\n", - " )\n", - " (thunder_2): ThunderModule(\n", - " (_model): GraphModule()\n", - " )\n", - ")\n", - "\n", - "\n", - "\n", - "def forward(self, l_x_ : torch.Tensor, l_y_ : torch.Tensor):\n", - " thunder_0 = self.thunder_0(l_x_); l_x_ = None\n", - " inductor_1 = self.inductor_1(thunder_0)\n", - " thunder_2 = self.thunder_2(thunder_0, inductor_1, l_y_); thunder_0 = inductor_1 = l_y_ = None\n", - " return (thunder_2,)\n", - " \n", - "# To see more debug info, please use `graph_module.print_readable()`\n", - "Subgraph 0:\n", - "GraphModule()\n", - "\n", - "\n", - "\n", - "def forward(self, l_x_ : torch.Tensor):\n", - " a = torch.sin(l_x_); l_x_ = None\n", - " return a\n", - " \n", - "# To see more debug info, please use `graph_module.print_readable()`\n", - "\n", - "Subgraph 1:\n", - "GraphModule()\n", - "\n", - "\n", - "\n", - "def forward(self, a):\n", - " sinc = torch.sinc(a); a = None\n", - " return sinc\n", - " \n", - "# To see more debug info, please use `graph_module.print_readable()`\n", - "\n", - "Subgraph 2:\n", - "GraphModule()\n", - "\n", - "\n", - "\n", - "def forward(self, a, sinc, l_y_ : torch.Tensor):\n", - " add = a + sinc; a = sinc = None\n", - " cos = torch.cos(l_y_); l_y_ = None\n", - " add_1 = add + cos; add = cos = None\n", - " return add_1\n", - " \n", - "# To see more debug info, please use `graph_module.print_readable()`\n", - "\n" - ] - } - ], - "source": [ - "subgraph_info = subgraph_infos[0]\n", - "num_of_submodules = len(subgraph_info.submodule_to_compiled_functions)\n", - "num_of_thunder_modules = len(subgraph_info.thunder_compiled_fns)\n", - "print(f\"Thunder spliter splits the graph into {num_of_submodules} subgraphs, in which {num_of_thunder_modules} subgraphs are run by Thunder\")\n", - "print(\"The structure of the split graph:\\n\")\n", - "print(subgraph_info.split_graph_module)\n", - "\n", - "for subgraph_id, (original_graph, compiled_graph) in enumerate(subgraph_info.submodule_to_compiled_functions.items()):\n", - " print(f\"Subgraph {subgraph_id}:\\n{original_graph}\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To inspect why the original graph is split, we can print the split reasons:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Split reason 0:\n", - "SplitReason(reason_type=, info='node with name: sinc and target: only has an automatic torch fallback in thunder.', exception=None)\n", - "\n" - ] - } - ], - "source": [ - "for reason_id, split_reason in enumerate(subgraph_info.split_reasons):\n", - " print(f\"Split reason {reason_id}:\\n{split_reason}\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To inspect the Thunder trace of each subgraph, we can use the `thunder.last_traces` and `thunder.last_backward_traces` on the compiled module as usual:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import thunder\n", - "\n", - "for subgraph_id, 
thunder_module in enumerate(subgraph_info.thunder_compiled_fns):\n", - " print(f\"Subgraph {subgraph_id}:\")\n", - " print(f\"Forward trace:\\n{thunder.last_traces(thunder_module)[-1]}\\n\")\n", - " print(f\"Backward trace:\\n{thunder.last_backward_traces(thunder_module)[-1]}\\n\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}