Commit

Merge branch 'main' into tom/line-breaks
t-vi authored Mar 22, 2024
2 parents fe131af + 5f6d3d3 commit 7fb90bd
Showing 30 changed files with 818 additions and 743 deletions.
24 changes: 24 additions & 0 deletions .azure/gpu-tests.yml
@@ -65,8 +65,14 @@ jobs:
# drop pt from requirements so not to interfere with the existing one
bash .azure/remove-torch-lines.sh requirements/base.txt
cat requirements/base.txt
# double check on test requirements
pip install -r requirements/test.txt
# https://docs.codecov.com/docs/codecov-uploader
curl -Os https://uploader.codecov.io/latest/linux/codecov
chmod +x codecov
# install this package
python setup.py develop
displayName: 'Install package & ...'
@@ -85,6 +91,12 @@ jobs:
--durations=250 \
--numprocesses=9 \
--ignore=thunder/tests/distributed --ignore=thunder/tests/test_networks.py
# compile coverage results
python -m coverage report
python -m coverage xml
# upload to codecov
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure
condition: ne(variables['testing'], 'distributed')
displayName: 'Testing: regular'
@@ -95,6 +107,12 @@ jobs:
thunder/tests/test_networks.py \
-m "not standalone" \
-v --random-order-seed=42 --durations=0 --numprocesses=3
# compile coverage results
python -m coverage report
python -m coverage xml
# upload to codecov
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,networks --name="GPU-coverage" --env=linux,azure
condition: ne(variables['testing'], 'distributed')
displayName: 'Testing: networks'
@@ -108,6 +126,12 @@ jobs:
- bash: |
# run all found tests in given past as standalone
bash scripts/run_standalone_tests.sh "thunder/tests/distributed"
# compile coverage results
python -m coverage report
python -m coverage xml
# upload to codecov
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,distributed --name="GPU-coverage" --env=linux,azure
condition: eq(variables['testing'], 'distributed')
displayName: 'Testing: distributed'
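The steps above now finish each GPU test run by consolidating coverage data and uploading it with the standalone Codecov uploader fetched in the install step. A minimal local sketch of that flow, assuming a `.coverage` file already exists in the working directory and `CODECOV_TOKEN` is exported (both assumptions, not taken verbatim from the pipeline):

```bash
# Summarize and export coverage results, as the CI steps above do.
python -m coverage report
python -m coverage xml

# Download the standalone Codecov uploader and send coverage.xml,
# tagged the same way as the "regular" GPU job.
curl -Os https://uploader.codecov.io/latest/linux/codecov
chmod +x codecov
./codecov --token="${CODECOV_TOKEN}" --flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure
```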
5 changes: 4 additions & 1 deletion .github/PULL_REQUEST_TEMPLATE.md
@@ -1,10 +1,13 @@
# Before submitting
<details>
<summary><b>Before submitting</b></summary>

- [ ] Was this discussed/approved via a Github issue? (no need for typos and docs improvements)
- [ ] Did you read the [contributor guideline](https://github.com/Lightning-AI/pytorch-lightning/blob/main/.github/CONTRIBUTING.md), Pull Request section?
- [ ] Did you make sure to update the docs?
- [ ] Did you write any new necessary tests?

</details>

## What does this PR do?

Fixes # (issue).
18 changes: 9 additions & 9 deletions .github/workflows/ci-testing.yml
@@ -114,15 +114,15 @@ jobs:
coverage report
coverage xml
#- name: Upload coverage to Codecov
# uses: codecov/codecov-action@v3
# with:
# token: ${{ secrets.CODECOV_TOKEN }}
# file: ./coverage.xml
# flags: unittests
# env_vars: OS,PYTHON
# name: codecov-umbrella
# fail_ci_if_error: false
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
file: ./coverage.xml
flags: unittests
env_vars: OS,PYTHON
name: codecov-umbrella
fail_ci_if_error: false


testing-guardian:
6 changes: 3 additions & 3 deletions .github/workflows/docs-build.yml
@@ -14,7 +14,7 @@ defaults:
shell: bash

jobs:
build-docs:
docs-make:
uses: Lightning-AI/utilities/.github/workflows/check-docs.yml@v0.11.0
with:
python-version: "3.10"
@@ -28,7 +28,7 @@ jobs:
env:
GCP_TARGET: "gs://lightning-docs-thunder"
steps:
- uses: actions/download-artifact@v4
- uses: actions/download-artifact@v3
with:
name: docs-html-${{ github.sha }}
path: docs/build/
@@ -50,7 +50,7 @@ jobs:

# Uploading docs to GCS, so they can be served on lightning.ai
- name: Upload docs/thunder/latest to GCS 🪣
if: github.ref == 'refs/heads/master'
if: github.ref == 'refs/heads/main'
run: gsutil -m rsync -d -R docs/build/html/ ${GCP_TARGET}/latest

# Uploading docs to GCS, so they can be served on lightning.ai
3 changes: 3 additions & 0 deletions .pre-commit-config.yaml
@@ -17,6 +17,9 @@ repos:
- id: check-toml
- id: check-json
- id: check-added-large-files
with:
maxkb: 250
enforce-all: true
- id: check-docstring-first
- id: detect-private-key

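The `check-added-large-files` hook is now configured to reject any file above 250 kB, enforced across all files. To exercise just that hook locally, a rough sketch (assumes pre-commit is installed and its hooks are set up in this clone):

```bash
# Run only the large-file check over every tracked file in the repository.
pre-commit run check-added-large-files --all-files
```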
3 changes: 2 additions & 1 deletion README.md
@@ -1,5 +1,6 @@
<div align="center">
<img alt="Thunder" src="docs/source/_static/images/lightning_thunder_lightmode_nobyline.png" width="400px" style="max-width: 100%;">
<img alt="Thunder" src="docs/source/_static/images/LightningThunderLightModewByline.png#gh-light-mode-only" width="400px" style="max-width: 100%;">
<img alt="Thunder" src="docs/source/_static/images/LightningThunderDarkModewByline.png#gh-dark-mode-only" width="400px" style="max-width: 100%;">
<br/>
<br/>

30 changes: 21 additions & 9 deletions dockers/ubuntu-cuda/Dockerfile
@@ -24,6 +24,7 @@ ARG CUDNN_FRONTEND_CHECKOUT="v1.1.0"
ARG PYTHON_VERSION="3.10"
ARG TORCH_VERSION="2.2.1"
ARG TRITON_VERSION="2.2.0"
ARG TORCH_INSTALL="stable"

SHELL ["/bin/bash", "-c"]
# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/
@@ -96,7 +97,7 @@ ENV \
TORCH_CUDA_ARCH_LIST="8.0" \
CUDA_SELECT_NVCC_ARCH_FLAGS="8.0"

ARG TORCH_INSTALL="wheel"
ARG TORCH_INSTALL

RUN \
if [ "${TORCH_INSTALL}" == "source" ]; then \
@@ -122,15 +123,26 @@ RUN \
--index-url="https://download.pytorch.org/whl/cu${CUDA_VERSION_MM//'.'/''}"; \
fi

ARG TORCH_INSTALL

RUN \
# building nvFuser from source
git clone https://github.com/NVIDIA/Fuser.git && \
cd Fuser && \
git submodule update --init --recursive && \
pip install -r requirements.txt && \
python setup.py install --no-test --no-benchmark && \
cd .. && \
rm -rf Fuser
if [ "${TORCH_INSTALL}" == "source" ]; then \
# building nvFuser from source
git clone https://github.com/NVIDIA/Fuser.git && \
cd Fuser && \
git submodule update --init --recursive && \
pip install -r requirements.txt && \
python setup.py install --no-test --no-benchmark && \
cd .. && \
rm -rf Fuser ; \
elif [ "${TORCH_INSTALL}" == "test" ]; then \
echo "Not supported option" ; \
else \
# installing pytorch from wheels \
CUDA_VERSION_MM=${CUDA_VERSION%.*} && \
TORCH_VERSION_MM=${TORCH_VERSION%.*} && \
pip install -U "nvfuser-cu${CUDA_VERSION_MM/./}-torch${TORCH_VERSION_MM/./}" ; \
fi

RUN \
ls -lh requirements/ && \
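With `TORCH_INSTALL` promoted to a top-level build argument and reused in the nvFuser step, one Dockerfile now covers both the from-source build and the wheel-based install. A hypothetical local build, illustrative only (the image tags are made up; only the Dockerfile path and the build argument come from the diff above):

```bash
# Build with PyTorch and nvFuser compiled from source.
docker build -f dockers/ubuntu-cuda/Dockerfile \
  --build-arg TORCH_INSTALL="source" \
  -t lightning-thunder:cuda-source .

# Build with the prebuilt PyTorch and nvFuser wheels (the default, TORCH_INSTALL="stable").
docker build -f dockers/ubuntu-cuda/Dockerfile \
  --build-arg TORCH_INSTALL="stable" \
  -t lightning-thunder:cuda-wheels .
```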
Binary file modified docs/source/_static/images/training_throughput_single.png
7 changes: 6 additions & 1 deletion docs/source/conf.py
@@ -92,7 +92,7 @@ def _transform_changelog(path_in: str, path_out: str) -> None:
"sphinx.ext.linkcode",
"sphinx.ext.autosummary",
"sphinx.ext.napoleon",
"sphinx.ext.imgmath",
"sphinx.ext.mathjax",
"myst_parser",
"nbsphinx",
"sphinx_autodoc_typehints",
@@ -209,6 +209,11 @@ def _transform_changelog(path_in: str, path_out: str) -> None:
(master_doc, project + ".tex", project + " Documentation", author, "manual"),
]

# MathJax configuration
mathjax3_config = {
"tex": {"packages": {"[+]": ["ams", "newcommand", "configMacros"]}},
}

# -- Options for manual page output ------------------------------------------

# One entry per manual page. List of tuples
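Replacing `sphinx.ext.imgmath` with `sphinx.ext.mathjax` renders equations in the browser instead of as pre-generated images, and the new `mathjax3_config` loads the `ams`, `newcommand`, and `configMacros` TeX packages. A quick local check of the rendered docs, sketched under the assumption that the docs requirements live at `requirements/docs.txt` (a path not shown in this diff):

```bash
# Rebuild the HTML docs and inspect a page containing math in a browser.
pip install -r requirements/docs.txt   # assumption: docs requirements path
sphinx-build -b html docs/source docs/build/html
```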
2 changes: 1 addition & 1 deletion notebooks/dev_tutorials/fsdp_tutorial.ipynb
@@ -1764,7 +1764,7 @@
"%%writefile thunder_fsdp_simple_example.py\n",
"\n",
"# imports\n",
"from thunder.tests.lit_gpt_model import GPT, Config\n",
"from thunder.tests.litgpt_model import GPT, Config\n",
"import torch\n",
"import torch.distributed\n",
"import thunder\n",
18 changes: 9 additions & 9 deletions notebooks/zero_to_thunder.ipynb
@@ -312,8 +312,8 @@
}
],
"source": [
"from lit_gpt import GPT\n",
"from thunder.tests.lit_gpt_model import Config\n",
"from litgpt import GPT\n",
"from thunder.tests.litgpt_model import Config\n",
"cfg = Config.from_name('Llama-2-7b-hf')\n",
"cfg.n_layer = 16 # fewer layers\n",
"torch.set_default_dtype(torch.bfloat16)\n",
@@ -3326,7 +3326,7 @@
],
"source": [
"%%writefile zero_to_thunder_fsdp_simple_example.py\n",
"from thunder.tests.lit_gpt_model import GPT, Config\n",
"from thunder.tests.litgpt_model import GPT, Config\n",
"import os\n",
"import torch, torch.distributed\n",
"import thunder, thunder.distributed\n",
@@ -3470,7 +3470,7 @@
},
"outputs": [],
"source": [
"import lit_gpt\n",
"import litgpt\n",
"def apply_rope_copy(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:\n",
" head_size = x.size(-1)\n",
" x1 = x[..., : head_size // 2] # (B, nh, T, hs/2)\n",
@@ -3493,7 +3493,7 @@
"\n",
"Say we have a function `apply_rope` applying the RoPE transformation in PyTorch.\n",
"\n",
"In thunder, we define a *meta* function that only defines the metadata (like shapes) of outputs and the actual implementation for each operator and then register the pair with our executor using the `register_operator` function and tell it to use the new symbol instead of the original function `lit_gpt.model.apply_rope`.\n"
"In thunder, we define a *meta* function that only defines the metadata (like shapes) of outputs and the actual implementation for each operator and then register the pair with our executor using the `register_operator` function and tell it to use the new symbol instead of the original function `litgpt.model.apply_rope`.\n"
]
},
{
@@ -3504,17 +3504,17 @@
"outputs": [],
"source": [
"import torch, thunder\n",
"from thunder.tests.lit_gpt_model import GPT\n",
"from thunder.tests.litgpt_model import GPT\n",
"from thunder import TensorProxy\n",
"\n",
"def apply_rope_impl(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:\n",
" return lit_gpt.model.apply_rope(x, cos, sin)\n",
" return litgpt.model.apply_rope(x, cos, sin)\n",
"\n",
"def apply_rope_meta(x: TensorProxy, cos: TensorProxy, sin: TensorProxy) -> TensorProxy:\n",
" return TensorProxy(like=x)\n",
"\n",
"apply_rope = my_ex.register_operator('apply_rope', like=apply_rope_meta, fn=apply_rope_impl,\n",
" replaces=lit_gpt.model.apply_rope)"
" replaces=litgpt.model.apply_rope)"
]
},
{
@@ -3569,7 +3569,7 @@
"with torch.device('cuda'): m = GPT.from_name('llama2-like'); Q = torch.randn(2, 128, 4096, 16)\n",
"\n",
"def test_apply_rope(x, m):\n",
" return lit_gpt.model.apply_rope(x, m.cos, m.sin)\n",
" return litgpt.model.apply_rope(x, m.cos, m.sin)\n",
"\n",
"thunder_apply_rope = thunder.jit(test_apply_rope, executors=(my_ex,) + thunder.get_default_executors()) \n",
"\n",
2 changes: 2 additions & 0 deletions requirements/notebooks.txt
@@ -1 +1,3 @@
ipython[all] ==8.22.2

litgpt @ git+https://github.com/Lightning-AI/lit-gpt@24d5eba1724c953b7506edc041a7da1ce226c129
2 changes: 1 addition & 1 deletion requirements/test.txt
@@ -11,7 +11,7 @@ expecttest ==0.2.1 # for test_ddp.py
hypothesis ==6.99.10 # for test_ddp.py
numpy # for test_ops.py
einops # for test_einops.py
lit_gpt @ git+https://github.com/Lightning-AI/lit-gpt@f241d94df59d82b2017bfdcd3800ac8779eb45f5
litgpt @ git+https://github.com/Lightning-AI/lit-gpt@24d5eba1724c953b7506edc041a7da1ce226c129
absl-py # thunder/benchmarks/test_benchmark_litgpt.py
pandas # thunder/benchmarks/test_benchmark_litgpt.py
xlsxwriter # thunder/benchmarks/test_benchmark_litgpt.py
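Both the notebook and the test requirements now point at the renamed `litgpt` package, pinned to the same commit. Installing just that dependency as a standalone check (URL and revision taken from the diff above):

```bash
# Install the pinned litgpt revision used by requirements/test.txt and requirements/notebooks.txt.
pip install "litgpt @ git+https://github.com/Lightning-AI/lit-gpt@24d5eba1724c953b7506edc041a7da1ce226c129"
```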