Lingwave · bryce13950 · Mar 30, 2023 · Mar 30, 2023 · Apr 1, 2023 · Apr 1, 2023
diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
@@ -33,7 +33,7 @@ permissions:
 
 jobs:
   checks:
-    name: Checks
+    name: Code Checks
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -67,3 +67,27 @@ jobs:
       #   run: poetry run mypy transformer_lens
       - name: Build check
         run: poetry build
+  docs:  # separate job from `checks`; runs `make documentation-test`
+    name: Documentation Checks
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version:
+          - "3.9"  # single-entry matrix; quoted so it stays a string
+    steps:
+      - uses: actions/checkout@v3
+      - name: Install Poetry  # NOTE(review): presumably first so `cache: "poetry"` below can find Poetry - confirm
+        uses: snok/install-poetry@v1
+        with:
+          version: 1.4.0  # pinned Poetry release
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "poetry"
+      - name: Install dependencies
+        run: |
+          poetry lock --check
+          poetry install --with dev
+      - name: Documentation test
+        run: make documentation-test
diff --git a/demos/Activation_Patching_in_TL_Demo.ipynb b/demos/Activation_Patching_in_TL_Demo.ipynb
@@ -43,24 +43,27 @@
     }
    ],
    "source": [
-    "# Janky code to do different setup when run in a Colab notebook vs VSCode\n",
+    "import os\n",
+    "\n",
+    "IN_COLAB = 'google.colab' in str(get_ipython())\n",
+    "IN_GITHUB = os.getenv(\"GITHUB_ACTIONS\") == \"true\"\n",
     "DEBUG_MODE = False\n",
-    "try:\n",
-    "    import google.colab\n",
-    "    IN_COLAB = True\n",
+    "DO_SLOW_RUNS = not IN_GITHUB\n",
+    "\n",
+    "if IN_COLAB or IN_GITHUB:\n",
     "    print(\"Running as a Colab notebook\")\n",
     "    %pip install git+https://github.com/neelnanda-io/TransformerLens.git\n",
     "    # Install my janky personal plotting utils\n",
     "    %pip install git+https://github.com/neelnanda-io/neel-plotly.git\n",
-    "except:\n",
-    "    IN_COLAB = False\n",
+    "else:\n",
     "    print(\"Running as a Jupyter notebook - intended for development only!\")\n",
     "    from IPython import get_ipython\n",
     "\n",
     "    ipython = get_ipython()\n",
     "    # Code to automatically update the HookedTransformer code as its edited without restarting the kernel\n",
     "    ipython.magic(\"load_ext autoreload\")\n",
-    "    ipython.magic(\"autoreload 2\")"
+    "    ipython.magic(\"autoreload 2\")\n",
+    "    "
    ]
   },
   {
@@ -319,7 +322,7 @@
    "outputs": [],
    "source": [
     "# Whether to do the runs by head and by position, which are much slower\n",
-    "DO_SLOW_RUNS = True"
+    "# DO_SLOW_RUNS = False"
    ]
   },
   {

diff --git a/demos/Attribution_Patching_Demo.ipynb b/demos/Attribution_Patching_Demo.ipynb
diff --git a/demos/Exploratory_Analysis_Demo.ipynb b/demos/Exploratory_Analysis_Demo.ipynb
@@ -75,18 +75,20 @@
                 }
             ],
             "source": [
-                "# Janky code to do different setup when run in a Colab notebook vs VSCode\n",
+                "import os\n",
+                "\n",
+                "IN_COLAB = 'google.colab' in str(get_ipython())\n",
+                "IN_GITHUB = os.getenv(\"GITHUB_ACTIONS\") == \"true\"\n",
                 "DEBUG_MODE = False\n",
-                "try:\n",
-                "    import google.colab\n",
-                "    IN_COLAB = True\n",
+                "DO_SLOW_RUNS = not IN_GITHUB\n",
+                "\n",
+                "if IN_COLAB or IN_GITHUB:\n",
                 "    print(\"Running as a Colab notebook\")\n",
                 "    %pip install git+https://github.com/neelnanda-io/TransformerLens.git\n",
                 "    # Install another version of node that makes PySvelte work way faster\n",
                 "    !curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash -; sudo apt-get install -y nodejs\n",
                 "    %pip install git+https://github.com/neelnanda-io/PySvelte.git\n",
-                "except:\n",
-                "    IN_COLAB = False\n",
+                "else:\n",
                 "    print(\"Running as a Jupyter notebook - intended for development only!\")\n",
                 "    from IPython import get_ipython\n",
                 "\n",
@@ -188,6 +190,17 @@
                 "torch.set_grad_enabled(False)"
             ]
         },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {},
+            "outputs": [],
+            "source": [
+                "# This variable needs to be used instead of allowing the default\n",
+                "# Any calls to .cuda() need to be .to(device) to allow for your notebook to be compatible with github CI\n",
+                "device = \"cuda\" if torch.cuda.is_available() else \"cpu\""
+            ]
+        },
         {
             "cell_type": "markdown",
             "metadata": {},
@@ -336,6 +349,7 @@
                 "    center_writing_weights=True,\n",
                 "    fold_ln=True,\n",
                 "    refactor_factored_attn_matrices=True,\n",
+                "    device=device\n",
                 ")"
             ]
         },
@@ -478,7 +492,8 @@
                 "        )\n",
                 "        # Insert the *incorrect* answer to the prompt, making the correct answer the indirect object.\n",
                 "        prompts.append(prompt_format[i].format(answers[-1][1]))\n",
-                "answer_tokens = torch.tensor(answer_tokens).cuda()\n",
+                "answer_tokens = torch.tensor(answer_tokens).to(device)\n",
+                "\n",
                 "print(prompts)\n",
                 "print(answers)"
             ]
@@ -518,7 +533,7 @@
             "source": [
                 "tokens = model.to_tokens(prompts, prepend_bos=True)\n",
-                "# Move the tokens to the GPU\n",
-                "tokens = tokens.cuda()\n",
+                "# Move the tokens to the selected device (GPU when available, CPU otherwise)\n",
+                "tokens = tokens.to(device)\n",
                 "# Run the model and cache all activations\n",
                 "original_logits, cache = model.run_with_cache(tokens)"
             ]
@@ -1004,6 +1019,10 @@
                 "    local_cache: Optional[ActivationCache]=None, \n",
                 "    local_tokens: Optional[torch.Tensor]=None, \n",
                 "    title: str=\"\"):\n",
+                "    \n",
+                "    if IN_GITHUB:\n",
+                "        return\n",
+                "    \n",
                 "    # Heads are given as a list of integers or a single integer in [0, n_layers * n_heads)\n",
                 "    if isinstance(heads, int):\n",
                 "        heads = [heads]\n",
@@ -1368,7 +1387,7 @@
                 "    # 0 means zero change, negative means actively made worse, 1 means totally recovered clean performance, >1 means actively *improved* on clean performance\n",
                 "    return (patched_logit_diff - corrupted_average_logit_diff)/(original_average_logit_diff - corrupted_average_logit_diff)\n",
                 "\n",
-                "patched_residual_stream_diff = torch.zeros(model.cfg.n_layers, tokens.shape[1], device=\"cuda\", dtype=torch.float32)\n",
+                "patched_residual_stream_diff = torch.zeros(model.cfg.n_layers, tokens.shape[1], device=device, dtype=torch.float32)\n",
                 "for layer in range(model.cfg.n_layers):\n",
                 "    for position in range(tokens.shape[1]):\n",
                 "        hook_fn = partial(patch_residual_component, pos=position, clean_cache=cache)\n",
@@ -1465,8 +1484,8 @@
             "metadata": {},
             "outputs": [],
             "source": [
-                "patched_attn_diff = torch.zeros(model.cfg.n_layers, tokens.shape[1], device=\"cuda\", dtype=torch.float32)\n",
-                "patched_mlp_diff = torch.zeros(model.cfg.n_layers, tokens.shape[1], device=\"cuda\", dtype=torch.float32)\n",
+                "patched_attn_diff = torch.zeros(model.cfg.n_layers, tokens.shape[1], device=device, dtype=torch.float32)\n",
+                "patched_mlp_diff = torch.zeros(model.cfg.n_layers, tokens.shape[1], device=device, dtype=torch.float32)\n",
                 "for layer in range(model.cfg.n_layers):\n",
                 "    for position in range(tokens.shape[1]):\n",
                 "        hook_fn = partial(patch_residual_component, pos=position, clean_cache=cache)\n",
@@ -1639,7 +1658,7 @@
                 "    return corrupted_head_vector\n",
                 "\n",
                 "\n",
-                "patched_head_z_diff = torch.zeros(model.cfg.n_layers, model.cfg.n_heads, device=\"cuda\", dtype=torch.float32)\n",
+                "patched_head_z_diff = torch.zeros(model.cfg.n_layers, model.cfg.n_heads, device=device, dtype=torch.float32)\n",
                 "for layer in range(model.cfg.n_layers):\n",
                 "    for head_index in range(model.cfg.n_heads):\n",
                 "        hook_fn = partial(patch_head_vector, head_index=head_index, clean_cache=cache)\n",
@@ -1737,7 +1756,7 @@
             "metadata": {},
             "outputs": [],
             "source": [
-                "patched_head_v_diff = torch.zeros(model.cfg.n_layers, model.cfg.n_heads, device=\"cuda\", dtype=torch.float32)\n",
+                "patched_head_v_diff = torch.zeros(model.cfg.n_layers, model.cfg.n_heads, device=device, dtype=torch.float32)\n",
                 "for layer in range(model.cfg.n_layers):\n",
                 "    for head_index in range(model.cfg.n_heads):\n",
                 "        hook_fn = partial(patch_head_vector, head_index=head_index, clean_cache=cache)\n",
@@ -1899,7 +1918,7 @@
                 "    corrupted_head_pattern[:, head_index, :, :] = clean_cache[hook.name][:, head_index, :, :]\n",
                 "    return corrupted_head_pattern\n",
                 "\n",
-                "patched_head_attn_diff = torch.zeros(model.cfg.n_layers, model.cfg.n_heads, device=\"cuda\", dtype=torch.float32)\n",
+                "patched_head_attn_diff = torch.zeros(model.cfg.n_layers, model.cfg.n_heads, device=device, dtype=torch.float32)\n",
                 "for layer in range(model.cfg.n_layers):\n",
                 "    for head_index in range(model.cfg.n_heads):\n",
                 "        hook_fn = partial(patch_head_pattern, head_index=head_index, clean_cache=cache)\n",
@@ -2491,25 +2510,25 @@
                 "seq_len = 100\n",
                 "batch_size = 2\n",
                 "\n",
-                "prev_token_scores = torch.zeros((model.cfg.n_layers, model.cfg.n_heads), device=\"cuda\")\n",
+                "prev_token_scores = torch.zeros((model.cfg.n_layers, model.cfg.n_heads), device=device)\n",
                 "def prev_token_hook(pattern, hook):\n",
                 "    layer = hook.layer()\n",
                 "    diagonal = pattern.diagonal(offset=1, dim1=-1, dim2=-2)\n",
                 "    # print(diagonal)\n",
                 "    # print(pattern)\n",
                 "    prev_token_scores[layer] = einops.reduce(diagonal, \"batch head_index diagonal -> head_index\", \"mean\")\n",
-                "duplicate_token_scores = torch.zeros((model.cfg.n_layers, model.cfg.n_heads), device=\"cuda\")\n",
+                "duplicate_token_scores = torch.zeros((model.cfg.n_layers, model.cfg.n_heads), device=device)\n",
                 "def duplicate_token_hook(pattern, hook):\n",
                 "    layer = hook.layer()\n",
                 "    diagonal = pattern.diagonal(offset=seq_len, dim1=-1, dim2=-2)\n",
                 "    duplicate_token_scores[layer] = einops.reduce(diagonal, \"batch head_index diagonal -> head_index\", \"mean\")\n",
-                "induction_scores = torch.zeros((model.cfg.n_layers, model.cfg.n_heads), device=\"cuda\")\n",
+                "induction_scores = torch.zeros((model.cfg.n_layers, model.cfg.n_heads), device=device)\n",
                 "def induction_hook(pattern, hook):\n",
                 "    layer = hook.layer()\n",
                 "    diagonal = pattern.diagonal(offset=seq_len-1, dim1=-1, dim2=-2)\n",
                 "    induction_scores[layer] = einops.reduce(diagonal, \"batch head_index diagonal -> head_index\", \"mean\")\n",
                 "original_tokens = torch.randint(100, 20000, size=(batch_size, seq_len))\n",
-                "repeated_tokens = einops.repeat(original_tokens, \"batch seq_len -> batch (2 seq_len)\").cuda()\n",
+                "repeated_tokens = einops.repeat(original_tokens, \"batch seq_len -> batch (2 seq_len)\").to(device)\n",
                 "\n",
                 "pattern_filter = lambda act_name: act_name.endswith(\"hook_attn\")\n",
                 "loss = model.run_with_hooks(repeated_tokens, return_type=\"loss\", fwd_hooks=[(pattern_filter, prev_token_hook), (pattern_filter, duplicate_token_hook), (pattern_filter, induction_hook)])\n",
@@ -2912,7 +2931,7 @@
             "name": "python",
             "nbconvert_exporter": "python",
             "pygments_lexer": "ipython3",
-            "version": "3.9.14"
+            "version": "3.8.10"
         },
         "vscode": {
             "interpreter": {