+++
title = 'TinyGrad GPT2 à la Karpathy - Part 2'
date = 2024-06-11T16:25:35+02:00
draft = false
+++

As expected, this part of the series has been more difficult. Karpathy makes use of lots of relatively niche, PyTorch-specific functions that don't have built-in support in TinyGrad. As a result, I need to be more methodical about how I approach this and rigorously confirm that things are working as expected.
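
To keep myself honest while porting, I want a way to spot-check outputs against a known-good reference. Something like the following rough sketch (it assumes the `GPT2` class and imports from the listing below, plus `torch` and `transformers` installed purely to provide the Hugging Face reference):

```python
# Rough sanity check: compare our tinygrad GPT-2 logits against the HF reference.
# Assumes the `GPT2` class and imports from the listing below; `torch` and
# `transformers` are only needed here, for the reference implementation.
import numpy as np
import torch
from transformers import GPT2LMHeadModel

prompt_ids = [15496, 11, 314, 1101]                  # arbitrary GPT-2 token ids

ours, _ = GPT2.build('gpt2')(Tensor([prompt_ids]))   # (1, T, vocab_size) logits
ref_model = GPT2LMHeadModel.from_pretrained('gpt2')
with torch.no_grad():
    ref = ref_model(torch.tensor([prompt_ids])).logits.numpy()

# Kernels and accumulation order differ, so don't expect bit-for-bit equality.
print("max logit diff:", np.abs(ours.numpy() - ref).max())
print("same next-token argmax:", int(ours.numpy()[0, -1].argmax()) == int(ref[0, -1].argmax()))
```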

I'll start with the code we left off on in part 1.

```python
from tinygrad import Tensor, dtypes
from tinygrad.nn import Embedding, Linear, LayerNorm
from tinygrad.nn.state import torch_load, load_state_dict, get_state_dict, get_parameters
from tqdm import tqdm, trange
from tinygrad.nn.optim import AdamW
from dataclasses import dataclass
from tinygrad.helpers import fetch
import tiktoken
import numpy as np
import os
import matplotlib.pyplot as plt

@dataclass
class GPT2Config:
    block_size: int = 1024
    vocab_size: int = 50257
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    norm_eps: float = 1e-5

@dataclass
class GPT2Small(GPT2Config):
    pass

@dataclass
class GPT2Medium(GPT2Config):
    n_layer: int = 24
    n_head: int = 16
    n_embd: int = 1024

@dataclass
class GPT2Large(GPT2Config):
    n_layer: int = 36
    n_head: int = 20
    n_embd: int = 1280

@dataclass
class GPT2XL(GPT2Config):
    n_layer: int = 48
    n_head: int = 25
    n_embd: int = 1600

MODEL_CONFIGS = {
    'gpt2': GPT2Small,
    'gpt2-medium': GPT2Medium,
    'gpt2-large': GPT2Large,
    'gpt2-xl': GPT2XL
}

class MLP:
    def __init__(self, config: GPT2Config):
        self.c_fc = Linear(config.n_embd, config.n_embd*4)
        self.c_proj = Linear(config.n_embd*4, config.n_embd)

    def __call__(self, x):
        x = self.c_fc(x).gelu()
        x = self.c_proj(x)
        return x

class Attention:
    def __init__(self, config: GPT2Config):
        self.config = config
        self.c_attn = Linear(config.n_embd, config.n_embd*3)
        self.c_proj = Linear(config.n_embd, config.n_embd)

    def __call__(self, x):
        B, T, C = x.shape

        q, k, v = self.c_attn(x).split(C, dim=-1)  # (B,T,3C) -> (B,T,C) x 3
        split_heads = lambda x: x.view(B, T, self.config.n_head, self.config.n_embd//self.config.n_head).transpose(1, 2)
        q, k, v = map(split_heads, (q, k, v))

        y = q.scaled_dot_product_attention(k, v, is_causal=True)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)

        return y

class TransformerBlock:
    def __init__(self, config: GPT2Config):
        self.ln_1 = LayerNorm(config.n_embd, eps=config.norm_eps)
        self.ln_2 = LayerNorm(config.n_embd, eps=config.norm_eps)
        self.attn = Attention(config)
        self.mlp = MLP(config)

    def __call__(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

class GPT2:
    def __init__(self, config: GPT2Config = GPT2Small):
        self.config = config

        self.wte = Embedding(config.vocab_size, config.n_embd)
        self.wpe = Embedding(config.block_size, config.n_embd)
        self.h = [TransformerBlock(config) for _ in range(config.n_layer)]
        self.ln_f = LayerNorm(config.n_embd, config.norm_eps)
        self.lm_head = Linear(config.n_embd, config.vocab_size, bias=False)

        # tie weights - HUGE SAVINGS
        self.lm_head.weight = self.wte.weight

    def __call__(self, idx, targets=None):
        B, T = idx.shape

        assert T <= self.config.block_size, f"Cannot forward, model block size is {self.config.block_size} but got sequence of length {T}"
        pos = Tensor.arange(0, T, dtype=dtypes.long)  # (T,)
        pos_emb = self.wpe(pos)  # (T,) -> (T,C)
        tok_emb = self.wte(idx)  # (B,T) -> (B,T,C)

        x = tok_emb + pos_emb
        x = x.sequential(self.h)

        x = self.ln_f(x)
        logits = self.lm_head(x)  # (B,T,C) -> (B,T,V)

        if targets is not None:
            loss = logits.flatten(0, 1).sparse_categorical_crossentropy(targets.flatten())
            return logits, loss.realize()

        return logits, None

    @staticmethod
    def build(MODEL_NAME):
        weights = torch_load(fetch(f'https://huggingface.co/{MODEL_NAME}/resolve/main/pytorch_model.bin'))

        transposed = ('attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight')
        for k in weights:
            if k.endswith(transposed):
                weights[k] = weights[k].T

        weights['lm_head.weight'] = weights['wte.weight']
        model = GPT2(MODEL_CONFIGS[MODEL_NAME])
        load_state_dict(model, weights)

        return model

class DataLoaderLite:
    def __init__(self, B, T, file_path):
        self.B = B
        self.T = T

        self.batch = lambda x: x.view(B, T)

        with open(file_path, 'r') as f:
            text = f.read()

        enc = tiktoken.get_encoding('gpt2')

        tokens = enc.encode(text)
        self.tokens = Tensor(tokens, dtype=dtypes.long)

        print(f"loaded {len(self.tokens)} tokens")
        print(f"1 epoch = {len(self.tokens) // (B*T)} batches")

        self.current_position = 0

    def next_batch(self):
        B, T = self.B, self.T

        buf = self.tokens[self.current_position:self.current_position + B*T+1]
        x = self.batch(buf[:-1])
        y = self.batch(buf[1:])
        self.current_position += B*T

        if self.current_position + (B*T+1) > len(self.tokens):
            print("read entire document, resetting position...")
            self.current_position = 0

        return x, y

Tensor.training = True
Tensor.no_grad = False
model = GPT2(GPT2Small)
optim = AdamW(get_parameters(model), lr=3e-4)
dl = DataLoaderLite(4, 32, "datasets/shake.txt")
losses = []
for i in (t := trange(100)):
    x, y = dl.next_batch()
    optim.zero_grad()
    logits, loss = model(x, y)
    losses.append(loss.numpy())
    loss.backward()
    optim.step()

    t.set_description(
        f"train loss: {loss.numpy():.2f}"
    )
```
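
Since the loop records `losses` at every step and matplotlib is already imported, a quick way to eyeball the training curve (the output filename is an arbitrary choice):

```python
# Plot the per-step training loss collected above; the filename is arbitrary.
plt.plot(losses)
plt.xlabel("step")
plt.ylabel("train loss")
plt.savefig("gpt2_train_loss.png")
```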

---
title: TinyGrad
type: series
---

+++
title = 'Teenygrad'
date = 2024-07-05T00:26:18+02:00
draft = false
+++

TeenyGrad is TinyGrad's younger sibling.

+++
title = 'Why Tiny?'
date = 2024-07-04T21:51:02+02:00
draft = false
+++

# TinyPhilosophy

TinyGrad is a tensor automatic differentiation library created by [George Hotz](https://en.wikipedia.org/wiki/George_Hotz) in [2020](https://www.youtube.com/watch?v=Xtws3-Pk69o&list=PLzFUMGbVxlQsh0fFZ2QKOBY25lz04A3hi). It's been described (by TinyCorp) as a middle ground between Andrej Karpathy's famous micrograd project and full-blown PyTorch. It offers the beautiful simplicity, leanness, and ease of development of micrograd, and *almost* all the speed and functionality of PyTorch.

An interesting feature of TinyGrad's development is a continued, explicit constraint on the line count (~8000 LOC today). *Generally*, I consider this an ingenious design choice. Why generally? See below.

![heinous function](/tiny_oneliners.png)

Despite the sometimes unsavoury one-liners, I support the low LOC constraint because it forces you to express the logic of the underlying concepts as concisely as possible. This means no fluff, no bloat, no boilerplate. As a result, understanding the core of TinyGrad is essentially like understanding tensor automatic differentiation itself. There's minimal extra abstraction between you and the fundamental concepts. The same cannot be said for PyTorch.

I first realized that TinyGrad may be my deep learning library of choice when I tried to add support for 1D convolution via FFT for Metal Performance Shaders in PyTorch. Such a task wouldn't just demand a solid grasp of the core principles; it would require grappling with layers of library-specific complexity. As we dive into the internals in the coming posts, you will begin to see how this is simply not an issue when developing with TinyGrad. Don't get me wrong: things still get complicated, but they're really only complicated when *the subject itself is complicated*.
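
For context, the operation itself is the easy part: the convolution theorem says that convolution in the signal domain is pointwise multiplication in the frequency domain. A throwaway numpy sketch of the idea (illustrative only, and nothing to do with the actual Metal kernel in question):

```python
# 1D linear convolution via FFT (convolution theorem) - illustration only.
import numpy as np

def fft_conv1d(x, k):
    n = len(x) + len(k) - 1          # length of the full linear convolution
    X = np.fft.rfft(x, n)            # zero-pad and transform both signals
    K = np.fft.rfft(k, n)
    return np.fft.irfft(X * K, n)    # multiply pointwise, transform back

x = np.random.randn(1024)
k = np.random.randn(31)
assert np.allclose(fft_conv1d(x, k), np.convolve(x, k))  # matches direct conv
```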

# TinyProp

I think you get the point now. TinyGrad is beautifully simple. But deep learning isn't about beautiful, simple software; it's about speed and accuracy. So what is the immediate value proposition of TinyGrad? Here are some thoughts:

1. API - similar to Torch, but way better in many areas.
2. Accelerator Support - much better support for non-CPU/CUDA accelerators than anything else.
3. Adding Accelerators - TinyGrad delineates frontend tensor/kernel fusion logic from backend accelerator logic with a fundamental set of 25 operations. To configure your accelerator with TinyGrad, you don't need to do much more than define how these operations execute on it (see the toy sketch after this list).
4. Great Community - the TinyGrad Discord is active and willing to help.
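
To make point 3 concrete, here is a toy illustration of the idea (emphatically *not* tinygrad's real backend interface): a backend boils down to a table saying how each primitive op runs on the device, and everything above it is expressed purely in those primitives.

```python
# Toy sketch of "frontend expressed in primitives, backend supplies the primitives".
# This is a conceptual illustration, not tinygrad's actual API.
import numpy as np

# A hypothetical backend: one implementation per primitive op.
NUMPY_BACKEND = {
    "ADD": lambda a, b: a + b,
    "MUL": lambda a, b: a * b,
    "EXP": lambda a: np.exp(a),
    "SUM": lambda a, axis: a.sum(axis=axis, keepdims=True),
    "MAX": lambda a, axis: a.max(axis=axis, keepdims=True),
}

def softmax(x, ops):
    # "Frontend" logic: written only in terms of the primitives,
    # so it runs unchanged on any backend that implements them.
    shifted = ops["ADD"](x, ops["MUL"](ops["MAX"](x, -1), -1.0))
    e = ops["EXP"](shifted)
    return ops["MUL"](e, 1.0 / ops["SUM"](e, -1))

print(softmax(np.array([[1.0, 2.0, 3.0]]), NUMPY_BACKEND))
```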

From [tinygrad.org](https://tinygrad.org):
> How is tinygrad faster than PyTorch?
> For most use cases it isn't yet, but it will be. It has three advantages:
> 1. It compiles a custom kernel for every operation, allowing extreme shape specialization.
> 2. All tensors are lazy, so it can aggressively fuse operations.
> 3. The backend is 10x+ simpler, meaning optimizing one kernel makes everything fast.
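
The laziness in point 2 is easy to see from the user side: building an expression does no work until a value is actually needed, which is what gives the scheduler room to fuse a whole chain of ops. A minimal example using the standard tinygrad API:

```python
from tinygrad import Tensor

a = Tensor([1.0, 2.0, 3.0])
b = Tensor([4.0, 5.0, 6.0])

c = ((a + b) * 2).relu()   # nothing has run yet; `c` is just a lazy graph
print(c.numpy())           # work happens here, and the chain can be fused
```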

# TinyFuture

In the words of George Hotz:

> We will beat pytorch at speed, API simplicity, and having less bugs. If we do that, we win.