nichoffs committed Jul 10, 2024
1 parent a3b3c4d commit 3bfe314
Showing 19 changed files with 1,524 additions and 10 deletions.
File renamed without changes.
201 changes: 201 additions & 0 deletions content/post/mini_projects/gpt2_p2.md
@@ -0,0 +1,201 @@
+++
title = 'TinyGrad GPT2 à la Karpathy - Part 2'
date = 2024-06-11T16:25:35+02:00
draft = false
+++

As expected, this part of the series has been more difficult. Karpathy uses a number of relatively niche, PyTorch-specific functions that don't have built-in counterparts in TinyGrad. As a result, I need to be more methodical about how I approach this and rigorously confirm that things are working as expected.
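
One way I do that confirmation (a sketch of my own, not something from Karpathy's video) is to check each TinyGrad op I rely on against its PyTorch counterpart on random inputs before trusting it inside the model. For example, for the attention primitive used below:

```python
# Sanity-check a tinygrad op against its PyTorch counterpart (my own check,
# not part of the original training script).
import numpy as np
import torch
from tinygrad import Tensor

# (batch, heads, seq_len, head_dim)
q, k, v = (np.random.randn(1, 4, 8, 16).astype(np.float32) for _ in range(3))

out_tiny = Tensor(q).scaled_dot_product_attention(Tensor(k), Tensor(v), is_causal=True).numpy()
out_torch = torch.nn.functional.scaled_dot_product_attention(
    torch.from_numpy(q), torch.from_numpy(k), torch.from_numpy(v), is_causal=True
).numpy()

# Loose tolerance: the two libraries may differ slightly in accumulation order.
assert np.allclose(out_tiny, out_torch, atol=1e-4), "tinygrad and torch disagree"
```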

I'll start with the code we left off on in part 1.


```python
from tinygrad import Tensor, dtypes
from tinygrad.nn import Embedding, Linear, LayerNorm
from tinygrad.nn.state import torch_load, load_state_dict, get_state_dict, get_parameters
from tqdm import tqdm, trange
from tinygrad.nn.optim import AdamW
from dataclasses import dataclass
from tinygrad.helpers import fetch
import tiktoken
import numpy as np
import os
import matplotlib.pyplot as plt

@dataclass
class GPT2Config:
    block_size: int = 1024
    vocab_size: int = 50257
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    norm_eps: float = 1e-5

@dataclass
class GPT2Small(GPT2Config):
    pass

@dataclass
class GPT2Medium(GPT2Config):
    n_layer: int = 24
    n_head: int = 16
    n_embd: int = 1024

@dataclass
class GPT2Large(GPT2Config):
    n_layer: int = 36
    n_head: int = 20
    n_embd: int = 1280

@dataclass
class GPT2XL(GPT2Config):
    n_layer: int = 48
    n_head: int = 25
    n_embd: int = 1600

MODEL_CONFIGS = {
    'gpt2': GPT2Small,
    'gpt2-medium': GPT2Medium,
    'gpt2-large': GPT2Large,
    'gpt2-xl': GPT2XL
}

class MLP:
    def __init__(self, config: GPT2Config):
        self.c_fc = Linear(config.n_embd, config.n_embd*4)
        self.c_proj = Linear(config.n_embd*4, config.n_embd)

    def __call__(self, x):
        x = self.c_fc(x).gelu()
        x = self.c_proj(x)
        return x

class Attention:
    def __init__(self, config: GPT2Config):
        self.config = config
        self.c_attn = Linear(config.n_embd, config.n_embd*3)
        self.c_proj = Linear(config.n_embd, config.n_embd)

    def __call__(self, x):
        B, T, C = x.shape

        q, k, v = self.c_attn(x).split(C, dim=-1)  # (B,T,3C) -> (B,T,C) x 3
        split_heads = lambda x: x.view(B, T, self.config.n_head, self.config.n_embd//self.config.n_head).transpose(1, 2)
        q, k, v = map(split_heads, (q, k, v))

        y = q.scaled_dot_product_attention(k, v, is_causal=True)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)

        return y

class TransformerBlock:
    def __init__(self, config: GPT2Config):
        self.ln_1 = LayerNorm(config.n_embd, eps=config.norm_eps)
        self.ln_2 = LayerNorm(config.n_embd, eps=config.norm_eps)
        self.attn = Attention(config)
        self.mlp = MLP(config)

    def __call__(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

class GPT2:
    def __init__(self, config: GPT2Config = GPT2Small):
        self.config = config

        self.wte = Embedding(config.vocab_size, config.n_embd)
        self.wpe = Embedding(config.block_size, config.n_embd)
        self.h = [TransformerBlock(config) for _ in range(config.n_layer)]
        self.ln_f = LayerNorm(config.n_embd, config.norm_eps)
        self.lm_head = Linear(config.n_embd, config.vocab_size, bias=False)

        # tie weights - HUGE SAVINGS
        self.lm_head.weight = self.wte.weight

    def __call__(self, idx, targets=None):
        B, T = idx.shape

        assert T <= self.config.block_size, f"Cannot forward, model block size is {self.config.block_size} but got sequence of length {T}"
        pos = Tensor.arange(0, T, dtype=dtypes.long)  # (T,)
        pos_emb = self.wpe(pos)  # (T,) -> (T,C)
        tok_emb = self.wte(idx)  # (B,T) -> (B,T,C)

        x = tok_emb + pos_emb
        x = x.sequential(self.h)

        x = self.ln_f(x)
        logits = self.lm_head(x)  # (B,T,C) -> (B,T,V)

        if targets is not None:
            loss = logits.flatten(0, 1).sparse_categorical_crossentropy(targets.flatten())
            return logits, loss.realize()

        return logits, None

    @staticmethod
    def build(MODEL_NAME):
        weights = torch_load(fetch(f'https://huggingface.co/{MODEL_NAME}/resolve/main/pytorch_model.bin'))

        transposed = ('attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight')
        for k in weights:
            if k.endswith(transposed):
                weights[k] = weights[k].T

        weights['lm_head.weight'] = weights['wte.weight']
        model = GPT2(MODEL_CONFIGS[MODEL_NAME])
        load_state_dict(model, weights)

        return model

class DataLoaderLite:
    def __init__(self, B, T, file_path):
        self.B = B
        self.T = T

        self.batch = lambda x: x.view(B, T)

        with open(file_path, 'r') as f:
            text = f.read()

        enc = tiktoken.get_encoding('gpt2')

        tokens = enc.encode(text)
        self.tokens = Tensor(tokens, dtype=dtypes.long)

        print(f"loaded {len(self.tokens)} tokens")
        print(f"1 epoch = {len(self.tokens) // (B*T)} batches")

        self.current_position = 0

    def next_batch(self):
        B, T = self.B, self.T

        buf = self.tokens[self.current_position:self.current_position + B*T+1]
        x = self.batch(buf[:-1])
        y = self.batch(buf[1:])
        self.current_position += B*T

        if self.current_position + (B*T+1) > len(self.tokens):
            print("read entire document, resetting position...")
            self.current_position = 0

        return x, y

Tensor.training = True
Tensor.no_grad = False
model = GPT2(GPT2Small)
optim = AdamW(get_parameters(model), lr=3e-4)
dl = DataLoaderLite(4, 32, "datasets/shake.txt")
losses = []
for i in (t := trange(100)):
    x, y = dl.next_batch()
    optim.zero_grad()
    logits, loss = model(x, y)
    losses.append(loss.numpy())
    loss.backward()
    optim.step()

    t.set_description(
        f"train loss: {loss.numpy():.2f}"
    )
```
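
Before going further, there's a quick sanity check I find useful (an addition of mine, not in the original script; it assumes the `losses` list from the loop above): at random initialization the model should be close to uniform over the vocabulary, so the very first loss should land near -ln(1/50257) ≈ 10.82.

```python
# Expected loss for a (roughly) uniform distribution over 50257 tokens.
import math

expected_initial_loss = -math.log(1 / 50257)  # ≈ 10.82
print(f"expected ≈ {expected_initial_loss:.2f}, observed = {losses[0]:.2f}")
```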
4 changes: 4 additions & 0 deletions content/post/tinygrad/_index.md
@@ -0,0 +1,4 @@
---
title: TinyGrad
type: series
---
7 changes: 7 additions & 0 deletions content/post/tinygrad/teenygrad.md
@@ -0,0 +1,7 @@
+++
title = 'Teenygrad'
date = 2024-07-05T00:26:18+02:00
draft = false
+++

TeenyGrad is TinyGrad's younger sibling.
40 changes: 40 additions & 0 deletions content/post/tinygrad/the_history.md
@@ -0,0 +1,40 @@
+++
title = 'Why Tiny?'
date = 2024-07-04T21:51:02+02:00
draft = false
+++

# TinyPhilosophy

TinyGrad is a tensor automatic differentiation library created by [George Hotz](https://en.wikipedia.org/wiki/George_Hotz) in [2020](https://www.youtube.com/watch?v=Xtws3-Pk69o&list=PLzFUMGbVxlQsh0fFZ2QKOBY25lz04A3hi). It's been described (by TinyCorp) as a middle ground between Andrej Karpathy's famous micrograd project and full-blown PyTorch. It offers both the beautiful simplicity, leanness, and ease of development of micrograd and *almost* all the speed and functionality of PyTorch.

An interesting feature of TinyGrad's development is a continued, explicit constraint on the line count (~8,000 LOC today). *Generally*, I consider this an ingenious design choice. Why only generally? See below.

![heinous function](/tiny_oneliners.png)

Despite the sometimes unsavoury one-liners, I support the low LOC constraint because it forces you to express the logic of the underlying concepts as concisely as possible. This means no fluff, no bloat, no boilerplate. As a result, understanding the core of TinyGrad is essentially like understanding tensor automatic differentiation itself. There's minimal extra abstraction between you and the fundamental concepts. The same cannot be said for PyTorch.

I first realized that TinyGrad might be my deep learning library of choice when I tried to add support for FFT-based 1D convolution on Metal Performance Shaders in PyTorch. Such a task wouldn't just demand a solid grasp of the core principles; it would also require grappling with layers of library-specific complexity. As we dive into the internals in the coming posts, you will begin to see how this is simply not an issue when developing with TinyGrad. Don't get me wrong: things still get complicated, but they're really only complicated when *the subject itself is complicated*.

# TinyProp

I think you get the point now. TinyGrad is beautifully simple. But deep learning isn't about beautiful, simple software; it's about speed and accuracy. So what is the immediate value proposition of TinyGrad? Here are some thoughts:


1. API - similar to Torch's, but noticeably better in many areas
2. Accelerator Support - much better support for non-CPU/CUDA backends than anything else out there
3. Adding Accelerators - TinyGrad delineates frontend tensor/kernel fusion logic from backend accelerator logic with a fundamental set of 25 operations. To bring up a new accelerator, you don't need to do much more than define how these operations execute on it (see the toy sketch after this list).
4. Great Community - the TinyGrad Discord is active and willing to help
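
To make point 3 concrete, here is a toy illustration of the idea (this is *not* TinyGrad's actual API, just the shape of the design): if the frontend only ever emits a small, fixed set of primitive operations, a backend is little more than a table saying how each primitive runs on a given device.

```python
# Toy illustration only -- not tinygrad's real interface.
import numpy as np

# "Backend" for a hypothetical device; here the device is just NumPy on the CPU.
NUMPY_BACKEND = {
    "ADD": lambda a, b: a + b,
    "MUL": lambda a, b: a * b,
    "EXP": np.exp,
    "SUM": lambda a: a.sum(),
}

def run(op, *args, backend=NUMPY_BACKEND):
    # The frontend decides *what* to compute; the backend decides *how*.
    return backend[op](*args)

x = np.array([1.0, 2.0, 3.0])
print(run("SUM", run("EXP", x)))  # supporting a new device = supplying a new table
```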

From [tinygrad.org](https://tinygrad.org):
> How is tinygrad faster than PyTorch?
> For most use cases it isn't yet, but it will be. It has three advantages:
> 1. It compiles a custom kernel for every operation, allowing extreme shape specialization.
> 2. All tensors are lazy, so it can aggressively fuse operations.
> 3. The backend is 10x+ simpler, meaning optimizing one kernel makes everything fast.

# TinyFuture

In the words of George Hotz:

> We will beat pytorch at speed, API simplicity, and having less bugs. If we do that, we win.
23 changes: 22 additions & 1 deletion public/index.html
@@ -36,11 +36,32 @@ <h1 id="nic-hoffs">Nic Hoffs</h1>



<li>

<span class="date">2024/07/05</span>

<a href="/post/tinygrad/teenygrad/">Teenygrad</a>
</li>

<li>

<span class="date">2024/07/04</span>

<a href="/post/tinygrad/the_history/">Why Tiny?</a>
</li>

<li>

<span class="date">2024/06/11</span>

<a href="/post/mini_projects/gpt2_p1/">TinyGrad GPT2 à la Karpathy - Part 1</a>
</li>

<li>

<span class="date">2024/06/11</span>

<a href="/post/mini_projects/gpt2/">TinyGrad GPT2 à la Karpathy - Part 1</a>
<a href="/post/mini_projects/gpt2_p2/">TinyGrad GPT2 à la Karpathy - Part 2</a>
</li>

<li>
27 changes: 24 additions & 3 deletions public/index.xml
@@ -6,15 +6,36 @@
<description>Recent content in Home on Nic Hoffs Blog</description>
<generator>Hugo</generator>
<language>en-us</language>
<lastBuildDate>Tue, 11 Jun 2024 16:25:35 +0200</lastBuildDate>
<lastBuildDate>Fri, 05 Jul 2024 00:26:18 +0200</lastBuildDate>
<atom:link href="http://localhost:1313/index.xml" rel="self" type="application/rss+xml" />
<item>
<title>Teenygrad</title>
<link>http://localhost:1313/post/tinygrad/teenygrad/</link>
<pubDate>Fri, 05 Jul 2024 00:26:18 +0200</pubDate>
<guid>http://localhost:1313/post/tinygrad/teenygrad/</guid>
<description>TeenyGrad is TinyGrad&amp;rsquo;s younger sibling.</description>
</item>
<item>
<title>Why Tiny?</title>
<link>http://localhost:1313/post/tinygrad/the_history/</link>
<pubDate>Thu, 04 Jul 2024 21:51:02 +0200</pubDate>
<guid>http://localhost:1313/post/tinygrad/the_history/</guid>
<description>TinyPhilosophy TinyGrad is a tensor automatic differentiation library created by George Hotz in 2020. It&amp;rsquo;s been described (by TinyCorp) as a middleground between Anrej Karpathy&amp;rsquo;s famous micrograd project and full-blown PyTorch. It offers both the beautiful simplicity, leanness, and ease of development of micrograd, and almost all the speed and functionality of PyTorch.&#xA;An interesting features of TinyGrad&amp;rsquo;s development is a continued, explicit constraint on the line count (~8000 LOC today).</description>
</item>
<item>
<title>TinyGrad GPT2 à la Karpathy - Part 1</title>
<link>http://localhost:1313/post/mini_projects/gpt2/</link>
<link>http://localhost:1313/post/mini_projects/gpt2_p1/</link>
<pubDate>Tue, 11 Jun 2024 16:25:35 +0200</pubDate>
<guid>http://localhost:1313/post/mini_projects/gpt2/</guid>
<guid>http://localhost:1313/post/mini_projects/gpt2_p1/</guid>
<description>Whenever Andrej Karpathy releases a new YouTube video, the only moral thing to do is drop all responsibilites and replicate it in TinyGrad. This released yesterday (June 10) and is over 4 hours long (I&amp;rsquo;m salivating), so I&amp;rsquo;ve only finished the first part. I anticipate the later parts of the video will include more Torch-specific optimizations, which will make things a bit more difficult.&#xA;Setting up inference with pre-trained weights GPT2 has four models from 124M to 1.</description>
</item>
<item>
<title>TinyGrad GPT2 à la Karpathy - Part 2</title>
<link>http://localhost:1313/post/mini_projects/gpt2_p2/</link>
<pubDate>Tue, 11 Jun 2024 16:25:35 +0200</pubDate>
<guid>http://localhost:1313/post/mini_projects/gpt2_p2/</guid>
<description>As expected, this part of the series has been more difficult. Karpathy makes use of lots of relatively niche, PyTorch specific functions that don&amp;rsquo;t have built-in support in TinyGrad. As a result, I need to be more methodical about how I approach this and rigorously confirm that things are working as expected.&#xA;I&amp;rsquo;ll start with the code we left off on in part 1.&#xA;from tinygrad import Tensor, dtypes from tinygrad.</description>
</item>
<item>
<title>Simple Symbolic Distillation on XOR Neural Network</title>
<link>http://localhost:1313/post/symbolic_regression/symbolic_distillation/</link>
5 changes: 5 additions & 0 deletions public/post/index.html
@@ -34,6 +34,11 @@ <h1>Posts by Category</h1>



<li>

<a href="/post/tinygrad/">TinyGrad</a>
</li>

<li>

<a href="/post/mini_projects/">Mini-Projects</a>