+++
title = 'TinyGrad GPT2 à la Karpathy - Part 2'
date = 2024-06-11T16:25:35+02:00
draft = false
+++

As expected, this part of the series has been more difficult. Karpathy makes use of lots of relatively niche, PyTorch-specific functions that don't have built-in support in TinyGrad. As a result, I need to be more methodical about how I approach this and rigorously confirm that things are working as expected.
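
To keep myself honest while porting, I want a way to spot-check outputs against a known-good reference. Something like the following rough sketch (it assumes the `GPT2` class and imports from the listing below, plus `torch` and `transformers` installed purely to provide the Hugging Face reference):

```python
# Rough sanity check: compare our tinygrad GPT-2 logits against the HF reference.
# Assumes the `GPT2` class and imports from the listing below; `torch` and
# `transformers` are only needed here, for the reference implementation.
import numpy as np
import torch
from transformers import GPT2LMHeadModel

prompt_ids = [15496, 11, 314, 1101]                  # arbitrary GPT-2 token ids

ours, _ = GPT2.build('gpt2')(Tensor([prompt_ids]))   # (1, T, vocab_size) logits
ref_model = GPT2LMHeadModel.from_pretrained('gpt2')
with torch.no_grad():
    ref = ref_model(torch.tensor([prompt_ids])).logits.numpy()

# Kernels and accumulation order differ, so don't expect bit-for-bit equality.
print("max logit diff:", np.abs(ours.numpy() - ref).max())
print("same next-token argmax:", int(ours.numpy()[0, -1].argmax()) == int(ref[0, -1].argmax()))
```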

I'll start with the code we left off on in part 1.

```python
from tinygrad import Tensor, dtypes
from tinygrad.nn import Embedding, Linear, LayerNorm
from tinygrad.nn.state import torch_load, load_state_dict, get_state_dict, get_parameters
from tqdm import tqdm, trange
from tinygrad.nn.optim import AdamW
from dataclasses import dataclass
from tinygrad.helpers import fetch
import tiktoken
import numpy as np
import os
import matplotlib.pyplot as plt

@dataclass
class GPT2Config:
    block_size: int = 1024
    vocab_size: int = 50257
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    norm_eps: float = 1e-5

@dataclass
class GPT2Small(GPT2Config):
    pass

@dataclass
class GPT2Medium(GPT2Config):
    n_layer: int = 24
    n_head: int = 16
    n_embd: int = 1024

@dataclass
class GPT2Large(GPT2Config):
    n_layer: int = 36
    n_head: int = 20
    n_embd: int = 1280

@dataclass
class GPT2XL(GPT2Config):
    n_layer: int = 48
    n_head: int = 25
    n_embd: int = 1600

MODEL_CONFIGS = {
    'gpt2': GPT2Small,
    'gpt2-medium': GPT2Medium,
    'gpt2-large': GPT2Large,
    'gpt2-xl': GPT2XL
}

class MLP:
    def __init__(self, config: GPT2Config):
        self.c_fc = Linear(config.n_embd, config.n_embd*4)
        self.c_proj = Linear(config.n_embd*4, config.n_embd)

    def __call__(self, x):
        x = self.c_fc(x).gelu()
        x = self.c_proj(x)
        return x

class Attention:
    def __init__(self, config: GPT2Config):
        self.config = config
        self.c_attn = Linear(config.n_embd, config.n_embd*3)
        self.c_proj = Linear(config.n_embd, config.n_embd)

    def __call__(self, x):
        B, T, C = x.shape

        q, k, v = self.c_attn(x).split(C, dim=-1)  # (B,T,3C) -> (B,T,C) x 3
        split_heads = lambda x: x.view(B, T, self.config.n_head, self.config.n_embd//self.config.n_head).transpose(1, 2)
        q, k, v = map(split_heads, (q, k, v))

        y = q.scaled_dot_product_attention(k, v, is_causal=True)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)

        return y

class TransformerBlock:
    def __init__(self, config: GPT2Config):
        self.ln_1 = LayerNorm(config.n_embd, eps=config.norm_eps)
        self.ln_2 = LayerNorm(config.n_embd, eps=config.norm_eps)
        self.attn = Attention(config)
        self.mlp = MLP(config)

    def __call__(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

class GPT2:
    def __init__(self, config: GPT2Config = GPT2Small):
        self.config = config

        self.wte = Embedding(config.vocab_size, config.n_embd)
        self.wpe = Embedding(config.block_size, config.n_embd)
        self.h = [TransformerBlock(config) for _ in range(config.n_layer)]
        self.ln_f = LayerNorm(config.n_embd, config.norm_eps)
        self.lm_head = Linear(config.n_embd, config.vocab_size, bias=False)

        # tie weights - HUGE SAVINGS
        self.lm_head.weight = self.wte.weight

    def __call__(self, idx, targets=None):
        B, T = idx.shape

        assert T <= self.config.block_size, f"Cannot forward, model block size is {self.config.block_size} but got sequence of length {T}"
        pos = Tensor.arange(0, T, dtype=dtypes.long)  # (T,)
        pos_emb = self.wpe(pos)  # (T,) -> (T,C)
        tok_emb = self.wte(idx)  # (B,T) -> (B,T,C)

        x = tok_emb + pos_emb
        x = x.sequential(self.h)

        x = self.ln_f(x)
        logits = self.lm_head(x)  # (B,T,C) -> (B,T,V)

        if targets is not None:
            loss = logits.flatten(0, 1).sparse_categorical_crossentropy(targets.flatten())
            return logits, loss.realize()

        return logits, None

    @staticmethod
    def build(MODEL_NAME):
        weights = torch_load(fetch(f'https://huggingface.co/{MODEL_NAME}/resolve/main/pytorch_model.bin'))

        transposed = ('attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight')
        for k in weights:
            if k.endswith(transposed):
                weights[k] = weights[k].T

        weights['lm_head.weight'] = weights['wte.weight']
        model = GPT2(MODEL_CONFIGS[MODEL_NAME])
        load_state_dict(model, weights)

        return model

class DataLoaderLite:
    def __init__(self, B, T, file_path):
        self.B = B
        self.T = T

        self.batch = lambda x: x.view(B, T)

        with open(file_path, 'r') as f:
            text = f.read()

        enc = tiktoken.get_encoding('gpt2')

        tokens = enc.encode(text)
        self.tokens = Tensor(tokens, dtype=dtypes.long)

        print(f"loaded {len(self.tokens)} tokens")
        print(f"1 epoch = {len(self.tokens) // (B*T)} batches")

        self.current_position = 0

    def next_batch(self):
        B, T = self.B, self.T

        buf = self.tokens[self.current_position:self.current_position + B*T+1]
        x = self.batch(buf[:-1])
        y = self.batch(buf[1:])
        self.current_position += B*T

        if self.current_position + (B*T+1) > len(self.tokens):
            print("read entire document, resetting position...")
            self.current_position = 0

        return x, y

Tensor.training = True
Tensor.no_grad = False
model = GPT2(GPT2Small)
optim = AdamW(get_parameters(model), lr=3e-4)
dl = DataLoaderLite(4, 32, "datasets/shake.txt")
losses = []
for i in (t := trange(100)):
    x, y = dl.next_batch()
    optim.zero_grad()
    logits, loss = model(x, y)
    losses.append(loss.numpy())
    loss.backward()
    optim.step()

    t.set_description(
        f"train loss: {loss.numpy():.2f}"
    )
```
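
Since the loop records `losses` at every step and matplotlib is already imported, a quick way to eyeball the training curve (the output filename is an arbitrary choice):

```python
# Plot the per-step training loss collected above; the filename is arbitrary.
plt.plot(losses)
plt.xlabel("step")
plt.ylabel("train loss")
plt.savefig("gpt2_train_loss.png")
```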

---
title: TinyGrad
type: series
---

+++
title = 'Teenygrad'
date = 2024-07-05T00:26:18+02:00
draft = false
+++

TeenyGrad is TinyGrad's younger sibling.

+++
title = 'Why Tiny?'
date = 2024-07-04T21:51:02+02:00
draft = false
+++

# TinyPhilosophy

TinyGrad is a tensor automatic differentiation library created by [George Hotz](https://en.wikipedia.org/wiki/George_Hotz) in [2020](https://www.youtube.com/watch?v=Xtws3-Pk69o&list=PLzFUMGbVxlQsh0fFZ2QKOBY25lz04A3hi). It's been described (by TinyCorp) as a middle ground between Andrej Karpathy's famous micrograd project and full-blown PyTorch. It offers the beautiful simplicity, leanness, and ease of development of micrograd, and *almost* all the speed and functionality of PyTorch.

An interesting feature of TinyGrad's development is a continued, explicit constraint on the line count (~8000 LOC today). *Generally*, I consider this an ingenious design choice. Why generally? See below.

![heinous function](/tiny_oneliners.png)

Despite the sometimes unsavoury one-liners, I support the low LOC constraint because it forces you to express the logic of the underlying concepts as concisely as possible. This means no fluff, no bloat, no boilerplate. As a result, understanding the core of TinyGrad is essentially like understanding tensor automatic differentiation itself. There's minimal extra abstraction between you and the fundamental concepts. The same cannot be said for PyTorch.

I first realized that TinyGrad may be my deep learning library of choice when I tried to add support for 1D convolution via FFT for Metal Performance Shaders in PyTorch. Such a task wouldn't just demand a solid grasp of the core principles; it would require grappling with layers of library-specific complexity. As we dive into the internals in the coming posts, you will begin to see how this is simply not an issue when developing with TinyGrad. Don't get me wrong: things still get complicated, but they're really only complicated when *the subject itself is complicated*.
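
For context, the operation itself is the easy part: the convolution theorem says that convolution in the signal domain is pointwise multiplication in the frequency domain. A throwaway numpy sketch of the idea (illustrative only, and nothing to do with the actual Metal kernel in question):

```python
# 1D linear convolution via FFT (convolution theorem) - illustration only.
import numpy as np

def fft_conv1d(x, k):
    n = len(x) + len(k) - 1          # length of the full linear convolution
    X = np.fft.rfft(x, n)            # zero-pad and transform both signals
    K = np.fft.rfft(k, n)
    return np.fft.irfft(X * K, n)    # multiply pointwise, transform back

x = np.random.randn(1024)
k = np.random.randn(31)
assert np.allclose(fft_conv1d(x, k), np.convolve(x, k))  # matches direct conv
```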

# TinyProp

I think you get the point now. TinyGrad is beautifully simple. But deep learning isn't about beautiful, simple software; it's about speed and accuracy. So what is the immediate value proposition of TinyGrad? Here are some thoughts:

1. API - similar to Torch, but way better in many areas.
2. Accelerator Support - much better support for non-CPU/CUDA accelerators than anything else.
3. Adding Accelerators - TinyGrad delineates frontend tensor/kernel fusion logic from backend accelerator logic with a fundamental set of 25 operations. To configure your accelerator with TinyGrad, you don't need to do much more than define how these operations execute on it (see the toy sketch after this list).
4. Great Community - the TinyGrad Discord is active and willing to help.
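
To make point 3 concrete, here is a toy illustration of the idea (emphatically *not* tinygrad's real backend interface): a backend boils down to a table saying how each primitive op runs on the device, and everything above it is expressed purely in those primitives.

```python
# Toy sketch of "frontend expressed in primitives, backend supplies the primitives".
# This is a conceptual illustration, not tinygrad's actual API.
import numpy as np

# A hypothetical backend: one implementation per primitive op.
NUMPY_BACKEND = {
    "ADD": lambda a, b: a + b,
    "MUL": lambda a, b: a * b,
    "EXP": lambda a: np.exp(a),
    "SUM": lambda a, axis: a.sum(axis=axis, keepdims=True),
    "MAX": lambda a, axis: a.max(axis=axis, keepdims=True),
}

def softmax(x, ops):
    # "Frontend" logic: written only in terms of the primitives,
    # so it runs unchanged on any backend that implements them.
    shifted = ops["ADD"](x, ops["MUL"](ops["MAX"](x, -1), -1.0))
    e = ops["EXP"](shifted)
    return ops["MUL"](e, 1.0 / ops["SUM"](e, -1))

print(softmax(np.array([[1.0, 2.0, 3.0]]), NUMPY_BACKEND))
```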

From [tinygrad.org](https://tinygrad.org):
> How is tinygrad faster than PyTorch?
> For most use cases it isn't yet, but it will be. It has three advantages:
> 1. It compiles a custom kernel for every operation, allowing extreme shape specialization.
> 2. All tensors are lazy, so it can aggressively fuse operations.
> 3. The backend is 10x+ simpler, meaning optimizing one kernel makes everything fast.
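
The laziness in point 2 is easy to see from the user side: building an expression does no work until a value is actually needed, which is what gives the scheduler room to fuse a whole chain of ops. A minimal example using the standard tinygrad API:

```python
from tinygrad import Tensor

a = Tensor([1.0, 2.0, 3.0])
b = Tensor([4.0, 5.0, 6.0])

c = ((a + b) * 2).relu()   # nothing has run yet; `c` is just a lazy graph
print(c.numpy())           # work happens here, and the chain can be fused
```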

# TinyFuture

In the words of George Hotz:

> We will beat pytorch at speed, API simplicity, and having less bugs. If we do that, we win.