
thisisanshgupta/miniature-GPT: a PyTorch implementation of GPT (Generative Pretrained Transformer)


Introduction

This example demonstrates how to implement an autoregressive language model using a miniature version of the GPT model. The model consists of a single Transformer block with causal masking in its attention layer. We use the text from the IMDB sentiment classification dataset for training and generate new movie reviews for a given prompt. When using this script with your own dataset, make sure it has at least 1 million words.

Setup

import numpy as np
import os
import re
import string
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

Implement a Transformer block as a layer

def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """
    Mask the upper half of the dot product matrix in self attention.
    This prevents flow of information from future tokens to the current token.
    1's in the lower triangle, counting from the lower right corner.
    """
    i = torch.arange(n_dest)[:, None]
    j = torch.arange(n_src)
    m = i >= j - n_src + n_dest
    mask = m.to(dtype).reshape(1, n_dest, n_src)
    # Tile the (1, n_dest, n_src) mask across the batch dimension.
    return mask.repeat(batch_size, 1, 1)
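
As a quick sanity check (illustrative only, not part of the original script), a tiny 4x4 case shows the expected lower-triangular pattern of ones on and below the diagonal:

print(causal_attention_mask(1, 4, 4, torch.int32))
# tensor([[[1, 0, 0, 0],
#          [1, 1, 0, 0],
#          [1, 1, 1, 0],
#          [1, 1, 1, 1]]], dtype=torch.int32)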

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        # batch_first=True so inputs and outputs are (batch, seq_len, embed_dim)
        self.att = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )
        self.layernorm1 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.dropout1 = nn.Dropout(rate)
        self.dropout2 = nn.Dropout(rate)

    def forward(self, x):
        seq_len = x.size(1)
        # causal_attention_mask marks allowed positions with True (lower triangle);
        # nn.MultiheadAttention expects True where attention is *disallowed*, so invert it.
        allowed = causal_attention_mask(1, seq_len, seq_len, torch.bool)[0].to(x.device)
        attention_output, _ = self.att(x, x, x, attn_mask=~allowed)
        attention_output = self.dropout1(attention_output)
        out1 = self.layernorm1(x + attention_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)
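
A short smoke test (illustrative only, not part of the original script) confirms that the block preserves the (batch, seq_len, embed_dim) shape of its input:

# Hypothetical example: random embeddings standing in for real token embeddings.
block = TransformerBlock(embed_dim=256, num_heads=2, ff_dim=256)
dummy = torch.randn(4, 80, 256)  # (batch, seq_len, embed_dim)
print(block(dummy).shape)  # torch.Size([4, 80, 256])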

Implement an embedding layer

class TokenAndPositionEmbedding(nn.Module):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(maxlen, embed_dim)

    def forward(self, x):
        maxlen = x.size(1)
        positions = torch.arange(0, maxlen, dtype=torch.long, device=x.device)
        positions = positions.unsqueeze(0).expand_as(x)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
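
An illustrative usage example (not part of the original script): integer token IDs of shape (batch, seq_len) come out as summed token and position embeddings of shape (batch, seq_len, embed_dim).

emb = TokenAndPositionEmbedding(maxlen=80, vocab_size=20000, embed_dim=256)
tokens = torch.randint(0, 20000, (4, 80))  # dummy token IDs
print(emb(tokens).shape)  # torch.Size([4, 80, 256])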

Implement the miniature GPT model

vocab_size = 20000  # Only consider the top 20k words
maxlen = 80  # Max sequence size
embed_dim = 256  # Embedding size for each token
num_heads = 2  # Number of attention heads
feed_forward_dim = 256  # Hidden layer size in feed forward network inside transformer

class Transformer(nn.Module):
    def __init__(self, maxlen, vocab_size, embed_dim, num_heads, feed_forward_dim):
        super(Transformer, self).__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, feed_forward_dim)
        self.dense = nn.Linear(embed_dim, vocab_size)
        
    def forward(self, x):
        x = self.embedding_layer(x)
        x = self.transformer_block(x)
        x = self.dense(x)
        return x

model = Transformer(maxlen, vocab_size, embed_dim, num_heads, feed_forward_dim)
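
The training and generation code is not reproduced here. A minimal sketch of what a next-token training step and greedy decoding could look like, assuming the IMDB reviews have already been tokenized into integer sequences served by a hypothetical dataloader, is:

# Minimal sketch, not the repository's actual training script.
# Assumes a `dataloader` that yields LongTensors of shape (batch, maxlen + 1) so that
# inputs and targets can be formed by shifting each sequence by one token.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

def train_step(batch):
    inputs, targets = batch[:, :-1], batch[:, 1:]   # next-token prediction
    logits = model(inputs)                          # (batch, seq_len, vocab_size)
    loss = criterion(logits.reshape(-1, vocab_size), targets.reshape(-1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def generate(prompt_ids, num_tokens=40):
    """Greedily extend a list of prompt token IDs (the original may use sampling instead)."""
    tokens = list(prompt_ids)
    for _ in range(num_tokens):
        context = torch.tensor([tokens[-maxlen:]], dtype=torch.long)
        next_id = int(model(context)[0, -1].argmax())
        tokens.append(next_id)
    return tokens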
