miniGPT.py
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import pprint
import selfAttentionModule, transformerModule, transformerBlock


# Function to obtain training data, vocab and mapping from word to index and vice versa
def get_data_and_vocab():
    # Define training data
    training_data = {
        "how are you": "i am fine <end>",
        "who is john": "a nice person <end>",
        "who is nice": "john <end>",
        "where is john": "at home <end>",
        "how is john": "i dont know <end>",
        "who are you": "mini gpt model <end>",
    }

    # Extract input and target phrases
    data_words = [k for k, _ in training_data.items()]
    target_words = [v for _, v in training_data.items()]

    # Build vocabulary from training data
    vocabulary_words = list(
        set(
            [
                element.lower()
                for nestedlist in [x.split(" ") for x in data_words]
                for element in nestedlist
            ]
            + [
                element.lower()
                for nestedlist in [x.split(" ") for x in target_words]
                for element in nestedlist
            ]
        )
    )

    # Ensure the <end> token is at the end of the vocabulary list and a blank token is at the beginning
    vocabulary_words.remove("<end>")
    vocabulary_words.append("<end>")
    vocabulary_words.insert(0, "")

    # Create mappings from word to index and index to word
    word_to_ix = {vocabulary_words[k].lower(): k for k in range(len(vocabulary_words))}
    ix_to_word = {v: k for k, v in word_to_ix.items()}

    # Return all the necessary data and mappings
    return (
        training_data,
        data_words,
        target_words,
        vocabulary_words,
        word_to_ix,
        ix_to_word,
    )
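
# Illustrative note: the vocabulary consists of the unique lowercased words from the
# prompts and responses above, with "" reserved at index 0 as a padding token and
# "<end>" moved to the final index.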


# Function to convert a batch of sequences of words to a tensor of indices
def words_to_tensor(seq_batch, device=None):
    index_batch = []

    # Loop over sequences in the batch
    for seq in seq_batch:
        word_list = seq.lower().split(" ")
        indices = [word_to_ix[word] for word in word_list if word in word_to_ix]
        t = torch.tensor(indices)
        if device is not None:
            t = t.to(device)  # Transfer tensor to the specified device
        index_batch.append(t)

    # Pad tensors to have the same length
    return pad_tensors(index_batch)
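
# Example (illustrative): words_to_tensor(["how are you", "who is john"]) yields a
# (2, 3) integer tensor of vocabulary indices; shorter sequences in a batch would be
# padded with index 0, the blank token. Note that word_to_ix is read from module
# scope, so the vocabulary mappings must have been built before this is called.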


# Function to convert a tensor of indices to a list of sequences of words
def tensor_to_words(tensor):
    index_batch = tensor.cpu().numpy().tolist()
    res = []
    for indices in index_batch:
        words = []
        for ix in indices:
            words.append(ix_to_word[ix].lower())  # Convert index to word
            if ix == word_to_ix["<end>"]:
                break  # Stop when <end> token is encountered
        res.append(" ".join(words))
    return res
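
# Example (illustrative): applied to the tensor from the previous example,
# tensor_to_words returns ["how are you", "who is john"]; decoding of a sequence
# stops as soon as the "<end>" index is encountered.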


# Function to pad a list of tensors to the same length
def pad_tensors(list_of_tensors):
    tensor_count = (
        len(list_of_tensors)
        if not torch.is_tensor(list_of_tensors)
        else list_of_tensors.shape[0]
    )
    max_dim = max(t.shape[0] for t in list_of_tensors)  # Find the maximum length

    res = []
    for t in list_of_tensors:
        # Create a zero tensor of the desired shape
        res_t = torch.zeros(max_dim, *t.shape[1:]).type(t.dtype).to(t.device)
        res_t[: t.shape[0]] = t  # Copy the original tensor into the padded tensor
        res.append(res_t)

    # Concatenate the padded tensors, then reshape so the batch dimension comes first
    res = torch.cat(res)
    firstDim = tensor_count
    secondDim = max_dim
    return res.reshape(firstDim, secondDim, *res.shape[1:])
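
# Example (illustrative): padding a list of 1-D tensors of lengths 2 and 4 produces a
# (2, 4) tensor whose first row holds the original two values followed by two zeros.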


# Entry point: build the vocabulary and exercise the helper functions
if __name__ == "__main__":
    # Get training data and vocabulary
    (
        training_data,
        data_words,
        target_words,
        vocabulary_words,
        word_to_ix,
        ix_to_word,
    ) = get_data_and_vocab()

    # Run the example training and inference function
    # example_training_and_inference()

    # Round-trip every vocabulary word through the tensor helpers
    print(vocabulary_words)
    x = words_to_tensor(vocabulary_words)
    print(x)
    print(tensor_to_words(x))
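
# When run directly, the script should print the vocabulary list, an (N, 1) tensor of
# the corresponding indices, and the words recovered from that tensor, which are
# expected to match the vocabulary entries themselves.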