utils.py
import os
import re
import json
import pickle
from pathlib import Path
import torch
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.font_manager as fm
from torchtext import data as ttd
from torchtext.data import Example, Dataset


def load_dataset(mode):
    """
    Load the train, valid and test datasets as pandas DataFrames

    Args:
        mode: (string) configuration mode that determines which dataset to load

    Returns:
        (DataFrame) train, valid and test datasets converted to pandas DataFrames
    """
    print('Loading AI Hub Kor-Eng translation dataset and converting it to pandas DataFrame . . .')

    data_dir = Path().cwd() / 'data'

    if mode == 'train':
        train_file = os.path.join(data_dir, 'train.csv')
        train_data = pd.read_csv(train_file, encoding='utf-8')

        valid_file = os.path.join(data_dir, 'valid.csv')
        valid_data = pd.read_csv(valid_file, encoding='utf-8')

        print(f'Number of training examples: {len(train_data)}')
        print(f'Number of validation examples: {len(valid_data)}')

        return train_data, valid_data
    else:
        test_file = os.path.join(data_dir, 'test.csv')
        test_data = pd.read_csv(test_file, encoding='utf-8')

        print(f'Number of testing examples: {len(test_data)}')

        return test_data
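
# Example usage (a minimal sketch; assumes train.csv and valid.csv exist under ./data):
#   train_data, valid_data = load_dataset('train')
#   test_data = load_dataset('test')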


def clean_text(text):
    """
    Remove special characters from the input sentence to normalize it

    Args:
        text: (string) text string which may contain special characters

    Returns:
        (string) normalized sentence
    """
    # use a raw string so the backslash escapes reach the regex engine intact
    text = re.sub(r'[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`…》]', '', text)
    return text
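
# For example (illustrative only), punctuation in the character class above is stripped:
#   clean_text('Hello, world!')  ->  'Hello world'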


def convert_to_dataset(data, kor, eng):
    """
    Pre-process the input DataFrame and convert it to a torchtext Dataset.

    Args:
        data: (DataFrame) pandas DataFrame to be converted into a torchtext Dataset
        kor: torchtext Field containing Korean sentences
        eng: torchtext Field containing English sentences

    Returns:
        (Dataset) torchtext Dataset containing 'kor' and 'eng' Fields
    """
    # drop rows whose 'korean' or 'english' column does not contain a str value
    missing_rows = [idx for idx, row in data.iterrows()
                    if not isinstance(row.korean, str) or not isinstance(row.english, str)]
    data = data.drop(missing_rows)

    # convert each row of the DataFrame to a torchtext 'Example' containing 'kor' and 'eng' Fields
    list_of_examples = [Example.fromlist(row.apply(clean_text).tolist(),
                                         fields=[('kor', kor), ('eng', eng)]) for _, row in data.iterrows()]

    # construct a torchtext 'Dataset' from the list of 'Example's
    dataset = Dataset(examples=list_of_examples, fields=[('kor', kor), ('eng', eng)])

    return dataset
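
# Sketch of intended use (assumes the 'kor' and 'eng' Fields were built by build_pickles.py):
#   dataset = convert_to_dataset(train_data, kor, eng)
#   print(dataset.examples[0].kor, dataset.examples[0].eng)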


def make_iter(batch_size, mode, train_data=None, valid_data=None, test_data=None):
    """
    Convert pandas DataFrames to torchtext Datasets and make the iterators used to train and test the model

    Args:
        batch_size: (integer) batch size used to make iterators
        mode: (string) configuration mode that determines which iterators to make
        train_data: (DataFrame) pandas DataFrame used to build the train iterator
        valid_data: (DataFrame) pandas DataFrame used to build the validation iterator
        test_data: (DataFrame) pandas DataFrame used to build the test iterator

    Returns:
        (BucketIterator) train, valid and test iterators
    """
    # load the text Fields built by build_pickles.py
    with open('pickles/kor.pickle', 'rb') as file_kor:
        kor = pickle.load(file_kor)

    with open('pickles/eng.pickle', 'rb') as file_eng:
        eng = pickle.load(file_eng)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # convert pandas DataFrames to torchtext Datasets
    if mode == 'train':
        train_data = convert_to_dataset(train_data, kor, eng)
        valid_data = convert_to_dataset(valid_data, kor, eng)

        # make iterators using the train and validation datasets
        print('Making iterators for training . . .')

        train_iter, valid_iter = ttd.BucketIterator.splits(
            (train_data, valid_data),
            # the BucketIterator groups examples of similar length together;
            # here examples are sorted by the length of the Korean source sentence
            sort_key=lambda sent: len(sent.kor),
            # sort the examples within each batch by their length
            sort_within_batch=True,
            batch_size=batch_size,
            device=device)

        return train_iter, valid_iter
    else:
        test_data = convert_to_dataset(test_data, kor, eng)

        # BucketIterator.splits expects a tuple of datasets, so pass a dummy list alongside the test set
        dummy = list()

        # make an iterator using the test dataset
        print('Making iterators for testing . . .')

        test_iter, _ = ttd.BucketIterator.splits(
            (test_data, dummy),
            sort_key=lambda sent: len(sent.kor),
            sort_within_batch=True,
            batch_size=batch_size,
            device=device)

        return test_iter
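
# Example usage (illustrative; the batch size of 128 is an arbitrary choice):
#   train_data, valid_data = load_dataset('train')
#   train_iter, valid_iter = make_iter(128, 'train', train_data=train_data, valid_data=valid_data)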


def epoch_time(start_time, end_time):
    """
    Calculate the time spent training one epoch

    Args:
        start_time: (float) training start time
        end_time: (float) training end time

    Returns:
        (int) elapsed_mins and elapsed_secs spent on one epoch
    """
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs
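
# For example, a 125-second epoch splits into whole minutes and remaining seconds:
#   epoch_time(0.0, 125.0)  ->  (2, 5)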


def display_attention(candidate, translation, attention):
    """
    Display the model's attention over the source sentence for each target token generated.

    Args:
        candidate: (list) tokenized source tokens
        translation: (list) predicted target translation tokens
        attention: a tensor containing attention scores
    """
    # attention = [target length, source length]
    attention = attention.cpu().detach().numpy()

    font_location = 'pickles/NanumSquareR.ttf'
    fontprop = fm.FontProperties(fname=font_location)

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)

    ax.matshow(attention, cmap='bone')
    ax.tick_params(labelsize=15)

    ax.set_xticklabels([''] + [t.lower() for t in candidate], rotation=45, fontproperties=fontprop)
    ax.set_yticklabels([''] + translation, fontproperties=fontprop)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
    plt.close()
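
# Sketch of intended use (the token lists below are made-up examples; the attention
# tensor must have shape [target length, source length], here [4, 3]):
#   display_attention(['나는', '학생', '이다'], ['i', 'am', 'a', 'student'], attention)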


class Params:
    """
    Class that loads hyperparameters from a json file

    Example:
    ```
    params = Params(json_path)
    print(params.learning_rate)
    params.learning_rate = 0.5  # change the value of learning_rate in params
    ```
    """

    def __init__(self, json_path):
        self.update(json_path)
        self.load_vocab()

    def update(self, json_path):
        """Loads parameters from a json file"""
        with open(json_path) as f:
            params = json.load(f)
            self.__dict__.update(params)

    def load_vocab(self):
        # load the kor and eng vocabs to add the vocab size configuration
        with open('pickles/kor.pickle', 'rb') as pickle_kor:
            kor = pickle.load(pickle_kor)

        with open('pickles/eng.pickle', 'rb') as pickle_eng:
            eng = pickle.load(pickle_eng)

        # add device information to the params
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # add the <sos>, <eos> and <pad> token indices used to predict the target sentence
        params = {'input_dim': len(kor.vocab), 'output_dim': len(eng.vocab),
                  'sos_idx': eng.vocab.stoi['<sos>'], 'eos_idx': eng.vocab.stoi['<eos>'],
                  'pad_idx': eng.vocab.stoi['<pad>'], 'device': device}

        self.__dict__.update(params)

    @property
    def dict(self):
        """Gives dict-like access to Params instance by `params.dict['learning_rate']`"""
        return self.__dict__
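
# Example usage (the path is a placeholder; any json config with matching keys works):
#   params = Params('config/params.json')
#   print(params.dict['input_dim'], params.device)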