From 1f24a6470324f3fdb1cd65d4eab2d3e039f2eb1d Mon Sep 17 00:00:00 2001
From: Syafiq Kamarul Azman
Date: Thu, 17 Jan 2019 16:56:52 +0400
Subject: [PATCH] Initializes repository

---
 .gitignore   | 121 ++++++++++++++++++++++++++++++++++++++++++++++
 datasets.py  | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 generator.py | 102 ++++++++++++++++++++++++++++++++++++++++
 models.py    |  99 ++++++++++++++++++++++++++++++++++++++++
 trainer.py   |  85 +++++++++++++++++++++++++++++++
 5 files changed, 552 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 datasets.py
 create mode 100644 generator.py
 create mode 100644 models.py
 create mode 100644 trainer.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e126ef8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,121 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# VS Code
+.vscode
+
+# Trained model
+model.pt
diff --git a/datasets.py b/datasets.py
new file mode 100644
index 0000000..5d617d7
--- /dev/null
+++ b/datasets.py
@@ -0,0 +1,145 @@
+import os
+import torch
+from sklearn.preprocessing import LabelEncoder
+from torch.utils.data import Dataset
+
+class TESNamesDataset(Dataset):
+    ''' The Elder Scrolls Names dataset class.
+
+    The Elder Scrolls Names dataset is a dataset of first names of all 10
+    major races in Tamriel. The dataset contains male and female first names
+    of different lengths and characters, organized into folders of races and
+    files of genders.
+    '''
+    def __init__(self, data_root, charset, max_length):
+        ''' Initializes the Elder Scrolls dataset.
+
+        The initialization appends a terminating character, \0, and therefore
+        the passed charset argument should not contain the terminating
+        character.
+
+        Parameters
+        ----------
+        data_root: str
+            Absolute path to the root folder of the dataset.
+        charset: str
+            String of all characters expected to be present in the names.
+        max_length: int
+            The maximum number of characters in a name to be used for
+            zero-padding or truncation of names when preprocessing.
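+
+        Example
+        -------
+        A minimal usage sketch (the data path below is a placeholder):
+
+        >>> import string
+        >>> dataset = TESNamesDataset('/path/to/tes-names/',
+        ...                           string.ascii_letters + "'- ", 30)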
+        '''
+        self.data_root = data_root
+        self.charset = charset + '\0'
+        self.max_length = max_length
+        self.race_codec = LabelEncoder()
+        self.gender_codec = LabelEncoder()
+        self.char_codec = LabelEncoder()
+        self.samples = []
+        self._init_dataset()
+
+    def __len__(self):
+        return len(self.samples)
+
+    def __getitem__(self, idx):
+        race, gender, name = self.samples[idx]
+        return self.one_hot_sample(race, gender, name)
+
+    def _init_dataset(self):
+        ''' Dataset initialization subroutine.
+
+        Goes through all the folders in the root directory of the dataset,
+        reads all the files in the subfolders, and appends tuples of
+        race, gender, and name to the `self.samples` list.
+
+        The label encoders for the races, genders, and characters are also
+        fitted here.
+        '''
+        races = set()
+        genders = set()
+
+        for race in os.listdir(self.data_root):
+            race_folder = os.path.join(self.data_root, race)
+            races.add(race)
+
+            for gender in os.listdir(race_folder):
+                gender_filepath = os.path.join(race_folder, gender)
+                genders.add(gender)
+
+                with open(gender_filepath, 'r') as gender_file:
+                    for name in gender_file.read().splitlines():
+                        if len(name) > self.max_length:
+                            name = name[:self.max_length-1] + '\0'
+                        else:
+                            name = name + '\0' * (self.max_length - len(name))
+                        self.samples.append((race, gender, name))
+
+        self.race_codec.fit(list(races))
+        self.gender_codec.fit(list(genders))
+        self.char_codec.fit(list(self.charset))
+
+    def to_one_hot(self, codec, values):
+        ''' Encodes a list of nominal values into a one-hot tensor.
+
+        Parameters
+        ----------
+        codec: sklearn.preprocessing.LabelEncoder
+            Scikit-learn label encoder for the list of values.
+        values: list of str
+            List of values to be converted into numbers.
+        '''
+        values_idx = codec.transform(values)
+        return torch.eye(len(codec.classes_))[values_idx]
+
+    def one_hot_sample(self, race, gender, name):
+        ''' Converts a single sample into its one-hot counterpart.
+
+        Calls the `to_one_hot` function for each value in a sample:
+        race, gender, and name. The race and gender get converted into
+        a 1xR tensor and a 1xG tensor, respectively, where R is the number of
+        races in the dataset and G is the number of genders in the dataset.
+
+        The name gets converted into an MxC tensor where M is the maximum
+        length of the names (`self.max_length`) and C is the length of the
+        character set (after adding the terminating character, \0).
+
+        Parameters
+        ----------
+        race: str
+            The race of the sample.
+        gender: str
+            The gender of the sample.
+        name: str
+            The name of the sample.
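+
+        Returns
+        -------
+        tuple of torch.Tensor
+            `t_race` of shape 1xR, `t_gender` of shape 1xG, and `t_name`
+            of shape MxC, as described above.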
+        '''
+        t_race = self.to_one_hot(self.race_codec, [race])
+        t_gender = self.to_one_hot(self.gender_codec, [gender])
+        t_name = self.to_one_hot(self.char_codec, list(name))
+        return t_race, t_gender, t_name
+
+
+if __name__ == '__main__':
+    import string
+    from torch.utils.data import DataLoader
+
+    data_root = '/home/syafiq/Data/tes-names/'
+    charset = string.ascii_letters + '\'- '
+    max_length = 30
+    dataset = TESNamesDataset(data_root, charset, max_length)
+    print(dataset[100])
+
+    dataloader = DataLoader(dataset, batch_size=10, shuffle=True)
+    print(next(iter(dataloader)))
diff --git a/generator.py b/generator.py
new file mode 100644
index 0000000..8eaf689
--- /dev/null
+++ b/generator.py
@@ -0,0 +1,102 @@
+from collections import deque
+import string
+from torch.utils.data import DataLoader
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from datasets import TESNamesDataset
+from models import TESLSTM
+
+
+def generate(race, gender, char, dataset, model, device):
+    ''' Generates a "novel" name given the parameters.
+
+    Given the desired race, gender, and initial character, the trained model
+    will produce a new name by predicting what letter should come next and
+    feeding the predicted letter as an input to the model until it reaches
+    the maximum length or the terminating character is predicted.
+
+    Parameters
+    ----------
+    race: str
+        Desired race for the new name.
+    gender: str
+        Desired gender for the new name.
+    char: str
+        Starting character of the new name.
+    dataset: torch.utils.data.Dataset
+        The dataset of Elder Scrolls names.
+    model: models.TESLSTM
+        The trained model used for prediction.
+    device: torch.device
+        The device on which to execute.
+    '''
+    name = char
+    model.eval()
+
+    t_race, t_gender, t_char = dataset.one_hot_sample(race, gender, char)
+    t_hidden, t_cell = model.init_hidden(1)
+
+    t_race = t_race.view(1, 1, -1).to(device)
+    t_gender = t_gender.view(1, 1, -1).to(device)
+    t_char = t_char.view(1, 1, -1).to(device)
+    t_hidden = t_hidden.to(device)
+    t_cell = t_cell.to(device)
+
+    for _ in range(dataset.max_length):
+        t_char, t_hidden, t_cell = \
+            model(t_race, t_gender, t_char, t_hidden, t_cell)
+
+        char_idx = t_char.argmax(dim=1).item()
+        new_char = dataset.char_codec.inverse_transform([char_idx])[0]
+
+        if new_char == '\0':
+            break
+        else:
+            name += new_char
+            t_char = dataset.to_one_hot(dataset.char_codec, [new_char])
+            t_char = t_char.view(1, 1, -1).to(device)
+
+    return name
+
+
+if __name__ == '__main__':
+    data_root = '/home/syafiq/Data/tes-names/'
+    charset = string.ascii_letters + '\'- '
+    max_length = 30
+
+    # Prepare GPU.
+    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+
+    # Prepare dataset.
+    dataset = TESNamesDataset(data_root, charset, max_length)
+
+    input_size = (
+        len(dataset.race_codec.classes_) +
+        len(dataset.gender_codec.classes_) +
+        len(dataset.char_codec.classes_)
+    )
+    hidden_size = 128
+    output_size = len(dataset.char_codec.classes_)
+
+    # Prepare model.
+    model = TESLSTM(input_size, hidden_size, output_size)
+    model.load_state_dict(torch.load('model.pt'))
+    model = model.to(device)
+
+    new_names = []
+
+    # Predict a name for all combinations.
+    for race in dataset.race_codec.classes_:
+        for gender in dataset.gender_codec.classes_:
+            for letter in string.ascii_uppercase:
+                name = generate(race, gender, letter, dataset, model, device)
+                print(race, gender, name)
+                new_names.append(name)
+
+    # See how many names are copied from the dataset, if any.
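+    # Names are stored zero-padded with the terminating character, so strip
+    # the padding before comparing generated names with the originals.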
+    sample_names = [name.replace('\0', '') for _, _, name in dataset.samples]
+    intersection_set = set(new_names).intersection(set(sample_names))
+    print('%% of similar names: %.2f%%' % (100 * len(intersection_set) / len(dataset)))
diff --git a/models.py b/models.py
new file mode 100644
index 0000000..d2948c6
--- /dev/null
+++ b/models.py
@@ -0,0 +1,99 @@
+import torch
+import torch.nn as nn
+
+class TESLSTM(nn.Module):
+    ''' LSTM cell model to predict the next character in a name.
+
+    The LSTM model takes as input the race tensor, gender tensor, single
+    character tensor, and the previous hidden and cell states, and outputs
+    the next character tensor.
+    '''
+
+    def __init__(self, input_size, hidden_size, output_size):
+        ''' Initializes the parameters of the LSTM cell model.
+
+        Parameters
+        ----------
+        input_size: int
+            Number of input units of the LSTM.
+        hidden_size: int
+            Number of hidden units and cell memory units of the LSTM.
+        output_size: int
+            Number of output units of the LSTM.
+        '''
+        super(TESLSTM, self).__init__()
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.output_size = output_size
+
+        self.lstm = nn.LSTMCell(input_size, hidden_size)
+        self.fc = nn.Linear(hidden_size, output_size)
+        self.dropout = nn.Dropout(0.05)
+        self.softmax = nn.LogSoftmax(dim=1)
+
+    def forward(self, t_race, t_gender, t_char, t_hidden, t_cell):
+        ''' Forward pass execution for the LSTM model.
+
+        Input tensors (`t_race`, `t_gender`, `t_char`) should be rank 3.
+        Hidden tensors (`t_hidden` and `t_cell`) should be rank 2.
+
+        The forward pass concatenates the input tensors and performs
+        a pass through the LSTM cell and through a fully-connected layer
+        before a softmax layer at the output units.
+
+        Parameters
+        ----------
+        t_race: torch.Tensor
+            The tensor of batches of race vectors.
+        t_gender: torch.Tensor
+            The tensor of batches of gender vectors.
+        t_char: torch.Tensor
+            The tensor of batches of single character vectors of a name.
+        t_hidden: torch.Tensor
+            The tensor of batches of hidden units.
+        t_cell: torch.Tensor
+            The tensor of batches of cell memory units.
+        '''
+        t_input = torch.cat((t_race, t_gender, t_char), dim=2)
+        t_input = t_input.view(-1, self.input_size)
+        t_hidden, t_cell = self.lstm(t_input, (t_hidden, t_cell))
+
+        t_output = self.fc(self.dropout(t_hidden))
+        t_output = self.softmax(t_output)
+
+        return t_output, t_hidden, t_cell
+
+    def init_hidden(self, batch_size):
+        ''' Initializes the LSTM's hidden and memory cell values.
+
+        The LSTM starts with a random hidden state and a zero memory cell
+        state, as the neural network needs to be able to generate new names.
+        With the random initial state, the network needs to learn to
+        generalize this process during training.
+
+        Parameters
+        ----------
+        batch_size: int
+            Size of the current batch of tensors.
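+
+        Returns
+        -------
+        tuple of torch.Tensor
+            `t_hidden` and `t_cell`, each of shape (batch_size, hidden_size).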
+        '''
+        t_hidden = torch.randn(batch_size, self.hidden_size)
+        t_cell = torch.zeros(batch_size, self.hidden_size)
+
+        return t_hidden, t_cell
+
+
+if __name__ == '__main__':
+    model = TESLSTM(10, 20, 5)
+
+    t_race = torch.randn(32, 1, 3)
+    t_gender = torch.randn(32, 1, 2)
+    t_name = torch.randn(32, 30, 5)
+    t_hidden, t_cell = model.init_hidden(32)
+
+    print(model(t_race, t_gender, t_name[:, 0:1], t_hidden, t_cell))
diff --git a/trainer.py b/trainer.py
new file mode 100644
index 0000000..c93e49e
--- /dev/null
+++ b/trainer.py
@@ -0,0 +1,85 @@
+from collections import deque
+import string
+from torch.utils.data import DataLoader
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from datasets import TESNamesDataset
+from models import TESLSTM
+from generator import generate
+
+# Configuration.
+data_root = '/home/syafiq/Data/tes-names/'
+charset = string.ascii_letters + '\'- '
+max_length = 30
+learning_rate = 0.0003
+batch_size = 64
+num_epochs = 100
+
+# Prepare dataset/loader.
+dataset = TESNamesDataset(data_root, charset, max_length)
+dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+# GPU execution.
+device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+
+# Prepare model.
+input_size = (
+    len(dataset.race_codec.classes_) +
+    len(dataset.gender_codec.classes_) +
+    len(dataset.char_codec.classes_)
+)
+hidden_size = 128
+output_size = len(dataset.char_codec.classes_)
+
+model = TESLSTM(input_size, hidden_size, output_size)
+model = model.to(device)
+
+# Optimizer.
+criterion = nn.NLLLoss()
+optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+
+losses = deque([], maxlen=100)
+
+# Training.
+for epoch in range(num_epochs):
+    for batch_i, samples in enumerate(dataloader):
+        model.zero_grad()
+
+        t_race, t_gender, t_name = samples
+        t_hidden, t_cell = model.init_hidden(t_race.size(0))
+
+        t_race = t_race.to(device)
+        t_gender = t_gender.to(device)
+        t_name = t_name.to(device)
+        t_hidden = t_hidden.to(device)
+        t_cell = t_cell.to(device)
+
+        loss = 0.
+
+        for char_i in range(max_length - 1):
+            t_char = t_name[:, char_i:char_i+1]
+            t_output, t_hidden, t_cell = \
+                model(t_race, t_gender, t_char, t_hidden, t_cell)
+
+            targets = t_name[:, char_i+1:char_i+2].argmax(dim=2).squeeze(1)
+            loss += criterion(t_output, targets)
+
+        # NLLLoss averages over the batch, so normalize by character steps.
+        losses.append(loss.item() / (max_length - 1))
+        loss.backward()
+        optimizer.step()
+
+        if batch_i % 100 == 0:
+            print(generate('Argonian', 'Male', 'H', dataset, model, device))
+            print('[%03d] %05d/%05d Loss: %.4f' % (
+                epoch + 1,
+                batch_i,
+                len(dataset) // batch_size,
+                sum(losses) / len(losses)
+            ))
+
+torch.save(model.state_dict(), 'model.pt')
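+
+# The saved weights are what generator.py reloads via
+# model.load_state_dict(torch.load('model.pt')) to sample new names.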