"""
This module contains the composite word2vec model.
"""
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
class SkipGramNSModel(nn.Module):
    """
    Word2Vec model ("skip-gram") with a negative sampling loss function.

    This is a self-supervised model where (input, context) pairs of words
    are trained to be "close" to each other within their projected embedding space.
    It can be thought of as a member of the "metric learning" class of problems.

    The brilliant insight here is that, rather than treating the problem as a typical
    softmax prediction, the negative sampling loss function recasts it as
    k+1 concurrent binary classification problems.

    This model is presented as the foundation for appreciating the power of
    unsupervised learning in NLP, as we will see in later models as well (e.g. GPT).
    """
    def __init__(self, vocab_size: int, embedding_size: int, negative_sample_size: int,
                 word_frequency: torch.Tensor):
        """
        Args:
            vocab_size (int): size of the vocabulary
            embedding_size (int): size of the embeddings
            negative_sample_size (int): number of negative examples to be sampled for the loss function
            word_frequency (torch.Tensor): the word frequencies from the vocabulary
                (to be used for selecting negative examples)
        """
        super(SkipGramNSModel, self).__init__()
        # model hyper-parameters
        self._vocab_size = vocab_size
        self._embedding_size = embedding_size
        self._negative_sample_size = negative_sample_size  # "k"
        # (1) input embedding (input word)
        self._input_embedding = nn.Embedding(vocab_size, embedding_size)
        # (2) output embedding (context word, sampled from context window)
        self._output_embedding = nn.Embedding(vocab_size, embedding_size)
        # word frequencies used in the negative sampling loss
        # transform suggested in the paper for the unigram distribution U(w): raise to the
        # 3/4 power and renormalize, which dampens frequent words and boosts rare ones
        word_frequency = torch.pow(word_frequency, 0.75)
        word_frequency = word_frequency / word_frequency.sum()
        self._word_frequency = word_frequency
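        # Illustrative numbers (not from the original code): raw frequencies
        # [0.90, 0.09, 0.01] become roughly [0.83, 0.15, 0.03] after the 3/4-power
        # transform and renormalization, so frequent words are sampled somewhat less
        # often and rare words somewhat more often as negatives.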
        # Init weights
        self._init_weights()
        self._device = 'cpu'
        if torch.cuda.is_available():
            self._device = torch.cuda.current_device()
    def _init_weights(self):
        """
        Initializes the weights of the embedding vectors.
        Here the embeddings are drawn from a uniform
        U(-0.5 / embedding_size, 0.5 / embedding_size) distribution.
        """
        self._input_embedding.weight.data.uniform_(-0.5 / self._embedding_size,
                                                   0.5 / self._embedding_size)
        self._output_embedding.weight.data.uniform_(-0.5 / self._embedding_size,
                                                    0.5 / self._embedding_size)
    def forward(self, data: tuple) -> torch.Tensor:
        """
        The main call function of the model; note that it returns the loss rather than
        a prediction.
        Args:
            data (tuple): batches of (input_word, context_word)
        Returns:
            0-d tensor with the loss
        """
        # unpack the input and context index tensors
        input_word, context_word = data
        batch_size = context_word.size(0)
        # (1) Grab negative sample indices (batch_size, k)
        # Rather than negative sampling during the construction of the dataset,
        # we draw negative samples here, directly from the embedding matrix.
        # Note: the word frequencies are set to 0 for special tokens, which therefore
        # will never be sampled.
        negative_sample_indices = torch.multinomial(self._word_frequency,
                                                    batch_size * self._negative_sample_size,
                                                    replacement=True).view(batch_size, -1)
        negative_sample_indices = negative_sample_indices.to(self._device)
        # Alternative: if no information on word frequencies is available, sample uniformly.
        # negative_sample_indices = torch.randint(0, self._vocab_size,
        #                                         size=(batch_size, self._negative_sample_size))
        # (2) Look up the input and context word embeddings
        # (batch_size,) -> (batch_size, embedding_size) -> (batch_size, embedding_size, 1)
        input_vectors = self._input_embedding(input_word).unsqueeze(2)
        output_vectors = self._output_embedding(context_word).unsqueeze(2)
        # (3) Look up the negative context word embeddings (negated for the loss below)
        # (batch_size, k) -> (batch_size, k, embedding_size)
        neg_output_vectors = self._output_embedding(negative_sample_indices).neg()
        # (4) Calculate the loss function:
        # 1 target=1 binary classification + k target=0 binary classifications.
        # The positive term is log sigmoid of the dot product between the context and
        # input vectors; the negative term sums log sigmoid of the negated dot products
        # over the k sampled words.
        pos_score = torch.bmm(output_vectors.transpose(1, 2), input_vectors).view(batch_size)
        pos_loss = F.logsigmoid(pos_score)
        neg_score = torch.bmm(neg_output_vectors, input_vectors).squeeze(2)
        neg_loss = F.logsigmoid(neg_score).sum(1)
        # (5) Return the collapsed (negated, batch-averaged) loss
        return -torch.mean(pos_loss + neg_loss)
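    # For reference, the negative-sampling objective from Mikolov et al. (2013) that
    # forward() follows, for input word w_I, context word w_O and k negative words w_i
    # drawn from the smoothed unigram distribution P_n(w):
    #
    #     log sigmoid(v'_{w_O} . v_{w_I}) + sum_{i=1..k} log sigmoid(-v'_{w_i} . v_{w_I})
    #
    # forward() returns the negated batch mean of this quantity, so minimizing the
    # returned value maximizes the objective.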
    def get_embeddings(self) -> torch.Tensor:
        """
        Returns the input_embeddings associated with each word.
        Note: the output_embeddings would have a different meaning, as they represent
        context words.
        Returns:
            (vocab_size, embedding_size) torch.Tensor of values
        """
        return self._input_embedding.weight.data
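

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: the vocabulary size,
    # embedding size, k, learning rate and the uniform word_frequency tensor below are
    # hypothetical placeholders. A real pipeline would build word_frequency from corpus
    # counts (with special-token frequencies zeroed) and feed real (input, context) pairs.
    vocab_size, embedding_size, k = 1000, 64, 5
    word_frequency = torch.ones(vocab_size, dtype=torch.float)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SkipGramNSModel(vocab_size, embedding_size, k, word_frequency).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

    # One training step on a random batch of (input_word, context_word) index pairs.
    input_word = torch.randint(0, vocab_size, (32,), device=device)
    context_word = torch.randint(0, vocab_size, (32,), device=device)
    loss = model((input_word, context_word))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # The learned (input) embeddings, shape (vocab_size, embedding_size).
    embeddings = model.get_embeddings()
    print(loss.item(), embeddings.shape)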