forked from isi-nlp/LSTM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSoftmaxLoss.h
160 lines (137 loc) · 4.76 KB
/
SoftmaxLoss.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#ifndef SOFTMAXLOSS_H
#define SOFTMAXLOSS_H
#include <Eigen/Dense>
#include "multinomial.h"
#include "util.h"
namespace nplm
{
// is this cheating?
using Eigen::Matrix;
using Eigen::MatrixBase;
using Eigen::Dynamic;
///// Softmax layer plus log-loss function.
/// Loss functions that can be selected by name.
///   LogLoss     - spelled "log" by the string conversion helpers below.
///   NCELoss     - spelled "nce" by the string conversion helpers below.
///   InvalidLoss - what string_to_loss_function returns for any other name.
enum loss_function_type { LogLoss, NCELoss, InvalidLoss };
/// Parses a loss-function name.
///
/// @param s  name to look up; the comparison is exact (case-sensitive).
/// @return   LogLoss for "log", NCELoss for "nce", and InvalidLoss for
///           every other string (no error is raised).
inline loss_function_type string_to_loss_function (const std::string &s)
{
    if (s.compare("log") == 0)
    {
        return LogLoss;
    }
    if (s.compare("nce") == 0)
    {
        return NCELoss;
    }
    return InvalidLoss;
}
/// Converts a loss-function identifier to its textual name.
///
/// @param f  loss function to name.
/// @return   "log" for LogLoss, "nce" for NCELoss, and "invalid" for
///           InvalidLoss or any other value.
///
/// Every path returns a value: flowing off the end of a function that
/// returns std::string is undefined behaviour, which is what used to happen
/// for InvalidLoss. The string "invalid" is not a recognised name, so
/// string_to_loss_function maps it back to InvalidLoss.
inline std::string loss_function_to_string (loss_function_type f)
{
    switch (f)
    {
        case LogLoss:
            return "log";
        case NCELoss:
            return "nce";
        case InvalidLoss:
            break;
    }
    // Reached for InvalidLoss and for any out-of-range enum value.
    return "invalid";
}
/// Softmax layer combined with the log-loss objective.
/// Note: Outputs log-probabilities.
/// A target word of -1 means "no sample" for that column; both passes skip it.
struct SoftmaxLogLoss
{
    /// Forward pass.
    ///
    /// For every column whose target word is not -1, subtracts the column's
    /// logsum from the column (giving log-probabilities) and adds the entry
    /// at the target row to the running total.
    ///
    /// @param input         scores; processed one column at a time.
    /// @param output_words  per-column target row index, or -1 to skip the column.
    /// @param output_const  destination for the normalized columns. It is passed
    ///                      as const but written to via the UNCONST macro
    ///                      (defined outside this file). Skipped columns are not
    ///                      written at all.
    /// @param loss          overwritten with the summed log-probability of the
    ///                      target words over all non-skipped columns.
    template <typename DerivedI, typename DerivedW, typename DerivedO>
    void fProp(const MatrixBase<DerivedI> &input,
               const MatrixBase<DerivedW> &output_words,
               const MatrixBase<DerivedO> &output_const,
               double &loss)
    {
        UNCONST(DerivedO, output_const, output);

        double total_log_prob = 0.0;
        // Each iteration writes only its own column of `output`; the running
        // total is combined across threads by the reduction clause.
        #pragma omp parallel for reduction(+:total_log_prob)
        for (int col = 0; col < input.cols(); col++)
        {
            const auto target_word = output_words(col);
            if (target_word != -1)
            {
                const double log_z = logsum(input.col(col));
                output.col(col).array() = input.col(col).array() - log_z;
                total_log_prob += output(target_word, col);
            }
        }

        loss = total_log_prob;
    }

    /// Backward pass.
    ///
    /// Fills the gradient with (one-hot target) - exp(output) for every column
    /// whose target word is not -1; skipped columns stay all-zero.
    ///
    /// @param output_words      per-column target row index, or -1 to skip.
    /// @param output            log-probabilities; expected to be what fProp
    ///                          wrote, since they are exponentiated here.
    /// @param grad_input_const  destination for the gradient. It is passed as
    ///                          const but written to via the UNCONST macro.
    template <typename DerivedW, typename DerivedO, typename DerivedI>
    void bProp(const MatrixBase<DerivedW> &output_words,
               const MatrixBase<DerivedO> &output,
               const MatrixBase<DerivedI> &grad_input_const)
    {
        UNCONST(DerivedI, grad_input_const, grad_input);
        grad_input.setZero();

        // Columns are independent, so no reduction is needed here.
        #pragma omp parallel for
        for (int col = 0; col < output.cols(); col++)
        {
            const auto target_word = output_words(col);
            if (target_word != -1)
            {
                grad_input(target_word, col) += 1.;
                grad_input.col(col) -= output.col(col).array().exp().matrix();
            }
        }
    }
};
///// Softmax layer plus NCE (noise-contrastive estimation) loss function.
///// Note: Outputs probabilities.
///// Note: Unlike SoftmaxLogLoss, does not compute *or* apply precomputed
///// normalizations. Currently the caller is expected to do normalization.
template <typename Multinomial>
class SoftmaxNCELoss
{
    // Noise distribution; only its logprob() member is used here. Held by
    // reference, so the referenced object must outlive this loss object.
    const Multinomial &unigram_;

public:
    /// @param unigram  noise distribution queried through logprob(sample).
    SoftmaxNCELoss(const Multinomial &unigram)
      : unigram_(unigram)
    {
    }

    /// Forward pass.
    ///
    /// For each (row, column) entry, compares the model score against the
    /// noise score log(#noise samples) + unigram.logprob(sample) and stores
    /// the probability that the sample came from the model.
    ///
    /// @param scores             model scores, indexed like minibatch_samples.
    /// @param minibatch_samples  sample ids. Row 0 of each column is scored as
    ///                           the model (presumably the observed word) and
    ///                           the remaining rows as noise, so the noise
    ///                           count is rows() - 1.
    /// @param output_const       destination for the per-entry probabilities.
    ///                           It is passed as const but written to via the
    ///                           UNCONST macro (defined outside this file).
    /// @param loss               overwritten with the summed log-likelihood:
    ///                           log P(model) for row 0, log P(noise) otherwise.
    template <typename DerivedI, typename DerivedW, typename DerivedO>
    void fProp(const MatrixBase<DerivedI> &scores,
               const MatrixBase<DerivedW> &minibatch_samples,
               const MatrixBase<DerivedO> &output_const, double &loss)
    {
        UNCONST(DerivedO, output_const, output);

        const int num_noise_samples = minibatch_samples.rows()-1;
        const double log_num_noise_samples = std::log(num_noise_samples);

        double total_log_prob = 0.0;
        #pragma omp parallel for reduction(+:total_log_prob) schedule(static)
        for (int col = 0; col < scores.cols(); col++)
        {
            for (int row = 0; row < minibatch_samples.rows(); row++)
            {
                const int word = minibatch_samples(row, col);

                // Stay in log space until after normalizing: exponentiating
                // a raw score could give zero or infinity. Slightly slower,
                // but numerically safe.
                const double model_score = scores(row, col);
                const double noise_score = log_num_noise_samples + unigram_.logprob(word);
                const double log_z = logadd(model_score, noise_score);
                const double log_p_model = model_score - log_z;
                const double log_p_noise = noise_score - log_z;

                output(row, col) = std::exp(log_p_model);

                if (row == 0)
                {
                    total_log_prob += log_p_model;
                }
                else
                {
                    total_log_prob += log_p_noise;
                }
            }
        }

        loss = total_log_prob;
    }

    /// Backward pass.
    ///
    /// Writes, per column, the negated probabilities with 1 added to row 0:
    /// row 0 becomes 1 - p and every other row becomes -p.
    ///
    /// @param probs         per-entry probabilities; expected to be what fProp
    ///                      stored in its output.
    /// @param output_const  destination for the gradient. It is passed as const
    ///                      but written to via the UNCONST macro.
    template <typename DerivedO, typename DerivedI>
    void bProp(const MatrixBase<DerivedO> &probs,
               const MatrixBase<DerivedI> &output_const)
    {
        UNCONST(DerivedI, output_const, output);

        #pragma omp parallel for schedule(static)
        for (int col = 0; col < probs.cols(); col++)
        {
            output.col(col) = -probs.col(col);
            output(0, col) += 1.0;
        }
    }
};
} // namespace nplm
#endif