-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataset.cc
155 lines (147 loc) · 4.11 KB
/
dataset.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#include "dataset.h"
#include <cmath>
#include <iostream>
#include <set>
#include <string>
#include <utility>
#include <vector>
using std::make_pair;
using std::pair;
using std::set;
using std::sqrt;
using std::string;
using std::vector;
// Ingests one raw row: updates the per-column statistics, then stores the row
// in examples_.
//
// For every column i in field_names_, fields[i] is parsed with strtod:
//   * If strtod stops at the terminating NUL, the value is treated as numeric
//     and folded into range_index_[i] (min, max) and means_variances_[i]
//     (running mean, running population variance).
//   * Otherwise the raw string is recorded in field_index_[i] as a
//     categorical value.
//
// Precondition (not checked): fields has at least field_names_.size() entries.
// NOTE(review): an empty field also ends parsing at the NUL, so it is counted
// as the numeric value 0 -- confirm that this is intended.
void Dataset::add_row(const vector<string>& fields) {
  for (size_t i = 0; i < field_names_.size(); i++) {
    char* endptr = nullptr;
    const float val = strtod(fields[i].data(), &endptr);

    if (*endptr != 0) {
      // Unparsed trailing characters: not a number, so record the category.
      field_index_[i].insert(fields[i]);
      continue;
    }

    // Track the observed [min, max] for this column.
    auto range_pair_iter = range_index_.find(i);
    if (range_pair_iter == range_index_.end()) {
      range_index_[i] = make_pair(val, val);
    } else {
      auto& range_pair = range_pair_iter->second;
      if (range_pair.first > val) {
        range_pair.first = val;
      }
      if (range_pair.second < val) {
        range_pair.second = val;
      }
    }

    // Track the running mean and variance for this column.
    auto mv = means_variances_.find(i);
    if (mv == means_variances_.end()) {
      means_variances_[i] = make_pair(val, 0.0);
    } else {
      auto& mv_pair = mv->second;
      // examples_ does not yet contain the current row, so its size is the
      // number of rows seen before this one. The count is converted to
      // floating point up front: with an integer n, n / (n + 1) truncates to
      // zero and wipes out the variance on every update.
      const float n = static_cast<float>(examples_.size());
      const float delta = mv_pair.first - val;
      // The variance must be updated before the mean, because the formula
      // uses the previous mean.
      mv_pair.second = (n / (n + 1)) *
                       (mv_pair.second + (delta * delta) / (n + 1));
      mv_pair.first = (mv_pair.first * n + val) / (n + 1);
    }
  }
  examples_.push_back(fields);
}
// Maps val from the interval [min_val, max_val] onto [0, scale_].
//
// Returns val untouched when scale_ is zero, and 0 when the interval is
// degenerate (min_val == max_val), which avoids dividing by zero.
float Dataset::scale(float val, float min_val, float max_val) {
  const bool scaling_disabled = !(scale_ != 0.0);
  if (scaling_disabled) {
    return val;
  }
  if (min_val == max_val) {
    return 0.0;
  }
  return ((val - min_val) * scale_ / (max_val - min_val));
}
/*
float Dataset::standardize(float val, float mean,
float std_dev) {
return (val - mean) / std_dev;
}
*/
float Dataset::unscale_label(float val) {
auto range = range_index_.find(label_index_);
if (range == range_index_.end()) {
// No scaling; just return as is.
return val;
}
return (val * (range->second.second - range->second.first)) +
range->second.first;
}
/*
float Dataset::unstandardize_label(float val) {
auto mv = means_variances_.find(label_index_);
if (mv == means_variances_.end()) {
// No standardization; return as is.
return val;
}
return (val * mv->second.second) + mv->second.first;
}
*/
// Finalizes the dataset after all rows have been added.
//
// Builds the output feature layout: for every non-label column,
// feature_index_ receives the position in output_features_ where that
// column's features start. A column with an entry in field_index_ expands to
// one feature per recorded value, named "<field>_<value>"; any other column
// contributes a single feature named after the field.
//
// Then converts the stored variances into standard deviations and rewinds the
// iteration cursor pos_ to the first example.
// NOTE(review): calling this more than once would append the layout again and
// take the square root a second time.
void Dataset::process_features() {
  for (size_t i = 0; i < field_names_.size(); i++) {
    if (i == label_index_) {
      continue;
    }
    // Both kinds of column start at the current end of output_features_.
    feature_index_.push_back(output_features_.size());
    auto fields = field_index_.find(i);
    if (fields != field_index_.end()) {
      for (const string& feature_value : fields->second) {
        output_features_.push_back(field_names_[i] + "_" + feature_value);
      }
    } else {
      output_features_.push_back(field_names_[i]);
    }
  }
  // Change variances to square roots, just to avoid recomputing
  // standard deviations later. Iterate by reference: iterating by value would
  // modify a temporary copy and leave the stored variances untouched.
  for (auto& mv : means_variances_) {
    mv.second.second = sqrt(mv.second.second);
  }
  pos_ = examples_.begin();
}
void Dataset::process_example(const vector<string>& fields,
pair<vector<float>, float>* example) {
vector<float>& features = example->first;
features.clear();
features.reserve(output_features_.size());
for (size_t i = 0; i < field_names_.size(); i++) {
if (i == label_index_) {
float val = strtod(fields[i].data(), nullptr);
auto range = range_index_.find(i);
if (range != range_index_.end()) {
example->second = scale(val, range->second.first,
range->second.second);
} else {
example->second = val;
}
continue;
}
auto range = range_index_.find(i);
if (range != range_index_.end()) {
float val = strtod(fields[i].data(), nullptr);
features.push_back(scale(val, range->second.first,
range->second.second));
} else {
for (const string& value : field_index_[i]) {
features.push_back(fields[i] == value ? 1.0 : 0.0);
}
}
}
}
// Produces the next example, advancing the iteration cursor pos_.
//
// Returns false without touching *example when hasNext() reports that no rows
// remain; otherwise fills *example via process_example() and returns true.
bool Dataset::next(pair<vector<float>, float>* example) {
  if (!hasNext()) {
    return false;
  }
  // Bind a reference to the stored row instead of copying every string in it;
  // the row stays owned by examples_.
  const vector<string>& row = *pos_;
  ++pos_;
  process_example(row, example);
  return true;
}