-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnetflix_handler.cpp
144 lines (107 loc) · 4.65 KB
/
netflix_handler.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#include "util/Base.h"
#include "struct/Rating.h"
#include "util/FileUtil.h"
using namespace mf;
/**
 * Scans a rating file and assigns a compact integer index to every user id
 * and item id that appears in it.
 *
 * The first `skip_line` lines are discarded. Every following record is read
 * as three whitespace-separated fields `<uid> <mid> <rate>`; reading stops at
 * end of file or at the first record that does not parse. An id that is not
 * yet a key of its map is inserted with the map's current size as its value,
 * so ids are numbered in order of first appearance. Ids already present in
 * the maps keep their existing index.
 *
 * @param input_path path of the rating file to scan
 * @param uid_map    in/out: original user id -> assigned index
 * @param mid_map    in/out: original item id -> assigned index
 * @param skip_line  number of leading lines to discard before parsing
 *
 * Terminates the process with a failure status if the file cannot be opened.
 */
void transformIds(const string &input_path, unordered_map<int, int> &uid_map, unordered_map<int, int> &mid_map,
                  const int skip_line) {
    ifstream fin(input_path.c_str());
    if (!fin.is_open()) {
        // Report on stderr and exit non-zero so calling scripts can detect the failure.
        cerr << "fail to open the file: " << input_path << endl;
        exit(EXIT_FAILURE);
    }

    // Discard the header; stop early if the file is shorter than the header.
    int line_counter = 0;
    string row;
    while (line_counter < skip_line && getline(fin, row)) {
        line_counter++;
    }

    int uid, mid;
    float rate;  // parsed only to consume the third field of each record
    while (fin >> uid >> mid >> rate) {
        line_counter++;

        // Print the progress.
        if (line_counter % 1000000 == 0) {
            cout << "read " << line_counter << " lines...\n";
            cout.flush();
        }

        // Remap the ids: the size is taken before insertion, so it is the next free index.
        if (uid_map.find(uid) == uid_map.end()) {
            uid_map.emplace(uid, static_cast<int>(uid_map.size()));
        }
        if (mid_map.find(mid) == mid_map.end()) {
            mid_map.emplace(mid, static_cast<int>(mid_map.size()));
        }
    }
    fin.close();
}
int convertFile(const string &input_path, const string &outputFolder, const string &outputMMCFileName,
const string &outputCSRFileName, unordered_map<int, int> &uid_map,
unordered_map<int, int> &mid_map, const int skip_line, const string &cumf_prefix) {
vector<Rating> ratings;
ifstream fin(input_path.c_str());
fin.tie(nullptr);
int line_counter = 0;
string row;
while (line_counter < skip_line) {
getline(fin, row);
line_counter++;
}
int uid, mid;
string rate;
while (fin >> uid >> mid >> rate) {
line_counter++;
if (line_counter % 1000000 == 0) {
cout << "load " << line_counter << " lines...\n";
cout.flush();
}
ratings.push_back(Rating(uid_map[uid], mid_map[mid], rate));
}
fin.close();
writeMatrix(ratings, uid_map.size(), mid_map.size(), outputFolder, outputMMCFileName, outputCSRFileName, cumf_prefix);
return line_counter - skip_line;
}
/**
 * Command-line entry point: builds the id maps from the training file,
 * converts the training and testing files, then writes the meta file and the
 * two id-map files into the output folder.
 *
 * Run with --help to list the options and their defaults. Returns 0 on
 * success and a failure status when the command line cannot be parsed.
 */
int main(int argc, char const *argv[]) {
    // Number of leading lines skipped in both input files.
    // NOTE(review): presumably the MatrixMarket banner/size header - confirm against the raw data.
    const int kHeaderLines = 3;

    string outputFolder;
    string trainPath;
    string testPath;
    string metaPath;
    string userIDMapPath;
    string itemIDMapPath;
    string outputMMCTrainPath;
    string outputMMCTestPath;
    string outputCSRTrainPath;
    string outputCSRTestPath;

    po::options_description desc("Allowed options");
    desc.add_options()
        ("help", "produce help message")
        ("i_train", po::value<string>(&trainPath)->default_value("../raw_data/netflix/netflix_mm"), "path to original training file")
        ("i_test", po::value<string>(&testPath)->default_value("../raw_data/netflix/netflix_mme"), "path to original testing file")
        ("o_folder", po::value<string>(&outputFolder)->default_value("../data/netflix/"), "path to output folder")
        ("meta_path", po::value<string>(&metaPath)->default_value("meta"), "name of meta file")
        ("user_id_map_path", po::value<string>(&userIDMapPath)->default_value("user_id_map.dat"), "name of user id map file")
        ("item_id_map_path", po::value<string>(&itemIDMapPath)->default_value("item_id_map.dat"), "name of item id map file")
        ("o_mmc_train", po::value<string>(&outputMMCTrainPath)->default_value("train.mmc"), "name of output MMC training file")
        ("o_mmc_test", po::value<string>(&outputMMCTestPath)->default_value("test.mmc"), "name of output MMC testing file")
        ("o_csr_train", po::value<string>(&outputCSRTrainPath)->default_value("train.csr"), "name of output CSR training file")
        ("o_csr_test", po::value<string>(&outputCSRTestPath)->default_value("test.csr"), "name of output CSR testing file");

    po::variables_map vm;
    try {
        po::store(po::parse_command_line(argc, argv, desc), vm);
        po::notify(vm);
    } catch (const std::exception &e) {
        // Unknown or malformed options throw; report them instead of aborting.
        cerr << "invalid command line: " << e.what() << "\n" << desc << endl;
        return EXIT_FAILURE;
    }

    if (vm.count("help")) {
        cout << desc << endl;
        return 0;
    }

    // The id maps are built from the training file only.
    unordered_map<int, int> uid_map;
    unordered_map<int, int> mid_map;
    transformIds(trainPath, uid_map, mid_map, kHeaderLines);

    const int num_train = convertFile(trainPath, outputFolder, outputMMCTrainPath, outputCSRTrainPath, uid_map,
                                      mid_map, kHeaderLines, "R_train_");
    const int num_test = convertFile(testPath, outputFolder, outputMMCTestPath, outputCSRTestPath, uid_map,
                                     mid_map, kHeaderLines, "R_test_");

    // File names are appended directly to outputFolder, so it is expected to end with a path separator.
    writeMeta(outputFolder + metaPath, uid_map.size(), mid_map.size(), num_train, num_test, outputCSRTrainPath, outputCSRTestPath);
    writeKeyMap(outputFolder + userIDMapPath, uid_map);
    writeKeyMap(outputFolder + itemIDMapPath, mid_map);

    return 0;
}