-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathdataLoader.py
66 lines (51 loc) · 1.95 KB
/
dataLoader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
'''
written by Lorenz Muller
'''
import numpy as np
from time import time
def _read_ratings(path, delimiter):
    '''
    reads the numeric ratings table from disk
    :param path: path to the ratings file
    :param delimiter: column separator; None means any run of whitespace
    :return: 2-d float array with one row per rating line
    '''
    # single-character (or whitespace) delimiters are handled natively by numpy;
    # ndmin=2 keeps a one-line file two-dimensional
    if delimiter is None or len(delimiter) == 1:
        return np.loadtxt(path, skiprows=0, delimiter=delimiter, ndmin=2)

    # recent numpy releases only accept single-character delimiters, so files
    # using a separator such as the '::' of ml-1m are split by hand
    rows = []
    with open(path) as f:
        for line in f:
            line = line.split('#', 1)[0].strip()  # drop comments like loadtxt does
            if line:
                rows.append([float(field) for field in line.split(delimiter)])
    if not rows:
        return np.empty((0, 0), dtype='float64')
    return np.array(rows, dtype='float64')


def _fill_ratings(matrix, rows, cols, values):
    '''
    writes values into matrix[rows, cols] one entry at a time
    (sequential on purpose: for a repeated (row, col) pair the last value wins)
    '''
    for r, c, v in zip(rows.tolist(), cols.tolist(), values.tolist()):
        matrix[r, c] = v


def loadData(path='./', valfrac=0.1, delimiter='::', seed=1234,
             transpose=False):
    '''
    loads ml-1m data
    the first three columns of every line are read as user id, movie id, rating;
    ids are renumbered contiguously from zero in sorted id order
    note: seeds numpy's global random generator with the given seed
    :param path: path to the ratings file
    :param valfrac: fraction of data to use for validation
                    (round(valfrac * n_ratings) ratings, clamped to [0, n_ratings])
    :param delimiter: delimiter used in data file
    :param seed: random seed for validation splitting
    :param transpose: flag to transpose output matrices (swapping users with movies)
    :return: train ratings (n_u, n_m), valid ratings (n_u, n_m)
             dense float32 matrices, zero where no rating is present
    :raises ValueError: if the file contains no ratings
    '''
    np.random.seed(seed)

    tic = time()
    print('reading data...')
    data = _read_ratings(path, delimiter).astype('int32')
    print('data read in', time() - tic, 'seconds')

    if data.size == 0:
        raise ValueError('no ratings found in %r' % (path,))

    # np.unique yields the sorted ids and, for every rating, the position of
    # its id in that sorted list, i.e. the contiguous user/movie number
    users, u_num = np.unique(data[:, 0], return_inverse=True)
    movies, m_num = np.unique(data[:, 1], return_inverse=True)
    ratings = data[:, 2]

    n_u = users.shape[0]  # number of users
    n_m = movies.shape[0]  # number of movies
    n_r = data.shape[0]  # number of ratings

    # shuffle indices
    idx = np.arange(n_r)
    np.random.shuffle(idx)

    # the first n_valid ratings of the shuffled order are validation data, the
    # rest are training data; an explicit count avoids leaking an extra rating
    # into the validation set (e.g. valfrac=0 must give an empty validation set)
    n_valid = min(max(int(round(valfrac * n_r)), 0), n_r)
    valid_idx = idx[:n_valid]
    train_idx = idx[n_valid:]

    trainRatings = np.zeros((n_u, n_m), dtype='float32')
    validRatings = np.zeros((n_u, n_m), dtype='float32')
    _fill_ratings(validRatings, u_num[valid_idx], m_num[valid_idx], ratings[valid_idx])
    _fill_ratings(trainRatings, u_num[train_idx], m_num[train_idx], ratings[train_idx])

    if transpose:
        trainRatings = trainRatings.T
        validRatings = validRatings.T

    print('loaded dense data matrix')
    return trainRatings, validRatings