# create_large_dataset_mapping.py
# Packs 128x128x200 volumes from .mat files, together with their remapped
# one-hot labels, into a single HDF5 file with a 90% train / 10% test split.
import glob
import re
from random import shuffle

import h5py
import numpy as np
import scipy.io
from keras.utils.np_utils import to_categorical  # Keras 2.x path; under tf.keras this lives in tensorflow.keras.utils
numbers = re.compile(r'(\d+)')

def numericalSort(value):
    """Sort key that orders filenames by their embedded integers (natural sort)."""
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts
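# For example (hypothetical filenames), this gives a natural ordering that a
# plain lexicographic sort would reverse:
#   sorted(['scan_10.mat', 'scan_2.mat'], key=numericalSort)
#   -> ['scan_2.mat', 'scan_10.mat']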

def mapping(x):
    """Remap the negative class ids -1..-5 to 6..10; ids 0..5 pass through.

    The result is 11 non-negative classes, matching num_classes=11 below.
    """
    if x == -1:
        return 6
    elif x == -2:
        return 7
    elif x == -3:
        return 8
    elif x == -4:
        return 9
    elif x == -5:
        return 10
    else:
        return x
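# An equivalent table-driven form (a sketch only; behavior unchanged):
#   NEGATIVE_REMAP = {-1: 6, -2: 7, -3: 8, -4: 9, -5: 10}
#   def mapping(x):
#       return NEGATIVE_REMAP.get(x, x)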

shuffle_data = False  # set to True and enable the block below to shuffle before splitting
hdf5_path = 'dataset_mapping.hdf5'  # path where the output HDF5 file is written

# collect the .mat file paths in natural-sort order
train_parent_directory = sorted(glob.glob('data/*.mat'), key=numericalSort)
addrs = train_parent_directory

# to shuffle the addresses before splitting (disabled by default):
# if shuffle_data:
#     shuffle(addrs)

# Divide the data into 90% train and 10% test. The "label" lists mirror the
# address lists; the actual labels are read from each .mat file in the loops below.
split = int(0.9 * len(addrs))
train_addrs = addrs[:split]
train_labels = addrs[:split]
test_addrs = addrs[split:]
test_labels = addrs[split:]
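# Worked example (hypothetical count): with 100 .mat files, split = int(0.9 * 100) = 90,
# so files 0..89 form the train set and files 90..99 the test set.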

# each sample is a 128x128x200 volume with two label slots, one-hot over 11 classes
train_shape = (len(train_addrs), 128, 128, 200)
test_shape = (len(test_addrs), 128, 128, 200)
train_Y_shape = (len(train_labels), 2, 11)
test_Y_shape = (len(test_labels), 2, 11)

# open the HDF5 file and create fixed-size datasets (third positional arg is the dtype)
hdf5_file = h5py.File(hdf5_path, mode='w')
hdf5_file.create_dataset("train_img", train_shape, np.float32)
hdf5_file.create_dataset("test_img", test_shape, np.float32)
hdf5_file.create_dataset("train_labels", train_Y_shape, np.int8)
hdf5_file.create_dataset("test_labels", test_Y_shape, np.int8)

# optional: a dataset and accumulator for the mean of the train images
# hdf5_file.create_dataset("train_mean", train_shape[1:], np.float32)
# mean = np.zeros(train_shape[1:], np.float32)

# loop over train addresses
for i in range(len(train_addrs)):
    # report progress every 1000 files
    if i % 1000 == 0 and i > 1:
        print('Train data: {}/{}'.format(i, len(train_addrs)))

    # load the 128x128x200 volume and its 1x2 label vector from the .mat file
    addr = train_addrs[i]
    mat = scipy.io.loadmat(addr)
    img = mat["arr"]
    vector = mat["vector"]

    # remap any negative ids, then one-hot encode both label slots over 11 classes
    new_vector = vector.copy()
    new_vector[0][0] = mapping(vector[0][0])
    new_vector[0][1] = mapping(vector[0][1])
    categorical_labels = to_categorical(new_vector[0], num_classes=11)
    print(addr)

    # write the sample into the preallocated datasets
    hdf5_file["train_img"][i, ...] = img
    hdf5_file["train_labels"][i, ...] = categorical_labels
    # mean += img / float(len(train_labels))  # optional running-mean accumulation

print('------------------')

# loop over test addresses
for i in range(len(test_addrs)):
    # report progress every 1000 files
    if i % 1000 == 0 and i > 1:
        print('Test data: {}/{}'.format(i, len(test_addrs)))

    # load the volume and label vector, then remap and one-hot encode as above
    addr = test_addrs[i]
    mat = scipy.io.loadmat(addr)
    img = mat["arr"]
    vector = mat["vector"]

    new_vector = vector.copy()
    new_vector[0][0] = mapping(vector[0][0])
    new_vector[0][1] = mapping(vector[0][1])
    categorical_labels = to_categorical(new_vector[0], num_classes=11)
    print(addr)

    hdf5_file["test_img"][i, ...] = img
    hdf5_file["test_labels"][i, ...] = categorical_labels

# optional: save the accumulated mean, then close the file
# hdf5_file["train_mean"][...] = mean
hdf5_file.close()
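
# --- Optional sanity check (a minimal sketch, not part of the original script) ---
# Reopen the file and confirm the dataset shapes and that every label row is one-hot.
with h5py.File(hdf5_path, 'r') as f:
    print('train_img:', f['train_img'].shape)        # (N_train, 128, 128, 200)
    print('train_labels:', f['train_labels'].shape)  # (N_train, 2, 11)
    assert np.all(np.asarray(f['train_labels']).sum(axis=-1) == 1)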