preprocess_Oppdata.py
# Adapted from: https://github.com/sussexwearlab/DeepConvLSTM
from signal_filtering import filter_opportunity_datasets_accelerometers
import os
import zipfile
import argparse
import numpy as np
import pickle as cp
from io import BytesIO
from pandas import Series
# Hardcoded number of sensor channels employed in the OPPORTUNITY challenge
NB_SENSOR_CHANNELS = 113
NB_SENSOR_CHANNELS_WITH_FILTERING = 149  # = 77 gyro/other channels + 36*2 accelerometer channels
# Hardcoded names of the files defining the OPPORTUNITY challenge data. As named in the original data.
OPPORTUNITY_DATA_FILES_TRAIN = [
'OpportunityUCIDataset/dataset/S1-Drill.dat',
'OpportunityUCIDataset/dataset/S1-ADL1.dat',
'OpportunityUCIDataset/dataset/S1-ADL2.dat',
'OpportunityUCIDataset/dataset/S1-ADL3.dat',
'OpportunityUCIDataset/dataset/S1-ADL4.dat',
'OpportunityUCIDataset/dataset/S1-ADL5.dat',
'OpportunityUCIDataset/dataset/S2-Drill.dat',
'OpportunityUCIDataset/dataset/S2-ADL1.dat',
'OpportunityUCIDataset/dataset/S2-ADL2.dat',
'OpportunityUCIDataset/dataset/S2-ADL3.dat',
'OpportunityUCIDataset/dataset/S3-Drill.dat',
'OpportunityUCIDataset/dataset/S3-ADL1.dat',
'OpportunityUCIDataset/dataset/S3-ADL2.dat',
'OpportunityUCIDataset/dataset/S3-ADL3.dat'
]
OPPORTUNITY_DATA_FILES_TEST = [
'OpportunityUCIDataset/dataset/S2-ADL4.dat',
'OpportunityUCIDataset/dataset/S2-ADL5.dat',
'OpportunityUCIDataset/dataset/S3-ADL4.dat',
'OpportunityUCIDataset/dataset/S3-ADL5.dat'
]
NORM_MAX_THRESHOLDS = [3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000,
3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000,
3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000,
3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000,
3000, 3000, 3000, 10000, 10000, 10000, 1500, 1500, 1500,
3000, 3000, 3000, 10000, 10000, 10000, 1500, 1500, 1500,
3000, 3000, 3000, 10000, 10000, 10000, 1500, 1500, 1500,
3000, 3000, 3000, 10000, 10000, 10000, 1500, 1500, 1500,
3000, 3000, 3000, 10000, 10000, 10000, 1500, 1500, 1500,
250, 25, 200, 5000, 5000, 5000, 5000, 5000, 5000,
10000, 10000, 10000, 10000, 10000, 10000, 250, 250, 25,
200, 5000, 5000, 5000, 5000, 5000, 5000, 10000, 10000,
10000, 10000, 10000, 10000, 250, ]
NORM_MIN_THRESHOLDS = [-3000, -3000, -3000, -3000, -3000, -3000, -3000, -3000, -3000,
-3000, -3000, -3000, -3000, -3000, -3000, -3000, -3000, -3000,
-3000, -3000, -3000, -3000, -3000, -3000, -3000, -3000, -3000,
-3000, -3000, -3000, -3000, -3000, -3000, -3000, -3000, -3000,
-3000, -3000, -3000, -10000, -10000, -10000, -1000, -1000, -1000,
-3000, -3000, -3000, -10000, -10000, -10000, -1000, -1000, -1000,
-3000, -3000, -3000, -10000, -10000, -10000, -1000, -1000, -1000,
-3000, -3000, -3000, -10000, -10000, -10000, -1000, -1000, -1000,
-3000, -3000, -3000, -10000, -10000, -10000, -1000, -1000, -1000,
-250, -100, -200, -5000, -5000, -5000, -5000, -5000, -5000,
-10000, -10000, -10000, -10000, -10000, -10000, -250, -250, -100,
-200, -5000, -5000, -5000, -5000, -5000, -5000, -10000, -10000,
-10000, -10000, -10000, -10000, -250, ]
def select_columns_opp(data):
    """Selects the subset of sensor columns used for the challenge.

    Keeps the time column, the accelerometer/gyroscope/magnetometer/quaternion
    sensor channels (113 in total) and the remaining label columns; everything
    else is dropped.

    :param data: numpy integer matrix
        Sensor data (all features)
    :return: numpy integer matrix
        Data restricted to the selected columns
    """
    # An earlier variant selected only the RLA accelerometer/gyroscope columns
    # plus the LL_Right_Arm label: data = data[:, list(range(63, 69)) + [247]]
    # ACC/GYRO/MAG/QUAT only: build the list of column indices to drop
    features_delete = np.arange(46, 50)
    features_delete = np.concatenate([features_delete, np.arange(59, 63)])
    features_delete = np.concatenate([features_delete, np.arange(72, 76)])
    features_delete = np.concatenate([features_delete, np.arange(85, 89)])
    features_delete = np.concatenate([features_delete, np.arange(98, 102)])
    features_delete = np.concatenate([features_delete, np.arange(134, 243)])
    features_delete = np.concatenate([features_delete, np.arange(244, 249)])
    return np.delete(data, features_delete, 1)
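# A quick shape check of the selection above, as an illustration: raw
# OPPORTUNITY .dat rows have 250 columns, and after the deletions above
# 116 remain (time + 113 sensor channels + 2 label columns).
#   >>> select_columns_opp(np.zeros((1, 250))).shape
#   (1, 116)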
# def normalize(data, max_list, min_list):
# """Normalizes all sensor channels
# :param data: numpy integer matrix
# Sensor data
# :param max_list: numpy integer array
#     Array containing maximum values for every one of the 113 sensor channels
# :param min_list: numpy integer array
# Array containing minimum values for every one of the 113 sensor channels
# :return:
# Normalized sensor data
# """
# max_list, min_list = np.array(max_list), np.array(min_list)
# diffs = max_list - min_list
# for i in np.arange(data.shape[1]):
# data[:, i] = (data[:, i]-min_list[i])/diffs[i]
# # Checking the boundaries
# data[data > 1] = 0.99
# data[data < 0] = 0.00
# return data
def normalize(x):
    """Normalizes all sensor channels by mean subtraction,
    then divides by twice the standard deviation.

    :param x: numpy integer matrix
        Sensor data
    :return:
        Normalized sensor data
    """
    x = np.array(x, dtype=np.float32)
    m = np.mean(x, axis=0)
    x -= m
    std = np.std(x, axis=0)
    std += 0.000001  # avoid division by zero on constant channels
    x /= (std * 2)  # the factor 2 keeps values in a smaller range
    return x
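# Illustrative behaviour of normalize() on a tiny hypothetical matrix:
# each column ends up zero-mean and scaled to roughly [-0.5, 0.5].
#   >>> np.round(normalize(np.array([[0., 2.], [4., 6.]])), 3)
#   array([[-0.5, -0.5],
#          [ 0.5,  0.5]], dtype=float32)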
def split_data_into_time_gyros_accelerometers(data, is_accelerometer):
    """Splits the data matrix into its time column, gyroscope-like channels
    and accelerometer channels.

    :param data: numpy matrix whose column 0 is assumed to be time
    :param is_accelerometer: binary array, one entry per column of `data`
        (1 for accelerometer channels, 0 otherwise)
    :return: tuple (time, gyros, accms)
    """
    # Map the 0/1 mask onto -1/+1 so that signed column indices can be built.
    is_accelerometer = np.array(is_accelerometer * 2 - 1, dtype=np.int32)
    # Column indices, negated for non-accelerometer columns.
    plane = np.arange(len(is_accelerometer)) * is_accelerometer
    # Column 0 (time) lands in both delete lists, so it is dropped from both splits.
    delete_gyros = [-e for e in plane if e <= 0]  # dropped to obtain accms
    delete_accms = [e for e in plane if e >= 0]   # dropped to obtain gyros
    time = data[:, 0]
    gyros = np.delete(data, delete_accms, 1)
    accms = np.delete(data, delete_gyros, 1)
    return time, gyros, accms
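# Hypothetical 3-column example: [time, acc_x, gyro_x] with mask [0, 1, 0].
#   >>> d = np.array([[0., 10., 20.], [1., 11., 21.]])
#   >>> t, g, a = split_data_into_time_gyros_accelerometers(d, np.array([0, 1, 0]))
#   >>> t, g.ravel(), a.ravel()
#   (array([0., 1.]), array([20., 21.]), array([10., 11.]))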
def divide_x_y(data, label, filter_accelerometers):
    """Segments each sample into (time+features) and (label)

    :param data: numpy integer matrix
        Sensor data
    :param label: string, ['gestures' (default), 'locomotion', 'LL_right_arm']
        Type of activities to be recognized
    :param filter_accelerometers: boolean
        Unused here; kept for API symmetry with process_dataset_file
    :return: numpy integer matrix, numpy integer array
        Features encapsulated into a matrix and labels as an array
    """
    data_x = data[:, 1:114]
    # Choose the label column for y
    if label not in ['locomotion', 'gestures', 'LL_right_arm']:
        raise RuntimeError("Invalid label: '%s'" % label)
    if label == 'locomotion':
        data_y = data[:, 114]  # Locomotion label
    elif label == 'LL_right_arm':
        data_y = data[:, -1]   # Last remaining label column
    elif label == 'gestures':
        data_y = data[:, 115]  # Gesture label
    return data_x, data_y
def adjust_idx_labels(data_y, label):
    """Transforms original labels into the range [0, nb_labels-1]

    :param data_y: numpy integer array
        Sensor labels
    :param label: string, ['gestures' (default), 'locomotion', 'LL_right_arm']
        Type of activities to be recognized
    :return: numpy integer array
        Modified sensor labels
    """
    if label == 'locomotion':  # Labels for locomotion are adjusted
        data_y[data_y == 4] = 3
        data_y[data_y == 5] = 4
    elif label == 'gestures':  # Labels for gestures are adjusted
        data_y[data_y == 406516] = 1
        data_y[data_y == 406517] = 2
        data_y[data_y == 404516] = 3
        data_y[data_y == 404517] = 4
        data_y[data_y == 406520] = 5
        data_y[data_y == 404520] = 6
        data_y[data_y == 406505] = 7
        data_y[data_y == 404505] = 8
        data_y[data_y == 406519] = 9
        data_y[data_y == 404519] = 10
        data_y[data_y == 406511] = 11
        data_y[data_y == 404511] = 12
        data_y[data_y == 406508] = 13
        data_y[data_y == 404508] = 14
        data_y[data_y == 408512] = 15
        data_y[data_y == 407521] = 16
        data_y[data_y == 405506] = 17
    elif label == 'LL_right_arm':
        data_y[data_y == 401] = 1
        data_y[data_y == 402] = 2
        data_y[data_y == 403] = 3
        data_y[data_y == 404] = 4
        data_y[data_y == 405] = 5
        data_y[data_y == 406] = 6
        data_y[data_y == 407] = 7
        data_y[data_y == 408] = 8
        data_y[data_y == 409] = 9
        data_y[data_y == 410] = 10
        data_y[data_y == 411] = 11
        data_y[data_y == 412] = 12
        data_y[data_y == 413] = 13
    return data_y
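# Example (gestures): raw OPPORTUNITY gesture IDs are remapped onto [0, 17],
# with 0 (the null class) left untouched:
#   >>> adjust_idx_labels(np.array([0, 406516, 405506]), 'gestures')
#   array([ 0,  1, 17])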
def check_data(data_set):
    """Tries to access the dataset file, falling back to the data directory.
    If the file is still not found, attempts to download it from its
    original location.

    :param data_set:
        Path to the original OPPORTUNITY zip file
    :return:
        Directory containing the dataset file
    """
    print('Checking dataset {0}'.format(data_set))
    data_dir, data_file = os.path.split(data_set)
    # When a directory is not provided, check if the dataset is in the data directory
    if data_dir == "" and not os.path.isfile(data_set):
        new_path = os.path.join(os.path.split(__file__)[0], "data", data_set)
        if os.path.isfile(new_path) or data_file == 'OpportunityUCIDataset.zip':
            data_set = new_path
    # When the dataset is not found, try to download it from the UCI repository
    if (not os.path.isfile(data_set)) and data_file == 'OpportunityUCIDataset.zip':
        print('... dataset path {0} not found'.format(data_set))
        from urllib.request import urlretrieve
        origin = (
            'https://archive.ics.uci.edu/ml/machine-learning-databases/00226/OpportunityUCIDataset.zip'
        )
        if data_dir and not os.path.exists(data_dir):
            print('... creating directory {0}'.format(data_dir))
            os.makedirs(data_dir)
        print('... downloading data from {0}'.format(origin))
        urlretrieve(origin, data_set)
    return data_dir
def process_dataset_file(data, label, filter_accelerometers):
    """Pipeline to process an individual OPPORTUNITY file

    :param data: numpy integer matrix
        Matrix containing data samples (rows) for every sensor channel (column)
    :param label: string, ['gestures' (default), 'locomotion', 'LL_right_arm']
        Type of activities to be recognized
    :return: numpy integer matrix, numpy integer array
        Processed sensor data, segmented into features (x) and labels (y)
    """
    # Select the relevant columns
    data = select_columns_opp(data)
    # Columns are segmented into features and labels
    data_x, data_y = divide_x_y(data, label, filter_accelerometers)
    data_y = adjust_idx_labels(data_y, label)
    data_y = data_y.astype(int)
    # Perform linear interpolation (i.e., fill in NaN values)
    data_x = np.array([Series(i).interpolate() for i in data_x.T]).T
    # Remaining missing data are set to zero
    data_x[np.isnan(data_x)] = 0
    # All sensor channels are normalized
    data_x = normalize(data_x)
    if filter_accelerometers:
        # The accelerometer channels are low-pass filtered to split gravity
        # from body motion and to remove noise; the time column is discarded.
        # NOTE: the original code referenced an undefined `is_accelerometer`
        # mask here. It is rebuilt below under the assumption (consistent with
        # NB_SENSOR_CHANNELS_WITH_FILTERING = 77 + 36*2) that the first 36 of
        # the 113 selected channels are the dedicated triaxial accelerometers;
        # a dummy zero column stands in for the time column that
        # split_data_into_time_gyros_accelerometers expects at index 0.
        is_accelerometer = np.concatenate(
            [np.zeros(1), np.ones(36), np.zeros(77)])
        _, x_gyros, x_accms = split_data_into_time_gyros_accelerometers(
            np.hstack([np.zeros((len(data_x), 1)), data_x]), is_accelerometer
        )
        print("gyros' shape: {}".format(x_gyros.shape))
        print("old accelerometers' shape: {}".format(x_accms.shape))
        x_accms = normalize(filter_opportunity_datasets_accelerometers(x_accms))
        print("new accelerometers' shape: {}".format(x_accms.shape))
        # Put the features back together
        data_x = np.hstack([x_gyros, x_accms])
        print("new total shape: {}".format(data_x.shape))
    return data_x, data_y
def load_data_files(zipped_dataset, label, data_files, filter_accelerometers=False):
    """Loads the specified data files' features (x) and labels (y)

    :param zipped_dataset: ZipFile
        OPPORTUNITY zip file to read from
    :param label: string, ['gestures' (default), 'locomotion']
        Type of activities to be recognized. The OPPORTUNITY dataset includes
        several annotations to perform recognition of modes of
        locomotion/postures and recognition of sporadic gestures.
    :param data_files: list of strings
        Data files to load.
    :return: numpy integer matrix, numpy integer array
        Loaded sensor data, segmented into features (x) and labels (y)
    """
    nb_sensors = NB_SENSOR_CHANNELS_WITH_FILTERING if filter_accelerometers else NB_SENSOR_CHANNELS
    data_x = np.empty((0, nb_sensors))
    data_y = np.empty((0))
    for filename in data_files:
        try:
            data = np.loadtxt(BytesIO(zipped_dataset.read(filename)))
            print('... file {0}'.format(filename))
            x, y = process_dataset_file(data, label, filter_accelerometers)
            data_x = np.vstack((data_x, x))
            data_y = np.concatenate([data_y, y])
            print('Cumulative data shape: {0}'.format(data_x.shape))
        except KeyError:
            print('ERROR: Did not find {0} in zip file'.format(filename))
    return data_x, data_y
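# Sketch of direct use, assuming the zip has already been downloaded locally:
#   >>> zf = zipfile.ZipFile('OpportunityUCIDataset.zip')
#   >>> X, y = load_data_files(zf, 'gestures', OPPORTUNITY_DATA_FILES_TEST)
#   >>> X.shape[1] == NB_SENSOR_CHANNELS
#   True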
def generate_data(dataset, target_filename, label):
    """Reads the OPPORTUNITY challenge raw data and processes all sensor channels

    :param dataset: string
        Path to the original OPPORTUNITY zip file
    :param target_filename: string
        Name of the processed output file
    :param label: string, ['gestures' (default), 'locomotion']
        Type of activities to be recognized. The OPPORTUNITY dataset includes
        several annotations to perform recognition of modes of
        locomotion/postures and recognition of sporadic gestures.
    """
    data_dir = check_data(dataset)
    zf = zipfile.ZipFile(dataset)
    print('\nProcessing train dataset files...\n')
    X_train, y_train = load_data_files(zf, label, OPPORTUNITY_DATA_FILES_TRAIN)
    print('\nProcessing test dataset files...\n')
    X_test, y_test = load_data_files(zf, label, OPPORTUNITY_DATA_FILES_TEST)
    print("Final datasets with size: | train {0} | test {1} |".format(X_train.shape, X_test.shape))
    obj = [(X_train, y_train), (X_test, y_test)]
    with open(os.path.join(data_dir, target_filename), 'wb') as f:
        cp.dump(obj, f)
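# Reading the processed pickle back is then a one-liner; the file name below
# is illustrative:
#   >>> with open('data/oppChallenge_gestures.data', 'rb') as f:
#   ...     (X_train, y_train), (X_test, y_test) = cp.load(f)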
def get_args():
    """Parses and returns the arguments passed to the script"""
    parser = argparse.ArgumentParser(
        description='Preprocess OPPORTUNITY dataset')
    # Add arguments
    parser.add_argument(
        '-i', '--input', type=str, help='OPPORTUNITY zip file', required=True)
    parser.add_argument(
        '-o', '--output', type=str, help='Processed data file', required=True)
    parser.add_argument(
        '-t', '--task', type=str.lower, help='Type of activities to be recognized',
        default='gestures', choices=['gestures', 'locomotion'], required=False)
    args = parser.parse_args()
    # Return all variable values
    return args.input, args.output, args.task
if __name__ == '__main__':
    OpportunityUCIDataset_zip, output, l = get_args()
    generate_data(OpportunityUCIDataset_zip, output, l)
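# Example invocation (paths are illustrative):
#   python preprocess_Oppdata.py -i data/OpportunityUCIDataset.zip \
#       -o oppChallenge_gestures.data -t gestures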