team_code.py

#!/usr/bin/env python

# Edit this script to add your team's code. Some functions are *required*, but you can edit most parts of the required functions,
# change or remove non-required functions, and add your own functions.

################################################################################
#
# Optional libraries, functions, and variables. You can change or remove them.
#
################################################################################

import joblib
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import sys

from helper_code import *

################################################################################
#
# Required functions. Edit these functions to add your code, but do not change the arguments of the functions.
#
################################################################################

# Train your models. This function is *required*. You should edit this function to add your code, but do *not* change the arguments
# of this function. If you do not train one of the models, then you can return None for the model.

# Train your digitization model.
def train_models(data_folder, model_folder, verbose):
    # Find the data files.
    if verbose:
        print('Finding the Challenge data...')

    records = find_records(data_folder)
    num_records = len(records)

    if num_records == 0:
        raise FileNotFoundError('No data were provided.')

    # Train the digitization model. If you are not training a digitization model, then you can remove this part of the code.
    if verbose:
        print('Training the digitization model...')

    # Extract the features and labels from the data.
    if verbose:
        print('Extracting features and labels from the data...')

    digitization_features = list()
    classification_features = list()
    classification_labels = list()

    # Iterate over the records.
    for i in range(num_records):
        if verbose:
            width = len(str(num_records))
            print(f'- {i+1:>{width}}/{num_records}: {records[i]}...')

        record = os.path.join(data_folder, records[i])

        # Extract the features from the image; this simple example uses the same features for the digitization and classification
        # tasks.
        features = extract_features(record)
        digitization_features.append(features)

        # Some images may not be labeled...
        labels = load_labels(record)
        if any(label for label in labels):
            classification_features.append(features)
            classification_labels.append(labels)

    # ... but we expect some images to be labeled for classification.
    if not classification_labels:
        raise Exception('There are no labels for the data.')

    # Train the models.
    if verbose:
        print('Training the models on the data...')

    # Train the digitization model. This very simple model uses the mean of these very simple features as a seed for a random number
    # generator.
    digitization_model = np.mean(features)

    # Train the classification model. If you are not training a classification model, then you can remove this part of the code.
    # This very simple model trains a random forest model with these very simple features.
    classification_features = np.vstack(classification_features)
    classes = sorted(set.union(*map(set, classification_labels)))
    classification_labels = compute_one_hot_encoding(classification_labels, classes)
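    # For example, if classes were ['AFIB', 'NORM', 'PVC'] and a record's labels were ['NORM', 'PVC'], the
    # corresponding one-hot (multi-hot) row would be [0, 1, 1] under the encoding convention assumed here;
    # see compute_one_hot_encoding in helper_code for the exact behavior.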

    # Define the parameters for the random forest classifier.
    n_estimators = 12  # Number of trees in the forest.
    max_leaf_nodes = 34  # Maximum number of leaf nodes in each tree.
    random_state = 56  # Random state; set for reproducibility.

    # Fit the model.
    classification_model = RandomForestClassifier(
        n_estimators=n_estimators, max_leaf_nodes=max_leaf_nodes, random_state=random_state).fit(classification_features, classification_labels)
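
    # Note: classification_labels is a 2-D one-hot matrix, so scikit-learn fits a multi-output random forest
    # (one output per class); this is what allows the per-class probabilities used in run_models.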

    # Create a folder for the models if it does not already exist.
    os.makedirs(model_folder, exist_ok=True)

    # Save the models.
    save_models(model_folder, digitization_model, classification_model, classes)

    if verbose:
        print('Done.')
        print()

# Load your trained models. This function is *required*. You should edit this function to add your code, but do *not* change the
# arguments of this function. If you do not train one of the models, then you can return None for the model.
def load_models(model_folder, verbose):
    digitization_filename = os.path.join(model_folder, 'digitization_model.sav')
    digitization_model = joblib.load(digitization_filename)

    classification_filename = os.path.join(model_folder, 'classification_model.sav')
    classification_model = joblib.load(classification_filename)

    return digitization_model, classification_model

# Run your trained models. This function is *required*. You should edit this function to add your code, but do *not*
# change the arguments of this function. If you did not train one of the models, then you can return None for the model.
def run_models(record, digitization_model, classification_model, verbose):
    # Run the digitization model; if you did not train this model, then you can set signal = None.
    # Load the digitization model.
    model = digitization_model['model']

    # Load the dimensions of the signal.
    header_file = get_header_file(record)
    header = load_text(header_file)

    num_samples = get_num_samples(header)
    num_signals = get_num_signals(header)

    # Extract the features.
    features = extract_features(record)
    features = features.reshape(1, -1)

    # Generate "random" waveforms using a random seed derived from the features.
    seed = int(round(model + np.mean(features)))
    signal = np.random.default_rng(seed=seed).uniform(low=-1, high=1, size=(num_samples, num_signals))

    # Run the classification model; if you did not train this model, then you can set labels = None.
    # Load the classification model and classes.
    model = classification_model['model']
    classes = classification_model['classes']

    # Get the model probabilities.
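    # For a multi-output classifier, predict_proba returns one (n_samples, 2) array per class; with a single
    # record (n_samples = 1), the indexing [:, 0, 1] below keeps the positive-class probability for each class.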
    probabilities = model.predict_proba(features)
    probabilities = np.asarray(probabilities, dtype=np.float32)[:, 0, 1]

    # Choose the class or classes with the highest probability as the label or labels.
    max_probability = np.nanmax(probabilities)
    labels = [classes[i] for i, probability in enumerate(probabilities) if probability == max_probability]

    return signal, labels

################################################################################
#
# Optional functions. You can change or remove these functions and/or add new functions.
#
################################################################################

# Extract features.
def extract_features(record):
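    # Sum the per-image mean and standard deviation of the pixel values over all images for this record; these
    # two numbers are the only "features" used by this simple example.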
    images = load_images(record)
    mean = 0.0
    std = 0.0

    for image in images:
        image = np.asarray(image)
        mean += np.mean(image)
        std += np.std(image)

    return np.array([mean, std])

# Save your trained models.
def save_models(model_folder, digitization_model=None, classification_model=None, classes=None):
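    # Wrap each model in a dictionary so that load_models and run_models can look up 'model' (and, for the
    # classifier, 'classes') by key.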
    if digitization_model is not None:
        d = {'model': digitization_model}
        filename = os.path.join(model_folder, 'digitization_model.sav')
        joblib.dump(d, filename, protocol=0)

    if classification_model is not None:
        d = {'model': classification_model, 'classes': classes}
        filename = os.path.join(model_folder, 'classification_model.sav')
        joblib.dump(d, filename, protocol=0)
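
# The block below is not part of the Challenge example; it is a minimal sketch of how the required functions
# above are typically driven (the Challenge's own driver scripts, e.g. train_model.py and run_model.py,
# normally do this for you). The folder names are placeholders.
if __name__ == '__main__':
    _data_folder = 'training_data'   # hypothetical folder of training records
    _model_folder = 'model'          # hypothetical folder for the saved models

    # Train and save the models, then reload them and run them on the first available record.
    train_models(_data_folder, _model_folder, verbose=True)
    _digitization_model, _classification_model = load_models(_model_folder, verbose=True)

    _records = find_records(_data_folder)
    if _records:
        _record = os.path.join(_data_folder, _records[0])
        _signal, _labels = run_models(_record, _digitization_model, _classification_model, verbose=True)
        print(f'Predicted labels for {_records[0]}: {_labels}')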