sweep.py
"""
This script runs a sweep over the parameters defined in `sweep_model_label.yaml`.
Add your parameters to the `sweep.yaml` file, and don't forget to put the base model
config in the `sweep.yaml` file as well.
The sweep has to be initiated as follows:
    `wandb sweep sweep.yaml`
Then check the terminal output and run the printed command to start the sweep agent.
The sweep ID can be found in that command line output.
"""
import argparse
import os
import numpy as np
import tensorflow as tf
from sklearn.model_selection import KFold, TimeSeriesSplit
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tqdm.keras import TqdmCallback
import wandb
from configure_dataframes import directory_to_dataframe
from data_preparation_utils import get_datasets
from modelbuilder import ModelBuilder, TransferLearningModelBuilder
from train_utils import load_config
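
# Example sweep workflow (sketch; the entity/project names are placeholders and the
# real sweep ID is printed by `wandb sweep`):
#   $ wandb sweep sweep.yaml
#   $ wandb agent <entity>/<project>/<sweep_id>
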
def main(config_name: str, batch_size: int) -> None:
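    """Train and evaluate a single sweep configuration with time-ordered cross-validation."""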
    # Initialize the wandb run; under a sweep agent this also populates wandb.config
    # with the parameters chosen for this trial
    wandb.init()
# Fix the random generator seeds for better reproducibility
tf.random.set_seed(67)
np.random.seed(67)
if not os.path.exists("models"):
os.makedirs("models")
# Send config to wandb
config = load_config(os.path.join("model_base_configs", config_name + ".yaml"))
wandb.config.update(config)
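    # From here on, wandb.config is the single source of truth for hyperparameters,
    # so the local copy is dropped to avoid reading stale values by mistake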
del config
# Load dataframes
data_df = directory_to_dataframe()
# Filter DF, if you want
if "instrument_to_use" in wandb.config:
data_df = data_df[data_df.instrument.isin(wandb.config["instrument_to_use"])]
# Create datasets
train_df, test_df = get_datasets(
data_df,
train_size=0.7,
sort_by_time=True,
only_unique_time_periods=True,
burst_frac=wandb.config["burst_frac"],
)
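    # Note: test_df is not used below; each fold's test split is instead carved out of
    # the tail of that fold's validation slice inside the cross-validation loop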
# Build and train the model
mb = ModelBuilder(model_params=wandb.config["model_params"])
#mb = TransferLearningModelBuilder(model_params=wandb.config)
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
monitor="val_f1_score",
patience=10,
verbose=1,
mode="max",
restore_best_weights=True,
) # or val_recall, experiment
    # A WandbCallback would log every metric compiled in ModelBuilder to wandb epoch by epoch.
    # A ModelCheckpoint could save the best model across all epochs of every sweep run (based on
    # recall or accuracy, for example) and reload those weights before evaluation, so that the
    # best state is also what gets reported to wandb as a single instance.
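    # Optional sketch of the callbacks described above (not enabled here; the checkpoint
    # path is a hypothetical location under the local "models" directory):
    # extra_callbacks = [
    #     wandb.keras.WandbCallback(save_model=False),
    #     tf.keras.callbacks.ModelCheckpoint(
    #         filepath=os.path.join("models", "best_fold_model.keras"),
    #         monitor="val_f1_score",
    #         mode="max",
    #         save_best_only=True,
    #     ),
    # ]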
n_splits = 4
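    # TimeSeriesSplit yields chronologically ordered folds: each validation slice comes
    # strictly after its training slice, which matches the time-sorted dataframe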
_s = TimeSeriesSplit(n_splits=n_splits)
X = train_df["file_path"].values
y = train_df["label"].values
    def preprocess(x):
        return TransferLearningModelBuilder.preprocess_input(
            x, ewc=wandb.config["elim_wrong_channels"]
        )

    datagen = ImageDataGenerator(preprocessing_function=preprocess)
evals = []
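    # Per-fold evaluation results are collected here so that cross-fold averages and
    # standard deviations can be reported to wandb after the loop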
for fold, (train_index, val_index) in enumerate(_s.split(X, y)):
print("----------------------------------------")
print(f"Training on Fold: {fold + 1}/{n_splits}")
print("----------------------------------------")
train_data = train_df.iloc[train_index]
val_data = train_df.iloc[val_index]
val_data, test_data = (
val_data.iloc[: len(val_data) // 2],
val_data.iloc[len(val_data) // 2 :],
)
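        # The fold's validation slice is halved: the earlier half stays as the validation
        # set and the later half becomes this fold's held-out test set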
# Print out class balance
print("Class balance in training set:")
print(train_data.label.value_counts())
print("Class balance in validation set:")
print(val_data.label.value_counts())
print("Class balance in test set:")
print(test_data.label.value_counts())
val_start = val_data.start_time.min()
val_end = val_data.start_time.max()
        # Sanity checks: no temporal overlap between training and validation data,
        # and no duplicate timestamps within either split
        train_in_val = train_data[
            (train_data.start_time > val_start) & (train_data.start_time < val_end)
        ]
        assert (
            train_in_val.empty
        ), "Training data falls inside the validation time window"  # known to fail on the second fold
        assert len(np.unique(train_data.start_time)) == len(train_data.start_time)
        assert len(np.unique(val_data.start_time)) == len(val_data.start_time)
new_train_ds = datagen.flow_from_dataframe(
train_data,
x_col="file_path",
y_col="label",
batch_size=batch_size,
seed=42,
shuffle=True,
class_mode="binary",
target_size=(256, 256),
color_mode="rgb",
)
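        # shuffle=True only shuffles rows within this training fold; the temporal split
        # between folds was already fixed by TimeSeriesSplit above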
val_ds = datagen.flow_from_dataframe(
val_data,
x_col="file_path",
y_col="label",
batch_size=batch_size,
seed=42,
shuffle=False,
class_mode="binary",
target_size=(256, 256),
color_mode="rgb",
)
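        # Build and compile the model anew for every fold (assuming ModelBuilder.build
        # constructs a fresh model) so weights are not carried over between folds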
mb.build()
model = mb.compile()
_ = model.fit(
new_train_ds,
validation_data=val_ds,
epochs=wandb.config["epochs"],
verbose=0,
callbacks=[early_stopping_callback, TqdmCallback(verbose=0)],
)
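        # TqdmCallback provides the progress bar, since fit() itself runs with verbose=0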
        # Evaluate on this fold's held-out test split, wrapped in a generator that applies
        # the same preprocessing as the training and validation data
        test_ds = datagen.flow_from_dataframe(
            test_data,
            x_col="file_path",
            y_col="label",
            batch_size=batch_size,
            seed=42,
            shuffle=False,
            class_mode="binary",
            target_size=(256, 256),
            color_mode="rgb",
        )
        val_loss, val_acc, val_precision, val_recall, val_f1_score = model.evaluate(
            test_ds, verbose=0
        )
        # Log metrics for this fold
        evals.append(
            {
                "val_loss": val_loss,
                "val_acc": val_acc,
                "val_precision": val_precision,
                "val_recall": val_recall,
                "val_f1_score": val_f1_score[0],
            }
        )
        # Print this fold's results on one line for easier comparison
        for metric, value in evals[-1].items():
            print(f"{metric}: {value:.4f}", end=" | ")
        print()
# Calculate averages and standard deviations
    metrics_avg = {
        metric: np.mean([fold_eval[metric] for fold_eval in evals])
        for metric in evals[0].keys()
    }
    metrics_std = {
        metric: np.std([fold_eval[metric] for fold_eval in evals])
        for metric in evals[0].keys()
    }
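    # Only the cross-fold aggregates are sent to wandb; the per-fold values were printed above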
# Log averages
for metric, value in metrics_avg.items():
wandb.log({f"{metric}_avg": value})
# Log standard deviations
for metric, value in metrics_std.items():
wandb.log({f"{metric}_std": value})
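    # Mark the wandb run as finished so the sweep agent can cleanly start the next trial
    wandb.finish()
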
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Sweep for a specific model.")
    parser.add_argument(
        "--config_name", metavar="config_name", required=True,
        help="Name of the base model config in model_base_configs/ (without .yaml).",
    )
    parser.add_argument(
        "--batch_size",
        metavar="batch_size",
        type=int,
        required=True,
        help="Batch size.",
    )
    args = parser.parse_args()
    main(config_name=args.config_name, batch_size=args.batch_size)