'''
Copyright (c) 2019-2020, Takashi Shirakawa. All rights reserved.
e-mail: tkshirakawa@gmail.com
Released under the MIT license.
https://opensource.org/licenses/mit-license.php
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
'''
# from __future__ import absolute_import
# from __future__ import division
# from __future__ import print_function
import sys
if len(sys.argv) < 2 or sys.argv[1] == '-h':
print('### Help for argv ###')
print(' argv[1] : Path to a neural network model file (.py).')
print(' argv[2] : Path to a CSV/.h5 file for training image paths')
print(' argv[3] : Path to a CSV/.h5 file for validation image paths')
print(' argv[4] : Path to a directory to save results in it')
print(' argv[5] : Training mode: 0=Normal, 1=Resume by load_model(), 2=Retrain by load_weights()')
    print(' argv[6] : Path to a trained model for mode 1 or 2')
    print(' argv[7] : Initial epoch to resume training from, for mode 1')
    print(' NOTE : Input images must be grayscale without alpha values')
sys.exit()
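# Example invocation for a fresh run (mode 0). All paths are illustrative placeholders:
#   python Train.py neural_networks/MyModel.py train.csv validation.csv results 0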
import os
import platform
import numpy as np
import importlib.machinery as imm
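# Workaround for the 'duplicate OpenMP runtime' abort that some macOS builds of
# TensorFlow/Keras trigger when more than one copy of libiomp is loaded.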
if platform.system() == 'Darwin':
os.environ['KMP_DUPLICATE_LIB_OK']='True'
# The absolute path to this file and directory
trainFilePath = os.path.abspath(__file__)
trainDirPath = os.path.dirname(trainFilePath)
#############################################################################################
# Define loss and metrics.
lossAndMetricsPath = os.path.join(trainDirPath, 'utils', 'Loss_and_metrics.py')
# The following get_loss() and get_metrics() will be used in coremltools when converting a trained model.
# NOTE: The metrics defined in Loss_and_metrics.py may be estimated / assumed values, NOT final results.
_LM = imm.SourceFileLoader('Loss_and_metrics', lossAndMetricsPath).load_module()
# Select a loss function for training
# loss = _LM.MSE_loss
# loss = _LM.MSE_loss_w_iou_score
# loss = _LM.MSE_loss_w_dice_coef # Good
# loss = _LM.LogCosh_loss
# loss = _LM.LogCosh_loss_w_iou_score
# loss = _LM.LogCosh_loss_w_dice_coef # Good
# loss = _LM.KLD_loss
# loss = _LM.KLD_loss_w_iou_score
# loss = _LM.KLD_loss_w_dice_coef
# loss = _LM.focal_CE_loss
# loss = _LM.focal_CE_loss_w_iou_score
# loss = _LM.focal_CE_loss_w_dice_coef
# loss = _LM.constrained_focal_CE_loss
# loss = _LM.constrained_focal_CE_loss_w_iou_score
# loss = _LM.constrained_focal_CE_loss_w_dice_coef # Good
# loss = _LM.hausdorff_distance_loss
# loss = _LM.hausdorff_distance_loss_w_iou_score
loss = _LM.hausdorff_distance_loss_w_dice_coef # Good
# loss = _LM.series_combination_loss_w_iou_score
# loss = _LM.series_combination_loss_w_dice_coef
def get_loss(): return {loss.__name__: loss}
# Metrics
# NOTE: The metrics defined here MUST include the keys 'iou_score' and 'dice_coef' for monitoring the best-performing model.
# NOTE: The metrics defined in Loss_and_metrics.py are estimated / assumed values for each batch, NOT final results.
def get_metrics(): return {'iou_score': _LM.iou_score, 'dice_coef': _LM.dice_coef, 'rou': _LM.rou, 'fpoet': _LM.fpoet}
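# These dictionaries are also consumed as Keras custom_objects when a trained model is
# reloaded or converted, e.g.: load_model(path, custom_objects=dict(**get_loss(), **get_metrics()))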
# Define loss and metrics.
#############################################################################################
#############################################################################################
# Define custom callbacks
# from tensorflow.keras.callbacks import Callback ##### For TensorFlow v2 #####
from keras.callbacks import Callback
# NOTE: Register BestMetricsMonitor before the other callbacks so that it updates the
#       'val_iou_score' and 'val_dice_coef' keys in logs before the others read them.
class BestMetricsMonitor(Callback):
def __init__(self, validation, model_base_path, nn_name, patience, **kwargs):
super(BestMetricsMonitor, self).__init__(**kwargs)
# self.val_gen = validation
self.val_img, gt = validation.getdata() # Numpy array of float between 0.0 and 1.0
        self.truth = np.clip(gt, 0.0, 1.0)                      # Numpy array of floats; a simple clip keeps values within [0.0, 1.0]
self.model_base_path = model_base_path
self.nn_name = nn_name
self.patience = patience
print('Callback: BestMetricsMonitor - Start monitoring mean IoU and Dice coef to save the best model on each epoch end...')
def on_train_begin(self, logs=None):
self.best_miou = -np.Inf
self.best_dice = -np.Inf
self.wait = 0
self.metrics_updated = False
self.stopped_epoch = 0
def on_epoch_end(self, epoch, logs=None):
print('\nCalling prediction and calculating metrics for validation data...')
# predc = np.where(self.model.predict_generator(self.val_gen) >= 0.5, 1, 0)
# predc = np.where(self.model.predict(self.val_img) >= 0.5, 1, 0)
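        # Per-slice areas: for t, p in [0, 1], inclusion-exclusion gives
        # |T ∩ P| = |T| + |P| - |T ∪ P|, so IoU and Dice follow from three sums per sample.
        # Slices with an empty ground truth are excluded from the mean via 'plane'/'count'.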
t = self.truth
p = np.clip(self.model.predict(self.val_img), 0.0, 1.0) # Prediction for validation data
u = np.clip(t + p, 0.0, 1.0)
truth = t.sum(axis=(-3,-2,-1))
predc = p.sum(axis=(-3,-2,-1))
union = u.sum(axis=(-3,-2,-1))
plane = np.where(t.max(axis=(-3,-2,-1)) >= 0.5, 1.0, 0.0)
count = max(plane.sum(), 1.0)
t_or_p = truth + predc
intsec = t_or_p - union
miou = np.sum(intsec / (union + 1e-4), axis=None) / count
dice = np.sum(2.0 * intsec / (t_or_p + 1e-4), axis=None) / count
# Update metrics dictionary with the key 'val_iou_score' / 'val_dice_coef'
try: logs['val_iou_score'] = miou
except: print('ALERT: Failed to update logs[val_iou_score].')
try: logs['val_dice_coef'] = dice
except: print('ALERT: Failed to update logs[val_dice_coef].')
self.metrics_updated = False
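        # Saved files are named '<model_base_path>, mIoU=0.1234, <model name>.h5' (likewise
        # for Dice); the file holding the previous best value of each metric is removed first.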
def create_model_path(metrics_name, metrics_val):
return '{0}, {1}={2:.4f}, {3}.h5'.format(self.model_base_path, metrics_name, metrics_val, self.nn_name)
# Save model with the best mean IoU
if miou > self.best_miou:
print('Val. mean IoU = {0:.5f} (updated from the value = {1:.5f})'.format(miou, self.best_miou))
path = create_model_path('mIoU', self.best_miou)
if os.path.exists(path): os.remove(path)
self.model.save(filepath=create_model_path('mIoU', miou), overwrite=True, include_optimizer=True)
self.best_miou = miou
self.metrics_updated = True
else:
print('Val. mean IoU = {0:.5f} (not updated, the current best = {1:.5f})'.format(miou, self.best_miou))
# Save model with the best Dice coef
if dice > self.best_dice:
print('Val. Dice coef = {0:.5f} (updated from the value = {1:.5f})'.format(dice, self.best_dice))
path = create_model_path('Dice', self.best_dice)
if os.path.exists(path): os.remove(path)
self.model.save(filepath=create_model_path('Dice', dice), overwrite=True, include_optimizer=True)
self.best_dice = dice
self.metrics_updated = True
else:
print('Val. Dice coef = {0:.5f} (not updated, the current best = {1:.5f})'.format(dice, self.best_dice))
# Early stopping of myself
if self.metrics_updated:
self.wait = 0
else:
self.wait += 1
print('Patience count = {0} (early stop at {1} patience)'.format(self.wait, self.patience))
if self.wait >= self.patience:
self.stopped_epoch = epoch
self.model.stop_training = True
print('Epoch {0}: early stopping...'.format(self.stopped_epoch + 1))
print(' ')
def metrics_updated_in_Monitor(self):
return self.metrics_updated
class AutoLRManager(Callback):
def __init__(self, param, bm_monitor, **kwargs):
super(AutoLRManager, self).__init__(**kwargs)
self.p = param
self.bm_monitor = bm_monitor
print('Callback: AutoLRManager - Start monitoring the best metrics for learning rate management...')
def on_train_begin(self, logs=None):
self.multiplier = 1.0
self.n_good = 0
self.n_bad = 0
def on_epoch_end(self, epoch, logs=None):
metrics_updated = self.bm_monitor.metrics_updated_in_Monitor()
print('Monitoring metrics for learning rate management... [metrics: {0}]'.format('updated' if metrics_updated else 'Not updated'))
if metrics_updated:
self.n_good += 1
self.n_bad = 0
step = self.p['step'][1]
            print('Consecutive updates = {0} (LR will be increased by x{2} after {1} updates)'.format(self.n_good, self.p['patience'][1], step))
if self.n_good >= self.p['patience'][1] and step != 1.0:
if step < 1.0: self.multiplier = max(self.p['limit'][1], self.multiplier * step)
elif step > 1.0: self.multiplier = min(self.p['limit'][1], self.multiplier * step)
print('LR multiplier is set to {0} for the next epoch (step {1}, max {2})'.format(self.multiplier, step, self.p['limit'][1]))
self.n_good = 0
else:
self.n_good = 0
self.n_bad += 1
step = self.p['step'][0]
            print('Consecutive epochs without update = {0} (LR will be decreased by x{2} after {1} such epochs)'.format(self.n_bad, self.p['patience'][0], step))
if self.n_bad >= self.p['patience'][0] and step != 1.0:
if step < 1.0: self.multiplier = max(self.p['limit'][0], self.multiplier * step)
elif step > 1.0: self.multiplier = min(self.p['limit'][0], self.multiplier * step)
print('LR multiplier is set to {0} for the next epoch (step {1}, min {2})'.format(self.multiplier, step, self.p['limit'][0]))
self.n_bad = 0
print(' ')
def get_LR_multiplier(self):
return self.multiplier
# Define custom callbacks
#############################################################################################
#############################################################################################
# Main training sequence
def Train():
import shutil
import time
import platform
import matplotlib.pyplot as plt
from datetime import datetime, timedelta, timezone
import tensorflow as tf
import keras as _keras
import keras.backend as K
# from keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger, TensorBoard, LearningRateScheduler
from keras.callbacks import CSVLogger, LearningRateScheduler
from keras.utils import plot_model
from keras.models import load_model
##### For TensorFlow v2 #####
# from tensorflow import keras
# from tensorflow.keras import backend as K
# # from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger, TensorBoard, LearningRateScheduler
# from tensorflow.keras.callbacks import CSVLogger, LearningRateScheduler
# from tensorflow.keras.utils import plot_model
# from tensorflow.keras.models import load_model
K.clear_session()
# NOTE: Data must be (samples, height, width, channels)
    if K.image_data_format() != 'channels_last': K.set_image_data_format('channels_last')
JST = timezone(timedelta(hours=+9), 'JST') # Japan Standard Time, Change it for your time zone
startdate = datetime.now(JST)
starttime = time.time()
############################################
# Training parameters
# Load loss and metrics
custom_loss = get_loss()
custom_metrics = get_metrics()
# Neural network model
NN_model_path = sys.argv[1]
NN = imm.SourceFileLoader(os.path.splitext(os.path.basename(NN_model_path))[0], NN_model_path).load_module()
NN_info = []
try: NN_model_name = NN.Model_Name()
except: NN_model_name, _ = NN.__name__, NN_info.append('ALERT: Define a model name in the neural network model file.')
try: NN_model_descript = NN.Model_Description()
except: NN_model_descript, _ = 'Empty description.', NN_info.append('ALERT: Define description for the model in the neural network model file.')
try: NN_num_classes = NN.Number_of_Classes()
except: NN_num_classes, _ = 1, NN_info.append('NOTE: The number of classes was not defined in the neural network model file, automatically set to 1.')
try: NN_batch_size = NN.Batch_Size()
except: NN_batch_size, _ = 32, NN_info.append('NOTE: The batch size was not defined in the neural network model file, automatically set to 32.')
# Initial parameters
LR_params = {'formula' : [None, 0.0, 0], # Learning rate formula calculates LR at points of epochs - ['poly', base_lr, number_of_epochs] is available
'graph' : [[0,4e-3], [100,2e-3]], # Learning rate graph defines LR at points of epochs - [[epoch_1, LR_1], [epoch_2, LR_2], ... [epoch_last, LR_last]]
'step' : [0.1, 2.5], # Multiplying values to LR - will be applied when mIoU is [NOT improved, improved]
'limit' : [0.001, 1.0] , # Limitation of LR multiplier - when [NOT improved, improved]
'patience' : [1000, 1000], # Patience counts before applying step for LR - when [NOT improved, improved]
'stop_count' : 50 } # Define a count number before early stopping
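    # A model file may override all of these at once; a hypothetical example:
    #   def Learning_Rate_Parameters():
    #       return {'formula': ['poly', 1e-3, 200], 'graph': None, 'step': [0.5, 1.0],
    #               'limit': [0.01, 1.0], 'patience': [10, 10], 'stop_count': 30}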
LR_params_set = True
try: LR_params = NN.Learning_Rate_Parameters()
except: LR_params_set, _ = False, NN_info.append('ALERT: The list of learning rate parameters was not defined in the neural network model file, trying to set separately.')
if not LR_params_set:
try: LR_params['formula'] = NN.Learning_Rate_Formula()
except: NN_info.append('ALERT: The LR formula was not defined in the neural network model file, automatically deactivated.')
        try: LR_params['graph'] = NN.Learning_Rate_List()
except: NN_info.append('ALERT: The LR graph was not defined in the neural network model file, automatically set.')
# LR_params['graph'] = [[0,3e-3], [3,3.2e-3], [12,4.5e-3], [30,4.5e-3], [50,3e-3], [80,1e-3], [100,5e-4], [150,2e-4]]
# LR_params['graph'] = [[0, 7.81e-4], [5, 7.81e-4], [15, 6e-4], [30, 2e-4], [35, 1e-4], [50, 2e-5]] # For aorta
# LR_params['graph'] = [[0, 7.81e-4], [30, 7.81e-4], [50, 6e-4], [100, 2e-4], [140, 1e-4], [200, 1e-5]] # For heart
# LR_params['graph'] = [[0,2e-3], [50,2e-3], [80,1.5e-3], [100,1.1e-3], [150,3e-4], [200,1e-4]] # For heart 20191116
# LR_params['graph'] = [[0,3e-3], [3,3.2e-3], [12,4.5e-3], [30,4.5e-3], [50,3e-3], [80,1e-3], [100,5e-4], [150,2e-4]] # For heart 20200106
# LR_params['graph'] = [[0,3e-3], [3,3.2e-3], [12,4.8e-3], [30,4.8e-3], [50,3.8e-3], [80,2e-3], [100,1.1e-3], [150,3e-4], [200,1e-4]] # For heart 20190903
# LR_params['graph'] = [[0,3e-3], [3,3.2e-3], [12,4.8e-3], [30,4.8e-3], [50,3.8e-3], [80,2e-3], [100,1.1e-3], [180,3e-4], [250,6e-5]] # For heart 20190903
# LR_params['graph'] = [[0,3e-3], [3,3.2e-3], [12,4.8e-3], [30,4.8e-3], [50,3.8e-3], [80,2e-3], [120,1.1e-3], [180,3e-4], [250,1e-4]] # For heart 20191107
# LR_params['graph'] = [[0,1e-3], [100,1e-3]] # For heart 20191118 Paper
# LR_params['graph'] = [[0, 1.5e-4], [30, 1.0e-4], [80, 0.2e-4], [130, 0.05e-4], [180, 0.01e-4]] # For heart
# LR_params['graph'] = [[0, 7.81e-4], [2, 7.81e-4], [10, 2e-5]] # For bone
try: LR_params['patience'] = NN.Count_before_LR_Step()
except: NN_info.append('ALERT: The patience count for LR step was not defined in the neural network model file, automatically set.')
try: LR_params['stop_count'] = NN.Count_before_Stop()
except: NN_info.append('ALERT: The count for early stopping was not defined in the neural network model file, automatically set.')
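    # The epoch budget comes from the formula's epoch count when a formula is active,
    # otherwise from the epoch of the last point in the LR graph.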
if LR_params['formula'][0] is None: number_of_epochs = LR_params['graph'][-1][0]
else: number_of_epochs = LR_params['formula'][2]
# Training parameters
############################################
# Resume training
if sys.argv[5] == '1':
init_epoch = int(sys.argv[7]) - 1 # Starting from zero
if init_epoch < 0 or init_epoch >= number_of_epochs:
init_epoch = min(number_of_epochs-1, max(0, init_epoch))
            print('ALERT : Initial epoch [{0}] is clipped between 0 and {1}'.format(sys.argv[7], number_of_epochs-1))
trained_model_path = sys.argv[6]
training_mode = 'Resume training'
NN_model_descript = 'The following model will be resumed - ' + trained_model_path + '\n\t\t\t' + NN_model_descript
# Normal training or Retraining
elif sys.argv[5] == '0' or sys.argv[5] == '2':
init_epoch = 0 # Starting from zero
if sys.argv[5] == '0':
trained_model_path = None
training_mode = 'Normal training'
else:
trained_model_path = sys.argv[6]
            training_mode = 'Retraining with trained weights'
NN_model_descript = 'Trained weights will be loaded from - ' + trained_model_path + '\n\t\t\t' + NN_model_descript
else:
print('ERROR : Invalid training mode!!!')
sys.exit()
# Loaded neural network code may not have Custom_Layers()
try: custom_layers = NN.Custom_Layers()
except: custom_layers, _ = {}, \
NN_info.append('ALERT: The dictionary of custom layers was not defined in the neural network model file, automatically set to empty.')
    # Training mode: 0=Normal, 1=Resume from a saved model, 2=Retrain with loaded weights
if sys.argv[5] == '0':
model = NN.Build_Model()
elif sys.argv[5] == '1':
model = load_model(trained_model_path, custom_objects=dict(**custom_loss, **custom_metrics, **custom_layers), compile=False)
elif sys.argv[5] == '2':
model = NN.Build_Model()
model.load_weights(trained_model_path, by_name=True)
else:
print('Invalid mode.')
sys.exit()
# Optimizers
from keras.optimizers import SGD, Adam, Nadam
# from tensorflow.keras.optimizers import SGD, Adam, Nadam ##### For TensorFlow v2 #####
# from optimizers.AdaBound1.adabound1 import AdaBound1
# from optimizers.AdaBound2.adabound2 import AdaBound2
# from optimizers.Santa.Santa import Santa
if LR_params['formula'][0] is not None: base_lr = LR_params['formula'][1]
elif LR_params['graph'] is not None: base_lr = LR_params['graph'][0][1]
else:
print('Invalid learning rate.')
sys.exit()
try: optimizer = NN.Optimizer(base_lr=base_lr)
except: optimizer, _ = Nadam(lr=base_lr, beta_1=0.9, beta_2=0.999), \
NN_info.append('ALERT: The optimizer was not defined in the neural network model file, automatically set to Nadam.')
# except: optimizer, _ = SGD(lr=base_lr, momentum=0.9, nesterov=True), \
# except: optimizer, _ = Adam(lr=base_lr, beta_1=0.9, beta_2=0.999, amsgrad=False), \
# except: optimizer, _ = Nadam(lr=base_lr, beta_1=0.9, beta_2=0.999), \
# except: optimizer, _ = AdaBound1(lr=base_lr, final_lr=0.5, beta_1=0.9, beta_2=0.999, gamma=1e-3, amsbound=False, weight_decay=0.0), \
# except: optimizer, _ = AdaBound2(lr=base_lr, beta_1=0.9, beta_2=0.999, terminal_bound=0.5, lower_bound=0.0, upper_bound=None), \
# except: optimizer, _ = Santa(lr=base_lr, exploration=max(number_of_epochs/2, number_of_epochs-20), rho=0.95, anne_rate=0.5), \
# Compile
model.compile(optimizer = optimizer,
loss = loss,
metrics = list(custom_metrics.values()),
loss_weights = None,
sample_weight_mode = None,
weighted_metrics = None,
target_tensors = None )
# Paths and directories
datestr = startdate.strftime("%Y%m%d%H%M%S")
work_dir_path = os.path.join(sys.argv[4], 'run'+datestr+' ('+training_mode+')')
code_dir_path = os.path.join(work_dir_path, 'code')
NN_dir_path = os.path.join(code_dir_path, 'neural_networks')
utils_dir_path = os.path.join(code_dir_path, 'utils')
model_base_path = os.path.join(work_dir_path, 'model'+datestr)
os.makedirs(NN_dir_path)
os.makedirs(utils_dir_path)
shutil.copy2(trainFilePath, os.path.join(code_dir_path, os.path.basename(trainFilePath))) # Copy this file
shutil.copy2(NN_model_path, os.path.join(NN_dir_path, os.path.basename(NN_model_path))) # Copy model file
shutil.copy2(lossAndMetricsPath, os.path.join(utils_dir_path, os.path.basename(lossAndMetricsPath))) # Copy loss and metrics file
if custom_layers:
shutil.copy2(os.path.join(trainDirPath, 'neural_networks', 'Custom_layers.py'), os.path.join(NN_dir_path, 'Custom_layers.py')) # Copy layer file
# Descriptions
model.summary()
print('Date : {0}'.format(startdate))
print('TensorFlow version : {0}'.format(tf.version.VERSION))
print('TF-Keras version : {0}'.format(tf.keras.__version__))
print('Keras version : {0}'.format(_keras.__version__))
print('OS-version : {0}'.format(platform.platform()))
print('Processor : {0}'.format(platform.processor()))
print('__________________________________________________________________________________________________')
print('Training mode : {0}'.format(training_mode))
print('Model name : {0}'.format(NN_model_name))
print('Model description : {0}'.format(NN_model_descript))
print('Number of classes : {0}'.format(NN_num_classes))
print('Loaded model path : {0}'.format(NN_model_path))
print('Working directory : {0}'.format(work_dir_path))
print('__________________________________________________________________________________________________')
print('Keras data format : {0}'.format(K.image_data_format()))
print('Optimizer : {0}'.format(optimizer.__class__.__name__))
print('Loss : {0}'.format(loss.__name__))
print('Metrics : {0}'.format(model.metrics_names[1:])) # model.metrics_names[0] = 'loss'
print('Custom layers : {0}'.format(list(custom_layers.keys()) ))
print('Batch size : {0}'.format(NN_batch_size))
print('Epochs : {0} - {1}'.format(init_epoch+1, number_of_epochs))
print('Learning rate formula : {0}'.format(LR_params['formula']))
print('Learning rate graph : {0}'.format(LR_params['graph']))
print('LR step : {0}'.format(LR_params['step']))
print('LR limit : {0}'.format(LR_params['limit']))
print('Patience for LR step : {0}'.format(LR_params['patience']))
print('Patience for early stop : {0}'.format(LR_params['stop_count']))
print('__________________________________________________________________________________________________')
if len(NN_info) > 0:
for info in NN_info: print(info)
print('__________________________________________________________________________________________________')
# Image data generator
'''
For TensorFlow 2.0
Model.fit_generator IS DEPRECATED.
To use Model.fit, generator classes, ImageDataGenerator_XXX(), were updated as subclasses of keras.utils.Sequence.
See:
https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit
https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit_generator
'''
from utils.Image_data_generator import ImageDataGenerator_CSV_with_Header, ImageDataGenerator_h5_Dataset
print('\n- Loading images for training...')
ext = os.path.splitext(sys.argv[2])[1]
if ext == '.csv' : training_images = ImageDataGenerator_CSV_with_Header('Train data from CSV', sys.argv[2], batch_size=NN_batch_size, rescale=1.0/255.0, shuffle=True)
elif ext == '.h5' : training_images = ImageDataGenerator_h5_Dataset('image_training', sys.argv[2], batch_size=NN_batch_size, rescale=1.0/255.0)
    else : print('ERROR : Unsupported training data file type: {0}'.format(ext)); sys.exit()
print('\n- Loading images for validation...')
ext = os.path.splitext(sys.argv[3])[1]
if ext == '.csv' : validation_images = ImageDataGenerator_CSV_with_Header('Validation data from CSV', sys.argv[3], batch_size=NN_batch_size, rescale=1.0/255.0, shuffle=True)
elif ext == '.h5' : validation_images = ImageDataGenerator_h5_Dataset('image_validation', sys.argv[3], batch_size=NN_batch_size, rescale=1.0/255.0)
    else : print('ERROR : Unsupported validation data file type: {0}'.format(ext)); sys.exit()
# Allocate callbacks
print('\n- Defining callbacks...')
bm_monitor = BestMetricsMonitor(validation=validation_images, model_base_path=model_base_path, nn_name=NN_model_name, patience=LR_params['stop_count'])
lr_manager = AutoLRManager(param=LR_params, bm_monitor=bm_monitor)
def ScheduleLR(epoch, lr):
import math
raw_lr = lr
if LR_params['formula'][0] == 'poly':
# See https://arxiv.org/pdf/1506.04579.pdf
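            # Polynomial decay: raw_lr = base_lr * (1 - epoch / number_of_epochs) ** 0.9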
print('Learning rate by poly: base_lr = {0}, power = 0.9'.format(LR_params['formula'][1]))
raw_lr = LR_params['formula'][1] * math.pow(1 - epoch / number_of_epochs, 0.9)
# elif LR_params['formula'][0] == 'XXX':
# print('Learning rate by XXX: ...
# raw_lr = LR_params['formula'][1] ...
elif LR_params['graph'] is not None:
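            # Piecewise-linear interpolation of LR between the two [epoch, LR] control
            # points that bracket the current epoch.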
def LR_at_epoch(epoch, pt1, pt2): return (pt2[1] - pt1[1]) / (pt2[0] - pt1[0]) * (epoch - pt1[0]) + pt1[1]
print('Learning rate by graph [epoch, LR] : {0}'.format(LR_params['graph']))
for i in range(len(LR_params['graph'])-1):
if LR_params['graph'][i][0] <= epoch and epoch < LR_params['graph'][i+1][0]:
raw_lr = LR_at_epoch(epoch, LR_params['graph'][i], LR_params['graph'][i+1])
break
m = lr_manager.get_LR_multiplier()
new_LR = m * raw_lr
print('LR = {0} (raw LR = {1}, multiplier = {2})'.format(new_LR, raw_lr, m))
return new_LR
# check_pointer = ModelCheckpoint(model_base_path, monitor=LR_params['monitor_for_best'][0], verbose=1, save_best_only=True, mode=LR_params['monitor_for_best'][1])
# early_stopper = EarlyStopping(monitor=LR_params['monitor_for_best'][0], min_delta=0, patience=LR_params['stop_count'], verbose=1, mode=LR_params['monitor_for_best'][1])
lr_scheduler = LearningRateScheduler(ScheduleLR, verbose=0)
csv_logger = CSVLogger(os.path.join(work_dir_path,'training_log.csv'), separator=',', append=False)
# tensorboard = TensorBoard(log_dir=work_dir_path, histogram_freq=0, write_graph=True, write_images=True)
# Save descriptions, network figure and parameters
plot_model(model, to_file=os.path.join(work_dir_path,'model_figure.png'), show_shapes=True, show_layer_names=False)
with open(os.path.join(work_dir_path,'training_parameters.txt'), mode='w') as path_file:
path_file.write('Date : {0}\n'.format(startdate))
path_file.write('TensorFlow version : {0}\n'.format(tf.version.VERSION))
path_file.write('TF-Keras version : {0}\n'.format(tf.keras.__version__))
path_file.write('Keras version : {0}\n'.format(_keras.__version__))
path_file.write('OS-version : {0}\n'.format(platform.platform()))
path_file.write('Processor : {0}\n\n'.format(platform.processor()))
path_file.write('Training mode : {0}\n'.format(training_mode))
path_file.write('Model name : {0}\n'.format(NN_model_name))
path_file.write('Model description : {0}\n'.format(NN_model_descript))
path_file.write('Number of classes : {0}\n'.format(NN_num_classes))
path_file.write('Loaded model path : {0}\n'.format(NN_model_path))
path_file.write('Working directory : {0}\n\n'.format(work_dir_path))
path_file.write('Training images : {0} sets in {1}\n'.format(training_images.datalength(), sys.argv[2]))
path_file.write('Validation images : {0} sets in {1}\n\n'.format(validation_images.datalength(), sys.argv[3]))
path_file.write('Keras data format : {0}\n'.format(K.image_data_format()))
path_file.write('Optimizer : {0}\n'.format(optimizer.__class__.__name__))
path_file.write('Loss : {0}\n'.format(loss.__name__))
path_file.write('Metrics : {0}\n'.format(model.metrics_names[1:]))
path_file.write('Custom layers : {0}\n'.format(list(custom_layers.keys()) ))
path_file.write('Batch size : {0}\n'.format(NN_batch_size))
path_file.write('Epochs : {0} - {1}\n'.format(init_epoch+1, number_of_epochs))
path_file.write('Learning rate formula : {0}\n'.format(LR_params['formula']))
path_file.write('Learning rate graph : {0}\n'.format(LR_params['graph']))
path_file.write('LR step : {0}\n'.format(LR_params['step']))
path_file.write('LR limit : {0}\n'.format(LR_params['limit']))
path_file.write('Patience for LR step : {0}\n\n'.format(LR_params['patience']))
path_file.write('Patience for early stop : {0}\n'.format(LR_params['stop_count']))
if len(NN_info) > 0:
for info in NN_info: path_file.write('{}\n'.format(info))
path_file.write('\n')
model.summary(print_fn=lambda x: path_file.write(x + '\n'))
# Train the model
'''
For TensorFlow 2.0
fit_generator -> fit
Warning: Model.fit_generator IS DEPRECATED. It will be removed in a future version.
Instructions for updating: Please use Model.fit, which supports generators.
See:
https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit
https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit_generator
'''
print('\n- Starting model training...')
results = model.fit_generator(
generator = training_images,
epochs = number_of_epochs,
verbose = 1,
callbacks = [bm_monitor, lr_manager, lr_scheduler, csv_logger],
validation_data = validation_images,
max_queue_size = 10,
workers = 1,
use_multiprocessing = False,
shuffle = False,
initial_epoch = init_epoch )
# results = model.fit_generator(
# generator = training_images.flow(),
# steps_per_epoch = training_images.datalength() // NN_batch_size,
# epochs = number_of_epochs,
# verbose = 1,
# callbacks = [bm_monitor, lr_manager, lr_scheduler, csv_logger],
# validation_data = validation_images.flow(),
# validation_steps = validation_images.datalength() // NN_batch_size,
# max_queue_size = 10,
# workers = 1,
# use_multiprocessing = False,
# shuffle = False,
# initial_epoch = init_epoch )
##### For TensorFlow v2 #####
# results = model.fit(
# x = training_images, # keras.utils.Sequence
# epochs = number_of_epochs,
# verbose = 1,
# callbacks = [bm_monitor, lr_manager, lr_scheduler, csv_logger],
# validation_data = validation_images.getdata(), # tuple of Numpy arrays
# shuffle = False,
# initial_epoch = init_epoch,
# validation_freq = 1,
# max_queue_size = 10,
# workers = 1,
# use_multiprocessing = False )
# Show results
print('\n- Saving training graph...')
try:
his_loss = results.history['loss']
his_miou = results.history['iou_score']
his_dice = results.history['dice_coef']
his_valloss = results.history['val_loss']
his_valmiou = results.history['val_iou_score']
his_valdice = results.history['val_dice_coef']
xlen = range(len(his_loss))
fig = plt.figure()
ax1 = fig.add_subplot(111) # Loss
ax2 = ax1.twinx()
ax1.plot(xlen, his_loss, marker='.', color='salmon', label='Loss - training')
ax1.plot(xlen, his_valloss, marker='.', color='red', label='Loss - validation')
ax2.plot(xlen, his_miou, marker='.', color='deepskyblue', label='mIoU - training')
ax2.plot(xlen, his_valmiou, marker='.', color='blue', label='mIoU - validation')
ax2.plot(xlen, his_dice, marker='.', color='limegreen', label='Dice - training')
ax2.plot(xlen, his_valdice, marker='.', color='green', label='Dice - validation')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss:'+loss.__name__)
ax1.set_yscale("log")
ax1.set_ylim([0.001, 10.0])
ax2.set_ylabel('Metrics')
ax2.set_yscale("log")
ax2.set_ylim([0.6, 1.0])
h1, l1 = ax1.get_legend_handles_labels()
h2, l2 = ax2.get_legend_handles_labels()
ax1.legend(h1+h2, l1+l2, loc='lower center')
plt.savefig(os.path.join(work_dir_path,'training_graph.png'))
# plt.show()
except:
print('ALERT: Failed to save the training graph figure.')
print('\n==================================================================================================')
print('Computation time : {0}'.format(timedelta(seconds=time.time()-starttime)))
print('From the date : {0}\n'.format(startdate))
print('==================================================================================================')
# Main
if __name__ == '__main__': Train()