diff --git a/conf/base/parameters.yml b/conf/base/parameters.yml index acca761..fde8db6 100644 --- a/conf/base/parameters.yml +++ b/conf/base/parameters.yml @@ -43,13 +43,13 @@ data_renaming: # TODO: remove later when renaming finished modelling: # obj detection - detection_model: "yolov3-tiny" + detection_model: "yolov3_tf" detection_implementation: "cvlib" detection_iou_threshold: 0.05 detection_confidence_threshold: 0.2 - # TODO: change nms threshold to iou threshold + # TODO: change nms threshold to iou threshold detection_nms_threshold: 0.2 - + # tracking selected_labels: ["car", "truck", "bus", "motorbike"] opencv_tracker_type: "csrt" diff --git a/conf/base/paths.yml b/conf/base/paths.yml index 76fdec9..00a6b11 100644 --- a/conf/base/paths.yml +++ b/conf/base/paths.yml @@ -7,15 +7,21 @@ s3_paths: s3_camera_details: "ref/camera_details/camera_details.json" s3_frame_level: "frame_level/" # TODO DELETE THIS - s3_profile: "dssg" # TODO: change this for user? + s3_profile: "dssg" # TODO: change this for user? s3_creds: "dev_s3" # TODO: CHANGE TO JUST S3 s3_detection_model: "ref/model_conf/" + s3_cvat_annotations: "ref/annotations/cvat/" + s3_detrac_annotations: "ref/annotations/detrac/" + s3_detrac_images: "raw/images/detrac/" + s3_cvat_training_annotations: "ref/annotations/cvat_train/" local_paths: temp_video: "data/temp/videos/" + temp_raw_images: "data/temp/raw_images/" temp_raw_video: "data/temp/raw_videos/" temp_frame_level: "data/temp/frame_level/" temp_video_level: "data/temp/video_level/" + temp_annotation: "data/temp/annotation/" temp_setup: "data/temp/setup/" video_names: "data/ref/video_names/" diff --git a/conf/base/training_parameters.yml b/conf/base/training_parameters.yml new file mode 100644 index 0000000..89b02ac --- /dev/null +++ b/conf/base/training_parameters.yml @@ -0,0 +1,34 @@ +training: + num_batches : 10 + letterbox_resize : True # Whether to use letterbox resize, i.e., keep the original aspect ratio in the resized img. 
+ total_epochs : 1000 + train_evaluation_step : 100 # Evaluate on the training batch after some steps. + val_evaluation_epoch : 2 # Evaluate on the validation dataset after some epochs. Set to None to evaluate every epoch. + save_epoch : 10 # Save the model after some epochs. + batch_norm_decay : 0.99 # decay in bn ops + weight_decay : 0.0005 # l2 weight decay + global_step : 0 # used when resuming training + warm_up_epoch : 3 # set to larger value if gradient explodes + num_threads : 10 # Number of threads for image processing used in tf.data pipeline. + prefetech_buffer : 5 # Prefetch buffer used in tf.data pipeline. + trained_model_name : 'yolov3_traffic' + +learning: + optimizer_name : 'momentum' # Chosen from [sgd, momentum, adam, rmsprop] + save_optimizer : True # Whether to save the optimizer parameters into the checkpoint file. + learning_rate_init : 0.0001 + lr_type : 'piecewise' # Chosen from [exponential, piecewise] + lr_decay_epoch : 5 # Epochs after which learning rate decays. Int or float. Used when chosen `exponential` lr_type. + lr_decay_factor : 0.96 # The learning rate decay factor. Used when chosen `exponential` lr_type. + lr_lower_bound : 0.000001 # The minimum learning rate. + pw_boundaries : [30, 50] # epoch based boundaries + pw_values : [0.0001, 0.00003, 0.00001] # FIRST VALUE MUST BE LEARNING_RATE_INIT + +validation: + # nms + nms_threshold : 0.45 # iou threshold in nms operation + score_threshold : 0.01 # threshold of the prob of the classes in nms operation, i.e. score = pred_confs * pred_probs. 
+ nms_topk : 150 # keep at most nms_topk outputs after nms + + # mAP eval + eval_threshold : 0.5 # the iou threshold applied in mAP evaluation diff --git a/data/frame_level/frame001out.jpg b/data/frame_level/frame001out.jpg new file mode 100644 index 0000000..ccaed0d Binary files /dev/null and b/data/frame_level/frame001out.jpg differ diff --git a/requirements.txt b/requirements.txt index 4c35289..e45dc23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,3 +20,4 @@ sqlalchemy pandas==0.24.2 python-crontab>=2.3.8, <3.0 seaborn>=0.9 +tqdm==4.33.0 diff --git a/src/run_transfer_learning.py b/src/run_transfer_learning.py new file mode 100644 index 0000000..68d66a3 --- /dev/null +++ b/src/run_transfer_learning.py @@ -0,0 +1,33 @@ +from traffic_analysis.d00_utils.load_confs import load_paths, load_credentials, \ + load_parameters, load_training_parameters +from traffic_analysis.d04_modelling.transfer_learning.training_data_loader import TrainingDataLoader, TransferDataset +from traffic_analysis.d04_modelling.transfer_learning.train_tensorflow_model import transfer_learn + +paths = load_paths() +creds = load_credentials() +params = load_parameters() +train_params = load_training_parameters() + +training_data_loader = TrainingDataLoader(datasets=[TransferDataset.cvat, TransferDataset.detrac], + creds=creds, + paths=paths) + +fraction_for_training = 0.8 +x_train, y_train, x_test, y_test = training_data_loader.get_train_and_test(fraction_for_training) + + +saved_text_files_dir = paths['temp_annotation'] +with open(saved_text_files_dir + 'train.txt', 'w') as f: + for item in y_train: + f.write("%s\n" % item) + +with open(saved_text_files_dir + 'test.txt', 'w') as f: + for item in y_test: + f.write("%s\n" % item) + +transfer_learn(paths=paths, + params=params, + train_params=train_params, + train_file='train.txt', + test_file='test.txt', + selected_labels=params['selected_labels']) diff --git a/src/traffic_analysis/d00_utils/load_confs.py 
b/src/traffic_analysis/d00_utils/load_confs.py index 6eff1df..3a64532 100644 --- a/src/traffic_analysis/d00_utils/load_confs.py +++ b/src/traffic_analysis/d00_utils/load_confs.py @@ -30,6 +30,12 @@ def load_app_parameters(): return {**params['visualization']} +def load_training_parameters(): + with open(project_dir + '/conf/base/training_parameters.yml') as f: + params = yaml.safe_load(f) + return collapse_dict_hierarchy(params) + + def load_credentials(): filepath = os.sep.join( diff --git a/src/traffic_analysis/d02_ref/ref_utils.py b/src/traffic_analysis/d02_ref/ref_utils.py index f1a4726..5963baa 100644 --- a/src/traffic_analysis/d02_ref/ref_utils.py +++ b/src/traffic_analysis/d02_ref/ref_utils.py @@ -5,6 +5,8 @@ import subprocess from subprocess import Popen, PIPE +from traffic_analysis.d00_utils.data_loader_s3 import DataLoaderS3 + def upload_json_to_s3(paths: dict, save_name: str, @@ -66,4 +68,48 @@ def get_names_of_folder_content_from_s3(bucket_name, prefix, s3_profile): end = Time.time() elapsed_time = end-start + assert ((len(files) == 0) or (files[0] != '')), 'set your aws credentials' + return elapsed_time, files + + +def get_s3_video_path_from_xml_name(xml_file_name, s3_creds, paths): + + # Supports old and new naming conventions + vals = xml_file_name.split('_') + data_loader_s3 = DataLoaderS3(s3_credentials=s3_creds, + bucket_name=paths['bucket_name']) + + if (len(vals) >= 4): + date = vals[1] + file_names = [xml_file_name.split('_')[1:][0].replace('-', '') + '-' + + xml_file_name.split('_')[1:][1].replace('-', '')[:6] + '_' + + xml_file_name.split('_')[1:][2], + xml_file_name.split('_')[1:][0] + ' ' + + xml_file_name.split('_')[1:][1].replace('-', ':') + '_' + + xml_file_name.split('_')[1:][2]] + else: + date = vals[0] + file_names = [xml_file_name.split('_')[0].replace('-', '') + '-' + + xml_file_name.split('_')[1].replace('-', '')[:6] + '_' + + xml_file_name.split('_')[2], + xml_file_name.split('_')[0] + ' ' + + 
xml_file_name.split('_')[1].replace('-', ':') + '_' + + xml_file_name.split('_')[2]] + file_to_download = paths['s3_video'] + \ + date + '/' + \ + file_names[0] + '.mp4' + + if(data_loader_s3.file_exists(file_to_download)): + return file_to_download + + else: + file_to_download = paths['s3_video'] + \ + date + '/' + \ + file_names[1] + '.mp4' + + if (data_loader_s3.file_exists(file_to_download)): + return file_to_download + else: + print('Could not download file: ' + xml_file_name) + return diff --git a/src/traffic_analysis/d02_ref/upload_annotation_names_to_s3.py b/src/traffic_analysis/d02_ref/upload_annotation_names_to_s3.py index cd9819a..b8ee860 100644 --- a/src/traffic_analysis/d02_ref/upload_annotation_names_to_s3.py +++ b/src/traffic_analysis/d02_ref/upload_annotation_names_to_s3.py @@ -1,5 +1,6 @@ from traffic_analysis.d02_ref.ref_utils import get_names_of_folder_content_from_s3 from traffic_analysis.d00_utils.data_loader_s3 import DataLoaderS3 +from traffic_analysis.d02_ref.ref_utils import get_s3_video_path_from_xml_name def upload_annotation_names_to_s3(paths: dict, diff --git a/src/traffic_analysis/d04_modelling/perform_detection_tensorflow.py b/src/traffic_analysis/d04_modelling/perform_detection_tensorflow.py index 0b2e419..31787fb 100644 --- a/src/traffic_analysis/d04_modelling/perform_detection_tensorflow.py +++ b/src/traffic_analysis/d04_modelling/perform_detection_tensorflow.py @@ -9,7 +9,7 @@ remove_overlapping_boxes, letterbox_resize from traffic_analysis.d04_modelling.transfer_learning.convert_darknet_to_tensorflow import parse_anchors, \ yolov3_darknet_to_tensorflow -from traffic_analysis.d04_modelling.transfer_learning.generate_tensorflow_model import YoloV3 +from traffic_analysis.d04_modelling.transfer_learning.tensorflow_model_loader import YoloV3 from traffic_analysis.d04_modelling.perform_detection_opencv import label_detections, \ choose_objects_of_selected_labels diff --git 
a/src/traffic_analysis/d04_modelling/transfer_learning/convert_darknet_to_tensorflow.py b/src/traffic_analysis/d04_modelling/transfer_learning/convert_darknet_to_tensorflow.py index da9f0ef..385a888 100644 --- a/src/traffic_analysis/d04_modelling/transfer_learning/convert_darknet_to_tensorflow.py +++ b/src/traffic_analysis/d04_modelling/transfer_learning/convert_darknet_to_tensorflow.py @@ -8,7 +8,7 @@ import tensorflow as tf import numpy as np -from traffic_analysis.d04_modelling.transfer_learning.generate_tensorflow_model import YoloV3 +from traffic_analysis.d04_modelling.transfer_learning.tensorflow_model_loader import YoloV3 from traffic_analysis.d02_ref.download_detection_model_from_s3 import download_detection_model_from_s3 diff --git a/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_evaluation_utils.py b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_evaluation_utils.py new file mode 100644 index 0000000..c7bac86 --- /dev/null +++ b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_evaluation_utils.py @@ -0,0 +1,421 @@ +from __future__ import division, print_function + +import numpy as np +import sys +from collections import Counter + +PY_VERSION = sys.version_info[0] +iter_cnt = 0 + + +def voc_ap(rec, prec, use_07_metric=True): + """Compute VOC AP given precision and recall. If use_07_metric is true, uses + the VOC 07 11-point method (default:False). + """ + if use_07_metric: + # 11 point metric + ap = 0. + for t in np.arange(0., 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11. 
+ else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def get_preds_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, image_ids, y_pred): + ''' + Given the y_pred of an input image, get the predicted bbox and label info. + return: + pred_content: 2d list. + ''' + image_id = image_ids[0] + + # keep the first dimension 1 + pred_boxes = y_pred[0][0:1] + pred_confs = y_pred[1][0:1] + pred_probs = y_pred[2][0:1] + + boxes, scores, labels = sess.run(gpu_nms_op, + feed_dict={pred_boxes_flag: pred_boxes, + pred_scores_flag: pred_confs * pred_probs}) + + pred_content = [] + for i in range(len(labels)): + x_min, y_min, x_max, y_max = boxes[i] + score = scores[i] + label = labels[i] + pred_content.append([image_id, x_min, y_min, x_max, y_max, score, label]) + + return pred_content + + +def calc_iou(pred_boxes, true_boxes): + ''' + Maintain an efficient way to calculate the IoU matrix using the numpy broadcast tricks. + shape_info: pred_boxes: [N, 4] + true_boxes: [V, 4] + return: IoU matrix: shape: [N, V] + ''' + + # [N, 1, 4] + pred_boxes = np.expand_dims(pred_boxes, -2) + # [1, V, 4] + true_boxes = np.expand_dims(true_boxes, 0) + + # [N, 1, 2] & [1, V, 2] ==> [N, V, 2] + intersect_mins = np.maximum(pred_boxes[..., :2], true_boxes[..., :2]) + intersect_maxs = np.minimum(pred_boxes[..., 2:], true_boxes[..., 2:]) + intersect_wh = np.maximum(intersect_maxs - intersect_mins, 0.) 
+ + # shape: [N, V] + intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1] + # shape: [N, 1, 2] + pred_box_wh = pred_boxes[..., 2:] - pred_boxes[..., :2] + # shape: [N, 1] + pred_box_area = pred_box_wh[..., 0] * pred_box_wh[..., 1] + # [1, V, 2] + true_boxes_wh = true_boxes[..., 2:] - true_boxes[..., :2] + # [1, V] + true_boxes_area = true_boxes_wh[..., 0] * true_boxes_wh[..., 1] + + # shape: [N, V] + iou = intersect_area / (pred_box_area + true_boxes_area - intersect_area + 1e-10) + + return iou + + +def voc_eval(gt_dict, val_preds, classidx, iou_thres=0.5, use_07_metric=False): + ''' + Top level function that does the PASCAL VOC evaluation. + ''' + # 1.obtain gt: extract all gt objects for this class + class_recs = {} + npos = 0 + for img_id in gt_dict: + R = [obj for obj in gt_dict[img_id] if obj[-1] == classidx] + bbox = np.array([x[:4] for x in R]) + det = [False] * len(R) + npos += len(R) + class_recs[img_id] = {'bbox': bbox, 'det': det} + + # 2. obtain pred results + pred = [x for x in val_preds if x[-1] == classidx] + img_ids = [x[0] for x in pred] + confidence = np.array([x[-2] for x in pred]) + BB = np.array([[x[1], x[2], x[3], x[4]] for x in pred]) + + # 3. sort by confidence + sorted_ind = np.argsort(-confidence) + try: + BB = BB[sorted_ind, :] + except: + print('no box, ignore') + return 1e-6, 1e-6, 0, 0, 0 + img_ids = [img_ids[x] for x in sorted_ind] + + # 4. mark TPs and FPs + nd = len(img_ids) + tp = np.zeros(nd) + fp = np.zeros(nd) + + for d in range(nd): + # all the gt info in some image + R = class_recs[img_ids[d]] + bb = BB[d, :] + ovmax = -np.Inf + BBGT = R['bbox'] + + if BBGT.size > 0: + # calc iou + # intersection + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin + 1., 0.) + ih = np.maximum(iymax - iymin + 1., 0.) + inters = iw * ih + + # union + uni = ((bb[2] - bb[0] + 1.) 
* (bb[3] - bb[1] + 1.) + (BBGT[:, 2] - BBGT[:, 0] + 1.) * ( + BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) + + overlaps = inters / uni + ovmax = np.max(overlaps) + jmax = np.argmax(overlaps) + + if ovmax > iou_thres: + # gt not matched yet + if not R['det'][jmax]: + tp[d] = 1. + R['det'][jmax] = 1 + else: + fp[d] = 1. + else: + fp[d] = 1. + + # compute precision recall + fp = np.cumsum(fp) + tp = np.cumsum(tp) + rec = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = voc_ap(rec, prec, use_07_metric) + + # return rec, prec, ap + return npos, nd, tp[-1] / float(npos), tp[-1] / float(nd), ap + + +def evaluate_on_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, + y_pred, y_true, num_classes, iou_thresh=0.5, calc_now=True): + ''' + Given y_pred and y_true of a batch of data, get the recall and precision of the current batch. + This function will perform gpu operation on the GPU. 
+ ''' + + num_images = y_true[0].shape[0] + true_labels_dict = {i: 0 for i in range(num_classes)} # {class: count} + pred_labels_dict = {i: 0 for i in range(num_classes)} + true_positive_dict = {i: 0 for i in range(num_classes)} + + for i in range(num_images): + true_labels_list, true_boxes_list = [], [] + for j in range(3): # three feature maps + # shape: [13, 13, 3, 80] + true_probs_temp = y_true[j][i][..., 5:-1] + # shape: [13, 13, 3, 4] (x_center, y_center, w, h) + true_boxes_temp = y_true[j][i][..., 0:4] + + # [13, 13, 3] + object_mask = true_probs_temp.sum(axis=-1) > 0 + + # [V, 80] V: Ground truth number of the current image + true_probs_temp = true_probs_temp[object_mask] + # [V, 4] + true_boxes_temp = true_boxes_temp[object_mask] + + # [V], labels, each from 0 to 79 + true_labels_list += np.argmax(true_probs_temp, axis=-1).tolist() + # [V, 4] (x_center, y_center, w, h) + true_boxes_list += true_boxes_temp.tolist() + + if len(true_labels_list) != 0: + for cls, count in Counter(true_labels_list).items(): + true_labels_dict[cls] += count + + # [V, 4] (xmin, ymin, xmax, ymax) + true_boxes = np.array(true_boxes_list) + box_centers, box_sizes = true_boxes[:, 0:2], true_boxes[:, 2:4] + true_boxes[:, 0:2] = box_centers - box_sizes / 2. 
+ true_boxes[:, 2:4] = true_boxes[:, 0:2] + box_sizes + + # [1, xxx, 4] + pred_boxes = y_pred[0][i:i + 1] + pred_confs = y_pred[1][i:i + 1] + pred_probs = y_pred[2][i:i + 1] + + # pred_boxes: [N, 4] + # pred_confs: [N] + # pred_labels: [N] + # N: Detected box number of the current image + pred_boxes, pred_confs, pred_labels = sess.run(gpu_nms_op, + feed_dict={pred_boxes_flag: pred_boxes, + pred_scores_flag: pred_confs * pred_probs}) + # len: N + pred_labels_list = [] if pred_labels is None else pred_labels.tolist() + if pred_labels_list == []: + continue + + # calc iou + # [N, V] + iou_matrix = calc_iou(pred_boxes, true_boxes) + # [N] + max_iou_idx = np.argmax(iou_matrix, axis=-1) + + correct_idx = [] + correct_conf = [] + for k in range(max_iou_idx.shape[0]): + pred_labels_dict[pred_labels_list[k]] += 1 + match_idx = max_iou_idx[k] # V level + if iou_matrix[k, match_idx] > iou_thresh and true_labels_list[match_idx] == pred_labels_list[k]: + if match_idx not in correct_idx: + correct_idx.append(match_idx) + correct_conf.append(pred_confs[k]) + else: + same_idx = correct_idx.index(match_idx) + if pred_confs[k] > correct_conf[same_idx]: + correct_idx.pop(same_idx) + correct_conf.pop(same_idx) + correct_idx.append(match_idx) + correct_conf.append(pred_confs[k]) + + for t in correct_idx: + true_positive_dict[true_labels_list[t]] += 1 + + if calc_now: + # avoid divided by 0 + recall = sum(true_positive_dict.values()) / (sum(true_labels_dict.values()) + 1e-6) + precision = sum(true_positive_dict.values()) / (sum(pred_labels_dict.values()) + 1e-6) + + return recall, precision + else: + return true_positive_dict, true_labels_dict, pred_labels_dict + + +def evaluate_on_cpu(y_pred, y_true, num_classes, calc_now=True, max_boxes=50, score_thresh=0.5, iou_thresh=0.5): + ''' + Given y_pred and y_true of a batch of data, get the recall and precision of the current batch. 
+ ''' + + num_images = y_true[0].shape[0] + true_labels_dict = {i: 0 for i in range(num_classes)} # {class: count} + pred_labels_dict = {i: 0 for i in range(num_classes)} + true_positive_dict = {i: 0 for i in range(num_classes)} + + for i in range(num_images): + true_labels_list, true_boxes_list = [], [] + for j in range(3): # three feature maps + # shape: [13, 13, 3, 80] + true_probs_temp = y_true[j][i][..., 5:-1] + # shape: [13, 13, 3, 4] (x_center, y_center, w, h) + true_boxes_temp = y_true[j][i][..., 0:4] + + # [13, 13, 3] + object_mask = true_probs_temp.sum(axis=-1) > 0 + + # [V, 3] V: Ground truth number of the current image + true_probs_temp = true_probs_temp[object_mask] + # [V, 4] + true_boxes_temp = true_boxes_temp[object_mask] + + # [V], labels + true_labels_list += np.argmax(true_probs_temp, axis=-1).tolist() + # [V, 4] (x_center, y_center, w, h) + true_boxes_list += true_boxes_temp.tolist() + + if len(true_labels_list) != 0: + for cls, count in Counter(true_labels_list).items(): + true_labels_dict[cls] += count + + # [V, 4] (xmin, ymin, xmax, ymax) + true_boxes = np.array(true_boxes_list) + box_centers, box_sizes = true_boxes[:, 0:2], true_boxes[:, 2:4] + true_boxes[:, 0:2] = box_centers - box_sizes / 2. 
+ true_boxes[:, 2:4] = true_boxes[:, 0:2] + box_sizes + + # [1, xxx, 4] + pred_boxes = y_pred[0][i:i + 1] + pred_confs = y_pred[1][i:i + 1] + pred_probs = y_pred[2][i:i + 1] + + # pred_boxes: [N, 4] + # pred_confs: [N] + # pred_labels: [N] + # N: Detected box number of the current image + pred_boxes, pred_confs, pred_labels = cpu_nms(pred_boxes, pred_confs * pred_probs, num_classes, + max_boxes=max_boxes, score_thresh=score_thresh, + iou_thresh=iou_thresh) + + # len: N + pred_labels_list = [] if pred_labels is None else pred_labels.tolist() + if pred_labels_list == []: + continue + + # calc iou + # [N, V] + iou_matrix = calc_iou(pred_boxes, true_boxes) + # [N] + max_iou_idx = np.argmax(iou_matrix, axis=-1) + + correct_idx = [] + correct_conf = [] + for k in range(max_iou_idx.shape[0]): + pred_labels_dict[pred_labels_list[k]] += 1 + match_idx = max_iou_idx[k] # V level + if iou_matrix[k, match_idx] > iou_thresh and true_labels_list[match_idx] == pred_labels_list[k]: + if match_idx not in correct_idx: + correct_idx.append(match_idx) + correct_conf.append(pred_confs[k]) + else: + same_idx = correct_idx.index(match_idx) + if pred_confs[k] > correct_conf[same_idx]: + correct_idx.pop(same_idx) + correct_conf.pop(same_idx) + correct_idx.append(match_idx) + correct_conf.append(pred_confs[k]) + + for t in correct_idx: + true_positive_dict[true_labels_list[t]] += 1 + + if calc_now: + # avoid divided by 0 + recall = sum(true_positive_dict.values()) / (sum(true_labels_dict.values()) + 1e-6) + precision = sum(true_positive_dict.values()) / (sum(pred_labels_dict.values()) + 1e-6) + + return recall, precision + else: + return true_positive_dict, true_labels_dict, pred_labels_dict + + +gt_dict = {} # key: img_id, value: gt object list +def parse_gt_rec(gt_filename, target_img_size, letterbox_resize=True): + ''' + parse and re-organize the gt info. + return: + gt_dict: dict. Each key is a img_id, the value is the gt bboxes in the corresponding img. 
+ ''' + + global gt_dict + + if not gt_dict: + new_width, new_height = target_img_size + with open(gt_filename, 'r') as f: + for line in f: + img_id, pic_path, boxes, labels, ori_width, ori_height = parse_line(line) + + objects = [] + for i in range(len(labels)): + x_min, y_min, x_max, y_max = boxes[i] + label = labels[i] + + if letterbox_resize: + resize_ratio = min(new_width / ori_width, new_height / ori_height) + + resize_w = int(resize_ratio * ori_width) + resize_h = int(resize_ratio * ori_height) + + dw = int((new_width - resize_w) / 2) + dh = int((new_height - resize_h) / 2) + + objects.append([x_min * resize_ratio + dw, + y_min * resize_ratio + dh, + x_max * resize_ratio + dw, + y_max * resize_ratio + dh, + label]) + else: + objects.append([x_min * new_width / ori_width, + y_min * new_height / ori_height, + x_max * new_width / ori_width, + y_max * new_height / ori_height, + label]) + gt_dict[img_id] = objects + return gt_dict diff --git a/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_image_formatting_utils.py b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_image_formatting_utils.py new file mode 100644 index 0000000..2495c6f --- /dev/null +++ b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_image_formatting_utils.py @@ -0,0 +1,381 @@ +from __future__ import division, print_function + +import numpy as np +import sys +import cv2 +import random + +PY_VERSION = sys.version_info[0] +iter_cnt = 0 + + +def random_crop_with_constraints(bbox, size, min_scale=0.3, max_scale=1, + max_aspect_ratio=2, constraints=None, + max_trial=50): + """Crop an image randomly with bounding box constraints. + This data augmentation is used in training of + Single Shot Multibox Detector [#]_. More details can be found in + data augmentation section of the original paper. + .. [#] Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, + Scott Reed, Cheng-Yang Fu, Alexander C. Berg. + SSD: Single Shot MultiBox Detector. 
ECCV 2016. + Parameters + ---------- + bbox : numpy.ndarray + Numpy.ndarray with shape (N, 4+) where N is the number of bounding boxes. + The second axis represents attributes of the bounding box. + Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`, + we allow additional attributes other than coordinates, which stay intact + during bounding box transformations. + size : tuple + Tuple of length 2 of image shape as (width, height). + min_scale : float + The minimum ratio between a cropped region and the original image. + The default value is :obj:`0.3`. + max_scale : float + The maximum ratio between a cropped region and the original image. + The default value is :obj:`1`. + max_aspect_ratio : float + The maximum aspect ratio of cropped region. + The default value is :obj:`2`. + constraints : iterable of tuples + An iterable of constraints. + Each constraint should be :obj:`(min_iou, max_iou)` format. + If means no constraint if set :obj:`min_iou` or :obj:`max_iou` to :obj:`None`. + If this argument defaults to :obj:`None`, :obj:`((0.1, None), (0.3, None), + (0.5, None), (0.7, None), (0.9, None), (None, 1))` will be used. + max_trial : int + Maximum number of trials for each constraint before exit no matter what. + Returns + ------- + numpy.ndarray + Cropped bounding boxes with shape :obj:`(M, 4+)` where M <= N. + tuple + Tuple of length 4 as (x_offset, y_offset, new_width, new_height). 
+ """ + # default params in paper + if constraints is None: + constraints = ( + (0.1, None), + (0.3, None), + (0.5, None), + (0.7, None), + (0.9, None), + (None, 1), + ) + + w, h = size + + candidates = [(0, 0, w, h)] + for min_iou, max_iou in constraints: + min_iou = -np.inf if min_iou is None else min_iou + max_iou = np.inf if max_iou is None else max_iou + + for _ in range(max_trial): + scale = random.uniform(min_scale, max_scale) + aspect_ratio = random.uniform( + max(1 / max_aspect_ratio, scale * scale), + min(max_aspect_ratio, 1 / (scale * scale))) + crop_h = int(h * scale / np.sqrt(aspect_ratio)) + crop_w = int(w * scale * np.sqrt(aspect_ratio)) + + crop_t = random.randrange(h - crop_h) + crop_l = random.randrange(w - crop_w) + crop_bb = np.array((crop_l, crop_t, crop_l + crop_w, crop_t + crop_h)) + + if len(bbox) == 0: + top, bottom = crop_t, crop_t + crop_h + left, right = crop_l, crop_l + crop_w + return bbox, (left, top, right-left, bottom-top) + + iou = bbox_iou(bbox, crop_bb[np.newaxis]) + if min_iou <= iou.min() and iou.max() <= max_iou: + top, bottom = crop_t, crop_t + crop_h + left, right = crop_l, crop_l + crop_w + candidates.append((left, top, right-left, bottom-top)) + break + + # random select one + while candidates: + crop = candidates.pop(np.random.randint(0, len(candidates))) + new_bbox = bbox_crop(bbox, crop, allow_outside_center=False) + if new_bbox.size < 1: + continue + new_crop = (crop[0], crop[1], crop[2], crop[3]) + return new_bbox, new_crop + return bbox, (0, 0, w, h) + + +def random_color_distort(img, brightness_delta=32, hue_vari=18, sat_vari=0.5, val_vari=0.5): + ''' + randomly distort image color. Adjust brightness, hue, saturation, value. + param: + img: a BGR uint8 format OpenCV image. HWC format. 
+ ''' + + def random_hue(img_hsv, hue_vari, p=0.5): + if np.random.uniform(0, 1) > p: + hue_delta = np.random.randint(-hue_vari, hue_vari) + img_hsv[:, :, 0] = (img_hsv[:, :, 0] + hue_delta) % 180 + return img_hsv + + def random_saturation(img_hsv, sat_vari, p=0.5): + if np.random.uniform(0, 1) > p: + sat_mult = 1 + np.random.uniform(-sat_vari, sat_vari) + img_hsv[:, :, 1] *= sat_mult + return img_hsv + + def random_value(img_hsv, val_vari, p=0.5): + if np.random.uniform(0, 1) > p: + val_mult = 1 + np.random.uniform(-val_vari, val_vari) + img_hsv[:, :, 2] *= val_mult + return img_hsv + + def random_brightness(img, brightness_delta, p=0.5): + if np.random.uniform(0, 1) > p: + img = img.astype(np.float32) + brightness_delta = int(np.random.uniform(-brightness_delta, brightness_delta)) + img = img + brightness_delta + return np.clip(img, 0, 255) + + # brightness + img = random_brightness(img, brightness_delta) + img = img.astype(np.uint8) + + # color jitter + img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.float32) + + if np.random.randint(0, 2): + img_hsv = random_value(img_hsv, val_vari) + img_hsv = random_saturation(img_hsv, sat_vari) + img_hsv = random_hue(img_hsv, hue_vari) + else: + img_hsv = random_saturation(img_hsv, sat_vari) + img_hsv = random_hue(img_hsv, hue_vari) + img_hsv = random_value(img_hsv, val_vari) + + img_hsv = np.clip(img_hsv, 0, 255) + img = cv2.cvtColor(img_hsv.astype(np.uint8), cv2.COLOR_HSV2BGR) + + return img + + +def letterbox_resize(img, new_width, new_height, interp=0): + ''' + Letterbox resize. keep the original aspect ratio in the resized image. 
+ ''' + ori_height, ori_width = img.shape[:2] + + resize_ratio = min(new_width / ori_width, new_height / ori_height) + + resize_w = int(resize_ratio * ori_width) + resize_h = int(resize_ratio * ori_height) + + img = cv2.resize(img, (resize_w, resize_h), interpolation=interp) + image_padded = np.full((new_height, new_width, 3), 128, np.uint8) + + dw = int((new_width - resize_w) / 2) + dh = int((new_height - resize_h) / 2) + + image_padded[dh: resize_h + dh, dw: resize_w + dw, :] = img + + return image_padded, resize_ratio, dw, dh + + +def resize_with_bbox(img, bbox, new_width, new_height, interp=0, letterbox=False): + ''' + Resize the image and correct the bbox accordingly. + ''' + + if letterbox: + image_padded, resize_ratio, dw, dh = letterbox_resize(img, new_width, new_height, interp) + + # xmin, xmax + bbox[:, [0, 2]] = bbox[:, [0, 2]] * resize_ratio + dw + # ymin, ymax + bbox[:, [1, 3]] = bbox[:, [1, 3]] * resize_ratio + dh + + return image_padded, bbox + else: + ori_height, ori_width = img.shape[:2] + + img = cv2.resize(img, (new_width, new_height), interpolation=interp) + + # xmin, xmax + bbox[:, [0, 2]] = bbox[:, [0, 2]] / ori_width * new_width + # ymin, ymax + bbox[:, [1, 3]] = bbox[:, [1, 3]] / ori_height * new_height + + return img, bbox + + +def random_flip(img, bbox, px=0, py=0): + ''' + Randomly flip the image and correct the bbox. 
+ param: + px: + the probability of horizontal flip + py: + the probability of vertical flip + ''' + height, width = img.shape[:2] + if np.random.uniform(0, 1) < px: + img = cv2.flip(img, 1) + xmax = width - bbox[:, 0] + xmin = width - bbox[:, 2] + bbox[:, 0] = xmin + bbox[:, 2] = xmax + + if np.random.uniform(0, 1) < py: + img = cv2.flip(img, 0) + ymax = height - bbox[:, 1] + ymin = height - bbox[:, 3] + bbox[:, 1] = ymin + bbox[:, 3] = ymax + return img, bbox + + +def random_expand(img, bbox, max_ratio=4, fill=0, keep_ratio=True): + ''' + Random expand original image with borders, this is identical to placing + the original image on a larger canvas. + param: + max_ratio : + Maximum ratio of the output image on both direction(vertical and horizontal) + fill : + The value(s) for padded borders. + keep_ratio : bool + If `True`, will keep output image the same aspect ratio as input. + ''' + h, w, c = img.shape + ratio_x = random.uniform(1, max_ratio) + if keep_ratio: + ratio_y = ratio_x + else: + ratio_y = random.uniform(1, max_ratio) + + oh, ow = int(h * ratio_y), int(w * ratio_x) + off_y = random.randint(0, oh - h) + off_x = random.randint(0, ow - w) + + dst = np.full(shape=(oh, ow, c), fill_value=fill, dtype=img.dtype) + + dst[off_y:off_y + h, off_x:off_x + w, :] = img + + # correct bbox + bbox[:, :2] += (off_x, off_y) + bbox[:, 2:4] += (off_x, off_y) + + return dst, bbox + + +def mix_up(img1, img2, bbox1, bbox2): + ''' + return: + mix_img: HWC format mix up image + mix_bbox: [N, 5] shape mix up bbox, i.e. `x_min, y_min, x_max, y_mix, mixup_weight`. 
+ ''' + height = max(img1.shape[0], img2.shape[0]) + width = max(img1.shape[1], img2.shape[1]) + + mix_img = np.zeros(shape=(height, width, 3), dtype='float32') + + # rand_num = np.random.random() + rand_num = np.random.beta(1.5, 1.5) + rand_num = max(0, min(1, rand_num)) + mix_img[:img1.shape[0], :img1.shape[1], :] = img1.astype('float32') * rand_num + mix_img[:img2.shape[0], :img2.shape[1], :] += img2.astype('float32') * (1. - rand_num) + + mix_img = mix_img.astype('uint8') + + # the last element of the 2nd dimension is the mix up weight + bbox1 = np.concatenate((bbox1, np.full(shape=(bbox1.shape[0], 1), fill_value=rand_num)), axis=-1) + bbox2 = np.concatenate((bbox2, np.full(shape=(bbox2.shape[0], 1), fill_value=1. - rand_num)), axis=-1) + mix_bbox = np.concatenate((bbox1, bbox2), axis=0) + + return mix_img, mix_bbox + + +def bbox_crop(bbox, crop_box=None, allow_outside_center=True): + """Crop bounding boxes according to slice area. + This method is mainly used with image cropping to ensure bounding boxes fit + within the cropped image. + Parameters + ---------- + bbox : numpy.ndarray + Numpy.ndarray with shape (N, 4+) where N is the number of bounding boxes. + The second axis represents attributes of the bounding box. + Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`, + we allow additional attributes other than coordinates, which stay intact + during bounding box transformations. + crop_box : tuple + Tuple of length 4. :math:`(x_{min}, y_{min}, width, height)` + allow_outside_center : bool + If `False`, remove bounding boxes which have centers outside cropping area. + Returns + ------- + numpy.ndarray + Cropped bounding boxes with shape (M, 4+) where M <= N. 
+ """ + bbox = bbox.copy() + if crop_box is None: + return bbox + if not len(crop_box) == 4: + raise ValueError( + "Invalid crop_box parameter, requires length 4, given {}".format(str(crop_box))) + if sum([int(c is None) for c in crop_box]) == 4: + return bbox + + l, t, w, h = crop_box + + left = l if l else 0 + top = t if t else 0 + right = left + (w if w else np.inf) + bottom = top + (h if h else np.inf) + crop_bbox = np.array((left, top, right, bottom)) + + if allow_outside_center: + mask = np.ones(bbox.shape[0], dtype=bool) + else: + centers = (bbox[:, :2] + bbox[:, 2:4]) / 2 + mask = np.logical_and(crop_bbox[:2] <= centers, centers < crop_bbox[2:]).all(axis=1) + + # transform borders + bbox[:, :2] = np.maximum(bbox[:, :2], crop_bbox[:2]) + bbox[:, 2:4] = np.minimum(bbox[:, 2:4], crop_bbox[2:4]) + bbox[:, :2] -= crop_bbox[:2] + bbox[:, 2:4] -= crop_bbox[:2] + + mask = np.logical_and(mask, (bbox[:, :2] < bbox[:, 2:4]).all(axis=1)) + bbox = bbox[mask] + return bbox + + +def bbox_iou(bbox_a, bbox_b, offset=0): + """Calculate Intersection-Over-Union(IOU) of two bounding boxes. + Parameters + ---------- + bbox_a : numpy.ndarray + An ndarray with shape :math:`(N, 4)`. + bbox_b : numpy.ndarray + An ndarray with shape :math:`(M, 4)`. + offset : float or int, default is 0 + The ``offset`` is used to control the whether the width(or height) is computed as + (right - left + ``offset``). + Note that the offset must be 0 for normalized bboxes, whose ranges are in ``[0, 1]``. + Returns + ------- + numpy.ndarray + An ndarray with shape :math:`(N, M)` indicates IOU between each pairs of + bounding boxes in `bbox_a` and `bbox_b`. 
+ """ + if bbox_a.shape[1] < 4 or bbox_b.shape[1] < 4: + raise IndexError("Bounding boxes axis 1 must have at least length 4") + + tl = np.maximum(bbox_a[:, None, :2], bbox_b[:, :2]) + br = np.minimum(bbox_a[:, None, 2:4], bbox_b[:, 2:4]) + + area_i = np.prod(br - tl + offset, axis=2) * (tl < br).all(axis=2) + area_a = np.prod(bbox_a[:, 2:4] - bbox_a[:, :2] + offset, axis=1) + area_b = np.prod(bbox_b[:, 2:4] - bbox_b[:, :2] + offset, axis=1) + return area_i / (area_a[:, None] + area_b - area_i) diff --git a/src/traffic_analysis/d04_modelling/transfer_learning/generate_tensorflow_model.py b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_model_loader.py similarity index 59% rename from src/traffic_analysis/d04_modelling/transfer_learning/generate_tensorflow_model.py rename to src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_model_loader.py index e993bc6..b0633fa 100644 --- a/src/traffic_analysis/d04_modelling/transfer_learning/generate_tensorflow_model.py +++ b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_model_loader.py @@ -192,6 +192,186 @@ def _reshape(result): return boxes, confs, probs + def loss_layer(self, feature_map_i, y_true, anchors): + ''' + calc loss function from a certain scale + input: + feature_map_i: feature maps of a certain scale. shape: [N, 13, 13, 3*(5 + num_class)] etc. + y_true: y_ture from a certain scale. shape: [N, 13, 13, 3, 5 + num_class + 1] etc. + anchors: shape [9, 2] + ''' + + # size in [h, w] format! don't get messed up! 
+ grid_size = tf.shape(feature_map_i)[1:3] + # the downscale ratio in height and weight + ratio = tf.cast(self.img_size / grid_size, tf.float32) + # N: batch_size + N = tf.cast(tf.shape(feature_map_i)[0], tf.float32) + + x_y_offset, pred_boxes, pred_conf_logits, pred_prob_logits = self.reorg_layer(feature_map_i, anchors) + + ########### + # get mask + ########### + + # shape: take 416x416 input image and 13*13 feature_map for example: + # [N, 13, 13, 3, 1] + object_mask = y_true[..., 4:5] + + # the calculation of ignore mask if referred from + # https://github.com/pjreddie/darknet/blob/master/src/yolo_layer.c#L179 + ignore_mask = tf.TensorArray(tf.float32, size=0, dynamic_size=True) + + def loop_cond(idx, ignore_mask): + return tf.less(idx, tf.cast(N, tf.int32)) + + def loop_body(idx, ignore_mask): + # shape: [13, 13, 3, 4] & [13, 13, 3] ==> [V, 4] + # V: num of true gt box of each image in a batch + valid_true_boxes = tf.boolean_mask(y_true[idx, ..., 0:4], tf.cast(object_mask[idx, ..., 0], 'bool')) + # shape: [13, 13, 3, 4] & [V, 4] ==> [13, 13, 3, V] + iou = self.box_iou(pred_boxes[idx], valid_true_boxes) + # shape: [13, 13, 3] + best_iou = tf.reduce_max(iou, axis=-1) + # shape: [13, 13, 3] + ignore_mask_tmp = tf.cast(best_iou < 0.5, tf.float32) + # finally will be shape: [N, 13, 13, 3] + ignore_mask = ignore_mask.write(idx, ignore_mask_tmp) + return idx + 1, ignore_mask + + _, ignore_mask = tf.while_loop(cond=loop_cond, body=loop_body, loop_vars=[0, ignore_mask]) + ignore_mask = ignore_mask.stack() + # shape: [N, 13, 13, 3, 1] + ignore_mask = tf.expand_dims(ignore_mask, -1) + + # shape: [N, 13, 13, 3, 2] + pred_box_xy = pred_boxes[..., 0:2] + pred_box_wh = pred_boxes[..., 2:4] + + # get xy coordinates in one cell from the feature_map + # numerical range: 0 ~ 1 + # shape: [N, 13, 13, 3, 2] + true_xy = y_true[..., 0:2] / ratio[::-1] - x_y_offset + pred_xy = pred_box_xy / ratio[::-1] - x_y_offset + + # get_tw_th + # numerical range: 0 ~ 1 + # shape: [N, 13, 13, 3, 
2] + true_tw_th = y_true[..., 2:4] / anchors + pred_tw_th = pred_box_wh / anchors + # for numerical stability + true_tw_th = tf.where(condition=tf.equal(true_tw_th, 0), + x=tf.ones_like(true_tw_th), y=true_tw_th) + pred_tw_th = tf.where(condition=tf.equal(pred_tw_th, 0), + x=tf.ones_like(pred_tw_th), y=pred_tw_th) + true_tw_th = tf.log(tf.clip_by_value(true_tw_th, 1e-9, 1e9)) + pred_tw_th = tf.log(tf.clip_by_value(pred_tw_th, 1e-9, 1e9)) + + # box size punishment: + # box with smaller area has bigger weight. This is taken from the yolo darknet C source code. + # shape: [N, 13, 13, 3, 1] + box_loss_scale = 2. - (y_true[..., 2:3] / tf.cast(self.img_size[1], tf.float32)) * ( + y_true[..., 3:4] / tf.cast(self.img_size[0], tf.float32)) + + ############ + # loss_part + ############ + # mix_up weight + # [N, 13, 13, 3, 1] + mix_w = y_true[..., -1:] + # shape: [N, 13, 13, 3, 1] + xy_loss = tf.reduce_sum(tf.square(true_xy - pred_xy) * object_mask * box_loss_scale * mix_w) / N + wh_loss = tf.reduce_sum(tf.square(true_tw_th - pred_tw_th) * object_mask * box_loss_scale * mix_w) / N + + # shape: [N, 13, 13, 3, 1] + conf_pos_mask = object_mask + conf_neg_mask = (1 - object_mask) * ignore_mask + conf_loss_pos = conf_pos_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=object_mask, + logits=pred_conf_logits) + conf_loss_neg = conf_neg_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=object_mask, + logits=pred_conf_logits) + # TODO: may need to balance the pos-neg by multiplying some weights + conf_loss = conf_loss_pos + conf_loss_neg + if self.use_focal_loss: + alpha = 1.0 + gamma = 2.0 + # TODO: alpha should be a mask array if needed + focal_mask = alpha * tf.pow(tf.abs(object_mask - tf.sigmoid(pred_conf_logits)), gamma) + conf_loss *= focal_mask + conf_loss = tf.reduce_sum(conf_loss * mix_w) / N + + # shape: [N, 13, 13, 3, 1] + # whether to use label smooth + if self.use_label_smooth: + delta = 0.01 + label_target = (1 - delta) * y_true[..., 5:-1] + delta * 1. 
/ self.class_num + else: + label_target = y_true[..., 5:-1] + class_loss = object_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=label_target, + logits=pred_prob_logits) * mix_w + class_loss = tf.reduce_sum(class_loss) / N + + return xy_loss, wh_loss, conf_loss, class_loss + + def box_iou(self, pred_boxes, valid_true_boxes): + ''' + param: + pred_boxes: [13, 13, 3, 4], (center_x, center_y, w, h) + valid_true: [V, 4] + ''' + + # [13, 13, 3, 2] + pred_box_xy = pred_boxes[..., 0:2] + pred_box_wh = pred_boxes[..., 2:4] + + # shape: [13, 13, 3, 1, 2] + pred_box_xy = tf.expand_dims(pred_box_xy, -2) + pred_box_wh = tf.expand_dims(pred_box_wh, -2) + + # [V, 2] + true_box_xy = valid_true_boxes[:, 0:2] + true_box_wh = valid_true_boxes[:, 2:4] + + # [13, 13, 3, 1, 2] & [V, 2] ==> [13, 13, 3, V, 2] + intersect_mins = tf.maximum(pred_box_xy - pred_box_wh / 2., + true_box_xy - true_box_wh / 2.) + intersect_maxs = tf.minimum(pred_box_xy + pred_box_wh / 2., + true_box_xy + true_box_wh / 2.) + intersect_wh = tf.maximum(intersect_maxs - intersect_mins, 0.) + + # shape: [13, 13, 3, V] + intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1] + # shape: [13, 13, 3, 1] + pred_box_area = pred_box_wh[..., 0] * pred_box_wh[..., 1] + # shape: [V] + true_box_area = true_box_wh[..., 0] * true_box_wh[..., 1] + # shape: [1, V] + true_box_area = tf.expand_dims(true_box_area, axis=0) + + # [13, 13, 3, V] + iou = intersect_area / (pred_box_area + true_box_area - intersect_area + 1e-10) + + return iou + + def compute_loss(self, y_pred, y_true): + ''' + param: + y_pred: returned feature_map list by `forward` function: [feature_map_1, feature_map_2, feature_map_3] + y_true: input y_true by the tf.data pipeline + ''' + loss_xy, loss_wh, loss_conf, loss_class = 0., 0., 0., 0. 
+ anchor_group = [self.anchors[6:9], self.anchors[3:6], self.anchors[0:3]] + + # calc loss in 3 scales + for i in range(len(y_pred)): + result = self.loss_layer(y_pred[i], y_true[i], anchor_group[i]) + loss_xy += result[0] + loss_wh += result[1] + loss_conf += result[2] + loss_class += result[3] + total_loss = loss_xy + loss_wh + loss_conf + loss_class + return [total_loss, loss_xy, loss_wh, loss_conf, loss_class] + def conv2d(inputs, filters, kernel_size, strides=1): def _fixed_padding(inputs, kernel_size): diff --git a/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_processing_utils.py b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_processing_utils.py new file mode 100644 index 0000000..91b0825 --- /dev/null +++ b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_processing_utils.py @@ -0,0 +1,127 @@ +from __future__ import division, print_function + +import numpy as np +import sys +import tensorflow as tf + +PY_VERSION = sys.version_info[0] +iter_cnt = 0 + + +def cpu_nms(boxes, scores, num_classes, max_boxes=50, score_thresh=0.5, iou_thresh=0.5): + """ + Perform NMS on CPU. 
+ Arguments: + boxes: shape [1, 10647, 4] + scores: shape [1, 10647, num_classes] + """ + + boxes = boxes.reshape(-1, 4) + scores = scores.reshape(-1, num_classes) + # Picked bounding boxes + picked_boxes, picked_score, picked_label = [], [], [] + + for i in range(num_classes): + indices = np.where(scores[:,i] >= score_thresh) + filter_boxes = boxes[indices] + filter_scores = scores[:,i][indices] + if len(filter_boxes) == 0: + continue + # do non_max_suppression on the cpu + indices = py_nms(filter_boxes, filter_scores, + max_boxes=max_boxes, iou_thresh=iou_thresh) + picked_boxes.append(filter_boxes[indices]) + picked_score.append(filter_scores[indices]) + picked_label.append(np.ones(len(indices), dtype='int32')*i) + if len(picked_boxes) == 0: + return None, None, None + + boxes = np.concatenate(picked_boxes, axis=0) + score = np.concatenate(picked_score, axis=0) + label = np.concatenate(picked_label, axis=0) + + return boxes, score, label + + + +def py_nms(boxes, scores, max_boxes=50, iou_thresh=0.5): + """ + Pure Python NMS baseline. 
+
+ Arguments: boxes: shape of [-1, 4], the value of '-1' means that we don't know the
+ exact number of boxes
+ scores: shape of [-1,]
+ max_boxes: representing the maximum of boxes to be selected by non_max_suppression
+ iou_thresh: representing iou_threshold for deciding to keep boxes
+ """
+ assert boxes.shape[1] == 4 and len(scores.shape) == 1
+
+ x1 = boxes[:, 0]
+ y1 = boxes[:, 1]
+ x2 = boxes[:, 2]
+ y2 = boxes[:, 3]
+
+ areas = (x2 - x1) * (y2 - y1)
+ order = scores.argsort()[::-1]
+
+ keep = []
+ while order.size > 0:
+ i = order[0]
+ keep.append(i)
+ xx1 = np.maximum(x1[i], x1[order[1:]])
+ yy1 = np.maximum(y1[i], y1[order[1:]])
+ xx2 = np.minimum(x2[i], x2[order[1:]])
+ yy2 = np.minimum(y2[i], y2[order[1:]])
+
+ w = np.maximum(0.0, xx2 - xx1 + 1)
+ h = np.maximum(0.0, yy2 - yy1 + 1)
+ inter = w * h
+ ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+ inds = np.where(ovr <= iou_thresh)[0]
+ order = order[inds + 1]
+
+ return keep[:max_boxes]
+
+
+def gpu_nms(boxes, scores, num_classes, max_boxes=50, score_thresh=0.5, nms_thresh=0.5):
+ """
+ Perform NMS on GPU using TensorFlow.
+
+ params:
+ boxes: tensor of shape [1, 10647, 4] # 10647=(13*13+26*26+52*52)*3, for input 416*416 image
+ scores: tensor of shape [1, 10647, num_classes], score=conf*prob
+ num_classes: total number of classes
+ max_boxes: integer, maximum number of predicted boxes you'd like, default is 50
+ score_thresh: if [ highest class probability score < score_threshold]
+ then get rid of the corresponding box
+ nms_thresh: real value, "intersection over union" threshold used for NMS filtering
+ """
+
+ boxes_list, label_list, score_list = [], [], []
+ max_boxes = tf.constant(max_boxes, dtype='int32')
+
+ # since we do nms for single image, then reshape it
+ boxes = tf.reshape(boxes, [-1, 4]) # '-1' means we don't know the exact number of boxes
+ score = tf.reshape(scores, [-1, num_classes])
+
+ # Step 1: Create a filtering mask based on "box_class_scores" by using "threshold". 
+ mask = tf.greater_equal(score, tf.constant(score_thresh)) + # Step 2: Do non_max_suppression for each class + for i in range(num_classes): + # Step 3: Apply the mask to scores, boxes and pick them out + filter_boxes = tf.boolean_mask(boxes, mask[:,i]) + filter_score = tf.boolean_mask(score[:,i], mask[:,i]) + nms_indices = tf.image.non_max_suppression(boxes=filter_boxes, + scores=filter_score, + max_output_size=max_boxes, + iou_threshold=nms_thresh, name='nms_indices') + label_list.append(tf.ones_like(tf.gather(filter_score, nms_indices), 'int32')*i) + boxes_list.append(tf.gather(filter_boxes, nms_indices)) + score_list.append(tf.gather(filter_score, nms_indices)) + + boxes = tf.concat(boxes_list, axis=0) + score = tf.concat(score_list, axis=0) + label = tf.concat(label_list, axis=0) + + return boxes, score, label \ No newline at end of file diff --git a/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_training_utils.py b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_training_utils.py new file mode 100644 index 0000000..47e094b --- /dev/null +++ b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_training_utils.py @@ -0,0 +1,293 @@ +from __future__ import division, print_function + +import numpy as np +import sys +import cv2 +import random +import tensorflow as tf +from tensorflow.core.framework import summary_pb2 + +from traffic_analysis.d00_utils.load_confs import load_training_parameters +from traffic_analysis.d04_modelling.transfer_learning.tensorflow_image_formatting_utils import ( + mix_up, resize_with_bbox, random_flip, random_color_distort, random_expand, random_crop_with_constraints) + +PY_VERSION = sys.version_info[0] +iter_cnt = 0 + + +def get_batch_data(batch_line, class_num, img_size, anchors, mode, multi_scale=False, + mix_up=False, letterbox_resize=True, interval=10): + ''' + generate a batch of imgs and labels + param: + batch_line: a batch of lines from train/val.txt files + class_num: num of total 
classes. + img_size: the image size to be resized to. format: [width, height]. + anchors: anchors. shape: [9, 2]. + mode: 'train' or 'val'. if set to 'train', data augmentation will be applied. + multi_scale: whether to use multi_scale training, img_size varies from [320, 320] to [640, 640] by default. + Note that it will take effect only when mode is set to 'train'. + letterbox_resize: whether to use the letterbox resize, i.e., keep the original aspect ratio in the resized img. + interval: change the scale of image every interval batches. + ''' + global iter_cnt + # multi_scale training + if multi_scale and mode == 'train': + random.seed(iter_cnt // interval) + random_img_size = [[x * 32, x * 32] for x in range(10, 20)] + img_size = random.sample(random_img_size, 1)[0] + iter_cnt += 1 + + img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch = [], [], [], [], [] + + # mix up strategy + if mix_up and mode == 'train': + mix_lines = [] + batch_line = batch_line.tolist() + for idx, line in enumerate(batch_line): + if np.random.uniform(0, 1) < 0.5: + mix_lines.append([line, random.sample(batch_line[:idx] + batch_line[idx+1:], 1)[0]]) + else: + mix_lines.append(line) + batch_line = mix_lines + + for line in batch_line: + img_idx, img, y_true_13, y_true_26, y_true_52 = parse_data(line, class_num, img_size, anchors, mode, + letterbox_resize) + + img_idx_batch.append(img_idx) + img_batch.append(img) + y_true_13_batch.append(y_true_13) + y_true_26_batch.append(y_true_26) + y_true_52_batch.append(y_true_52) + + img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch = np.asarray(img_idx_batch, np.int64), \ + np.asarray(img_batch), \ + np.asarray(y_true_13_batch), \ + np.asarray(y_true_26_batch), \ + np.asarray(y_true_52_batch) + + return img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch + + +def parse_data(line, class_num, img_size, anchors, mode, letterbox_resize): + ''' + param: + line: a line from the 
training/test txt file
+ class_num: total class nums.
+ img_size: the size of image to be resized to. [width, height] format.
+ anchors: anchors.
+ mode: 'train' or 'val'. When set to 'train', data_augmentation will be applied.
+ letterbox_resize: whether to use the letterbox resize, i.e., keep the original aspect ratio in the resized img.
+ '''
+ if not isinstance(line, list):
+ img_idx, pic_path, boxes, labels, _, _ = parse_line(line)
+ img = cv2.imread(pic_path)
+ # expand the 2nd dimension, mix up weight default to 1.
+ boxes = np.concatenate((boxes, np.full(shape=(boxes.shape[0], 1), fill_value=1., dtype=np.float32)), axis=-1)
+ else:
+ # the mix up case
+ _, pic_path1, boxes1, labels1, _, _ = parse_line(line[0])
+ img1 = cv2.imread(pic_path1)
+ img_idx, pic_path2, boxes2, labels2, _, _ = parse_line(line[1])
+ img2 = cv2.imread(pic_path2)
+
+ img, boxes = mix_up(img1, img2, boxes1, boxes2)
+ labels = np.concatenate((labels1, labels2))
+
+ if mode == 'train':
+ # random color jittering
+ # NOTE: applying color distort may lead to bad performance sometimes
+ img = random_color_distort(img)
+
+ # random expansion with prob 0.5
+ if np.random.uniform(0, 1) > 0.5:
+ img, boxes = random_expand(img, boxes, 4)
+
+ # random cropping
+ h, w, _ = img.shape
+ boxes, crop = random_crop_with_constraints(boxes, (w, h))
+ x0, y0, w, h = crop
+ img = img[y0: y0+h, x0: x0+w]
+
+ # resize with random interpolation
+ h, w, _ = img.shape
+ interp = np.random.randint(0, 5)
+ img, boxes = resize_with_bbox(img, boxes, img_size[0], img_size[1], interp=interp, letterbox=letterbox_resize)
+
+ # random horizontal flip
+ h, w, _ = img.shape
+ img, boxes = random_flip(img, boxes, px=0.5)
+ else:
+ img, boxes = resize_with_bbox(img, boxes, img_size[0], img_size[1], interp=1, letterbox=letterbox_resize)
+
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
+
+ # the input of yolo_v3 should be in range 0~1
+ img = img / 255. 
+
+ y_true_13, y_true_26, y_true_52 = process_box(boxes, labels, img_size, class_num, anchors)
+
+ return img_idx, img, y_true_13, y_true_26, y_true_52
+
+
+def process_box(boxes, labels, img_size, class_num, anchors):
+ '''
+ Generate the y_true label, i.e. the ground truth feature_maps in 3 different scales.
+ params:
+ boxes: [N, 5] shape, float32 dtype. `x_min, y_min, x_max, y_max, mixup_weight`.
+ labels: [N] shape, int64 dtype.
+ class_num: int64 num.
+ anchors: [9, 2] shape, float32 dtype.
+ '''
+ anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
+
+ # convert boxes form:
+ # shape: [N, 2]
+ # (x_center, y_center)
+ box_centers = (boxes[:, 0:2] + boxes[:, 2:4]) / 2
+ # (width, height)
+ box_sizes = boxes[:, 2:4] - boxes[:, 0:2]
+
+ # [13, 13, 3, 5+num_class+1] `5` means coords and labels. `1` means mix up weight.
+ y_true_13 = np.zeros((img_size[1] // 32, img_size[0] // 32, 3, 6 + class_num), np.float32)
+ y_true_26 = np.zeros((img_size[1] // 16, img_size[0] // 16, 3, 6 + class_num), np.float32)
+ y_true_52 = np.zeros((img_size[1] // 8, img_size[0] // 8, 3, 6 + class_num), np.float32)
+
+ # mix up weight default to 1.
+ y_true_13[..., -1] = 1.
+ y_true_26[..., -1] = 1.
+ y_true_52[..., -1] = 1. 
+ + y_true = [y_true_13, y_true_26, y_true_52] + + # [N, 1, 2] + box_sizes = np.expand_dims(box_sizes, 1) + # broadcast tricks + # [N, 1, 2] & [9, 2] ==> [N, 9, 2] + mins = np.maximum(- box_sizes / 2, - anchors / 2) + maxs = np.minimum(box_sizes / 2, anchors / 2) + # [N, 9, 2] + whs = maxs - mins + + # [N, 9] + iou = (whs[:, :, 0] * whs[:, :, 1]) / ( + box_sizes[:, :, 0] * box_sizes[:, :, 1] + anchors[:, 0] * anchors[:, 1] - whs[:, :, 0] * whs[:, :, + 1] + 1e-10) + # [N] + best_match_idx = np.argmax(iou, axis=1) + + ratio_dict = {1.: 8., 2.: 16., 3.: 32.} + for i, idx in enumerate(best_match_idx): + # idx: 0,1,2 ==> 2; 3,4,5 ==> 1; 6,7,8 ==> 0 + feature_map_group = 2 - idx // 3 + # scale ratio: 0,1,2 ==> 8; 3,4,5 ==> 16; 6,7,8 ==> 32 + ratio = ratio_dict[np.ceil((idx + 1) / 3.)] + x = int(np.floor(box_centers[i, 0] / ratio)) + y = int(np.floor(box_centers[i, 1] / ratio)) + k = anchors_mask[feature_map_group].index(idx) + c = labels[i] + # print(feature_map_group, '|', y,x,k,c) + + y_true[feature_map_group][y, x, k, :2] = box_centers[i] + y_true[feature_map_group][y, x, k, 2:4] = box_sizes[i] + y_true[feature_map_group][y, x, k, 4] = 1. + y_true[feature_map_group][y, x, k, 5 + c] = 1. + y_true[feature_map_group][y, x, k, -1] = boxes[i, -1] + + return y_true_13, y_true_26, y_true_52 + + +def parse_line(line): + ''' + Given a line from the training/test txt file, return parsed info. + line format: line_index, img_path, img_width, img_height, [box_info_1 (5 number)], ... + return: + line_idx: int64 + pic_path: string. + boxes: shape [N, 4], N is the ground truth count, elements in the second + dimension are [x_min, y_min, x_max, y_max] + labels: shape [N]. class index. + img_width: int. + img_height: int + ''' + if 'str' not in str(type(line)): + line = line.decode() + s = line.strip().split(' ') + assert len(s) > 8, 'Annotation error! Please check your file. Make sure there is an object in each image.' 
+ line_idx = int(s[0]) + pic_path = s[1] + img_width = int(s[2]) + img_height = int(s[3]) + s = s[4:] + assert len(s) % 5 == 0, 'Annotation error! Please check your file. Maybe partially missing some coordinates?' + box_cnt = len(s) // 5 + boxes = [] + labels = [] + for i in range(box_cnt): + label, x_min, y_min, x_max, y_max = int(s[i * 5]), float(s[i * 5 + 1]), float(s[i * 5 + 2]), float( + s[i * 5 + 3]), float(s[i * 5 + 4]) + boxes.append([x_min, y_min, x_max, y_max]) + labels.append(label) + boxes = np.asarray(boxes, np.float32) + labels = np.asarray(labels, np.int64) + return line_idx, pic_path, boxes, labels, img_width, img_height + + +def make_summary(name, val): + return summary_pb2.Summary(value=[summary_pb2.Summary.Value(tag=name, simple_value=val)]) + + +def config_learning_rate(lr_decay_freq, train_batch_num, global_step): + train_params = load_training_parameters() + if train_params['lr_type'] == 'exponential': + lr_tmp = tf.train.exponential_decay(train_params['learning_rate_init'], global_step, lr_decay_freq, + train_params['lr_decay_factor'], staircase=True, + name='exponential_learning_rate') + return tf.maximum(lr_tmp, train_params['lr_lower_bound']) + elif train_params['lr_type'] == 'piecewise': + train_params['pw_boundaries'] = [float(i) * train_batch_num + + train_params['global_step'] for i in train_params['pw_boundaries']] + return tf.train.piecewise_constant(global_step, boundaries=[float(i) for i in train_params['pw_boundaries']], + values=train_params['pw_values'], + name='piecewise_learning_rate') + else: + raise ValueError('Unsupported learning rate type!') + + +def config_optimizer(optimizer_name, learning_rate, decay=0.9, momentum=0.9): + if optimizer_name == 'momentum': + return tf.train.MomentumOptimizer(learning_rate, momentum=momentum) + elif optimizer_name == 'rmsprop': + return tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=momentum) + elif optimizer_name == 'adam': + return tf.train.AdamOptimizer(learning_rate) + 
elif optimizer_name == 'sgd': + return tf.train.GradientDescentOptimizer(learning_rate) + else: + raise ValueError('Unsupported optimizer type!') + + +class AverageMeter(object): + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.average = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.average = self.sum / float(self.count) + + +def shuffle_and_overwrite(file_name): + content = open(file_name, 'r').readlines() + random.shuffle(content) + with open(file_name, 'w') as f: + for line in content: + f.write(line) diff --git a/src/traffic_analysis/d04_modelling/transfer_learning/train_tensorflow_model.py b/src/traffic_analysis/d04_modelling/transfer_learning/train_tensorflow_model.py new file mode 100644 index 0000000..f1a6073 --- /dev/null +++ b/src/traffic_analysis/d04_modelling/transfer_learning/train_tensorflow_model.py @@ -0,0 +1,289 @@ +# coding: utf-8 + +from __future__ import division, print_function +import os +import math +import tensorflow as tf +import numpy as np +import logging +from tqdm import trange + +from traffic_analysis.d04_modelling.transfer_learning.tensorflow_training_utils import get_batch_data, \ + make_summary, config_learning_rate, config_optimizer, AverageMeter +from traffic_analysis.d04_modelling.transfer_learning.tensorflow_evaluation_utils import evaluate_on_gpu, \ + get_preds_gpu, voc_eval, parse_gt_rec +from traffic_analysis.d04_modelling.transfer_learning.tensorflow_processing_utils import gpu_nms +from traffic_analysis.d04_modelling.transfer_learning.tensorflow_model_loader import YoloV3 +from traffic_analysis.d04_modelling.transfer_learning.convert_darknet_to_tensorflow import parse_anchors +from traffic_analysis.d04_modelling.transfer_learning.tensorflow_detection_utils import read_class_names + + +def transfer_learn(paths, params, train_params, train_file, test_file, selected_labels): + """ trains last three layers of 
yolov3 network on custom dataset + """ + + transfer_learn_model_dir = os.path.join(paths['local_detection_model'], train_params['trained_model_name']) + if not os.path.exists(transfer_learn_model_dir): + os.makedirs(transfer_learn_model_dir) + + truth_dir_path = paths['temp_annotation'] + class_name_path = os.path.join(paths['local_detection_model'], 'yolov3', 'coco.names') # CHANGE THIS + classes = read_class_names(class_name_path) + + selected_label_idxs = [] + for idx, label in classes.items(): + if label in selected_labels: + selected_label_idxs.append(idx) + anchors = parse_anchors(paths) + number_classes = len(classes) + + train_data_path = os.path.join(truth_dir_path, train_file) + test_data_path = os.path.join(truth_dir_path, test_file) + train_img_cnt = len(open(train_data_path, 'r').readlines()) + val_img_cnt = len(open(test_data_path, 'r').readlines()) + train_batch_num = int(math.ceil(float(train_img_cnt) / train_params['num_batches'])) + + lr_decay_freq = int(train_batch_num * train_params['lr_decay_epoch']) + + logging_file_path = os.path.join(truth_dir_path, 'progress.log') + logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s', + datefmt='%a, %d %b %Y %H:%M:%S', filename=logging_file_path, filemode='w') + + is_training = tf.placeholder(tf.bool, name="phase_train") + handle_flag = tf.placeholder(tf.string, [], name='iterator_handle_flag') + pred_boxes_flag = tf.placeholder(tf.float32, [1, None, None]) + pred_scores_flag = tf.placeholder(tf.float32, [1, None, None]) + gpu_nms_op = gpu_nms(pred_boxes_flag, pred_scores_flag, number_classes, train_params['nms_topk'], + train_params['score_threshold'], train_params['nms_threshold']) + + train_dataset = tf.data.TextLineDataset(train_data_path) + train_dataset = train_dataset.shuffle(train_img_cnt) + train_dataset = train_dataset.batch(train_params['num_batches']) + train_dataset = train_dataset.map( + lambda x: tf.py_func(get_batch_data, + inp=[x, number_classes, [416, 
416], anchors, 'train', True, True, True], + Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]), + num_parallel_calls=train_params['num_threads']) + train_dataset = train_dataset.prefetch(train_params['prefetech_buffer']) + + test_dataset = tf.data.TextLineDataset(test_data_path) + test_dataset = test_dataset.batch(1) + test_dataset = test_dataset.map( + lambda x: tf.py_func(get_batch_data, + inp=[x, number_classes, [416, 416], anchors, 'val', False, False, True], + Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]), + num_parallel_calls=train_params['num_threads']) + test_dataset.prefetch(train_params['prefetech_buffer']) + + iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes) + train_init_op = iterator.make_initializer(train_dataset) + val_init_op = iterator.make_initializer(test_dataset) + + # get an element from the chosen dataset iterator + image_ids, image, y_true_13, y_true_26, y_true_52 = iterator.get_next() + y_true = [y_true_13, y_true_26, y_true_52] + + # tf.data pipeline will lose the data `static` shape, so we need to set it manually + image_ids.set_shape([None]) + image.set_shape([None, None, None, 3]) + for y in y_true: + y.set_shape([None, None, None, None, None]) + + # define model + yolo_model = YoloV3(number_classes, anchors, use_label_smooth=True, use_focal_loss=True, + batch_norm_decay=train_params['batch_norm_decay'], weight_decay=train_params['weight_decay'], + use_static_shape=False) + + with tf.variable_scope('YoloV3'): + pred_feature_maps = yolo_model.forward(image, is_training=is_training) + loss = yolo_model.compute_loss(pred_feature_maps, y_true) + y_pred = yolo_model.predict(pred_feature_maps) + + l2_loss = tf.losses.get_regularization_loss() + + # setting restore parts and vars to update + saver_to_restore = tf.train.Saver( + var_list=tf.contrib.framework.get_variables_to_restore( + include=None, + exclude=['YoloV3/yolov3_head/Conv_14', 
'YoloV3/yolov3_head/Conv_6', 'YoloV3/yolov3_head/Conv_22'])) + update_vars = tf.contrib.framework.get_variables_to_restore(include=['YoloV3/yolov3_head']) + + tf.summary.scalar('train_batch_statistics/total_loss', loss[0]) + tf.summary.scalar('train_batch_statistics/loss_xy', loss[1]) + tf.summary.scalar('train_batch_statistics/loss_wh', loss[2]) + tf.summary.scalar('train_batch_statistics/loss_conf', loss[3]) + tf.summary.scalar('train_batch_statistics/loss_class', loss[4]) + tf.summary.scalar('train_batch_statistics/loss_l2', l2_loss) + tf.summary.scalar('train_batch_statistics/loss_ratio', l2_loss / loss[0]) + + global_step = tf.Variable(float(train_params['global_step']), + trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES]) + + learning_rate = tf.cond(tf.less(global_step, train_batch_num * train_params['warm_up_epoch']), + lambda: train_params['learning_rate_init'] * + global_step / (train_batch_num * train_params['warm_up_epoch']), + lambda: config_learning_rate(lr_decay_freq=lr_decay_freq, train_batch_num=train_batch_num, + global_step=global_step - + train_batch_num * train_params['warm_up_epoch'])) + tf.summary.scalar('learning_rate', learning_rate) + + if not train_params['save_optimizer']: + saver_to_save = tf.train.Saver() + saver_best = tf.train.Saver() + + optimizer = config_optimizer(train_params['optimizer_name'], learning_rate) + + # set dependencies for BN ops + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(update_ops): + # train_op = optimizer.minimize(loss[0] + l2_loss, var_list=update_vars, global_step=global_step) + # apply gradient clip to avoid gradient exploding + gvs = optimizer.compute_gradients(loss[0] + l2_loss, var_list=update_vars) + clip_grad_var = [gv if gv[0] is None else [ + tf.clip_by_norm(gv[0], 100.), gv[1]] for gv in gvs] + train_op = optimizer.apply_gradients(clip_grad_var, global_step=global_step) + + if train_params['save_optimizer']: + print('Saving optimizer parameters to 
checkpoint! Remember to restore global_step in fine-tuning afterwards.') + saver_to_save = tf.train.Saver() + saver_best = tf.train.Saver() + + tensorboard_log_path = os.path.join(truth_dir_path, 'tensorboard_logs') + yolov3_tensorflow_path = os.path.join(paths['local_detection_model'], params['detection_model'], 'yolov3.ckpt') + with tf.Session() as sess: + sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()]) + saver_to_restore.restore(sess, yolov3_tensorflow_path) + merged = tf.summary.merge_all() + writer = tf.summary.FileWriter(tensorboard_log_path, sess.graph) + + print('\n----------- start to train -----------\n') + + best_mAP = -np.Inf + + for epoch in range(train_params['total_epochs']): + + sess.run(train_init_op) + loss_total, loss_xy, loss_wh, loss_conf, loss_class = AverageMeter(), AverageMeter(), AverageMeter(), \ + AverageMeter(), AverageMeter() + + for i in trange(train_batch_num): + _, summary, __y_pred, __y_true, __loss, __global_step, __lr = sess.run( + [train_op, merged, y_pred, y_true, loss, global_step, learning_rate], + feed_dict={is_training: True}) + + writer.add_summary(summary, global_step=__global_step) + + loss_total.update(__loss[0], len(__y_pred[0])) + loss_xy.update(__loss[1], len(__y_pred[0])) + loss_wh.update(__loss[2], len(__y_pred[0])) + loss_conf.update(__loss[3], len(__y_pred[0])) + loss_class.update(__loss[4], len(__y_pred[0])) + + if __global_step % train_params['train_evaluation_step'] == 0 and __global_step > 0: + recall, precision = evaluate_on_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, + __y_pred, __y_true, number_classes, + train_params['nms_threshold']) + + info = "Epoch: {}, global_step: {} | loss: total: {:.2f}, xy: {:.2f}, " \ + "wh: {:.2f}, conf: {:.2f}, class: {:.2f} | ".format(epoch, int(__global_step), + loss_total.average, loss_xy.average, + loss_wh.average, loss_conf.average, + loss_class.average) + info += 'Last batch: rec: {:.3f}, prec: {:.3f} | lr: 
{:.5g}'.format(recall, precision, __lr) + print(info) + logging.info(info) + + writer.add_summary(make_summary('evaluation/train_batch_recall', recall), + global_step=__global_step) + writer.add_summary(make_summary('evaluation/train_batch_precision', precision), + global_step=__global_step) + + if np.isnan(loss_total.average): + print('****' * 10) + raise ArithmeticError( + 'Gradient exploded! Please train again and you may need modify some parameters.') + + # NOTE: this is just demo. You can set the conditions when to save the weights. + if epoch % train_params['save_epoch'] == 0 and epoch > 0: + if loss_total.average <= 2.: + saver_to_save.save(sess, + os.path.join(train_params['trained_model_name'], + 'model-epoch_{}_step_{}_loss_{:.4f}_lr_{:.5g}'.format( + epoch, int(__global_step), loss_total.average, __lr))) + + # switch to validation dataset for evaluation + if epoch % train_params['val_evaluation_epoch'] == 0 and epoch >= train_params['warm_up_epoch']: + sess.run(val_init_op) + + val_loss_total, val_loss_xy, val_loss_wh, val_loss_conf, val_loss_class = \ + AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter() + + val_preds = [] + + for j in trange(val_img_cnt): + __image_ids, __y_pred, __loss = sess.run([image_ids, y_pred, loss], + feed_dict={is_training: False}) + pred_content = get_preds_gpu(sess, gpu_nms_op, pred_boxes_flag, + pred_scores_flag, __image_ids, __y_pred) + val_preds.extend(pred_content) + val_loss_total.update(__loss[0]) + val_loss_xy.update(__loss[1]) + val_loss_wh.update(__loss[2]) + val_loss_conf.update(__loss[3]) + val_loss_class.update(__loss[4]) + + # calc mAP + rec_total, prec_total, ap_total = AverageMeter(), AverageMeter(), AverageMeter() + gt_dict = parse_gt_rec(test_data_path, [416, 416], letterbox_resize=True) + + info = '======> Epoch: {}, global_step: {}, lr: {:.6g} <======\n'.format(epoch, __global_step, __lr) + + for class_idx in range(number_classes): + if class_idx in selected_label_idxs: + npos, 
nd, rec, prec, ap = voc_eval(gt_dict, val_preds, class_idx, + iou_thres=train_params['eval_threshold'], + use_07_metric=True) + info += 'EVAL: Class {}: Recall: {:.4f}, Precision: {:.4f}, AP: {:.4f}\n'.format(class_idx, + rec, prec, ap) + + if math.isnan(rec) or math.isnan(prec) or math.isnan(ap): + pass + else: + rec_total.update(rec, npos) + prec_total.update(prec, nd) + ap_total.update(ap, 1) + + mAP = ap_total.average + info += 'EVAL: Recall: {:.4f}, Precison: {:.4f}, mAP: {:.4f}\n'.format( + rec_total.average, prec_total.average, mAP) + info += 'EVAL: loss: total: {:.2f}, xy: {:.2f}, wh: {:.2f}, conf: {:.2f}, class: {:.2f}\n'.format( + val_loss_total.average, val_loss_xy.average, val_loss_wh.average, + val_loss_conf.average, val_loss_class.average) + print(info) + logging.info(info) + + if mAP > best_mAP: + best_mAP = mAP + saver_best.save(sess, os.path.join( + transfer_learn_model_dir, + 'best_model_Epoch_{}_step_{}_mAP_{:.4f}_loss_{:.4f}_lr_{:.7g}'.format( + epoch, int(__global_step), best_mAP, val_loss_total.average, __lr))) + + writer.add_summary(make_summary('evaluation/val_mAP', mAP), + global_step=epoch) + writer.add_summary(make_summary('evaluation/val_recall', rec_total.average), + global_step=epoch) + writer.add_summary(make_summary('evaluation/val_precision', prec_total.average), + global_step=epoch) + writer.add_summary(make_summary('validation_statistics/total_loss', val_loss_total.average), + global_step=epoch) + writer.add_summary(make_summary('validation_statistics/loss_xy', val_loss_xy.average), + global_step=epoch) + writer.add_summary(make_summary('validation_statistics/loss_wh', val_loss_wh.average), + global_step=epoch) + writer.add_summary(make_summary('validation_statistics/loss_conf', val_loss_conf.average), + global_step=epoch) + writer.add_summary(make_summary('validation_statistics/loss_class', val_loss_class.average), + global_step=epoch) + + return diff --git 
import os
import sys
import xml.etree.ElementTree as ET
from enum import Enum

import cv2
import numpy as np
from PIL import Image

from traffic_analysis.d00_utils.load_confs import load_paths, load_credentials
from traffic_analysis.d00_utils.data_loader_s3 import DataLoaderS3
from traffic_analysis.d00_utils.data_retrieval import delete_and_recreate_dir, mp4_to_npy
from traffic_analysis.d02_ref.ref_utils import get_s3_video_path_from_xml_name
from traffic_analysis.d00_utils.get_project_directory import get_project_directory
from traffic_analysis.d04_modelling.transfer_learning.tensorflow_detection_utils import read_class_names


class TransferDataset(Enum):
    """Annotated datasets available for transfer-learning the detector."""
    detrac = 1
    cvat = 2


class TrainingDataLoader(object):
    """Downloads annotated images and labels from S3 for detector training.

    Each label is a single string of the form
    ``"<frame_idx> <image_path> <width> <height> [<class_idx> <x_min> <y_min> <x_max> <y_max>]..."``
    (one per annotated frame), matching the format consumed by the YOLOv3
    training pipeline.
    """

    def __init__(self, datasets, creds, paths):
        """
        Args:
            datasets: iterable of TransferDataset members to load.
            creds: credentials dict, keyed by paths['s3_creds'].
            paths: project paths dict (S3 prefixes and local temp dirs).
        """
        self.datasets = datasets
        self.creds = creds
        self.paths = paths
        # dispatch table: dataset -> loader method
        self.load_mapping = {TransferDataset.detrac: self.load_detrac_data,
                             TransferDataset.cvat: self.load_cvat_data}

        self.data_loader_s3 = DataLoaderS3(s3_credentials=creds[paths['s3_creds']],
                                           bucket_name=paths['bucket_name'])

    def get_train_and_test(self, train_fraction):
        """Split the loaded (image, label) pairs into train and test sets.

        Note: the split is positional, not shuffled, so the ordering of the
        underlying S3 listing determines set membership.

        Returns:
            (x_train, y_train, x_test, y_test)
        """
        x, y = self.load_data_from_s3()

        split = int(len(x) * train_fraction)
        x_train, y_train = x[:split], y[:split]
        x_test, y_test = x[split:], y[split:]

        return x_train, y_train, x_test, y_test

    def load_data_from_s3(self):
        """Load every configured dataset, returning (images, labels)."""
        self.clear_temp_folders()

        xs = []
        ys = []
        for dataset in self.datasets:
            x, y = self.load_mapping[dataset]()
            assert len(x) == len(y), ("Mismatch in number of input and output pairs! "
                                      "(Dataset: " + dataset.name + ")")
            xs += x
            ys += y

        return xs, ys

    def clear_temp_folders(self):
        """Wipe and recreate the local temp dirs used for downloads."""
        delete_and_recreate_dir(self.paths['temp_annotation'])
        delete_and_recreate_dir(self.paths['temp_raw_images'])
        delete_and_recreate_dir(self.paths['temp_raw_video'])

    def _vehicle_type_to_idx(self, vehicle_type, classes):
        """Map a vehicle-type string to its class index, or None if unknown.

        Vans map to class index 2 (car) because the model does not
        distinguish them. Shared by the DETRAC and CVAT parsers, which
        previously duplicated this lookup inline.
        """
        if vehicle_type == 'van':
            return 2  # say vans are cars because we don't distinguish
        for tick in range(len(classes)):
            if classes[tick] == vehicle_type:
                return tick
        return None

    def load_detrac_data(self):
        """Load DETRAC annotations and the corresponding frame images."""
        print('Parsing detrac xmls...')
        y = []
        xml_files = self.data_loader_s3.list_objects(prefix=self.paths['s3_detrac_annotations'])
        # NOTE(review): cap kept from the original code — only the first 11
        # xml files are parsed; looks like a leftover debug limit. TODO confirm.
        count = 0
        for xml_file in xml_files:
            result = self.parse_detrac_xml_file(xml_file)
            if result:
                y += result

            if count == 10:
                break
            count += 1

        print('Loading detrac images...')
        x = []
        for labels in y:
            tokens = labels.split(' ')
            image_num = tokens[0].zfill(5)
            folder = tokens[1].split('/')[-1][:9]  # e.g. 'MVI_20011'

            file_to_download = (self.paths['s3_detrac_images'] +
                                folder + '/' +
                                'img' + image_num + '.jpg')

            download_file_to = (self.paths['temp_raw_images'] +
                                folder + '_' +
                                image_num + '.jpg')

            self.data_loader_s3.download_file(
                path_of_file_to_download=file_to_download,
                path_to_download_file_to=download_file_to)

            img = Image.open(download_file_to)
            img.load()
            x.append(np.asarray(img, dtype="int32"))

        return x, y

    def parse_detrac_xml_file(self, xml_file):
        """Parse one DETRAC annotation xml into per-frame label strings.

        Returns:
            list of label strings, or None when the file could not be
            downloaded or contains at most one annotated frame.
        """
        project_dir = get_project_directory()
        image_dir = os.path.join(project_dir, self.paths['temp_raw_images'])

        xml_file_name = xml_file.split('/')[-1]
        xml_path = self.paths['temp_annotation'] + xml_file_name

        class_names_path = os.path.join(self.paths['local_detection_model'], 'yolov3', 'coco.names')
        classes = read_class_names(class_names_path)

        try:
            self.data_loader_s3.download_file(path_of_file_to_download=xml_file,
                                              path_to_download_file_to=xml_path)
        except Exception:
            # fix: previously a bare `except:` swallowed the error and
            # execution continued into ET.parse() on a missing file
            print("Could not download file " + xml_file)
            return None

        root = ET.parse(xml_path).getroot()

        # label format per frame:
        #   image_index image_path image_width image_height
        #   [label_index x_min y_min x_max y_max]...

        # DETRAC frames are a fixed 960x540 resolution
        im_width = 960
        im_height = 540

        results = []
        for track in root.iter('frame'):
            frame_str = str(track.attrib['num']).zfill(5)
            im_path = os.path.join(image_dir, xml_file_name[:-4] + '_' + frame_str + '.jpg')
            result = str(track.attrib['num']) + \
                ' ' + str(im_path) + \
                ' ' + str(im_width) + \
                ' ' + str(im_height)

            for frame_obj in track.iter('target'):
                vehicle_type = frame_obj.find('attribute').attrib['vehicle_type']
                vehicle_type_idx = self._vehicle_type_to_idx(vehicle_type, classes)
                if vehicle_type_idx is None:
                    # fix: previously an unknown type silently reused the
                    # stale index from the previous object (or raised
                    # UnboundLocalError on the first one)
                    print("Skipping unknown vehicle type: " + str(vehicle_type))
                    continue

                box = frame_obj.find('box').attrib
                left = float(box['left'])
                top = float(box['top'])
                width = float(box['width'])
                height = float(box['height'])

                # DETRAC boxes are (left, top, width, height); convert to corners
                x_min = left
                y_min = top
                x_max = left + width
                y_max = top + height

                result += ' ' + str(vehicle_type_idx) + \
                    ' ' + str(x_min) + \
                    ' ' + str(y_min) + \
                    ' ' + str(x_max) + \
                    ' ' + str(y_max)

            results.append(result)

        # require at least two annotated frames, as in the original code
        if len(results) > 1:
            return results
        return None

    def load_cvat_data(self):
        """Load CVAT-annotated videos: parse labels, then extract frames."""
        print('Parsing cvat xmls...')
        y = []
        xml_files = self.data_loader_s3.list_objects(prefix=self.paths['s3_cvat_training_annotations'])
        vid_names = []
        for xml_file in xml_files:
            result, vid_name = self.parse_cvat_xml_file(xml_file)
            if result:
                y += result
                vid_names.append(vid_name)

        print('Loading cvat videos...')

        # Build the set of videos needed and the frame paths referenced by labels
        video_set = set()
        image_paths_in_y = set()  # set: O(1) membership per extracted frame
        for labels in y:
            label_im_path = labels.split(' ')[1]
            video_set.add(label_im_path.split('/')[-1][:-10])  # strip '_NNNNN.jpg'
            image_paths_in_y.add(label_im_path)

        x = []
        for vid_id in video_set:
            video, video_path = self.get_cvat_video(vid_id)
            if video_path is None:
                # fix: previously a missing video produced a bare None return
                # and crashed on tuple unpacking / path handling
                print("Could not fetch video for " + vid_id)
                continue
            video_name = video_path.split('/')[-1][:-4]
            vidcap = cv2.VideoCapture(video_path)

            count = 0
            while vidcap.isOpened():
                success, image = vidcap.read()
                if not success:
                    break
                image_num = str(count).zfill(5)
                im_path = os.path.join(self.paths['temp_raw_images'],
                                       video_name + '_' + image_num + '.jpg')
                if im_path in image_paths_in_y:
                    x.append(np.asarray(image, dtype="int32"))
                    cv2.imwrite(im_path, image)
                count += 1
            vidcap.release()
        return x, y

    def get_cvat_video(self, xml_file_name):
        """Download the video matching a CVAT xml name.

        Returns:
            (frames_array, local_video_path), or (None, None) when no
            matching S3 video exists.
        """
        video_path = get_s3_video_path_from_xml_name(xml_file_name=xml_file_name,
                                                     s3_creds=self.creds[self.paths['s3_creds']],
                                                     paths=self.paths)
        if not video_path:
            # fix: previously returned bare None, which broke tuple
            # unpacking at the call site in load_cvat_data
            return None, None

        download_file_to = os.path.join(self.paths['temp_raw_video'], xml_file_name + '.mp4')
        self.data_loader_s3.download_file(path_of_file_to_download=video_path,
                                          path_to_download_file_to=download_file_to)
        return mp4_to_npy(download_file_to), download_file_to

    def parse_cvat_xml_file(self, xml_file):
        """Parse one CVAT annotation xml into per-frame label strings.

        Returns:
            (results, vid_name), or (None, None) on download failure or when
            at most one frame is annotated.
        """
        path = self.paths['temp_annotation'] + xml_file.split('/')[-1]

        try:
            self.data_loader_s3.download_file(path_of_file_to_download=xml_file,
                                              path_to_download_file_to=path)
        except Exception:
            # fix: previously a bare `except:` swallowed the error and
            # execution continued into ET.parse() on a missing file
            print("Could not download file " + xml_file)
            return None, None

        root = ET.parse(path).getroot()
        vid_name = path.split('/')[-1][:-4]
        im_dir = self.paths['temp_raw_images']

        # CVAT traffic-camera frames are a fixed 352x288 resolution
        im_width = 352
        im_height = 288

        class_names_path = os.path.join(self.paths['local_detection_model'], 'yolov3', 'coco.names')
        classes = read_class_names(class_names_path)

        frame_dict = {}
        for track in root.iter('track'):
            for frame in track.iter('box'):
                frame_num = frame.attrib['frame']

                if frame_num not in frame_dict:
                    frame_name = str(frame_num).zfill(5)
                    im_path = os.path.join(im_dir, vid_name + '_' + frame_name + '.jpg')
                    frame_dict[frame_num] = str(frame_num) + ' ' + \
                        str(im_path) + ' ' + \
                        str(im_width) + ' ' + \
                        str(im_height)

                if track.attrib['label'] != 'vehicle':
                    continue

                # NOTE(review): assumes the third <attribute> element holds the
                # vehicle type — TODO confirm against the CVAT export schema
                vehicle_type = frame.findall('attribute')[2].text
                vehicle_type_idx = self._vehicle_type_to_idx(vehicle_type, classes)
                if vehicle_type_idx is None:
                    # fix: previously an unknown type silently reused the
                    # stale index from the previous box (or raised
                    # UnboundLocalError on the first one)
                    print("Skipping unknown vehicle type: " + str(vehicle_type))
                    continue

                x_min = float(frame.attrib['xtl'])
                y_min = float(frame.attrib['ytl'])
                x_max = float(frame.attrib['xbr'])
                y_max = float(frame.attrib['ybr'])

                frame_dict[frame_num] += ' ' + str(vehicle_type_idx) + \
                    ' ' + str(x_min) + \
                    ' ' + str(y_min) + \
                    ' ' + str(x_max) + \
                    ' ' + str(y_max)

        results = list(frame_dict.values())

        # require at least two annotated frames, as in the original code
        if len(results) > 1:
            return results, vid_name
        # fix: previously returned bare None, which broke tuple unpacking
        # at the call site in load_cvat_data
        return None, None