diff --git a/conf/base/parameters.yml b/conf/base/parameters.yml index acca761..fde8db6 100644 --- a/conf/base/parameters.yml +++ b/conf/base/parameters.yml @@ -43,13 +43,13 @@ data_renaming: # TODO: remove later when renaming finished modelling: # obj detection - detection_model: "yolov3-tiny" + detection_model: "yolov3_tf" detection_implementation: "cvlib" detection_iou_threshold: 0.05 detection_confidence_threshold: 0.2 - # TODO: change nms threshold to iou threshold + # TODO: change nms threshold to iou threshold detection_nms_threshold: 0.2 - + # tracking selected_labels: ["car", "truck", "bus", "motorbike"] opencv_tracker_type: "csrt" diff --git a/conf/base/paths.yml b/conf/base/paths.yml index 76fdec9..00a6b11 100644 --- a/conf/base/paths.yml +++ b/conf/base/paths.yml @@ -7,15 +7,21 @@ s3_paths: s3_camera_details: "ref/camera_details/camera_details.json" s3_frame_level: "frame_level/" # TODO DELETE THIS - s3_profile: "dssg" # TODO: change this for user? + s3_profile: "dssg" # TODO: change this for user? s3_creds: "dev_s3" # TODO: CHANGE TO JUST S3 s3_detection_model: "ref/model_conf/" + s3_cvat_annotations: "ref/annotations/cvat/" + s3_detrac_annotations: "ref/annotations/detrac/" + s3_detrac_images: "raw/images/detrac/" + s3_cvat_training_annotations: "ref/annotations/cvat_train/" local_paths: temp_video: "data/temp/videos/" + temp_raw_images: "data/temp/raw_images/" temp_raw_video: "data/temp/raw_videos/" temp_frame_level: "data/temp/frame_level/" temp_video_level: "data/temp/video_level/" + temp_annotation: "data/temp/annotation/" temp_setup: "data/temp/setup/" video_names: "data/ref/video_names/" diff --git a/conf/base/training_parameters.yml b/conf/base/training_parameters.yml new file mode 100644 index 0000000..89b02ac --- /dev/null +++ b/conf/base/training_parameters.yml @@ -0,0 +1,34 @@ +training: + num_batches : 10 + letterbox_resize : True # Whether to use letterbox resize, i.e., keep the original aspect ratio in the resized img. 
+ total_epochs : 1000 + train_evaluation_step : 100 # Evaluate on the training batch after some steps. + val_evaluation_epoch : 2 # Evaluate on the validation dataset after some epochs. Set to None to evaluate every epoch. + save_epoch : 10 # Save the model after some epochs. + batch_norm_decay : 0.99 # decay in bn ops + weight_decay : 0.0005 # l2 weight decay + global_step : 0 # used when resuming training + warm_up_epoch : 3 # set to larger value if gradient explodes + num_threads : 10 # Number of threads for image processing used in tf.data pipeline. + prefetech_buffer : 5 # Prefetch buffer used in tf.data pipeline. + trained_model_name : 'yolov3_traffic' + +learning: + optimizer_name : 'momentum' # Chosen from [sgd, momentum, adam, rmsprop] + save_optimizer : True # Whether to save the optimizer parameters into the checkpoint file. + learning_rate_init : 0.0001 + lr_type : 'piecewise' # Chosen from [exponential, piecewise] + lr_decay_epoch : 5 # Epochs after which learning rate decays. Int or float. Used when chosen `exponential` lr_type. + lr_decay_factor : 0.96 # The learning rate decay factor. Used when chosen `exponential` lr_type. + lr_lower_bound : 0.000001 # The minimum learning rate. + pw_boundaries : [30, 50] # epoch based boundaries + pw_values : [0.0001, 0.00003, 0.00001] # FIRST VALUE MUST BE LEARNING_RATE_INIT + +validation: + # nms + nms_threshold : 0.45 # iou threshold in nms operation + score_threshold : 0.01 # threshold of the prob of the classes in nms operation, i.e. score = pred_confs * pred_probs. 
+ nms_topk : 150 # keep at most nms_topk outputs after nms + + # mAP eval + eval_threshold : 0.5 # the iou threshold applied in mAP evaluation diff --git a/data/frame_level/frame001out.jpg b/data/frame_level/frame001out.jpg new file mode 100644 index 0000000..ccaed0d Binary files /dev/null and b/data/frame_level/frame001out.jpg differ diff --git a/requirements.txt b/requirements.txt index 4c35289..e45dc23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,3 +20,4 @@ sqlalchemy pandas==0.24.2 python-crontab>=2.3.8, <3.0 seaborn>=0.9 +tqdm==4.33.0 diff --git a/src/run_transfer_learning.py b/src/run_transfer_learning.py new file mode 100644 index 0000000..68d66a3 --- /dev/null +++ b/src/run_transfer_learning.py @@ -0,0 +1,33 @@ +from traffic_analysis.d00_utils.load_confs import load_paths, load_credentials, \ + load_parameters, load_training_parameters +from traffic_analysis.d04_modelling.transfer_learning.training_data_loader import TrainingDataLoader, TransferDataset +from traffic_analysis.d04_modelling.transfer_learning.train_tensorflow_model import transfer_learn + +paths = load_paths() +creds = load_credentials() +params = load_parameters() +train_params = load_training_parameters() + +training_data_loader = TrainingDataLoader(datasets=[TransferDataset.cvat, TransferDataset.detrac], + creds=creds, + paths=paths) + +fraction_for_training = 0.8 +x_train, y_train, x_test, y_test = training_data_loader.get_train_and_test(fraction_for_training) + + +saved_text_files_dir = paths['temp_annotation'] +with open(saved_text_files_dir + 'train.txt', 'w') as f: + for item in y_train: + f.write("%s\n" % item) + +with open(saved_text_files_dir + 'test.txt', 'w') as f: + for item in y_test: + f.write("%s\n" % item) + +transfer_learn(paths=paths, + params=params, + train_params=train_params, + train_file='train.txt', + test_file='test.txt', + selected_labels=params['selected_labels']) diff --git a/src/traffic_analysis/d00_utils/load_confs.py 
b/src/traffic_analysis/d00_utils/load_confs.py index 6eff1df..3a64532 100644 --- a/src/traffic_analysis/d00_utils/load_confs.py +++ b/src/traffic_analysis/d00_utils/load_confs.py @@ -30,6 +30,12 @@ def load_app_parameters(): return {**params['visualization']} +def load_training_parameters(): + with open(project_dir + '/conf/base/training_parameters.yml') as f: + params = yaml.safe_load(f) + return collapse_dict_hierarchy(params) + + def load_credentials(): filepath = os.sep.join( diff --git a/src/traffic_analysis/d02_ref/ref_utils.py b/src/traffic_analysis/d02_ref/ref_utils.py index f1a4726..5963baa 100644 --- a/src/traffic_analysis/d02_ref/ref_utils.py +++ b/src/traffic_analysis/d02_ref/ref_utils.py @@ -5,6 +5,8 @@ import subprocess from subprocess import Popen, PIPE +from traffic_analysis.d00_utils.data_loader_s3 import DataLoaderS3 + def upload_json_to_s3(paths: dict, save_name: str, @@ -66,4 +68,48 @@ def get_names_of_folder_content_from_s3(bucket_name, prefix, s3_profile): end = Time.time() elapsed_time = end-start + assert ((len(files) == 0) or (files[0] != '')), 'set your aws credentials' + return elapsed_time, files + + +def get_s3_video_path_from_xml_name(xml_file_name, s3_creds, paths): + + # Supports old and new naming conventions + vals = xml_file_name.split('_') + data_loader_s3 = DataLoaderS3(s3_credentials=s3_creds, + bucket_name=paths['bucket_name']) + + if (len(vals) >= 4): + date = vals[1] + file_names = [xml_file_name.split('_')[1:][0].replace('-', '') + '-' + + xml_file_name.split('_')[1:][1].replace('-', '')[:6] + '_' + + xml_file_name.split('_')[1:][2], + xml_file_name.split('_')[1:][0] + ' ' + + xml_file_name.split('_')[1:][1].replace('-', ':') + '_' + + xml_file_name.split('_')[1:][2]] + else: + date = vals[0] + file_names = [xml_file_name.split('_')[0].replace('-', '') + '-' + + xml_file_name.split('_')[1].replace('-', '')[:6] + '_' + + xml_file_name.split('_')[2], + xml_file_name.split('_')[0] + ' ' + + 
xml_file_name.split('_')[1].replace('-', ':') + '_' + + xml_file_name.split('_')[2]] + file_to_download = paths['s3_video'] + \ + date + '/' + \ + file_names[0] + '.mp4' + + if(data_loader_s3.file_exists(file_to_download)): + return file_to_download + + else: + file_to_download = paths['s3_video'] + \ + date + '/' + \ + file_names[1] + '.mp4' + + if (data_loader_s3.file_exists(file_to_download)): + return file_to_download + else: + print('Could not download file: ' + xml_file_name) + return diff --git a/src/traffic_analysis/d02_ref/upload_annotation_names_to_s3.py b/src/traffic_analysis/d02_ref/upload_annotation_names_to_s3.py index cd9819a..b8ee860 100644 --- a/src/traffic_analysis/d02_ref/upload_annotation_names_to_s3.py +++ b/src/traffic_analysis/d02_ref/upload_annotation_names_to_s3.py @@ -1,5 +1,6 @@ from traffic_analysis.d02_ref.ref_utils import get_names_of_folder_content_from_s3 from traffic_analysis.d00_utils.data_loader_s3 import DataLoaderS3 +from traffic_analysis.d02_ref.ref_utils import get_s3_video_path_from_xml_name def upload_annotation_names_to_s3(paths: dict, diff --git a/src/traffic_analysis/d04_modelling/perform_detection_tensorflow.py b/src/traffic_analysis/d04_modelling/perform_detection_tensorflow.py index 0b2e419..31787fb 100644 --- a/src/traffic_analysis/d04_modelling/perform_detection_tensorflow.py +++ b/src/traffic_analysis/d04_modelling/perform_detection_tensorflow.py @@ -9,7 +9,7 @@ remove_overlapping_boxes, letterbox_resize from traffic_analysis.d04_modelling.transfer_learning.convert_darknet_to_tensorflow import parse_anchors, \ yolov3_darknet_to_tensorflow -from traffic_analysis.d04_modelling.transfer_learning.generate_tensorflow_model import YoloV3 +from traffic_analysis.d04_modelling.transfer_learning.tensorflow_model_loader import YoloV3 from traffic_analysis.d04_modelling.perform_detection_opencv import label_detections, \ choose_objects_of_selected_labels diff --git 
a/src/traffic_analysis/d04_modelling/transfer_learning/convert_darknet_to_tensorflow.py b/src/traffic_analysis/d04_modelling/transfer_learning/convert_darknet_to_tensorflow.py index da9f0ef..385a888 100644 --- a/src/traffic_analysis/d04_modelling/transfer_learning/convert_darknet_to_tensorflow.py +++ b/src/traffic_analysis/d04_modelling/transfer_learning/convert_darknet_to_tensorflow.py @@ -8,7 +8,7 @@ import tensorflow as tf import numpy as np -from traffic_analysis.d04_modelling.transfer_learning.generate_tensorflow_model import YoloV3 +from traffic_analysis.d04_modelling.transfer_learning.tensorflow_model_loader import YoloV3 from traffic_analysis.d02_ref.download_detection_model_from_s3 import download_detection_model_from_s3 diff --git a/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_evaluation_utils.py b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_evaluation_utils.py new file mode 100644 index 0000000..c7bac86 --- /dev/null +++ b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_evaluation_utils.py @@ -0,0 +1,421 @@ +from __future__ import division, print_function + +import numpy as np +import sys +from collections import Counter + +PY_VERSION = sys.version_info[0] +iter_cnt = 0 + + +def voc_ap(rec, prec, use_07_metric=True): + """Compute VOC AP given precision and recall. If use_07_metric is true, uses + the VOC 07 11-point method (default:False). + """ + if use_07_metric: + # 11 point metric + ap = 0. + for t in np.arange(0., 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11. 
+ else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def get_preds_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, image_ids, y_pred): + ''' + Given the y_pred of an input image, get the predicted bbox and label info. + return: + pred_content: 2d list. + ''' + image_id = image_ids[0] + + # keep the first dimension 1 + pred_boxes = y_pred[0][0:1] + pred_confs = y_pred[1][0:1] + pred_probs = y_pred[2][0:1] + + boxes, scores, labels = sess.run(gpu_nms_op, + feed_dict={pred_boxes_flag: pred_boxes, + pred_scores_flag: pred_confs * pred_probs}) + + pred_content = [] + for i in range(len(labels)): + x_min, y_min, x_max, y_max = boxes[i] + score = scores[i] + label = labels[i] + pred_content.append([image_id, x_min, y_min, x_max, y_max, score, label]) + + return pred_content + + +def calc_iou(pred_boxes, true_boxes): + ''' + Maintain an efficient way to calculate the IoU matrix using the numpy broadcast tricks. + shape_info: pred_boxes: [N, 4] + true_boxes: [V, 4] + return: IoU matrix: shape: [N, V] + ''' + + # [N, 1, 4] + pred_boxes = np.expand_dims(pred_boxes, -2) + # [1, V, 4] + true_boxes = np.expand_dims(true_boxes, 0) + + # [N, 1, 2] & [1, V, 2] ==> [N, V, 2] + intersect_mins = np.maximum(pred_boxes[..., :2], true_boxes[..., :2]) + intersect_maxs = np.minimum(pred_boxes[..., 2:], true_boxes[..., 2:]) + intersect_wh = np.maximum(intersect_maxs - intersect_mins, 0.) 
+ + # shape: [N, V] + intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1] + # shape: [N, 1, 2] + pred_box_wh = pred_boxes[..., 2:] - pred_boxes[..., :2] + # shape: [N, 1] + pred_box_area = pred_box_wh[..., 0] * pred_box_wh[..., 1] + # [1, V, 2] + true_boxes_wh = true_boxes[..., 2:] - true_boxes[..., :2] + # [1, V] + true_boxes_area = true_boxes_wh[..., 0] * true_boxes_wh[..., 1] + + # shape: [N, V] + iou = intersect_area / (pred_box_area + true_boxes_area - intersect_area + 1e-10) + + return iou + + +def voc_eval(gt_dict, val_preds, classidx, iou_thres=0.5, use_07_metric=False): + ''' + Top level function that does the PASCAL VOC evaluation. + ''' + # 1.obtain gt: extract all gt objects for this class + class_recs = {} + npos = 0 + for img_id in gt_dict: + R = [obj for obj in gt_dict[img_id] if obj[-1] == classidx] + bbox = np.array([x[:4] for x in R]) + det = [False] * len(R) + npos += len(R) + class_recs[img_id] = {'bbox': bbox, 'det': det} + + # 2. obtain pred results + pred = [x for x in val_preds if x[-1] == classidx] + img_ids = [x[0] for x in pred] + confidence = np.array([x[-2] for x in pred]) + BB = np.array([[x[1], x[2], x[3], x[4]] for x in pred]) + + # 3. sort by confidence + sorted_ind = np.argsort(-confidence) + try: + BB = BB[sorted_ind, :] + except: + print('no box, ignore') + return 1e-6, 1e-6, 0, 0, 0 + img_ids = [img_ids[x] for x in sorted_ind] + + # 4. mark TPs and FPs + nd = len(img_ids) + tp = np.zeros(nd) + fp = np.zeros(nd) + + for d in range(nd): + # all the gt info in some image + R = class_recs[img_ids[d]] + bb = BB[d, :] + ovmax = -np.Inf + BBGT = R['bbox'] + + if BBGT.size > 0: + # calc iou + # intersection + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin + 1., 0.) + ih = np.maximum(iymax - iymin + 1., 0.) + inters = iw * ih + + # union + uni = ((bb[2] - bb[0] + 1.) 
* (bb[3] - bb[1] + 1.) + (BBGT[:, 2] - BBGT[:, 0] + 1.) * ( + BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) + + overlaps = inters / uni + ovmax = np.max(overlaps) + jmax = np.argmax(overlaps) + + if ovmax > iou_thres: + # gt not matched yet + if not R['det'][jmax]: + tp[d] = 1. + R['det'][jmax] = 1 + else: + fp[d] = 1. + else: + fp[d] = 1. + + # compute precision recall + fp = np.cumsum(fp) + tp = np.cumsum(tp) + rec = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = voc_ap(rec, prec, use_07_metric) + + # return rec, prec, ap + return npos, nd, tp[-1] / float(npos), tp[-1] / float(nd), ap + + +def evaluate_on_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, + y_pred, y_true, num_classes, iou_thresh=0.5, calc_now=True): + ''' + Given y_pred and y_true of a batch of data, get the recall and precision of the current batch. + This function will perform gpu operation on the GPU. 
+ ''' + + num_images = y_true[0].shape[0] + true_labels_dict = {i: 0 for i in range(num_classes)} # {class: count} + pred_labels_dict = {i: 0 for i in range(num_classes)} + true_positive_dict = {i: 0 for i in range(num_classes)} + + for i in range(num_images): + true_labels_list, true_boxes_list = [], [] + for j in range(3): # three feature maps + # shape: [13, 13, 3, 80] + true_probs_temp = y_true[j][i][..., 5:-1] + # shape: [13, 13, 3, 4] (x_center, y_center, w, h) + true_boxes_temp = y_true[j][i][..., 0:4] + + # [13, 13, 3] + object_mask = true_probs_temp.sum(axis=-1) > 0 + + # [V, 80] V: Ground truth number of the current image + true_probs_temp = true_probs_temp[object_mask] + # [V, 4] + true_boxes_temp = true_boxes_temp[object_mask] + + # [V], labels, each from 0 to 79 + true_labels_list += np.argmax(true_probs_temp, axis=-1).tolist() + # [V, 4] (x_center, y_center, w, h) + true_boxes_list += true_boxes_temp.tolist() + + if len(true_labels_list) != 0: + for cls, count in Counter(true_labels_list).items(): + true_labels_dict[cls] += count + + # [V, 4] (xmin, ymin, xmax, ymax) + true_boxes = np.array(true_boxes_list) + box_centers, box_sizes = true_boxes[:, 0:2], true_boxes[:, 2:4] + true_boxes[:, 0:2] = box_centers - box_sizes / 2. 
+ true_boxes[:, 2:4] = true_boxes[:, 0:2] + box_sizes + + # [1, xxx, 4] + pred_boxes = y_pred[0][i:i + 1] + pred_confs = y_pred[1][i:i + 1] + pred_probs = y_pred[2][i:i + 1] + + # pred_boxes: [N, 4] + # pred_confs: [N] + # pred_labels: [N] + # N: Detected box number of the current image + pred_boxes, pred_confs, pred_labels = sess.run(gpu_nms_op, + feed_dict={pred_boxes_flag: pred_boxes, + pred_scores_flag: pred_confs * pred_probs}) + # len: N + pred_labels_list = [] if pred_labels is None else pred_labels.tolist() + if pred_labels_list == []: + continue + + # calc iou + # [N, V] + iou_matrix = calc_iou(pred_boxes, true_boxes) + # [N] + max_iou_idx = np.argmax(iou_matrix, axis=-1) + + correct_idx = [] + correct_conf = [] + for k in range(max_iou_idx.shape[0]): + pred_labels_dict[pred_labels_list[k]] += 1 + match_idx = max_iou_idx[k] # V level + if iou_matrix[k, match_idx] > iou_thresh and true_labels_list[match_idx] == pred_labels_list[k]: + if match_idx not in correct_idx: + correct_idx.append(match_idx) + correct_conf.append(pred_confs[k]) + else: + same_idx = correct_idx.index(match_idx) + if pred_confs[k] > correct_conf[same_idx]: + correct_idx.pop(same_idx) + correct_conf.pop(same_idx) + correct_idx.append(match_idx) + correct_conf.append(pred_confs[k]) + + for t in correct_idx: + true_positive_dict[true_labels_list[t]] += 1 + + if calc_now: + # avoid divided by 0 + recall = sum(true_positive_dict.values()) / (sum(true_labels_dict.values()) + 1e-6) + precision = sum(true_positive_dict.values()) / (sum(pred_labels_dict.values()) + 1e-6) + + return recall, precision + else: + return true_positive_dict, true_labels_dict, pred_labels_dict + + +def evaluate_on_cpu(y_pred, y_true, num_classes, calc_now=True, max_boxes=50, score_thresh=0.5, iou_thresh=0.5): + ''' + Given y_pred and y_true of a batch of data, get the recall and precision of the current batch. 
+ ''' + + num_images = y_true[0].shape[0] + true_labels_dict = {i: 0 for i in range(num_classes)} # {class: count} + pred_labels_dict = {i: 0 for i in range(num_classes)} + true_positive_dict = {i: 0 for i in range(num_classes)} + + for i in range(num_images): + true_labels_list, true_boxes_list = [], [] + for j in range(3): # three feature maps + # shape: [13, 13, 3, 80] + true_probs_temp = y_true[j][i][..., 5:-1] + # shape: [13, 13, 3, 4] (x_center, y_center, w, h) + true_boxes_temp = y_true[j][i][..., 0:4] + + # [13, 13, 3] + object_mask = true_probs_temp.sum(axis=-1) > 0 + + # [V, 3] V: Ground truth number of the current image + true_probs_temp = true_probs_temp[object_mask] + # [V, 4] + true_boxes_temp = true_boxes_temp[object_mask] + + # [V], labels + true_labels_list += np.argmax(true_probs_temp, axis=-1).tolist() + # [V, 4] (x_center, y_center, w, h) + true_boxes_list += true_boxes_temp.tolist() + + if len(true_labels_list) != 0: + for cls, count in Counter(true_labels_list).items(): + true_labels_dict[cls] += count + + # [V, 4] (xmin, ymin, xmax, ymax) + true_boxes = np.array(true_boxes_list) + box_centers, box_sizes = true_boxes[:, 0:2], true_boxes[:, 2:4] + true_boxes[:, 0:2] = box_centers - box_sizes / 2. 
+ true_boxes[:, 2:4] = true_boxes[:, 0:2] + box_sizes + + # [1, xxx, 4] + pred_boxes = y_pred[0][i:i + 1] + pred_confs = y_pred[1][i:i + 1] + pred_probs = y_pred[2][i:i + 1] + + # pred_boxes: [N, 4] + # pred_confs: [N] + # pred_labels: [N] + # N: Detected box number of the current image + pred_boxes, pred_confs, pred_labels = cpu_nms(pred_boxes, pred_confs * pred_probs, num_classes, + max_boxes=max_boxes, score_thresh=score_thresh, + iou_thresh=iou_thresh) + + # len: N + pred_labels_list = [] if pred_labels is None else pred_labels.tolist() + if pred_labels_list == []: + continue + + # calc iou + # [N, V] + iou_matrix = calc_iou(pred_boxes, true_boxes) + # [N] + max_iou_idx = np.argmax(iou_matrix, axis=-1) + + correct_idx = [] + correct_conf = [] + for k in range(max_iou_idx.shape[0]): + pred_labels_dict[pred_labels_list[k]] += 1 + match_idx = max_iou_idx[k] # V level + if iou_matrix[k, match_idx] > iou_thresh and true_labels_list[match_idx] == pred_labels_list[k]: + if match_idx not in correct_idx: + correct_idx.append(match_idx) + correct_conf.append(pred_confs[k]) + else: + same_idx = correct_idx.index(match_idx) + if pred_confs[k] > correct_conf[same_idx]: + correct_idx.pop(same_idx) + correct_conf.pop(same_idx) + correct_idx.append(match_idx) + correct_conf.append(pred_confs[k]) + + for t in correct_idx: + true_positive_dict[true_labels_list[t]] += 1 + + if calc_now: + # avoid divided by 0 + recall = sum(true_positive_dict.values()) / (sum(true_labels_dict.values()) + 1e-6) + precision = sum(true_positive_dict.values()) / (sum(pred_labels_dict.values()) + 1e-6) + + return recall, precision + else: + return true_positive_dict, true_labels_dict, pred_labels_dict + + +gt_dict = {} # key: img_id, value: gt object list +def parse_gt_rec(gt_filename, target_img_size, letterbox_resize=True): + ''' + parse and re-organize the gt info. + return: + gt_dict: dict. Each key is a img_id, the value is the gt bboxes in the corresponding img. 
+ ''' + + global gt_dict + + if not gt_dict: + new_width, new_height = target_img_size + with open(gt_filename, 'r') as f: + for line in f: + img_id, pic_path, boxes, labels, ori_width, ori_height = parse_line(line) + + objects = [] + for i in range(len(labels)): + x_min, y_min, x_max, y_max = boxes[i] + label = labels[i] + + if letterbox_resize: + resize_ratio = min(new_width / ori_width, new_height / ori_height) + + resize_w = int(resize_ratio * ori_width) + resize_h = int(resize_ratio * ori_height) + + dw = int((new_width - resize_w) / 2) + dh = int((new_height - resize_h) / 2) + + objects.append([x_min * resize_ratio + dw, + y_min * resize_ratio + dh, + x_max * resize_ratio + dw, + y_max * resize_ratio + dh, + label]) + else: + objects.append([x_min * new_width / ori_width, + y_min * new_height / ori_height, + x_max * new_width / ori_width, + y_max * new_height / ori_height, + label]) + gt_dict[img_id] = objects + return gt_dict diff --git a/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_image_formatting_utils.py b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_image_formatting_utils.py new file mode 100644 index 0000000..2495c6f --- /dev/null +++ b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_image_formatting_utils.py @@ -0,0 +1,381 @@ +from __future__ import division, print_function + +import numpy as np +import sys +import cv2 +import random + +PY_VERSION = sys.version_info[0] +iter_cnt = 0 + + +def random_crop_with_constraints(bbox, size, min_scale=0.3, max_scale=1, + max_aspect_ratio=2, constraints=None, + max_trial=50): + """Crop an image randomly with bounding box constraints. + This data augmentation is used in training of + Single Shot Multibox Detector [#]_. More details can be found in + data augmentation section of the original paper. + .. [#] Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, + Scott Reed, Cheng-Yang Fu, Alexander C. Berg. + SSD: Single Shot MultiBox Detector. 
ECCV 2016. + Parameters + ---------- + bbox : numpy.ndarray + Numpy.ndarray with shape (N, 4+) where N is the number of bounding boxes. + The second axis represents attributes of the bounding box. + Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`, + we allow additional attributes other than coordinates, which stay intact + during bounding box transformations. + size : tuple + Tuple of length 2 of image shape as (width, height). + min_scale : float + The minimum ratio between a cropped region and the original image. + The default value is :obj:`0.3`. + max_scale : float + The maximum ratio between a cropped region and the original image. + The default value is :obj:`1`. + max_aspect_ratio : float + The maximum aspect ratio of cropped region. + The default value is :obj:`2`. + constraints : iterable of tuples + An iterable of constraints. + Each constraint should be :obj:`(min_iou, max_iou)` format. + If means no constraint if set :obj:`min_iou` or :obj:`max_iou` to :obj:`None`. + If this argument defaults to :obj:`None`, :obj:`((0.1, None), (0.3, None), + (0.5, None), (0.7, None), (0.9, None), (None, 1))` will be used. + max_trial : int + Maximum number of trials for each constraint before exit no matter what. + Returns + ------- + numpy.ndarray + Cropped bounding boxes with shape :obj:`(M, 4+)` where M <= N. + tuple + Tuple of length 4 as (x_offset, y_offset, new_width, new_height). 
+ """ + # default params in paper + if constraints is None: + constraints = ( + (0.1, None), + (0.3, None), + (0.5, None), + (0.7, None), + (0.9, None), + (None, 1), + ) + + w, h = size + + candidates = [(0, 0, w, h)] + for min_iou, max_iou in constraints: + min_iou = -np.inf if min_iou is None else min_iou + max_iou = np.inf if max_iou is None else max_iou + + for _ in range(max_trial): + scale = random.uniform(min_scale, max_scale) + aspect_ratio = random.uniform( + max(1 / max_aspect_ratio, scale * scale), + min(max_aspect_ratio, 1 / (scale * scale))) + crop_h = int(h * scale / np.sqrt(aspect_ratio)) + crop_w = int(w * scale * np.sqrt(aspect_ratio)) + + crop_t = random.randrange(h - crop_h) + crop_l = random.randrange(w - crop_w) + crop_bb = np.array((crop_l, crop_t, crop_l + crop_w, crop_t + crop_h)) + + if len(bbox) == 0: + top, bottom = crop_t, crop_t + crop_h + left, right = crop_l, crop_l + crop_w + return bbox, (left, top, right-left, bottom-top) + + iou = bbox_iou(bbox, crop_bb[np.newaxis]) + if min_iou <= iou.min() and iou.max() <= max_iou: + top, bottom = crop_t, crop_t + crop_h + left, right = crop_l, crop_l + crop_w + candidates.append((left, top, right-left, bottom-top)) + break + + # random select one + while candidates: + crop = candidates.pop(np.random.randint(0, len(candidates))) + new_bbox = bbox_crop(bbox, crop, allow_outside_center=False) + if new_bbox.size < 1: + continue + new_crop = (crop[0], crop[1], crop[2], crop[3]) + return new_bbox, new_crop + return bbox, (0, 0, w, h) + + +def random_color_distort(img, brightness_delta=32, hue_vari=18, sat_vari=0.5, val_vari=0.5): + ''' + randomly distort image color. Adjust brightness, hue, saturation, value. + param: + img: a BGR uint8 format OpenCV image. HWC format. 
+ ''' + + def random_hue(img_hsv, hue_vari, p=0.5): + if np.random.uniform(0, 1) > p: + hue_delta = np.random.randint(-hue_vari, hue_vari) + img_hsv[:, :, 0] = (img_hsv[:, :, 0] + hue_delta) % 180 + return img_hsv + + def random_saturation(img_hsv, sat_vari, p=0.5): + if np.random.uniform(0, 1) > p: + sat_mult = 1 + np.random.uniform(-sat_vari, sat_vari) + img_hsv[:, :, 1] *= sat_mult + return img_hsv + + def random_value(img_hsv, val_vari, p=0.5): + if np.random.uniform(0, 1) > p: + val_mult = 1 + np.random.uniform(-val_vari, val_vari) + img_hsv[:, :, 2] *= val_mult + return img_hsv + + def random_brightness(img, brightness_delta, p=0.5): + if np.random.uniform(0, 1) > p: + img = img.astype(np.float32) + brightness_delta = int(np.random.uniform(-brightness_delta, brightness_delta)) + img = img + brightness_delta + return np.clip(img, 0, 255) + + # brightness + img = random_brightness(img, brightness_delta) + img = img.astype(np.uint8) + + # color jitter + img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.float32) + + if np.random.randint(0, 2): + img_hsv = random_value(img_hsv, val_vari) + img_hsv = random_saturation(img_hsv, sat_vari) + img_hsv = random_hue(img_hsv, hue_vari) + else: + img_hsv = random_saturation(img_hsv, sat_vari) + img_hsv = random_hue(img_hsv, hue_vari) + img_hsv = random_value(img_hsv, val_vari) + + img_hsv = np.clip(img_hsv, 0, 255) + img = cv2.cvtColor(img_hsv.astype(np.uint8), cv2.COLOR_HSV2BGR) + + return img + + +def letterbox_resize(img, new_width, new_height, interp=0): + ''' + Letterbox resize. keep the original aspect ratio in the resized image. 
+ ''' + ori_height, ori_width = img.shape[:2] + + resize_ratio = min(new_width / ori_width, new_height / ori_height) + + resize_w = int(resize_ratio * ori_width) + resize_h = int(resize_ratio * ori_height) + + img = cv2.resize(img, (resize_w, resize_h), interpolation=interp) + image_padded = np.full((new_height, new_width, 3), 128, np.uint8) + + dw = int((new_width - resize_w) / 2) + dh = int((new_height - resize_h) / 2) + + image_padded[dh: resize_h + dh, dw: resize_w + dw, :] = img + + return image_padded, resize_ratio, dw, dh + + +def resize_with_bbox(img, bbox, new_width, new_height, interp=0, letterbox=False): + ''' + Resize the image and correct the bbox accordingly. + ''' + + if letterbox: + image_padded, resize_ratio, dw, dh = letterbox_resize(img, new_width, new_height, interp) + + # xmin, xmax + bbox[:, [0, 2]] = bbox[:, [0, 2]] * resize_ratio + dw + # ymin, ymax + bbox[:, [1, 3]] = bbox[:, [1, 3]] * resize_ratio + dh + + return image_padded, bbox + else: + ori_height, ori_width = img.shape[:2] + + img = cv2.resize(img, (new_width, new_height), interpolation=interp) + + # xmin, xmax + bbox[:, [0, 2]] = bbox[:, [0, 2]] / ori_width * new_width + # ymin, ymax + bbox[:, [1, 3]] = bbox[:, [1, 3]] / ori_height * new_height + + return img, bbox + + +def random_flip(img, bbox, px=0, py=0): + ''' + Randomly flip the image and correct the bbox. 
+ param: + px: + the probability of horizontal flip + py: + the probability of vertical flip + ''' + height, width = img.shape[:2] + if np.random.uniform(0, 1) < px: + img = cv2.flip(img, 1) + xmax = width - bbox[:, 0] + xmin = width - bbox[:, 2] + bbox[:, 0] = xmin + bbox[:, 2] = xmax + + if np.random.uniform(0, 1) < py: + img = cv2.flip(img, 0) + ymax = height - bbox[:, 1] + ymin = height - bbox[:, 3] + bbox[:, 1] = ymin + bbox[:, 3] = ymax + return img, bbox + + +def random_expand(img, bbox, max_ratio=4, fill=0, keep_ratio=True): + ''' + Random expand original image with borders, this is identical to placing + the original image on a larger canvas. + param: + max_ratio : + Maximum ratio of the output image on both direction(vertical and horizontal) + fill : + The value(s) for padded borders. + keep_ratio : bool + If `True`, will keep output image the same aspect ratio as input. + ''' + h, w, c = img.shape + ratio_x = random.uniform(1, max_ratio) + if keep_ratio: + ratio_y = ratio_x + else: + ratio_y = random.uniform(1, max_ratio) + + oh, ow = int(h * ratio_y), int(w * ratio_x) + off_y = random.randint(0, oh - h) + off_x = random.randint(0, ow - w) + + dst = np.full(shape=(oh, ow, c), fill_value=fill, dtype=img.dtype) + + dst[off_y:off_y + h, off_x:off_x + w, :] = img + + # correct bbox + bbox[:, :2] += (off_x, off_y) + bbox[:, 2:4] += (off_x, off_y) + + return dst, bbox + + +def mix_up(img1, img2, bbox1, bbox2): + ''' + return: + mix_img: HWC format mix up image + mix_bbox: [N, 5] shape mix up bbox, i.e. `x_min, y_min, x_max, y_mix, mixup_weight`. 
+ ''' + height = max(img1.shape[0], img2.shape[0]) + width = max(img1.shape[1], img2.shape[1]) + + mix_img = np.zeros(shape=(height, width, 3), dtype='float32') + + # rand_num = np.random.random() + rand_num = np.random.beta(1.5, 1.5) + rand_num = max(0, min(1, rand_num)) + mix_img[:img1.shape[0], :img1.shape[1], :] = img1.astype('float32') * rand_num + mix_img[:img2.shape[0], :img2.shape[1], :] += img2.astype('float32') * (1. - rand_num) + + mix_img = mix_img.astype('uint8') + + # the last element of the 2nd dimension is the mix up weight + bbox1 = np.concatenate((bbox1, np.full(shape=(bbox1.shape[0], 1), fill_value=rand_num)), axis=-1) + bbox2 = np.concatenate((bbox2, np.full(shape=(bbox2.shape[0], 1), fill_value=1. - rand_num)), axis=-1) + mix_bbox = np.concatenate((bbox1, bbox2), axis=0) + + return mix_img, mix_bbox + + +def bbox_crop(bbox, crop_box=None, allow_outside_center=True): + """Crop bounding boxes according to slice area. + This method is mainly used with image cropping to ensure bounding boxes fit + within the cropped image. + Parameters + ---------- + bbox : numpy.ndarray + Numpy.ndarray with shape (N, 4+) where N is the number of bounding boxes. + The second axis represents attributes of the bounding box. + Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`, + we allow additional attributes other than coordinates, which stay intact + during bounding box transformations. + crop_box : tuple + Tuple of length 4. :math:`(x_{min}, y_{min}, width, height)` + allow_outside_center : bool + If `False`, remove bounding boxes which have centers outside cropping area. + Returns + ------- + numpy.ndarray + Cropped bounding boxes with shape (M, 4+) where M <= N. 
+ """ + bbox = bbox.copy() + if crop_box is None: + return bbox + if not len(crop_box) == 4: + raise ValueError( + "Invalid crop_box parameter, requires length 4, given {}".format(str(crop_box))) + if sum([int(c is None) for c in crop_box]) == 4: + return bbox + + l, t, w, h = crop_box + + left = l if l else 0 + top = t if t else 0 + right = left + (w if w else np.inf) + bottom = top + (h if h else np.inf) + crop_bbox = np.array((left, top, right, bottom)) + + if allow_outside_center: + mask = np.ones(bbox.shape[0], dtype=bool) + else: + centers = (bbox[:, :2] + bbox[:, 2:4]) / 2 + mask = np.logical_and(crop_bbox[:2] <= centers, centers < crop_bbox[2:]).all(axis=1) + + # transform borders + bbox[:, :2] = np.maximum(bbox[:, :2], crop_bbox[:2]) + bbox[:, 2:4] = np.minimum(bbox[:, 2:4], crop_bbox[2:4]) + bbox[:, :2] -= crop_bbox[:2] + bbox[:, 2:4] -= crop_bbox[:2] + + mask = np.logical_and(mask, (bbox[:, :2] < bbox[:, 2:4]).all(axis=1)) + bbox = bbox[mask] + return bbox + + +def bbox_iou(bbox_a, bbox_b, offset=0): + """Calculate Intersection-Over-Union(IOU) of two bounding boxes. + Parameters + ---------- + bbox_a : numpy.ndarray + An ndarray with shape :math:`(N, 4)`. + bbox_b : numpy.ndarray + An ndarray with shape :math:`(M, 4)`. + offset : float or int, default is 0 + The ``offset`` is used to control the whether the width(or height) is computed as + (right - left + ``offset``). + Note that the offset must be 0 for normalized bboxes, whose ranges are in ``[0, 1]``. + Returns + ------- + numpy.ndarray + An ndarray with shape :math:`(N, M)` indicates IOU between each pairs of + bounding boxes in `bbox_a` and `bbox_b`. 
+ """ + if bbox_a.shape[1] < 4 or bbox_b.shape[1] < 4: + raise IndexError("Bounding boxes axis 1 must have at least length 4") + + tl = np.maximum(bbox_a[:, None, :2], bbox_b[:, :2]) + br = np.minimum(bbox_a[:, None, 2:4], bbox_b[:, 2:4]) + + area_i = np.prod(br - tl + offset, axis=2) * (tl < br).all(axis=2) + area_a = np.prod(bbox_a[:, 2:4] - bbox_a[:, :2] + offset, axis=1) + area_b = np.prod(bbox_b[:, 2:4] - bbox_b[:, :2] + offset, axis=1) + return area_i / (area_a[:, None] + area_b - area_i) diff --git a/src/traffic_analysis/d04_modelling/transfer_learning/generate_tensorflow_model.py b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_model_loader.py similarity index 59% rename from src/traffic_analysis/d04_modelling/transfer_learning/generate_tensorflow_model.py rename to src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_model_loader.py index e993bc6..b0633fa 100644 --- a/src/traffic_analysis/d04_modelling/transfer_learning/generate_tensorflow_model.py +++ b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_model_loader.py @@ -192,6 +192,186 @@ def _reshape(result): return boxes, confs, probs + def loss_layer(self, feature_map_i, y_true, anchors): + ''' + calc loss function from a certain scale + input: + feature_map_i: feature maps of a certain scale. shape: [N, 13, 13, 3*(5 + num_class)] etc. + y_true: y_ture from a certain scale. shape: [N, 13, 13, 3, 5 + num_class + 1] etc. + anchors: shape [9, 2] + ''' + + # size in [h, w] format! don't get messed up! 
+ grid_size = tf.shape(feature_map_i)[1:3] + # the downscale ratio in height and weight + ratio = tf.cast(self.img_size / grid_size, tf.float32) + # N: batch_size + N = tf.cast(tf.shape(feature_map_i)[0], tf.float32) + + x_y_offset, pred_boxes, pred_conf_logits, pred_prob_logits = self.reorg_layer(feature_map_i, anchors) + + ########### + # get mask + ########### + + # shape: take 416x416 input image and 13*13 feature_map for example: + # [N, 13, 13, 3, 1] + object_mask = y_true[..., 4:5] + + # the calculation of ignore mask if referred from + # https://github.com/pjreddie/darknet/blob/master/src/yolo_layer.c#L179 + ignore_mask = tf.TensorArray(tf.float32, size=0, dynamic_size=True) + + def loop_cond(idx, ignore_mask): + return tf.less(idx, tf.cast(N, tf.int32)) + + def loop_body(idx, ignore_mask): + # shape: [13, 13, 3, 4] & [13, 13, 3] ==> [V, 4] + # V: num of true gt box of each image in a batch + valid_true_boxes = tf.boolean_mask(y_true[idx, ..., 0:4], tf.cast(object_mask[idx, ..., 0], 'bool')) + # shape: [13, 13, 3, 4] & [V, 4] ==> [13, 13, 3, V] + iou = self.box_iou(pred_boxes[idx], valid_true_boxes) + # shape: [13, 13, 3] + best_iou = tf.reduce_max(iou, axis=-1) + # shape: [13, 13, 3] + ignore_mask_tmp = tf.cast(best_iou < 0.5, tf.float32) + # finally will be shape: [N, 13, 13, 3] + ignore_mask = ignore_mask.write(idx, ignore_mask_tmp) + return idx + 1, ignore_mask + + _, ignore_mask = tf.while_loop(cond=loop_cond, body=loop_body, loop_vars=[0, ignore_mask]) + ignore_mask = ignore_mask.stack() + # shape: [N, 13, 13, 3, 1] + ignore_mask = tf.expand_dims(ignore_mask, -1) + + # shape: [N, 13, 13, 3, 2] + pred_box_xy = pred_boxes[..., 0:2] + pred_box_wh = pred_boxes[..., 2:4] + + # get xy coordinates in one cell from the feature_map + # numerical range: 0 ~ 1 + # shape: [N, 13, 13, 3, 2] + true_xy = y_true[..., 0:2] / ratio[::-1] - x_y_offset + pred_xy = pred_box_xy / ratio[::-1] - x_y_offset + + # get_tw_th + # numerical range: 0 ~ 1 + # shape: [N, 13, 13, 3, 
2] + true_tw_th = y_true[..., 2:4] / anchors + pred_tw_th = pred_box_wh / anchors + # for numerical stability + true_tw_th = tf.where(condition=tf.equal(true_tw_th, 0), + x=tf.ones_like(true_tw_th), y=true_tw_th) + pred_tw_th = tf.where(condition=tf.equal(pred_tw_th, 0), + x=tf.ones_like(pred_tw_th), y=pred_tw_th) + true_tw_th = tf.log(tf.clip_by_value(true_tw_th, 1e-9, 1e9)) + pred_tw_th = tf.log(tf.clip_by_value(pred_tw_th, 1e-9, 1e9)) + + # box size punishment: + # box with smaller area has bigger weight. This is taken from the yolo darknet C source code. + # shape: [N, 13, 13, 3, 1] + box_loss_scale = 2. - (y_true[..., 2:3] / tf.cast(self.img_size[1], tf.float32)) * ( + y_true[..., 3:4] / tf.cast(self.img_size[0], tf.float32)) + + ############ + # loss_part + ############ + # mix_up weight + # [N, 13, 13, 3, 1] + mix_w = y_true[..., -1:] + # shape: [N, 13, 13, 3, 1] + xy_loss = tf.reduce_sum(tf.square(true_xy - pred_xy) * object_mask * box_loss_scale * mix_w) / N + wh_loss = tf.reduce_sum(tf.square(true_tw_th - pred_tw_th) * object_mask * box_loss_scale * mix_w) / N + + # shape: [N, 13, 13, 3, 1] + conf_pos_mask = object_mask + conf_neg_mask = (1 - object_mask) * ignore_mask + conf_loss_pos = conf_pos_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=object_mask, + logits=pred_conf_logits) + conf_loss_neg = conf_neg_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=object_mask, + logits=pred_conf_logits) + # TODO: may need to balance the pos-neg by multiplying some weights + conf_loss = conf_loss_pos + conf_loss_neg + if self.use_focal_loss: + alpha = 1.0 + gamma = 2.0 + # TODO: alpha should be a mask array if needed + focal_mask = alpha * tf.pow(tf.abs(object_mask - tf.sigmoid(pred_conf_logits)), gamma) + conf_loss *= focal_mask + conf_loss = tf.reduce_sum(conf_loss * mix_w) / N + + # shape: [N, 13, 13, 3, 1] + # whether to use label smooth + if self.use_label_smooth: + delta = 0.01 + label_target = (1 - delta) * y_true[..., 5:-1] + delta * 1. 
/ self.class_num + else: + label_target = y_true[..., 5:-1] + class_loss = object_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=label_target, + logits=pred_prob_logits) * mix_w + class_loss = tf.reduce_sum(class_loss) / N + + return xy_loss, wh_loss, conf_loss, class_loss + + def box_iou(self, pred_boxes, valid_true_boxes): + ''' + param: + pred_boxes: [13, 13, 3, 4], (center_x, center_y, w, h) + valid_true: [V, 4] + ''' + + # [13, 13, 3, 2] + pred_box_xy = pred_boxes[..., 0:2] + pred_box_wh = pred_boxes[..., 2:4] + + # shape: [13, 13, 3, 1, 2] + pred_box_xy = tf.expand_dims(pred_box_xy, -2) + pred_box_wh = tf.expand_dims(pred_box_wh, -2) + + # [V, 2] + true_box_xy = valid_true_boxes[:, 0:2] + true_box_wh = valid_true_boxes[:, 2:4] + + # [13, 13, 3, 1, 2] & [V, 2] ==> [13, 13, 3, V, 2] + intersect_mins = tf.maximum(pred_box_xy - pred_box_wh / 2., + true_box_xy - true_box_wh / 2.) + intersect_maxs = tf.minimum(pred_box_xy + pred_box_wh / 2., + true_box_xy + true_box_wh / 2.) + intersect_wh = tf.maximum(intersect_maxs - intersect_mins, 0.) + + # shape: [13, 13, 3, V] + intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1] + # shape: [13, 13, 3, 1] + pred_box_area = pred_box_wh[..., 0] * pred_box_wh[..., 1] + # shape: [V] + true_box_area = true_box_wh[..., 0] * true_box_wh[..., 1] + # shape: [1, V] + true_box_area = tf.expand_dims(true_box_area, axis=0) + + # [13, 13, 3, V] + iou = intersect_area / (pred_box_area + true_box_area - intersect_area + 1e-10) + + return iou + + def compute_loss(self, y_pred, y_true): + ''' + param: + y_pred: returned feature_map list by `forward` function: [feature_map_1, feature_map_2, feature_map_3] + y_true: input y_true by the tf.data pipeline + ''' + loss_xy, loss_wh, loss_conf, loss_class = 0., 0., 0., 0. 
+ anchor_group = [self.anchors[6:9], self.anchors[3:6], self.anchors[0:3]] + + # calc loss in 3 scales + for i in range(len(y_pred)): + result = self.loss_layer(y_pred[i], y_true[i], anchor_group[i]) + loss_xy += result[0] + loss_wh += result[1] + loss_conf += result[2] + loss_class += result[3] + total_loss = loss_xy + loss_wh + loss_conf + loss_class + return [total_loss, loss_xy, loss_wh, loss_conf, loss_class] + def conv2d(inputs, filters, kernel_size, strides=1): def _fixed_padding(inputs, kernel_size): diff --git a/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_processing_utils.py b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_processing_utils.py new file mode 100644 index 0000000..91b0825 --- /dev/null +++ b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_processing_utils.py @@ -0,0 +1,127 @@ +from __future__ import division, print_function + +import numpy as np +import sys +import tensorflow as tf + +PY_VERSION = sys.version_info[0] +iter_cnt = 0 + + +def cpu_nms(boxes, scores, num_classes, max_boxes=50, score_thresh=0.5, iou_thresh=0.5): + """ + Perform NMS on CPU. 
+ Arguments: + boxes: shape [1, 10647, 4] + scores: shape [1, 10647, num_classes] + """ + + boxes = boxes.reshape(-1, 4) + scores = scores.reshape(-1, num_classes) + # Picked bounding boxes + picked_boxes, picked_score, picked_label = [], [], [] + + for i in range(num_classes): + indices = np.where(scores[:,i] >= score_thresh) + filter_boxes = boxes[indices] + filter_scores = scores[:,i][indices] + if len(filter_boxes) == 0: + continue + # do non_max_suppression on the cpu + indices = py_nms(filter_boxes, filter_scores, + max_boxes=max_boxes, iou_thresh=iou_thresh) + picked_boxes.append(filter_boxes[indices]) + picked_score.append(filter_scores[indices]) + picked_label.append(np.ones(len(indices), dtype='int32')*i) + if len(picked_boxes) == 0: + return None, None, None + + boxes = np.concatenate(picked_boxes, axis=0) + score = np.concatenate(picked_score, axis=0) + label = np.concatenate(picked_label, axis=0) + + return boxes, score, label + + + +def py_nms(boxes, scores, max_boxes=50, iou_thresh=0.5): + """ + Pure Python NMS baseline. 
+
+ Arguments: boxes: shape of [-1, 4], the value of '-1' means that we don't know the
+ exact number of boxes
+ scores: shape of [-1,]
+ max_boxes: representing the maximum of boxes to be selected by non_max_suppression
+ iou_thresh: representing iou_threshold for deciding to keep boxes
+ """
+ assert boxes.shape[1] == 4 and len(scores.shape) == 1
+
+ x1 = boxes[:, 0]
+ y1 = boxes[:, 1]
+ x2 = boxes[:, 2]
+ y2 = boxes[:, 3]
+
+ areas = (x2 - x1) * (y2 - y1)
+ order = scores.argsort()[::-1]
+
+ keep = []
+ while order.size > 0:
+ i = order[0]
+ keep.append(i)
+ xx1 = np.maximum(x1[i], x1[order[1:]])
+ yy1 = np.maximum(y1[i], y1[order[1:]])
+ xx2 = np.minimum(x2[i], x2[order[1:]])
+ yy2 = np.minimum(y2[i], y2[order[1:]])
+
+ w = np.maximum(0.0, xx2 - xx1 + 1)
+ h = np.maximum(0.0, yy2 - yy1 + 1)
+ inter = w * h
+ ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+ inds = np.where(ovr <= iou_thresh)[0]
+ order = order[inds + 1]
+
+ return keep[:max_boxes]
+
+
+def gpu_nms(boxes, scores, num_classes, max_boxes=50, score_thresh=0.5, nms_thresh=0.5):
+ """
+ Perform NMS on GPU using TensorFlow.
+
+ params:
+ boxes: tensor of shape [1, 10647, 4] # 10647=(13*13+26*26+52*52)*3, for input 416*416 image
+ scores: tensor of shape [1, 10647, num_classes], score=conf*prob
+ num_classes: total number of classes
+ max_boxes: integer, maximum number of predicted boxes you'd like, default is 50
+ score_thresh: if [ highest class probability score < score_threshold]
+ then get rid of the corresponding box
+ nms_thresh: real value, "intersection over union" threshold used for NMS filtering
+ """
+
+ boxes_list, label_list, score_list = [], [], []
+ max_boxes = tf.constant(max_boxes, dtype='int32')
+
+ # since we do nms for single image, then reshape it
+ boxes = tf.reshape(boxes, [-1, 4]) # '-1' means we don't know the exact number of boxes
+ score = tf.reshape(scores, [-1, num_classes])
+
+ # Step 1: Create a filtering mask based on "box_class_scores" by using "threshold". 
+ mask = tf.greater_equal(score, tf.constant(score_thresh)) + # Step 2: Do non_max_suppression for each class + for i in range(num_classes): + # Step 3: Apply the mask to scores, boxes and pick them out + filter_boxes = tf.boolean_mask(boxes, mask[:,i]) + filter_score = tf.boolean_mask(score[:,i], mask[:,i]) + nms_indices = tf.image.non_max_suppression(boxes=filter_boxes, + scores=filter_score, + max_output_size=max_boxes, + iou_threshold=nms_thresh, name='nms_indices') + label_list.append(tf.ones_like(tf.gather(filter_score, nms_indices), 'int32')*i) + boxes_list.append(tf.gather(filter_boxes, nms_indices)) + score_list.append(tf.gather(filter_score, nms_indices)) + + boxes = tf.concat(boxes_list, axis=0) + score = tf.concat(score_list, axis=0) + label = tf.concat(label_list, axis=0) + + return boxes, score, label \ No newline at end of file diff --git a/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_training_utils.py b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_training_utils.py new file mode 100644 index 0000000..47e094b --- /dev/null +++ b/src/traffic_analysis/d04_modelling/transfer_learning/tensorflow_training_utils.py @@ -0,0 +1,293 @@ +from __future__ import division, print_function + +import numpy as np +import sys +import cv2 +import random +import tensorflow as tf +from tensorflow.core.framework import summary_pb2 + +from traffic_analysis.d00_utils.load_confs import load_training_parameters +from traffic_analysis.d04_modelling.transfer_learning.tensorflow_image_formatting_utils import ( + mix_up, resize_with_bbox, random_flip, random_color_distort, random_expand, random_crop_with_constraints) + +PY_VERSION = sys.version_info[0] +iter_cnt = 0 + + +def get_batch_data(batch_line, class_num, img_size, anchors, mode, multi_scale=False, + mix_up=False, letterbox_resize=True, interval=10): + ''' + generate a batch of imgs and labels + param: + batch_line: a batch of lines from train/val.txt files + class_num: num of total 
classes. + img_size: the image size to be resized to. format: [width, height]. + anchors: anchors. shape: [9, 2]. + mode: 'train' or 'val'. if set to 'train', data augmentation will be applied. + multi_scale: whether to use multi_scale training, img_size varies from [320, 320] to [640, 640] by default. + Note that it will take effect only when mode is set to 'train'. + letterbox_resize: whether to use the letterbox resize, i.e., keep the original aspect ratio in the resized img. + interval: change the scale of image every interval batches. + ''' + global iter_cnt + # multi_scale training + if multi_scale and mode == 'train': + random.seed(iter_cnt // interval) + random_img_size = [[x * 32, x * 32] for x in range(10, 20)] + img_size = random.sample(random_img_size, 1)[0] + iter_cnt += 1 + + img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch = [], [], [], [], [] + + # mix up strategy + if mix_up and mode == 'train': + mix_lines = [] + batch_line = batch_line.tolist() + for idx, line in enumerate(batch_line): + if np.random.uniform(0, 1) < 0.5: + mix_lines.append([line, random.sample(batch_line[:idx] + batch_line[idx+1:], 1)[0]]) + else: + mix_lines.append(line) + batch_line = mix_lines + + for line in batch_line: + img_idx, img, y_true_13, y_true_26, y_true_52 = parse_data(line, class_num, img_size, anchors, mode, + letterbox_resize) + + img_idx_batch.append(img_idx) + img_batch.append(img) + y_true_13_batch.append(y_true_13) + y_true_26_batch.append(y_true_26) + y_true_52_batch.append(y_true_52) + + img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch = np.asarray(img_idx_batch, np.int64), \ + np.asarray(img_batch), \ + np.asarray(y_true_13_batch), \ + np.asarray(y_true_26_batch), \ + np.asarray(y_true_52_batch) + + return img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch + + +def parse_data(line, class_num, img_size, anchors, mode, letterbox_resize): + ''' + param: + line: a line from the 
training/test txt file
+ class_num: total class nums.
+ img_size: the size of image to be resized to. [width, height] format.
+ anchors: anchors.
+ mode: 'train' or 'val'. When set to 'train', data_augmentation will be applied.
+ letterbox_resize: whether to use the letterbox resize, i.e., keep the original aspect ratio in the resized img.
+ '''
+ if not isinstance(line, list):
+ img_idx, pic_path, boxes, labels, _, _ = parse_line(line)
+ img = cv2.imread(pic_path)
+ # expand the 2nd dimension, mix up weight default to 1.
+ boxes = np.concatenate((boxes, np.full(shape=(boxes.shape[0], 1), fill_value=1., dtype=np.float32)), axis=-1)
+ else:
+ # the mix up case
+ _, pic_path1, boxes1, labels1, _, _ = parse_line(line[0])
+ img1 = cv2.imread(pic_path1)
+ img_idx, pic_path2, boxes2, labels2, _, _ = parse_line(line[1])
+ img2 = cv2.imread(pic_path2)
+
+ img, boxes = mix_up(img1, img2, boxes1, boxes2)
+ labels = np.concatenate((labels1, labels2))
+
+ if mode == 'train':
+ # random color jittering
+ # NOTE: applying color distort may lead to bad performance sometimes
+ img = random_color_distort(img)
+
+ # random expansion with prob 0.5
+ if np.random.uniform(0, 1) > 0.5:
+ img, boxes = random_expand(img, boxes, 4)
+
+ # random cropping
+ h, w, _ = img.shape
+ boxes, crop = random_crop_with_constraints(boxes, (w, h))
+ x0, y0, w, h = crop
+ img = img[y0: y0+h, x0: x0+w]
+
+ # resize with random interpolation
+ h, w, _ = img.shape
+ interp = np.random.randint(0, 5)
+ img, boxes = resize_with_bbox(img, boxes, img_size[0], img_size[1], interp=interp, letterbox=letterbox_resize)
+
+ # random horizontal flip
+ h, w, _ = img.shape
+ img, boxes = random_flip(img, boxes, px=0.5)
+ else:
+ img, boxes = resize_with_bbox(img, boxes, img_size[0], img_size[1], interp=1, letterbox=letterbox_resize)
+
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
+
+ # the input of yolo_v3 should be in range 0~1
+ img = img / 255. 
+
+ y_true_13, y_true_26, y_true_52 = process_box(boxes, labels, img_size, class_num, anchors)
+
+ return img_idx, img, y_true_13, y_true_26, y_true_52
+
+
+def process_box(boxes, labels, img_size, class_num, anchors):
+ '''
+ Generate the y_true label, i.e. the ground truth feature_maps in 3 different scales.
+ params:
+ boxes: [N, 5] shape, float32 dtype. `x_min, y_min, x_max, y_max, mixup_weight`.
+ labels: [N] shape, int64 dtype.
+ class_num: int64 num.
+ anchors: [9, 2] shape, float32 dtype.
+ '''
+ anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
+
+ # convert boxes form:
+ # shape: [N, 2]
+ # (x_center, y_center)
+ box_centers = (boxes[:, 0:2] + boxes[:, 2:4]) / 2
+ # (width, height)
+ box_sizes = boxes[:, 2:4] - boxes[:, 0:2]
+
+ # [13, 13, 3, 5+num_class+1] `5` means coords and labels. `1` means mix up weight.
+ y_true_13 = np.zeros((img_size[1] // 32, img_size[0] // 32, 3, 6 + class_num), np.float32)
+ y_true_26 = np.zeros((img_size[1] // 16, img_size[0] // 16, 3, 6 + class_num), np.float32)
+ y_true_52 = np.zeros((img_size[1] // 8, img_size[0] // 8, 3, 6 + class_num), np.float32)
+
+ # mix up weight default to 1.
+ y_true_13[..., -1] = 1.
+ y_true_26[..., -1] = 1.
+ y_true_52[..., -1] = 1. 
+ + y_true = [y_true_13, y_true_26, y_true_52] + + # [N, 1, 2] + box_sizes = np.expand_dims(box_sizes, 1) + # broadcast tricks + # [N, 1, 2] & [9, 2] ==> [N, 9, 2] + mins = np.maximum(- box_sizes / 2, - anchors / 2) + maxs = np.minimum(box_sizes / 2, anchors / 2) + # [N, 9, 2] + whs = maxs - mins + + # [N, 9] + iou = (whs[:, :, 0] * whs[:, :, 1]) / ( + box_sizes[:, :, 0] * box_sizes[:, :, 1] + anchors[:, 0] * anchors[:, 1] - whs[:, :, 0] * whs[:, :, + 1] + 1e-10) + # [N] + best_match_idx = np.argmax(iou, axis=1) + + ratio_dict = {1.: 8., 2.: 16., 3.: 32.} + for i, idx in enumerate(best_match_idx): + # idx: 0,1,2 ==> 2; 3,4,5 ==> 1; 6,7,8 ==> 0 + feature_map_group = 2 - idx // 3 + # scale ratio: 0,1,2 ==> 8; 3,4,5 ==> 16; 6,7,8 ==> 32 + ratio = ratio_dict[np.ceil((idx + 1) / 3.)] + x = int(np.floor(box_centers[i, 0] / ratio)) + y = int(np.floor(box_centers[i, 1] / ratio)) + k = anchors_mask[feature_map_group].index(idx) + c = labels[i] + # print(feature_map_group, '|', y,x,k,c) + + y_true[feature_map_group][y, x, k, :2] = box_centers[i] + y_true[feature_map_group][y, x, k, 2:4] = box_sizes[i] + y_true[feature_map_group][y, x, k, 4] = 1. + y_true[feature_map_group][y, x, k, 5 + c] = 1. + y_true[feature_map_group][y, x, k, -1] = boxes[i, -1] + + return y_true_13, y_true_26, y_true_52 + + +def parse_line(line): + ''' + Given a line from the training/test txt file, return parsed info. + line format: line_index, img_path, img_width, img_height, [box_info_1 (5 number)], ... + return: + line_idx: int64 + pic_path: string. + boxes: shape [N, 4], N is the ground truth count, elements in the second + dimension are [x_min, y_min, x_max, y_max] + labels: shape [N]. class index. + img_width: int. + img_height: int + ''' + if 'str' not in str(type(line)): + line = line.decode() + s = line.strip().split(' ') + assert len(s) > 8, 'Annotation error! Please check your file. Make sure there is an object in each image.' 
+ line_idx = int(s[0]) + pic_path = s[1] + img_width = int(s[2]) + img_height = int(s[3]) + s = s[4:] + assert len(s) % 5 == 0, 'Annotation error! Please check your file. Maybe partially missing some coordinates?' + box_cnt = len(s) // 5 + boxes = [] + labels = [] + for i in range(box_cnt): + label, x_min, y_min, x_max, y_max = int(s[i * 5]), float(s[i * 5 + 1]), float(s[i * 5 + 2]), float( + s[i * 5 + 3]), float(s[i * 5 + 4]) + boxes.append([x_min, y_min, x_max, y_max]) + labels.append(label) + boxes = np.asarray(boxes, np.float32) + labels = np.asarray(labels, np.int64) + return line_idx, pic_path, boxes, labels, img_width, img_height + + +def make_summary(name, val): + return summary_pb2.Summary(value=[summary_pb2.Summary.Value(tag=name, simple_value=val)]) + + +def config_learning_rate(lr_decay_freq, train_batch_num, global_step): + train_params = load_training_parameters() + if train_params['lr_type'] == 'exponential': + lr_tmp = tf.train.exponential_decay(train_params['learning_rate_init'], global_step, lr_decay_freq, + train_params['lr_decay_factor'], staircase=True, + name='exponential_learning_rate') + return tf.maximum(lr_tmp, train_params['lr_lower_bound']) + elif train_params['lr_type'] == 'piecewise': + train_params['pw_boundaries'] = [float(i) * train_batch_num + + train_params['global_step'] for i in train_params['pw_boundaries']] + return tf.train.piecewise_constant(global_step, boundaries=[float(i) for i in train_params['pw_boundaries']], + values=train_params['pw_values'], + name='piecewise_learning_rate') + else: + raise ValueError('Unsupported learning rate type!') + + +def config_optimizer(optimizer_name, learning_rate, decay=0.9, momentum=0.9): + if optimizer_name == 'momentum': + return tf.train.MomentumOptimizer(learning_rate, momentum=momentum) + elif optimizer_name == 'rmsprop': + return tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=momentum) + elif optimizer_name == 'adam': + return tf.train.AdamOptimizer(learning_rate) + 
elif optimizer_name == 'sgd': + return tf.train.GradientDescentOptimizer(learning_rate) + else: + raise ValueError('Unsupported optimizer type!') + + +class AverageMeter(object): + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.average = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.average = self.sum / float(self.count) + + +def shuffle_and_overwrite(file_name): + content = open(file_name, 'r').readlines() + random.shuffle(content) + with open(file_name, 'w') as f: + for line in content: + f.write(line) diff --git a/src/traffic_analysis/d04_modelling/transfer_learning/train_tensorflow_model.py b/src/traffic_analysis/d04_modelling/transfer_learning/train_tensorflow_model.py new file mode 100644 index 0000000..f1a6073 --- /dev/null +++ b/src/traffic_analysis/d04_modelling/transfer_learning/train_tensorflow_model.py @@ -0,0 +1,289 @@ +# coding: utf-8 + +from __future__ import division, print_function +import os +import math +import tensorflow as tf +import numpy as np +import logging +from tqdm import trange + +from traffic_analysis.d04_modelling.transfer_learning.tensorflow_training_utils import get_batch_data, \ + make_summary, config_learning_rate, config_optimizer, AverageMeter +from traffic_analysis.d04_modelling.transfer_learning.tensorflow_evaluation_utils import evaluate_on_gpu, \ + get_preds_gpu, voc_eval, parse_gt_rec +from traffic_analysis.d04_modelling.transfer_learning.tensorflow_processing_utils import gpu_nms +from traffic_analysis.d04_modelling.transfer_learning.tensorflow_model_loader import YoloV3 +from traffic_analysis.d04_modelling.transfer_learning.convert_darknet_to_tensorflow import parse_anchors +from traffic_analysis.d04_modelling.transfer_learning.tensorflow_detection_utils import read_class_names + + +def transfer_learn(paths, params, train_params, train_file, test_file, selected_labels): + """ trains last three layers of 
yolov3 network on custom dataset + """ + + transfer_learn_model_dir = os.path.join(paths['local_detection_model'], train_params['trained_model_name']) + if not os.path.exists(transfer_learn_model_dir): + os.makedirs(transfer_learn_model_dir) + + truth_dir_path = paths['temp_annotation'] + class_name_path = os.path.join(paths['local_detection_model'], 'yolov3', 'coco.names') # CHANGE THIS + classes = read_class_names(class_name_path) + + selected_label_idxs = [] + for idx, label in classes.items(): + if label in selected_labels: + selected_label_idxs.append(idx) + anchors = parse_anchors(paths) + number_classes = len(classes) + + train_data_path = os.path.join(truth_dir_path, train_file) + test_data_path = os.path.join(truth_dir_path, test_file) + train_img_cnt = len(open(train_data_path, 'r').readlines()) + val_img_cnt = len(open(test_data_path, 'r').readlines()) + train_batch_num = int(math.ceil(float(train_img_cnt) / train_params['num_batches'])) + + lr_decay_freq = int(train_batch_num * train_params['lr_decay_epoch']) + + logging_file_path = os.path.join(truth_dir_path, 'progress.log') + logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s', + datefmt='%a, %d %b %Y %H:%M:%S', filename=logging_file_path, filemode='w') + + is_training = tf.placeholder(tf.bool, name="phase_train") + handle_flag = tf.placeholder(tf.string, [], name='iterator_handle_flag') + pred_boxes_flag = tf.placeholder(tf.float32, [1, None, None]) + pred_scores_flag = tf.placeholder(tf.float32, [1, None, None]) + gpu_nms_op = gpu_nms(pred_boxes_flag, pred_scores_flag, number_classes, train_params['nms_topk'], + train_params['score_threshold'], train_params['nms_threshold']) + + train_dataset = tf.data.TextLineDataset(train_data_path) + train_dataset = train_dataset.shuffle(train_img_cnt) + train_dataset = train_dataset.batch(train_params['num_batches']) + train_dataset = train_dataset.map( + lambda x: tf.py_func(get_batch_data, + inp=[x, number_classes, [416, 
416], anchors, 'train', True, True, True], + Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]), + num_parallel_calls=train_params['num_threads']) + train_dataset = train_dataset.prefetch(train_params['prefetech_buffer']) + + test_dataset = tf.data.TextLineDataset(test_data_path) + test_dataset = test_dataset.batch(1) + test_dataset = test_dataset.map( + lambda x: tf.py_func(get_batch_data, + inp=[x, number_classes, [416, 416], anchors, 'val', False, False, True], + Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]), + num_parallel_calls=train_params['num_threads']) + test_dataset.prefetch(train_params['prefetech_buffer']) + + iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes) + train_init_op = iterator.make_initializer(train_dataset) + val_init_op = iterator.make_initializer(test_dataset) + + # get an element from the chosen dataset iterator + image_ids, image, y_true_13, y_true_26, y_true_52 = iterator.get_next() + y_true = [y_true_13, y_true_26, y_true_52] + + # tf.data pipeline will lose the data `static` shape, so we need to set it manually + image_ids.set_shape([None]) + image.set_shape([None, None, None, 3]) + for y in y_true: + y.set_shape([None, None, None, None, None]) + + # define model + yolo_model = YoloV3(number_classes, anchors, use_label_smooth=True, use_focal_loss=True, + batch_norm_decay=train_params['batch_norm_decay'], weight_decay=train_params['weight_decay'], + use_static_shape=False) + + with tf.variable_scope('YoloV3'): + pred_feature_maps = yolo_model.forward(image, is_training=is_training) + loss = yolo_model.compute_loss(pred_feature_maps, y_true) + y_pred = yolo_model.predict(pred_feature_maps) + + l2_loss = tf.losses.get_regularization_loss() + + # setting restore parts and vars to update + saver_to_restore = tf.train.Saver( + var_list=tf.contrib.framework.get_variables_to_restore( + include=None, + exclude=['YoloV3/yolov3_head/Conv_14', 
'YoloV3/yolov3_head/Conv_6', 'YoloV3/yolov3_head/Conv_22'])) + update_vars = tf.contrib.framework.get_variables_to_restore(include=['YoloV3/yolov3_head']) + + tf.summary.scalar('train_batch_statistics/total_loss', loss[0]) + tf.summary.scalar('train_batch_statistics/loss_xy', loss[1]) + tf.summary.scalar('train_batch_statistics/loss_wh', loss[2]) + tf.summary.scalar('train_batch_statistics/loss_conf', loss[3]) + tf.summary.scalar('train_batch_statistics/loss_class', loss[4]) + tf.summary.scalar('train_batch_statistics/loss_l2', l2_loss) + tf.summary.scalar('train_batch_statistics/loss_ratio', l2_loss / loss[0]) + + global_step = tf.Variable(float(train_params['global_step']), + trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES]) + + learning_rate = tf.cond(tf.less(global_step, train_batch_num * train_params['warm_up_epoch']), + lambda: train_params['learning_rate_init'] * + global_step / (train_batch_num * train_params['warm_up_epoch']), + lambda: config_learning_rate(lr_decay_freq=lr_decay_freq, train_batch_num=train_batch_num, + global_step=global_step - + train_batch_num * train_params['warm_up_epoch'])) + tf.summary.scalar('learning_rate', learning_rate) + + if not train_params['save_optimizer']: + saver_to_save = tf.train.Saver() + saver_best = tf.train.Saver() + + optimizer = config_optimizer(train_params['optimizer_name'], learning_rate) + + # set dependencies for BN ops + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(update_ops): + # train_op = optimizer.minimize(loss[0] + l2_loss, var_list=update_vars, global_step=global_step) + # apply gradient clip to avoid gradient exploding + gvs = optimizer.compute_gradients(loss[0] + l2_loss, var_list=update_vars) + clip_grad_var = [gv if gv[0] is None else [ + tf.clip_by_norm(gv[0], 100.), gv[1]] for gv in gvs] + train_op = optimizer.apply_gradients(clip_grad_var, global_step=global_step) + + if train_params['save_optimizer']: + print('Saving optimizer parameters to 
checkpoint! Remember to restore global_step in fine-tuning afterwards.') + saver_to_save = tf.train.Saver() + saver_best = tf.train.Saver() + + tensorboard_log_path = os.path.join(truth_dir_path, 'tensorboard_logs') + yolov3_tensorflow_path = os.path.join(paths['local_detection_model'], params['detection_model'], 'yolov3.ckpt') + with tf.Session() as sess: + sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()]) + saver_to_restore.restore(sess, yolov3_tensorflow_path) + merged = tf.summary.merge_all() + writer = tf.summary.FileWriter(tensorboard_log_path, sess.graph) + + print('\n----------- start to train -----------\n') + + best_mAP = -np.Inf + + for epoch in range(train_params['total_epochs']): + + sess.run(train_init_op) + loss_total, loss_xy, loss_wh, loss_conf, loss_class = AverageMeter(), AverageMeter(), AverageMeter(), \ + AverageMeter(), AverageMeter() + + for i in trange(train_batch_num): + _, summary, __y_pred, __y_true, __loss, __global_step, __lr = sess.run( + [train_op, merged, y_pred, y_true, loss, global_step, learning_rate], + feed_dict={is_training: True}) + + writer.add_summary(summary, global_step=__global_step) + + loss_total.update(__loss[0], len(__y_pred[0])) + loss_xy.update(__loss[1], len(__y_pred[0])) + loss_wh.update(__loss[2], len(__y_pred[0])) + loss_conf.update(__loss[3], len(__y_pred[0])) + loss_class.update(__loss[4], len(__y_pred[0])) + + if __global_step % train_params['train_evaluation_step'] == 0 and __global_step > 0: + recall, precision = evaluate_on_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, + __y_pred, __y_true, number_classes, + train_params['nms_threshold']) + + info = "Epoch: {}, global_step: {} | loss: total: {:.2f}, xy: {:.2f}, " \ + "wh: {:.2f}, conf: {:.2f}, class: {:.2f} | ".format(epoch, int(__global_step), + loss_total.average, loss_xy.average, + loss_wh.average, loss_conf.average, + loss_class.average) + info += 'Last batch: rec: {:.3f}, prec: {:.3f} | lr: 
{:.5g}'.format(recall, precision, __lr) + print(info) + logging.info(info) + + writer.add_summary(make_summary('evaluation/train_batch_recall', recall), + global_step=__global_step) + writer.add_summary(make_summary('evaluation/train_batch_precision', precision), + global_step=__global_step) + + if np.isnan(loss_total.average): + print('****' * 10) + raise ArithmeticError( + 'Gradient exploded! Please train again and you may need modify some parameters.') + + # NOTE: this is just demo. You can set the conditions when to save the weights. + if epoch % train_params['save_epoch'] == 0 and epoch > 0: + if loss_total.average <= 2.: + saver_to_save.save(sess, + os.path.join(train_params['trained_model_name'], + 'model-epoch_{}_step_{}_loss_{:.4f}_lr_{:.5g}'.format( + epoch, int(__global_step), loss_total.average, __lr))) + + # switch to validation dataset for evaluation + if epoch % train_params['val_evaluation_epoch'] == 0 and epoch >= train_params['warm_up_epoch']: + sess.run(val_init_op) + + val_loss_total, val_loss_xy, val_loss_wh, val_loss_conf, val_loss_class = \ + AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter() + + val_preds = [] + + for j in trange(val_img_cnt): + __image_ids, __y_pred, __loss = sess.run([image_ids, y_pred, loss], + feed_dict={is_training: False}) + pred_content = get_preds_gpu(sess, gpu_nms_op, pred_boxes_flag, + pred_scores_flag, __image_ids, __y_pred) + val_preds.extend(pred_content) + val_loss_total.update(__loss[0]) + val_loss_xy.update(__loss[1]) + val_loss_wh.update(__loss[2]) + val_loss_conf.update(__loss[3]) + val_loss_class.update(__loss[4]) + + # calc mAP + rec_total, prec_total, ap_total = AverageMeter(), AverageMeter(), AverageMeter() + gt_dict = parse_gt_rec(test_data_path, [416, 416], letterbox_resize=True) + + info = '======> Epoch: {}, global_step: {}, lr: {:.6g} <======\n'.format(epoch, __global_step, __lr) + + for class_idx in range(number_classes): + if class_idx in selected_label_idxs: + npos, 
nd, rec, prec, ap = voc_eval(gt_dict, val_preds, class_idx, + iou_thres=train_params['eval_threshold'], + use_07_metric=True) + info += 'EVAL: Class {}: Recall: {:.4f}, Precision: {:.4f}, AP: {:.4f}\n'.format(class_idx, + rec, prec, ap) + + if math.isnan(rec) or math.isnan(prec) or math.isnan(ap): + pass + else: + rec_total.update(rec, npos) + prec_total.update(prec, nd) + ap_total.update(ap, 1) + + mAP = ap_total.average + info += 'EVAL: Recall: {:.4f}, Precison: {:.4f}, mAP: {:.4f}\n'.format( + rec_total.average, prec_total.average, mAP) + info += 'EVAL: loss: total: {:.2f}, xy: {:.2f}, wh: {:.2f}, conf: {:.2f}, class: {:.2f}\n'.format( + val_loss_total.average, val_loss_xy.average, val_loss_wh.average, + val_loss_conf.average, val_loss_class.average) + print(info) + logging.info(info) + + if mAP > best_mAP: + best_mAP = mAP + saver_best.save(sess, os.path.join( + transfer_learn_model_dir, + 'best_model_Epoch_{}_step_{}_mAP_{:.4f}_loss_{:.4f}_lr_{:.7g}'.format( + epoch, int(__global_step), best_mAP, val_loss_total.average, __lr))) + + writer.add_summary(make_summary('evaluation/val_mAP', mAP), + global_step=epoch) + writer.add_summary(make_summary('evaluation/val_recall', rec_total.average), + global_step=epoch) + writer.add_summary(make_summary('evaluation/val_precision', prec_total.average), + global_step=epoch) + writer.add_summary(make_summary('validation_statistics/total_loss', val_loss_total.average), + global_step=epoch) + writer.add_summary(make_summary('validation_statistics/loss_xy', val_loss_xy.average), + global_step=epoch) + writer.add_summary(make_summary('validation_statistics/loss_wh', val_loss_wh.average), + global_step=epoch) + writer.add_summary(make_summary('validation_statistics/loss_conf', val_loss_conf.average), + global_step=epoch) + writer.add_summary(make_summary('validation_statistics/loss_class', val_loss_class.average), + global_step=epoch) + + return diff --git 
import os
import sys
import xml.etree.ElementTree as ET
from enum import Enum

import cv2
import numpy as np
from PIL import Image

from traffic_analysis.d00_utils.load_confs import load_paths, load_credentials
from traffic_analysis.d00_utils.data_loader_s3 import DataLoaderS3
from traffic_analysis.d00_utils.data_retrieval import delete_and_recreate_dir, mp4_to_npy
from traffic_analysis.d02_ref.ref_utils import get_s3_video_path_from_xml_name
from traffic_analysis.d00_utils.get_project_directory import get_project_directory
from traffic_analysis.d04_modelling.transfer_learning.tensorflow_detection_utils import read_class_names


class TransferDataset(Enum):
    """Annotated datasets available for transfer-learning the detector."""
    detrac = 1
    cvat = 2


class TrainingDataLoader(object):
    """Downloads annotated images and labels from S3 for detector training.

    Each label is a single string of the form
    ``"<frame_idx> <image_path> <width> <height> [<class_idx> <x_min> <y_min> <x_max> <y_max>]..."``
    (one per annotated frame), matching the format consumed by the YOLOv3
    training pipeline.
    """

    def __init__(self, datasets, creds, paths):
        """
        Args:
            datasets: iterable of TransferDataset members to load.
            creds: credentials dict, keyed by paths['s3_creds'].
            paths: project paths dict (S3 prefixes and local temp dirs).
        """
        self.datasets = datasets
        self.creds = creds
        self.paths = paths
        # dispatch table: dataset -> loader method
        self.load_mapping = {TransferDataset.detrac: self.load_detrac_data,
                             TransferDataset.cvat: self.load_cvat_data}

        self.data_loader_s3 = DataLoaderS3(s3_credentials=creds[paths['s3_creds']],
                                           bucket_name=paths['bucket_name'])

    def get_train_and_test(self, train_fraction):
        """Split the loaded (image, label) pairs into train and test sets.

        Note: the split is positional, not shuffled, so the ordering of the
        underlying S3 listing determines set membership.

        Returns:
            (x_train, y_train, x_test, y_test)
        """
        x, y = self.load_data_from_s3()

        split = int(len(x) * train_fraction)
        x_train, y_train = x[:split], y[:split]
        x_test, y_test = x[split:], y[split:]

        return x_train, y_train, x_test, y_test

    def load_data_from_s3(self):
        """Load every configured dataset, returning (images, labels)."""
        self.clear_temp_folders()

        xs = []
        ys = []
        for dataset in self.datasets:
            x, y = self.load_mapping[dataset]()
            assert len(x) == len(y), ("Mismatch in number of input and output pairs! "
                                      "(Dataset: " + dataset.name + ")")
            xs += x
            ys += y

        return xs, ys

    def clear_temp_folders(self):
        """Wipe and recreate the local temp dirs used for downloads."""
        delete_and_recreate_dir(self.paths['temp_annotation'])
        delete_and_recreate_dir(self.paths['temp_raw_images'])
        delete_and_recreate_dir(self.paths['temp_raw_video'])

    def _vehicle_type_to_idx(self, vehicle_type, classes):
        """Map a vehicle-type string to its class index, or None if unknown.

        Vans map to class index 2 (car) because the model does not
        distinguish them. Shared by the DETRAC and CVAT parsers, which
        previously duplicated this lookup inline.
        """
        if vehicle_type == 'van':
            return 2  # say vans are cars because we don't distinguish
        for tick in range(len(classes)):
            if classes[tick] == vehicle_type:
                return tick
        return None

    def load_detrac_data(self):
        """Load DETRAC annotations and the corresponding frame images."""
        print('Parsing detrac xmls...')
        y = []
        xml_files = self.data_loader_s3.list_objects(prefix=self.paths['s3_detrac_annotations'])
        # NOTE(review): cap kept from the original code — only the first 11
        # xml files are parsed; looks like a leftover debug limit. TODO confirm.
        count = 0
        for xml_file in xml_files:
            result = self.parse_detrac_xml_file(xml_file)
            if result:
                y += result

            if count == 10:
                break
            count += 1

        print('Loading detrac images...')
        x = []
        for labels in y:
            tokens = labels.split(' ')
            image_num = tokens[0].zfill(5)
            folder = tokens[1].split('/')[-1][:9]  # e.g. 'MVI_20011'

            file_to_download = (self.paths['s3_detrac_images'] +
                                folder + '/' +
                                'img' + image_num + '.jpg')

            download_file_to = (self.paths['temp_raw_images'] +
                                folder + '_' +
                                image_num + '.jpg')

            self.data_loader_s3.download_file(
                path_of_file_to_download=file_to_download,
                path_to_download_file_to=download_file_to)

            img = Image.open(download_file_to)
            img.load()
            x.append(np.asarray(img, dtype="int32"))

        return x, y

    def parse_detrac_xml_file(self, xml_file):
        """Parse one DETRAC annotation xml into per-frame label strings.

        Returns:
            list of label strings, or None when the file could not be
            downloaded or contains at most one annotated frame.
        """
        project_dir = get_project_directory()
        image_dir = os.path.join(project_dir, self.paths['temp_raw_images'])

        xml_file_name = xml_file.split('/')[-1]
        xml_path = self.paths['temp_annotation'] + xml_file_name

        class_names_path = os.path.join(self.paths['local_detection_model'], 'yolov3', 'coco.names')
        classes = read_class_names(class_names_path)

        try:
            self.data_loader_s3.download_file(path_of_file_to_download=xml_file,
                                              path_to_download_file_to=xml_path)
        except Exception:
            # fix: previously a bare `except:` swallowed the error and
            # execution continued into ET.parse() on a missing file
            print("Could not download file " + xml_file)
            return None

        root = ET.parse(xml_path).getroot()

        # label format per frame:
        #   image_index image_path image_width image_height
        #   [label_index x_min y_min x_max y_max]...

        # DETRAC frames are a fixed 960x540 resolution
        im_width = 960
        im_height = 540

        results = []
        for track in root.iter('frame'):
            frame_str = str(track.attrib['num']).zfill(5)
            im_path = os.path.join(image_dir, xml_file_name[:-4] + '_' + frame_str + '.jpg')
            result = str(track.attrib['num']) + \
                ' ' + str(im_path) + \
                ' ' + str(im_width) + \
                ' ' + str(im_height)

            for frame_obj in track.iter('target'):
                vehicle_type = frame_obj.find('attribute').attrib['vehicle_type']
                vehicle_type_idx = self._vehicle_type_to_idx(vehicle_type, classes)
                if vehicle_type_idx is None:
                    # fix: previously an unknown type silently reused the
                    # stale index from the previous object (or raised
                    # UnboundLocalError on the first one)
                    print("Skipping unknown vehicle type: " + str(vehicle_type))
                    continue

                box = frame_obj.find('box').attrib
                left = float(box['left'])
                top = float(box['top'])
                width = float(box['width'])
                height = float(box['height'])

                # DETRAC boxes are (left, top, width, height); convert to corners
                x_min = left
                y_min = top
                x_max = left + width
                y_max = top + height

                result += ' ' + str(vehicle_type_idx) + \
                    ' ' + str(x_min) + \
                    ' ' + str(y_min) + \
                    ' ' + str(x_max) + \
                    ' ' + str(y_max)

            results.append(result)

        # require at least two annotated frames, as in the original code
        if len(results) > 1:
            return results
        return None

    def load_cvat_data(self):
        """Load CVAT-annotated videos: parse labels, then extract frames."""
        print('Parsing cvat xmls...')
        y = []
        xml_files = self.data_loader_s3.list_objects(prefix=self.paths['s3_cvat_training_annotations'])
        vid_names = []
        for xml_file in xml_files:
            result, vid_name = self.parse_cvat_xml_file(xml_file)
            if result:
                y += result
                vid_names.append(vid_name)

        print('Loading cvat videos...')

        # Build the set of videos needed and the frame paths referenced by labels
        video_set = set()
        image_paths_in_y = set()  # set: O(1) membership per extracted frame
        for labels in y:
            label_im_path = labels.split(' ')[1]
            video_set.add(label_im_path.split('/')[-1][:-10])  # strip '_NNNNN.jpg'
            image_paths_in_y.add(label_im_path)

        x = []
        for vid_id in video_set:
            video, video_path = self.get_cvat_video(vid_id)
            if video_path is None:
                # fix: previously a missing video produced a bare None return
                # and crashed on tuple unpacking / path handling
                print("Could not fetch video for " + vid_id)
                continue
            video_name = video_path.split('/')[-1][:-4]
            vidcap = cv2.VideoCapture(video_path)

            count = 0
            while vidcap.isOpened():
                success, image = vidcap.read()
                if not success:
                    break
                image_num = str(count).zfill(5)
                im_path = os.path.join(self.paths['temp_raw_images'],
                                       video_name + '_' + image_num + '.jpg')
                if im_path in image_paths_in_y:
                    x.append(np.asarray(image, dtype="int32"))
                    cv2.imwrite(im_path, image)
                count += 1
            vidcap.release()
        return x, y

    def get_cvat_video(self, xml_file_name):
        """Download the video matching a CVAT xml name.

        Returns:
            (frames_array, local_video_path), or (None, None) when no
            matching S3 video exists.
        """
        video_path = get_s3_video_path_from_xml_name(xml_file_name=xml_file_name,
                                                     s3_creds=self.creds[self.paths['s3_creds']],
                                                     paths=self.paths)
        if not video_path:
            # fix: previously returned bare None, which broke tuple
            # unpacking at the call site in load_cvat_data
            return None, None

        download_file_to = os.path.join(self.paths['temp_raw_video'], xml_file_name + '.mp4')
        self.data_loader_s3.download_file(path_of_file_to_download=video_path,
                                          path_to_download_file_to=download_file_to)
        return mp4_to_npy(download_file_to), download_file_to

    def parse_cvat_xml_file(self, xml_file):
        """Parse one CVAT annotation xml into per-frame label strings.

        Returns:
            (results, vid_name), or (None, None) on download failure or when
            at most one frame is annotated.
        """
        path = self.paths['temp_annotation'] + xml_file.split('/')[-1]

        try:
            self.data_loader_s3.download_file(path_of_file_to_download=xml_file,
                                              path_to_download_file_to=path)
        except Exception:
            # fix: previously a bare `except:` swallowed the error and
            # execution continued into ET.parse() on a missing file
            print("Could not download file " + xml_file)
            return None, None

        root = ET.parse(path).getroot()
        vid_name = path.split('/')[-1][:-4]
        im_dir = self.paths['temp_raw_images']

        # CVAT traffic-camera frames are a fixed 352x288 resolution
        im_width = 352
        im_height = 288

        class_names_path = os.path.join(self.paths['local_detection_model'], 'yolov3', 'coco.names')
        classes = read_class_names(class_names_path)

        frame_dict = {}
        for track in root.iter('track'):
            for frame in track.iter('box'):
                frame_num = frame.attrib['frame']

                if frame_num not in frame_dict:
                    frame_name = str(frame_num).zfill(5)
                    im_path = os.path.join(im_dir, vid_name + '_' + frame_name + '.jpg')
                    frame_dict[frame_num] = str(frame_num) + ' ' + \
                        str(im_path) + ' ' + \
                        str(im_width) + ' ' + \
                        str(im_height)

                if track.attrib['label'] != 'vehicle':
                    continue

                # NOTE(review): assumes the third <attribute> element holds the
                # vehicle type — TODO confirm against the CVAT export schema
                vehicle_type = frame.findall('attribute')[2].text
                vehicle_type_idx = self._vehicle_type_to_idx(vehicle_type, classes)
                if vehicle_type_idx is None:
                    # fix: previously an unknown type silently reused the
                    # stale index from the previous box (or raised
                    # UnboundLocalError on the first one)
                    print("Skipping unknown vehicle type: " + str(vehicle_type))
                    continue

                x_min = float(frame.attrib['xtl'])
                y_min = float(frame.attrib['ytl'])
                x_max = float(frame.attrib['xbr'])
                y_max = float(frame.attrib['ybr'])

                frame_dict[frame_num] += ' ' + str(vehicle_type_idx) + \
                    ' ' + str(x_min) + \
                    ' ' + str(y_min) + \
                    ' ' + str(x_max) + \
                    ' ' + str(y_max)

        results = list(frame_dict.values())

        # require at least two annotated frames, as in the original code
        if len(results) > 1:
            return results, vid_name
        # fix: previously returned bare None, which broke tuple unpacking
        # at the call site in load_cvat_data
        return None, None