
Commit b6c7473

make sure we get the right (img, lab) pair when loading the dataset
1 parent 03b6fc0 commit b6c7473

File tree

4 files changed (+41, -27 lines)


config/train_dist.yaml

+9-8
@@ -10,7 +10,7 @@ train_hyp:
   test_img_dir: "./data/testimage" # for test
   val_img_dir: "../../Dataset/Segmentation/cityscapes/image/val/" # validation image dir
   val_seg_dir: "../../Dataset/Segmentation/cityscapes/label/val/" # validation label dir
-  cache_num: 500000
+  cache_num: 0
   input_img_size: # size of the images fed to the network
   - 448
   - 448
@@ -21,32 +21,33 @@ train_hyp:
   num_workers: 0 # parameter of the PyTorch DataLoader
   total_epoch: 1000
   device: "gpu" # whether to train on GPU ['gpu' or 'cpu']
-  accu_batch_size: 64 # gradient accumulation batch size
+  accu_batch_size: 48 # gradient accumulation batch size
   do_ema: true # whether to maintain an exponential moving average model
   use_tta_when_val: false # whether to use TTA during validation
   mutil_scale_training: false # whether to use multi-scale training
   enable_tensorboard: true
   enable_data_aug: true
-  random_seed: 3047
+  random_seed: 7
   fp16: false
   inference_every: 5 # validate every N epochs
   show_tbar_every: 5 # show live training status every N steps
   save_ckpt_every: 5 # save a checkpoint every N epochs
   calculate_metric_every: 5 # compute IoU every N epochs
+  log_postfix: 'sgd_relu_onecycle'

 optimizer_hyp:
   optimizer_type: 'sgd' # 'sgd' or 'adamw' or 'adam'
-  scheduler_type: 'cosine' # 'onecycle' or 'cosine' or 'linear'
+  scheduler_type: 'onecycle' # 'onecycle' or 'cosine' or 'linear'
   basic_lr_per_img: 0.000625 # 0.01 / 16
   weight_decay: 0.0
-  optimizer_momentum: 0.9
+  optimizer_momentum: 0.98
   eps: 0.00000001

 warm_up:
   do_warmup: true # whether to enable warmup training
   warmup_epoch: 3
-  warmup_bias_lr: 0.1
-  warmup_momentum: 0.8
+  warmup_bias_lr: 0.2
+  warmup_momentum: 0.95

 data_aug_hyp:
   data_aug_saturation_p: 0.1
@@ -65,7 +66,7 @@ data_aug_hyp:
   data_aug_fliplr_p: 0.5
   data_aug_flipud_p: 0.0
   data_aug_fill_value: 114
-  data_aug_cutout_p: 0.05
+  data_aug_cutout_p: 0.0
   data_aug_brightness_p: 0.1
   data_aug_cutout_iou_thr: 0.3 # if a randomly generated cutout mask overlaps any target bbox with IoU above this threshold, take action to avoid it (the default action is to discard that mask)
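The comment on basic_lr_per_img ("0.01 / 16") hints at the common linear lr-scaling convention: the initial learning rate is the per-image rate times the total number of images per optimizer step. A minimal sketch of that arithmetic, assuming that convention (the batch-size numbers below are hypothetical, not from this repo):

    # initial lr scales linearly with the total batch size
    basic_lr_per_img = 0.000625      # from the config: 0.01 / 16
    batch_size_per_gpu = 8           # hypothetical
    world_size = 2                   # hypothetical number of GPUs
    init_lr = basic_lr_per_img * batch_size_per_gpu * world_size
    print(init_lr)                   # 0.01 for a total batch size of 16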

data/dataloader.py

+24-11
@@ -93,10 +93,10 @@ class CitySpaceDataset(Dataset):

     def __init__(self, img_dir, seg_dir, img_size, enable_data_aug=True, transform=None, cache_num=0) -> None:
         super(CitySpaceDataset, self).__init__(enable_data_aug=enable_data_aug, input_dimension=img_size)
-        self.img_dir = img_dir
-        self.seg_dir = seg_dir
+        self.img_dir = Path(img_dir)
+        self.seg_dir = Path(seg_dir)
         self.trans = transform
-        self.db_img, self.db_seg = self.make_db()
+        self.filenames = self.make_db()
         self.imgs = None
         if cache_num > 0:
             self.cache_num = cache_num if cache_num <= len(self) else len(self)  # len(self)
@@ -110,8 +110,8 @@ def make_db(self):
         assert Path(self.img_dir).exists(), f"directory: {self.img_dir} does not exist!"
         assert Path(self.seg_dir).exists(), f"directory: {self.seg_dir} does not exist!"

-        img_filepathes = [p for p in Path(self.img_dir).iterdir() if p.suffix in ([".jpg", ".png", ".tiff"])]
-        seg_filepathes = [p for p in Path(self.seg_dir).iterdir() if p.suffix in ([".jpg", ".png", ".tiff"])]
+        img_filepathes = [p for p in self.img_dir.iterdir() if p.suffix in ([".jpg", ".png", ".tiff"])]
+        seg_filepathes = [p for p in self.seg_dir.iterdir() if p.suffix in ([".jpg", ".png", ".tiff"])]
         assert len(img_filepathes) == len(seg_filepathes), f"len(img_filepathes): {len(img_filepathes)}, but len(seg_filepathes): {len(seg_filepathes)}"
         # (aachen, 000062, 000019)
         img_filepathes = sorted(img_filepathes, key=lambda x: (x.stem.split("_")[0], x.stem.split("_")[1], x.stem.split("_")[2]))
@@ -121,17 +121,22 @@ def make_db(self):
         for i, p in enumerate(img_filepathes):
             img_filename = '_'.join(p.stem.split("_")[:-1])
             assert img_filename in seg_filenames, f"image filename: {img_filepathes[i]}, cannot find a matched segmentation file."
-        return img_filepathes, seg_filepathes
+        return seg_filenames

     def __len__(self):
-        return len(self.db_img)
+        return len(self.filenames)

     def load_resized_data_pair(self, index):
-        img_p = self.db_img[index]
-        seg_p = self.db_seg[index]
+        filename = self.filenames[index]
+        img_p = self.img_dir / f"{filename}_leftImg8bit.png"
+        assert img_p.exists(), f"{img_p} does not exist!"
+        seg_p = self.seg_dir / f"{filename}_gtFine_labelTrainIds.png"
+        assert seg_p.exists(), f"{seg_p} does not exist!"

         img_arr = cv2.imread(str(img_p))  # (h, w, 3)
         img_arr = cv2.cvtColor(img_arr, cv2.COLOR_BGR2RGB)
         seg_arr = cv2.imread(str(seg_p), 0)[:, :, None]  # (h, w, 1)
+        assert img_arr.shape[0] == seg_arr.shape[0] and img_arr.shape[1] == seg_arr.shape[1], f"img_arr's and seg_arr's shapes should match, but img_arr.shape={img_arr.shape[:2]} and seg_arr.shape={seg_arr.shape[:2]}"
         # the background class in the Cityscapes dataset has mask value 255; remap the background to 0
         bg_mask = seg_arr == 255
         seg_arr += 1
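The context lines above shift every trainId up by one and record where the Cityscapes ignore value 255 sits; presumably the next line (not shown in this hunk) zeroes those pixels out. A tiny sketch of that remap, under that assumption:

    import numpy as np

    seg_arr = np.array([[3, 0, 255]], dtype=np.int64)  # toy trainIds
    bg_mask = seg_arr == 255
    seg_arr += 1               # classes 0..18 become 1..19
    seg_arr[bg_mask] = 0       # assumption: the unshown follow-up line does this
    print(seg_arr)             # [[4 1 0]]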
@@ -193,10 +198,17 @@ def pull_item(self, index):
             img_arr = data_pair[..., :3]
             seg_arr = data_pair[..., -1:]
         else:
-            img_p = self.db_img[index]
-            seg_p = self.db_seg[index]
+            filename = self.filenames[index]
+            img_p = self.img_dir / f"{filename}_leftImg8bit.png"
+            assert img_p.exists(), f"{img_p} does not exist!"
+            seg_p = self.seg_dir / f"{filename}_gtFine_labelTrainIds.png"
+            assert seg_p.exists(), f"{seg_p} does not exist!"

             img_arr = cv2.imread(str(img_p))  # (h, w, 3)
+            img_arr = cv2.cvtColor(img_arr, cv2.COLOR_BGR2RGB)
             seg_arr = cv2.imread(str(seg_p), 0)[:, :, None]  # (h, w, 1)
+            assert img_arr.shape[0] == seg_arr.shape[0] and img_arr.shape[1] == seg_arr.shape[1], f"img_arr's and seg_arr's shapes should match, but img_arr.shape={img_arr.shape[:2]} and seg_arr.shape={seg_arr.shape[:2]}"

         # the background class in the Cityscapes dataset has mask value 255; remap the background to 0
         bg_mask = seg_arr == 255
         seg_arr += 1
@@ -207,6 +219,7 @@ def pull_item(self, index):

     @Dataset.aug_getitem
     def __getitem__(self, index):
+        assert index < len(self), f"index should be less than {len(self)}, but got {index}"
         img_arr, seg_arr = self.pull_item(index)

         if self.enable_data_aug and self.trans is not None:
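This is the heart of the commit: before, make_db returned two independently sorted path lists and pairing relied on index alignment, so one stray file could silently shift every subsequent (img, label) pair. Now a single Cityscapes filename stem drives both paths. A self-contained sketch of the idea (resolve_pair is an illustrative helper, not a function in this repo):

    from pathlib import Path

    def resolve_pair(img_dir: Path, seg_dir: Path, stem: str):
        # stem is a Cityscapes (city, sequence, frame) triple, e.g. "aachen_000062_000019"
        img_p = img_dir / f"{stem}_leftImg8bit.png"
        seg_p = seg_dir / f"{stem}_gtFine_labelTrainIds.png"
        assert img_p.exists() and seg_p.exists(), f"missing file for stem: {stem}"
        return img_p, seg_p

Because both paths are derived from the same stem, a missing or misnamed file now fails loudly at the offending index instead of corrupting the whole pairing.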

nets/usquarenet_experiment.py

+2-2
@@ -53,9 +53,9 @@ def __init__(self, in_channel, num_class, kernel=3, stride=1, padding=1, dilatio
         super(ConvBnAct, self).__init__()
         self.conv = nn.Conv2d(in_channel, num_class, kernel, stride, padding=padding, dilation=dilation, bias=bias)
         self.bn = nn.BatchNorm2d(num_class)
-        # self.act = nn.SiLU(inplace=True) if act else nn.Identity()
+        self.act = nn.SiLU(inplace=True) if act else nn.Identity()
         # self.act = nn.ReLU(inplace=True) if act else nn.Identity()
-        self.act = nn.LeakyReLU(negative_slope=0.01, inplace=True) if act else nn.Identity()
+        # self.act = nn.LeakyReLU(negative_slope=0.01, inplace=True) if act else nn.Identity()

     def forward(self, x):
         x = self.conv(x)
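After this toggle the block is Conv -> BN -> SiLU. For reference, a minimal runnable version of the block as it stands after the change (a sketch: the forward body past x = self.conv(x) is not shown in the diff, so the conv -> bn -> act order is an assumption, albeit the standard one):

    import torch
    import torch.nn as nn

    class ConvBnAct(nn.Module):
        def __init__(self, in_channel, num_class, kernel=3, stride=1,
                     padding=1, dilation=1, bias=False, act=True):
            super().__init__()
            self.conv = nn.Conv2d(in_channel, num_class, kernel, stride,
                                  padding=padding, dilation=dilation, bias=bias)
            self.bn = nn.BatchNorm2d(num_class)
            # this commit switches the activation back from LeakyReLU to SiLU
            self.act = nn.SiLU(inplace=True) if act else nn.Identity()

        def forward(self, x):
            return self.act(self.bn(self.conv(x)))  # assumed conv -> bn -> act order

    print(ConvBnAct(3, 16)(torch.randn(1, 3, 64, 64)).shape)  # torch.Size([1, 16, 64, 64])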

train_ddp.py

+6-6
@@ -82,7 +82,7 @@ def __init__(self, hyp):

         # config warmup step
         if self.hyp['do_warmup']:
-            self.hyp['warmup_steps'] = max(self.hyp.get('warmup_epoch', 3) * len(self.traindataloader), 3000)
+            self.hyp['warmup_steps'] = max(self.hyp.get('warmup_epoch', 3) * len(self.traindataloader), 1000)

     def load_dataset(self, is_training):
         if is_training:
@@ -114,7 +114,7 @@ def _init_logger(self, model):
         logger = logging.getLogger(f"UPerNet_Rank_{self.rank}")
         formated_config = print_config(self.hyp)  # record training parameters in log.txt
         logger.setLevel(logging.INFO)
-        txt_log_path = str(self.cwd / 'log' / f'log_rank_{self.rank}' / f'log_{self.model.__class__.__name__}_{datetime.now().strftime("%Y%m%d-%H:%M:%S")}.txt')
+        txt_log_path = str(self.cwd / 'log' / f'log_rank_{self.rank}' / f'log_{self.model.__class__.__name__}_{datetime.now().strftime("%Y%m%d-%H:%M:%S")}_{self.hyp["log_postfix"]}.txt')
         maybe_mkdir(Path(txt_log_path).parent)
         handler = logging.FileHandler(txt_log_path)
         handler.setLevel(logging.INFO)
@@ -188,13 +188,13 @@ def _init_bias(self):

     def _init_scheduler(self, optimizer, trainloader):
         if self.hyp['scheduler_type'].lower() == "onecycle":  # onecycle lr scheduler
-            scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=0.001, epochs=self.hyp['total_epoch'], steps_per_epoch=len(trainloader), three_phase=True)
+            scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=0.01, epochs=self.hyp['total_epoch'], steps_per_epoch=len(trainloader), three_phase=True)
         elif self.hyp['scheduler_type'].lower() == 'linear':  # linear lr scheduler
-            max_ds_rate = 0.0001
+            max_ds_rate = 0.01
             linear_lr = lambda epoch: (1 - epoch / (self.hyp['total_epoch'] - 1)) * (1. - max_ds_rate) + max_ds_rate  # the larger max_ds_rate is, the slower the lr decays and the higher the lr left at the end of training
             scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=linear_lr)
         else:  # cosine lr scheduler
-            max_ds_rate = 0.0001  # the minimum lr over the whole run equals max_ds_rate * init_lr
+            max_ds_rate = 0.01  # the minimum lr over the whole run equals max_ds_rate * init_lr
             cosin_lr = lambda epoch: ((1 + math.cos(epoch * math.pi / self.hyp['total_epoch'])) / 2) * (1. - max_ds_rate) + max_ds_rate  # cosine
             scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=cosin_lr)
         return scheduler
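To see what the max_ds_rate change does: the lambdas return a multiplier on the initial lr, decaying from 1.0 down to max_ds_rate. A quick standalone check of the cosine variant, using the values from the diff:

    import math

    total_epoch = 1000
    max_ds_rate = 0.01  # was 0.0001 before this commit
    cosin_lr = lambda epoch: ((1 + math.cos(epoch * math.pi / total_epoch)) / 2) * (1. - max_ds_rate) + max_ds_rate

    print(cosin_lr(0))            # 1.0  -> full initial lr at the start
    print(cosin_lr(total_epoch))  # 0.01 -> the lr floor is now 1% of init_lr instead of 0.01%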
@@ -287,7 +287,7 @@ def step(self):
         with amp.autocast(enabled=self.use_cuda):
             preds = self.model(img)
             loss_dict = self.loss_fcn(preds, gt_seg)
-            # loss_dict['total_loss'] /= self.accumulate
+            loss_dict['total_loss'] /= self.accumulate
             loss_dict['total_loss'] *= get_world_size()

         iter_end_time = time.time()
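Re-enabling the division matters for gradient accumulation: when a large virtual batch is split into self.accumulate micro-batches, each micro-batch loss must be scaled by 1/accumulate so the gradients summed by repeated backward() calls match one full-batch step. A minimal self-contained sketch of the pattern (the tiny model and the accumulate value are stand-ins, not this repo's objects):

    import torch
    import torch.nn as nn

    model = nn.Linear(4, 2)                          # stand-in for the real network
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    loss_fcn = nn.MSELoss()
    accumulate = 6                                   # hypothetical micro-batch count

    optimizer.zero_grad()
    for _ in range(accumulate):
        x, y = torch.randn(8, 4), torch.randn(8, 2)  # one micro-batch
        loss = loss_fcn(model(x), y) / accumulate    # the division this commit re-enables
        loss.backward()                              # gradients sum across micro-batches
    optimizer.step()                                 # one update for the whole virtual batch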
