diff --git a/pdino_config_result/config.py b/pdino_config_result/config.py new file mode 100644 index 0000000..c3a74ed --- /dev/null +++ b/pdino_config_result/config.py @@ -0,0 +1,562 @@ +auto_scale_lr = dict(base_batch_size=16) +backend_args = None +classes = ( + 'pedestrian', + 'people', + 'bicycle', + 'car', + 'van', + 'truck', + 'tricycle', + 'awning-tricycle', + 'bus', + 'motor', +) +data_root = 'visdrone/' +dataset_type = 'CocoDataset' +default_hooks = dict( + checkpoint=dict(interval=1, type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='DetVisualizationHook')) +default_scope = 'mmdet' +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +fp16 = None +launcher = 'none' +load_from = None +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +max_epochs = 24 +model = dict( + as_two_stage=True, + backbone=dict( + init_cfg=dict(checkpoint='pretrained/p2t_tiny.pth', type='Pretrained'), + style='pytorch', + type='p2t_tiny'), + bbox_head=dict( + loss_bbox=dict(loss_weight=5.0, type='L1Loss'), + loss_cls=dict( + alpha=0.25, + gamma=2.0, + loss_weight=1.0, + type='FocalLoss', + use_sigmoid=True), + loss_iou=dict(loss_weight=2.0, type='GIoULoss'), + num_classes=10, + sync_cls_avg_factor=True, + type='DINOHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_size_divisor=1, + std=[ + 58.395, + 57.12, + 57.375, + ], + type='DetDataPreprocessor'), + decoder=dict( + layer_cfg=dict( + cross_attn_cfg=dict(dropout=0.0, embed_dims=256, num_levels=5), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=2048, ffn_drop=0.0), + self_attn_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8)), + num_layers=6, + post_norm_cfg=None, + return_intermediate=True), + dn_cfg=dict( + box_noise_scale=1.0, + group_cfg=dict(dynamic=True, num_dn_queries=100, num_groups=None), + label_noise_scale=0.5), + encoder=dict( + layer_cfg=dict( + ffn_cfg=dict( + embed_dims=256, feedforward_channels=2048, ffn_drop=0.0), + self_attn_cfg=dict(dropout=0.0, embed_dims=256, num_levels=5)), + num_layers=6), + neck=dict( + act_cfg=None, + in_channels=[ + 48, + 96, + 240, + 384, + ], + kernel_size=1, + norm_cfg=dict(num_groups=32, type='GN'), + num_outs=5, + out_channels=256, + type='ChannelMapper'), + num_feature_levels=5, + num_queries=900, + positional_encoding=dict( + normalize=True, num_feats=128, offset=0.0, temperature=20), + test_cfg=dict(max_per_img=300), + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='FocalLossCost', weight=2.0), + dict(box_format='xywh', type='BBoxL1Cost', weight=5.0), + dict(iou_mode='giou', type='IoUCost', weight=2.0), + ], + type='HungarianAssigner')), + type='DINO', + with_box_refine=True) +num_levels = 5 +optim_wrapper = dict( + clip_grad=dict(max_norm=0.1, norm_type=2), + optimizer=dict(lr=0.0001, type='AdamW', weight_decay=0.0001), + paramwise_cfg=dict( + bypass_duplicate=True, custom_keys=dict(backbone=dict(lr_mult=0.1))), + type='OptimWrapper') +param_scheduler = [ + dict( + begin=0, + by_epoch=True, + end=12, + gamma=0.1, + milestones=[ + 11, + ], + type='MultiStepLR'), +] +pretrained = 'pretrained/p2t_tiny.pth' +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + ann_file='VisDrone2019-DET-val/NEW_val.json', + backend_args=None, + data_prefix=dict(img='VisDrone2019-DET-val/images/'), + data_root='visdrone/', + metainfo=dict( + classes=( + 'pedestrian', + 'people', + 'bicycle', + 'car', + 'van', + 'truck', + 'tricycle', + 'awning-tricycle', + 'bus', + 'motor', + )), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 1333, + 800, + ), type='Resize'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), + ], + test_mode=True, + type='CocoDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='visdrone/VisDrone2019-DET-val/NEW_val.json', + backend_args=None, + format_only=False, + metric='bbox', + type='CocoMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 1333, + 800, + ), type='Resize'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), +] +train_cfg = dict(max_epochs=24, type='EpochBasedTrainLoop', val_interval=1) +train_dataloader = dict( + batch_sampler=dict(type='AspectRatioBatchSampler'), + batch_size=2, + dataset=dict( + ann_file='VisDrone2019-DET-train/NEW_val.json', + backend_args=None, + data_prefix=dict(img='VisDrone2019-DET-train/images/'), + data_root='visdrone/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + metainfo=dict( + classes=( + 'pedestrian', + 'people', + 'bicycle', + 'car', + 'van', + 'truck', + 'tricycle', + 'awning-tricycle', + 'bus', + 'motor', + )), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(prob=0.5, type='RandomFlip'), + dict( + transforms=[ + [ + dict( + keep_ratio=True, + scales=[ + ( + 480, + 1333, + ), + ( + 512, + 1333, + ), + ( + 544, + 1333, + ), + ( + 576, + 1333, + ), + ( + 608, + 1333, + ), + ( + 640, + 1333, + ), + ( + 672, + 1333, + ), + ( + 704, + 1333, + ), + ( + 736, + 1333, + ), + ( + 768, + 1333, + ), + ( + 800, + 1333, + ), + ], + type='RandomChoiceResize'), + ], + [ + dict( + keep_ratio=True, + scales=[ + ( + 400, + 4200, + ), + ( + 500, + 4200, + ), + ( + 600, + 4200, + ), + ], + type='RandomChoiceResize'), + dict( + allow_negative_crop=True, + crop_size=( + 384, + 600, + ), + crop_type='absolute_range', + type='RandomCrop'), + dict( + keep_ratio=True, + scales=[ + ( + 480, + 1333, + ), + ( + 512, + 1333, + ), + ( + 544, + 1333, + ), + ( + 576, + 1333, + ), + ( + 608, + 1333, + ), + ( + 640, + 1333, + ), + ( + 672, + 1333, + ), + ( + 704, + 1333, + ), + ( + 736, + 1333, + ), + ( + 768, + 1333, + ), + ( + 800, + 1333, + ), + ], + type='RandomChoiceResize'), + ], + ], + type='RandomChoice'), + dict(type='PackDetInputs'), + ], + type='CocoDataset'), + num_workers=2, + persistent_workers=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(prob=0.5, type='RandomFlip'), + dict( + transforms=[ + [ + dict( + keep_ratio=True, + scales=[ + ( + 480, + 1333, + ), + ( + 512, + 1333, + ), + ( + 544, + 1333, + ), + ( + 576, + 1333, + ), + ( + 608, + 1333, + ), + ( + 640, + 1333, + ), + ( + 672, + 1333, + ), + ( + 704, + 1333, + ), + ( + 736, + 1333, + ), + ( + 768, + 1333, + ), + ( + 800, + 1333, + ), + ], + type='RandomChoiceResize'), + ], + [ + dict( + keep_ratio=True, + scales=[ + ( + 400, + 4200, + ), + ( + 500, + 4200, + ), + ( + 600, + 4200, + ), + ], + type='RandomChoiceResize'), + dict( + allow_negative_crop=True, + crop_size=( + 384, + 600, + ), + crop_type='absolute_range', + type='RandomCrop'), + dict( + keep_ratio=True, + scales=[ + ( + 480, + 1333, + ), + ( + 512, + 1333, + ), + ( + 544, + 1333, + ), + ( + 576, + 1333, + ), + ( + 608, + 1333, + ), + ( + 640, + 1333, + ), + ( + 672, + 1333, + ), + ( + 704, + 1333, + ), + ( + 736, + 1333, + ), + ( + 768, + 1333, + ), + ( + 800, + 1333, + ), + ], + type='RandomChoiceResize'), + ], + ], + type='RandomChoice'), + dict(type='PackDetInputs'), +] +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=1, + dataset=dict( + ann_file='VisDrone2019-DET-val/NEW_val.json', + backend_args=None, + data_prefix=dict(img='VisDrone2019-DET-val/images/'), + data_root='visdrone/', + metainfo=dict( + classes=( + 'pedestrian', + 'people', + 'bicycle', + 'car', + 'van', + 'truck', + 'tricycle', + 'awning-tricycle', + 'bus', + 'motor', + )), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 1333, + 800, + ), type='Resize'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), + ], + test_mode=True, + type='CocoDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='visdrone/VisDrone2019-DET-val/NEW_val.json', + backend_args=None, + format_only=False, + metric='bbox', + type='CocoMetric') +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + ]) +work_dir = 'dino_swin_img' diff --git a/pdino_result_pic/base_lr.svg b/pdino_result_pic/base_lr.svg new file mode 100644 index 0000000..e1dd3bd --- /dev/null +++ b/pdino_result_pic/base_lr.svg @@ -0,0 +1 @@ +01e-52e-53e-54e-55e-56e-57e-58e-59e-51e-41.1e-4-10k010k20k30k40k50k60k70k80k \ No newline at end of file diff --git a/pdino_result_pic/coco_bbox_mAP.svg b/pdino_result_pic/coco_bbox_mAP.svg new file mode 100644 index 0000000..8e89d6a --- /dev/null +++ b/pdino_result_pic/coco_bbox_mAP.svg @@ -0,0 +1 @@ +00.020.040.060.080.10.120.140.160.180.20.220.24024681012141618202224 \ No newline at end of file diff --git a/pdino_result_pic/coco_bbox_mAP_50.svg b/pdino_result_pic/coco_bbox_mAP_50.svg new file mode 100644 index 0000000..b7dcd95 --- /dev/null +++ b/pdino_result_pic/coco_bbox_mAP_50.svg @@ -0,0 +1 @@ +00.050.10.150.20.250.30.350.40.45024681012141618202224 \ No newline at end of file diff --git a/pdino_result_pic/coco_bbox_mAP_75.svg b/pdino_result_pic/coco_bbox_mAP_75.svg new file mode 100644 index 0000000..0d29ef7 --- /dev/null +++ b/pdino_result_pic/coco_bbox_mAP_75.svg @@ -0,0 +1 @@ +00.020.040.060.080.10.120.140.160.180.20.22024681012141618202224 \ No newline at end of file diff --git a/pdino_result_pic/coco_bbox_mAP_l.svg b/pdino_result_pic/coco_bbox_mAP_l.svg new file mode 100644 index 0000000..a84c1d4 --- /dev/null +++ b/pdino_result_pic/coco_bbox_mAP_l.svg @@ -0,0 +1 @@ +00.050.10.150.20.250.30.350.4024681012141618202224 \ No newline at end of file diff --git a/pdino_result_pic/coco_bbox_mAP_m.svg b/pdino_result_pic/coco_bbox_mAP_m.svg new file mode 100644 index 0000000..9958933 --- /dev/null +++ b/pdino_result_pic/coco_bbox_mAP_m.svg @@ -0,0 +1 @@ +00.050.10.150.20.250.30.35024681012141618202224 \ No newline at end of file diff --git a/pdino_result_pic/coco_bbox_mAP_s.svg b/pdino_result_pic/coco_bbox_mAP_s.svg new file mode 100644 index 0000000..fafa0d6 --- /dev/null +++ b/pdino_result_pic/coco_bbox_mAP_s.svg @@ -0,0 +1 @@ +00.020.040.060.080.10.120.140.16024681012141618202224 \ No newline at end of file diff --git a/pdino_result_pic/events.out.tfevents.1702123270.autodl-container-81e5118cae-4b8d01ae.9731.0 b/pdino_result_pic/events.out.tfevents.1702123270.autodl-container-81e5118cae-4b8d01ae.9731.0 new file mode 100644 index 0000000..ac14fdb Binary files /dev/null and b/pdino_result_pic/events.out.tfevents.1702123270.autodl-container-81e5118cae-4b8d01ae.9731.0 differ diff --git a/pdino_result_pic/loss.svg b/pdino_result_pic/loss.svg new file mode 100644 index 0000000..39be623 --- /dev/null +++ b/pdino_result_pic/loss.svg @@ -0,0 +1 @@ +101214161820222426-10k010k20k30k40k50k60k70k80k \ No newline at end of file diff --git a/pdino_result_pic/loss_bbox.svg b/pdino_result_pic/loss_bbox.svg new file mode 100644 index 0000000..2831e2e --- /dev/null +++ b/pdino_result_pic/loss_bbox.svg @@ -0,0 +1 @@ +0.040.060.080.10.120.140.160.180.20.220.240.260.280.3Alt + Scroll to Zoom-10k010k20k30k40k50k60k70k80k \ No newline at end of file diff --git a/pdino_result_pic/loss_cls.svg b/pdino_result_pic/loss_cls.svg new file mode 100644 index 0000000..2a255b4 --- /dev/null +++ b/pdino_result_pic/loss_cls.svg @@ -0,0 +1 @@ +0.150.20.250.30.350.40.450.50.550.6-10k010k20k30k40k50k60k70k80k \ No newline at end of file diff --git a/pdino_result_pic/loss_iou.svg b/pdino_result_pic/loss_iou.svg new file mode 100644 index 0000000..31dbddd --- /dev/null +++ b/pdino_result_pic/loss_iou.svg @@ -0,0 +1 @@ +0.60.70.80.911.11.21.31.4-10k010k20k30k40k50k60k70k80k \ No newline at end of file