diff --git a/pdino_config_result/config.py b/pdino_config_result/config.py
new file mode 100644
index 0000000..c3a74ed
--- /dev/null
+++ b/pdino_config_result/config.py
@@ -0,0 +1,562 @@
+auto_scale_lr = dict(base_batch_size=16)  # NOTE(review): train_dataloader.batch_size is 2 — confirm whether LR auto-scaling was enabled for this run
+backend_args = None
+classes = (  # 10 category names; the same tuple is repeated in each dataset's metainfo
+ 'pedestrian',
+ 'people',
+ 'bicycle',
+ 'car',
+ 'van',
+ 'truck',
+ 'tricycle',
+ 'awning-tricycle',
+ 'bus',
+ 'motor',
+)
+data_root = 'visdrone/'  # relative path; the same value is repeated inside each dataset dict
+dataset_type = 'CocoDataset'  # same string as the `type` of each dataset dict
+default_hooks = dict(  # runtime hooks, keyed by role
+ checkpoint=dict(interval=1, type='CheckpointHook'),  # presumably one checkpoint per epoch (epoch-based loop) — TODO confirm
+ logger=dict(interval=50, type='LoggerHook'),  # same 50 as log_processor.window_size
+ param_scheduler=dict(type='ParamSchedulerHook'),
+ sampler_seed=dict(type='DistSamplerSeedHook'),
+ timer=dict(type='IterTimerHook'),
+ visualization=dict(type='DetVisualizationHook'))
+default_scope = 'mmdet'
+env_cfg = dict(
+ cudnn_benchmark=False,
+ dist_cfg=dict(backend='nccl'),
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0))
+fp16 = None
+launcher = 'none'
+load_from = None  # no checkpoint path set here; backbone weights are referenced via model.backbone.init_cfg
+log_level = 'INFO'
+log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50)
+max_epochs = 24  # equals train_cfg.max_epochs; NOTE(review): param_scheduler has end=12, milestones=[11]
+model = dict(  # DINO detector definition (type='DINO' at the end of this dict)
+ as_two_stage=True,
+ backbone=dict(
+ init_cfg=dict(checkpoint='pretrained/p2t_tiny.pth', type='Pretrained'),  # same path as top-level `pretrained`
+ style='pytorch',
+ type='p2t_tiny'),  # NOTE(review): not defined in this file — presumably registered by project code
+ bbox_head=dict(
+ loss_bbox=dict(loss_weight=5.0, type='L1Loss'),
+ loss_cls=dict(
+ alpha=0.25,
+ gamma=2.0,
+ loss_weight=1.0,
+ type='FocalLoss',
+ use_sigmoid=True),
+ loss_iou=dict(loss_weight=2.0, type='GIoULoss'),
+ num_classes=10,  # equals the number of entries in `classes`
+ sync_cls_avg_factor=True,
+ type='DINOHead'),
+ data_preprocessor=dict(
+ bgr_to_rgb=True,
+ mean=[  # per-channel mean, three values
+ 123.675,
+ 116.28,
+ 103.53,
+ ],
+ pad_size_divisor=1,
+ std=[  # per-channel std, three values
+ 58.395,
+ 57.12,
+ 57.375,
+ ],
+ type='DetDataPreprocessor'),
+ decoder=dict(
+ layer_cfg=dict(
+ cross_attn_cfg=dict(dropout=0.0, embed_dims=256, num_levels=5),  # num_levels equals num_feature_levels
+ ffn_cfg=dict(
+ embed_dims=256, feedforward_channels=2048, ffn_drop=0.0),
+ self_attn_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8)),
+ num_layers=6,
+ post_norm_cfg=None,
+ return_intermediate=True),
+ dn_cfg=dict(
+ box_noise_scale=1.0,
+ group_cfg=dict(dynamic=True, num_dn_queries=100, num_groups=None),
+ label_noise_scale=0.5),
+ encoder=dict(
+ layer_cfg=dict(
+ ffn_cfg=dict(
+ embed_dims=256, feedforward_channels=2048, ffn_drop=0.0),
+ self_attn_cfg=dict(dropout=0.0, embed_dims=256, num_levels=5)),  # num_levels equals num_feature_levels
+ num_layers=6),
+ neck=dict(
+ act_cfg=None,
+ in_channels=[  # four input widths; presumably the backbone's stage outputs — TODO confirm against p2t_tiny
+ 48,
+ 96,
+ 240,
+ 384,
+ ],
+ kernel_size=1,
+ norm_cfg=dict(num_groups=32, type='GN'),
+ num_outs=5,  # one more output than in_channels entries; equals num_feature_levels
+ out_channels=256,  # equals embed_dims used by encoder/decoder
+ type='ChannelMapper'),
+ num_feature_levels=5,
+ num_queries=900,
+ positional_encoding=dict(
+ normalize=True, num_feats=128, offset=0.0, temperature=20),  # num_feats=128 is half of embed_dims=256
+ test_cfg=dict(max_per_img=300),
+ train_cfg=dict(
+ assigner=dict(
+ match_costs=[  # cost weights 2.0 / 5.0 / 2.0; bbox_head loss weights are 1.0 / 5.0 / 2.0
+ dict(type='FocalLossCost', weight=2.0),
+ dict(box_format='xywh', type='BBoxL1Cost', weight=5.0),
+ dict(iou_mode='giou', type='IoUCost', weight=2.0),
+ ],
+ type='HungarianAssigner')),
+ type='DINO',
+ with_box_refine=True)
+num_levels = 5  # same value as model.num_feature_levels and the attention num_levels entries
+optim_wrapper = dict(
+ clip_grad=dict(max_norm=0.1, norm_type=2),
+ optimizer=dict(lr=0.0001, type='AdamW', weight_decay=0.0001),
+ paramwise_cfg=dict(
+ bypass_duplicate=True, custom_keys=dict(backbone=dict(lr_mult=0.1))),  # lr_mult=0.1 keyed on 'backbone'
+ type='OptimWrapper')
+param_scheduler = [  # single epoch-based MultiStepLR entry
+ dict(
+ begin=0,
+ by_epoch=True,
+ end=12,  # NOTE(review): max_epochs and train_cfg.max_epochs are 24 — confirm a schedule ending at 12 is intended
+ gamma=0.1,
+ milestones=[  # NOTE(review): single milestone at 11, i.e. before half of the 24 configured epochs
+ 11,
+ ],
+ type='MultiStepLR'),
+]
+pretrained = 'pretrained/p2t_tiny.pth'  # same path as model.backbone.init_cfg.checkpoint
+resume = False
+test_cfg = dict(type='TestLoop')
+test_dataloader = dict(  # field-for-field identical to val_dataloader in this file
+ batch_size=1,
+ dataset=dict(
+ ann_file='VisDrone2019-DET-val/NEW_val.json',  # same annotation file as val_dataloader (the val split is reused for test)
+ backend_args=None,
+ data_prefix=dict(img='VisDrone2019-DET-val/images/'),
+ data_root='visdrone/',
+ metainfo=dict(
+ classes=(  # same 10 names as top-level `classes`
+ 'pedestrian',
+ 'people',
+ 'bicycle',
+ 'car',
+ 'van',
+ 'truck',
+ 'tricycle',
+ 'awning-tricycle',
+ 'bus',
+ 'motor',
+ )),
+ pipeline=[  # same steps as top-level `test_pipeline`
+ dict(backend_args=None, type='LoadImageFromFile'),
+ dict(keep_ratio=True, scale=(
+ 1333,
+ 800,
+ ), type='Resize'),
+ dict(type='LoadAnnotations', with_bbox=True),  # listed after Resize in this pipeline
+ dict(
+ meta_keys=(
+ 'img_id',
+ 'img_path',
+ 'ori_shape',
+ 'img_shape',
+ 'scale_factor',
+ ),
+ type='PackDetInputs'),
+ ],
+ test_mode=True,
+ type='CocoDataset'),
+ drop_last=False,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(shuffle=False, type='DefaultSampler'))
+test_evaluator = dict(  # same settings as val_evaluator
+ ann_file='visdrone/VisDrone2019-DET-val/NEW_val.json',  # data_root + dataset ann_file, written out in full
+ backend_args=None,
+ format_only=False,
+ metric='bbox',
+ type='CocoMetric')
+test_pipeline = [  # same steps as the pipelines embedded in test_dataloader and val_dataloader
+ dict(backend_args=None, type='LoadImageFromFile'),
+ dict(keep_ratio=True, scale=(
+ 1333,
+ 800,
+ ), type='Resize'),
+ dict(type='LoadAnnotations', with_bbox=True),  # listed after Resize in this pipeline
+ dict(
+ meta_keys=(  # five metadata keys passed to PackDetInputs
+ 'img_id',
+ 'img_path',
+ 'ori_shape',
+ 'img_shape',
+ 'scale_factor',
+ ),
+ type='PackDetInputs'),
+]
+train_cfg = dict(max_epochs=24, type='EpochBasedTrainLoop', val_interval=1)  # same 24 as top-level max_epochs
+train_dataloader = dict(
+ batch_sampler=dict(type='AspectRatioBatchSampler'),
+ batch_size=2,  # NOTE(review): auto_scale_lr.base_batch_size is 16
+ dataset=dict(
+ ann_file='VisDrone2019-DET-train/NEW_val.json',  # NOTE(review): file is named NEW_val.json inside the train directory — confirm it holds training annotations
+ backend_args=None,
+ data_prefix=dict(img='VisDrone2019-DET-train/images/'),
+ data_root='visdrone/',
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
+ metainfo=dict(
+ classes=(  # same 10 names as top-level `classes`
+ 'pedestrian',
+ 'people',
+ 'bicycle',
+ 'car',
+ 'van',
+ 'truck',
+ 'tricycle',
+ 'awning-tricycle',
+ 'bus',
+ 'motor',
+ )),
+ pipeline=[  # same steps as top-level `train_pipeline`
+ dict(backend_args=None, type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(prob=0.5, type='RandomFlip'),
+ dict(
+ transforms=[  # two alternative transform lists for RandomChoice
+ [  # alternative 1: a single multi-scale resize
+ dict(
+ keep_ratio=True,
+ scales=[  # 11 scales: first value 480..800 in steps of 32, second value fixed at 1333
+ (
+ 480,
+ 1333,
+ ),
+ (
+ 512,
+ 1333,
+ ),
+ (
+ 544,
+ 1333,
+ ),
+ (
+ 576,
+ 1333,
+ ),
+ (
+ 608,
+ 1333,
+ ),
+ (
+ 640,
+ 1333,
+ ),
+ (
+ 672,
+ 1333,
+ ),
+ (
+ 704,
+ 1333,
+ ),
+ (
+ 736,
+ 1333,
+ ),
+ (
+ 768,
+ 1333,
+ ),
+ (
+ 800,
+ 1333,
+ ),
+ ],
+ type='RandomChoiceResize'),
+ ],
+ [  # alternative 2: resize, then random crop, then multi-scale resize
+ dict(
+ keep_ratio=True,
+ scales=[  # 3 scales: first value 400/500/600, second value fixed at 4200
+ (
+ 400,
+ 4200,
+ ),
+ (
+ 500,
+ 4200,
+ ),
+ (
+ 600,
+ 4200,
+ ),
+ ],
+ type='RandomChoiceResize'),
+ dict(
+ allow_negative_crop=True,
+ crop_size=(  # used with crop_type='absolute_range'
+ 384,
+ 600,
+ ),
+ crop_type='absolute_range',
+ type='RandomCrop'),
+ dict(
+ keep_ratio=True,
+ scales=[  # same 11 scales as alternative 1
+ (
+ 480,
+ 1333,
+ ),
+ (
+ 512,
+ 1333,
+ ),
+ (
+ 544,
+ 1333,
+ ),
+ (
+ 576,
+ 1333,
+ ),
+ (
+ 608,
+ 1333,
+ ),
+ (
+ 640,
+ 1333,
+ ),
+ (
+ 672,
+ 1333,
+ ),
+ (
+ 704,
+ 1333,
+ ),
+ (
+ 736,
+ 1333,
+ ),
+ (
+ 768,
+ 1333,
+ ),
+ (
+ 800,
+ 1333,
+ ),
+ ],
+ type='RandomChoiceResize'),
+ ],
+ ],
+ type='RandomChoice'),
+ dict(type='PackDetInputs'),
+ ],
+ type='CocoDataset'),
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(shuffle=True, type='DefaultSampler'))
+train_pipeline = [  # same steps as the pipeline embedded in train_dataloader.dataset
+ dict(backend_args=None, type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(prob=0.5, type='RandomFlip'),
+ dict(
+ transforms=[  # two alternative transform lists for RandomChoice
+ [  # alternative 1: a single multi-scale resize
+ dict(
+ keep_ratio=True,
+ scales=[  # 11 scales: first value 480..800 in steps of 32, second value fixed at 1333
+ (
+ 480,
+ 1333,
+ ),
+ (
+ 512,
+ 1333,
+ ),
+ (
+ 544,
+ 1333,
+ ),
+ (
+ 576,
+ 1333,
+ ),
+ (
+ 608,
+ 1333,
+ ),
+ (
+ 640,
+ 1333,
+ ),
+ (
+ 672,
+ 1333,
+ ),
+ (
+ 704,
+ 1333,
+ ),
+ (
+ 736,
+ 1333,
+ ),
+ (
+ 768,
+ 1333,
+ ),
+ (
+ 800,
+ 1333,
+ ),
+ ],
+ type='RandomChoiceResize'),
+ ],
+ [  # alternative 2: resize, then random crop, then multi-scale resize
+ dict(
+ keep_ratio=True,
+ scales=[  # 3 scales: first value 400/500/600, second value fixed at 4200
+ (
+ 400,
+ 4200,
+ ),
+ (
+ 500,
+ 4200,
+ ),
+ (
+ 600,
+ 4200,
+ ),
+ ],
+ type='RandomChoiceResize'),
+ dict(
+ allow_negative_crop=True,
+ crop_size=(  # used with crop_type='absolute_range'
+ 384,
+ 600,
+ ),
+ crop_type='absolute_range',
+ type='RandomCrop'),
+ dict(
+ keep_ratio=True,
+ scales=[  # same 11 scales as alternative 1
+ (
+ 480,
+ 1333,
+ ),
+ (
+ 512,
+ 1333,
+ ),
+ (
+ 544,
+ 1333,
+ ),
+ (
+ 576,
+ 1333,
+ ),
+ (
+ 608,
+ 1333,
+ ),
+ (
+ 640,
+ 1333,
+ ),
+ (
+ 672,
+ 1333,
+ ),
+ (
+ 704,
+ 1333,
+ ),
+ (
+ 736,
+ 1333,
+ ),
+ (
+ 768,
+ 1333,
+ ),
+ (
+ 800,
+ 1333,
+ ),
+ ],
+ type='RandomChoiceResize'),
+ ],
+ ],
+ type='RandomChoice'),
+ dict(type='PackDetInputs'),
+]
+val_cfg = dict(type='ValLoop')
+val_dataloader = dict(  # field-for-field identical to test_dataloader in this file
+ batch_size=1,
+ dataset=dict(
+ ann_file='VisDrone2019-DET-val/NEW_val.json',
+ backend_args=None,
+ data_prefix=dict(img='VisDrone2019-DET-val/images/'),
+ data_root='visdrone/',
+ metainfo=dict(
+ classes=(  # same 10 names as top-level `classes`
+ 'pedestrian',
+ 'people',
+ 'bicycle',
+ 'car',
+ 'van',
+ 'truck',
+ 'tricycle',
+ 'awning-tricycle',
+ 'bus',
+ 'motor',
+ )),
+ pipeline=[  # same steps as top-level `test_pipeline`
+ dict(backend_args=None, type='LoadImageFromFile'),
+ dict(keep_ratio=True, scale=(
+ 1333,
+ 800,
+ ), type='Resize'),
+ dict(type='LoadAnnotations', with_bbox=True),  # listed after Resize in this pipeline
+ dict(
+ meta_keys=(
+ 'img_id',
+ 'img_path',
+ 'ori_shape',
+ 'img_shape',
+ 'scale_factor',
+ ),
+ type='PackDetInputs'),
+ ],
+ test_mode=True,
+ type='CocoDataset'),
+ drop_last=False,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(shuffle=False, type='DefaultSampler'))
+val_evaluator = dict(  # same settings as test_evaluator
+ ann_file='visdrone/VisDrone2019-DET-val/NEW_val.json',  # data_root + dataset ann_file, written out in full
+ backend_args=None,
+ format_only=False,
+ metric='bbox',
+ type='CocoMetric')
+vis_backends = [  # NOTE(review): lists only the local backend, while visualizer.vis_backends also has Tensorboard
+ dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+ name='visualizer',
+ type='DetLocalVisualizer',
+ vis_backends=[  # two backends: local and Tensorboard
+ dict(type='LocalVisBackend'),
+ dict(type='TensorboardVisBackend'),
+ ])
+work_dir = 'dino_swin_img'  # NOTE(review): name mentions swin, but model.backbone.type is 'p2t_tiny' — confirm the directory name
diff --git a/pdino_result_pic/base_lr.svg b/pdino_result_pic/base_lr.svg
new file mode 100644
index 0000000..e1dd3bd
--- /dev/null
+++ b/pdino_result_pic/base_lr.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/pdino_result_pic/coco_bbox_mAP.svg b/pdino_result_pic/coco_bbox_mAP.svg
new file mode 100644
index 0000000..8e89d6a
--- /dev/null
+++ b/pdino_result_pic/coco_bbox_mAP.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/pdino_result_pic/coco_bbox_mAP_50.svg b/pdino_result_pic/coco_bbox_mAP_50.svg
new file mode 100644
index 0000000..b7dcd95
--- /dev/null
+++ b/pdino_result_pic/coco_bbox_mAP_50.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/pdino_result_pic/coco_bbox_mAP_75.svg b/pdino_result_pic/coco_bbox_mAP_75.svg
new file mode 100644
index 0000000..0d29ef7
--- /dev/null
+++ b/pdino_result_pic/coco_bbox_mAP_75.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/pdino_result_pic/coco_bbox_mAP_l.svg b/pdino_result_pic/coco_bbox_mAP_l.svg
new file mode 100644
index 0000000..a84c1d4
--- /dev/null
+++ b/pdino_result_pic/coco_bbox_mAP_l.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/pdino_result_pic/coco_bbox_mAP_m.svg b/pdino_result_pic/coco_bbox_mAP_m.svg
new file mode 100644
index 0000000..9958933
--- /dev/null
+++ b/pdino_result_pic/coco_bbox_mAP_m.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/pdino_result_pic/coco_bbox_mAP_s.svg b/pdino_result_pic/coco_bbox_mAP_s.svg
new file mode 100644
index 0000000..fafa0d6
--- /dev/null
+++ b/pdino_result_pic/coco_bbox_mAP_s.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/pdino_result_pic/events.out.tfevents.1702123270.autodl-container-81e5118cae-4b8d01ae.9731.0 b/pdino_result_pic/events.out.tfevents.1702123270.autodl-container-81e5118cae-4b8d01ae.9731.0
new file mode 100644
index 0000000..ac14fdb
Binary files /dev/null and b/pdino_result_pic/events.out.tfevents.1702123270.autodl-container-81e5118cae-4b8d01ae.9731.0 differ
diff --git a/pdino_result_pic/loss.svg b/pdino_result_pic/loss.svg
new file mode 100644
index 0000000..39be623
--- /dev/null
+++ b/pdino_result_pic/loss.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/pdino_result_pic/loss_bbox.svg b/pdino_result_pic/loss_bbox.svg
new file mode 100644
index 0000000..2831e2e
--- /dev/null
+++ b/pdino_result_pic/loss_bbox.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/pdino_result_pic/loss_cls.svg b/pdino_result_pic/loss_cls.svg
new file mode 100644
index 0000000..2a255b4
--- /dev/null
+++ b/pdino_result_pic/loss_cls.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/pdino_result_pic/loss_iou.svg b/pdino_result_pic/loss_iou.svg
new file mode 100644
index 0000000..31dbddd
--- /dev/null
+++ b/pdino_result_pic/loss_iou.svg
@@ -0,0 +1 @@
+
\ No newline at end of file