forked from open-mmlab/mmdetection
-
Notifications
You must be signed in to change notification settings - Fork 0
/
grounding_dino_swin-t_finetune_16xb4_1x_lvis.py
120 lines (110 loc) · 4.02 KB
/
grounding_dino_swin-t_finetune_16xb4_1x_lvis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
data_root = 'data/coco/'
model = dict(test_cfg=dict(
max_per_img=300,
chunked_size=40,
))
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='RandomFlip', prob=0.5),
dict(
type='RandomChoice',
transforms=[
[
dict(
type='RandomChoiceResize',
scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
(608, 1333), (640, 1333), (672, 1333), (704, 1333),
(736, 1333), (768, 1333), (800, 1333)],
keep_ratio=True)
],
[
dict(
type='RandomChoiceResize',
# The radio of all image in train dataset < 7
# follow the original implement
scales=[(400, 4200), (500, 4200), (600, 4200)],
keep_ratio=True),
dict(
type='RandomCrop',
crop_type='absolute_range',
crop_size=(384, 600),
allow_negative_crop=True),
dict(
type='RandomChoiceResize',
scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
(608, 1333), (640, 1333), (672, 1333), (704, 1333),
(736, 1333), (768, 1333), (800, 1333)],
keep_ratio=True)
]
]),
dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
dict(
type='RandomSamplingNegPos',
tokenizer_name=_base_.lang_model_name,
num_sample_negative=85,
# change this
label_map_file='data/coco/annotations/lvis_v1_label_map.json',
max_tokens=256),
dict(
type='PackDetInputs',
meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
'scale_factor', 'flip', 'flip_direction', 'text',
'custom_entities', 'tokens_positive', 'dataset_mode'))
]
train_dataloader = dict(
dataset=dict(
_delete_=True,
type='ClassBalancedDataset',
oversample_thr=1e-3,
dataset=dict(
type='ODVGDataset',
data_root=data_root,
need_text=False,
label_map_file='annotations/lvis_v1_label_map.json',
ann_file='annotations/lvis_v1_train_od.json',
data_prefix=dict(img=''),
filter_cfg=dict(filter_empty_gt=False, min_size=32),
return_classes=True,
pipeline=train_pipeline)))
val_dataloader = dict(
dataset=dict(
data_root=data_root,
type='LVISV1Dataset',
ann_file='annotations/lvis_v1_minival_inserted_image_name.json',
data_prefix=dict(img='')))
test_dataloader = val_dataloader
val_evaluator = dict(
_delete_=True,
type='LVISFixedAPMetric',
ann_file=data_root +
'annotations/lvis_v1_minival_inserted_image_name.json')
test_evaluator = val_evaluator
optim_wrapper = dict(
_delete_=True,
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
clip_grad=dict(max_norm=0.1, norm_type=2),
paramwise_cfg=dict(
custom_keys={
'absolute_pos_embed': dict(decay_mult=0.),
'backbone': dict(lr_mult=0.1),
# 'language_model': dict(lr_mult=0),
}))
# learning policy
max_epochs = 12
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=max_epochs,
by_epoch=True,
milestones=[11],
gamma=0.1)
]
train_cfg = dict(max_epochs=max_epochs, val_interval=3)
default_hooks = dict(
checkpoint=dict(
max_keep_ckpts=1, save_best='lvis_fixed_ap/AP', rule='greater'))
load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth' # noqa