Using Grounding-DINO to train on a COCO dataset, error below #11808

Open
xyb1314 opened this issue Jun 21, 2024 · 3 comments

xyb1314 commented Jun 21, 2024

Error traceback:

```
File "C:\Users\Administrator\miniconda3\envs\mmdet\lib\site-packages\transformers\modeling_attn_mask_utils.py", line 439, in _prepare_4d_attention_mask_for_sdpa
    batch_size, key_value_length = mask.shape
ValueError: too many values to unpack (expected 2)
```
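For context, the failing line unpacks `mask.shape` into exactly two values, so the helper only accepts a 2D `(batch_size, key_value_length)` mask and any extra dimension triggers this error. A minimal sketch (standalone, not mmdet code) reproducing the failure:

```python
# Minimal sketch of the failure mode (not mmdet code): transformers'
# _prepare_4d_attention_mask_for_sdpa unpacks mask.shape into exactly two
# values, so it only accepts a 2D (batch_size, key_value_length) mask.
import torch

mask_2d = torch.ones(2, 256)                  # (batch_size, key_value_length)
batch_size, key_value_length = mask_2d.shape  # fine: two dims, two targets

mask_3d = torch.ones(2, 256, 256)             # an extended / 3D attention mask
batch_size, key_value_length = mask_3d.shape  # ValueError: too many values to unpack (expected 2)
```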


xyb1314 commented Jun 21, 2024

Config file:

```python
_base_ = [
    '../_base_/datasets/opixray_detection.py',
    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
]
load_from = 'C:/Users/Administrator/Desktop/mmdetection-main_1/model/groundingdino_swint_ogc_mmdet-822d7e9d.pth'  # noqa
lang_model_name = 'C:/Users/Administrator/Desktop/mmdetection-main_1/model/bert-base-uncased'
num_levels = 5
model = dict(
    type='GroundingDINO',
    num_queries=900,
    with_box_refine=True,
    as_two_stage=True,
    # num_feature_levels=num_levels,
    data_preprocessor=dict(
        type='DetDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
        pad_mask=False,
    ),
    language_model=dict(
        type='BertModel',
        name=lang_model_name,
        pad_to_max=False,
        use_sub_sentence_represent=True,
        special_tokens_list=['[CLS]', '[SEP]', '.', '?'],
        add_pooling_layer=False,
    ),
    backbone=dict(
        type='SwinTransformer',
        embed_dims=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        patch_norm=True,
        out_indices=(1, 2, 3),
        with_cp=True,
        convert_weights=False),
    neck=dict(
        type='ChannelMapper',
        in_channels=[192, 384, 768],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        bias=True,
        norm_cfg=dict(type='GN', num_groups=32),
        num_outs=4),
    encoder=dict(
        num_layers=6,
        num_cp=6,
        # visual layer config
        layer_cfg=dict(
            self_attn_cfg=dict(embed_dims=256, num_levels=4, dropout=0.0),
            ffn_cfg=dict(
                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
        # text layer config
        text_layer_cfg=dict(
            self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0),
            ffn_cfg=dict(
                embed_dims=256, feedforward_channels=1024, ffn_drop=0.0)),
        # fusion layer config
        fusion_layer_cfg=dict(
            v_dim=256,
            l_dim=256,
            embed_dim=1024,
            num_heads=4,
            init_values=1e-4),
    ),
    decoder=dict(
        num_layers=6,
        return_intermediate=True,
        layer_cfg=dict(
            # query self attention layer
            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
            # cross attention layer query to text
            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
            # cross attention layer query to image
            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
            ffn_cfg=dict(
                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
        post_norm_cfg=None),
    positional_encoding=dict(
        num_feats=128, normalize=True, offset=0.0, temperature=20),
    bbox_head=dict(
        type='GroundingDINOHead',
        num_classes=80,
        sync_cls_avg_factor=True,
        contrastive_cfg=dict(max_text_len=256, log_scale=0.0, bias=False),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),  # 2.0 in DeformDETR
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    dn_cfg=dict(  # TODO: Move to model.train_cfg ?
        label_noise_scale=0.5,
        box_noise_scale=1.0,  # 0.4 for DN-DETR
        group_cfg=dict(dynamic=True, num_groups=None,
                       num_dn_queries=100)),  # TODO: half num_dn_queries
    # training and testing settings
    train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            match_costs=[
                dict(type='BinaryFocalLossCost', weight=2.0),
                dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
                dict(type='IoUCost', iou_mode='giou', weight=2.0)
            ])),
    test_cfg=dict(max_per_img=300))

# dataset settings
train_pipeline = [
    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandomFlip', prob=0.5),
    dict(
        type='RandomChoice',
        # transforms=[
        #     [
        #         dict(
        #             type='RandomChoiceResize',
        #             scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
        #                     (608, 1333), (640, 1333), (672, 1333), (704, 1333),
        #                     (736, 1333), (768, 1333), (800, 1333)],
        #             keep_ratio=True)
        #     ],
        #     [
        #         dict(
        #             type='RandomChoiceResize',
        #             # The ratio of all images in the train dataset is < 7,
        #             # following the original implementation
        #             scales=[(400, 4200), (500, 4200), (600, 4200)],
        #             keep_ratio=True),
        #         dict(
        #             type='RandomCrop',
        #             crop_type='absolute_range',
        #             crop_size=(384, 600),
        #             allow_negative_crop=True),
        #         dict(
        #             type='RandomChoiceResize',
        #             scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
        #                     (608, 1333), (640, 1333), (672, 1333), (704, 1333),
        #                     (736, 1333), (768, 1333), (800, 1333)],
        #             keep_ratio=True)
        #     ]
        # ]
        transforms=[
            # Option 1: a single random resize (keep ratio)
            [
                dict(
                    type='RandomChoiceResize',
                    # scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
                    #         (608, 1333), (640, 1333), (672, 1333), (704, 1333),
                    #         (736, 1333), (768, 1333), (800, 1333)],
                    scales=[(192, 320), (205, 320), (218, 320), (230, 320),
                            (243, 320), (256, 320), (269, 320), (282, 320),
                            (294, 320), (307, 320), (320, 320)],
                    keep_ratio=True)
            ],
            # Option 2: random resize, then random crop, then random resize again
            [
                dict(
                    type='RandomChoiceResize',
                    # The ratio of all images in the train dataset is < 7,
                    # following the original implementation
                    # scales=[(400, 4200), (500, 4200), (600, 4200)],
                    scales=[(160, 1008), (200, 1008), (240, 1008)],
                    keep_ratio=True),
                dict(
                    type='RandomCrop',
                    crop_type='absolute_range',
                    # crop_size=(384, 600),
                    crop_size=(150, 200),  # 320/800*384, 600/1333*320
                    allow_negative_crop=True),
                dict(
                    type='RandomChoiceResize',
                    # scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
                    #         (608, 1333), (640, 1333), (672, 1333), (704, 1333),
                    #         (736, 1333), (768, 1333), (800, 1333)],
                    scales=[(192, 320), (205, 320), (218, 320), (230, 320),
                            (243, 320), (256, 320), (269, 320), (282, 320),
                            (294, 320), (307, 320), (320, 320)],
                    keep_ratio=True)
            ]
        ]),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'flip', 'flip_direction', 'text',
                   'custom_entities'))
]

test_pipeline = [
    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
    dict(type='FixScaleResize', scale=(800, 1333), keep_ratio=True),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'text', 'custom_entities'))
]

train_dataloader = dict(
    dataset=dict(
        filter_cfg=dict(filter_empty_gt=False),
        pipeline=train_pipeline,
        return_classes=True))
val_dataloader = dict(
    dataset=dict(pipeline=test_pipeline, return_classes=True))
test_dataloader = val_dataloader

optim_wrapper = dict(
    _delete_=True,
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001),
    clip_grad=dict(max_norm=0.1, norm_type=2),
    paramwise_cfg=dict(custom_keys={
        'absolute_pos_embed': dict(decay_mult=0.),
        'backbone': dict(lr_mult=0.1)
    }))
max_epochs = 12
param_scheduler = [
    dict(
        type='MultiStepLR',
        begin=0,
        end=max_epochs,
        by_epoch=True,
        milestones=[11],
        gamma=0.1)
]

default_hooks = dict(
    checkpoint=dict(
        type='CheckpointHook',
        max_keep_ckpts=1,
        save_best='auto'))

auto_scale_lr = dict(base_batch_size=2)
```


xyb1314 commented Jun 21, 2024

The mask shape is required to be a 2D tensor, but what I got is a 3D tensor.

@LinxinS97

Same problem here. Maybe the transformers version is incorrect.
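If it is a version mismatch, a quick check is to print the installed transformers version. The sketch below assumes the 3D-mask incompatibility comes from transformers' newer SDPA attention-mask path; the `<4.38` bound is a guess (BERT appears to have gained SDPA support around that release), not a confirmed fix, so verify against your mmdet release's requirements:

```python
# Hedged check (assumption: the 3D-mask failure comes from transformers'
# SDPA attention path in BertModel; the version bound below is a guess).
import transformers

print(transformers.__version__)
# If the version is 4.38 or newer, one workaround to try from a shell:
#   pip install "transformers<4.38"
```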
