Using Grounding-DINO to train on a COCO dataset, error below #11808

Open
xyb1314 opened this issue Jun 21, 2024 · 3 comments

xyb1314 commented Jun 21, 2024

Error traceback:

```
File "C:\Users\Administrator\miniconda3\envs\mmdet\lib\site-packages\transformers\modeling_attn_mask_utils.py", line 439, in _prepare_4d_attention_mask_for_sdpa
    batch_size, key_value_length = mask.shape
ValueError: too many values to unpack (expected 2)
```
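For context, the failing line unpacks `mask.shape` into exactly two values, so the helper only accepts a 2D `(batch_size, key_value_length)` mask and any extra dimension triggers this error. A minimal sketch (standalone, not mmdet code) reproducing the failure:

```python
# Minimal sketch of the failure mode (not mmdet code): transformers'
# _prepare_4d_attention_mask_for_sdpa unpacks mask.shape into exactly two
# values, so it only accepts a 2D (batch_size, key_value_length) mask.
import torch

mask_2d = torch.ones(2, 256)                  # (batch_size, key_value_length)
batch_size, key_value_length = mask_2d.shape  # fine: two dims, two targets

mask_3d = torch.ones(2, 256, 256)             # an extended / 3D attention mask
batch_size, key_value_length = mask_3d.shape  # ValueError: too many values to unpack (expected 2)
```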


xyb1314 commented Jun 21, 2024

Config file:

```python
_base_ = [
    '../_base_/datasets/opixray_detection.py',
    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
]
load_from = 'C:/Users/Administrator/Desktop/mmdetection-main_1/model/groundingdino_swint_ogc_mmdet-822d7e9d.pth'  # noqa
lang_model_name = 'C:/Users/Administrator/Desktop/mmdetection-main_1/model/bert-base-uncased'
num_levels = 5
model = dict(
    type='GroundingDINO',
    num_queries=900,
    with_box_refine=True,
    as_two_stage=True,
    # num_feature_levels=num_levels,
    data_preprocessor=dict(
        type='DetDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
        pad_mask=False,
    ),
    language_model=dict(
        type='BertModel',
        name=lang_model_name,
        pad_to_max=False,
        use_sub_sentence_represent=True,
        special_tokens_list=['[CLS]', '[SEP]', '.', '?'],
        add_pooling_layer=False,
    ),
    backbone=dict(
        type='SwinTransformer',
        embed_dims=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        patch_norm=True,
        out_indices=(1, 2, 3),
        with_cp=True,
        convert_weights=False),
    neck=dict(
        type='ChannelMapper',
        in_channels=[192, 384, 768],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        bias=True,
        norm_cfg=dict(type='GN', num_groups=32),
        num_outs=4),
    encoder=dict(
        num_layers=6,
        num_cp=6,
        # visual layer config
        layer_cfg=dict(
            self_attn_cfg=dict(embed_dims=256, num_levels=4, dropout=0.0),
            ffn_cfg=dict(
                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
        # text layer config
        text_layer_cfg=dict(
            self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0),
            ffn_cfg=dict(
                embed_dims=256, feedforward_channels=1024, ffn_drop=0.0)),
        # fusion layer config
        fusion_layer_cfg=dict(
            v_dim=256,
            l_dim=256,
            embed_dim=1024,
            num_heads=4,
            init_values=1e-4),
    ),
    decoder=dict(
        num_layers=6,
        return_intermediate=True,
        layer_cfg=dict(
            # query self attention layer
            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
            # cross attention layer query to text
            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
            # cross attention layer query to image
            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
            ffn_cfg=dict(
                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
        post_norm_cfg=None),
    positional_encoding=dict(
        num_feats=128, normalize=True, offset=0.0, temperature=20),
    bbox_head=dict(
        type='GroundingDINOHead',
        num_classes=80,
        sync_cls_avg_factor=True,
        contrastive_cfg=dict(max_text_len=256, log_scale=0.0, bias=False),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),  # 2.0 in DeformDETR
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    dn_cfg=dict(  # TODO: Move to model.train_cfg ?
        label_noise_scale=0.5,
        box_noise_scale=1.0,  # 0.4 for DN-DETR
        group_cfg=dict(dynamic=True, num_groups=None,
                       num_dn_queries=100)),  # TODO: half num_dn_queries
    # training and testing settings
    train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            match_costs=[
                dict(type='BinaryFocalLossCost', weight=2.0),
                dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
                dict(type='IoUCost', iou_mode='giou', weight=2.0)
            ])),
    test_cfg=dict(max_per_img=300))

# dataset settings
train_pipeline = [
    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandomFlip', prob=0.5),
    dict(
        type='RandomChoice',
        # transforms=[
        #     [
        #         dict(
        #             type='RandomChoiceResize',
        #             scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
        #                     (608, 1333), (640, 1333), (672, 1333), (704, 1333),
        #                     (736, 1333), (768, 1333), (800, 1333)],
        #             keep_ratio=True)
        #     ],
        #     [
        #         dict(
        #             type='RandomChoiceResize',
        #             # The ratio of all images in the train dataset is < 7,
        #             # following the original implementation
        #             scales=[(400, 4200), (500, 4200), (600, 4200)],
        #             keep_ratio=True),
        #         dict(
        #             type='RandomCrop',
        #             crop_type='absolute_range',
        #             crop_size=(384, 600),
        #             allow_negative_crop=True),
        #         dict(
        #             type='RandomChoiceResize',
        #             scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
        #                     (608, 1333), (640, 1333), (672, 1333), (704, 1333),
        #                     (736, 1333), (768, 1333), (800, 1333)],
        #             keep_ratio=True)
        #     ]
        # ]
        transforms=[
            # Option 1: a single random resize (keep ratio)
            [
                dict(
                    type='RandomChoiceResize',
                    # scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
                    #         (608, 1333), (640, 1333), (672, 1333), (704, 1333),
                    #         (736, 1333), (768, 1333), (800, 1333)],
                    scales=[(192, 320), (205, 320), (218, 320), (230, 320),
                            (243, 320), (256, 320), (269, 320), (282, 320),
                            (294, 320), (307, 320), (320, 320)],
                    keep_ratio=True)
            ],
            # Option 2: random resize, then random crop, then random resize again
            [
                dict(
                    type='RandomChoiceResize',
                    # The ratio of all images in the train dataset is < 7,
                    # following the original implementation
                    # scales=[(400, 4200), (500, 4200), (600, 4200)],
                    scales=[(160, 1008), (200, 1008), (240, 1008)],
                    keep_ratio=True),
                dict(
                    type='RandomCrop',
                    crop_type='absolute_range',
                    # crop_size=(384, 600),
                    crop_size=(150, 200),  # 320/800*384, 600/1333*320
                    allow_negative_crop=True),
                dict(
                    type='RandomChoiceResize',
                    # scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
                    #         (608, 1333), (640, 1333), (672, 1333), (704, 1333),
                    #         (736, 1333), (768, 1333), (800, 1333)],
                    scales=[(192, 320), (205, 320), (218, 320), (230, 320),
                            (243, 320), (256, 320), (269, 320), (282, 320),
                            (294, 320), (307, 320), (320, 320)],
                    keep_ratio=True)
            ]
        ]),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'flip', 'flip_direction', 'text',
                   'custom_entities'))
]

test_pipeline = [
    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
    dict(type='FixScaleResize', scale=(800, 1333), keep_ratio=True),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'text', 'custom_entities'))
]

train_dataloader = dict(
    dataset=dict(
        filter_cfg=dict(filter_empty_gt=False),
        pipeline=train_pipeline,
        return_classes=True))
val_dataloader = dict(
    dataset=dict(pipeline=test_pipeline, return_classes=True))
test_dataloader = val_dataloader

optim_wrapper = dict(
    _delete_=True,
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001),
    clip_grad=dict(max_norm=0.1, norm_type=2),
    paramwise_cfg=dict(custom_keys={
        'absolute_pos_embed': dict(decay_mult=0.),
        'backbone': dict(lr_mult=0.1)
    }))
max_epochs = 12
param_scheduler = [
    dict(
        type='MultiStepLR',
        begin=0,
        end=max_epochs,
        by_epoch=True,
        milestones=[11],
        gamma=0.1)
]

default_hooks = dict(
    checkpoint=dict(
        type='CheckpointHook',
        max_keep_ckpts=1,
        save_best='auto'))

auto_scale_lr = dict(base_batch_size=2)
```


xyb1314 commented Jun 21, 2024

The mask shape is required to be a 2D tensor, but what I got is a 3D tensor.

@LinxinS97

Same problem here. Maybe the transformers version is incorrect.
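If it is a version mismatch, a quick check is to print the installed transformers version. The sketch below assumes the 3D-mask incompatibility comes from transformers' newer SDPA attention-mask path; the `<4.38` bound is a guess (BERT appears to have gained SDPA support around that release), not a confirmed fix, so verify against your mmdet release's requirements:

```python
# Hedged check (assumption: the 3D-mask failure comes from transformers'
# SDPA attention path in BertModel; the version bound below is a guess).
import transformers

print(transformers.__version__)
# If the version is 4.38 or newer, one workaround to try from a shell:
#   pip install "transformers<4.38"
```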
