Results and Models (on COCO):

| Backbone | Model | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| R-50 | DETR | 150e | 7.9 | - | 40.1 | config | model \| log |
Let's start with the detector:
/mmdetection-2.28.2/mmdet/models/detectors/detr.py
def forward_train(self,
                  img,
                  img_metas,
                  gt_bboxes,
                  gt_labels,
                  gt_bboxes_ignore=None):
"""
Args:
img (Tensor): Input images of shape (N, C, H, W).
Typically these should be mean centered and std scaled.
img_metas (list[dict]): A List of image info dict where each dict
has: 'img_shape', 'scale_factor', 'flip', and may also contain
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
For details on the values of these keys see
:class:`mmdet.datasets.pipelines.Collect`.
gt_bboxes (list[Tensor]): Each item are the truth boxes for each
image in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): Class indices corresponding to each box
gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
boxes can be ignored when computing the loss.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
    super(SingleStageDetector, self).forward_train(img, img_metas)
    x = self.extract_feat(img)
    losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes,
                                          gt_labels, gt_bboxes_ignore)
    return losses
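The super(...) call is what records the padded batch shape that the head later reads from img_metas[0]['batch_input_shape']. In mmdet's BaseDetector.forward_train it boils down to roughly the following (a paraphrase, with a hypothetical function name):

import torch

# Paraphrase of what BaseDetector.forward_train does with img_metas:
# store the padded (H, W) shared by the whole batch into every img_meta.
def record_batch_input_shape(img, img_metas):
    batch_input_shape = tuple(img[0].size()[-2:])  # padded (H, W) of the batch
    for img_meta in img_metas:
        img_meta['batch_input_shape'] = batch_input_shape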
The head's forward computation lives in DETRHead (mmdet/models/dense_heads/detr_head.py); forward_single handles one feature level:
def forward_single(self, x, img_metas):  # process each feature level separately
    # construct binary masks which are used by the transformer.
    # NOTE: following the official DETR repo, non-zero values represent
    # ignored positions, while zero values mean valid positions.
    batch_size = x.size(0)  # batch size B
    input_img_h, input_img_w = img_metas[0]['batch_input_shape']  # padded shape of the input batch
    masks = x.new_ones((batch_size, input_img_h, input_img_w))  # Tensor [B, H, W], initialized to 1
    for img_id in range(batch_size):  # iterate over the images
        img_h, img_w, _ = img_metas[img_id]['img_shape']  # image size after Resize
        masks[img_id, :img_h, :img_w] = 0  # valid positions are marked 0, the rest stay 1
    x = self.input_proj(x)  # 1x1 conv to reduce the feature channels, [B, C, H, W]
    # interpolate masks to have the same spatial shape as x
    masks = F.interpolate(  # resize masks to the same scale as the features
        masks.unsqueeze(1), size=x.shape[-2:]).to(torch.bool).squeeze(1)  # Tensor [B, H, W]
    # position encoding; per the config, positional_encoding is SinePositionalEncoding
    pos_embed = self.positional_encoding(masks)  # [B, C, H, W]
    # per the config, transformer is the Transformer class
    outs_dec, _ = self.transformer(x,  # Tensor [B, C, H, W]
                                   masks,  # Tensor [B, H, W]
                                   self.query_embedding.weight,  # Tensor [num_query, C], learnable queries
                                   pos_embed)  # Tensor [B, C, H, W]
    # outs_dec: Tensor [num_layer, B, num_query, C], result of the transformer encoder and decoder
    all_cls_scores = self.fc_cls(outs_dec)  # fc_cls is a fully connected layer -> [num_layer, B, num_query, num_cls+1]
    all_bbox_preds = self.fc_reg(self.activate(
        self.reg_ffn(outs_dec))).sigmoid()  # Sigmoid(Linear(ReLU(FFN(outs_dec)))) -> [num_layer, B, num_query, 4]
    return all_cls_scores, all_bbox_preds  # [num_layer, B, num_query, num_cls+1], [num_layer, B, num_query, 4]
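To make the mask logic concrete, here is a toy shape check (pure PyTorch; all sizes are made up for illustration):

import torch
import torch.nn.functional as F

# Toy reproduction of the mask construction above (shapes only, not the real head).
batch_size, C, H, W = 2, 256, 25, 38          # backbone feature map
x = torch.randn(batch_size, C, H, W)
input_img_h, input_img_w = 800, 1216          # hypothetical batch_input_shape
img_shapes = [(800, 1216, 3), (640, 900, 3)]  # per-image 'img_shape' after Resize

masks = x.new_ones((batch_size, input_img_h, input_img_w))
for img_id in range(batch_size):
    img_h, img_w, _ = img_shapes[img_id]
    masks[img_id, :img_h, :img_w] = 0         # 0 = valid pixel, 1 = padding

masks = F.interpolate(
    masks.unsqueeze(1), size=x.shape[-2:]).to(torch.bool).squeeze(1)
print(masks.shape)  # torch.Size([2, 25, 38]), same spatial size as x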
Next, the backbone.
The stem = 7x7 conv + BN + ReLU + max-pool. This part only extracts low-level image features, so its weights are usually frozen.
backbone=dict(
    type='ResNet',
    depth=50,
    num_stages=4,  # must satisfy 1 <= num_stages <= 4
    # indices of the stages to output: (0, 1, 2, 3) returns all four stage
    # outputs, (3, ) returns only the 4th stage.
    # the corresponding strides are (4, 8, 16, 32) and channels (256, 512, 1024, 2048)
    out_indices=(3, ),
    frozen_stages=1,
    norm_cfg=dict(type='BN', requires_grad=False),
    norm_eval=True,
    style='pytorch',
    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
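With frozen_stages=1, both the stem and the first residual stage are frozen. A simplified paraphrase of mmdet's ResNet._freeze_stages (the real method also handles the deep-stem variant; here `resnet` is assumed to be a torchvision-style ResNet with conv1/bn1 and layer1..layer4):

# Freeze the stem and the first `frozen_stages` residual stages:
# switch them to eval mode and disable their gradients.
def freeze_stages(resnet, frozen_stages=1):
    if frozen_stages >= 0:
        resnet.bn1.eval()                      # keep stem BN statistics fixed
        for m in [resnet.conv1, resnet.bn1]:
            for param in m.parameters():
                param.requires_grad = False    # no gradient updates for the stem
    for i in range(1, frozen_stages + 1):
        layer = getattr(resnet, f'layer{i}')   # residual stage i
        layer.eval()
        for param in layer.parameters():
            param.requires_grad = False

Separately, norm_eval=True keeps all BN layers in eval mode during training, so their running statistics are never updated.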
In ResNet, depth selects the block type and per-stage block counts via arch_settings:

arch_settings = {
    18: (BasicBlock, (2, 2, 2, 2)),
    34: (BasicBlock, (3, 4, 6, 3)),
    50: (Bottleneck, (3, 4, 6, 3)),  # ResNet-50
    101: (Bottleneck, (3, 4, 23, 3)),
    152: (Bottleneck, (3, 8, 36, 3))
}
# BasicBlock and Bottleneck are standalone classes; think of them as building-block
# modules (see the sketch below).
# Reference: https://blog.csdn.net/weixin_47691066/article/details/126032709
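For reference, a minimal torchvision-style sketch of the Bottleneck block used for depths >= 50 (1x1 reduce, 3x3, 1x1 expand, expansion=4); the mmdet version adds norm_cfg/style/plugin handling on top:

import torch.nn as nn

class Bottleneck(nn.Module):
    expansion = 4  # output channels = planes * 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)          # 1x1 reduce
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, 3, stride=stride,
                               padding=1, bias=False)                    # 3x3
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1,
                               bias=False)                               # 1x1 expand
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample  # projects the identity when shapes differ

    def forward(self, x):
        identity = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        return self.relu(out + identity)  # residual connection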
Next, the bbox_head.
1. The Transformer encoder first reduces the channel dimension of the input feature map and flattens it, then feeds it, together with the spatial positional encoding, through a stack of encoder layers (self-attention, normalization, FFN).
2. The Transformer decoder then decodes the output sequence from the encoder memory in parallel (rather than one element at a time, as in machine translation). Unlike a conventional autoregressive mechanism, each decoder layer decodes all N objects at once. Because the decoder is permutation-invariant (shuffling the inputs does not change the set of results), the N input embeddings must differ from one another to produce different outputs; besides each pixel's own content, positional information also matters. So, borrowing from NLP, positional encodings are added, and added at every layer. The authors go to great lengths to handle position; whenever a transformer consumes image-like input, position must be handled carefully (a small demo follows below).
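A tiny demo of the permutation point (pure PyTorch; shapes chosen arbitrarily): if all queries were identical, every decoded output would be identical too, which is why the N query embeddings must be distinct:

import torch
import torch.nn as nn

# Attention output per query depends only on that query and the memory,
# so identical queries yield identical outputs.
attn = nn.MultiheadAttention(embed_dim=256, num_heads=8)
memory = torch.randn(100, 1, 256)       # stand-in encoder output [N, B, C]
same_queries = torch.zeros(5, 1, 256)   # five identical queries
out, _ = attn(same_queries, memory, memory)
print(torch.allclose(out[0], out[1]))   # True: all five outputs are the same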
The bbox_head config:

bbox_head=dict(
    type='DETRHead',
    num_classes=80,
    in_channels=2048,
    # the transformer module
    transformer=dict(
        type='Transformer',
        encoder=dict(
            type='DetrTransformerEncoder',  # encoder
            num_layers=6,  # six layers in total
            transformerlayers=dict(
                type='BaseTransformerLayer',
                attn_cfgs=[
                    dict(
                        type='MultiheadAttention',  # multi-head attention
                        embed_dims=256,  # embedding dimension 256
                        num_heads=8,  # 8 heads
                        dropout=0.1)  # dropout rate
                ],
                feedforward_channels=2048,  # FFN hidden channels 2048
                ffn_dropout=0.1,
                operation_order=('self_attn', 'norm', 'ffn', 'norm'))),  # order: self-attention, norm, FFN, norm
        decoder=dict(
            type='DetrTransformerDecoder',  # decoder
            return_intermediate=True,
            num_layers=6,
            transformerlayers=dict(
                type='DetrTransformerDecoderLayer',
                attn_cfgs=dict(
                    type='MultiheadAttention',  # multi-head attention
                    embed_dims=256,
                    num_heads=8,
                    dropout=0.1),
                feedforward_channels=2048,
                ffn_dropout=0.1,
                operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                 'ffn', 'norm')),  # self-attention, cross-attention, FFN
        )),
The final detections (N of them) are decoded independently by parameter-shared FFNs, each consisting of a 3-layer perceptron with ReLU activation and d-dimensional hidden layers plus a linear projection layer: the FFN predicts the normalized center coordinates, height, and width of the box w.r.t. the input image, while the linear layer predicts the class label via softmax. A sketch of these heads follows.
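A hedged, plain-PyTorch sketch of the two prediction heads, matching the fc_cls / reg_ffn / fc_reg calls seen in forward_single (the real reg_ffn is mmcv's FFN module; the nn.Sequential here is a stand-in, and 256/100/80 are the config's values):

import torch
import torch.nn as nn

embed_dims, num_query, num_classes = 256, 100, 80
fc_cls = nn.Linear(embed_dims, num_classes + 1)   # +1 for the "no object" class
reg_ffn = nn.Sequential(                          # stand-in for mmcv's FFN
    nn.Linear(embed_dims, embed_dims), nn.ReLU(inplace=True),
    nn.Linear(embed_dims, embed_dims))
fc_reg = nn.Linear(embed_dims, 4)                 # normalized (cx, cy, w, h)

outs_dec = torch.randn(6, 2, num_query, embed_dims)  # [num_layer, B, num_query, C]
all_cls_scores = fc_cls(outs_dec)                    # [6, 2, 100, 81]
all_bbox_preds = fc_reg(torch.relu(reg_ffn(outs_dec))).sigmoid()  # [6, 2, 100, 4]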
The Transformer class lives in mmdet/models/utils/transformer.py:
class Transformer(BaseModule):
    """
    Following the official DETR implementation, this module is copy-pasted
    from torch.nn.Transformer with modifications:

    * positional encodings are passed in MultiheadAttention
    * extra LN at the end of encoder is removed
    * decoder returns a stack of activations from all decoding layers
    """

    def __init__(self, encoder=None, decoder=None, init_cfg=None):
        super(Transformer, self).__init__(init_cfg=init_cfg)
        self.encoder = build_transformer_layer_sequence(encoder)
        self.decoder = build_transformer_layer_sequence(decoder)
        self.embed_dims = self.encoder.embed_dims

    def forward(self, x, mask, query_embed, pos_embed):
        bs, c, h, w = x.shape
        # use `view` instead of `flatten` for dynamically exporting to ONNX
        x = x.view(bs, c, -1).permute(2, 0, 1)  # [B, C, H, W] -> [N, B, C], N = H*W
        pos_embed = pos_embed.view(bs, c, -1).permute(2, 0, 1)  # [B, C, H, W] -> [N, B, C]
        query_embed = query_embed.unsqueeze(1).repeat(  # replicate the learnable queries across the batch
            1, bs, 1)  # [num_query, C] -> [num_query, B, C]
        mask = mask.view(bs, -1)  # [B, H, W] -> [B, N]
        memory = self.encoder(  # per the config, encoder is DetrTransformerEncoder
            query=x,  # [N, B, C]
            key=None,
            value=None,
            query_pos=pos_embed,  # [N, B, C]
            query_key_padding_mask=mask)  # [B, N]
        # memory: Tensor [N, B, C]
        target = torch.zeros_like(query_embed)  # [num_query, B, C], all zeros
        out_dec = self.decoder(  # per the config, decoder is DetrTransformerDecoder
            query=target,
            key=memory,
            value=memory,
            key_pos=pos_embed,
            query_pos=query_embed,
            key_padding_mask=mask)
        # out_dec: Tensor [num_layer, num_query, B, C]
        out_dec = out_dec.transpose(1, 2)  # -> [num_layer, B, num_query, C]
        memory = memory.permute(1, 2, 0).reshape(bs, c, h, w)  # back to [B, C, H, W]
        return out_dec, memory
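A quick shape check of the flatten/permute round trip in Transformer.forward (toy sizes):

import torch

bs, c, h, w = 2, 256, 25, 38
x = torch.randn(bs, c, h, w)
x = x.view(bs, c, -1).permute(2, 0, 1)            # [B, C, H, W] -> [H*W, B, C]
print(x.shape)                                    # torch.Size([950, 2, 256])
memory = x.permute(1, 2, 0).reshape(bs, c, h, w)  # back to [B, C, H, W]
print(memory.shape)                               # torch.Size([2, 256, 25, 38])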
class DetrTransformerEncoder(TransformerLayerSequence):

    def __init__(self, *args, post_norm_cfg=dict(type='LN'), **kwargs):
        super(DetrTransformerEncoder, self).__init__(*args, **kwargs)
        if post_norm_cfg is not None:
            self.post_norm = build_norm_layer(
                post_norm_cfg, self.embed_dims)[1] if self.pre_norm else None
        else:
            assert not self.pre_norm, f'Use prenorm in ' \
                                      f'{self.__class__.__name__},' \
                                      f'Please specify post_norm_cfg'
            self.post_norm = None

    def forward(self, *args, **kwargs):
        # calls the forward method of the parent class TransformerLayerSequence,
        # which lives in mmcv (mmcv.cnn.bricks.transformer); not analyzed here
        x = super(DetrTransformerEncoder, self).forward(*args, **kwargs)
        if self.post_norm is not None:  # not taken for DETR: pre_norm is False, so post_norm is None
            x = self.post_norm(x)
        return x  # Tensor [N, B, C]
class DetrTransformerDecoder(TransformerLayerSequence):

    def __init__(self,
                 *args,
                 post_norm_cfg=dict(type='LN'),
                 return_intermediate=False,
                 **kwargs):
        super(DetrTransformerDecoder, self).__init__(*args, **kwargs)
        self.return_intermediate = return_intermediate
        if post_norm_cfg is not None:
            self.post_norm = build_norm_layer(post_norm_cfg,
                                              self.embed_dims)[1]
        else:
            self.post_norm = None

    def forward(self, query, *args, **kwargs):
        if not self.return_intermediate:  # not taken for DETR (return_intermediate=True)
            x = super().forward(query, *args, **kwargs)
            if self.post_norm:
                x = self.post_norm(x)[None]
            return x

        intermediate = []  # stores each layer's output, [num_query, B, C]
        for layer in self.layers:
            # iterate over the decoder layers; per the config each layer is a
            # DetrTransformerDecoderLayer, whose forward is actually the one
            # inherited from BaseTransformerLayer (wrapped in mmcv, not analyzed here)
            query = layer(query, *args, **kwargs)
            if self.return_intermediate:  # True
                if self.post_norm is not None:  # True
                    intermediate.append(self.post_norm(query))
                else:
                    intermediate.append(query)
        return torch.stack(intermediate)  # stack all layers' outputs -> [num_layer, num_query, B, C]
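return_intermediate=True is what enables DETR's auxiliary losses: every decoder layer's predictions are supervised, not just the last layer's. A hedged sketch of the idea, where loss_single is a hypothetical per-layer loss function (not mmdet's actual API):

# One prediction set per decoder layer; attach a loss to each
# (deep supervision), keying auxiliary layers as 'd{i}.'.
def loss_all_layers(all_cls_scores, all_bbox_preds, targets, loss_single):
    losses = {}
    num_layers = all_cls_scores.size(0)
    for i in range(num_layers):
        loss_cls, loss_bbox = loss_single(all_cls_scores[i], all_bbox_preds[i], targets)
        prefix = '' if i == num_layers - 1 else f'd{i}.'  # last layer is the main loss
        losses[prefix + 'loss_cls'] = loss_cls
        losses[prefix + 'loss_bbox'] = loss_bbox
    return losses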
class DetrTransformerDecoderLayer(BaseTransformerLayer):

    def __init__(self,
                 attn_cfgs,
                 feedforward_channels,
                 ffn_dropout=0.0,
                 operation_order=None,
                 act_cfg=dict(type='ReLU', inplace=True),
                 norm_cfg=dict(type='LN'),
                 ffn_num_fcs=2,
                 **kwargs):
        super(DetrTransformerDecoderLayer, self).__init__(
            attn_cfgs=attn_cfgs,
            feedforward_channels=feedforward_channels,
            ffn_dropout=ffn_dropout,
            operation_order=operation_order,
            act_cfg=act_cfg,
            norm_cfg=norm_cfg,
            ffn_num_fcs=ffn_num_fcs,
            **kwargs)
        assert len(operation_order) == 6
        assert set(operation_order) == set(
            ['self_attn', 'norm', 'cross_attn', 'ffn'])
Now the positional_encoding.
The SinePositionalEncoding class lives in mmdet/models/utils/positional_encoding.py. The config:

positional_encoding=dict(
    type='SinePositionalEncoding', num_feats=128, normalize=True),
class SinePositionalEncoding(BaseModule):

    def __init__(self,
                 num_feats,
                 temperature=10000,
                 normalize=False,
                 scale=2 * math.pi,
                 eps=1e-6,
                 offset=0.,
                 init_cfg=None):
        super(SinePositionalEncoding, self).__init__(init_cfg)
        if normalize:
            assert isinstance(scale, (float, int)), 'when normalize is set,' \
                'scale should be provided and in float or int type, ' \
                f'found {type(scale)}'
        self.num_feats = num_feats  # half of the feature channel dimension (128 = 256/2)
        self.temperature = temperature
        self.normalize = normalize  # True per the config
        self.scale = scale  # defaults to 2*pi
        self.eps = eps
        self.offset = offset  # defaults to 0

    def forward(self, mask):
        # For convenience of exporting to ONNX, it's required to convert
        # `masks` from bool to int.
        mask = mask.to(torch.int)  # Bool -> Int, [B, H, W]
        not_mask = 1 - mask  # logical_not: valid positions become 1, padding becomes 0
        # see https://pytorch.org/docs/stable/generated/torch.cumsum.html for cumsum
        x_embed = not_mask.cumsum(2, dtype=torch.float32)  # cumulative sum along the width (x) direction
        y_embed = not_mask.cumsum(1, dtype=torch.float32)  # cumulative sum along the height (y) direction
        # optionally normalize; y_embed[:, -1:, :] is the maximum along the h direction
        if self.normalize:  # True here: normalize, then scale by 2*pi
            y_embed = (y_embed + self.offset) / (y_embed[:, -1:, :] + self.eps) * self.scale
            x_embed = (x_embed + self.offset) / (x_embed[:, :, -1:] + self.eps) * self.scale
        # see the positional-encoding formula in the Transformer paper
        dim_t = torch.arange(
            self.num_feats, dtype=torch.float32, device=mask.device)  # [0, 1, ..., 127]
        dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats)  # 10000^(2i/num_feats)
        pos_x = x_embed[:, :, :, None] / dim_t  # [B, H, W, 1] -> [B, H, W, 128]
        pos_y = y_embed[:, :, :, None] / dim_t  # [B, H, W, 1] -> [B, H, W, 128]
        # use `view` instead of `flatten` for dynamically exporting to ONNX
        B, H, W = mask.size()
        pos_x = torch.stack(
            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
            dim=4).view(B, H, W, -1)  # sin on even dims, cos on odd dims: [B, H, W, 64, 2] -> [B, H, W, 128]
        pos_y = torch.stack(
            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
            dim=4).view(B, H, W, -1)  # [B, H, W, 128]
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)  # [B, H, W, 256] -> [B, 256, H, W]
        return pos
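A hypothetical usage of the class above (assuming it is importable together with its BaseModule dependency): with num_feats=128 the output has 256 channels, matching the transformer's embed_dims:

import torch

pos_enc = SinePositionalEncoding(num_feats=128, normalize=True)
mask = torch.ones(2, 25, 38, dtype=torch.bool)  # True = padding
mask[0] = False                # image 0 fills the whole padded canvas
mask[1, :20, :30] = False      # image 1 is smaller; the rest is padding
pos = pos_enc(mask)
print(pos.shape)               # torch.Size([2, 256, 25, 38])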
Loss function and assigner
Both are covered in detail at https://zhuanlan.zhihu.com/p/572772363?utm_id=0. In short, DETR's assigner performs bipartite (Hungarian) matching between the N predictions and the ground-truth boxes; a minimal sketch follows.
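A minimal sketch of the matching step with scipy (the cost terms and weights here are illustrative stand-ins; the real HungarianAssigner combines classification, L1, and GIoU costs):

import torch
from scipy.optimize import linear_sum_assignment

# Build a [num_query, num_gt] cost matrix, then solve for the
# minimum-cost one-to-one assignment of queries to ground-truth boxes.
num_query, num_gt = 100, 4
cls_cost = torch.rand(num_query, num_gt)   # stand-in classification cost
l1_cost = torch.rand(num_query, num_gt)    # stand-in box L1 cost
cost = cls_cost + 5.0 * l1_cost            # weighted sum (weights illustrative)
row_ind, col_ind = linear_sum_assignment(cost.numpy())
# row_ind[i] is the query matched to ground-truth box col_ind[i];
# all unmatched queries are assigned the "no object" class.
print(list(zip(row_ind.tolist(), col_ind.tolist())))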