Understanding the Denoising Operation
DN-DETR adds a denoising operation that speeds up convergence and improves both the efficiency and accuracy of object detection. In this way DN-DETR overcomes some limitations of the original DETR, giving it better performance and robustness in practice.
The GT boxes are perturbed by random offsets scaled by the box size (H, L, W); when both the x and y offsets of the center exceed L/2 and W/2, the label is set to num_classes, i.e. the box becomes a negative sample. The GT boxes are repeated n times (here n = 10), giving
padding_size = n (number of repeats) * gt_boxes_num (number of GT boxes) + num_queries (number of queries)
and an attn_mask of shape (padding_size, padding_size). In the figure below, black is False, white is True, and the red box marks the newly added mask. This is the classic DN-DETR mask implementation.
In Sparse4D the mask has evolved somewhat: on the left is the classic dn mask, on the right the Sparse4D mask. The two blocks on the right are each matched independently and masked off from each other, so the two parts do not interfere.
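For intuition, here is a minimal sketch of how the classic DN attention mask could be constructed; num_groups, gt_num and num_queries are illustrative values, not taken from any particular repo:

import torch

num_groups, gt_num, num_queries = 10, 5, 900
pad_size = num_groups * gt_num                 # dn queries sit in front of the matching queries
tgt_size = pad_size + num_queries
attn_mask = torch.zeros(tgt_size, tgt_size, dtype=torch.bool)  # True = attention blocked
# matching queries must never see dn queries, or GT information would leak
attn_mask[pad_size:, :pad_size] = True
for g in range(num_groups):
    s, e = g * gt_num, (g + 1) * gt_num
    # each dn group may only see itself among the dn queries
    attn_mask[s:e, :s] = True
    attn_mask[s:e, e:pad_size] = True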
Traditional DN part
targets = [torch.cat((img_meta['gt_bboxes_3d']._data.gravity_center, img_meta['gt_bboxes_3d']._data.tensor[:, 3:]),dim=1) for img_meta in img_metas ]
labels = [img_meta['gt_labels_3d']._data for img_meta in img_metas ]
known = [(torch.ones_like(t)).cuda() for t in labels]
know_idx = known
unmask_bbox = unmask_label = torch.cat(known)
# number of GT boxes per sample in the batch
known_num = [t.size(0) for t in targets]
labels = torch.cat([t for t in labels])    # concatenate labels across the batch
boxes = torch.cat([t for t in targets])    # concatenate boxes across the batch
batch_idx = torch.cat([torch.full((t.size(0), ), i) for i, t in enumerate(targets)])
known_indice = torch.nonzero(unmask_label + unmask_bbox)
known_indice = known_indice.view(-1)
# add noise
total_raydn_num = self.raydn_num * self.raydn_group
known_indice = known_indice.repeat(self.scalar+total_raydn_num, 1).view(-1)
known_labels = labels.repeat(self.scalar, 1).view(-1).long().to(reference_points.device)
known_bid = batch_idx.repeat(self.scalar+total_raydn_num, 1).view(-1)
known_bboxs = boxes.repeat(self.scalar, 1).to(reference_points.device)
known_bbox_center = known_bboxs[:, :3].clone()   # 3D object centers in VCS
known_bbox_scale = known_bboxs[:, 3:6].clone()   # 3D object sizes (w, l, h)
if self.bbox_noise_scale > 0:
    diff = known_bbox_scale / 2 + self.bbox_noise_trans  # offset range derived from the box size (w, l, h)
    rand_prob = torch.rand_like(known_bbox_center) * 2 - 1.0
    known_bbox_center += torch.mul(rand_prob, diff) * self.bbox_noise_scale  # shift the centers
    # normalize to [0, 1]
    known_bbox_center[..., 0:3] = (known_bbox_center[..., 0:3] - self.pc_range[0:3]) / (self.pc_range[3:6] - self.pc_range[0:3])
    # clamp away points that left the valid range
    known_bbox_center = known_bbox_center.clamp(min=0.0, max=1.0)
    mask = torch.norm(rand_prob, 2, 1) > self.split  # pick out the heavily shifted samples
    known_labels[mask] = self.num_classes  # heavily shifted boxes are labeled num_classes, i.e. negatives
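As a quick sanity check of that positive/negative split (the split value here is illustrative):

import torch

rand_prob = torch.rand(4, 3) * 2 - 1.0      # one noise direction per box, in [-1, 1]^3
split = 0.75                                # illustrative threshold
neg = torch.norm(rand_prob, 2, 1) > split   # boxes with large L2 noise become negatives
print(neg.tolist())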
For the loss computation: the num_queries outputs go through Hungarian matching to produce the match loss, while the dn outputs skip Hungarian matching and are supervised directly against the GT they were noised from, giving the dn loss. Per training step:
loss = base loss + match loss + dn loss
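A rough sketch of that split (shapes and the box dimensionality are illustrative, and the Hungarian-matched branch is only indicated):

import torch
import torch.nn.functional as F

bs, pad_size, num_queries, box_dim = 2, 50, 900, 10
pred_boxes = torch.rand(bs, pad_size + num_queries, box_dim)
dn_targets = torch.rand(bs, pad_size, box_dim)   # targets of the dn queries, known by construction

dn_pred = pred_boxes[:, :pad_size]
dn_loss = F.l1_loss(dn_pred, dn_targets)         # dn part: supervised directly, no matching step
match_pred = pred_boxes[:, pad_size:]
# match_loss = hungarian_match_and_loss(match_pred, gt)  # matching part, as in any DETR-style head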
RayDN: suppressing false detections along the same camera ray
Code: https://github.com/LiewFeng/RayDN
Paper: https://arxiv.org/abs/2402.03634
How it works
Transform the 3D BEV coordinates into the camera coordinate system, add random noise to the Z (depth) axis there, then project the result back into the 3D BEV coordinate system, as sketched below.
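A toy round trip for a single center point under this scheme; the 4x4 lidar2img matrix below is a random invertible stand-in, not real calibration:

import torch

lidar2img = torch.eye(4) + 0.01 * torch.randn(4, 4)       # stand-in calibration matrix
center = torch.tensor([10.0, 2.0, 1.0])                   # (x, y, z) in VCS/LiDAR coords
p = lidar2img @ torch.cat([center, center.new_ones(1)])   # homogeneous image-frame point
p[:2] = p[:2] / p[2]                                      # perspective divide -> pixel (u, v)
p[2] = p[2] + 3.0                                         # noise the depth only: the point stays on the same camera ray
p[:2] = p[:2] * p[2]                                      # re-scale (u, v) by the new depth
noised_center = (lidar2img.inverse() @ p)[:3]             # back to VCS: a same-ray neighbor of the GT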
for g_id in range(self.raydn_group):
raydn_known_labels = labels.repeat(self.raydn_num, 1).view(-1).long().to(reference_points.device)
raydn_known_bboxs = boxes.repeat(self.raydn_num, 1).to(reference_points.device)
raydn_known_bbox_center = raydn_known_bboxs[:, :3].clone()
raydn_known_bbox_scale = raydn_known_bboxs[:, 3:6].clone()
    noise_scale = raydn_known_bbox_scale[:, :].mean(dim=-1) / 2  # depth-noise scale: half the mean box dimension
    noise_step = (self.raydn_sampler.sample([noise_scale.shape[0]]).to(reference_points.device) * 2 - 1.0) * self.raydn_radius  # signed noise step, scaled by raydn_radius
noise_scale = noise_scale.view(self.raydn_num, -1)
noise_step = noise_step.view(self.raydn_num, -1)
    # per GT box, keep the sample with the smallest |noise| as the positive candidate
    min_value, min_index = noise_step.abs().min(dim=0)
    # if even the smallest noise exceeds split, resample it uniformly inside [-split, split]
    reset_mask = min_value.abs() > self.split
    reset_value = (torch.rand(reset_mask.sum()).to(reference_points.device) * 2 - 1) * self.split
    min_value[reset_mask] = reset_value
    noise_step.scatter_(0, min_index.unsqueeze(0), min_value.unsqueeze(0))
    # every sample except the per-box minimum-noise one becomes a negative
    mask = torch.zeros_like(noise_step)
    mask.scatter_(0, min_index.unsqueeze(0), 1)
    mask = mask < 1
    mask = mask.view(-1)
    raydn_known_labels[mask] = self.num_classes
raydn_known_bbox_center = raydn_known_bbox_center.view(self.raydn_num, -1, 3)
ori_raydn_known_bbox_center = raydn_known_bbox_center.clone()
for view_id in range(data['lidar2img'].shape[1]):
raydn_known_bbox_center_copy = torch.cat([ori_raydn_known_bbox_center.clone(), ori_raydn_known_bbox_center.new_ones((ori_raydn_known_bbox_center.shape[0], ori_raydn_known_bbox_center.shape[1], 1))], dim=-1)
tmp_p = raydn_known_bbox_center_copy.new_zeros(raydn_known_bbox_center_copy.shape)
        # transform from VCS (LiDAR) coordinates into each camera's image frame
for batch_id in range(data['lidar2img'].shape[0]):
tmp_p[:, sum(known_num[:batch_id]): sum(known_num[:batch_id+1])] = (data['lidar2img'][batch_id][view_id] @ raydn_known_bbox_center_copy[:, sum(known_num[:batch_id]): sum(known_num[:batch_id+1])].permute(0, 2, 1)).permute(0, 2, 1)
        # mask for points in front of the camera (z > 0)
        z_mask = tmp_p[..., 2] > 0  # depth > 0
        # perspective divide by z (signed epsilon avoids division by zero)
        tmp_p[..., :2] = tmp_p[..., :2] / (tmp_p[..., 2:3] + z_mask.unsqueeze(-1) * 1e-6 - (~z_mask).unsqueeze(-1) * 1e-6)
        pad_h, pad_w = img_metas[0]['pad_shape'][0][:2]  # e.g. (320, 800) or (640, 1600)
hw_mask = (
(tmp_p[..., 0] < pad_w)
& (tmp_p[..., 0] >= 0)
& (tmp_p[..., 1] < pad_h)
& (tmp_p[..., 1] >= 0)
        )  # 0 <= u < pad_w and 0 <= v < pad_h
valid_mask = torch.logical_and(hw_mask, z_mask)
        # add the noise offset along the depth (z) axis
        tmp_p[..., 2] += noise_scale * noise_step
        tmp_p[..., :2] = tmp_p[..., :2] * tmp_p[..., 2:3]  # undo the perspective divide with the noised depth
proj_back = raydn_known_bbox_center_copy.new_zeros(raydn_known_bbox_center_copy.shape)
        # project back from the camera frame to VCS; this yields many points along the same camera
        # ray, and the heavily noised ones become negatives, i.e. many same-ray objects are
        # synthesized on purpose
for batch_id in range(data['lidar2img'].shape[0]):
proj_back[:, sum(known_num[:batch_id]): sum(known_num[:batch_id+1])] = (data['lidar2img'][batch_id][view_id].inverse() @ tmp_p[:, sum(known_num[:batch_id]): sum(known_num[:batch_id+1])].permute(0, 2, 1)).permute(0, 2, 1)
raydn_known_bbox_center[valid_mask.unsqueeze(-1).repeat(1, 1, 3)] = proj_back[..., :3][valid_mask.unsqueeze(-1).repeat(1, 1, 3)]
raydn_known_bbox_center = raydn_known_bbox_center.view(-1, 3)
    # normalize to [0, 1]
raydn_known_bbox_center[..., 0:3] = (raydn_known_bbox_center[..., 0:3] - self.pc_range[0:3]) / (self.pc_range[3:6] - self.pc_range[0:3])
raydn_known_bbox_center = raydn_known_bbox_center.clamp(min=0.0, max=1.0)
known_labels = torch.cat([known_labels, raydn_known_labels], dim=0)
known_bbox_center = torch.cat([known_bbox_center, raydn_known_bbox_center], dim=0)
Tested it myself; it works.