CV+Deep Learning——网络架构Pytorch复现系列——Detection(二：RetinaNet)更换backbones

上一话

CV+Deep Learning——网络架构Pytorch复现系列——Detection(一：SSD:Single Shot MultiBox Detector 4.推理Detect)：https://blog.csdn.net/XiaoyYidiaodiao/article/details/128683973?spm=1001.2014.3001.5501

复现Object Detection，会复现的网络架构有：

1.SSD: Single Shot MultiBox Detector(√)

2.RetinaNet(√)

3.Faster RCNN

4.YOLO系列

....

代码：

https://github.com/HanXiaoyiGitHub/Simple-CV-Pytorch-master

2.复现RetinaNet

之前已经讲过RetinaNet，链接如下：

目标检测——RetinaNet-Focal Loss 焦点损失函数的理解：https://blog.csdn.net/XiaoyYidiaodiao/article/details/124553661?spm=1001.2014.3001.5502

也不想做过多的讲解了，就讲讲在RetinaNet中是如何更换Backbones(将以前的ResNet更换为DarkNet)

之前ResNet骨干网络的代码

我懒得写了直接调用Pytorch包的，但是值得注意的是输出的feature map的channels可能需要修改（这里我在RetinaNet.py中进行了修改），以便与之后Neck（FPN）网络的输入channels匹配。

import torch
from torch import nn
from torchvision.models import resnet18, resnet34, resnet50, \
    resnet101, resnet152


class ResNet(nn.Module):
    """torchvision ResNet trunk that returns the C3/C4/C5 feature maps.

    The classification head (``avgpool`` and ``fc``) is deleted right after
    construction, so only the convolutional stages remain on ``self.model``.

    Args:
        resnet_type: One of ``"resnet18"``, ``"resnet34"``, ``"resnet50"``,
            ``"resnet101"`` or ``"resnet152"``.
        pretrained: Forwarded unchanged to the torchvision constructor.

    Raises:
        ValueError: If ``resnet_type`` is not one of the supported names.
    """

    def __init__(self, resnet_type="resnet50", pretrained=False):
        super(ResNet, self).__init__()
        if resnet_type == "resnet18":
            self.model = resnet18(pretrained=pretrained)
        elif resnet_type == "resnet34":
            self.model = resnet34(pretrained=pretrained)
        elif resnet_type == "resnet50":
            self.model = resnet50(pretrained=pretrained)
        elif resnet_type == "resnet101":
            self.model = resnet101(pretrained=pretrained)
        elif resnet_type == "resnet152":
            self.model = resnet152(pretrained=pretrained)
        else:
            # An unknown name used to fall through every branch and only fail
            # below with an unrelated AttributeError on ``self.model``.
            raise ValueError(
                "Unsupported resnet_type: {!r}; expected one of 'resnet18', "
                "'resnet34', 'resnet50', 'resnet101', 'resnet152'".format(resnet_type))

        # The detector only consumes intermediate feature maps, so the
        # classification head is removed.
        del self.model.fc
        del self.model.avgpool

    def forward(self, x):
        """Run the stem and the four residual stages.

        Returns:
            ``[C3, C4, C5]``: the outputs of ``layer2``, ``layer3`` and
            ``layer4`` respectively.
        """
        x = self.model.conv1(x)
        x = self.model.bn1(x)
        x = self.model.relu(x)
        x = self.model.maxpool(x)

        x = self.model.layer1(x)
        C3 = self.model.layer2(x)
        C4 = self.model.layer3(C3)
        C5 = self.model.layer4(C4)

        return [C3, C4, C5]


if __name__ == "__main__":
    # Smoke test: push a random batch through a resnet18 backbone and print
    # the shapes of the three returned feature maps.
    backbone = ResNet(resnet_type='resnet18', pretrained=True)
    x = torch.randn([16, 3, 512, 512])
    C3, C4, C5 = backbone(x)
    # NOTE(review): the channel counts previously annotated here (512 / 1024 /
    # 2048) match the expand_ratio=4 variants (resnet50/101/152) in
    # RetinaNet.py; for resnet18 that same table gives 128 / 256 / 512.
    print(C3.shape)  # torch.Size([16, 128, 64, 64])
    print(C4.shape)  # torch.Size([16, 256, 32, 32])
    print(C5.shape)  # torch.Size([16, 512, 16, 16])

DarkNet骨干网络的代码

这里更换的backbones是DarkNetTiny，DarkNet19和DarkNet53，DarkNet系列是出自YOLO系列，其中DarkNet19是来自于YOLO9000(也就是我们通常意义上的YOLOv2)[1]，DarkNet53是来自于最经典的YOLOv3[2]，而DarkNetTiny是来自YOLOv3-Tiny[2]。

import torch
import torch.nn as nn

# Names exported by `from <module> import *`: the three backbone factory
# functions defined at the bottom of this module.
__all__ = [
    'darknettiny',
    'darknet19',
    'darknet53',
]


class DarkNet(nn.Module):
    """Selects one of the DarkNet backbones by name and delegates to it.

    Args:
        darknet_type: ``'darknettiny'``, ``'darknet19'`` or ``'darknet53'``.

    Raises:
        ValueError: If ``darknet_type`` is not one of the supported names.
    """

    def __init__(self, darknet_type='darknet19'):
        super(DarkNet, self).__init__()
        self.darknet_type = darknet_type
        if darknet_type == 'darknettiny':
            self.model = darknettiny()
        elif darknet_type == 'darknet19':
            self.model = darknet19()
        elif darknet_type == 'darknet53':
            self.model = darknet53()
        else:
            # Fail at construction time; previously ``self.model`` was simply
            # left unset and the error only appeared on the first forward().
            raise ValueError(
                "Unsupported darknet_type: {!r}; expected 'darknettiny', "
                "'darknet19' or 'darknet53'".format(darknet_type))

    def forward(self, x):
        """Return whatever the wrapped backbone returns for ``x``."""
        return self.model(x)


class ActBlock(nn.Module):
    """Activation layer chosen by name.

    Args:
        act_type: ``'silu'``, ``'relu'`` or ``'leakyrelu'``; anything else
            trips the assertion below.
        inplace: Passed to the underlying activation module.
    """

    def __init__(self, act_type='leakyrelu', inplace=True):
        super(ActBlock, self).__init__()
        assert act_type in ['silu', 'relu', 'leakyrelu'], \
            "Unsupported activation function!"
        if act_type == 'leakyrelu':
            # LeakyReLU is built with a negative slope of 0.1.
            self.act = nn.LeakyReLU(0.1, inplace=inplace)
        elif act_type == 'relu':
            self.act = nn.ReLU(inplace=inplace)
        elif act_type == 'silu':
            self.act = nn.SiLU(inplace=inplace)

    def forward(self, x):
        """Apply the selected activation to ``x``."""
        return self.act(x)


class ConvBlock(nn.Module):
    """Conv2d, optional BatchNorm2d and optional activation in one Sequential.

    The three slots of ``self.layer`` are always present; a disabled slot is
    filled with an empty ``nn.Sequential()`` which passes its input through.

    Args:
        inplanes: Input channels of the convolution.
        planes: Output channels of the convolution.
        kernel_size, stride, padding, groups: Forwarded to ``nn.Conv2d``.
        has_bn: Insert ``nn.BatchNorm2d(planes)`` after the convolution.
        has_act: Append an ``ActBlock`` (created with ``inplace=True``).
        act_type: Activation name handed to ``ActBlock``.
    """

    def __init__(self, inplanes, planes, kernel_size, stride, padding, groups=1, has_bn=True, has_act=True,
                 act_type='leakyrelu'):
        super(ConvBlock, self).__init__()

        # The convolution only carries a bias when no BatchNorm follows it.
        conv = nn.Conv2d(in_channels=inplanes, out_channels=planes, kernel_size=kernel_size, stride=stride,
                         padding=padding, groups=groups, bias=not has_bn)

        if has_bn:
            norm = nn.BatchNorm2d(planes)
        else:
            norm = nn.Sequential()

        if has_act:
            activation = ActBlock(act_type=act_type, inplace=True)
        else:
            activation = nn.Sequential()

        self.layer = nn.Sequential(conv, norm, activation)

    def forward(self, x):
        """Apply convolution, normalisation and activation in that order."""
        return self.layer(x)


class DarkNetTiny(nn.Module):
    """Small DarkNet trunk: 3x3 ConvBlocks, each followed by a 2x2 max-pool.

    ``forward`` returns the outputs after the third, fourth and fifth
    conv+pool stage, whose channel counts are listed in ``out_channels``.

    Note: ``conv6``, ``zeropad`` and ``last_maxpool`` are registered on the
    module but are not used by ``forward``.

    Args:
        act_type: Activation name forwarded to every ``ConvBlock``.
    """

    def __init__(self, act_type='leakyrelu'):
        super(DarkNetTiny, self).__init__()

        # conv1 .. conv6: channel widths 3 -> 16 -> 32 -> 64 -> 128 -> 256 -> 512.
        widths = [3, 16, 32, 64, 128, 256, 512]
        for index in range(1, len(widths)):
            setattr(self, 'conv{}'.format(index),
                    ConvBlock(inplanes=widths[index - 1], planes=widths[index], kernel_size=3, stride=1,
                              padding=1, groups=1, has_bn=True, has_act=True, act_type=act_type))

        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.zeropad = nn.ZeroPad2d((0, 1, 0, 1))
        self.last_maxpool = nn.MaxPool2d(kernel_size=2, stride=1)
        self.out_channels = [64, 128, 256]

        # Re-initialise conv weights (Kaiming normal) and reset BatchNorm to
        # weight 1 / bias 0.
        for module in self.modules():
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')

    def forward(self, x):
        """Return ``[C3, C4, C5]`` with 64, 128 and 256 channels."""
        stem = self.maxpool(self.conv1(x))
        stem = self.maxpool(self.conv2(stem))

        C3 = self.maxpool(self.conv3(stem))  # 64 channels
        C4 = self.maxpool(self.conv4(C3))  # 128 channels
        C5 = self.maxpool(self.conv5(C4))  # 256 channels

        return [C3, C4, C5]


class D19Block(nn.Module):
    """Stack of alternating 3x3 / 1x1 ConvBlocks, optionally max-pooled.

    Even positions are 3x3 blocks mapping ``inplanes -> planes``; odd
    positions are 1x1 blocks mapping ``planes -> inplanes``. With an odd
    ``layer_num`` the stack therefore ends on ``planes`` channels.

    Args:
        inplanes: Channels entering the block.
        planes: Channels produced by the 3x3 blocks.
        layer_num: Number of ConvBlocks in the stack.
        use_maxpool: Append a 2x2, stride-2 max-pool after the stack.
        act_type: Activation name forwarded to every ``ConvBlock``.
    """

    def __init__(self, inplanes, planes, layer_num, use_maxpool=False, act_type='leakyrelu'):
        super(D19Block, self).__init__()
        self.use_maxpool = use_maxpool

        stack = []
        for position in range(layer_num):
            if position % 2 == 0:
                in_ch, out_ch, kernel, pad = inplanes, planes, 3, 1
            else:
                in_ch, out_ch, kernel, pad = planes, inplanes, 1, 0
            stack.append(
                ConvBlock(inplanes=in_ch, planes=out_ch, kernel_size=kernel, stride=1, padding=pad, groups=1,
                          has_bn=True, has_act=True, act_type=act_type))
        self.D19Block = nn.Sequential(*stack)

        if self.use_maxpool:
            self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        """Run the conv stack, then the max-pool when it is enabled."""
        out = self.D19Block(x)
        return self.maxpool(out) if self.use_maxpool else out


class DarkNet19(nn.Module):
    """DarkNet19 trunk built from one stem ConvBlock and five D19Blocks.

    ``forward`` returns the outputs of ``layer3``, ``layer4`` and ``layer5``
    (channel counts in ``out_channels``). ``layer6`` is registered on the
    module but is not used by ``forward``.

    Args:
        act_type: Activation name forwarded to every block.
    """

    def __init__(self, act_type='leakyrelu'):
        super(DarkNet19, self).__init__()

        self.layer1 = ConvBlock(inplanes=3, planes=32, kernel_size=3, stride=1, padding=1, groups=1, has_bn=True,
                                has_act=True, act_type=act_type)

        # (attribute name, inplanes, planes, layer_num, use_maxpool)
        stage_specs = (
            ('layer2', 32, 64, 1, True),
            ('layer3', 64, 128, 3, True),
            ('layer4', 128, 256, 3, True),
            ('layer5', 256, 512, 5, True),
            ('layer6', 512, 1024, 5, False),
        )
        for name, in_ch, out_ch, depth, pooled in stage_specs:
            setattr(self, name,
                    D19Block(inplanes=in_ch, planes=out_ch, layer_num=depth, use_maxpool=pooled,
                             act_type=act_type))

        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.out_channels = [128, 256, 512]

        # Re-initialise conv weights (Kaiming normal) and reset BatchNorm to
        # weight 1 / bias 0.
        for module in self.modules():
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')

    def forward(self, x):
        """Return ``[C3, C4, C5]`` with 128, 256 and 512 channels."""
        stem = self.layer2(self.maxpool(self.layer1(x)))

        C3 = self.layer3(stem)
        C4 = self.layer4(C3)
        C5 = self.layer5(C4)

        return [C3, C4, C5]


# conv*2+residual
class BasicBlock(nn.Module):
    """Residual unit: 1x1 ConvBlock, 3x3 ConvBlock, then identity shortcut.

    The first block maps ``inplanes -> planes`` and the second maps
    ``planes -> planes * 2``. The shortcut is a plain addition with the
    input, so ``planes * 2`` has to equal ``inplanes``.

    Args:
        inplanes: Channels of the input (and of the output).
        planes: Channels of the 1x1 bottleneck.
    """

    def __init__(self, inplanes, planes):
        super(BasicBlock, self).__init__()
        self.conv1 = ConvBlock(inplanes=inplanes, planes=planes, kernel_size=1, stride=1, padding=0)
        self.conv2 = ConvBlock(inplanes=planes, planes=planes * 2, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        """Return ``conv2(conv1(x)) + x``."""
        out = self.conv1(x)
        out = self.conv2(out)

        # Deliberately out-of-place. ``conv2`` ends in an activation created
        # with inplace=True, and autograd keeps that activation's output for
        # the backward pass; ``out += x`` would overwrite it in place and
        # invalidate gradient computation during training.
        return out + x


class DarkNet53(nn.Module):
    """DarkNet53 trunk: two stem ConvBlocks followed by five residual stages.

    ``block1`` .. ``block4`` each hold 1, 2, 8 and 8 ``BasicBlock`` units and
    end with a stride-2 ConvBlock that doubles the channel count.
    ``forward`` returns the outputs of ``block2``, ``block3`` and ``block4``
    (channel counts in ``out_channels``). ``block5`` (four residual units,
    no trailing convolution) is registered but not used by ``forward``.
    """

    def __init__(self):
        super(DarkNet53, self).__init__()

        def make_stage(channels, repeats, next_channels=None):
            # `repeats` residual units at `channels`, optionally followed by
            # a stride-2 3x3 ConvBlock that widens to `next_channels`.
            units = [BasicBlock(inplanes=channels, planes=channels // 2) for _ in range(repeats)]
            if next_channels is not None:
                units.append(
                    ConvBlock(inplanes=channels, planes=next_channels, kernel_size=3, stride=2, padding=1))
            return nn.Sequential(*units)

        self.conv1 = ConvBlock(inplanes=3, planes=32, kernel_size=3, stride=1, padding=1)
        self.conv2 = ConvBlock(inplanes=32, planes=64, kernel_size=3, stride=2, padding=1)

        self.block1 = make_stage(64, 1, 128)  # 128
        self.block2 = make_stage(128, 2, 256)  # 256
        self.block3 = make_stage(256, 8, 512)  # 512
        self.block4 = make_stage(512, 8, 1024)  # 1024
        self.block5 = make_stage(1024, 4)

        self.out_channels = [256, 512, 1024]

        # Re-initialise conv weights (Kaiming normal) and reset BatchNorm to
        # weight 1 / bias 0.
        for module in self.modules():
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')

    def forward(self, x):
        """Return ``[C3, C4, C5]`` with 256, 512 and 1024 channels."""
        stem = self.block1(self.conv2(self.conv1(x)))
        C3 = self.block2(stem)
        C4 = self.block3(C3)
        C5 = self.block4(C4)
        return [C3, C4, C5]


def darknettiny(**kwargs):
    """Build a ``DarkNetTiny``; keyword arguments go to its constructor."""
    return DarkNetTiny(**kwargs)


def darknet19(**kwargs):
    """Build a ``DarkNet19``; keyword arguments go to its constructor."""
    return DarkNet19(**kwargs)


def darknet53(**kwargs):
    """Build a ``DarkNet53``; keyword arguments go to its constructor."""
    return DarkNet53(**kwargs)


if __name__ == '__main__':
    # Smoke test: run a random batch through the selected backbone and print
    # the shape of each returned feature map.
    x = torch.randn([8, 3, 512, 512])
    darknet = DarkNet(darknet_type='darknet53')
    C3, C4, C5 = darknet(x)
    print(f"C3.shape:{C3.shape}")
    print(f"C4.shape:{C4.shape}")
    print(f"C5.shape:{C5.shape}")

    # Shapes recorded for a (8, 3, 512, 512) input, per backbone.

    # DarkNet53
    # C3.shape: torch.Size([8, 256, 64, 64])
    # C4.shape: torch.Size([8, 512, 32, 32])
    # C5.shape: torch.Size([8, 1024, 16, 16])

    # DarkNet19
    # C3.shape: torch.Size([8, 128, 64, 64])
    # C4.shape: torch.Size([8, 256, 32, 32])
    # C5.shape: torch.Size([8, 512, 16, 16])

    # DarkNetTiny
    # C3.shape: torch.Size([8, 64, 64, 64])
    # C4.shape: torch.Size([8, 128, 32, 32])
    # C5.shape: torch.Size([8, 256, 16, 16])

如何在RetinaNet网络中使用呢？我设置了个backbones_type参数，修改这个就行。

RetinaNet.py代码

import os
import sys

# BASE_DIR is the parent of the directory that contains this file; it is
# appended to sys.path so the absolute imports below can be resolved when the
# file is executed directly.
# NOTE(review): those imports are rooted at `models.` - confirm that BASE_DIR
# really is the directory that contains the `models` package.
BASE_DIR = os.path.dirname(
    os.path.dirname(
        os.path.abspath(__file__)))
sys.path.append(BASE_DIR)

import torch
import torch.nn as nn
from torchvision.ops import nms
from models.detection.RetinaNet.neck import FPN
from models.detection.RetinaNet.loss import FocalLoss
from models.detection.RetinaNet.anchor import Anchors
from models.detection.RetinaNet.head import clsHead, regHead
from models.detection.RetinaNet.backbones.ResNet import ResNet
from models.detection.RetinaNet.utils.ClipBoxes import ClipBoxes
from models.detection.RetinaNet.backbones.DarkNet import DarkNet
from models.detection.RetinaNet.utils.BBoxTransform import BBoxTransform


# assert input annotations are [x_min, y_min, x_max, y_max]
class RetinaNet(nn.Module):
    """RetinaNet detector: backbone -> FPN -> classification/regression heads.

    When ``self.training`` is true, ``forward`` expects ``(images, annots)``
    and returns the result of ``FocalLoss``. Otherwise it expects the images
    only, decodes and clips the boxes, drops scores <= 0.05 and applies
    per-class NMS (IoU 0.5).

    Args:
        backbones_type: Key of the ``expand_ratio`` table below; names
            starting with ``'resnet'`` build a ``ResNet``, names starting
            with ``'darknet'`` build a ``DarkNet``.
        num_classes: Number of classes for the classification head
            (COCO 80, VOC 20).
        planes: Channel width handed to the FPN and both heads.
        pretrained: Forwarded to ``ResNet`` only; DarkNet backbones ignore it.
        training: Initial value of ``self.training``.
    """

    def __init__(self,
                 backbones_type="resnet50",
                 num_classes=80,
                 planes=256,
                 pretrained=False,
                 training=False):
        super(RetinaNet, self).__init__()
        self.backbones_type = backbones_type
        # coco 80, voc 20
        self.num_classes = num_classes
        self.planes = planes
        # NOTE: this is nn.Module's own mode flag, so later .train()/.eval()
        # calls also switch which path forward() takes.
        self.training = training
        if backbones_type.startswith('resnet'):
            self.backbone = ResNet(resnet_type=self.backbones_type,
                                   pretrained=pretrained)
        elif backbones_type.startswith('darknet'):
            self.backbone = DarkNet(darknet_type=self.backbones_type)

        # Multiplier applied to the base widths (128, 256, 512) to obtain the
        # C3/C4/C5 channel counts of each backbone.
        expand_ratio = {
            "resnet18": 1,
            "resnet34": 1,
            "resnet50": 4,
            "resnet101": 4,
            "resnet152": 4,
            "darknettiny": 0.5,
            "darknet19": 1,
            "darknet53": 2
        }

        ratio = expand_ratio[self.backbones_type]
        C3_inplanes = int(128 * ratio)
        C4_inplanes = int(256 * ratio)
        C5_inplanes = int(512 * ratio)
        self.fpn = FPN(C3_inplanes=C3_inplanes,
                       C4_inplanes=C4_inplanes,
                       C5_inplanes=C5_inplanes,
                       planes=self.planes)

        self.cls_head = clsHead(inplanes=self.planes,
                                num_classes=self.num_classes)

        self.reg_head = regHead(inplanes=self.planes)

        self.anchors = Anchors()
        self.regressBoxes = BBoxTransform()
        self.clipBoxes = ClipBoxes()

        self.loss = FocalLoss()
        self.freeze_bn()

    def freeze_bn(self):
        '''Put every BatchNorm2d layer of the model into eval mode.'''
        for layer in self.modules():
            if isinstance(layer, nn.BatchNorm2d):
                layer.eval()

    def forward(self, inputs):
        """Compute the loss (training) or the filtered detections (inference).

        Args:
            inputs: ``(img_batch, annots)`` when ``self.training`` is true,
                otherwise ``img_batch`` alone.

        Returns:
            Training: the value returned by ``self.loss``.
            Inference: ``(scores, class_indexes, boxes)``; all three are
            empty tensors when no score exceeds the threshold.
            NOTE(review): the inference path squeezes the predictions and
            does not separate detections per image, so it looks intended for
            one image per call - confirm.
        """
        if self.training:
            img_batch, annots = inputs
        else:
            img_batch = inputs

        C3, C4, C5 = self.backbone(img_batch)
        features = self.fpn([C3, C4, C5])
        del C3, C4, C5

        # (batch_size, total_anchors_nums, num_classes)
        cls_heads = torch.cat([self.cls_head(feature) for feature in features], dim=1)
        # (batch_size, total_anchors_nums, 4)
        reg_heads = torch.cat([self.reg_head(feature) for feature in features], dim=1)
        del features

        anchors = self.anchors(img_batch)

        if self.training:
            return self.loss(cls_heads, reg_heads, anchors, annots)

        # ---- inference ----
        transformed_anchors = self.regressBoxes(anchors, reg_heads)
        transformed_anchors = self.clipBoxes(transformed_anchors, img_batch)

        # Allocate the result tensors on the device the predictions live on.
        # Keying this on torch.cuda.is_available() breaks CPU inference on a
        # machine that merely has a GPU installed.
        device = cls_heads.device
        final_scores = torch.empty(0, device=device)
        final_class_indexes = torch.empty(0, dtype=torch.long, device=device)
        final_boxes = torch.empty(0, device=device)

        # Same for every class, so squeeze once outside the loop.
        candidate_boxes = torch.squeeze(transformed_anchors)

        for class_index in range(cls_heads.shape[2]):
            scores = torch.squeeze(cls_heads[:, :, class_index])
            keep = scores > 0.05
            if not keep.any():
                # no boxes to NMS for this class
                continue

            scores = scores[keep]
            class_boxes = candidate_boxes[keep]
            nms_idx = nms(class_boxes, scores, 0.5)

            final_scores = torch.cat((final_scores, scores[nms_idx]))
            # one class index per kept box: [0, 0, ..., 1, 1, ..., 79, 79]
            class_column = torch.full((nms_idx.shape[0],), class_index,
                                      dtype=torch.long, device=device)
            final_class_indexes = torch.cat((final_class_indexes, class_column))
            # [..., 4]
            final_boxes = torch.cat((final_boxes, class_boxes[nms_idx]))

        return final_scores, final_class_indexes, final_boxes


if __name__ == "__main__":
    # Smoke test on random data. The unconditional .cuda() calls mean this
    # only runs on a machine with a CUDA device.
    C = torch.randn([8, 3, 512, 512])
    annot = torch.randn([8, 15, 5])
    # NOTE(review): pretrained=True has no effect here - RetinaNet forwards
    # `pretrained` to ResNet only, not to the DarkNet backbones.
    model = RetinaNet(backbones_type="darknet19", num_classes=80, pretrained=True, training=True)
    model = model.cuda()
    C = C.cuda()
    annot = annot.cuda()
    model = torch.nn.DataParallel(model).cuda()
    # NOTE(review): this sets the flag on the DataParallel wrapper; the
    # wrapped RetinaNet was already constructed with training=True.
    model.training = True
    out = model([C, annot])
    # if model.training == True, out is the loss:
    #     out = model([C, annot])
    # if model.training == False, out is (scores, class ids, boxes):
    #     out = model(C)
    for i in range(len(out)):
        print(out[i])

# Scores: torch.Size([486449])
# tensor([4.1057, 4.0902, 4.0597,  ..., 0.0509, 0.0507, 0.0507], device='cuda:0')
# Id: torch.Size([486449])
# tensor([ 0,  0,  0,  ..., 79, 79, 79], device='cuda:0')
# loc: torch.Size([486449, 4])
# tensor([[ 45.1607, 249.4807, 170.5788, 322.8085],
# [ 85.9825, 324.4150, 122.9968, 382.6297],
# [148.1854, 274.0474, 179.0922, 343.4529],
# ...,
# [222.5421,   0.0000, 256.3059,  15.5591],
# [143.3349, 204.4784, 170.2395, 228.6654],
# [208.4509, 140.1983, 288.0962, 165.8708]], device='cuda:0')

未完...

参考文献

[1] Redmon J, Farhadi A. YOLO9000: better, faster, stronger[C]//Proceedings of the IEEE conference on computer vision and pattern recognition. 2017: 7263-7271.

[2] Redmon J, Farhadi A. Yolov3: An incremental improvement[J]. arXiv preprint arXiv:1804.02767, 2018.

[3] Lin T Y, Goyal P, Girshick R, et al. Focal loss for dense object detection[C]//Proceedings of the IEEE international conference on computer vision. 2017: 2980-2988.