YOLO11网络结构以及改进1

news2025/4/19 18:57:32

YOLO11

- 1.YOLO11网络结构图在哪里？
- 2.对应的网络结构图
- 3.每一个模块详解
- - 3.1 Conv模块
  - 3.2关于卷积模块
  - 3.3 关于给各个模块指定参数的细节
- 4.加入CBAM

1.YOLO11网络结构图在哪里？

在这里插入图片描述

2.对应的网络结构图

在这里插入图片描述

3.每一个模块详解

3.1 Conv模块

位置：ultralytics/nn/modules/conv.py
在这里插入图片描述
特点：YOLO11代码更加模块化以及简洁化

3.2关于卷积模块

在这里插入图片描述

3.3 关于给各个模块指定参数的细节

回到yolo11.yaml文件，我们只是绘制了对应的网络模块，而没有管后面的参数信息，这明显是不够的，后期我们进行模型改进，也是要注意这个的，不是随心所欲。
在这里插入图片描述
`def parse_model(d, ch, verbose=True): # model_dict, input_channels(3)
“”“Parse a YOLO model.yaml dictionary into a PyTorch model.”“”
import ast

# Args
legacy = True  # backward compatibility for v3/v5/v8/v9 models
max_channels = float("inf")
nc, act, scales = (d.get(x) for x in ("nc", "activation", "scales"))
depth, width, kpt_shape = (d.get(x, 1.0) for x in ("depth_multiple", "width_multiple", "kpt_shape"))
if scales:
    scale = d.get("scale")
    if not scale:
        scale = tuple(scales.keys())[0]
        LOGGER.warning(f"WARNING ⚠️ no model scale passed. Assuming scale='{scale}'.")
    depth, width, max_channels = scales[scale]

if act:
    Conv.default_act = eval(act)  # redefine default activation, i.e. Conv.default_act = nn.SiLU()
    if verbose:
        LOGGER.info(f"{colorstr('activation:')} {act}")  # print

if verbose:
    LOGGER.info(f"\n{'':>3}{'from':>20}{'n':>3}{'params':>10}  {'module':<45}{'arguments':<30}")
ch = [ch]
layers, save, c2 = [], [], ch[-1]  # layers, savelist, ch out
for i, (f, n, m, args) in enumerate(d["backbone"] + d["head"]):  # from, number, module, args
    m = getattr(torch.nn, m[3:]) if "nn." in m else globals()[m]  # get module
    for j, a in enumerate(args):
        if isinstance(a, str):
            try:
                args[j] = locals()[a] if a in locals() else ast.literal_eval(a)
            except ValueError:
                pass
    n = n_ = max(round(n * depth), 1) if n > 1 else n  # depth gain
    if m in {
        Classify,
        Conv,
        ConvTranspose,
        GhostConv,
        Bottleneck,
        GhostBottleneck,
        SPP,
        SPPF,
        C2fPSA,
        C2PSA,
        DWConv,
        Focus,
        BottleneckCSP,
        C1,
        C2,
        C2f,
        C3k2,
        RepNCSPELAN4,
        ELAN1,
        ADown,
        AConv,
        SPPELAN,
        C2fAttn,
        C3,
        C3TR,
        C3Ghost,
        nn.ConvTranspose2d,
        DWConvTranspose2d,
        C3x,
        RepC3,
        PSA,
        SCDown,
        C2fCIB,
    }:
        c1, c2 = ch[f], args[0]
        if c2 != nc:  # if c2 not equal to number of classes (i.e. for Classify() output)
            c2 = make_divisible(min(c2, max_channels) * width, 8)
        if m is C2fAttn:
            args[1] = make_divisible(min(args[1], max_channels // 2) * width, 8)  # embed channels
            args[2] = int(
                max(round(min(args[2], max_channels // 2 // 32)) * width, 1) if args[2] > 1 else args[2]
            )  # num heads

        args = [c1, c2, *args[1:]]
        if m in {
            BottleneckCSP,
            C1,
            C2,
            C2f,
            C3k2,
            C2fAttn,
            C3,
            C3TR,
            C3Ghost,
            C3x,
            RepC3,
            C2fPSA,
            C2fCIB,
            C2PSA,
        }:
            args.insert(2, n)  # number of repeats
            n = 1
        if m is C3k2:  # for M/L/X sizes
            legacy = False
            if scale in "mlx":
                args[3] = True
    elif m is AIFI:
        args = [ch[f], *args]
    elif m in {HGStem, HGBlock}:
        c1, cm, c2 = ch[f], args[0], args[1]
        args = [c1, cm, c2, *args[2:]]
        if m is HGBlock:
            args.insert(4, n)  # number of repeats
            n = 1
    elif m is ResNetLayer:
        c2 = args[1] if args[3] else args[1] * 4
    elif m is nn.BatchNorm2d:
        args = [ch[f]]
    elif m is Concat:
        c2 = sum(ch[x] for x in f)
    elif m in {Detect, WorldDetect, Segment, Pose, OBB, ImagePoolingAttn, v10Detect}:
        args.append([ch[x] for x in f])
        if m is Segment:
            args[2] = make_divisible(min(args[2], max_channels) * width, 8)
        if m in {Detect, Segment, Pose, OBB}:
            m.legacy = legacy
    elif m is RTDETRDecoder:  # special case, channels arg must be passed in index 1
        args.insert(1, [ch[x] for x in f])
    elif m is CBLinear:
        c2 = args[0]
        c1 = ch[f]
        args = [c1, c2, *args[1:]]
    elif m is CBFuse:
        c2 = ch[f[-1]]
    else:
        c2 = ch[f]

    m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args)  # module
    t = str(m)[8:-2].replace("__main__.", "")  # module type
    m_.np = sum(x.numel() for x in m_.parameters())  # number params
    m_.i, m_.f, m_.type = i, f, t  # attach index, 'from' index, type
    if verbose:
        LOGGER.info(f"{i:>3}{str(f):>20}{n_:>3}{m_.np:10.0f}  {t:<45}{str(args):<30}")  # print
    save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
    layers.append(m_)
    if i == 0:
        ch = []
    ch.append(c2)
return nn.Sequential(*layers), sorted(save)`

改进一定要修改此处代码

4.加入CBAM

由于本身就有CBAM的代码
所以只需要在yaml中加入即可

# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLO11 object detection model with P3-P5 outputs. For Usage examples see https://docs.ultralytics.com/tasks/detect

# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolo11n.yaml' will call yolo11.yaml with scale 'n'
  # [depth, width, max_channels]
  n: [0.50, 0.25, 1024] # summary: 319 layers, 2624080 parameters, 2624064 gradients, 6.6 GFLOPs
  s: [0.50, 0.50, 1024] # summary: 319 layers, 9458752 parameters, 9458736 gradients, 21.7 GFLOPs
  m: [0.50, 1.00, 512] # summary: 409 layers, 20114688 parameters, 20114672 gradients, 68.5 GFLOPs
  l: [1.00, 1.00, 512] # summary: 631 layers, 25372160 parameters, 25372144 gradients, 87.6 GFLOPs
  x: [1.00, 1.50, 512] # summary: 631 layers, 56966176 parameters, 56966160 gradients, 196.0 GFLOPs

# YOLO11n backbone
backbone:
  # [from, repeats, module, args]
  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
  - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
  - [-1, 2, C3k2, [256, False, 0.25]]
  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
  - [-1, 2, C3k2, [512, False, 0.25]]
  - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
  - [-1, 2, C3k2, [512, True]]
  - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
  - [-1, 2, C3k2, [1024, True]]
  - [-1, 1, SPPF, [1024, 5]] # 9
  - [-1, 2, C2PSA, [1024]] # 10

# YOLO11n head
head:
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 6], 1, Concat, [1]] # cat backbone P4
  - [-1, 2, C3k2, [512, False]] # 13

  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 4], 1, Concat, [1]] # cat backbone P3
  - [-1, 2, C3k2, [256, False]] # 16 (P3/8-small)
  - [-1, 1, CBAM, []]

  - [-1, 1, Conv, [256, 3, 2]]
  - [[-1, 13], 1, Concat, [1]] # cat head P4
  - [-1, 2, C3k2, [512, False]] # 19 (P4/16-medium)
  - [-1, 1, CBAM, []]

  - [-1, 1, Conv, [512, 3, 2]]
  - [[-1, 10], 1, Concat, [1]] # cat head P5
  - [-1, 2, C3k2, [1024, True]] # 22 (P5/32-large)
  - [-1, 1, CBAM, []]

  - [[16, 19, 22], 1, Detect, [nc]] # Detect(P3, P4, P5)