As we learned earlier, YOLOv8 handles not only object detection but also classification, segmentation, pose estimation, and other computer-vision tasks. In the previous post I covered how YOLOv8 does classification; in this post I will walk through how it tackles instance segmentation.
YOLOv8 Instance Segmentation Architecture
As shown in the figure below, YOLOv8 performs instance segmentation by pairing a segmentation head with the detection head; the model outputs both detection boxes and instance masks.
(I originally assumed this was semantic segmentation, but after being corrected I realized it is instance segmentation, which also cleared up some of my earlier confusion.)

An example output image looks like this:

Classic Semantic Segmentation Model Structure
To better understand segmentation models, take the classic semantic segmentation model UNet as an example: its final output has the same spatial size as the input image, but the channel dimension (n) differs, depending on how many masks (classes) we decide to predict.
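In shape terms (a hypothetical example, not UNet-specific code), a semantic segmentation network maps the image to a per-pixel score map with one channel per class:
import torch

logits = torch.randn(1, 21, 512, 512)   # hypothetical output: n = 21 classes, same H x W as the input
per_pixel_class = logits.argmax(dim=1)  # (1, 512, 512): one class id per pixel
print(per_pixel_class.shape)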

YOLOv8 Instance Segmentation Model Structure
YOLOv8's instance segmentation model is very close to its object detection model; the difference is that a segmentation head is added on top of the detection head, and that segmentation head likewise operates at three scales:
[[15, 18, 21], 1, Segment, [nc, 32, 256]] # Segment(P3, P4, P5)
The figure below numbers each module so you can cross-reference it with the model's YAML file; a snippet for building and printing the model from this YAML follows the listing.

# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLOv8-seg instance segmentation model. For Usage examples see https://docs.ultralytics.com/tasks/segment

# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov8n-seg.yaml' will call yolov8-seg.yaml with scale 'n'
  # [depth, width, max_channels]
  n: [0.33, 0.25, 1024]
  s: [0.33, 0.50, 1024]
  m: [0.67, 0.75, 768]
  l: [1.00, 1.00, 512]
  x: [1.00, 1.25, 512]

# YOLOv8.0n backbone
backbone:
  # [from, repeats, module, args]
  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
  - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
  - [-1, 3, C2f, [128, True]]
  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
  - [-1, 6, C2f, [256, True]]
  - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
  - [-1, 6, C2f, [512, True]]
  - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
  - [-1, 3, C2f, [1024, True]]
  - [-1, 1, SPPF, [1024, 5]] # 9

# YOLOv8.0n head
head:
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 6], 1, Concat, [1]] # cat backbone P4
  - [-1, 3, C2f, [512]] # 12

  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [[-1, 4], 1, Concat, [1]] # cat backbone P3
  - [-1, 3, C2f, [256]] # 15 (P3/8-small)

  - [-1, 1, Conv, [256, 3, 2]]
  - [[-1, 12], 1, Concat, [1]] # cat head P4
  - [-1, 3, C2f, [512]] # 18 (P4/16-medium)

  - [-1, 1, Conv, [512, 3, 2]]
  - [[-1, 9], 1, Concat, [1]] # cat head P5
  - [-1, 3, C2f, [1024]] # 21 (P5/32-large)

  - [[15, 18, 21], 1, Segment, [nc, 32, 256]] # Segment(P3, P4, P5)
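If you want to check the module indices against the figure yourself, one way (assuming the ultralytics package is installed) is to build the model straight from this YAML and print it:
from ultralytics import YOLO

model = YOLO("yolov8n-seg.yaml")  # build the 'n'-scale segmentation model from the config
print(model.model)                # layers 0-21 plus the final Segment head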
YOLOv8 Detection Head (optional reading)
Let us look at how the segmentation head is actually defined. The segmentation head inherits from the detection head.
The detection-head code is shown below. Note that ultralytics has since updated the detection head (YOLOv10 support was merged in; I removed that method here because it is not needed). Its innovation is a hybrid one-to-one / one-to-many matching mechanism, which is why the head gained an extra forward_end2end method:
class Detect(nn.Module):
    """YOLOv8 Detect head for detection models."""

    dynamic = False  # force grid reconstruction
    export = False  # export mode
    end2end = False  # end2end
    max_det = 300  # max_det
    shape = None
    anchors = torch.empty(0)  # init
    strides = torch.empty(0)  # init

    def __init__(self, nc=80, ch=()):
        """Initializes the YOLOv8 detection layer with specified number of classes and channels."""
        super().__init__()
        self.nc = nc  # number of classes
        self.nl = len(ch)  # number of detection layers
        self.reg_max = 16  # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
        self.no = nc + self.reg_max * 4  # number of outputs per anchor
        self.stride = torch.zeros(self.nl)  # strides computed during build
        c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100))  # channels
        self.cv2 = nn.ModuleList(
            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
        )
        self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
        self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()

        if self.end2end:
            self.one2one_cv2 = copy.deepcopy(self.cv2)
            self.one2one_cv3 = copy.deepcopy(self.cv3)

    def forward(self, x):
        """Concatenates and returns predicted bounding boxes and class probabilities."""
        if self.end2end:
            return self.forward_end2end(x)

        for i in range(self.nl):
            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
        if self.training:  # Training path
            return x
        y = self._inference(x)
        return y if self.export else (y, x)
    def _inference(self, x):
        """Decode predicted bounding boxes and class probabilities based on multiple-level feature maps."""
        # Inference path
        shape = x[0].shape  # BCHW
        x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
        if self.dynamic or self.shape != shape:
            self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
            self.shape = shape

        if self.export and self.format in {"saved_model", "pb", "tflite", "edgetpu", "tfjs"}:  # avoid TF FlexSplitV ops
            box = x_cat[:, : self.reg_max * 4]
            cls = x_cat[:, self.reg_max * 4 :]
        else:
            box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)

        if self.export and self.format in {"tflite", "edgetpu"}:
            # Precompute normalization factor to increase numerical stability
            # See https://github.com/ultralytics/ultralytics/issues/7371
            grid_h = shape[2]
            grid_w = shape[3]
            grid_size = torch.tensor([grid_w, grid_h, grid_w, grid_h], device=box.device).reshape(1, 4, 1)
            norm = self.strides / (self.stride[0] * grid_size)
            dbox = self.decode_bboxes(self.dfl(box) * norm, self.anchors.unsqueeze(0) * norm[:, :2])
        else:
            dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides

        return torch.cat((dbox, cls.sigmoid()), 1)
YOLOv8 Segmentation Head
The segmentation-head code is as follows:
class Segment(Detect):
    """YOLOv8 Segment head for segmentation models."""

    def __init__(self, nc=80, nm=32, npr=256, ch=()):
        """Initialize the YOLO model attributes such as the number of masks, prototypes, and the convolution layers."""
        super().__init__(nc, ch)
        self.nm = nm  # number of masks
        self.npr = npr  # number of protos
        self.proto = Proto(ch[0], self.npr, self.nm)  # protos

        c4 = max(ch[0] // 4, self.nm)
        self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nm, 1)) for x in ch)

    def forward(self, x):
        """Return model outputs and mask coefficients if training, otherwise return outputs and mask coefficients."""
        p = self.proto(x[0])  # mask protos
        bs = p.shape[0]  # batch size

        mc = torch.cat([self.cv4[i](x[i]).view(bs, self.nm, -1) for i in range(self.nl)], 2)  # mask coefficients
        x = Detect.forward(self, x)
        if self.training:
            return x, mc, p
        return (torch.cat([x, mc], 1), p) if self.export else (torch.cat([x[0], mc], 1), (x[1], mc, p))
The input to the segmentation head is a list of feature maps at three different scales, exactly as in YOLOv8 object detection.

The first operation above is the Proto module, which receives the first-scale (stride-8) feature map. Proto convolves x[0] and upsamples the 80x80 feature map to 160x160; this output is the set of prototype (base) masks.
p = self.proto(x[0])
Proto(
(cv1): Conv(
(conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(upsample): ConvTranspose2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(cv2): Conv(
(conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(cv3): Conv(
(conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
(act): SiLU(inplace=True)
)
)
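For readers who want something runnable, here is a minimal stand-in that mirrors the printed structure above (n-scale channels; BatchNorm layers are omitted for brevity, so this is a sketch rather than the ultralytics implementation, and ProtoSketch is a name of my own):
import torch
import torch.nn as nn

class ProtoSketch(nn.Module):
    """Minimal sketch of the Proto module: conv -> 2x upsample -> conv -> 1x1 conv to 32 prototype masks."""
    def __init__(self, c_in=64, c_hidden=64, n_masks=32):
        super().__init__()
        self.cv1 = nn.Sequential(nn.Conv2d(c_in, c_hidden, 3, 1, 1), nn.SiLU(inplace=True))
        self.upsample = nn.ConvTranspose2d(c_hidden, c_hidden, 2, 2)  # 80x80 -> 160x160
        self.cv2 = nn.Sequential(nn.Conv2d(c_hidden, c_hidden, 3, 1, 1), nn.SiLU(inplace=True))
        self.cv3 = nn.Sequential(nn.Conv2d(c_hidden, n_masks, 1), nn.SiLU(inplace=True))

    def forward(self, x):
        return self.cv3(self.cv2(self.upsample(self.cv1(x))))

p = ProtoSketch()(torch.zeros(1, 64, 80, 80))
print(p.shape)  # torch.Size([1, 32, 160, 160]) - the prototype masks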
Next, x is passed through the cv4 module, a ModuleList of three branches (one per scale, matching the three scales in the figure). The structure of cv4 is:
ModuleList(
(0): Sequential(
(0): Conv(
(conv): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) # CBS (Conv-BN-SiLU) block
(act): SiLU(inplace=True)
)
(1): Conv(
(conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) # CBS (Conv-BN-SiLU) block
(act): SiLU(inplace=True)
)
(2): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1)) # plain Conv2d used to project the channel dimension
)
(1): Sequential(
(0): Conv(
(conv): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(1): Conv(
(conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(2): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
)
(2): Sequential(
(0): Conv(
(conv): Conv2d(256, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(1): Conv(
(conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(2): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
)
)
The process is illustrated in the figure below:

What we obtain here are the mask coefficients (mc): one 32-dimensional vector per anchor.
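As a shape-level sketch (dummy zero tensors, assuming a 640x640 input), the three cv4 outputs are flattened and concatenated into the mask-coefficient tensor mc:
import torch

bs, nm = 1, 32
feats = [torch.zeros(bs, nm, 80, 80), torch.zeros(bs, nm, 40, 40), torch.zeros(bs, nm, 20, 20)]
mc = torch.cat([f.view(bs, nm, -1) for f in feats], 2)
print(mc.shape)  # torch.Size([1, 32, 8400]) - one 32-dim coefficient vector per anchor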
Next comes the detection forward pass. Since YOLOv8 is at heart a detector, the features still have to go through the detection head:
for i in range(self.nl):
    x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
cv2 contains three branches; the spatial size is unchanged and the output channels are all 64, i.e. (64,80,80), (64,40,40), (64,20,20):
ModuleList(
(0): Sequential(
(0): Conv(
(conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(1): Conv(
(conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(2): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
)
(1): Sequential(
(0): Conv(
(conv): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(1): Conv(
(conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(2): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
)
(2): Sequential(
(0): Conv(
(conv): Conv2d(256, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(1): Conv(
(conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(2): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
)
)
cv3 also has three branches; the spatial size again stays the same and the channel dimension becomes 80 (the number of classes), i.e. (80,80,80), (80,40,40), (80,20,20):
ModuleList(
(0): Sequential(
(0): Conv(
(conv): Conv2d(64, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(1): Conv(
(conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(2): Conv2d(80, 80, kernel_size=(1, 1), stride=(1, 1))
)
(1): Sequential(
(0): Conv(
(conv): Conv2d(128, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(1): Conv(
(conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(2): Conv2d(80, 80, kernel_size=(1, 1), stride=(1, 1))
)
(2): Sequential(
(0): Conv(
(conv): Conv2d(256, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(1): Conv(
(conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(act): SiLU(inplace=True)
)
(2): Conv2d(80, 80, kernel_size=(1, 1), stride=(1, 1))
)
)
Finally, the two are concatenated with torch.cat, giving (144,80,80), (144,40,40), (144,20,20).
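As a shape-level sketch (dummy tensors, 640x640 input), this per-scale concatenation inside Detect.forward looks like:
import torch

# cv2 produces 4*reg_max = 64 box channels, cv3 produces nc = 80 class channels per scale.
box_p3 = torch.zeros(1, 64, 80, 80)  # stride-8 branch
cls_p3 = torch.zeros(1, 80, 80, 80)
print(torch.cat((box_p3, cls_p3), 1).shape)  # torch.Size([1, 144, 80, 80])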
Next comes the inference-time decoding, where the outputs at these three scales are decoded:
Recall that self.no = nc + self.reg_max * 4. According to the code comment, reg_max was meant to scale with model size (4/8/12/16/20 for n/s/m/l/x); in practice it is 16 here.
def _inference(self, x):
    """Decode predicted bounding boxes and class probabilities based on multiple-level feature maps."""
    # Inference path
    shape = x[0].shape  # BCHW, here (1, 144, 80, 80)
    x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)  # (1, 144, 8400), 8400 = 80*80 + 40*40 + 20*20
    if self.dynamic or self.shape != shape:
        self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
        self.shape = shape

    if self.export and self.format in {"saved_model", "pb", "tflite", "edgetpu", "tfjs"}:  # avoid TF FlexSplitV ops; not executed here
        box = x_cat[:, : self.reg_max * 4]
        cls = x_cat[:, self.reg_max * 4 :]
    else:
        box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)

    if self.export and self.format in {"tflite", "edgetpu"}:  # not executed here
        # Precompute normalization factor to increase numerical stability
        # See https://github.com/ultralytics/ultralytics/issues/7371
        grid_h = shape[2]
        grid_w = shape[3]
        grid_size = torch.tensor([grid_w, grid_h, grid_w, grid_h], device=box.device).reshape(1, 4, 1)
        norm = self.strides / (self.stride[0] * grid_size)
        dbox = self.decode_bboxes(self.dfl(box) * norm, self.anchors.unsqueeze(0) * norm[:, :2])
    else:
        dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides

    return torch.cat((dbox, cls.sigmoid()), 1)
self.anchors has shape torch.Size([2, 8400]) and self.strides has shape torch.Size([1, 8400]).
shape is torch.Size([1, 144, 80, 80]), where 144 = 64 + 80; the 64 channels are the raw box predictions, which still need to be decoded.
x_cat is split into the predicted box and cls tensors: box is (1,64,8400) and cls is (1,80,8400).
The box tensor is then decoded by the DFL module:
dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides
The resulting box tensor is (1,4,8400).

The Conv2d inside DFL carries no gradient (its parameters are never updated); its role is to split the 64 channels into 4 groups of 16 and reduce each group to a single value, i.e. 4 values per anchor.
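For reference, DFL is essentially a frozen 1x1 convolution whose weights are 0..15: it softmaxes each group of 16 bins and takes the expectation, turning the 64 box channels into 4 distances per anchor. A rough, simplified paraphrase of the module (not copied verbatim from ultralytics):
import torch
import torch.nn as nn

class DFL(nn.Module):
    """Distribution Focal Loss decoder: softmax over 16 bins per box side, then take the expectation."""
    def __init__(self, c1=16):
        super().__init__()
        self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)  # frozen 1x1 conv, no gradient
        self.conv.weight.data[:] = torch.arange(c1, dtype=torch.float).view(1, c1, 1, 1)  # weights 0..15
        self.c1 = c1

    def forward(self, x):
        b, _, a = x.shape  # batch, 4 * c1 channels, anchors
        return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)

print(DFL(16)(torch.randn(1, 64, 8400)).shape)  # torch.Size([1, 4, 8400])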
Finally, dbox and cls (the class scores) are concatenated and returned:
return torch.cat((dbox, cls.sigmoid()), 1)
Its shape is (1,84,8400), where 84 = 80 + 4 and 8400 is the number of candidate predictions.
The Segment head then returns:
return (torch.cat([x, mc], 1), p) if self.export else (torch.cat([x[0], mc], 1), (x[1], mc, p))
Here mc is (1,32,8400); x is a tuple whose x[0] is (1,84,8400) and whose x[1] is a list containing (1,144,80,80), (1,144,40,40), (1,144,20,20); p is (1,32,160,160), the prototype masks.
The returned data:

The (1,32,8400) tensor holds the predicted mask coefficients.
Non-Maximum Suppression
During inference I used an image of size (3,480,640), so the final output has shape (1,116,6300), where 116 = 84 (80 + 4) + 32. This is because YOLOv8-seg performs object detection and segmentation at the same time; the (1,32,6300) part carries the mask coefficients used for segmentation.
And 6300 = 60*80 + 30*40 + 15*20.
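As a quick sanity check, the anchor count can be reproduced from the input size and the three strides:
# 6300 anchors for a 480x640 input with strides 8/16/32.
h, w = 480, 640
print(sum((h // s) * (w // s) for s in (8, 16, 32)))  # 6300 = 60*80 + 30*40 + 15*20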
The splitting code below confirms this: the number of mask channels (nm) is 116 - 80 - 4 = 32, and they start at index 80 + 4 = 84.
nm = prediction.shape[1] - nc - 4 # number of masks
mi = 4 + nc
The output container is initialized:
output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
Each output row holds 38 values, where 38 = 4 (box) + 1 (class score) + 1 (class id) + 32 (mask coefficients), i.e. detection and segmentation results stored together.
Candidates whose class score exceeds the confidence threshold are then kept; 36 survive (filtered from the 6300 candidates), each still described by 116 values.
The box, class, and mask-coefficient parts are then split apart:
box, cls, mask = x.split((4, nc, nm), 1)
box is (36,4), cls is (36,80), and mask is (36,32).
Next, the highest-scoring class is selected for each candidate:
conf, j = cls.max(1, keepdim=True)
x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]
conf holds the scores and j the class indices (both of shape (36,1)). These are concatenated back together, giving (36,38), where 36 is the number of candidates and 38 = 4 + 1 + 1 + 32, i.e. box + conf + cls_id + mask.
c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes; max_wh is a predefined constant (7680 in the source)
scores = x[:, 4]  # scores
if rotated:
    boxes = torch.cat((x[:, :2] + c, x[:, 2:4], x[:, -1:]), dim=-1)  # xywhr
    i = nms_rotated(boxes, scores, iou_thres)
else:  # this branch runs: NMS on the boxes via torchvision
    boxes = x[:, :4] + c  # boxes (offset by class)
    i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
i = i[:max_det]  # max_det = 300, i.e. at most 300 detections are kept
output[xi] = x[i]
The returned i is tensor([20, 24, 3, 32, 34], device='cuda:0'): the indices of the boxes kept after NMS out of the 36 candidates. The selected rows of x are stored in output. Note that output is a list holding one entry per image in the batch; since only a single image was fed in, it contains one tensor of shape (5,38), i.e. 5 objects.
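The "+ c" trick above is what makes the NMS class-aware: each box is shifted by class_id * max_wh, so boxes of different classes can no longer overlap and therefore never suppress each other. A hypothetical toy example (not from the post):
import torch
import torchvision

# Two heavily overlapping boxes of different classes. With the class offset
# they no longer overlap, so both survive NMS.
boxes = torch.tensor([[10.0, 10.0, 50.0, 50.0],
                      [12.0, 12.0, 52.0, 52.0]])
scores = torch.tensor([0.9, 0.8])
classes = torch.tensor([[0.0], [1.0]])  # one class id per box
max_wh = 7680  # same constant as in the snippet above

keep_agnostic = torchvision.ops.nms(boxes, scores, iou_threshold=0.5)
keep_per_class = torchvision.ops.nms(boxes + classes * max_wh, scores, iou_threshold=0.5)
print(keep_agnostic.tolist())   # [0]    - the lower-scoring box is suppressed
print(keep_per_class.tolist())  # [0, 1] - both kept, because their classes differ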

Post-processing
So how are these mask coefficients actually used? Let us look at the post-processing step.
def postprocess(self, preds, img, orig_imgs):
The preds argument passed into post-processing is the prediction, i.e. the output of the YOLOv8 segmentation head.

img is the (preprocessed, normalized) input image, and orig_imgs are the original images.

Post-processing starts by running non-maximum suppression to filter the predictions:
p = ops.non_max_suppression(
    preds[0],
    self.args.conf,
    self.args.iou,
    agnostic=self.args.agnostic_nms,
    max_det=self.args.max_det,
    nc=len(self.model.names),
    classes=self.args.classes,
)
The result p is (5,38).
If preds[1] is a tuple, the prototypes are taken as preds[1][-1], which here is (1,32,120,160):
proto = preds[1][-1] if isinstance(preds[1], tuple) else preds[1]
Then the following loop runs (only one image was predicted, so there is a single iteration):
for i, (pred, orig_img, img_path) in enumerate(zip(p, orig_imgs, self.batch[0])):
    if not len(pred):  # save empty boxes
        masks = None
    elif self.args.retina_masks:
        pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
        masks = ops.process_mask_native(proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2])  # HWC
    else:  # this branch is executed
        masks = ops.process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True)  # HWC
        pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
    results.append(Results(orig_img, path=img_path, names=self.model.names, boxes=pred[:, :6], masks=masks))
return results
The masks are produced by:
masks = ops.process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True)
How does process_mask work?
First look at its arguments: proto[i] is (32,120,160); pred is (5,38), and columns 6 onward are the 32 mask coefficients, giving (5,32); the boxes pred[:, :4] are (5,4); img.shape[2:] is the height and width of the network input.
def process_mask(protos, masks_in, bboxes, shape, upsample=False):
    c, mh, mw = protos.shape  # CHW, here 32, 120, 160
    ih, iw = shape  # 480, 640
    masks = (masks_in @ protos.float().view(c, -1)).view(-1, mh, mw)  # CHW, here (5, 120, 160)
    width_ratio = mw / iw  # 0.25
    height_ratio = mh / ih  # 0.25

    downsampled_bboxes = bboxes.clone()  # clone so the original boxes are untouched
    downsampled_bboxes[:, 0] *= width_ratio  # scale the boxes down to prototype resolution
    downsampled_bboxes[:, 2] *= width_ratio
    downsampled_bboxes[:, 3] *= height_ratio
    downsampled_bboxes[:, 1] *= height_ratio

    masks = crop_mask(masks, downsampled_bboxes)  # CHW
    if upsample:
        masks = F.interpolate(masks[None], shape, mode="bilinear", align_corners=False)[0]  # CHW
    return masks.gt_(0.0)
Original bbox values:
tensor([[2.3550e+02, 1.1798e+02, 3.6113e+02, 3.4263e+02],
[2.7596e-01, 1.5049e+02, 1.8605e+02, 4.1289e+02],
[5.9170e+02, 1.7436e+02, 6.3966e+02, 3.0410e+02],
[9.4319e+00, 1.5594e+02, 4.7882e+02, 4.7825e+02],
[3.0707e+01, 1.3653e+02, 4.7746e+02, 4.7805e+02]], device='cuda:0')
Downscaled bbox values:
tensor([[5.8876e+01, 2.9494e+01, 9.0283e+01, 8.5657e+01],
[6.8989e-02, 3.7624e+01, 4.6512e+01, 1.0322e+02],
[1.4793e+02, 4.3590e+01, 1.5992e+02, 7.6024e+01],
[2.3580e+00, 3.8984e+01, 1.1971e+02, 1.1956e+02],
[7.6767e+00, 3.4133e+01, 1.1936e+02, 1.1951e+02]], device='cuda:0')
Next, each mask is cropped so that it stays inside its bounding box:
masks = crop_mask(masks, downsampled_bboxes)
def crop_mask(masks, boxes):
    _, h, w = masks.shape
    x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1)  # x1 shape(n,1,1)
    r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :]  # rows shape(1,1,w)
    c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None]  # cols shape(1,h,1)

    return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
The masks are still (5,120,160). They are then upsampled by bilinear interpolation to the network input size, giving (5,480,640); after the final thresholding (> 0) these are the actual masks.
Understanding the Masks
The 32-dimensional vector output by the detection (segmentation) head can be seen as the coefficients, or weights, of the segmentation mask associated with each detection box. For the segmentation-head output of 1x32x160x160, the key concept is the prototype masks: a fixed number (32) of base masks, each of size 160x160. These base masks do not correspond directly to any particular object or class; they are designed so that linear combinations of them can represent any possible object mask.
Put simply, instead of directly predicting a complete mask for every object, the model predicts a set of base masks (the prototype masks) plus, for each object, a recipe for combining them (the weights/coefficients). The benefit is that the model only has to predict a fairly small mask tensor, and full object masks can then be assembled with a simple matrix multiplication. You can think of it as basis vectors in linear algebra: any vector in the space can be written as a linear combination of a set of basis vectors. The prototype masks, i.e. the 32x160x160 tensor, play the role of the basis, and the 32-dimensional vector attached to each detection box plays the role of the combination weights (coefficients).
So when the detection head gives us a 32-dimensional vector and the segmentation head gives us 32 base masks, that vector describes how to combine the base masks into the mask of one particular object: we take a linear combination of the 32 base masks weighted by the vector and obtain the final mask associated with that detection box. It is like having 32 different paints, with the detection head handing you a recipe (the 32-dimensional vector) that tells you how to mix them into one particular color (the final mask). The advantage is that we never have to predict a full mask for every detection box, which would be very costly in memory and compute; we only predict a small 32-dimensional vector plus a fixed set of base masks and combine them during post-processing.
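A toy illustration of this "basis plus coefficients" idea (random data, shapes only), which is exactly the matrix multiplication at the top of process_mask:
import torch

protos = torch.randn(32, 160, 160)  # 32 prototype masks
coeffs = torch.randn(5, 32)         # one 32-dim coefficient vector per detected box
masks = (coeffs @ protos.view(32, -1)).view(-1, 160, 160)
print(masks.shape)                  # torch.Size([5, 160, 160])
# In process_mask the result is then cropped to each box and thresholded (> 0).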
Visualizing the Results
Finally, here is the code I used to visualize the results:
import cv2
import numpy as np
from ultralytics import YOLO


def hsv2bgr(h, s, v):
    h_i = int(h * 6)
    f = h * 6 - h_i
    p = v * (1 - s)
    q = v * (1 - f * s)
    t = v * (1 - (1 - f) * s)

    r, g, b = 0, 0, 0

    if h_i == 0:
        r, g, b = v, t, p
    elif h_i == 1:
        r, g, b = q, v, p
    elif h_i == 2:
        r, g, b = p, v, t
    elif h_i == 3:
        r, g, b = p, q, v
    elif h_i == 4:
        r, g, b = t, p, v
    elif h_i == 5:
        r, g, b = v, p, q

    return int(b * 255), int(g * 255), int(r * 255)


def random_color(id):
    h_plane = (((id << 2) ^ 0x937151) % 100) / 100.0
    s_plane = (((id << 3) ^ 0x315793) % 100) / 100.0
    return hsv2bgr(h_plane, s_plane, 1)


if __name__ == "__main__":
    model = YOLO("yolov8n-seg.pt")
    img = cv2.imread("img.jpg")
    result = model(img)[0]

    names = result.names
    boxes = result.boxes.data.tolist()
    masks = result.masks

    h, w = img.shape[:2]
    # Blend each instance mask onto the image with a per-class color.
    for i, mask in enumerate(masks.data):
        mask = mask.cpu().numpy().astype(np.uint8)
        mask_resized = cv2.resize(mask, (w, h))
        label = int(boxes[i][5])
        color = np.array(random_color(label))
        colored_mask = (np.ones((h, w, 3)) * color).astype(np.uint8)
        masked_colored_mask = cv2.bitwise_and(colored_mask, colored_mask, mask=mask_resized)
        mask_indices = mask_resized == 1
        img[mask_indices] = (img[mask_indices] * 0.6 + masked_colored_mask[mask_indices] * 0.4).astype(np.uint8)

    # Draw the detection boxes and captions.
    for obj in boxes:
        left, top, right, bottom = int(obj[0]), int(obj[1]), int(obj[2]), int(obj[3])
        confidence = obj[4]
        label = int(obj[5])
        color = random_color(label)
        cv2.rectangle(img, (left, top), (right, bottom), color=color, thickness=2, lineType=cv2.LINE_AA)
        caption = f"{names[label]} {confidence:.2f}"
        w, h = cv2.getTextSize(caption, 0, 1, 2)[0]
        cv2.rectangle(img, (left - 3, top - 33), (left + w + 10, top), color, -1)
        cv2.putText(img, caption, (left, top - 5), 0, 1, (0, 0, 0), 2, 16)

    cv2.imwrite("predict-seg.jpg", img)
    print("save done")