# 46 语义分割和数据集
"""
图像分割将图像划分为若干组成区域,这类问题的方法通常利用图像中像素之间的相关性。
它在训练时不需要有关图像像素的标签信息,在预测时也无法保证分割出的区域具有我们希望得到的语义。
图像分割可能会将狗分为两个区域:一个覆盖以黑色为主的嘴和眼睛,另一个覆盖以黄色为主的其余部分身体。
实例分割也叫同时检测并分割(simultaneous detection and segmentation),
它研究如何识别图像中各个目标实例的像素级区域。
与语义分割不同,实例分割不仅需要区分语义,还要区分不同的目标实例。
例如,如果图像中有两条狗,则实例分割需要区分像素属于的两条狗中的哪一条。
"""
import os
import torch
import torchvision
from d2l import torch as d2l
import matplotlib. pyplot as plt
# Root directory of the VOC2012 dataset; passed to read_voc_images and
# VOCSegDataset by the script code in this file.
voc_dir = '../data/VOCdevkit/VOC2012'
def read_voc_images(voc_dir, is_train=True):
    """Read every image of one VOC segmentation split, with labels, into memory.

    Args:
        voc_dir: Root directory of the dataset. It must contain the
            ``ImageSets/Segmentation``, ``JPEGImages`` and
            ``SegmentationClass`` sub-directories.
        is_train: When true, the image names listed in ``train.txt`` are
            used; otherwise those listed in ``val.txt``.

    Returns:
        A ``(features, labels)`` pair of equally long lists. For each listed
        image name, ``features`` holds the tensor decoded from the JPEG file
        and ``labels`` the tensor decoded from the PNG file in RGB mode.
    """
    txt_fname = os.path.join(voc_dir, 'ImageSets', 'Segmentation',
                             'train.txt' if is_train else 'val.txt')
    # Labels are decoded as RGB; voc_label_indices later maps those RGB
    # values to class indices.
    mode = torchvision.io.image.ImageReadMode.RGB
    with open(txt_fname, 'r') as f:
        # The split file is a whitespace-separated list of image names.
        images = f.read().split()
    features, labels = [], []
    for fname in images:
        # File names are "<name>.jpg" / "<name>.png" with no embedded
        # spaces; padding inside the f-string would point at missing files.
        features.append(torchvision.io.read_image(
            os.path.join(voc_dir, 'JPEGImages', f'{fname}.jpg')))
        labels.append(torchvision.io.read_image(
            os.path.join(voc_dir, 'SegmentationClass', f'{fname}.png'),
            mode))
    return features, labels
# Load the training split and plot the first n images in one row with their
# label images in the row below.
train_features, train_labels = read_voc_images(voc_dir, is_train=True)
n = 5
imgs = [
    picture.permute(1, 2, 0)  # move axis 0 last (presumably CHW -> HWC)
    for picture in train_features[:n] + train_labels[:n]
]
d2l.show_images(imgs, 2, n)
plt.show()
# RGB colour of each class in the label images. The position of a colour in
# this list is the class index, and matches the position of the class name
# in VOC_CLASSES.
VOC_COLORMAP = [
    [0, 0, 0],        # 0  background
    [128, 0, 0],      # 1  aeroplane
    [0, 128, 0],      # 2  bicycle
    [128, 128, 0],    # 3  bird
    [0, 0, 128],      # 4  boat
    [128, 0, 128],    # 5  bottle
    [0, 128, 128],    # 6  bus
    [128, 128, 128],  # 7  car
    [64, 0, 0],       # 8  cat
    [192, 0, 0],      # 9  chair
    [64, 128, 0],     # 10 cow
    [192, 128, 0],    # 11 diningtable
    [64, 0, 128],     # 12 dog
    [192, 0, 128],    # 13 horse
    [64, 128, 128],   # 14 motorbike
    [192, 128, 128],  # 15 person
    [0, 64, 0],       # 16 potted plant
    [128, 64, 0],     # 17 sheep
    [0, 192, 0],      # 18 sofa
    [128, 192, 0],    # 19 train
    [0, 64, 128],     # 20 tv/monitor
]

# Class names, ordered by class index.
VOC_CLASSES = [
    'background',
    'aeroplane',
    'bicycle',
    'bird',
    'boat',
    'bottle',
    'bus',
    'car',
    'cat',
    'chair',
    'cow',
    'diningtable',
    'dog',
    'horse',
    'motorbike',
    'person',
    'potted plant',
    'sheep',
    'sofa',
    'train',
    'tv/monitor',
]
def voc_colormap2label():
    """Build the lookup table from a packed RGB colour to a VOC class index.

    Returns:
        A ``torch.long`` tensor with ``256 ** 3`` entries. The entry at
        position ``(r * 256 + g) * 256 + b`` holds the index of the colour
        ``[r, g, b]`` within ``VOC_COLORMAP``; every other entry is 0.
    """
    lookup = torch.zeros(256 ** 3, dtype=torch.long)
    for class_idx, (red, green, blue) in enumerate(VOC_COLORMAP):
        # Pack the three channel values into a single base-256 number.
        packed = (red * 256 + green) * 256 + blue
        lookup[packed] = class_idx
    return lookup
def voc_label_indices ( colormap, colormap2label) :
"""将VOC标签中的RGB值映射到它们的类别索引"""
colormap = colormap. permute( 1 , 2 , 0 ) . numpy( ) . astype( 'int32' )
idx = ( ( colormap[ : , : , 0 ] * 256 + colormap[ : , : , 1 ] ) * 256
+ colormap[ : , : , 2 ] )
return colormap2label[ idx]
"""
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
[0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
[0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
[0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
[0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
[0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
[0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
[0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 1]])
aeroplane
"""
def voc_rand_crop(feature, label, height, width):
    """Randomly crop a feature image and its label image at the same place.

    Args:
        feature: Image used to draw the random crop rectangle.
        label: Label image cropped with the same rectangle as ``feature``.
        height: Crop height.
        width: Crop width.

    Returns:
        The ``(cropped_feature, cropped_label)`` pair.
    """
    # Draw a single rectangle so both crops cover the same pixels.
    top, left, crop_h, crop_w = torchvision.transforms.RandomCrop.get_params(
        feature, (height, width))
    crop = torchvision.transforms.functional.crop
    cropped_feature = crop(feature, top, left, crop_h, crop_w)
    cropped_label = crop(label, top, left, crop_h, crop_w)
    return cropped_feature, cropped_label
# Draw n random 200x300 crops of the first training image and its label.
imgs = []
for _ in range(n):
    imgs.extend(
        voc_rand_crop(train_features[0], train_labels[0], 200, 300))
imgs = [img.permute(1, 2, 0) for img in imgs]
# imgs alternates feature, label, feature, label, ... so the even positions
# are the features and the odd positions the labels. For example, with
# imgs = [0, 1, 2, 3, 4, 5]:
#   imgs[::2]  -> [0, 2, 4]
#   imgs[1::2] -> [1, 3, 5]
#   imgs[::2] + imgs[1::2] -> [0, 2, 4, 1, 3, 5]
d2l.show_images(imgs[::2] + imgs[1::2], 2, n)
plt.show()
class VOCSegDataset(torch.utils.data.Dataset):
    """Custom dataset that serves randomly cropped VOC segmentation samples.

    All images of the selected split are read and kept in memory when the
    dataset is constructed; images smaller than the crop size are dropped.
    """

    def __init__(self, is_train, crop_size, voc_dir):
        """Load one split of the dataset.

        Args:
            is_train: Forwarded to ``read_voc_images`` to select the split.
            crop_size: ``(height, width)`` of the crops produced by indexing.
            voc_dir: Dataset root directory, forwarded to
                ``read_voc_images``.
        """
        self.transform = torchvision.transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        self.crop_size = crop_size
        raw_features, raw_labels = read_voc_images(voc_dir, is_train=is_train)
        # Features and labels are filtered independently; they stay aligned
        # only if each label has the same height and width as its image
        # (assumed here).
        large_enough = self.filter(raw_features)
        self.features = [self.normalize_image(image) for image in large_enough]
        self.labels = self.filter(raw_labels)
        self.colormap2label = voc_colormap2label()
        example_count = len(self.features)
        print('read ' + str(example_count) + ' examples')

    def normalize_image(self, img):
        """Convert ``img`` to float, divide by 255 and apply ``self.transform``."""
        scaled = img.float() / 255
        return self.transform(scaled)

    def filter(self, imgs):
        """Return the images whose axes 1 and 2 are at least the crop size."""
        kept = []
        for img in imgs:
            too_small = (img.shape[1] < self.crop_size[0]
                         or img.shape[2] < self.crop_size[1])
            if too_small:
                continue
            kept.append(img)
        return kept

    def __getitem__(self, idx):
        """Return a random crop of sample ``idx`` and its class-index label."""
        cropped_feature, cropped_label = voc_rand_crop(
            self.features[idx], self.labels[idx], *self.crop_size)
        label_indices = voc_label_indices(cropped_label, self.colormap2label)
        return (cropped_feature, label_indices)

    def __len__(self):
        """Return the number of samples kept after filtering."""
        return len(self.features)
# Build the training and validation datasets with 320x480 random crops.
crop_size = (320, 480)
voc_train = VOCSegDataset(is_train=True, crop_size=crop_size, voc_dir=voc_dir)
voc_test = VOCSegDataset(is_train=False, crop_size=crop_size, voc_dir=voc_dir)

batch_size = 64
train_iter = torch.utils.data.DataLoader(
    voc_train,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=0,
)
# Print the feature and label shapes of the first minibatch only.
for X, Y in train_iter:
    print(X.shape, Y.shape, sep='\n')
    break
def load_data_voc(batch_size, crop_size,
                  voc_dir='../data/VOCdevkit/VOC2012', num_workers=4):
    """Load the VOC semantic segmentation dataset as two data loaders.

    Args:
        batch_size: Number of samples per minibatch for both loaders.
        crop_size: ``(height, width)`` of the random crops, forwarded to
            ``VOCSegDataset``.
        voc_dir: Dataset root directory. Defaults to the path that was
            previously hard-coded, so existing callers are unaffected.
        num_workers: Number of worker processes used by each loader.
            Defaults to the previously hard-coded value of 4.

    Returns:
        A ``(train_iter, test_iter)`` pair of ``DataLoader`` objects. Only
        the training loader shuffles; both drop the last incomplete batch.
    """
    train_iter = torch.utils.data.DataLoader(
        VOCSegDataset(True, crop_size, voc_dir), batch_size,
        shuffle=True, drop_last=True, num_workers=num_workers)
    test_iter = torch.utils.data.DataLoader(
        VOCSegDataset(False, crop_size, voc_dir), batch_size,
        drop_last=True, num_workers=num_workers)
    return train_iter, test_iter