Linux bio数据结构

数据结构


/*
 * main unit of I/O for the block layer and lower layers (ie drivers and
 * stacking drivers)
 */
struct bio {
	struct bio		*bi_next;	/* request queue link */
	struct gendisk		*bi_disk;
	unsigned int		bi_opf;		/* bottom bits req flags,
						 * top bits REQ_OP. Use
						 * accessors.
						 */
	unsigned short		bi_flags;	/* status, etc and bvec pool number */
	unsigned short		bi_ioprio;
	unsigned short		bi_write_hint;
	blk_status_t		bi_status;
	u8			bi_partno;

	/* Number of segments in this BIO after
	 * physical address coalescing is performed.
	 */
	unsigned int		bi_phys_segments;

	/*
	 * To keep track of the max segment size, we account for the
	 * sizes of the first and last mergeable segments in this bio.
	 */
	unsigned int		bi_seg_front_size;
	unsigned int		bi_seg_back_size;

	struct bvec_iter	bi_iter;

	atomic_t		__bi_remaining;
	bio_end_io_t		*bi_end_io;

	void			*bi_private;
#ifdef CONFIG_BLK_CGROUP
	/*
	 * Optional ioc and css associated with this bio.  Put on bio
	 * release.  Read comment on top of bio_associate_current().
	 */
	struct io_context	*bi_ioc;
	struct cgroup_subsys_state *bi_css;
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
	void			*bi_cg_private;
	struct blk_issue_stat	bi_issue_stat;
#endif
#endif
	union {
#if defined(CONFIG_BLK_DEV_INTEGRITY)
		struct bio_integrity_payload *bi_integrity; /* data integrity */
#endif
	};

	unsigned short		bi_vcnt;	/* how many bio_vec's */


	/*
	 * Everything starting with bi_max_vecs will be preserved by bio_reset()
	 */

	unsigned short		bi_max_vecs;	/* max bvl_vecs we can hold */

	atomic_t		__bi_cnt;	/* pin count */

	struct bio_vec		*bi_io_vec;	/* the actual vec list */

	struct bio_set		*bi_pool;

	/* Encryption context. May contain secret key material. */
	struct bio_crypt_ctx	bi_crypt_ctx;
	/*
	 * We can inline a number of vecs at the end of the bio, to avoid
	 * double allocations for a small number of bio_vecs. This member
	 * MUST obviously be kept at the very end of the bio.
	 */
	struct bio_vec		bi_inline_vecs[0];
};

/*
 * was unsigned short, but we might as well be ready for > 64kB I/O pages
 */
struct bio_vec {
	struct page	*bv_page;
	unsigned int	bv_len;
	unsigned int	bv_offset;
};

struct bvec_iter {
	sector_t		bi_sector;	/* device address in 512 byte
						   sectors */
	unsigned int		bi_size;	/* residual I/O count */

	unsigned int		bi_idx;		/* current index into bvl_vec */

	unsigned int            bi_done;	/* number of bytes completed */

	unsigned int            bi_bvec_done;	/* number of bytes completed in
						   current bvec */
};

bio的作用

已读取文件为例，必须知道文件内偏移和磁盘数据块的映射关系，比如已Ext4文件系统为例，可以通过ext4_map_blocks查找extent B+数据查到映射关系，而读取磁盘数据存放到page cache中，而bio正是：描述磁盘数据块和page cache页的对应关系。每个bio对应磁盘一块连续的位置，这个连续的位置可以对应一或者多个page，所以一个bio有一个bi_io_vec表。这样的好处是读取的文件是连续的一块，那么只要一个bio即可。磁盘数据块由bvec_iter中的bi_sector表示，代表读取的连续数据块的一个扇区号。

正如宋宝华文章中提到的（这小段引用自参考文章）：

我们现在假设2种情况

第1种情况是page_cache_sync_readahead()要读的0~16KB数据，在硬盘里面正好是顺序排列的(是否顺序排列，要查文件系统，如ext3、ext4)，Linux会为这一次4页的读，分配1个bio就足够了，并且让这个bio里面分配4个bi_io_vec，指向4个不同的内存页：

第2种情况是page_cache_sync_readahead()要读的0~16KB数据，在硬盘里面正好是完全不连续的4块 (是否顺序排列，要查文件系统，如ext3、ext4)，Linux会为这一次4页的读，分配4个bio，并且让这4个bio里面，每个分配1个bi_io_vec，指向4个不同的内存页面：

当然你还可以有第3种情况，比如0~8KB在硬盘里面连续，8~16KB不连续，那可以是这样的：

读取文件场景bio的创建和使用

ext4_mpage_readpages函数就会读取磁盘数据就会构建一个个的bio，如上面描述我们知道bio指向的是一块连续的磁盘数据，也就是说如果读取文件不连续之后，就要新建一个bio，我们看看ext4_mpage_readpages是怎么实现该逻辑的：


int ext4_mpage_readpages(struct address_space *mapping,
			 struct list_head *pages, struct page *page,
			 unsigned nr_pages)
{
	struct bio *bio = NULL;
	sector_t last_block_in_bio = 0;

	struct inode *inode = mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	const unsigned blocksize = 1 << blkbits;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t blocks[MAX_BUF_PER_PAGE];
	unsigned page_block;
	struct block_device *bdev = inode->i_sb->s_bdev;
	int length;
	unsigned relative_block = 0;
	struct ext4_map_blocks map;

	map.m_pblk = 0;
	map.m_lblk = 0;
	map.m_len = 0;
	map.m_flags = 0;

	for (; nr_pages; nr_pages--) {
		int fully_mapped = 1;
		unsigned first_hole = blocks_per_page;

		prefetchw(&page->flags);
		if (pages) {
			page = list_entry(pages->prev, struct page, lru);
			list_del(&page->lru);
			if (add_to_page_cache_lru(page, mapping, page->index,
				  readahead_gfp_mask(mapping)))
				goto next_page;
		}

		if (page_has_buffers(page))
			goto confused;

		block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
		last_block = block_in_file + nr_pages * blocks_per_page;
		last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
		if (last_block > last_block_in_file)
			last_block = last_block_in_file;
		page_block = 0;

		/*
		 * Map blocks using the previous result first.
		 */
        //1:

		if ((map.m_flags & EXT4_MAP_MAPPED) &&
		    block_in_file > map.m_lblk &&
		    block_in_file < (map.m_lblk + map.m_len)) {
			unsigned map_offset = block_in_file - map.m_lblk;
			unsigned last = map.m_len - map_offset;

			for (relative_block = 0; ; relative_block++) {
				if (relative_block == last) {
					/* needed? */
					map.m_flags &= ~EXT4_MAP_MAPPED;
					break;
				}
				if (page_block == blocks_per_page)
					break;
				blocks[page_block] = map.m_pblk + map_offset +
					relative_block;
				page_block++;
				block_in_file++;
			}
		}

		/*
		 * Then do more ext4_map_blocks() calls until we are
		 * done with this page.
		 */
        //2:
		while (page_block < blocks_per_page) {
			if (block_in_file < last_block) {
				map.m_lblk = block_in_file;
				map.m_len = last_block - block_in_file;

				if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
				set_error_page:
					SetPageError(page);
					zero_user_segment(page, 0,
							  PAGE_SIZE);
					unlock_page(page);
					goto next_page;
				}
			}
            //3:
			if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
				fully_mapped = 0;
				if (first_hole == blocks_per_page)
					first_hole = page_block;
				page_block++;
				block_in_file++;
				continue;
			}

			if (first_hole != blocks_per_page)
				goto confused;		/* hole -> non-hole */

			/* Contiguous blocks? */
            //4:
			if (page_block && blocks[page_block-1] != map.m_pblk-1)
				goto confused;
            //5
			for (relative_block = 0; ; relative_block++) {
				if (relative_block == map.m_len) {
					/* needed? */
					map.m_flags &= ~EXT4_MAP_MAPPED;
					break;
				} else if (page_block == blocks_per_page)
					break;
				blocks[page_block] = map.m_pblk+relative_block;
				page_block++;
				block_in_file++;
			}
		}
        //6:
		if (first_hole != blocks_per_page) {
			zero_user_segment(page, first_hole << blkbits,
					  PAGE_SIZE);
			if (first_hole == 0) {
				SetPageUptodate(page);
				unlock_page(page);
				goto next_page;
			}
		} else if (fully_mapped) {
            //7
			SetPageMappedToDisk(page);
		}

		if (fully_mapped && blocks_per_page == 1 &&
		    !PageUptodate(page) && cleancache_get_page(page) == 0) {
			SetPageUptodate(page);
			goto confused;
		}

		/*
		 * This page will go to BIO.  Do we need to send this
		 * BIO off first?
		 */
        //8:
		if (bio && (last_block_in_bio != blocks[0] - 1)) {
		submit_and_realloc:
			ext4_submit_bio_read(bio);
			bio = NULL;
		}
		if (bio == NULL) {
			struct fscrypt_ctx *ctx = NULL;

			if (ext4_encrypted_inode(inode) &&
			    S_ISREG(inode->i_mode)) {
				ctx = fscrypt_get_ctx(inode, GFP_NOFS);
				if (IS_ERR(ctx))
					goto set_error_page;
			}
			bio = bio_alloc(GFP_KERNEL,
				min_t(int, nr_pages, BIO_MAX_PAGES));
			if (!bio) {
				if (ctx)
					fscrypt_release_ctx(ctx);
				goto set_error_page;
			}
			bio_set_dev(bio, bdev);
			bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
			bio->bi_end_io = mpage_end_io;
			bio->bi_private = ctx;
			ext4_set_bio_ctx(inode, bio);
			bio_set_op_attrs(bio, REQ_OP_READ, 0);
		}

		length = first_hole << blkbits;
        //9
		if (bio_add_page(bio, page, length, 0) < length)
			goto submit_and_realloc;

		if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
		     (relative_block == map.m_len)) ||
		    (first_hole != blocks_per_page)) {
			ext4_submit_bio_read(bio);
			bio = NULL;
		} else
			last_block_in_bio = blocks[blocks_per_page - 1];
		goto next_page;
    //10
	confused:
		if (bio) {
			ext4_submit_bio_read(bio);
			bio = NULL;
		}
		if (!PageUptodate(page))
			block_read_full_page(page, ext4_get_block);
		else
			unlock_page(page);
	next_page:
		if (pages)
			put_page(page);
	}
	BUG_ON(pages && !list_empty(pages));
	if (bio)
		ext4_submit_bio_read(bio);
	return 0;
}

1：第2步中调用ext4_map_blocks的时候map.m_len可以连续映射多个block，这样下次循环时候EXT4_MAP_MAPPED flag就是设置，使用先前映射好的结果即可。

2：ext4_map_blocks执行真正的文件逻辑地址到磁盘地址的映射。如果错误返回值小于0，没有错误的时候返回的是映射的block数。注意：假设map.m_len = 32，实际ext4 extent tree上没有连续32个block，这种情况就无法映射32个block，比如可能仅仅映射了10个block，此时map.m_flags依然会设置EXT4_MAP_MAPPED。

ext4_map_blocks->ext4_ext_map_blocks:

上面代码逻辑需要理解ext4 extent tree查找逻辑，ee_len代表ex映射的连续物理块个数，ee_block代表ex映射的起始路基块地址，ee_start：ex起始逻辑地址映射的起始物理块号。代码中newblock=map->m_lblk - ee_block + ee_start即是起始逻辑地址映射的磁盘块地址，allocated = ee_len - (map->m_lblk - ee_block)是成功映射的block数，注意未必等于map.m_len，有可能小于map.m_len，因为磁盘block未必有map.m_len个连续的块。

3：没有设置EXT4_MAP_MAPPED的情况，比如是file hole(文件洞）的情况，此处暂不考虑该种情况。

4：blocks[page_block-1] != map.m_pblk-1条件满足时，代表着block块不再连续，根据bio的设计原则，一旦block不连续，就需要新建一个bio，goto confuse就是处理不连续情况，提交当前的bio，并且bio = NULL，后面就新建bio了。注意这里主要是针对一个page内的block不连续，比如page size = 4K, block size = 1K，一个page中存在4个 block 块缓冲区，可能这4个block就不连续，这里就是处理这种情况，一旦不连续goto confuse就会调用进入block_read_full_page逻辑。

5：调用到这里代表block是连续，将物理block地址存入blocks数组中。

6：有file hole的情况，直接将page cache数据设置0，并且SetPageUptodate及unlock_page解锁page。否则进入7。

7：SetPageMappedToDisk设置BH_Mapped标志位，代表成功完成了逻辑地址到磁盘block地址的映射。

8：这里主要是针对page间block块不连续问题，block块不连续后submit_bio提交当前的bio，然后bio = NULL，触发if(bio = NULL)重新创建新的bio。

9：bio_add_page将page加入bio中，因为前面提到一个bio可以包含多个page，所以连续的block对应的page可以一直通过bio_add_page加入到bio中。

10：confused代表block不连续情况的一种处理逻辑。

参考文章：

宋宝华：文件读写（BIO）波澜壮阔的一生

Linux 通用块层 bio 详解 – 字节岛技术分享