Linux 读文件 - readahead预读算法

顺序读场景

intmain

{

charc[ 4096];

intin = -1;

in = open( "news.txt", O_RDONLY);

intindex= 0;

while(read(in, &c, 4096) == 4096)

{

printf( "index: %d,len: %ld.\n",index, strlen(c));

memset(c, 0, sizeof(c));

index++;

}

数据结构

/*
 * Track a single file's readahead state
 */
struct file_ra_state {
	pgoff_t start;			/* where readahead started */
	unsigned int size;		/* # of readahead pages */
	unsigned int async_size;	/* do asynchronous readahead when
					   there are only # of pages ahead */

	unsigned int ra_pages;		/* Maximum readahead window */
	unsigned int mmap_miss;		/* Cache miss stat for mmap accesses */
	loff_t prev_pos;		/* Cache last read() position */
};

start: 开始预读的数据页索引，是指相对文件内的index。

size : 一共要预读多个少页面。

async_size: 如果当前预读的“存货”只剩async_size时，就会触发async readahead，这个值很重要，控着async readahead的时机。异步读触发时类似网络协议栈的滑动窗口，窗口会移动，也就是file_ra_state字段会更新，具体如何更新见下文。这里async readahead就是指generic_file_buffered_read函数中的：page_cache_async_readahead函数调用

Linux read的核心函数generic_file_buffered_read_nginux的博客-CSDN博客

ra_page: readahead窗口的最大值，类似网络协议栈滑动窗口，这个窗口有最大限制。

prev_pos : 上次读的postion，文件内偏移，主义单位时bytes。

注意：PageReadahead的page非常重要，因为一旦应用程序读取数据到这个page，就要触发异步预读（page_cache_async_readahead）

数据结构初始化:

代码：mm/readahead.c: ondemand_readahead（预读算法的实现函数）


/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static void ondemand_readahead(struct address_space *mapping,
		struct file_ra_state *ra, struct file *filp,
		bool hit_readahead_marker, pgoff_t index,
		unsigned long req_size)
{
	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
	unsigned long max_pages = ra->ra_pages;
	unsigned long add_pages;
	pgoff_t prev_index;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	if (req_size > max_pages && bdi->io_pages > max_pages)
		max_pages = min(req_size, bdi->io_pages);

	/*
	 * start of file
	 */
	if (!index)
		goto initial_readahead;

    ...

initial_readahead:
	ra->start = index;
    //req_size是指预读请求的大小数据页数量（每个数据页4K)
	ra->size = get_init_ra_size(req_size, max_pages);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

    ...
    //提交文件读取，触发io预读
	ra_submit(ra, mapping, filp);
}

/*
 * Set the initial window size, round to next power of 2 and square
 * for small size, x 4 for medium, and x 2 for large
 * for 128k (32 page) max ra
 * 1-8 page = 32k initial, > 8 page = 128k initial
 */

//第一个参数：请求读取的数据页数量，比如目前场景每次读取4K字节，size = 1
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

    //每次读取小于4K bytes, 预读大小是请求大小的4被，比如read(4096), newsize = 1 * 4 = 4;
	if (newsize <= max / 32)
		newsize = newsize * 4;

    //中等读取大小，即1pages < ra <= 8pages，预读大小设置成请求大小的2倍
    //比如read(4K) newsize = 4 * 2 = 8
	else if (newsize <= max / 4)
		newsize = newsize * 2;
    //大的读取大小，即每次读取大于8 pages，比如read(64K) newsize = max = 32 pages
	else
		newsize = max;

	return newsize;
}

根据《Linux read的核心函数generic_file_buffered_read_nginux的博客-CSDN博客》文章我们知道，顺序读取大概调用逻辑:

1. 触发同步预读（sync readahead)

2.异步异步读取（async readahead)

首次同步预读 - sync readahead

调用路径：generic_file_buffered_read

-->page_cache_sync_readahead

-->ondemand_readahead : intial_readahead

--->ra_submit 向block layer请求io预读

首次文件头读进入ondemand_readahead的initial_readahead逻辑，计算file_ra_state值：

ra->start = 0; 因为是从文件头开始读取

ra->size = 4;

ra->async_size = 3;

ra->ra_pages = 32;

ra->prev_pos = -1;

根据ondemand_readahead中initial_readahead label处逻辑看，ra->size是由get_init_ra_size函数计算，该函数第一个参数是应用read的数据页（每个数据页4K)的数量，该场景每次读取4K bytes,相当于调用get_init_ra_size（1，32）返回4。

ra_submit向block layer请求io预读

/*
 * Submit IO for the read-ahead request in file_ra_state.
 */
static inline void ra_submit(struct file_ra_state *ra,
		struct address_space *mapping, struct file *filp)
{
	__do_page_cache_readahead(mapping, filp,
			ra->start, ra->size, ra->async_size);
}

/*
 * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates
 * the pages first, then submits them for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 */
void __do_page_cache_readahead(struct address_space *mapping,
		struct file *file, pgoff_t index, unsigned long nr_to_read,
		unsigned long lookahead_size)
{
	struct inode *inode = mapping->host;
	loff_t isize = i_size_read(inode);
	pgoff_t end_index;	/* The last page we want to read */

	if (isize == 0)
		return;

	end_index = (isize - 1) >> PAGE_SHIFT;
	if (index > end_index)
		return;
	/* Don't read past the page containing the last byte of the file */
	if (nr_to_read > end_index - index)
		nr_to_read = end_index - index + 1;

	page_cache_readahead_unbounded(mapping, file, index, nr_to_read,
			lookahead_size);
}

static inline gfp_t readahead_gfp_mask(struct address_space *x)
{
	return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN;
}


/**
 * page_cache_readahead_unbounded - Start unchecked readahead.
 * @mapping: File address space.
 * @file: This instance of the open file; used for authentication.
 * @index: First page index to read.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size.  This is almost certainly
 * not the function you want to call.  Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 *
 * Context: File is referenced by caller.  Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_readahead_unbounded(struct address_space *mapping,
		struct file *file, pgoff_t index, unsigned long nr_to_read,
		unsigned long lookahead_size)
{
	LIST_HEAD(page_pool);

    //注意这里添加了__GFP_NORETRY | __GFP_NOWARN，page内存申请不会进入慢路径
	gfp_t gfp_mask = readahead_gfp_mask(mapping);
	struct readahead_control rac = {
		.mapping = mapping,
		.file = file,
		._index = index,
	};
	unsigned long i;

	/*
	 * Partway through the readahead operation, we will have added
	 * locked pages to the page cache, but will not yet have submitted
	 * them for I/O.  Adding another page may need to allocate memory,
	 * which can trigger memory reclaim.  Telling the VM we're in
	 * the middle of a filesystem operation will cause it to not
	 * touch file-backed pages, preventing a deadlock.  Most (all?)
	 * filesystems already specify __GFP_NOFS in their mapping's
	 * gfp_mask, but let's be explicit here.
	 */
	unsigned int nofs = memalloc_nofs_save();

	/*
	 * Preallocate as many pages as we will need.
	 */
	for (i = 0; i < nr_to_read; i++) {
		struct page *page = xa_load(&mapping->i_pages, index + i);

		BUG_ON(index + i != rac._index + rac._nr_pages);

		if (page && !xa_is_value(page)) {
			/*
			 * Page already present?  Kick off the current batch
			 * of contiguous pages before continuing with the
			 * next batch.  This page may be the one we would
			 * have intended to mark as Readahead, but we don't
			 * have a stable reference to this page, and it's
			 * not worth getting one just for that.
			 */
			read_pages(&rac, &page_pool, true);
			continue;
		}

		page = __page_cache_alloc(gfp_mask);
		if (!page)
			break;
		if (mapping->a_ops->readpages) {
			page->index = index + i;
			list_add(&page->lru, &page_pool);
		} else if (add_to_page_cache_lru(page, mapping, index + i,
					gfp_mask) < 0) {
			put_page(page);
			read_pages(&rac, &page_pool, true);
			continue;
		}
        //设置PageReadahead标志，非常重要
		if (i == nr_to_read - lookahead_size)
			SetPageReadahead(page);
		rac._nr_pages++;
	}

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	read_pages(&rac, &page_pool, false);
	memalloc_nofs_restore(nofs);
}

上面由于预读了4个页面，所以下次generic_file_buffered_read for(;;)循环中find_get_page会找到cache page(因为被预读进cache中了），循环继续，每次循环增加index值，由于index增加，就会触发PageReadahead（page)，进而调用page_cache_async_readahead：

这种设计是因为不能一直预读，因为预读失败会受到惩罚（浪费内存），而是要根据应用顺序读取到一定程度才进行新的预读，这个时机就是应用读取到PageReadahead（page)对应的page。page_cache_async_readahead-->ondemand_readahead触发新的“异步”预读：


/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static void ondemand_readahead(struct address_space *mapping,
		struct file_ra_state *ra, struct file *filp,
		bool hit_readahead_marker, pgoff_t index,
		unsigned long req_size)
{
	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
	unsigned long max_pages = ra->ra_pages;
	unsigned long add_pages;
	pgoff_t prev_index;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	if (req_size > max_pages && bdi->io_pages > max_pages)
		max_pages = min(req_size, bdi->io_pages);

	/*
	 * start of file
	 */
	if (!index)
		goto initial_readahead;

	/*
	 * It's the expected callback index, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
     //@1
	if ((index == (ra->start + ra->size - ra->async_size) ||
	     index == (ra->start + ra->size))) {
		ra->start += ra->size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for async_size, which normally equals to
	 * readahead size. Ramp it up and use it as the new readahead size.
	 */
    //@2 没有命中@1，比如没有完整顺序读的情况，刚好跳过了@1中的条件
	if (hit_readahead_marker) {
		pgoff_t start;

		rcu_read_lock();
		start = page_cache_next_miss(mapping, index + 1, max_pages);
		rcu_read_unlock();

		if (!start || start - index > max_pages)
			return;

		ra->start = start;
		ra->size = start - index;	/* old async_size */
		ra->size += req_size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * oversize read
	 */
	if (req_size > max_pages)
		goto initial_readahead;

	/*
	 * sequential cache miss
	 * trivial case: (index - prev_index) == 1
	 * unaligned reads: (index - prev_index) == 0
	 */
	prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
	if (index - prev_index <= 1UL)
		goto initial_readahead;

	/*
	 * Query the page cache and look for the traces(cached history pages)
	 * that a sequential stream would leave behind.
	 */
	if (try_context_readahead(mapping, ra, index, req_size, max_pages))
		goto readit;

	/*
	 * standalone, small random read
	 * Read as is, and do not pollute the readahead state.
	 */
    //@3随机读取，倒数第二个参数代表只读取用户要求数量的page数量，最后一个参数0代表不进行
    //预读
	__do_page_cache_readahead(mapping, filp, index, req_size, 0);
	return;

initial_readahead:
	ra->start = index;
	ra->size = get_init_ra_size(req_size, max_pages);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
	/*
	 * Will this read hit the readahead marker made by itself?
	 * If so, trigger the readahead marker hit now, and merge
	 * the resulted next readahead window into the current one.
	 * Take care of maximum IO pages as above.
	 */
	if (index == ra->start && ra->size == ra->async_size) {
		add_pages = get_next_ra_size(ra, max_pages);
		if (ra->size + add_pages <= max_pages) {
			ra->async_size = add_pages;
			ra->size += add_pages;
		} else {
			ra->size = max_pages;
			ra->async_size = max_pages >> 1;
		}
	}

	ra_submit(ra, mapping, filp);
}

@1：对应顺序读取命中预读，比如我们当前场景预读了4个page，当读取到index = 1第二个page的时候就会命中该逻辑，会将“预读窗口”向前移动。

满足index == (ra->start + ra->size - ra->async_size）条件，所以向前移动预读窗口：

//start = 4
ra->start += ra->size; 
//根据get_next_ra_size函数，其中cur = ra->size = 4，所以return max =32,直接将预读窗口放大到了最大32
ra->size = get_next_ra_size(ra, max_pages);
//async_size = ra->size
ra->async_size = ra->size;


/*
 *  Get the previous window size, ramp it up, and
 *  return it as the new window size.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
				      unsigned long max)
{
	unsigned long cur = ra->size;

	if (cur < max / 16)
		return 4 * cur;
	if (cur <= max / 2)
		return 2 * cur;
	return max;
}

@2：没有触发1，但是触发异步readahead逻辑，同样继续新的预读逻辑,这里考虑到了多线程同时读一个文件的情况，ra结构体要在多线程之间完成状态切换，可看参考文章《Linux文件预读三》

参考文章：

Linux文件系统预读(一) - 知乎

Linux文件系统预读(二) - 知乎

Linux文件预读(三) - 知乎

深入分析Linux内核File cache机制（上篇） - 知乎