Linux file reads - the readahead algorithm


Sequential read scenario

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char c[4096 + 1] = { 0 };	/* extra byte keeps strlen() safe */
	int index = 0;
	int in = open("news.txt", O_RDONLY);

	if (in < 0)
		return 1;

	while (read(in, c, 4096) == 4096)
	{
		printf("index: %d, len: %zu.\n", index, strlen(c));
		memset(c, 0, sizeof(c));
		index++;
	}

	close(in);
	return 0;
}
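A brief usage note: compile the snippet with gcc and point it at any file larger than a few pages; each loop iteration consumes exactly one 4 KB page, which is the access pattern analyzed in the rest of this article. On a typical block device the default readahead window is 128 KB (32 pages) and can be inspected via /sys/block/<dev>/queue/read_ahead_kb.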

Data structures

/*
 * Track a single file's readahead state
 */
struct file_ra_state {
	pgoff_t start;			/* where readahead started */
	unsigned int size;		/* # of readahead pages */
	unsigned int async_size;	/* do asynchronous readahead when
					   there are only # of pages ahead */

	unsigned int ra_pages;		/* Maximum readahead window */
	unsigned int mmap_miss;		/* Cache miss stat for mmap accesses */
	loff_t prev_pos;		/* Cache last read() position */
};

start: the index of the first page to read ahead, i.e. a page index relative to the start of the file.

size: the total number of pages to read ahead.

async_size: when only async_size pages of readahead "stock" remain, asynchronous readahead is triggered. This value is important: it controls when async readahead fires. When it fires, the behavior resembles the sliding window of a network protocol stack — the window slides forward, i.e. the file_ra_state fields are updated (exactly how is described below). "Async readahead" here refers to the page_cache_async_readahead() call inside generic_file_buffered_read():

Linux read的核心函数generic_file_buffered_read_nginux的博客-CSDN博客

ra_pages: the maximum size of the readahead window; like a network sliding window, it has an upper bound.

prev_pos: the position of the previous read(), as an offset within the file; note that the unit is bytes.

Note: the page with PageReadahead set is particularly important — as soon as the application reads data from that page, asynchronous readahead (page_cache_async_readahead) is triggered.
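For orientation, this is roughly where that check lives in the read path — a simplified, paraphrased excerpt of the per-page loop in generic_file_buffered_read() (mm/filemap.c, around the kernel version quoted in this article; the exact code differs between versions):

	page = find_get_page(mapping, index);
	if (!page) {
		/* page not cached yet: synchronous readahead, then look it up again */
		page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
		page = find_get_page(mapping, index);
	}
	if (PageReadahead(page)) {
		/* the application just reached the marker page: start the next window early */
		page_cache_async_readahead(mapping, ra, filp, page, index, last_index - index);
	}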

Data structure initialization:

Code: mm/readahead.c, ondemand_readahead (the function that implements the readahead algorithm)


/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static void ondemand_readahead(struct address_space *mapping,
		struct file_ra_state *ra, struct file *filp,
		bool hit_readahead_marker, pgoff_t index,
		unsigned long req_size)
{
	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
	unsigned long max_pages = ra->ra_pages;
	unsigned long add_pages;
	pgoff_t prev_index;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	if (req_size > max_pages && bdi->io_pages > max_pages)
		max_pages = min(req_size, bdi->io_pages);

	/*
	 * start of file
	 */
	if (!index)
		goto initial_readahead;

    ...

initial_readahead:
	ra->start = index;
    //req_size is the requested read size in pages (each page is 4 KB)
	ra->size = get_init_ra_size(req_size, max_pages);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

    ...
    //submit the file read and kick off readahead I/O
	ra_submit(ra, mapping, filp);
}

/*
 * Set the initial window size, round to next power of 2 and square
 * for small size, x 4 for medium, and x 2 for large
 * for 128k (32 page) max ra
 * 1-8 page = 32k initial, > 8 page = 128k initial
 */

//first argument: number of pages requested; in our scenario each read() is 4 KB, so size = 1
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

    //small reads (newsize <= max/32, i.e. at most 1 page for max = 32): the readahead
    //size is 4x the request, e.g. read(4096) gives newsize = 1 * 4 = 4
	if (newsize <= max / 32)
		newsize = newsize * 4;

    //medium reads (1 page < newsize <= 8 pages): the readahead size is 2x the request,
    //e.g. a 16 KB read (4 pages) gives newsize = 4 * 2 = 8
	else if (newsize <= max / 4)
		newsize = newsize * 2;
    //large reads (more than 8 pages): cap at max, e.g. read(64K) gives newsize = max = 32 pages
	else
		newsize = max;

	return newsize;
}

From the article "Linux read的核心函数generic_file_buffered_read_nginux的博客-CSDN博客" (linked above), the rough call sequence for sequential reads is:

1. Trigger synchronous readahead (sync readahead)

2. Trigger asynchronous readahead (async readahead)

Initial synchronous readahead - sync readahead

Call path:

generic_file_buffered_read
  --> page_cache_sync_readahead
    --> ondemand_readahead : initial_readahead
      --> ra_submit (submits the readahead I/O to the block layer)

The first read at the start of the file enters the initial_readahead logic of ondemand_readahead, which computes the file_ra_state values:

ra->start = 0; (reading starts at the beginning of the file)

ra->size = 4;

ra->async_size = 3;

ra->ra_pages = 32;

ra->prev_pos = -1;

Looking at the logic under the initial_readahead label, ra->size is computed by get_init_ra_size(); its first argument is the number of pages (4 KB each) that the application's read() requested. In this scenario every read is 4 KB, so the call is effectively get_init_ra_size(1, 32), which returns 4.
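As a sanity check of those numbers, here is a small user-space sketch (my own illustration, not kernel code) that mirrors the get_init_ra_size() arithmetic with the default max of 32 pages; for a 1-page request it prints an initial window of 4 pages:

#include <stdio.h>

/* simplified stand-in for the kernel's roundup_pow_of_two() */
static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long p = 1;
	while (p < n)
		p <<= 1;
	return p;
}

/* same rules as the kernel's get_init_ra_size() quoted above */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;
	return newsize;
}

int main(void)
{
	unsigned long sizes[] = { 1, 2, 4, 8, 16 };	/* request sizes in pages */

	for (int i = 0; i < 5; i++)
		printf("req %lu pages -> initial window %lu pages\n",
		       sizes[i], get_init_ra_size(sizes[i], 32));
	return 0;
}

Running it prints 4, 4, 8, 16 and 32 pages for 1-, 2-, 4-, 8- and 16-page requests respectively.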

ra_submit: submitting the readahead I/O to the block layer

/*
 * Submit IO for the read-ahead request in file_ra_state.
 */
static inline void ra_submit(struct file_ra_state *ra,
		struct address_space *mapping, struct file *filp)
{
	__do_page_cache_readahead(mapping, filp,
			ra->start, ra->size, ra->async_size);
}

/*
 * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates
 * the pages first, then submits them for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 */
void __do_page_cache_readahead(struct address_space *mapping,
		struct file *file, pgoff_t index, unsigned long nr_to_read,
		unsigned long lookahead_size)
{
	struct inode *inode = mapping->host;
	loff_t isize = i_size_read(inode);
	pgoff_t end_index;	/* The last page we want to read */

	if (isize == 0)
		return;

	end_index = (isize - 1) >> PAGE_SHIFT;
	if (index > end_index)
		return;
	/* Don't read past the page containing the last byte of the file */
	if (nr_to_read > end_index - index)
		nr_to_read = end_index - index + 1;

	page_cache_readahead_unbounded(mapping, file, index, nr_to_read,
			lookahead_size);
}
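To make the end-of-file clamp concrete: assume a 10 KB file, so isize = 10240 and end_index = (10240 - 1) >> PAGE_SHIFT = 2 with 4 KB pages. A request at index = 0 with nr_to_read = 4 exceeds end_index - index = 2, so nr_to_read is clamped to end_index - index + 1 = 3 — readahead never reads beyond the page containing the last byte of the file.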

static inline gfp_t readahead_gfp_mask(struct address_space *x)
{
	return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN;
}


/**
 * page_cache_readahead_unbounded - Start unchecked readahead.
 * @mapping: File address space.
 * @file: This instance of the open file; used for authentication.
 * @index: First page index to read.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size.  This is almost certainly
 * not the function you want to call.  Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 *
 * Context: File is referenced by caller.  Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_readahead_unbounded(struct address_space *mapping,
		struct file *file, pgoff_t index, unsigned long nr_to_read,
		unsigned long lookahead_size)
{
	LIST_HEAD(page_pool);

    //note: __GFP_NORETRY | __GFP_NOWARN are added here, so page allocation will not enter the slow path
	gfp_t gfp_mask = readahead_gfp_mask(mapping);
	struct readahead_control rac = {
		.mapping = mapping,
		.file = file,
		._index = index,
	};
	unsigned long i;

	/*
	 * Partway through the readahead operation, we will have added
	 * locked pages to the page cache, but will not yet have submitted
	 * them for I/O.  Adding another page may need to allocate memory,
	 * which can trigger memory reclaim.  Telling the VM we're in
	 * the middle of a filesystem operation will cause it to not
	 * touch file-backed pages, preventing a deadlock.  Most (all?)
	 * filesystems already specify __GFP_NOFS in their mapping's
	 * gfp_mask, but let's be explicit here.
	 */
	unsigned int nofs = memalloc_nofs_save();

	/*
	 * Preallocate as many pages as we will need.
	 */
	for (i = 0; i < nr_to_read; i++) {
		struct page *page = xa_load(&mapping->i_pages, index + i);

		BUG_ON(index + i != rac._index + rac._nr_pages);

		if (page && !xa_is_value(page)) {
			/*
			 * Page already present?  Kick off the current batch
			 * of contiguous pages before continuing with the
			 * next batch.  This page may be the one we would
			 * have intended to mark as Readahead, but we don't
			 * have a stable reference to this page, and it's
			 * not worth getting one just for that.
			 */
			read_pages(&rac, &page_pool, true);
			continue;
		}

		page = __page_cache_alloc(gfp_mask);
		if (!page)
			break;
		if (mapping->a_ops->readpages) {
			page->index = index + i;
			list_add(&page->lru, &page_pool);
		} else if (add_to_page_cache_lru(page, mapping, index + i,
					gfp_mask) < 0) {
			put_page(page);
			read_pages(&rac, &page_pool, true);
			continue;
		}
        //set the PageReadahead flag -- this is very important
		if (i == nr_to_read - lookahead_size)
			SetPageReadahead(page);
		rac._nr_pages++;
	}

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	read_pages(&rac, &page_pool, false);
	memalloc_nofs_restore(nofs);
}
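Tying this back to our scenario: the initial window is submitted with nr_to_read = 4 and lookahead_size = 3, so the i == nr_to_read - lookahead_size condition is met at i = 1 and SetPageReadahead() marks page index 1 — exactly the marker page whose consumption later triggers the asynchronous readahead.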

Because 4 pages were read ahead above, find_get_page() in the for(;;) loop of generic_file_buffered_read will now find the pages in the page cache (they were prefetched), so the loop keeps going and index increases on every iteration. Once index reaches the marker page (index 1 in this scenario), PageReadahead(page) is true and page_cache_async_readahead() is called.

The design works this way because readahead cannot simply run forever: a failed prediction is penalized (it wastes memory). A new readahead is only issued once the application has sequentially consumed enough of the previous window — specifically, when it reads the page marked with PageReadahead. page_cache_async_readahead --> ondemand_readahead then triggers a new "asynchronous" readahead:


/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static void ondemand_readahead(struct address_space *mapping,
		struct file_ra_state *ra, struct file *filp,
		bool hit_readahead_marker, pgoff_t index,
		unsigned long req_size)
{
	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
	unsigned long max_pages = ra->ra_pages;
	unsigned long add_pages;
	pgoff_t prev_index;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	if (req_size > max_pages && bdi->io_pages > max_pages)
		max_pages = min(req_size, bdi->io_pages);

	/*
	 * start of file
	 */
	if (!index)
		goto initial_readahead;

	/*
	 * It's the expected callback index, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
     //@1
	if ((index == (ra->start + ra->size - ra->async_size) ||
	     index == (ra->start + ra->size))) {
		ra->start += ra->size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for async_size, which normally equals to
	 * readahead size. Ramp it up and use it as the new readahead size.
	 */
    //@2 did not hit @1, e.g. the access is not strictly sequential and happened to skip past the condition in @1
	if (hit_readahead_marker) {
		pgoff_t start;

		rcu_read_lock();
		start = page_cache_next_miss(mapping, index + 1, max_pages);
		rcu_read_unlock();

		if (!start || start - index > max_pages)
			return;

		ra->start = start;
		ra->size = start - index;	/* old async_size */
		ra->size += req_size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * oversize read
	 */
	if (req_size > max_pages)
		goto initial_readahead;

	/*
	 * sequential cache miss
	 * trivial case: (index - prev_index) == 1
	 * unaligned reads: (index - prev_index) == 0
	 */
	prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
	if (index - prev_index <= 1UL)
		goto initial_readahead;

	/*
	 * Query the page cache and look for the traces(cached history pages)
	 * that a sequential stream would leave behind.
	 */
	if (try_context_readahead(mapping, ra, index, req_size, max_pages))
		goto readit;

	/*
	 * standalone, small random read
	 * Read as is, and do not pollute the readahead state.
	 */
    //@3 random read: the second-to-last argument reads only the number of pages the user asked for,
    //and the last argument (lookahead_size = 0) means no readahead marker is set, so no further readahead is chained
	__do_page_cache_readahead(mapping, filp, index, req_size, 0);
	return;

initial_readahead:
	ra->start = index;
	ra->size = get_init_ra_size(req_size, max_pages);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
	/*
	 * Will this read hit the readahead marker made by itself?
	 * If so, trigger the readahead marker hit now, and merge
	 * the resulted next readahead window into the current one.
	 * Take care of maximum IO pages as above.
	 */
	if (index == ra->start && ra->size == ra->async_size) {
		add_pages = get_next_ra_size(ra, max_pages);
		if (ra->size + add_pages <= max_pages) {
			ra->async_size = add_pages;
			ra->size += add_pages;
		} else {
			ra->size = max_pages;
			ra->async_size = max_pages >> 1;
		}
	}

	ra_submit(ra, mapping, filp);
}

@1 corresponds to a sequential read hitting the readahead window. In our scenario 4 pages were prefetched, and when the application reads page index = 1 (the second page) this branch is taken and the readahead window is moved forward.

The condition index == (ra->start + ra->size - ra->async_size) holds (1 == 0 + 4 - 3), so the window advances:

//start = 0 + 4 = 4
ra->start += ra->size;
//get_next_ra_size(): cur = ra->size = 4, max = 32; 4 < 32/16 is false and
//4 <= 32/2 is true, so it returns 2 * 4 = 8 -- the window doubles here and
//keeps ramping up (8 -> 16 -> 32) on later marker hits rather than jumping
//straight to the maximum (see the simulation after get_next_ra_size below)
ra->size = get_next_ra_size(ra, max_pages);
//async_size = ra->size = 8
ra->async_size = ra->size;


/*
 *  Get the previous window size, ramp it up, and
 *  return it as the new window size.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
				      unsigned long max)
{
	unsigned long cur = ra->size;

	if (cur < max / 16)
		return 4 * cur;
	if (cur <= max / 2)
		return 2 * cur;
	return max;
}
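To see how the window ramps up across successive marker hits in the 4 KB sequential-read scenario, here is a small user-space simulation (my own sketch, not kernel code) that starts from the initial 4-page window and repeatedly applies the same rule as get_next_ra_size() with max = 32; it prints window sizes of 8, 16, 32, 32, ... pages:

#include <stdio.h>

/* same ramp-up rule as the kernel's get_next_ra_size(); max is ra_pages
 * (32 pages / 128 KB by default) */
static unsigned long next_ra_size(unsigned long cur, unsigned long max)
{
	if (cur < max / 16)
		return 4 * cur;
	if (cur <= max / 2)
		return 2 * cur;
	return max;
}

int main(void)
{
	unsigned long size = 4;		/* initial window from get_init_ra_size(1, 32) */
	unsigned long start = 0;

	for (int hit = 1; hit <= 5; hit++) {
		start += size;			/* window slides forward          */
		size = next_ra_size(size, 32);	/* and ramps up: 8, 16, 32, 32... */
		printf("marker hit %d: start=%lu size=%lu async_size=%lu\n",
		       hit, start, size, size);
	}
	return 0;
}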

@2: the read did not trigger @1, but it did land on a readahead-marked page, so a new readahead window is still set up by querying the page cache. This handles cases such as multiple threads reading the same file concurrently (interleaved reads), where the ra state has to be handed back and forth between the readers; see the reference article "Linux文件预读(三)".

References:

Linux文件系统预读(一) - 知乎

Linux文件系统预读(二) - 知乎

Linux文件预读(三) - 知乎

深入分析Linux内核File cache机制(上篇) - 知乎
