linux页框回收之shrink_node函数源码剖析

news2025/1/12 3:04:33
概述 

《Linux内存回收入口_nginux的博客-CSDN博客》前文我们概略的描述了几种内存回收入口,我们知道几种回收入口最终都会调用进入shrink_node函数,本文将以Linux 5.9源码来描述shrink_node函数的源码实现。

函数调用流程图

 

 scan_control数据结构

struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t	*nodemask;

	/*
	 * The memory cgroup that hit its limit and as a result is the
	 * primary target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/*
	 * Scan pressure balancing between anon and file LRUs
	 */
	unsigned long	anon_cost;
	unsigned long	file_cost;

	/* Can active pages be deactivated as part of reclaim? */
    //是否能从active lru列表进行deactivate的reclaim
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
	unsigned int may_deactivate:2;
    //如果是1:代表强制进行deactivate,即同时deactivate file和anon
    //如果是0,按需进行deactivate file或者anon,具体条件见下面shrink_node源码分析
	unsigned int force_deactivate:1;
	unsigned int skipped_deactivate:1;

	/* Writepage batching in laptop mode; RECLAIM_WRITE */
	unsigned int may_writepage:1;

	/* Can mapped pages be reclaimed? */
	unsigned int may_unmap:1;

	/* Can pages be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/*
	 * Cgroups are not reclaimed below their configured memory.low,
	 * unless we threaten to OOM. If any cgroups are skipped due to
	 * memory.low and nothing was reclaimed, go back for memory.low.
	 */
	unsigned int memcg_low_reclaim:1;
	unsigned int memcg_low_skipped:1;

	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* There is easily reclaimable cold cache in the current node */
    //设置为1代表只回收file page cache,不回收aone page
	unsigned int cache_trim_mode:1;

	/* The file pages on the current node are dangerously low */
    //设置1代表只回收aone page,不回收file page
	unsigned int file_is_tiny:1;

	/* Allocation order */
	s8 order;

	/* Scan (total_size >> priority) pages at once */
	s8 priority;

	/* The highest zone to isolate pages for reclaim from */
	s8 reclaim_idx;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	struct {
		unsigned int dirty;
		unsigned int unqueued_dirty;
		unsigned int congested;
		unsigned int writeback;
		unsigned int immediate;
		unsigned int file_taken;
		unsigned int taken;
	} nr;

	/* for recording the reclaimed slab by now */
	struct reclaim_state reclaim_state;
};
shrink_node函数

static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
	struct reclaim_state *reclaim_state = current->reclaim_state;
	unsigned long nr_reclaimed, nr_scanned;
	struct lruvec *target_lruvec;
	bool reclaimable = false;
	unsigned long file;

	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

again:
	memset(&sc->nr, 0, sizeof(sc->nr));

	nr_reclaimed = sc->nr_reclaimed;
	nr_scanned = sc->nr_scanned;

	/*
	 * Determine the scan balance between anon and file LRUs.
	 */
	spin_lock_irq(&pgdat->lru_lock);
	sc->anon_cost = target_lruvec->anon_cost;
	sc->file_cost = target_lruvec->file_cost;
	spin_unlock_irq(&pgdat->lru_lock);

	/*
	 * Target desirable inactive:active list ratios for the anon
	 * and file LRU lists.
	 */
	if (!sc->force_deactivate) {
		unsigned long refaults;

		refaults = lruvec_page_state(target_lruvec,
				WORKINGSET_ACTIVATE_ANON);
        //anon的refaults值比上次回收发生了变化,或者inactive anon很少,设置
        //DEACTIVATE_ANON表示需要deactivate anon
		if (refaults != target_lruvec->refaults[0] ||
			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
			sc->may_deactivate |= DEACTIVATE_ANON;
		else
			sc->may_deactivate &= ~DEACTIVATE_ANON;

		/*
		 * When refaults are being observed, it means a new
		 * workingset is being established. Deactivate to get
		 * rid of any stale active pages quickly.
		 */
		refaults = lruvec_page_state(target_lruvec,
				WORKINGSET_ACTIVATE_FILE);
		if (refaults != target_lruvec->refaults[1] ||
		    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
			sc->may_deactivate |= DEACTIVATE_FILE;
		else
			sc->may_deactivate &= ~DEACTIVATE_FILE;
	} else
		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;

	/*
	 * If we have plenty of inactive file pages that aren't
	 * thrashing, try to reclaim those first before touching
	 * anonymous pages.
	 */
    //file是inactive file的数量
	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
        //只回收file page,影响get_scan_count
		sc->cache_trim_mode = 1;
	else
		sc->cache_trim_mode = 0;

	/*
	 * Prevent the reclaimer from falling into the cache trap: as
	 * cache pages start out inactive, every cache fault will tip
	 * the scan balance towards the file LRU.  And as the file LRU
	 * shrinks, so does the window for rotation from references.
	 * This means we have a runaway feedback loop where a tiny
	 * thrashing file LRU becomes infinitely more attractive than
	 * anon pages.  Try to detect this based on file LRU size.
	 */
	if (!cgroup_reclaim(sc)) {
		unsigned long total_high_wmark = 0;
		unsigned long free, anon;
		int z;

		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
			   node_page_state(pgdat, NR_INACTIVE_FILE);

		for (z = 0; z < MAX_NR_ZONES; z++) {
			struct zone *zone = &pgdat->node_zones[z];
			if (!managed_zone(zone))
				continue;

			total_high_wmark += high_wmark_pages(zone);
		}

		/*
		 * Consider anon: if that's low too, this isn't a
		 * runaway file reclaim problem, but rather just
		 * extreme pressure. Reclaim as per usual then.
		 */
		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
        //设置1代表只回收aone page,不回收file page
		sc->file_is_tiny =
			file + free <= total_high_wmark &&
			!(sc->may_deactivate & DEACTIVATE_ANON) &&
			anon >> sc->priority;
	}
    //回收的核心函数,后面文章专门分析
	shrink_node_memcgs(pgdat, sc);

	if (reclaim_state) {
		sc->nr_reclaimed += reclaim_state->reclaimed_slab;
		reclaim_state->reclaimed_slab = 0;
	}

	/* Record the subtree's reclaim efficiency */
	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
		   sc->nr_scanned - nr_scanned,
		   sc->nr_reclaimed - nr_reclaimed);

    //这一轮回收到了页面
	if (sc->nr_reclaimed - nr_reclaimed)
		reclaimable = true;

    //只允许kswapd线程设置这些flag,因为只有kswapd能clear这些flag,避免混乱
    //比如memcg reclaim也能设置,没法保证kswapd肯定会被wakeup去clear这些标志
	if (current_is_kswapd()) {
		/*
		 * If reclaim is isolating dirty pages under writeback,
		 * it implies that the long-lived page allocation rate
		 * is exceeding the page laundering rate. Either the
		 * global limits are not being effective at throttling
		 * processes due to the page distribution throughout
		 * zones or there is heavy usage of a slow backing
		 * device. The only option is to throttle from reclaim
		 * context which is not ideal as there is no guarantee
		 * the dirtying process is throttled in the same way
		 * balance_dirty_pages() manages.
		 *
		 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
		 * count the number of pages under pages flagged for
		 * immediate reclaim and stall if any are encountered
		 * in the nr_immediate check below.
		 */
        //设置PGDAT_DIRTY代表reclaim发现很多页面正在回写
		if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
			set_bit(PGDAT_WRITEBACK, &pgdat->flags);

		/* Allow kswapd to start writing pages during reclaim.*/
        设置PGDAT_DIRTY代表reclaim发现很多脏页
		if (sc->nr.unqueued_dirty == sc->nr.file_taken)
			set_bit(PGDAT_DIRTY, &pgdat->flags);

		/*
		 * If kswapd scans pages marked for immediate
		 * reclaim and under writeback (nr_immediate), it
		 * implies that pages are cycling through the LRU
		 * faster than they are written so also forcibly stall.
		 */
		if (sc->nr.immediate)
			congestion_wait(BLK_RW_ASYNC, HZ/10);
	}

	/*
	 * Tag a node/memcg as congested if all the dirty pages
	 * scanned were backed by a congested BDI and
	 * wait_iff_congested will stall.
	 *
	 * Legacy memcg will stall in page writeback so avoid forcibly
	 * stalling in wait_iff_congested().
	 */
    //只允许kswapd线程设置LRUVEC_CONGESTED,因为只有kswapd能clear LRUVEC_CONGESTED,
    //比如memcg reclaim也能设置,没法保证kswap能唤醒去clear LRUVEC_CONGESTED,导致
    //direct reclaim阻塞在wait_iff_congested
	if ((current_is_kswapd() ||
	     (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
	    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
		set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);

	/*
	 * Stall direct reclaim for IO completions if underlying BDIs
	 * and node is congested. Allow kswapd to continue until it
	 * starts encountering unqueued dirty pages or cycling through
	 * the LRU too quickly.
	 */
    //如果是非kswapd线程,且判定当前回收设置过拥塞flag,就要等待,所以direct reclaim
    //会被阻塞
	if (!current_is_kswapd() && current_may_throttle() &&
	    !sc->hibernation_mode &&
	    test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
		wait_iff_congested(BLK_RW_ASYNC, HZ/10);

    //如果需要继续回收,就goto again继续
	if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
				    sc))
		goto again;

	/*
	 * Kswapd gives up on balancing particular nodes after too
	 * many failures to reclaim anything from them and goes to
	 * sleep. On reclaim progress, reset the failure counter. A
	 * successful direct reclaim run will revive a dormant kswapd.
	 */
	if (reclaimable)
		pgdat->kswapd_failures = 0;
}
should_continue_reclaim 

/*
 * Reclaim/compaction is used for high-order allocation requests. It reclaims
 * order-0 pages before compacting the zone. should_continue_reclaim() returns
 * true if more pages should be reclaimed such that when the page allocator
 * calls try_to_compact_pages() that it will have enough free pages to succeed.
 * It will give up earlier than that if there is difficulty reclaiming pages.
 */
static inline bool should_continue_reclaim(struct pglist_data *pgdat,
					unsigned long nr_reclaimed,
					struct scan_control *sc)
{
	unsigned long pages_for_compaction;
	unsigned long inactive_lru_pages;
	int z;

	/* If not in reclaim/compaction mode, stop */
	if (!in_reclaim_compaction(sc))
		return false;

	/*
	 * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
	 * number of pages that were scanned. This will return to the caller
	 * with the risk reclaim/compaction and the resulting allocation attempt
	 * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
	 * allocations through requiring that the full LRU list has been scanned
	 * first, by assuming that zero delta of sc->nr_scanned means full LRU
	 * scan, but that approximation was wrong, and there were corner cases
	 * where always a non-zero amount of pages were scanned.
	 */
	if (!nr_reclaimed)
		return false;

    //compaction_suitable会检查水位是否已满足条件(要根据orderPAGE_ALLOC_COSTLY_ORDER
    //使用不同的watermark,如果不满足就不会返回success/continue
	/* If compaction would go ahead or the allocation would succeed, stop */
	for (z = 0; z <= sc->reclaim_idx; z++) {
		struct zone *zone = &pgdat->node_zones[z];
		if (!managed_zone(zone))
			continue;

        //满足了水位return false,代表不要继续shrink_node了
		switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
		case COMPACT_SUCCESS:
		case COMPACT_CONTINUE:
			return false;
		default:
			/* check next zone */
			;
		}
	}

	/*
	 * If we have not reclaimed enough pages for compaction and the
	 * inactive lists are large enough, continue reclaiming
	 */
    //上面水位检查不通过,且也没有reclaim足够的page来做compaction,那就继续reclaim吧
	pages_for_compaction = compact_gap(sc->order);
	inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
	if (get_nr_swap_pages() > 0)
		inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);

	return inactive_lru_pages > pages_for_compaction;
}

参考文章:

[PATCH v2 4/4] mm/vmscan: Don't mess with pgdat->flags in memcg reclaim. - Andrey Ryabinin

Linux 内存管理_workingset内存_jianchwa的博客-CSDN博客

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.coloradmin.cn/o/811411.html

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈,一经查实,立即删除!

相关文章

如何关闭谷歌浏览器自动更新

适用范围: 写自动化脚本时&#xff0c;需要安装浏览器驱动&#xff0c;安装浏览器驱动时需要下载对应的浏览器驱动版本&#xff0c;如果浏览器版本一直在自动更新的话&#xff0c;自动化脚本会报错浏览器版本和浏览器驱动不匹配&#xff0c;所以建议关闭谷歌浏览器自动更新&am…

认识 springboot 之 它的配置文件 -2

前言 本篇了解springboot中配置的作用&#xff0c;介绍配置文件的种类&#xff0c;介绍简单使用配置文件&#xff0c;简单的小技巧如何设置注释&#xff0c;开启热部署等等&#xff0c;如有错误&#xff0c;请在评论区指正&#xff0c;让我们一起交流&#xff0c;共同进步&…

内存分区模型

C程序在执行时&#xff0c;将内存大方向划分为4个区域 代码区&#xff1a;存放函数体的二进制代码&#xff0c;由操作系统进行管理的全局区&#xff1a;存放全局变量和静态变量以及常量栈区&#xff1a;由编译器自动分配释放, 存放函数的参数值,局部变量等堆区&#xff1a;由程…

ReentrantLock锁的实现

ReentrantLock基于AQS&#xff0c;在并发编程中可以实现公平锁和非公平锁来对同步资源进行控制&#xff0c;并且是可重入锁。 1.ReentrantLock中的类的继承结构&#xff1a; 2.构造方法&#xff1a; 3.非公平锁的实现 看是否能够通过CAS来设置state来获取到锁&#xff0c;如果…

【数据结构】这堆是什么

目录 1.二叉树的顺序结构 2.堆的概念及结构 3.堆的实现 3.1 向上调整算法与向下调整算法 3.2 堆的创建 3.3 建堆的空间复杂度 3.4 堆的插入 3.5 堆的删除 3.6 堆的代码的实现 4.堆的应用 4.1 堆排序 4.2 TOP-K问题 首先&#xff0c;堆是一种数据结构&#xff0c;一种特…

Selenium开发环境搭建

1.下载Python https://www.python.org/downloads/ 下载下来选择自己创建的路径进行安装&#xff0c;然后配置环境变量 cmd命令框查看 2.安装selenium cmd命令框输入&#xff1a; pip install selenium3.下载pycharm https://www.jetbrains.com/pycharm/download/#sec…

VLOOKUP多条件查询

LOOKUP(1,0/((A3:A15A18)*(C3:C15C18)),F3:F15)

打印Winfrom控件实现简陋版的打印(C#)

本文在前面写的博文基础上进行修改&#xff1a;利用Graphics的CopyFromScreen实现简陋版的打印(C#)_zxy2847225301的博客-CSDN博客 通过截图的方式进行打印在前面的文章后面已经介绍过&#xff0c;有问题。 UI布局如下&#xff1a; 代码如下&#xff1a; using System; using…

无涯教程-jQuery - Dialog组件函数

小部件对话框函数可与JqueryUI中的小部件一起使用。对话框是在HTML页面上显示信息的一种不错的方法。对话框是一个带有标题和内容区域的浮动窗口。此窗口可以移动&#xff0c;调整大小&#xff0c;并且默认情况下可以使用" X"图标关闭。 Dialog - 语法 $( "#d…

CAN转ETHERCAT网关can协议和485协议区别

大家好&#xff0c;今天要跟大家分享一款自主研发的通讯网关&#xff0c;JM-ECT-CAN。这款产品能够将各种CAN总线和ETHERCAT网络连接起来&#xff0c;实现高效的数据传输和通信。那么&#xff0c;这款通讯网关具体有哪些功能和特点呢&#xff1f;接下来&#xff0c;我们就一起来…

苍穹外卖心得与总结【对比瑞吉】【如何获得铁粉】

对于苍穹外卖项目&#xff0c;从学习课程加复习已经13天了。 对于一名已经学习过SSMLinuxRedis数据库的Java练习生来说&#xff0c;这个项目相对于之前学习的《瑞吉外卖》新增了很多功能和技术&#xff0c;是很值得练手和提升的课程&#xff0c;下面给出自己的一些见解。&#…

大厂程序员的水平比非大厂高很多嘛?

最近一个月&#xff0c;筛选了一百多份简历&#xff0c;前前后后面试了二三十人&#xff0c;基本上都是有大厂经历的人。同时&#xff0c;也录用了几个有大厂经历的。但整体而言&#xff0c;打破了对大厂出来的都是优质人才的幻觉。看到的实际情况与想象中的落差还是比较大的。…

从零开始学python(十二)如何成为一名优秀的爬虫工程师

前言 回顾之前讲述了python语法编程 必修入门基础和网络编程&#xff0c;多线程/多进程/协程等方面的内容&#xff0c;后续讲到了数据库编程篇MySQL&#xff0c;Redis&#xff0c;MongoDB篇&#xff0c;和机器学习&#xff0c;全栈开发&#xff0c;数据分析前面没看的也不用往…

ChatIE:通过多轮问答问题实现实命名实体识别和关系事件的零样本信息抽取,并在NYT11-HRL等数据集上超过了全监督模型

项目设计集合&#xff08;人工智能方向&#xff09;&#xff1a;助力新人快速实战掌握技能、自主完成项目设计升级&#xff0c;提升自身的硬实力&#xff08;不仅限NLP、知识图谱、计算机视觉等领域&#xff09;&#xff1a;汇总有意义的项目设计集合&#xff0c;助力新人快速实…

【机器学习】Cost Function

Cost Function 1、计算 cost2、cost 函数的直观理解3、cost 可视化总结附录 首先&#xff0c;导入所需的库&#xff1a; import numpy as np %matplotlib widget import matplotlib.pyplot as plt from lab_utils_uni import plt_intuition, plt_stationary, plt_update_onclic…

【数字IC设计】VCS仿真DesignWare IP

DesignWare介绍 DesignWare是SoC/ASIC设计者最钟爱的设计IP库和验证IP库。它包括一个独立于工艺的、经验证的、可综合的虚拟微架构的元件集合&#xff0c;包括逻辑、算术、存储和专用元件系列&#xff0c;超过140个模块。DesignWare和 Design Compiler的结合可以极大地改进综合…

c++ 给无名形参提供默认值

如上图&#xff0c;若函数的形参不在函数体里使用&#xff0c;可以不提供形参名&#xff0c;而且可以给此形参提供默认值。也能编译通过。 在看vs2019上的源码时&#xff0c;也出现了这种写法。应用SFINAE&#xff08;substitute false is not an error&#xff09;原则&#x…

Go Ethereum源码学习笔记 001 Geth Start

Go Ethereum源码学习笔记 前言[Chapter_001] 万物的起点: Geth Start什么是 geth&#xff1f;go-ethereum Codebase 结构 Geth Start前奏: Geth Consolegeth 节点是如何启动的NodeNode的关闭 Ethereum Backend附录 前言 首先读者需要具备Go语言基础&#xff0c;至少要通关菜鸟…

周末放松大作战:优化生活质量的秘密武器

博主 默语带您 Go to New World. ✍ 个人主页—— 默语 的博客&#x1f466;&#x1f3fb; 《java 面试题大全》 &#x1f369;惟余辈才疏学浅&#xff0c;临摹之作或有不妥之处&#xff0c;还请读者海涵指正。☕&#x1f36d; 《MYSQL从入门到精通》数据库是开发者必会基础之…

C++代码质量提升指南-工具篇

提高代码质量的方法&#xff1a; 使用代码规范&#xff1a;代码规范是指对代码的编写风格和格式进行规范的规则。使用代码规范可以提高代码的可读性、可维护性和可扩展性。进行单元测试&#xff1a;单元测试是一种用于验证代码单元是否正确运行的测试方法。进行单元测试可以帮…