物理内存

相关数据结构

page（页）

Linux 内核内存管理的实现以 page 数据结构为核心，其他的内存管理设施都基于 page 数据结构，如 VMA 管理、缺页中断、RMAP、页面分配与回收等。page 数据结构定义在 include/linux/mm_types.h 头文件中，大量使用了 C 语言的联合体(union)来优化其数据结构的大小，因为每个物理页面都需要一个 page 数据结构来跟踪和管理这些物理页面的使用情况，所以管理成本很高。
在这里插入图片描述

struct page {
    // 第一部分，标志位
	unsigned long flags;		
					
    // 第二部分，40字节的联合体
	union {
	    // 管理匿名页面/文件映射页面
		struct {	
			struct list_head lru;
			struct address_space *mapping;
			pgoff_t index;		
			unsigned long private;
		};
        // 管理 slab/slob/slub 分配器
		struct {	
			union {
				struct list_head slab_list;	
				struct {	
					struct page *next;
					int pages;	
					int pobjects;	
				};
			};
			struct kmem_cache *slab_cache;
			void *freelist;		
			union {
				void *s_mem;	
				unsigned long counters;		
				struct {			
					unsigned inuse:16;
					unsigned objects:15;
					unsigned frozen:1;
				};
			};
		};
		struct {	
			unsigned long compound_head;	
			unsigned char compound_dtor;
			unsigned char compound_order;
			atomic_t compound_mapcount;
		};
		struct {	
			unsigned long _compound_pad_1;	
			unsigned long _compound_pad_2;
			struct list_head deferred_list;
		};
		// 管理页表
		struct {	
			unsigned long _pt_pad_1;	
			pgtable_t pmd_huge_pte;
			unsigned long _pt_pad_2;
			union {
				struct mm_struct *pt_mm; 
				atomic_t pt_frag_refcount; 
			};
			spinlock_t ptl;
		};
		// 管理 ZONE_DEVICE 页面
		struct {	
			struct dev_pagemap *pgmap;
			unsigned long hmm_data;
			unsigned long _zd_pad_1;	
		};
		struct rcu_head rcu_head;
	};
    // 第三部分，4字节的联合体，管理 _mapcount 等
	union {		
		atomic_t _mapcount;
		unsigned int page_type;
		unsigned int active;		
		int units;			
	};
	// 第四部分，_refcount 引用计数
	atomic_t _refcount;
} _struct_page_alignment;

主要成员解释如下：

unsigned long flags：页面的标志位集合
struct list_head lru：LRU 链表节点，匿名页面或文件映射页面会通过该成员添加到 LRU 链表中
struct address_space *mapping：表示页面所指向的地址空间
pgoff_t index：表示这个页面在一个映射中的序号或偏移量
unsigned long private：指向私有数据的指针
struct list_head slab_list：slab 链表节点
struct page *next：在 slub 分配器中使用
struct kmem_cache *slab_cache：slab 缓存描述符，slab 分配器中的第一个物理页面的 page 数据结构中的 slab cache 指向 slab 缓存描述符
void *freelist：管理区。管理区可以看作一个数组，数组的每个成员占用 1 字节，每个成员代表一个 slab 对象
void *s_mem：在 slab 分配器中用来指向第一个 slab 对象的起始地址
spinlock_t ptl：用于保护页表操作的自旋锁，通常在更新页表时候需要这个锁以进行保护
struct rcu_head rcu_head：RCU 锁，在 slab 分配器中释放slab 的物理页面
atomic_t _mapcount：用于统计 _mapcount
atomic_t _refcount：用于统计 _refcount
unsigned int active：表示slab分配器中活跃对象的数量。当为0时，表示这个slab分配器中没有活跃对象，可以销毁这个slab分配器。活跃对象就是已经被迁移到对象缓冲池中的对象

比较重要的字段

flags

flags 成员是页面的标志位集合，标志位是内存管理中非常重要的部分，具体定义在 include/linux/page-flags.h 文件中，重要的标志位如下。

enum pageflags {
	PG_locked,		/* Page is locked. Don't touch. */
	PG_referenced,
	PG_uptodate,
	PG_dirty,
	PG_lru,
	PG_active,
	PG_workingset,
	PG_waiters,		/* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
	PG_error,
	PG_slab,
	PG_owner_priv_1,	/* Owner use. If pagecache, fs may use*/
	PG_arch_1,
	PG_reserved,
	PG_private,		/* If pagecache, has fs-private data */
	PG_private_2,		/* If pagecache, has fs aux data */
	PG_writeback,		/* Page is under writeback */
	PG_head,		/* A head page */
	PG_mappedtodisk,	/* Has blocks allocated on-disk */
	PG_reclaim,		/* To be reclaimed asap */
	PG_swapbacked,		/* Page is backed by RAM/swap */
	PG_unevictable,		/* Page is "unevictable"  */
#ifdef CONFIG_MMU
	PG_mlocked,		/* Page is vma mlocked */
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
	PG_uncached,		/* Page has been mapped as uncached */
#endif
#ifdef CONFIG_MEMORY_FAILURE
	PG_hwpoison,		/* hardware poisoned page. Don't touch */
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
	PG_young,
	PG_idle,
#endif
	__NR_PAGEFLAGS,

	/* Filesystems */
	PG_checked = PG_owner_priv_1,

	/* SwapBacked */
	PG_swapcache = PG_owner_priv_1,	/* Swap page: swp_entry_t in private */

	/* Two page bits are conscripted by FS-Cache to maintain local caching
	 * state.  These bits are set on pages belonging to the netfs's inodes
	 * when those inodes are being locally cached.
	 */
	PG_fscache = PG_private_2,	/* page backed by cache */

	/* XEN */
	/* Pinned in Xen as a read-only pagetable page. */
	PG_pinned = PG_owner_priv_1,
	/* Pinned as part of domain save (see xen_mm_pin_all()). */
	PG_savepinned = PG_dirty,
	/* Has a grant mapping of another (foreign) domain's page. */
	PG_foreign = PG_owner_priv_1,

	/* SLOB */
	PG_slob_free = PG_private,

	/* Compound pages. Stored in first tail page's flags */
	PG_double_map = PG_private_2,

	/* non-lru isolated movable page */
	PG_isolated = PG_reclaim,
};

PG_locked 表示页面已经上锁了。如果该位置位，说明页面已经上锁，内存管理的其他模块不能访问这个页面，以防发生竞争。
PG_error 表示页面操作过程中发生I/O错误时会设置该位。
PG_referenced 和 PG_active 用于控制页面的活跃程度，在 kswapd 页面回收中使用。
PG_uptodate 表示页面的数据已经从块设备成功读取。
PG_dirty 表示页面内容发生改变，这个页面为脏页，即页面的内容被改写后还没有和外部存储器进行过同步操作。
PG_lru 表示页面在 LRU 链表中。LRU 链表指最近最少使用（Least Recently Used）链表。内核使用 LRU 链表来管理活跃和不活跃页面。
PG_slab 表示页面用于 slab 分配器。
PG_waiters 表示有进程在等待这个页面。
PG_writeback 表示页面的内容正在向块设备回写。
PG_swapcache 表示页面处于交换缓存中。
PG_swapbacked 表示页面具有 swap 缓存功能，通常匿名页面才可以写回交换分区。
PG_reclaim 表示这个页面马上要被回收。
PG_unevictable 表示页面不可被回收。
PG_mlocked 表示页面对应的 VMA 处于 mlocked 状态。

内核定义了一些宏，用于检查页面是否设置了某个特定的标志位或者用于操作某些标志位。这些宏的名称都有一定的模式，具体如下。

PageXXX()用于检查页面是否设置了 PG_XXX 标志位，如 PageLRU() 检查PG_lru 标志位是否置位了，PageDirty() 检查 PG_dirty 是否置位了。
SetPageXX() 设置页面中的 PG_XXX 标志位，如 SetPageLRU() 用于设置 PG_lru ,SetPageDirty() 用于设置 PG_dirty 标志位。
ClearPageXXX() 用于无条件地清除某个特定的标志位。

这些宏实现在 include/linux/page-flags.h 文件中。

除了上述字段外 flags 还有如下字段：

/*
 * page->flags layout:
 *
 * There are five possibilities for how page->flags get laid out.  The first
 * pair is for the normal case without sparsemem. The second pair is for
 * sparsemem when there is plenty of space for node and section information.
 * The last is when there is insufficient space in page->flags and a separate
 * lookup is necessary.
 *
 * No sparsemem or sparsemem vmemmap: |       NODE     | ZONE |             ... | FLAGS |
 *      " plus space for last_cpupid: |       NODE     | ZONE | LAST_CPUPID ... | FLAGS |
 * classic sparse with space for node:| SECTION | NODE | ZONE |             ... | FLAGS |
 *      " plus space for last_cpupid: | SECTION | NODE | ZONE | LAST_CPUPID ... | FLAGS |
 * classic sparse no space for node:  | SECTION |     ZONE    | ... | FLAGS |
 */

mapping

mapping 成员表示页面所指向的地址空间。

内核中的地址空间通常有两个不同的地址空间：

一个用于文件映射页面，如在读取文件时，地址空间用于将文件的内容数据与装载数据的存储介质区关联起来；
另一个用于匿名映射。

内核使用一个简单直接的方式实现了“一个指针，两种用途”，mapping 成员的最低两位用于判断是否指向匿名映射或 KSM 页面的地址空间。如果指向匿名页面，那么 mapping 成员指向匿名页面的地址空间数据结构 anon_vma 。

#define PAGE_MAPPING_ANON	0x1
#define PAGE_MAPPING_MOVABLE	0x2
#define PAGE_MAPPING_KSM	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)

static __always_inline int PageMappingFlags(struct page *page)
{
	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0;
}

static __always_inline int PageAnon(struct page *page)
{
	page = compound_head(page);
	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}

static __always_inline int __PageMovable(struct page *page)
{
	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
				PAGE_MAPPING_MOVABLE;
}

_refcount

_refcount 表示内核中引用该页面的次数。

当 _refcount 的值为 0 时，表示该页面为空闲页面或即将要被释放的页面。
当 _refcount 的值大于 0 时，表示该页面已经被分配且内核正在使用，暂时不会被释放。

内核中提供加/减 _refcount 的接口函数，读者应该使用这些接口函数来使用 _refcount 引用计数。

get_page(): _refcount 加 1 。
put_page(): _refcount 减 1 。若 _refcount 减 1 后等于 0 ,那么会释放该页面。

这两个接口函数实现在 include/linux/mm.h 文件中。

static inline void get_page(struct page *page)
{
	page = compound_head(page);
	/*
	 * Getting a normal page or the head of a compound page
	 * requires to already have an elevated page->_refcount.
	 */
	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
	page_ref_inc(page);
}

static inline void put_page(struct page *page)
{
	page = compound_head(page);

	/*
	 * For devmap managed pages we need to catch refcount transition from
	 * 2 to 1, when refcount reach one it means the page is free and we
	 * need to inform the device driver through callback. See
	 * include/linux/memremap.h and HMM for details.
	 */
	if (put_devmap_managed_page(page))
		return;

	if (put_page_testzero(page))
		__put_page(page);
}

get_page() 函数调用 page ref_inc() 来增加引用计数，最后使用 atomic_inc() 函数原子地增加引用计数。
put page() 首先使用 put_page_testzero() 函数来使 _refcount 减 1 并且判断其是否为 0 。如果 _refcount 减 1 之后等于 0 ,就会调用 _put_page() 来释放这个页面。

_mapcount

_mapcount 表示这个页面被进程映射的个数，即已经映射了多少个用户 PTE 。
每个用户进程都拥有各自独立的虚拟空间（256TB）和一份独立的页表，所以可能出现多个用户进程地址空间同时映射到一个物理页面的情况，RMAP 系统就是利用这个特性来实现的。_mapcount 主要用于RMAP系统中。

若 _mapcount 等于 -1 ，表示没有 PTE 映射到页面。
若 _mapcount 等于 0 ，表示只有父进程映射到页面。
匿名页面刚分配时， _mapcount 初始化为 0 。例如,当 do_anonymous_page() 产生的匿名页面通过 page_add_new_anon_rmap() 添加到 rmap 系统中时，会设置 _mapcount 为 0 ，这表明匿名页面当前只有父进程的 PTE 映射到页面。
若 _mapcount 大于 0 ,表示除了父进程外还有其他进程映射到这个页面。同样以创建子进程时共享父进程地址空间为例，设置父进程的 PTE 内容到子进程中并增加该页面的 _mapcount 。

linux 内核通过 page_dup_rmap 函数修改 _mapcount 。

static inline void page_dup_rmap(struct page *page, bool compound)
{
	atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
}

PG_Locked

page 数据结构中的成员 flags 定义了一个标志位 PG_locked ,内核通常利用 PG_locked 来设置一个页锁。lock _page() 函数用于申请页锁，如果页锁被其他进程占用了，那么它会睡眠等待。
lock _page() 函数的声明和实现如下。

void __lock_page(struct page *__page)
{
	struct page *page = compound_head(__page);
	wait_queue_head_t *q = page_waitqueue(page);
	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
				EXCLUSIVE);
}

static inline int trylock_page(struct page *page)
{
	page = compound_head(page);
	return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
}

static inline void lock_page(struct page *page)
{
	might_sleep();
	if (!trylock_page(page))
		__lock_page(page);
}

可以看到，lock page() 函数首先会调用 trylock_page() 函数，然后调用 _lock_page() 函数。
trylock _page() 和 lock _page() 这两个函数看起来很相似，但有很大的区别。trylock _page() 定义在 include/linux/pagemap.h 文件中，它使用 test_and_set_bit_lock() 尝试为 page 的 flags 设置 PG_locked 标志位，并且返回原来标志位的值。如果 page 的 PG_locked 位已经置位了，那么当前进程调用 trylock _page() 时返回 false ,说明有其他进程已经锁住了 page 。因此，若 trylock _page() 返回 false ，表示获取锁失败；若返回 true ，表示获取锁成功。
当 trylock_page() 无法获取锁时,当前进程会调用 wait_on_page_bit_common() 函数让其在等待队列中睡眠、等待这个锁。需要注意的是，当前进程会进入不可中断的睡眠状态。当前进程在睡眠等待时不受干扰，对信号不做任何反应，所以这个状态称为不可中断的状态。通常使用 ps 命令看到的标记为 D 状态的进程就是处于不可中断状态的进程，不可以发送 SIGKILL 信号让它们终止。

zone（区）

在 Linux 下将一个节点内不同用途的内存区域划分为不同的区（zone），对应结构体 struct zone，该结构体定义于 /include/linux/mmzone.h 中，如下：

struct zone {
	/* Read-mostly fields */

	/* zone watermarks, access with *_wmark_pages(zone) macros */
	unsigned long _watermark[NR_WMARK];
	unsigned long watermark_boost;

	unsigned long nr_reserved_highatomic;

	/*
	 * We don't know if the memory that we're going to allocate will be
	 * freeable or/and it will be released eventually, so to avoid totally
	 * wasting several GB of ram we must reserve some of the lower zone
	 * memory (otherwise we risk to run OOM on the lower zones despite
	 * there being tons of freeable ram on the higher zones).  This array is
	 * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
	 * changes.
	 */
	long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
	int node;
#endif
	struct pglist_data	*zone_pgdat;
	struct per_cpu_pageset __percpu *pageset;

#ifndef CONFIG_SPARSEMEM
	/*
	 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
	 * In SPARSEMEM, this map is stored in struct mem_section
	 */
	unsigned long		*pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
	unsigned long		zone_start_pfn;

	/*
	 * spanned_pages is the total pages spanned by the zone, including
	 * holes, which is calculated as:
	 * 	spanned_pages = zone_end_pfn - zone_start_pfn;
	 *
	 * present_pages is physical pages existing within the zone, which
	 * is calculated as:
	 *	present_pages = spanned_pages - absent_pages(pages in holes);
	 *
	 * managed_pages is present pages managed by the buddy system, which
	 * is calculated as (reserved_pages includes pages allocated by the
	 * bootmem allocator):
	 *	managed_pages = present_pages - reserved_pages;
	 *
	 * So present_pages may be used by memory hotplug or memory power
	 * management logic to figure out unmanaged pages by checking
	 * (present_pages - managed_pages). And managed_pages should be used
	 * by page allocator and vm scanner to calculate all kinds of watermarks
	 * and thresholds.
	 *
	 * Locking rules:
	 *
	 * zone_start_pfn and spanned_pages are protected by span_seqlock.
	 * It is a seqlock because it has to be read outside of zone->lock,
	 * and it is done in the main allocator path.  But, it is written
	 * quite infrequently.
	 *
	 * The span_seq lock is declared along with zone->lock because it is
	 * frequently read in proximity to zone->lock.  It's good to
	 * give them a chance of being in the same cacheline.
	 *
	 * Write access to present_pages at runtime should be protected by
	 * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
	 * present_pages should get_online_mems() to get a stable value.
	 */
	atomic_long_t		managed_pages;
	unsigned long		spanned_pages;
	unsigned long		present_pages;

	const char		*name;

#ifdef CONFIG_MEMORY_ISOLATION
	/*
	 * Number of isolated pageblock. It is used to solve incorrect
	 * freepage counting problem due to racy retrieving migratetype
	 * of pageblock. Protected by zone->lock.
	 */
	unsigned long		nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
	/* see spanned/present_pages for more description */
	seqlock_t		span_seqlock;
#endif

	int initialized;

	/* Write-intensive fields used from the page allocator */
	ZONE_PADDING(_pad1_)

	/* free areas of different sizes */
	struct free_area	free_area[MAX_ORDER];

	/* zone flags, see below */
	unsigned long		flags;

	/* Primarily protects free_area */
	spinlock_t		lock;

	/* Write-intensive fields used by compaction and vmstats. */
	ZONE_PADDING(_pad2_)

	/*
	 * When free pages are below this point, additional steps are taken
	 * when reading the number of free pages to avoid per-cpu counter
	 * drift allowing watermarks to be breached
	 */
	unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
	/* pfn where compaction free scanner should start */
	unsigned long		compact_cached_free_pfn;
	/* pfn where async and sync compaction migration scanner should start */
	unsigned long		compact_cached_migrate_pfn[2];
#endif

#ifdef CONFIG_COMPACTION
	/*
	 * On compaction failure, 1<<compact_defer_shift compactions
	 * are skipped before trying again. The number attempted since
	 * last failure is tracked with compact_considered.
	 */
	unsigned int		compact_considered;
	unsigned int		compact_defer_shift;
	int			compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
	/* Set to true when the PG_migrate_skip bits should be cleared */
	bool			compact_blockskip_flush;
#endif

	bool			contiguous;

	ZONE_PADDING(_pad3_)
	/* Zone statistics */
	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
	atomic_long_t		vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;

zone 经常会被访问到，因此这个数据结构要求以 L1 高速缓存对齐。另外，这里的 ZONE PADDING() 让 zone->lock 和 zone->lru_lock 这两个很热门的锁可以分布在不同的高速缓存行中。一个内存节点最多有几个 zone ，因此 zone 数据结构不需要像 page 一样关注数据结构的大小，ZONE_PADDING() 可以为了性能而浪费空间。在内存管理开发过程中，内核开发者逐步发现有一些自旋锁会竞争得非常厉害，很难获取。在稍微早期的Linux内核（如Linux4.0）中，zone->lock 和 zone->lru_lock 这两个锁有时需要同时获取，因此保证它们使用不同的高速缓存行是内核常用的一种优化技巧。然而，在Linux 5.0内核中，zone->lru_lock 已经转移到内存节点的 pglist_data 数据结构中。
关于部分数据成员的解释如下。

unsigned long _watermark[NR_WMARK]：每个 zone 在系统启动时会计算出 3 个水位，分别是最低警戒水位（WMARK_MIN）、低水位（WMARK_LOW）和高水位（WMARK_HIGH），这在页面分配器和kswapd 页面回收中会用到。
long lowmem_reserve[MAX_NR_ZONES]：防止页面分配器过度使用低端 zone 的内存。
struct pglist_data *zone_pgdat：指向内存节点。
int node：NUMA 中标识所属 node
struct per_cpu_pageset __percpu *pageset：用于维护每个CPU上的一系列页面，以减少自旋锁的争用。
unsigned long zone_start_pfn：的起始页帧号。
atomic_long_t managed_pages：zone 中被伙伴系统管理的页面数量。
unsigned long spanned_pages：zone 包含的页面数量。
unsigned long present_pages：zone 里实际管理的页面数量。对于一些架构来说，其值和 spanned _pages 的值相等。
struct free_area free_area[MAX_ORDER]：伙伴系统的核心数据结构，管理空闲页块(page block）链表的数组。
spinlock_t lock：并行访问时用于保护 zone 的自旋锁。
spinlock_t lru_lock：并行访问时用于保护 zone 中 LRU 链表的自旋锁。在 Linux 5.0 内核中，该成员已转移到 pglist_data 中。
struct lruvec lruvec：LRU链表集合，同样该成员已转移到 pglist_data 中。
atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]：zone 计数值。在 Linux 5.0 内核中，该成员已转移到 pglist_data 中。

通常情况下，内核的 zone 分为 ZONE_DMA、ZONE_DMA32、ZONE NORMAL 和 ZONE_HIGHMEM 。zone 类型定义在 include/linux/mmzone.h 文件中。

enum zone_type {
#ifdef CONFIG_ZONE_DMA
	/*
	 * ZONE_DMA is used when there are devices that are not able
	 * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
	 * carve out the portion of memory that is needed for these devices.
	 * The range is arch specific.
	 *
	 * Some examples
	 *
	 * Architecture		Limit
	 * ---------------------------
	 * parisc, ia64, sparc	<4G
	 * s390, powerpc	<2G
	 * arm			Various
	 * alpha		Unlimited or 0-16MB.
	 *
	 * i386, x86_64 and multiple other arches
	 * 			<16M.
	 */
	ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
	/*
	 * x86_64 needs two ZONE_DMAs because it supports devices that are
	 * only able to do DMA to the lower 16M but also 32 bit devices that
	 * can only do DMA areas below 4G.
	 */
	ZONE_DMA32,
#endif
	/*
	 * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
	 * performed on pages in ZONE_NORMAL if the DMA devices support
	 * transfers to all addressable memory.
	 */
	ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
	/*
	 * A memory area that is only addressable by the kernel through
	 * mapping portions into its own address space. This is for example
	 * used by i386 to allow the kernel to address the memory beyond
	 * 900MB. The kernel will set up special mappings (page
	 * table entries on i386) for each page that the kernel needs to
	 * access.
	 */
	ZONE_HIGHMEM,
#endif
	ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
	ZONE_DEVICE,
#endif
	__MAX_NR_ZONES

};

ZONE_DMA：用于 ISA 设备的 DMA 操作，范围是 0~16MB ，只适用于Intel x86 架构，ARM 架构没有这个内存管理区。
ZONE_DMA32：用于最低 4GB 的内存访问的设备，如只支持 32 位的 DMA 设备。
ZONE_NORMAL：4GB 以后的物理内存，用于线性映射物理内存。若系统内存小于 4GB，则没有这个内存管理区。
ZONE_HIGHMEM：用于管理高端内存，这些高端内存是不能线性映射到内核地址空间的。注意，在 64 位Linux操作系统中没有这个内存管理区。

pglist_data（节点）

其中，pglist_data 数据结构用来描述一个内存节点的所有资源。在 UMA 架构中，只有一个内存节点，即系统有一个全局的变量 contig_page_data 来描述这个内存节点。在 NUMA 架构中，整个系统的内存由一个 pglist_data * 的指针数组 node_data[] 来管理，在系统初始化时通过枚举 BIOS 固件（ACPI）来完成。

pglist_data 结构定义于 /include/linux/mmzone.h 中，如下：

typedef struct pglist_data {
	struct zone node_zones[MAX_NR_ZONES];
	struct zonelist node_zonelists[MAX_ZONELISTS];
	int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
	struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
	struct page_ext *node_page_ext;
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
	/*
	 * Must be held any time you expect node_start_pfn,
	 * node_present_pages, node_spanned_pages or nr_zones to stay constant.
	 *
	 * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
	 * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
	 * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
	 *
	 * Nests above zone->lock and zone->span_seqlock
	 */
	spinlock_t node_size_lock;
#endif
	unsigned long node_start_pfn;
	unsigned long node_present_pages; /* total number of physical pages */
	unsigned long node_spanned_pages; /* total size of physical page
					     range, including holes */
	int node_id;
	wait_queue_head_t kswapd_wait;
	wait_queue_head_t pfmemalloc_wait;
	struct task_struct *kswapd;	/* Protected by
					   mem_hotplug_begin/end() */
	int kswapd_order;
	enum zone_type kswapd_classzone_idx;

	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */

#ifdef CONFIG_COMPACTION
	int kcompactd_max_order;
	enum zone_type kcompactd_classzone_idx;
	wait_queue_head_t kcompactd_wait;
	struct task_struct *kcompactd;
#endif
	/*
	 * This is a per-node reserve of pages that are not available
	 * to userspace allocations.
	 */
	unsigned long		totalreserve_pages;

#ifdef CONFIG_NUMA
	/*
	 * zone reclaim becomes active if more unmapped pages exist.
	 */
	unsigned long		min_unmapped_pages;
	unsigned long		min_slab_pages;
#endif /* CONFIG_NUMA */

	/* Write-intensive fields used by page reclaim */
	ZONE_PADDING(_pad1_)
	spinlock_t		lru_lock;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
	/*
	 * If memory initialisation on large machines is deferred then this
	 * is the first PFN that needs to be initialised.
	 */
	unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	spinlock_t split_queue_lock;
	struct list_head split_queue;
	unsigned long split_queue_len;
#endif

	/* Fields commonly accessed by the page reclaim scanner */
	struct lruvec		lruvec;

	unsigned long		flags;

	ZONE_PADDING(_pad2_)

	/* Per-node vmstats */
	struct per_cpu_nodestat __percpu *per_cpu_nodestats;
	atomic_long_t		vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;

struct zone node_zones[MAX_NR_ZONES]：字段包含该节点所拥有的 zones。并非所有的 zone 都已被填充，但这是一个满的列表。它被该节点的 node_zonelists 以及其他节点的 node_zonelists 所引用.
struct zonelist node_zonelists[MAX_ZONELISTS]：用于 buddy system 分配内存。
int nr_zones：内存区域数量。
struct page *node_mem_map：页描述符，除了稀疏内存模型外都使用。
struct page_ext *node_page_ext：页的扩展属性。
unsigned long node_start_pfn 起始物理页号。
unsigned long node_present_pages：物理页总数（不包括空洞）。
unsigned long node_spanned_pages：物理页总数（包括空洞）。
int node_id：节点标识符。

内存架构

UMA（Uniform Memory Access）架构指的是内存有统一的结构并且可以统一寻址。
NUMA（Non-Uniform Memory Access）架构指的是系统中有多个节点和多个簇，CPU访问本地内存节点的速度最快，访问远端的内存节点的速度要慢一些。

内存模型

Linux 提供了三种内存模型，定义于 include/asm-generic/memory_model.h 中，如下图所示
在这里插入图片描述
内存模型在编译期就会被确定下来，目前常用的是 Sparse Memory 模型，即离散内存模型。

Flat Memory

平滑内存模型。物理内存地址连续，有一个全局变量 mem_map ——由一个大的 struct page 数组直接对应现有的物理内存。

Discontiguous Memory

非连续性内存模型。主要针对内存中存在空洞的情况。

对于每一段连续的物理内存，都有一个 pglist_data 结构体进行对应，其成员 node_mem_map 为一个 struct page 指针，指向一个 page 结构体数组，由该结构体对应到该段连续物理内存。

有一个全局变量 node_data 为一个 pglist_data 数组，其中存放着指向每一个 pglist_data ，该数组的大小为 MAX_NUMNODES 。

Sparse Memory

离散内存模型。在一个 mem_section 结构体中存在一个 section_mem_map 成员指向一个 struct page 数组对应一段连续的物理内存，即将内存按照 section 为单位进行分段。

存在一个全局指针数组 mem_section （与结构体同名）存放所有的 mem_section 指针，指向理论上支持的内存空间，每个 section 对应的物理内存不一定存在，若不存在则此时该 section 的指针为 NULL 。

这种模型支持内存的热拔插。

在这里插入图片描述
mem_section 结构体定义于 /include/linux/mmzone.h 中，如下：

struct mem_section {
	/*
	 * This is, logically, a pointer to an array of struct
	 * pages.  However, it is stored with some other magic.
	 * (see sparse.c::sparse_init_one_section())
	 *
	 * Additionally during early boot we encode node id of
	 * the location of the section here to guide allocation.
	 * (see sparse.c::memory_present())
	 *
	 * Making it a UL at least makes someone do a cast
	 * before using it wrong.
	 */
	unsigned long section_mem_map;

	/* See declaration of similar field in struct zone */
	unsigned long *pageblock_flags;
#ifdef CONFIG_PAGE_EXTENSION
	/*
	 * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
	 * section. (see page_ext.h about this.)
	 */
	struct page_ext *page_ext;
	unsigned long pad;
#endif
	/*
	 * WARNING: mem_section must be a power-of-2 in size for the
	 * calculation and use of SECTION_ROOT_MASK to make sense.
	 */
};

mem_section 变量定义于 /mm/sparse.c 中，如下：

#ifdef CONFIG_SPARSEMEM_EXTREME
struct mem_section **mem_section;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
	____cacheline_internodealigned_in_smp;
#endif

若未开启 CONFIG_SPARSEMEM_EXTREME 编译选项则 mem_section 为一个常规的二维数组，否则为一个二级指针，其所指向空间内存动态分配。
在这里插入图片描述
kernel 中提供了两个用以在 PFN（Page Frame Numer）与 page 结构体之间进行转换的宏，定义于 /include/asm-generic/memory_model.h 中，如下：

static inline struct mem_section *__nr_to_section(unsigned long nr)
{
#ifdef CONFIG_SPARSEMEM_EXTREME
	if (!mem_section)
		return NULL;
#endif
	if (!mem_section[SECTION_NR_TO_ROOT(nr)])
		return NULL;
	return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
}

static inline unsigned long pfn_to_section_nr(unsigned long pfn)
{
	return pfn >> PFN_SECTION_SHIFT;
}

static inline unsigned long page_to_section(const struct page *page)
{
	return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
}

static inline struct mem_section *__pfn_to_section(unsigned long pfn)
{
	return __nr_to_section(pfn_to_section_nr(pfn));
}

static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
	unsigned long map = section->section_mem_map;
	map &= SECTION_MAP_MASK;
	return (struct page *)map;
}

#elif defined(CONFIG_SPARSEMEM)
/*
 * Note: section's mem_map is encoded to reflect its start_pfn.
 * section[i].section_mem_map == mem_map's address - start_pfn;
 */
#define __page_to_pfn(pg)					\
({	const struct page *__pg = (pg);				\
	int __sec = page_to_section(__pg);			\
	(unsigned long)(__pg - __section_mem_map_addr(__nr_to_section(__sec)));	\
})

#define __pfn_to_page(pfn)				\
({	unsigned long __pfn = (pfn);			\
	struct mem_section *__sec = __pfn_to_section(__pfn);	\
	__section_mem_map_addr(__sec) + __pfn;		\
})
#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */

因此 pfn 与 page 的转换关系如下图所示：
在这里插入图片描述
基于Sparse Memory 内存模型上引入了 vmemmap 的概念，是目前 Linux 最常用的内存模型之一。
在开启了 vmemmap 之后，所有的 mem_section 中的 page 都抽象到一个虚拟数组 vmemmap 中，这样在进行 struct page * 和 pfn 转换时，直接使用 vmemmap 数组即可。
在这里插入图片描述

buddy system

分配掩码

分配掩码是描述页面分配方法的标志，它影响着页面分配的整个流程。因为 Linux 内核是一个通用的操作系统，所以页面分配器被设计成一个复杂的系统。它既要高效，又要兼顾很多种情况，特别是在内存紧张的情况下的内存分配。gfp_mask 其实被定义成一个 unsigned 类型的变量。

typedef unsigned __bitwise gfp_t;

gfp_mask 定义在 include/linux/gfp.h 文件中。修饰符在 Linux 4.4 内核中被重新归类，大致可以分成如下几类：

内存管理区修饰符(zone modifier)。
移动修饰符（mobility and placement modifier)。
水位修饰符（watermark modifier)。
页面回收修饰符(page reclaim modifier)。
行为修饰符(action modifier)。

下面详细介绍各种修饰符的标志。

内存管理区修饰符

内存管理区修饰符主要用于表示应当从哪些内存管理区中来分配物理内存。内存管理区修饰符使用gfp_mask 的低 4 位来表示。

___GFP_DMA：从 ZONE_DMA 中分配内存
___GFP_DMA32：从 ZONE_DMA32 中分配内存
___GFP _HIGHMEM：优先从 ZONE_HIGHMEM 中分配内存
___GFP_MOVABLE：页面可以被迁移或者回收，如用于内存规整机制

移动修饰符

移动修饰符主要用于指示分配出来的页面具有的迁移属性。在 Linux2.6.24 内核中，为了解决外碎片化的问题，引入了迁移类型，因此在分配内存时需要指定所分配的页面具有哪些迁移属性。

___GFP_RECLAIMABLE：在 slab 分配器中指定了 SLAB_RECLAIM_ACCOUNT 标志位，表示 slab 分配器中使用的页面可以通过收割机来回收
___GFP_HARDWALL：使能 cpusect 内存分配策略
___GFP_THISNODE：从指定的内存节点中分配内存，并且没有回退机制
___GFP_ACCOUNT：分配过程会被 kmemcg 记录

水位修饰符

水位修饰符用于控制是否可以访问系统预留的内存。所谓系统预留内存指的是最低警戒水位以下的内存，一般优先级的分配请求是不能访问它们的，只有高优先级的分配请求才能访问，如 ___GFP_HIGH 、___GFP_ATOMIC 等。

___GFP_HIGH：表示分配内存具有高优先级，并且这个分配请求是很有必要的，分配器可以使用系统预留的内存(即最低警戒水位线下的预留内存)
___GFP_ATOMIC：表示分配内存的过程不能执行页面回收或者睡眠动作，并且具有很高的优先级，可以访问系统预留的内存。常见的一个场景是在中断上下文中分配内存
___GFP_MEMALLOC：分配过程中允许访问所有的内存，包括系统预留的内存。分配内存进程通常要保证在分配内存过程中很快会有内存被释放，如进程退出或者页面回收
___GFP_NOMEMALLOC：分配过程不允许访问系统预留的内存

页面回收修饰符

___GFP_IO：允许开启I/O传输
___GFP_FS：允许调用底层的文件系统。这个标志清零通常是为了避免死锁的发生，如果相应的文件系统操作路径上已经持有了锁，分配内存过程又递归地调用这个文件系统的相应操作路径，可能会产生死锁
___GFP_DIRECT_RECLAIM：分配内存的过程中允许使用页面直接回收机制
___GFP_KSWAPD_RECLAIM：表示当到达内存管理区的低水位时会唤醒 kswapd 内核线程，以异步地回收内存，直到内存管理区恢复到了高水位为止
___GFP_RECLAIM：用于允许或者禁止直接页面回收和 kswapd 内核线程
___GFP_REPEAT：当分配失败时会继续尝试
___GFP_NOFAIL：当分配失败时会无限地尝试下去，直到分配成功为止。当分配者希望分配内存不失败时，应该使用这个标志位，而不是自己写一个 while 循环来不断地调用页面分配接口函数
___GFP_NORETRY：当使用了直接页面回收和内存规整等机制还无法分配内存时，最好不要重复尝试分配了，直接返回 NULL

行为修饰符

___GFP_COLD：分配的内存不会马上被使用。通常会返回一个空的高速缓存页面
___GFP_NOWARN：关闭分配过程中的一些错误报告
___GFP_ZERO：返回一个全部填充为 0 的页面
___GFP_NOTRACK：不被kmemcheck 机制跟踪
___GFP_OTHER NODE：在远端的一个内存节点上分配。通常在 khugepaged 内核线程中使用

类型标志

前文列出了 5 大类修饰符的标志，对于内核开发者或者驱动开发者来说，要正确使用这些标志是一件很困难的事情，因此定义了一些常用的标志的组合—类型标志（type flag)

GFP_KERNEL(__GFP_RECLAIM | __GFP_IO | __GFP_FS)：内核分配内存常用的标志之一。它可能会被阻塞，即分配过程中可能会睡眠
GFP_ATOMIC(__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)：调用者不能睡眠并且保证分配会成功。它可以访问系统预留的内存
GFP_NOWAIT(__GFP_KSWAPD_RECLAIM)：分配中不允许睡眠等待
GFP_NOFS(__GFP_RECLAIM | __GFP_IO)：不会访问任何的文件系统的接口和操作
GFP_NOIO(__GFP_RECLAIM)：不需要启动任何的 I/O 操作。如使用直接回收机制丢弃干净的页面或者为 slab 分配的页面
GFP_USER(__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)：通常用户空间的进程用来分配内存，这些内存可以被内核或者硬件使用。常用的、一个场景是硬件使用的 DMA 缓冲器要映射到用户空间，如显卡的缓冲器
GFP_DMA/GFP_DMA32(__GFP_DMA/__GFP_DMA32)：使用 ZONE_DMA 或者 ZONE_DMA32 来分配内存
GFP_HIGHUSER(GFP_USER | __GFP_HIGHMEM)：用户空间进程用来分配内存，优先使用 ZONE_HIGHMEM ，这些内存可以映射到用户空间，内核空间不会直接访问这些内存。另外，这些内存不能迁移
GFP_HIGHUSER_MOVABLE(GFP_HIGHUSER | __GFP_MOVABLE)：类似于 GFP_HIGHUSER ，但是页面可以迁移
GFP_TRANSHUGE/GFP_TRANSHUGE_LIGHT((GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)/((GFP_HIGHUSER_MOVABLE | __GFP_COMP | __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM))：通常用于透明页面分配

关键过程分析

__alloc_pages_nodemask

alloc_pages() 最终调用 __alloc _pages _nodemask() 函数，它是伙伴系统的核心函数，它实现在 mm/page_alloc.c 文件中。

struct page *__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask)

在这里插入图片描述
__alloc_pages_nodemask() 函数中主要执行的操作如下。

变量 alloc_flags 用于表示页面分配的行为和属性，这里初始化为ALLOC_WMARK_LOW，即允许分配内存的判断条件为低水位（WMARK_LOW）。

	unsigned int alloc_flags = ALLOC_WMARK_LOW;

伙伴系统能分配的最大内存块大小是 $MAX_ORDER − 1 2^{\text{MAX\_ORDER}-1}$ 个页面，若 MAX_ORDER 设置为 11 ，最大内存块为 4MB 大小。

	if (unlikely(order >= MAX_ORDER)) {
		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
		return NULL;
	}

alloc_context 数据结构是伙伴系统分配函数中用于保存相关参数的数据结构。prepare_alloc_pages() 函数会计算相关的信息并且保存到 alloc_context 数据结构中，如 high_zoneidx 、migratetype 、zonelist 等信息。

	struct alloc_context ac = { };
	...
	if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
		return NULL;

finalise_ac() 函数主要用于确定首选的 zone 。

	finalise_ac(gfp_mask, &ac);

alloc_flags_nofragment() 函数是用于内存碎片化方面的一个优化，这里引入了一个新的标志—— ALLOC_NOFRAGMENT 。使用 ZONE_DMA32 作为一个合适的 zone 来避免碎片化其实是微妙的。如果首选的 zone 是高端 zone ，如 ZONE_NORMAL ，过早地使用低端 zone 而造成的内存短缺问题比内存碎片化更严重。当发生外碎片化时，尽可能地从 ZONE_NORMAL 的其他迁移类型的空闲链表中多挪用一些空闲内存，这比过早地使用低端 zone 的内存要好很多。理想的情况下，从其他迁移类型的空闲链表中至少挪用 $pageblock_order 2^{\text{pageblock\_order}}$ 个空闲页面，见 _rmqueue_fallback() 函数。

	/*
	 * Forbid the first pass from falling back to types that fragment
	 * memory until all local zones are considered.
	 */
	alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);

get_page_from_freelist() 函数尝试从伙伴系统的空闲链表中分配内存。若配成功，则返回内存块的第一个页面的 page 数据结构。

	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
	if (likely(page))
		goto out;

当 get_page_from_freelist() 函数分配不成功时，就会进入分配的慢速路径，即 __alloc_pages_slowpath() 函数。

	/*
	 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
	 * resp. GFP_NOIO which has to be inherited for all allocation requests
	 * from a particular context which has been marked by
	 * memalloc_no{fs,io}_{save,restore}.
	 */
	alloc_mask = current_gfp_context(gfp_mask);
	ac.spread_dirty_pages = false;

	/*
	 * Restore the original nodemask if it was potentially replaced with
	 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
	 */
	if (unlikely(ac.nodemask != nodemask))
		ac.nodemask = nodemask;

	page = __alloc_pages_slowpath(alloc_mask, order, &ac);

在这里插入图片描述

prepare_alloc_page

prepare_alloc_pages() 函数实现在 page_alloc.c 文件中，主要用于初始化页面分配器中用到的参数。这些参数会临时存放在 alloc_context 数据结构中。

static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask, struct alloc_context *ac, gfp_t *alloc_mask, unsigned int *alloc_flags);

prepare _alloc_pages() 函数中的主要操作如下。

gfp_zone() 函数根据分配掩码计算出 zone 的 zoneidx ，并存放在 high_zoneidx 成员中。gfp_zone()函数会用到 GFP_ZONEMASK 、GFP_ZONE_TABLE 和 GFP_ZONES_SHIFT 等宏。

#define GFP_ZONEMASK	(__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)

static inline enum zone_type gfp_zone(gfp_t flags)
{
	enum zone_type z;
	int bit = (__force int) (flags & GFP_ZONEMASK);

	z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) &
					 ((1 << GFP_ZONES_SHIFT) - 1);
	VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
	return z;
}

	ac->high_zoneidx = gfp_zone(gfp_mask);

node_zonelist() 函数返回首选内存节点 preferred_nid 对应的 zonelist 。通常
一个内存节点包含两个 zonelist ：一个是 ZONELIST_FALLBACK ，表示本地的；另外一个是 ZONELIST_NOFALLBACK ，表示远端的。

static inline int gfp_zonelist(gfp_t flags)
{
#ifdef CONFIG_NUMA
	if (unlikely(flags & __GFP_THISNODE))
		return ZONELIST_NOFALLBACK;
#endif
	return ZONELIST_FALLBACK;
}

#define NODE_DATA(nid)		(node_data[nid])

static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
	return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}

	ac->zonelist = node_zonelist(preferred_nid, gfp_mask);

在这里插入图片描述
内核提供了很多与 zonelist 相关的宏，用于遍历 zonelist 。

zonelist_zone()

static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
	return zoneref->zone;
}

返回 zoneref 对应的 zone

zonelist_zone_idx()

static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
	return zoneref->zone_idx;
}

返回 zoneref 对应的 zone_idx

zonelist_node_idx()

static inline int zone_to_nid(struct zone *zone)
{
	return zone->node;
}

static inline int zonelist_node_idx(struct zoneref *zoneref)
{
	return zone_to_nid(zoneref->zone);
}

返回 zone 所属的内存节点的编号

first_zones_zonelist()

struct zoneref *__next_zones_zonelist(struct zoneref *z,
					enum zone_type highest_zoneidx,
					nodemask_t *nodes)
{
	/*
	 * Find the next suitable zone to use for the allocation.
	 * Only filter based on nodemask if it's set
	 */
	if (unlikely(nodes == NULL))
		while (zonelist_zone_idx(z) > highest_zoneidx)
			z++;
	else
		while (zonelist_zone_idx(z) > highest_zoneidx ||
				(z->zone && !zref_in_nodemask(z, nodes)))
			z++;

	return z;
}

static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes)
{
	if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx))
		return z;
	return __next_zones_zonelist(z, highest_zoneidx, nodes);
}

static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, nodemask_t *nodes)
{
	return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes);
}

返回 zonelist 中不超过给定 highest_zoneidx 的第一个 zone

for_each_zone_zonelist_nodemask()

#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
	for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z);	\
		zone;							\
		z = next_zones_zonelist(++z, highidx, nodemask),	\
			zone = zonelist_zone(z))

遍历 zonelist 中不超过给定 highest_zoneidx 的 zone

for_next_zone_zonelist_nodemask()

#define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
	for (zone = z->zone;	\
		zone;							\
		z = next_zones_zonelist(++z, highidx, nodemask),	\
			zone = zonelist_zone(z))

从给定的 zone 开始遍历 zonelist 中不超过给定 highest_zoneidx 的 zone

gfpflags_to_migratetype() 函数根据分配掩码来获取内存的迁移类型。

	ac->migratetype = gfpflags_to_migratetype(gfp_mask);

使用了新引入的故障注入（fault injection）技术。

	if (should_fail_alloc_page(gfp_mask, order))
		return false;

get_page_from_freelist

get _page_from_freelist() 函数的主要作用是从伙伴系统的空闲页面链表中尝试分配物理页面。

static struct page *get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac);

关于 get_page_from_freelist() 函数的分析如下。

ALLOC_NOFRAGMENT 是新增的一个标志，表示需要避免内存碎片化。

	no_fallback = alloc_flags & ALLOC_NOFRAGMENT;

preferred_zoneref 表示 zonelist 中首选和推荐的 zone ，这个值是在 finalise_ac() 函数中通过 first_zones_zonelist() 宏计算出来的。

	z = ac->preferred_zoneref;

从推荐的 zone 开始遍历所有的 zone ，这里使用 for_next_zonezonelist_ nodemask()宏。

	for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
								ac->nodemask) {

这里判断了 NUMA 系统的一个特殊情况。当要分配内存的 zone 不在本地内存节点(即在远端节点）时，要考虑的不是内存碎片化，而是内存的本地性，因为访问本地内存节点要比访问远端内存节点要快很多。

		if (no_fallback && nr_online_nodes > 1 &&
		    zone != ac->preferred_zoneref->zone) {
			int local_nid;

			/*
			 * If moving to a remote node, retry but allow
			 * fragmenting fallbacks. Locality is more important
			 * than fragmentation avoidance.
			 */
			local_nid = zone_to_nid(ac->preferred_zoneref->zone);
			if (zone_to_nid(zone) != local_nid) {
				alloc_flags &= ~ALLOC_NOFRAGMENT;
				goto retry;
			}
		}

wmark_pages() 宏用来计算 zone 中某个水位的页面大小。zone 的水位分成最低警戒水位（WMARK_MIN）、低水位（WMARK_LOW）和高水位（WMARK_HIGH）。从 Linux 5.0 内核开始，实现了一个临时增加水位（boost watermark）的功能，用来应对外碎片化的情况。

#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)

		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);

zone_watermark_fast() 函数用于判断当前 zone 的空闲页面是否满足 WMARK_LOW 。另外，还会根据 order 来判断是否有足够大的空闲内存块。若该函数返回 true ，表示 zone 的页面高于指定的水位或者满足 order 分配需求。

		if (!zone_watermark_fast(zone, order, mark,
				       ac_classzone_idx(ac), alloc_flags)) {

处理当前的 zone 不满足内存分配需求的情况。这里需要分两种情况考虑，若 node_reclaim_mode 为 0 ，则表示可以从下一个 zone 或者内存节点中分配内存；否则，表示可以在这个 zone 中进行一些内存回收的动作。调用 node _reclaim() 函数尝试回收一部分内存。在 /proc/sys/kernel/vm 目录下有 zone_reclaim_mode 节点，称为 zone reclaim_mode 模式。该节点的值会影响这里的 node_reclaim_mode 变量。通常情况下，zone_reclaim_mode 模式是关闭的，即默认关闭直接从本地 zone 中回收内存。

			int ret;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
			/*
			 * Watermark failed for this zone, but see if we can
			 * grow this zone if it contains deferred pages.
			 */
			if (static_branch_unlikely(&deferred_pages)) {
				if (_deferred_grow_zone(zone, order))
					goto try_this_zone;
			}
#endif
			/* Checked here to keep the fast path fast */
			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
			if (alloc_flags & ALLOC_NO_WATERMARKS)
				goto try_this_zone;

			if (node_reclaim_mode == 0 ||
			    !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
				continue;

			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
			switch (ret) {
			case NODE_RECLAIM_NOSCAN:
				/* did not scan */
				continue;
			case NODE_RECLAIM_FULL:
				/* scanned but unreclaimable */
				continue;
			default:
				/* did we reclaim enough */
				if (zone_watermark_ok(zone, order, mark,
						ac_classzone_idx(ac), alloc_flags))
					goto try_this_zone;

				continue;
			}

标签 try_this_zone 表示马上要从这个 zone 中分配内存了。rmqueue() 函数会从伙伴系统中分配内存。rmqueue() 函数是伙伴系统的核心分配函数。

try_this_zone:
		page = rmqueue(ac->preferred_zoneref->zone, zone, order,
				gfp_mask, alloc_flags, ac->migratetype);

当从伙伴系统分配页面成功之后需要设置页面的一些属性以及做必要的检查，如设置页面的 _refcount 统计计数为 1 ，设置页面的 private 属性为 0 等。最后返回成功分配页面的 page 数据结构。

		if (page) {
			prep_new_page(page, order, gfp_mask, alloc_flags);

			/*
			 * If this is a high-order atomic allocation then check
			 * if the pageblock should be reserved for the future
			 */
			if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
				reserve_highatomic_pageblock(page, zone, order);

			return page;
		}

当遍历完所有的 zone 之后，还没有成功分配出所需要的内存，最有可能的情况是系统中产生了外碎片化。这时可以重新尝试一次。

	if (no_fallback) {
		alloc_flags &= ~ALLOC_NOFRAGMENT;
		goto retry;
	}

get_page_from_freelist() 函数的流程如图所示。
在这里插入图片描述

get_page_from_freelist() 函数是伙伴系统中非常重要的函数，关于该函数，有几点需要注意。

遍历 zonelist 中的 zone 时，扫描 zone 的方向是从高端 zone 到低端 zone 。
大部分情况不一定扫描 zonelist 中所有的 zone ，而是从首选的 zone(preferred_zone)开始扫描。首选的 zone 是通过 gfp_mask 换算的,详见 gfp_zone() 宏和 first_zones_zonelist() 宏。
alloc_context 是一个非常重要的参数，它确定了从哪个 zone 开始扫描和分配内存的迁移类型等信息，这些信息都必须在调用 get_page_from_freelist() 函数时初始化。
在分配内存之前需要判断 zone 的水位情况以及是否满足分配连续大内存块的需求，这是 zone_watermark_ok() 函数的职责。即使 zone_watermark_ok() 函数判断成功，最终也可能会分配失败。究其原因，一方面是内存外碎片化严重，另一方面可能是无法借用其他迁移类型的内存，见 __rmqueue_fallback() 函数。

zone_watermark_fast

zone_watermark_fast() 函数用于测试当前 zone 的水位情况，以及检查是否满足多个页面(order 大于 0)的分配请求。zone_watermark_fast() 函数实现在 mm/page_alloc.c 文件中。

static inline bool zone_watermark_fast(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, unsigned int alloc_flags)

这个函数有 5 个参数。

struct zone *z：表示检测是否满足分配请求的 zone 。
unsigned int order：分配 $2^{\text{order}}$ 个物理页面。
unsigned long mark：表示要测试的水位标准。
int classzone_idx：表示首选 zone 的编号。
unsigned int alloc_flags：分配器内部使用的标志位属性。

计算水位需要用到 min_free_kbytes,它是在系统启动时通过系统空闲页面的数量来计算的，具体计算在 init_per_zone_wmark_min() 函数中。另外，系统启动之后也可以通过 sysfs 来设置，节点在 /proc/sys/vm/min_free_kbytes 。计算水位的公式不算复杂，最后结果保存在每个 zone 的 watermark[] 数组中，后续伙伴系统和 kswapd 内核线程会用到。

zone_watermark fast() 函数中的主要操作如下。

zone 里有一个关于物理页面统计数据的数组 vm_stat[] ，这个数组里存放了该 zone 中各种页面的统计数据，包括空闲页面数量 NR_FREE_PAGES 、不活跃的匿名页面数量 NR_ZONE_INACTIVE_ANON 等。zone_page_state() 函数用于获取 zone 中空闲页面的数量。

static inline unsigned long zone_page_state(struct zone *zone,
					enum zone_stat_item item)
{
	long x = atomic_long_read(&zone->vm_stat[item]);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

	long free_pages = zone_page_state(z, NR_FREE_PAGES);

针对分配一个页面（order=0）的情况做快速处理。这里特别需要注意的地方就是 lowmem_reserve 。它是每个 zone 预留的内存，为了防止高端 zone 在没内存的情况下过度使用低端 zone 的内存资源。

	long cma_pages = 0;

#ifdef CONFIG_CMA
	/* If allocation can't use CMA areas don't use free CMA pages */
	if (!(alloc_flags & ALLOC_CMA))
		cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
    ...
	if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
		return true;

调用 __zone_watermark_ok() 进一步检查

	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, free_pages);

在 __zone_watermark_ok() 中继续做空闲页面的检查。

	long min = mark;
	int o;
	const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));

	/* free_pages may go negative - that's OK */
	free_pages -= (1 << order) - 1;

	if (alloc_flags & ALLOC_HIGH)
		min -= min / 2;

	/*
	 * If the caller does not have rights to ALLOC_HARDER then subtract
	 * the high-atomic reserves. This will over-estimate the size of the
	 * atomic reserve but it avoids a search.
	 */
	if (likely(!alloc_harder)) {
		free_pages -= z->nr_reserved_highatomic;
	} else {
		/*
		 * OOM victims can try even harder than normal ALLOC_HARDER
		 * users on the grounds that it's definitely going to be in
		 * the exit path shortly and free memory. Any allocation it
		 * makes during the free path will be small and short-lived.
		 */
		if (alloc_flags & ALLOC_OOM)
			min -= min / 2;
		else
			min -= min / 4;
	}


#ifdef CONFIG_CMA
	/* If allocation can't use CMA areas don't use free CMA pages */
	if (!(alloc_flags & ALLOC_CMA))
		free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
#endif

	/*
	 * Check watermarks for an order-0 allocation request. If these
	 * are not met, then a high-order request also cannot go ahead
	 * even if a suitable page happened to be free.
	 */
	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
		return false;

检查是否满足分配 $2^{\text{order}}$ 个页面的需求。这里会判断 MIGRATE_UNMOVABLE 迁移类型到 MIGRATE_RECLAIMABLE 迁移类型是否满足分配需求。只要上述几个迁移类型中的空闲页面链表中有满足 order 需求的内存块，我们就初步认为满足分配需求了，后续可以从迁移类型中挪用内存。

	/* For a high-order request, check at least one suitable page is free */
	for (o = order; o < MAX_ORDER; o++) {
		struct free_area *area = &z->free_area[o];
		int mt;

		if (!area->nr_free)
			continue;

		for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
			if (!list_empty(&area->free_list[mt]))
				return true;
		}

#ifdef CONFIG_CMA
		if ((alloc_flags & ALLOC_CMA) &&
		    !list_empty(&area->free_list[MIGRATE_CMA])) {
			return true;
		}
#endif
		if (alloc_harder &&
			!list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
			return true;
	}
	return false;

rmqueue

rmqueue() 函数会从伙伴系统中获取内存。若需要的内存块不能满足，那么可以从大内存块中“切”内存。

static inline struct page *rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags, int migratetype);

rmqueue() 函数一共有 6 个参数。

struct zone *preferred_zone：首选的 zone 。
struct zone *zone：当前遍历的 zone 。
unsigned int order：分配 $2^{\text{order}}$ 个连续物理页面。
gfp_t gfp_flags：调用者传递过来的分配掩码。
unsigned int alloc_flags：页面分配器内部使用的标志位。
int migratetype：分配内存的迁移类型。

rmqueue() 函数有返回值，当分配成功时，它返回内存块第一个页面的 page 数据结构。

rmqueue() 函数主要实现如下操作。

处理分配单个物理页面的情况（order=0）。调用 rmqueue_pcplist() 函数，从 Per-CPU 变量 per_cpu_pages（PCP机制）中分配物理页面。 per_cpu_pages 是一个 Per-CPU 变量，即每个 CPU 都有一个本地的 per_cpu_pages 变量。这个 per_cpu_pages 数据结构里有一个单页面的链表，里面暂时存放了一小部分单个的物理页面。当系统需要单个物理页面时，就从本地 CPU 的 Per-CPU 变量的链表中直接获取物理页面即可，这不仅效率非常高，而且能减少对 zone 中相关锁的操作。

	if (likely(order == 0)) {
		page = rmqueue_pcplist(preferred_zone, zone, order,
				gfp_flags, migratetype, alloc_flags);
		goto out;
	}

每一个 zone 里面有一个这样的 Per-CPU 变量。

struct zone {
    ...
	struct per_cpu_pageset __percpu *pageset;
    ...
}

per_cpu_pageset 数据结构的定义如下：

struct per_cpu_pages {
	int count;		/* 链表中页面的数量 */
	int high;		/* 高水位 */
	int batch;		/* 每一次回收到伙伴系统中的页面数量 */

	/* 页面链表，分成多个迁移类型 */
	struct list_head lists[MIGRATE_PCPTYPES];
};

struct per_cpu_pageset {
	struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
	s8 expire;
	u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
#endif
#ifdef CONFIG_SMP
	s8 stat_threshold;
	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
};

count 表示链表中页面的数量；
high 表示当缓存的页面高于该水位时会回收页面到伙伴系统；
batch 表示一次回收到伙伴系统的页面数量。

处理 order 大于 0 的情况。首先需要申请一个 zone->lock 的自旋锁来保护 zone 中的伙伴系统，这也是 PCP 机制可以加速物理页面分配的原因，在 PCP 机制中不需要申请这个锁。

	spin_lock_irqsave(&zone->lock, flags);

do-while 循环调用 __rmqueue() 函数分配内存。

在 __rmqueue() 函数中首先调用 __rmqueue_smallest() 函数尝试切一块内存返回。若 __rmqueue_smallest() 函数分配内存失败，则调用 __rmqueue_fallback() 函数，该函数会到伙伴系统的备份空闲链表中挪用内存。所谓的备份空闲链表，指的是不同迁移类型的空闲链表。如 order 为 5 的空闲链表中，根据迁移类型会划分为 MIGRATE_UNMOVABLE 、MIGRATE_MOVABLE 和 MIGRATE_RECLAIMABLE 。当分配 order 为 5 的 MIGRATE_MOVABLE 的页面失败时，首先从 MAX_ORDER-1 的空闲链表开始查找，并搜索 MIGRATE_RECLAIMABLE 和 MIGRATE_UNMOVABLE 这两个迁移类型的空闲链表中是否有内存可以借用。

static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
						unsigned int alloc_flags)
{
	struct page *page;

retry:
	page = __rmqueue_smallest(zone, order, migratetype);
	if (unlikely(!page)) {
		if (migratetype == MIGRATE_MOVABLE)
			page = __rmqueue_cma_fallback(zone, order);

		if (!page && __rmqueue_fallback(zone, order, migratetype,
								alloc_flags))
			goto retry;
	}

	trace_mm_page_alloc_zone_locked(page, order, migratetype);
	return page;
}

	do {
		page = NULL;
		if (alloc_flags & ALLOC_HARDER) {
			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
			if (page)
				trace_mm_page_alloc_zone_locked(page, order, migratetype);
		}
		if (!page)
			page = __rmqueue(zone, order, migratetype, alloc_flags);
	} while (page && check_new_pages(page, order));
	spin_unlock(&zone->lock);
	if (!page)
		goto failed;

check_new_pages() 函数判断分配出来的页面是否合格。首先，__rmqueue() 函数返回内存块第一个物理页面的 page 数据结构，而 check_new _pages() 函数需要检查整个内存块所有的物理页面。刚分配页面的 page 的_mapcount 应该为 0 。这时 page->mapping 为 NULL 。然后，判断 page 的 _refcount 是否为 0 。注意，alloc_pages() 分配的 page 的 _refcount 应该为 1 ，但是这里为 0 ，因为这个函数之后还调用 set _page_refcounted()->set _page_count() 把 _refcount 设置为 1 。检查 PAGE FLAGS_CHECK_AT PREP 标志位，这个标志位在释放页面时已经被清零了，而这时该标志位被设置，说明分配过程中有问题。若 check_new_pages() 函数返回 true ，说明页面有问题；否则，说明页面检查通过。

static void check_new_page_bad(struct page *page)
{
	const char *bad_reason = NULL;
	unsigned long bad_flags = 0;

	if (unlikely(atomic_read(&page->_mapcount) != -1))
		bad_reason = "nonzero mapcount";
	if (unlikely(page->mapping != NULL))
		bad_reason = "non-NULL mapping";
	if (unlikely(page_ref_count(page) != 0))
		bad_reason = "nonzero _count";
	if (unlikely(page->flags & __PG_HWPOISON)) {
		bad_reason = "HWPoisoned (hardware-corrupted)";
		bad_flags = __PG_HWPOISON;
		/* Don't complain about hwpoisoned pages */
		page_mapcount_reset(page); /* remove PageBuddy */
		return;
	}
	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
		bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
		bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
	}
#ifdef CONFIG_MEMCG
	if (unlikely(page->mem_cgroup))
		bad_reason = "page still charged to cgroup";
#endif
	bad_page(page, bad_reason, bad_flags);
}

static inline int check_new_page(struct page *page)
{
	if (likely(page_expected_state(page,
				PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
		return 0;

	check_new_page_bad(page);
	return 1;
}

static bool check_new_pages(struct page *page, unsigned int order)
{
	int i;
	for (i = 0; i < (1 << order); i++) {
		struct page *p = page + i;

		if (unlikely(check_new_page(p)))
			return true;
	}

	return false;
}

页面分配完成后需要更新 zone 的 NR_FREE_PAGES 。

static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
					     int migratetype)
{
	__mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
	if (is_migrate_cma(migratetype))
		__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
}

	__mod_zone_freepage_state(zone, -(1 << order),
				  get_pcppage_migratetype(page));

判断 zone->flags 是否设置了 ZONE_BOOSTED_WATERMARK 标志位。若该标志位置位，则将其清零，并且唤醒 kswapd 内核线程回收内存。当页面分配器触发向备份空闲链表借用内存时，说明系统有外碎片化倾向，因此设置 ZONE_BOOSTED_WATERMARK 标志位，这是 Linux 5.0 内核中新增的外碎片化优化补丁。

	/* Separate test+clear to avoid unnecessary atomics */
	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
	}

VM_BUG_ON_PAGE() 宏需要打开 CONFIG_DEBUG_VM 配置才会生效。之后返回分配好的内存块中第一个页面的 page 数据结构。

	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
	return page;

_rmqueue_smallest() 函数实现在 mm/page_alloc.c 文件中。

static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area *area;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		page = list_first_entry_or_null(&area->free_list[migratetype],
							struct page, lru);
		if (!page)
			continue;
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		expand(zone, page, order, current_order, area, migratetype);
		set_pcppage_migratetype(page, migratetype);
		return page;
	}

	return NULL;
}

在 __rmqueue_smallest() 函数中，首先从 order 开始查找 zone 中的空闲链表。如果 zone 的当前 order 对应的空闲链表中相应迁移类型的链表里没有空闲对象，就会查找上一级 order 对应的空闲链表。
因为在系统启动时，会尽可能把空闲页面分配到 MAX_ORDER-1 的链表中，在系统刚启动之后，可以通过cat /proc/pagetypeinfo 命令看出。当找到某一个 order 的空闲链表中对应的 migratetype 类型的链表中有空闲内存块时，就会从中把一个内存块取出来，然后调用 expand() 函数来分配。因为通常取出的内存块要比需要的内存大，所以分配之后需要把剩下的内存块重新放回伙伴系统中。

static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;
		high--;
		size >>= 1;
		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

		/*
		 * Mark as guard pages (or page), that will allow to
		 * merge back to allocator when buddy will be freed.
		 * Corresponding page table entries will not be touched,
		 * pages will stay not present in virtual address space
		 */
		if (set_page_guard(zone, &page[size], high, migratetype))
			continue;

		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}

expand() 函数用于实现分配的功能。这里参数 high 就是 current_order ，通常 current _order 要比需求的 order 要大。每比较一次，相当于 order 降低了一个级别，最后通过 list _add() 函数把剩下的内存块添加到低一级的空闲链表中。

__alloc_pages_slowpath

当快速路径的分配不成功时，说明系统当前可能已经没有足够的连续的空闲页面，这时我们就要进入到慢速路径的分配，进行内存碎片整理与内存回收，之后再进行分配。

static inline struct page *__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac);

__alloc_pages_slowpath 函数主要实现如下操作。

使用原有的 gfp_flag 重新设置 alloc_flag，并重新计算 preferred zone，若设置了 ALLOC_KSWAPD 则调用 wake_all_kswapds() 唤醒 kswapd 线程进行内存回收。

	/*
	 * The fast path uses conservative alloc_flags to succeed only until
	 * kswapd needs to be woken up, and to avoid the cost of setting up
	 * alloc_flags precisely. So we do that now.
	 */
	alloc_flags = gfp_to_alloc_flags(gfp_mask);

	/*
	 * We need to recalculate the starting point for the zonelist iterator
	 * because we might have used different nodemask in the fast path, or
	 * there was a cpuset modification and we are retrying - otherwise we
	 * could end up iterating over non-eligible zones endlessly.
	 */
	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->high_zoneidx, ac->nodemask);
	if (!ac->preferred_zoneref->zone)
		goto nopage;

	if (alloc_flags & ALLOC_KSWAPD)
		wake_all_kswapds(order, gfp_mask, ac);

之后重新尝试快速路径的分配，若成功则直接返回。

	/*
	 * The adjusted alloc_flags might result in immediate success, so try
	 * that first
	 */
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
	if (page)
		goto got_pg;

接下来调用 __alloc_pages_direct_compact() 进行 compaction，该函数内部在整理完后会重新尝试快速路径的分配，若成功则直接返回。

	/*
	 * For costly allocations, try direct compaction first, as it's likely
	 * that we have enough base pages and don't need to reclaim. For non-
	 * movable high-order allocations, do that as well, as compaction will
	 * try prevent permanent fragmentation by migrating from blocks of the
	 * same migratetype.
	 * Don't try this for allocations that are allowed to ignore
	 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
	 */
	if (can_direct_reclaim &&
			(costly_order ||
			   (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
			&& !gfp_pfmemalloc_allowed(gfp_mask)) {
		page = __alloc_pages_direct_compact(gfp_mask, order,
						alloc_flags, ac,
						INIT_COMPACT_PRIORITY,
						&compact_result);
		if (page)
			goto got_pg;

接下来调用 wake_all_kswapds() 唤醒 kswapd 线程进行内存回收。

retry:
	/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
	if (alloc_flags & ALLOC_KSWAPD)
		wake_all_kswapds(order, gfp_mask, ac);

调整 zonelist 与 alloc_flag ，之后再次尝试快速路径分配，若成功则直接返回。

	reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
	if (reserve_flags)
		alloc_flags = reserve_flags;

	/*
	 * Reset the nodemask and zonelist iterators if memory policies can be
	 * ignored. These allocations are high priority and system rather than
	 * user oriented.
	 */
	if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
		ac->nodemask = NULL;
		ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->high_zoneidx, ac->nodemask);
	}

	/* Attempt with potentially adjusted zonelist and alloc_flags */
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
	if (page)
		goto got_pg;

若 gfp_flag 中没有 __GFP_DIRECT_RECLAIM 或是进程 PCB 的 flag 中有 PF_MEMALLOC，直接跳转到（nopage）。

	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
	...
	/* Caller is not willing to reclaim, we can't balance anything */
	if (!can_direct_reclaim)
		goto nopage;

	/* Avoid recursion of direct reclaim */
	if (current->flags & PF_MEMALLOC)
		goto nopage;

调用 __alloc_pages_direct_reclaim() 进行内存回收（内部调用 __perform_reclaim()）与快速路径分配，若成功则直接返回。

/* The really slow allocator path where we enter direct reclaim */
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		unsigned long *did_some_progress)
{
	struct page *page = NULL;
	bool drained = false;

	*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
	if (unlikely(!(*did_some_progress)))
		return NULL;

retry:
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);

	/*
	 * If an allocation failed after direct reclaim, it could be because
	 * pages are pinned on the per-cpu lists or in high alloc reserves.
	 * Shrink them them and try again
	 */
	if (!page && !drained) {
		unreserve_highatomic_pageblock(ac, false);
		drain_all_pages(NULL);
		drained = true;
		goto retry;
	}

	return page;
}

	/* Try direct reclaim and then allocating */
	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
							&did_some_progress);
	if (page)
		goto got_pg;

调用 __alloc_pages_direct_compact() 进行 compaction 与快速路径分配，若成功则直接返回。

/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		enum compact_priority prio, enum compact_result *compact_result)
{
	struct page *page;
	unsigned long pflags;
	unsigned int noreclaim_flag;

	if (!order)
		return NULL;

	psi_memstall_enter(&pflags);
	noreclaim_flag = memalloc_noreclaim_save();

	*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
									prio);

	memalloc_noreclaim_restore(noreclaim_flag);
	psi_memstall_leave(&pflags);

	if (*compact_result <= COMPACT_INACTIVE)
		return NULL;

	/*
	 * At least in one zone compaction wasn't deferred or skipped, so let's
	 * count a compaction stall
	 */
	count_vm_event(COMPACTSTALL);

	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);

	if (page) {
		struct zone *zone = page_zone(page);

		zone->compact_blockskip_flush = false;
		compaction_defer_reset(zone, order, true);
		count_vm_event(COMPACTSUCCESS);
		return page;
	}

	/*
	 * It's bad if compaction run occurs and fails. The most likely reason
	 * is that pages exist, but not enough to satisfy watermarks.
	 */
	count_vm_event(COMPACTFAIL);

	cond_resched();

	return NULL;
}

	/* Try direct compaction and then allocating */
	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
					compact_priority, &compact_result);
	if (page)
		goto got_pg;

如果设置了 __GFP_NORETRY ，或是该次内存分配开销较高（order > PAGE_ALLOC_COSTLY_ORDER）且未设置 __GFP_RETRY_MAYFAIL，直接跳到（nopage）

#define PAGE_ALLOC_COSTLY_ORDER 3

	const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
	...
	/* Do not loop if specifically requested */
	if (gfp_mask & __GFP_NORETRY)
		goto nopage;

	/*
	 * Do not retry costly high order allocations unless they are
	 * __GFP_RETRY_MAYFAIL
	 */
	if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
		goto nopage;

调用 should_reclaim_retry() 判断是否需要重新回收，若是则跳回（retry）

	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
				 did_some_progress > 0, &no_progress_loops))
		goto retry;

调用 should_compact_retry() 判断是否需要重新进行 compaction，若是则跳回（retry）

	/*
	 * It doesn't make any sense to retry for the compaction if the order-0
	 * reclaim is not able to make any progress because the current
	 * implementation of the compaction depends on the sufficient amount
	 * of free memory (see __compaction_suitable)
	 */
	if (did_some_progress > 0 &&
			should_compact_retry(ac, order, alloc_flags,
				compact_result, &compact_priority,
				&compaction_retries))
		goto retry;

调用 check_retry_cpuset() 检查 cpuset 是否发生变化，若是则跳转回开头

	/* Deal with possible cpuset update races before we start OOM killing */
	if (check_retry_cpuset(cpuset_mems_cookie, ac))
		goto retry_cpuset;

调用 __alloc_pages_may_oom() 尝试 kill 一些进程来释放内存，该函数内首先还是会先进行一次快速分配，之后才是调用 out_of_memory() 来杀掉最适合的进程以释放内存，最后若设置了 __GFP_NOFAIL 则调用 __alloc_pages_cpuset_fallback() 再次尝试内存分配，在该函数中会两次走快速路径进行分配（第一次会额外附加上 ALLOC_CPUSET 的 flag）

static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
	const struct alloc_context *ac, unsigned long *did_some_progress)
{
	struct oom_control oc = {
		.zonelist = ac->zonelist,
		.nodemask = ac->nodemask,
		.memcg = NULL,
		.gfp_mask = gfp_mask,
		.order = order,
	};
	struct page *page;

	*did_some_progress = 0;

	/*
	 * Acquire the oom lock.  If that fails, somebody else is
	 * making progress for us.
	 */
	if (!mutex_trylock(&oom_lock)) {
		*did_some_progress = 1;
		schedule_timeout_uninterruptible(1);
		return NULL;
	}

	/*
	 * Go through the zonelist yet one more time, keep very high watermark
	 * here, this is only to catch a parallel oom killing, we must fail if
	 * we're still under heavy pressure. But make sure that this reclaim
	 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
	 * allocation which will never fail due to oom_lock already held.
	 */
	page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
				      ~__GFP_DIRECT_RECLAIM, order,
				      ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
	if (page)
		goto out;

	/* Coredumps can quickly deplete all memory reserves */
	if (current->flags & PF_DUMPCORE)
		goto out;
	/* The OOM killer will not help higher order allocs */
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		goto out;
	/*
	 * We have already exhausted all our reclaim opportunities without any
	 * success so it is time to admit defeat. We will skip the OOM killer
	 * because it is very likely that the caller has a more reasonable
	 * fallback than shooting a random task.
	 */
	if (gfp_mask & __GFP_RETRY_MAYFAIL)
		goto out;
	/* The OOM killer does not needlessly kill tasks for lowmem */
	if (ac->high_zoneidx < ZONE_NORMAL)
		goto out;
	if (pm_suspended_storage())
		goto out;
	/*
	 * XXX: GFP_NOFS allocations should rather fail than rely on
	 * other request to make a forward progress.
	 * We are in an unfortunate situation where out_of_memory cannot
	 * do much for this context but let's try it to at least get
	 * access to memory reserved if the current task is killed (see
	 * out_of_memory). Once filesystems are ready to handle allocation
	 * failures more gracefully we should just bail out here.
	 */

	/* The OOM killer may not free memory on a specific node */
	if (gfp_mask & __GFP_THISNODE)
		goto out;

	/* Exhausted what can be done so it's blame time */
	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
		*did_some_progress = 1;

		/*
		 * Help non-failing allocations by giving them access to memory
		 * reserves
		 */
		if (gfp_mask & __GFP_NOFAIL)
			page = __alloc_pages_cpuset_fallback(gfp_mask, order,
					ALLOC_NO_WATERMARKS, ac);
	}
out:
	mutex_unlock(&oom_lock);
	return page;
}

	/* Reclaim has failed us, start killing things */
	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
	if (page)
		goto got_pg;

如果把当前进程杀掉了，跳到（nopage）；如果杀进程取得了成效，跳回（retry）

	/* Avoid allocations with no watermarks from looping endlessly */
	if (tsk_is_oom_victim(current) &&
	    (alloc_flags == ALLOC_OOM ||
	     (gfp_mask & __GFP_NOMEMALLOC)))
		goto nopage;

	/* Retry as long as the OOM killer is making progress */
	if (did_some_progress) {
		no_progress_loops = 0;
		goto retry;
	}

返回结果

nopage:
	/* Deal with possible cpuset update races before we fail */
	if (check_retry_cpuset(cpuset_mems_cookie, ac))
		goto retry_cpuset;

	/*
	 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
	 * we always retry
	 */
	if (gfp_mask & __GFP_NOFAIL) {
		/*
		 * All existing users of the __GFP_NOFAIL are blockable, so warn
		 * of any new users that actually require GFP_NOWAIT
		 */
		if (WARN_ON_ONCE(!can_direct_reclaim))
			goto fail;

		/*
		 * PF_MEMALLOC request from this context is rather bizarre
		 * because we cannot reclaim anything and only can loop waiting
		 * for somebody to do a work for us
		 */
		WARN_ON_ONCE(current->flags & PF_MEMALLOC);

		/*
		 * non failing costly orders are a hard requirement which we
		 * are not prepared for much so let's warn about these users
		 * so that we can identify them and convert them to something
		 * else.
		 */
		WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);

		/*
		 * Help non-failing allocations by giving them access to memory
		 * reserves but do not use ALLOC_NO_WATERMARKS because this
		 * could deplete whole memory reserves which would just make
		 * the situation worse
		 */
		page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
		if (page)
			goto got_pg;

		cond_resched();
		goto retry;
	}
fail:
	warn_alloc(gfp_mask, ac->nodemask,
			"page allocation failure: order:%u", order);
got_pg:
	return page;
}

__alloc_pages_slowpath() 函数的流程如图所示。
在这里插入图片描述

__free_pages

释放页面的核心函数是 free_pages() ，但最终还会调用 __free_pages() 函数。

void free_pages(unsigned long addr, unsigned int order)
{
	if (addr != 0) {
		VM_BUG_ON(!virt_addr_valid((void *)addr));
		__free_pages(virt_to_page((void *)addr), order);
	}
}

__free_pages() 函数在释放页面时会分两种情况：对于 order 等于 0 的情况，特殊处理；对于 order 大于 0 的情况，正常处理。

static inline void free_the_page(struct page *page, unsigned int order)
{
	if (order == 0)		/* Via pcp? */
		free_unref_page(page);
	else
		__free_pages_ok(page, order);
}

void __free_pages(struct page *page, unsigned int order)
{
	if (put_page_testzero(page))
		free_the_page(page, order);
}

首先来看 order 等于 0 的情况—— free _unref _page() 函数释放单个页面的情况。
page_to_pfn() 宏将 page 数据结构转换成页帧号。

	unsigned long pfn = page_to_pfn(page);

free_unref_page _prepare() 函数对物理页面做一些释放前的检查。

	if (!free_unref_page_prepare(page, pfn))
		return;

释放物理页面的过程中，我们不希望有中断打扰，因为中断过程中可能会触发另外一个页面分配，从而扰乱了本地 CPU 的 PCP 链表结构，所以关闭本地中断。

	migratetype = get_pfnblock_migratetype(page, pfn);

调用 free_unref_page_commit() 函数来释放单个页面到 PCP 链表中。

	set_pcppage_migratetype(page, migratetype);

接下来看多个页面的释放情况。__free_pages_ok() 函数最后会调用 free_one_page() 函数，因此释放内存页面到伙伴系统中，这最终还是通过 __free_one_page 来实现的。该函数不仅可以释放内存页面到伙伴系统，还可以处理空闲页面的合并工作。

static void free_one_page(struct zone *zone,
				struct page *page, unsigned long pfn,
				unsigned int order,
				int migratetype)
{
	spin_lock(&zone->lock);
	if (unlikely(has_isolate_pageblock(zone) ||
		is_migrate_isolate(migratetype))) {
		migratetype = get_pfnblock_migratetype(page, pfn);
	}
	__free_one_page(page, pfn, zone, order, migratetype);
	spin_unlock(&zone->lock);
}

static void __free_pages_ok(struct page *page, unsigned int order)
{
	unsigned long flags;
	int migratetype;
	unsigned long pfn = page_to_pfn(page);

	if (!free_pages_prepare(page, order, true))
		return;

	migratetype = get_pfnblock_migratetype(page, pfn);
	local_irq_save(flags);
	__count_vm_events(PGFREE, 1 << order);
	free_one_page(page_zone(page), page, pfn, order, migratetype);
	local_irq_restore(flags);
}

释放内存页面的核心功能是把空闲页面添加到伙伴系统中适当的空闲链表中。在释放内存块时，会检查相邻的内存块是否空闲，如果空闲，就将其合并成一个大的内存块，放置到高一级的空闲链表中。如果还能继续合并邻近的内存块，就会继续合并，转移到更高级的空闲链表中，这个过程会一直重复下去，直至所有可能合并的内存块都已经合并。

__free_one_page() 函数实现在 mm/page_alloc.c 文件中。

static inline void __free_one_page(struct page *page, unsigned long pfn, struct zone *zone, unsigned int order, int migratetype);

这段代码是合并相邻伙伴块的核心代码。我们以一个实际例子来说明这段代码的逻辑，假设现在要释放一个内存块 A ，大小为两个页面，内存块中页面的开始页帧号是 0x10，order 为 1 ，空闲伙伴块的布局如图所示。
在这里插入图片描述
首先计算 max_order

#define MAX_ORDER 11
#define PAGE_SHIFT		12
#define HPAGE_SHIFT		PMD_SHIFT
#define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
#define pageblock_order		HUGETLB_PAGE_ORDER

max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);

这个页面的页帧号是 0x10 。也就是说，这个内存块位于页块中 0x10 的位置。

在第一次 while 循环中，__find_buddy_pfn() 函数计算 buddy_pfn 。

static inline unsigned long
__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
{
	return page_pfn ^ (1 << order);
}

		buddy_pfn = __find_buddy_pfn(pfn, order);
		buddy = page + (buddy_pfn - pfn);

page_pfn 为 0x10 ，order 为 1 ，最后的计算结果 buddy_pfn 为 0x12 。buddy 就是内存块 A 的临近内存块 B 了，内存块 B 在页块中的起始地址为 0x12 。

通过 page_is_buddy() 函数来检查内存块 B 是不是空闲的内存块。

static inline int page_is_buddy(struct page *page, struct page *buddy,
							unsigned int order)
{
	if (page_is_guard(buddy) && page_order(buddy) == order) {
		if (page_zone_id(page) != page_zone_id(buddy))
			return 0;

		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

		return 1;
	}

	if (PageBuddy(buddy) && page_order(buddy) == order) {
		/*
		 * zone check is done late to avoid uselessly
		 * calculating zone/node ids for pages that could
		 * never merge.
		 */
		if (page_zone_id(page) != page_zone_id(buddy))
			return 0;

		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

		return 1;
	}
	return 0;
}

		if (!page_is_buddy(page, buddy, order))
			goto done_merging;

为了判断内存块 B 与内存块 A 是否为伙伴关系，需要判断 3 个条件。

内存块 B 要在伙伴系统中。
内存块 B 的 order 和内存块 A 的 order 相同。
内存块 B 要和内存块 A 在同一个 zone 里。

只有满足上述 3 个条件，我们才认为内存块 A 和内存块 B 是伙伴块。若 page_is_buddy() 函数返回 1 ，表示相邻的内存块是伙伴块；若返回 0 ，表示没找到伙伴块。

当没有找到伙伴块时，我们可以跳转到 done _merging 标签处进行合并。当我们找到了一个志同道合的空闲伙伴块时，把它从空闲链表中取出来，以便和内存块 A 合并到高一级的空闲链表中。

		/*
		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
		 * merge with it and move up one order.
		 */
		if (page_is_guard(buddy)) {
			clear_page_guard(zone, buddy, order, migratetype);
		} else {
			list_del(&buddy->lru);
			zone->free_area[order].nr_free--;
			rmv_page_order(buddy);
		}
		combined_pfn = buddy_pfn & pfn;
		page = page + (combined_pfn - pfn);
		pfn = combined_pfn;
		order++;

继续寻找符合要求的内存块。这时 combined_pfn 指向内存块 A 的起始地址。order++ 表示继续寻找有没有可能合并的相邻的内存块，这次要查找的内存的 order 等于 2 ，也就是 4 个页面大小的内存块。不断查找附近有没有 order 为 2 的伙伴块。如果在 0x14 位置的内存块 C 不满足合并条件，如内存块 C 不是空闲页面，或者内存块 C 的 order不等于 2 。内存块 C 的 order 等于 3 ，这显然不符合条件。因为没找到 order 为 2 的内存块，所以只能合并内存块 A 和 B ，然后把合并的内存块添加到空闲链表中。

处理一个特殊情况，刚才找到的能合并的最大伙伴块可能不是最大的内存块，能合并的最大的内存块的 order 应该为 MAX_ORDER-1 。

	/*
	 * If this is not the largest possible page, check if the buddy
	 * of the next-highest order is free. If it is, it's possible
	 * that pages are being freed that will coalesce soon. In case,
	 * that is happening, add the free page to the tail of the list
	 * so it's less likely to be used soon and more likely to be merged
	 * as a higher order page
	 */
	if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
		struct page *higher_page, *higher_buddy;
		combined_pfn = buddy_pfn & pfn;
		higher_page = page + (combined_pfn - pfn);
		buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
		higher_buddy = higher_page + (buddy_pfn - combined_pfn);
		if (pfn_valid_within(buddy_pfn) &&
		    page_is_buddy(higher_page, higher_buddy, order + 1)) {
			list_add_tail(&page->lru,
				&zone->free_area[order].free_list[migratetype]);
			goto out;
		}
	}

执行合并功能。

	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);

slub

关键过程分析

slab_alloc_node

在 slab allocator 中存在着多个不同的内存分配接口，其最后都会调用到 slab_alloc_node() 完成内存分配的工作：

static __always_inline void *slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr)

该函数首先会调用 slab_pre_alloc_hook() 进行分配前的检查工作，不通过则返回 NULL，这一步主要是检查分配标志位是否合法等：

	s = slab_pre_alloc_hook(s, gfpflags);
	if (!s)
		return NULL;

获取 tid 和 struct kmem_cache_cpu *c 之后从 kmem_cache_cpu 中获取 freelist 和 page 字段。

redo:
	do {
		tid = this_cpu_read(s->cpu_slab->tid);
		c = raw_cpu_ptr(s->cpu_slab);
	} while (IS_ENABLED(CONFIG_PREEMPT) &&
		 unlikely(tid != READ_ONCE(c->tid)));
		 
	barrier();
	
	object = c->freelist;
	page = c->page;

如果 freelist 为空或者 page 不属于指定的 node 则调用 __slab_alloc 进行内存分配。

static inline int page_to_nid(const struct page *page)
{
	struct page *p = (struct page *)page;

	return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK;
}

static inline int node_match(struct page *page, int node)
{
#ifdef CONFIG_NUMA
	if (node != NUMA_NO_NODE && page_to_nid(page) != node)
		return 0;
#endif
	return 1;
}

	if (unlikely(!object || !node_match(page, node))) {
		object = __slab_alloc(s, gfpflags, node, addr, c);
		stat(s, ALLOC_SLOWPATH);
	}

否则直接从 freelist 上取下 object 。

 else {
		void *next_object = get_freepointer_safe(s, object);
		if (unlikely(!this_cpu_cmpxchg_double(
				s->cpu_slab->freelist, s->cpu_slab->tid,
				object, tid,
				next_object, next_tid(tid)))) {

			note_cmpxchg_failure("slab_alloc", s, tid);
			goto redo;
		}
		prefetch_freepointer(s, next_object);
		stat(s, ALLOC_FASTPATH);
	}

最后如果 __GFP_ZERO 置位则将获取的 object 清零，之后返回 object 。

	if (unlikely(gfpflags & __GFP_ZERO) && object)
		memset(object, 0, s->object_size);

	slab_post_alloc_hook(s, gfpflags, 1, &object);

	return object;

___slab_alloc

__slab_alloc 只是对 ___slab_alloc 的一个简单包装，真正逻辑在 ___slab_alloc 函数中。

static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
{
	void *p;
	unsigned long flags;

	local_irq_save(flags);
#ifdef CONFIG_PREEMPT
	/*
	 * We may have been preempted and rescheduled on a different
	 * cpu before disabling interrupts. Need to reload cpu area
	 * pointer.
	 */
	c = this_cpu_ptr(s->cpu_slab);
#endif

	p = ___slab_alloc(s, gfpflags, node, addr, c);
	local_irq_restore(flags);
	return p;
}

首先从 kmem_cache_cpu 中获取 page 如果没有则跳转到 new_slab 。

	page = c->page;
	if (!page)
		goto new_slab;

如果 page 和指定的 node 不匹配或者 page 的 flags 检查不通过则调用 deactivate_slab 之后跳转到 new_slab 。

redo:

	if (unlikely(!node_match(page, node))) {
		int searchnode = node;

		if (node != NUMA_NO_NODE && !node_present_pages(node))
			searchnode = node_to_mem_node(node);

		if (unlikely(!node_match(page, searchnode))) {
			stat(s, ALLOC_NODE_MISMATCH);
			deactivate_slab(s, page, c->freelist, c);
			goto new_slab;
		}
	}

	/*
	 * By rights, we should be searching for a slab page that was
	 * PFMEMALLOC but right now, we are losing the pfmemalloc
	 * information when the page leaves the per-cpu allocator
	 */
	if (unlikely(!pfmemalloc_match(page, gfpflags))) {
		deactivate_slab(s, page, c->freelist, c);
		goto new_slab;
	}

判断 freelist 是否为空，如果 freelist 为空则调用 get_freelist 获取 page 对应的 freelist 并将 page 的 freelist 字段置空。如果 get_freelist 也获取不到 freelist 则将 kmem_cache_cpu 的 page 置空然后跳转至 new_slab 。
如果找到了 freelist 则将 freelist 中第一个 object 取出并更新 freelist 和 tid 。

static inline void *get_freelist(struct kmem_cache *s, struct page *page)
{
	struct page new;
	unsigned long counters;
	void *freelist;

	do {
		freelist = page->freelist;
		counters = page->counters;

		new.counters = counters;
		VM_BUG_ON(!new.frozen);

		new.inuse = page->objects;
		new.frozen = freelist != NULL;

	} while (!__cmpxchg_double_slab(s, page,
		freelist, counters,
		NULL, new.counters,
		"get_freelist"));

	return freelist;
}

	/* must check again c->freelist in case of cpu migration or IRQ */
	freelist = c->freelist;
	if (freelist)
		goto load_freelist;

	freelist = get_freelist(s, page);

	if (!freelist) {
		c->page = NULL;
		stat(s, DEACTIVATE_BYPASS);
		goto new_slab;
	}

	stat(s, ALLOC_REFILL);

load_freelist:
	/*
	 * freelist is pointing to the list of objects to be used.
	 * page is pointing to the page from which the objects are obtained.
	 * That page must be frozen for per cpu allocations to work.
	 */
	VM_BUG_ON(!c->page->frozen);
	c->freelist = get_freepointer(s, freelist);
	c->tid = next_tid(c->tid);
	return freelist;

在 new_slab 阶段，首先如果 kmem_cache_cpu 的 partial 不为空则从其中取出一个 page 来更新 kmem_cache_cpu 的 page 字段然后跳转至 redo 阶段。

#define slub_percpu_partial(c)		((c)->partial)
#define slub_set_percpu_partial(c, p) ({ slub_percpu_partial(c) = (p)->next; })

new_slab:

	if (slub_percpu_partial(c)) {
		page = c->page = slub_percpu_partial(c);
		slub_set_percpu_partial(c, page);
		stat(s, CPU_PARTIAL_ALLOC);
		goto redo;
	}

调用 new_slab_objects 函数获取 freelist ，如果无法获取则调用 slab_out_of_memory 函数打印错误信息，之后进行一系列的检查。

	freelist = new_slab_objects(s, gfpflags, node, &c);

	if (unlikely(!freelist)) {
		slab_out_of_memory(s, gfpflags, node);
		return NULL;
	}

	page = c->page;
	if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
		goto load_freelist;

	/* Only entered in the debug case */
	if (kmem_cache_debug(s) &&
			!alloc_debug_processing(s, page, freelist, addr))
		goto new_slab;	/* Slab failed checks. Next slab needed */

	deactivate_slab(s, page, get_freepointer(s, freelist), c);
	return freelist;

new_slab_objects

首先尝试通过 get_partial 函数获取 freelist 。 get_partial 函数有 get_partial_node 和 get_any_partial 两种方法。

static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
{
	return s->node[node];
}

static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
		struct kmem_cache_cpu *c)
{
	void *object;
	int searchnode = node;

	if (node == NUMA_NO_NODE)
		searchnode = numa_mem_id();
	else if (!node_present_pages(node))
		searchnode = node_to_mem_node(node);

	object = get_partial_node(s, get_node(s, searchnode), c, flags);
	if (object || node != NUMA_NO_NODE)
		return object;

	return get_any_partial(s, flags, c);
}

    freelist = get_partial(s, flags, node, c);

	if (freelist)
		return freelist;

get_partial_node 是从 kmem_cache_node 的 partial 链表中获取 object，同时也获取一些额外的 object 放到 kmem_cache_cpu 的 partial 中。

static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
	struct page *oldpage;
	int pages;
	int pobjects;

	preempt_disable();
	do {
		pages = 0;
		pobjects = 0;
		oldpage = this_cpu_read(s->cpu_slab->partial);

		if (oldpage) {
			pobjects = oldpage->pobjects;
			pages = oldpage->pages;
			if (drain && pobjects > s->cpu_partial) {
				unsigned long flags;
				/*
				 * partial array is full. Move the existing
				 * set to the per node partial list.
				 */
				local_irq_save(flags);
				unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
				local_irq_restore(flags);
				oldpage = NULL;
				pobjects = 0;
				pages = 0;
				stat(s, CPU_PARTIAL_DRAIN);
			}
		}

		pages++;
		pobjects += page->objects - page->inuse;

		page->pages = pages;
		page->pobjects = pobjects;
		page->next = oldpage;

	} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
								!= oldpage);
	if (unlikely(!s->cpu_partial)) {
		unsigned long flags;

		local_irq_save(flags);
		unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
		local_irq_restore(flags);
	}
	preempt_enable();
#endif
}


static inline void *acquire_slab(struct kmem_cache *s,
		struct kmem_cache_node *n, struct page *page,
		int mode, int *objects)
{
	void *freelist;
	unsigned long counters;
	struct page new;

	lockdep_assert_held(&n->list_lock);

	/*
	 * Zap the freelist and set the frozen bit.
	 * The old freelist is the list of objects for the
	 * per cpu allocation list.
	 */
	freelist = page->freelist;
	counters = page->counters;
	new.counters = counters;
	*objects = new.objects - new.inuse;
	if (mode) {
		new.inuse = page->objects;
		new.freelist = NULL;
	} else {
		new.freelist = freelist;
	}

	VM_BUG_ON(new.frozen);
	new.frozen = 1;

	if (!__cmpxchg_double_slab(s, page,
			freelist, counters,
			new.freelist, new.counters,
			"acquire_slab"))
		return NULL;

	remove_partial(n, page);
	WARN_ON(!freelist);
	return freelist;
}

static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
				struct kmem_cache_cpu *c, gfp_t flags)
{
	struct page *page, *page2;
	void *object = NULL;
	unsigned int available = 0;
	int objects;

	/*
	 * Racy check. If we mistakenly see no partial slabs then we
	 * just allocate an empty slab. If we mistakenly try to get a
	 * partial slab and there is none available then get_partials()
	 * will return NULL.
	 */
	if (!n || !n->nr_partial)
		return NULL;

	spin_lock(&n->list_lock);
	list_for_each_entry_safe(page, page2, &n->partial, lru) {
		void *t;

		if (!pfmemalloc_match(page, flags))
			continue;

		t = acquire_slab(s, n, page, object == NULL, &objects);
		if (!t)
			break;

		available += objects;
		if (!object) {
			c->page = page;
			stat(s, ALLOC_FROM_PARTIAL);
			object = t;
		} else {
			put_cpu_partial(s, page, 0);
			stat(s, CPU_PARTIAL_NODE);
		}
		if (!kmem_cache_has_cpu_partial(s)
			|| available > slub_cpu_partial(s) / 2)
			break;

	}
	spin_unlock(&n->list_lock);
	return object;
}

get_any_partial 遍历 zonelist 中的 zone 然后找到 zone 对应的 node 并调用 get_partial_node 获取 object 。

	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		zonelist = node_zonelist(mempolicy_slab_node(), flags);
		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
			struct kmem_cache_node *n;

			n = get_node(s, zone_to_nid(zone));

			if (n && cpuset_zone_allowed(zone, flags) &&
					n->nr_partial > s->min_partial) {
				object = get_partial_node(s, n, c, flags);
				if (object) {
					/*
					 * Don't check read_mems_allowed_retry()
					 * here - if mems_allowed was updated in
					 * parallel, that was a harmless race
					 * between allocation and the cpuset
					 * update
					 */
					return object;
				}
			}
		}
	} while (read_mems_allowed_retry(cpuset_mems_cookie));
#endif
	return NULL;

如果 get_partial 无法获取 freelist 则调用 new_slab 获取一个新的 slab 从而获取 freelist 。

	page = new_slab(s, flags, node);
	if (page) {
		c = raw_cpu_ptr(s->cpu_slab);
		if (c->page)
			flush_slab(s, c);

		/*
		 * No other reference to the page yet so we can
		 * muck around with it freely without cmpxchg
		 */
		freelist = page->freelist;
		page->freelist = NULL;

		stat(s, ALLOC_SLAB);
		c->page = page;
		*pc = c;
	} else
		freelist = NULL;

	return freelist;

new_slab 实际调用的是 allocate_slab 。

static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
		gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
		flags &= ~GFP_SLAB_BUG_MASK;
		pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
				invalid_mask, &invalid_mask, flags, &flags);
		dump_stack();
	}

	return allocate_slab(s,
		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
}

allocate_slab 函数中在设置完 flag 后有两次机会调用 alloc_slab_page 函数获取 slab 所需的内存。alloc_slab_page 调用 __alloc_pages_node 函数向 buddy system 申请内存页。

static inline struct page *alloc_slab_page(struct kmem_cache *s,
		gfp_t flags, int node, struct kmem_cache_order_objects oo)
{
	struct page *page;
	unsigned int order = oo_order(oo);

	if (node == NUMA_NO_NODE)
		page = alloc_pages(flags, order);
	else
		page = __alloc_pages_node(node, flags, order);

	if (page && memcg_charge_slab(page, flags, order, s)) {
		__free_pages(page, order);
		page = NULL;
	}

	return page;
}

	flags &= gfp_allowed_mask;

	if (gfpflags_allow_blocking(flags))
		local_irq_enable();

	flags |= s->allocflags;

	/*
	 * Let the initial higher-order allocation fail under memory pressure
	 * so we fall-back to the minimum order allocation.
	 */
	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
	if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
		alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);

	page = alloc_slab_page(s, alloc_gfp, node, oo);
	if (unlikely(!page)) {
		oo = s->min;
		alloc_gfp = flags;
		/*
		 * Allocation may have failed due to fragmentation.
		 * Try a lower order alloc if possible
		 */
		page = alloc_slab_page(s, alloc_gfp, node, oo);
		if (unlikely(!page))
			goto out;
		stat(s, ORDER_FALLBACK);
	}

在获取完内存后会进行一些相关的设置最后返回 slub 。

	page->objects = oo_objects(oo);

	order = compound_order(page);
	page->slab_cache = s;
	__SetPageSlab(page);
	if (page_is_pfmemalloc(page))
		SetPageSlabPfmemalloc(page);

	kasan_poison_slab(page);

	start = page_address(page);

	setup_page_debug(s, start, order);

	shuffle = shuffle_freelist(s, page);

	if (!shuffle) {
		start = fixup_red_left(s, start);
		start = setup_object(s, page, start);
		page->freelist = start;
		for (idx = 0, p = start; idx < page->objects - 1; idx++) {
			next = p + s->size;
			next = setup_object(s, page, next);
			set_freepointer(s, p, next);
			p = next;
		}
		set_freepointer(s, p, NULL);
	}

	page->inuse = page->objects;
	page->frozen = 1;

out:
	if (gfpflags_allow_blocking(flags))
		local_irq_disable();
	if (!page)
		return NULL;

	mod_lruvec_page_state(page,
		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
		1 << oo_order(oo));

	inc_slabs_node(s, page_to_nid(page), page->objects);

	return page;

deactivate_slab

首先将 page 中的 free_list 存储的 object 全部“取出”。

	if (page->freelist) {
		stat(s, DEACTIVATE_REMOTE_FREES);
		tail = DEACTIVATE_TO_TAIL;
	}

	/*
	 * Stage one: Free all available per cpu objects back
	 * to the page freelist while it is still frozen. Leave the
	 * last one.
	 *
	 * There is no need to take the list->lock because the page
	 * is still frozen.
	 */
	while (freelist && (nextfree = get_freepointer(s, freelist))) {
		void *prior;
		unsigned long counters;

		do {
			prior = page->freelist;
			counters = page->counters;
			set_freepointer(s, freelist, prior);
			new.counters = counters;
			new.inuse--;
			VM_BUG_ON(!new.frozen);

		} while (!__cmpxchg_double_slab(s, page,
			prior, counters,
			freelist, new.counters,
			"drain percpu freelist"));

		freelist = nextfree;
	}

	/*
	 * Stage two: Ensure that the page is unfrozen while the
	 * list presence reflects the actual number of objects
	 * during unfreeze.
	 *
	 * We setup the list membership and then perform a cmpxchg
	 * with the count. If there is a mismatch then the page
	 * is not unfrozen but the page is on the wrong list.
	 *
	 * Then we restart the process which may have to remove
	 * the page from the list that we just put it on again
	 * because the number of objects in the slab may have
	 * changed.
	 */
redo:

	old.freelist = page->freelist;
	old.counters = page->counters;
	VM_BUG_ON(!old.frozen);

	/* Determine target state of the slab */
	new.counters = old.counters;
	if (freelist) {
		new.inuse--;
		set_freepointer(s, freelist, old.freelist);
		new.freelist = freelist;
	} else
		new.freelist = old.freelist;

	new.frozen = 0;

判断 slub 当前处在 M_FREE ，M_PARTIAL ，M_FULL 中的哪个状态。

	if (!new.inuse && n->nr_partial >= s->min_partial)
		m = M_FREE;
	else if (new.freelist) {
		m = M_PARTIAL;
		if (!lock) {
			lock = 1;
			/*
			 * Taking the spinlock removes the possiblity
			 * that acquire_slab() will see a slab page that
			 * is frozen
			 */
			spin_lock(&n->list_lock);
		}
	} else {
		m = M_FULL;
		if (kmem_cache_debug(s) && !lock) {
			lock = 1;
			/*
			 * This also ensures that the scanning of full
			 * slabs from diagnostic functions will not see
			 * any frozen slabs.
			 */
			spin_lock(&n->list_lock);
		}
	}

如果上一个状态 l 和当前状态 m 不同则考虑切换状态，即在 kmem_cache_node 中的 partial 和 full 链表之间转移。最后将 kmem_cache_cpu 的 page 和 freelist 置空。

	if (l != m) {
		if (l == M_PARTIAL)
			remove_partial(n, page);
		else if (l == M_FULL)
			remove_full(s, n, page);

		if (m == M_PARTIAL)
			add_partial(n, page, tail);
		else if (m == M_FULL)
			add_full(s, n, page);
	}

	l = m;
	if (!__cmpxchg_double_slab(s, page,
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab"))
		goto redo;

	if (lock)
		spin_unlock(&n->list_lock);

	if (m == M_PARTIAL)
		stat(s, tail);
	else if (m == M_FULL)
		stat(s, DEACTIVATE_FULL);
	else if (m == M_FREE) {
		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, page);
		stat(s, FREE_SLAB);
	}

	c->page = NULL;
	c->freelist = NULL;

do_slab_free

在 slab allocator 中存在着多个不同的内存释放接口，其最后都会调用到 do_slab_free() 完成内存释放的工作，需要注意的是该函数允许释放已经连接成一条 freelist 且已经加密好的多个对象。

与分配类似，释放同样分为快速路径与慢速路径。

快速路径比较简单，主要就是对比待释放对象所属 slab 是否为 percpu slab，若是则直接挂回去即可，遵循 LIFO 机制。

慢速路径则是调用 __slab_free 释放。

static __always_inline void do_slab_free(struct kmem_cache *s,
				struct page *page, void *head, void *tail,
				int cnt, unsigned long addr)
{
	void *tail_obj = tail ? : head;
	struct kmem_cache_cpu *c;
	unsigned long tid;
redo:
	/*
	 * Determine the currently cpus per cpu slab.
	 * The cpu may change afterward. However that does not matter since
	 * data is retrieved via this pointer. If we are on the same cpu
	 * during the cmpxchg then the free will succeed.
	 */
	do {
		tid = this_cpu_read(s->cpu_slab->tid);
		c = raw_cpu_ptr(s->cpu_slab);
	} while (IS_ENABLED(CONFIG_PREEMPT) &&
		 unlikely(tid != READ_ONCE(c->tid)));

	/* Same with comment on barrier() in slab_alloc_node() */
	barrier();

	if (likely(page == c->page)) {
		set_freepointer(s, tail_obj, c->freelist);

		if (unlikely(!this_cpu_cmpxchg_double(
				s->cpu_slab->freelist, s->cpu_slab->tid,
				c->freelist, tid,
				head, next_tid(tid)))) {

			note_cmpxchg_failure("slab_free", s, tid);
			goto redo;
		}
		stat(s, FREE_FASTPATH);
	} else
		__slab_free(s, page, head, tail_obj, cnt, addr);

}

__slab_free

首先会将待释放 freelist 所属 page 的 freelist 连接到待释放 freelist 的 tail 后边。

这里有两种情况：

如果原 page 上无空闲对象且 page 未被冻结则设置page 将被冻结（new.frozen=1）
如果所有的对象都将为空闲对象且 page 未被冻结则获取 page 所对应的 kmem_cache_node

	do {
		if (unlikely(n)) {
			spin_unlock_irqrestore(&n->list_lock, flags);
			n = NULL;
		}
		prior = page->freelist;
		counters = page->counters;
		set_freepointer(s, tail, prior);
		new.counters = counters;
		was_frozen = new.frozen;
		new.inuse -= cnt;
		if ((!new.inuse || !prior) && !was_frozen) {

			if (kmem_cache_has_cpu_partial(s) && !prior) {

				/*
				 * Slab was on no list before and will be
				 * partially empty
				 * We can defer the list move and instead
				 * freeze it.
				 */
				new.frozen = 1;

			} else { /* Needs to be taken off a list */

				n = get_node(s, page_to_nid(page));
				/*
				 * Speculatively acquire the list_lock.
				 * If the cmpxchg does not succeed then we may
				 * drop the list_lock without any processing.
				 *
				 * Otherwise the list_lock will synchronize with
				 * other processors updating the list of slabs.
				 */
				spin_lock_irqsave(&n->list_lock, flags);

			}
		}

	} while (!cmpxchg_double_slab(s, page,
		prior, counters,
		head, new.counters,
		"__slab_free"));

如果属于上面第一种情况那么可以考虑把 page 放到 kmem_cache_cpu 的 partial 链表上。

	if (likely(!n)) {

		/*
		 * If we just froze the page then put it onto the
		 * per cpu partial list.
		 */
		if (new.frozen && !was_frozen) {
			put_cpu_partial(s, page, 1);
			stat(s, CPU_PARTIAL_FREE);
		}
		/*
		 * The list lock was not taken therefore no list
		 * activity can be necessary.
		 */
		if (was_frozen)
			stat(s, FREE_FROZEN);
		return;
	}

如果属于上面第二种情况且 kmem_cache_node 的 partial 的 page 数量超过上限 kmem_cache.min_partial 则根据 prior 是否为空判断是从 kmem_cache_node 的 partial 还是 full 链表上移出。之后调用 discard_slab 释放这个 page 。

	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
		goto slab_empty;
	...
slab_empty:
	if (prior) {
		/*
		 * Slab on the partial list.
		 */
		remove_partial(n, page);
		stat(s, FREE_REMOVE_PARTIAL);
	} else {
		/* Slab must be on the full list */
		remove_full(s, n, page);
	}

	spin_unlock_irqrestore(&n->list_lock, flags);
	stat(s, FREE_SLAB);
	discard_slab(s, page);

否则，如果 prior 为空说明在第二种情况的基础上原来 page 上无空闲对象，即原来在 full 链表上，因此需要移动到 partial 链表上。

	/*
	 * Objects left in the slab. If it was not on the partial list before
	 * then add it.
	 */
	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
		if (kmem_cache_debug(s))
			remove_full(s, n, page);
		add_partial(n, page, DEACTIVATE_TO_TAIL);
		stat(s, FREE_ADD_PARTIAL);
	}
	spin_unlock_irqrestore(&n->list_lock, flags);
	return;

discard_slab

首先调用 dec_slabs_node 更新 nr_slabs 和 total_objects 之后调用 free_slab 最终调用 __free_slab 。

static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
{
	struct kmem_cache_node *n = get_node(s, node);

	atomic_long_dec(&n->nr_slabs);
	atomic_long_sub(objects, &n->total_objects);
}

static void free_slab(struct kmem_cache *s, struct page *page)
{
	if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
		call_rcu(&page->rcu_head, rcu_free_slab);
	} else
		__free_slab(s, page);
}

static void discard_slab(struct kmem_cache *s, struct page *page)
{
	dec_slabs_node(s, page_to_nid(page), page->objects);
	free_slab(s, page);
}

在 __free_slab 中经过一系列检查后调用 __free_pages 将内存页释放给 buddy system 。

static void __free_slab(struct kmem_cache *s, struct page *page)
{
	int order = compound_order(page);
	int pages = 1 << order;

	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
		void *p;

		slab_pad_check(s, page);
		for_each_object(p, s, page_address(page),
						page->objects)
			check_object(s, page, p, SLUB_RED_INACTIVE);
	}

	mod_lruvec_page_state(page,
		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
		-pages);

	__ClearPageSlabPfmemalloc(page);
	__ClearPageSlab(page);

	page->mapping = NULL;
	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += pages;
	memcg_uncharge_slab(page, order, s);
	__free_pages(page, order);
}