Linux page migration源码分析

概述

__unmap_and_move函数

step1: Lock the page to be migrated

step2: Insure that writeback is complete.

step3: Lock the new page that we want to move to.

step4: All the page table references to the page are converted to migration entries.

step5-step15: move_to_new_page

step 5- step11

step 12- 15

step 16-18

概述

Linux 内核page migration设计文档_nginux的博客-CSDN博客

前文介绍了page migration迁移的设计思路，且内核文档介绍得知，总计需要18个steps完成迁移过程，本文目标是以源码视角对应到这18个steps上面，加深对于页面迁移的理解。内核源码版本：Linux-5.9。下面再贴一下内核描述的18个steps:

migrate_pages基本调用顺序和示意图：

migrate_pages
    --->unmap_and_move
        --->__unmap_and_move

__unmap_and_move函数

/*
 * __unmap_and_move - try to migrate @page into @newpage.
 * @page:    the page to be migrated
 * @newpage: the destination page of the migration
 * @force:   when zero, never sleep on the page lock or on writeback
 * @mode:    migration mode; MIGRATE_ASYNC never sleeps on the page lock, and
 *           only MIGRATE_SYNC / MIGRATE_SYNC_NO_COPY wait for writeback
 *
 * Returns MIGRATEPAGE_SUCCESS on success. Otherwise returns -EAGAIN (the
 * initial value of rc), -EBUSY (page under writeback in a mode that does not
 * wait), or whatever move_to_new_page() returned.
 *
 * The "step N" markers below refer to the article's mapping of this code onto
 * the 18-step migration sequence from the kernel documentation.
 */
static int __unmap_and_move(struct page *page, struct page *newpage,
				int force, enum migrate_mode mode)
{
	int rc = -EAGAIN;
	int page_was_mapped = 0;
	struct anon_vma *anon_vma = NULL;
	bool is_lru = !__PageMovable(page);

	/*
	 * step 1: Lock the page to be migrated.
	 * If trylock fails, some callers must not fall back to a sleeping lock.
	 */
	if (!trylock_page(page)) {
		/*
		 * !force or asynchronous migration: give up instead of calling
		 * lock_page(), because lock_page() may sleep.
		 */
		if (!force || mode == MIGRATE_ASYNC)
			goto out;

		/*
		 * It's not safe for direct compaction to call lock_page.
		 * For example, during page readahead pages are added locked
		 * to the LRU. Later, when the IO completes the pages are
		 * marked uptodate and unlocked. However, the queueing
		 * could be merging multiple pages for one bio (e.g.
		 * mpage_readahead). If an allocation happens for the
		 * second or third page, the process can end up locking
		 * the same page twice and deadlocking. Rather than
		 * trying to be clever about what pages can be locked,
		 * avoid the use of lock_page for direct compaction
		 * altogether.
		 */
		/*
		 * Tasks running with PF_MEMALLOC set (per the article: direct
		 * compaction or kswapd) must not block on the lock either,
		 * otherwise they may deadlock.
		 */
		if (current->flags & PF_MEMALLOC)
			goto out;

		lock_page(page);
	}
	/* step 2: Ensure that writeback is complete. */
	if (PageWriteback(page)) {
		/*
		 * Only in the case of a full synchronous migration is it
		 * necessary to wait for PageWriteback. In the async case,
		 * the retry loop is too short and in the sync-light case,
		 * the overhead of stalling is too much
		 */
		/*
		 * The SYNC modes are allowed to wait for writeback; the wait
		 * itself is done by wait_on_page_writeback(page) below.
		 */
		switch (mode) {
		case MIGRATE_SYNC:
		case MIGRATE_SYNC_NO_COPY:
			break;
		default:
			/* Any other mode: migration of this page fails. */
			rc = -EBUSY;
			goto out_unlock;
		}
		/* Even in a SYNC mode, do not wait unless forced (rc stays -EAGAIN). */
		if (!force)
			goto out_unlock;
		wait_on_page_writeback(page);
	}

	/*
	 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
	 * we cannot notice that anon_vma is freed while we migrates a page.
	 * This get_anon_vma() delays freeing anon_vma pointer until the end
	 * of migration. File cache pages are no problem because of page_lock()
	 * File Caches may use write_page() or lock_page() in migration, then,
	 * just care Anon page here.
	 *
	 * Only page_get_anon_vma() understands the subtleties of
	 * getting a hold on an anon_vma from outside one of its mms.
	 * But if we cannot get anon_vma, then we won't need it anyway,
	 * because that implies that the anon page is no longer mapped
	 * (and cannot be remapped so long as we hold the page lock).
	 */
	if (PageAnon(page) && !PageKsm(page))
		anon_vma = page_get_anon_vma(page);

	/*
	 * Block others from accessing the new page when we get around to
	 * establishing additional references. We are usually the only one
	 * holding a reference to newpage at this point. We used to have a BUG
	 * here if trylock_page(newpage) fails, but would like to allow for
	 * cases where there might be a race with the previous use of newpage.
	 * This is much like races on refcount of oldpage: just don't BUG().
	 */
	/* step 3: Lock the new page that we want to move to. */
	if (unlikely(!trylock_page(newpage)))
		goto out_unlock;

	/*
	 * Non-LRU (__PageMovable) pages skip the unmap/remap handling below
	 * and go straight to move_to_new_page().
	 */
	if (unlikely(!is_lru)) {
		rc = move_to_new_page(newpage, page, mode);
		goto out_unlock_both;
	}

	/*
	 * Corner case handling:
	 * 1. When a new swap-cache page is read into, it is added to the LRU
	 * and treated as swapcache but it has no rmap yet.
	 * Calling try_to_unmap() against a page->mapping==NULL page will
	 * trigger a BUG.  So handle it here.
	 * 2. An orphaned page (see truncate_complete_page) might have
	 * fs-private metadata. The page can be picked up due to memory
	 * offlining.  Everywhere else except page reclaim, the page is
	 * invisible to the vm, so the page can not be migrated.  So try to
	 * free the metadata, so the page can be freed.
	 */
	/* As the comment above says, this branch is the corner-case handling. */
	if (!page->mapping) {
		VM_BUG_ON_PAGE(PageAnon(page), page);
		if (page_has_private(page)) {
			try_to_free_buffers(page);
			goto out_unlock_both;
		}
	} else if (page_mapped(page)) {
		/*
		 * step 4: All the page table references to the page are
		 * converted to migration entries.
		 */
		/* Establish migration ptes */
		VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
				page);
		try_to_unmap(page,
			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
		page_was_mapped = 1;
	}
	/*
	 * Whether the page was never mapped in the first place (e.g. page
	 * cache created by read/write) or is no longer mapped after
	 * try_to_unmap(), call move_to_new_page(). This is the key function:
	 * it covers steps 5-15 of the 18-step sequence.
	 */
	if (!page_mapped(page))
		/* steps 5-15 */
		rc = move_to_new_page(newpage, page, mode);

	/*
	 * step 16: if the page had been mapped, remove the migration ptes,
	 * targeting newpage on success or the old page on failure.
	 */
	if (page_was_mapped)
		remove_migration_ptes(page,
			rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);

/* step 17: unlock both pages and drop the anon_vma reference, if any. */
out_unlock_both:
	unlock_page(newpage);
out_unlock:
	/* Drop an anon_vma reference if we took one */
	if (anon_vma)
		put_anon_vma(anon_vma);
	unlock_page(page);
out:
	/*
	 * If migration is successful, decrease refcount of the newpage
	 * which will not free the page because new page owner increased
	 * refcounter. As well, if it is LRU page, add the page to LRU
	 * list in here. Use the old state of the isolated source page to
	 * determine if we migrated a LRU page. newpage was already unlocked
	 * and possibly modified by its owner - don't rely on the page
	 * state.
	 */
	if (rc == MIGRATEPAGE_SUCCESS) {
		if (unlikely(!is_lru))
			put_page(newpage);
		else
			/* step 18: put the new page back on the LRU. */
			putback_lru_page(newpage);
	}

	return rc;
}

step1: Lock the page to be migrated

如果加锁失败，有些情况下就不能强行lock_page，因为会陷入sleep:

1. MIGRATE_ASYNC异步迁移模式。

因为MIGRATE_ASYNC异步模式根据内核定义是不能阻塞的：

MIGRATE_SYNC_LIGHT: 可以接受一定程度的block，但是不能因为writepage回写而block，因为这个block时间可能很长，不符合“LIGHT”的定义初衷。

MIGRATE_SYNC: 可以随意block.

MIGRATE_SYNC_NO_COPY: 可以block，但是不能由CPU执行page内容的复制操作（DMA是可以的）。

2.PF_MEMALLOC 设置的进程，主要是direct compaction或者kswapd，避免死锁。

step2: Insure that writeback is complete.

是否等待writeback要看迁移模式：

MIGRATE_SYNC和MIGRATE_SYNC_NO_COPY： 两种迁移模式block等待writeback。

其他模式下不会等待此page writeback，意味着该page迁移失败。

step3: Lock the new page that we want to move to.

使用trylock_page尝试加锁新page，如果失败也不BUG。

step4: All the page table references to the page are converted to migration entries.

对于mapped的page调用try_to_unmap解除映射，注意：

并不是所有可迁移页面都是page_mapped。比如read/write产生的page cache
try_to_unmap会将page的所有映射解除。比如page被3个进程共享，那么3个映射全部解除。
try_to_unmap会将page pte设置成migration entry。这样的好处就是，进程继续访问unmap的page之后会触发page fault，并且内核根据pte发现是migration的就等待迁移完成，自然就完成了缺页处理。

step5-step15: move_to_new_page

5-15步骤封装在move_to_new_page函数实现，需要注意对于不同类型的page处理流程上并不相同。以Anon Page为例，对于不在swap cache中的匿名页，page_mapping(page)返回NULL（注意page->mapping本身并不为NULL，匿名页依靠它保持PageAnon类型），故调用migrate_page函数：


/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * @newpage: destination page
 * @page:    source page being migrated
 * @mode:    migration mode, passed through to the migration callback
 *
 * Note: this is an abridged excerpt; the "..." lines mark code that the
 * article left out.
 *
 * Return value:
 *   < 0 - error code
 *  MIGRATEPAGE_SUCCESS - success
 */
static int move_to_new_page(struct page *newpage, struct page *page,
				enum migrate_mode mode)
{
	struct address_space *mapping;
	int rc = -EAGAIN;
	bool is_lru = !__PageMovable(page);
    ...

	mapping = page_mapping(page);

	/*
	 * LRU pages: pick the migration routine based on the mapping.
	 *  - no mapping:                     migrate_page()
	 *  - mapping with ->migratepage:     the address_space's own callback
	 *  - mapping without ->migratepage:  fallback_migrate_page()
	 * (The non-LRU branch is not shown in this excerpt.)
	 */
	if (likely(is_lru)) {
		if (!mapping)
			rc = migrate_page(mapping, newpage, page, mode);
		else if (mapping->a_ops->migratepage)
			/*
			 * Most pages have a mapping and most filesystems
			 * provide a migratepage callback. Anonymous pages
			 * are part of swap space which also has its own
			 * migratepage callback. This is the most common path
			 * for page migration.
			 */
			rc = mapping->a_ops->migratepage(mapping, newpage,
							page, mode);
		else
			rc = fallback_migrate_page(mapping, newpage,
							page, mode);
	} 
    ...

	/*
	 * When successful, old pagecache page->mapping must be cleared before
	 * page is freed; but stats require that PageAnon be left as PageAnon.
	 */
	if (rc == MIGRATEPAGE_SUCCESS) {
		if (__PageMovable(page)) {
			VM_BUG_ON_PAGE(!PageIsolated(page), page);

			/*
			 * We clear PG_movable under page_lock so any compactor
			 * cannot try to migrate this page.
			 */
			__ClearPageIsolated(page);
		}

		/*
		 * Anonymous and movable page->mapping will be cleared by
		 * free_pages_prepare so don't reset it here for keeping
		 * the type to work PageAnon, for example.
		 */
		if (!PageMappingFlags(page))
			page->mapping = NULL;

		/* Flush the dcache for newpage unless it is a ZONE_DEVICE page. */
		if (likely(!is_zone_device_page(newpage)))
			flush_dcache_page(newpage);

	}
out:
	return rc;
}

/*
 * migrate_page - migration routine that move_to_new_page() calls directly
 * for pages without a mapping.
 * @mapping: address_space of the page (NULL in the anon-page walk-through)
 * @newpage: destination page
 * @page:    source page; must not be under writeback
 * @mode:    migration mode; selects between copying and state-only transfer
 *
 * Returns MIGRATEPAGE_SUCCESS, or the error from
 * migrate_page_move_mapping().
 */
int migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page,
		enum migrate_mode mode)
{
	int rc;

	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
	/* steps 5-11: switch the mapping over to newpage (extra_count = 0). */
	rc = migrate_page_move_mapping(mapping, newpage, page, 0);

	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;
	/*
	 * steps 12-15: in every mode except MIGRATE_SYNC_NO_COPY call
	 * migrate_page_copy(); in MIGRATE_SYNC_NO_COPY call only
	 * migrate_page_states().
	 * NOTE(review): presumably migrate_page_copy() copies the page data as
	 * well as its state - confirm against its definition.
	 */
	if (mode != MIGRATE_SYNC_NO_COPY)
		migrate_page_copy(newpage, page);
	else
		migrate_page_states(newpage, page);
	return MIGRATEPAGE_SUCCESS;
}

step 5- step11

这里我们以anon page调用的migrate_page为例：

step5: The i_pages lock is taken，使用i_pages加锁保护radix tree。加锁之后所有访问radix tree上该page的进程会被block，所以操作完应该尽快解锁。

step6:The refcount of the page is examined and we back out if references remain
otherwise we know that we are the only one referencing this page.检查page->_refcount，如果只有我们引用该page就继续迁移，否则退出。

step7:The radix tree is checked and if it does not contain the pointer to this
page then we back out because someone else modified the radix tree.

检查radix tree是否包含被迁移的page，如果是anon page就简单了，因为anon page本就不在radix tree当中。而在radix-tree中的page就要更新其指向到新的page。

step 8: The new page is prepped with some settings from the old page so that
accesses to the new page will discover a page with the correct settings.

准备新page，主要是设置new page的index, mapping，__SetPageSwapBacked，SetPageDirty等。

step9 : The radix tree is changed to point to the new page.

radix tree更新指向到新page

step 10: The reference count of the old page is dropped，由于老的page已从radix tree中移除，故refcount减少1（THP则减少thp_nr_pages(page)）。

step 11: The radix tree lock is dropped，radix tree指向更新到新page已完成，故可以释放锁了。


/*
 * Replace the page in the mapping.
 *
 * @mapping:     address_space the page belongs to, or NULL
 * @newpage:     page that takes over the old page's slot(s)
 * @page:        page being migrated
 * @extra_count: additional references the caller expects, added on top of
 *               expected_page_refs()
 *
 * Returns MIGRATEPAGE_SUCCESS, or -EAGAIN when the reference count or the
 * i_pages entry does not match what is expected.
 *
 * Note: this is an abridged excerpt; the "..." line marks code that the
 * article left out.
 *
 * The number of remaining references must be:
 * 1 for anonymous pages without a mapping
 * 2 for pages with a mapping
 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 */
int migrate_page_move_mapping(struct address_space *mapping,
		struct page *newpage, struct page *page, int extra_count)
{
	XA_STATE(xas, &mapping->i_pages, page_index(page));
	struct zone *oldzone, *newzone;
	int dirty;
	int expected_count = expected_page_refs(mapping, page) + extra_count;
	/*
	 * Pages without a mapping are the simple case: they are not in the
	 * radix tree (i_pages), so only newpage's fields need updating.
	 */
	if (!mapping) {
		/* Anonymous page without mapping */
		if (page_count(page) != expected_count)
			return -EAGAIN;

		/* No turning back from here */
		newpage->index = page->index;
		newpage->mapping = page->mapping;
		if (PageSwapBacked(page))
			__SetPageSwapBacked(newpage);

		return MIGRATEPAGE_SUCCESS;
	}

	oldzone = page_zone(page);
	newzone = page_zone(newpage);

	/* step 5: take the i_pages lock (and disable interrupts). */
	xas_lock_irq(&xas);
	/*
	 * steps 6-7: check the refcount, and check that the radix tree slot
	 * still points at this page; back out otherwise.
	 */
	if (page_count(page) != expected_count || xas_load(&xas) != page) {
		xas_unlock_irq(&xas);
		return -EAGAIN;
	}

	/* Back out as well if the refcount cannot be frozen at expected_count. */
	if (!page_ref_freeze(page, expected_count)) {
		xas_unlock_irq(&xas);
		return -EAGAIN;
	}

	/*
	 * Now we know that no one else is looking at the page:
	 * no turning back from here.
	 */
	/* step 8: prepare newpage with settings taken from the old page. */
	newpage->index = page->index;
	newpage->mapping = page->mapping;
	page_ref_add(newpage, thp_nr_pages(page)); /* add cache reference */
	if (PageSwapBacked(page)) {
		__SetPageSwapBacked(newpage);
		if (PageSwapCache(page)) {
			SetPageSwapCache(newpage);
			set_page_private(newpage, page_private(page));
		}
	} else {
		VM_BUG_ON_PAGE(PageSwapCache(page), page);
	}

	/* Move dirty while page refs frozen and newpage not yet exposed */
	dirty = PageDirty(page);
	if (dirty) {
		ClearPageDirty(page);
		SetPageDirty(newpage);
	}
	/* step 9: point the radix tree slot at newpage. */
	xas_store(&xas, newpage);
	/* For a transparent huge page, also store newpage in the next HPAGE_PMD_NR - 1 slots. */
	if (PageTransHuge(page)) {
		int i;

		for (i = 1; i < HPAGE_PMD_NR; i++) {
			xas_next(&xas);
			xas_store(&xas, newpage);
		}
	}

	/*
	 * Drop cache reference from old page by unfreezing
	 * to one less reference.
	 * We know this isn't the last reference.
	 */
	/*
	 * step 10: the old page's refcount drops accordingly, because the
	 * radix tree no longer points at it.
	 */
	page_ref_unfreeze(page, expected_count - thp_nr_pages(page));

	/*
	 * step 11: radix tree update is done, drop the lock. Only the lock is
	 * released here; interrupts are re-enabled separately by
	 * local_irq_enable() below.
	 * NOTE(review): presumably the elided code runs with interrupts still
	 * disabled - confirm against the full source.
	 */
	xas_unlock(&xas);
    ...
	local_irq_enable();

	return MIGRATEPAGE_SUCCESS;
}