Table of Contents
- Preface
- 1. pick_next_task
- 2. pick_next_task_fair
- References
Preface
When the kernel executes the __schedule function to switch tasks, __schedule calls pick_next_task to have the scheduler pick the most suitable process to run from the runqueue, as shown below:
static void __sched notrace __schedule(bool preempt)
{
    struct task_struct *prev, *next;
    struct pin_cookie cookie;
    struct rq *rq;
    int cpu;

    cpu = smp_processor_id();
    rq = cpu_rq(cpu);
    prev = rq->curr;
    /* ... */
    cookie = lockdep_pin_lock(&rq->lock);
    /* ... */
    next = pick_next_task(rq, prev, cookie);
    /* ... */
}
Next we analyze the pick_next_task function.
1. pick_next_task
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
const struct sched_class *class = &fair_sched_class;
struct task_struct *p;
/*
* Optimization: we know that if all tasks are in
* the fair class we can call that function directly:
*/
(1)
// If the scheduling class of the currently running process (the one about to
// be switched out) is the completely fair scheduling class, and the number of
// runnable tasks on the processor runqueue struct rq equals the number of
// runnable tasks on the CFS runqueue, then every runnable task on this
// processor is on the CFS runqueue, so call the CFS class's pick_next_task
// directly.
if (likely(prev->sched_class == class &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq, prev, cookie);
if (unlikely(p == RETRY_TASK))
goto again;
(2)
// If the CFS pick returned no task (NULL), let idle_sched_class's
// pick_next_task choose the next task.
/* assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
p = idle_sched_class.pick_next_task(rq, prev, cookie);
return p;
}
(3)
// If some of the runqueue's tasks sit on other scheduling classes' runqueues,
// walk the scheduling classes from highest to lowest priority, calling each
// class's pick_next_task hook in turn.
again:
for_each_class(class) {
p = class->pick_next_task(rq, prev, cookie);
if (p) {
if (unlikely(p == RETRY_TASK))
goto again;
return p;
}
}
BUG(); /* the idle class will always have a runnable task */
}
(1) This is an optimization, annotated with likely(): the common case is that all tasks on the processor's runqueue belong to the completely fair scheduling class. If the currently running process (the one about to be switched out) uses the completely fair scheduling class, and the number of runnable tasks on the processor runqueue struct rq equals the number of runnable tasks on the CFS runqueue, then every runnable task on this processor is on the CFS runqueue, so the CFS class's pick_next_task is called directly.
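The two counters compared in the likely() test live in the runqueue structures; abridged from kernel/sched/sched.h (h_nr_running counts runnable tasks across the whole group hierarchy of this CFS runqueue):
struct rq {
    unsigned int nr_running;    /* runnable tasks of all classes on this CPU */
    /* ... */
    struct cfs_rq cfs;          /* the CFS runqueue embedded in rq */
    /* ... */
};

/* CFS-related fields in a runqueue */
struct cfs_rq {
    unsigned int nr_running;    /* entities queued directly on this cfs_rq */
    unsigned int h_nr_running;  /* tasks in this hierarchy ("h" = hierarchical) */
    /* ... */
};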
const struct sched_class fair_sched_class;
/*
* All the scheduling class methods:
*/
const struct sched_class fair_sched_class = {
.next = &idle_sched_class,
.pick_next_task = pick_next_task_fair,
};
(2) If the CFS pick returned NULL, idle_sched_class's pick_next_task selects the next task. It simply returns the runqueue's idle task; each processor has one idle task.
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
const struct sched_class idle_sched_class = {
/* .next is NULL */
.pick_next_task = pick_next_task_idle,
};
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
put_prev_task(rq, prev);
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
return rq->idle;
}
struct rq {
struct task_struct *idle;
};
(3) If some of the runqueue's tasks sit on other scheduling classes' runqueues, walk the scheduling classes from highest to lowest priority and call each class's pick_next_task hook in turn.
#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
extern const struct sched_class stop_sched_class;  // highest priority
extern const struct sched_class dl_sched_class;
extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;  // lowest priority
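The walk terminates because each class's .next member points at the next lower-priority class; abridged from the respective kernel/sched/ source files:
/* kernel/sched/stop_task.c */
const struct sched_class stop_sched_class = {
    .next = &dl_sched_class,
    /* ... */
};

/* kernel/sched/deadline.c */
const struct sched_class dl_sched_class = {
    .next = &rt_sched_class,
    /* ... */
};

/* kernel/sched/rt.c */
const struct sched_class rt_sched_class = {
    .next = &fair_sched_class,
    /* ... */
};
fair_sched_class.next points at idle_sched_class (shown earlier), and idle_sched_class leaves .next unset (NULL), which ends the loop.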
again:
for_each_class(class) {
p = class->pick_next_task(rq, prev, cookie);
if (p) {
if (unlikely(p == RETRY_TASK))
goto again;
return p;
}
}
The table below lists the scheduling classes from highest to lowest priority:
Scheduling class | Scheduled entities | Scheduling policies |
---|---|---|
stop_sched_class | Stop tasks (e.g. the migration thread); each processor has one migration thread, used to move processes from the current processor to another | None |
dl_sched_class | Deadline processes (also a kind of real-time process) | SCHED_DEADLINE |
rt_sched_class | Real-time processes | SCHED_FIFO, SCHED_RR |
fair_sched_class | Normal processes | SCHED_NORMAL, SCHED_BATCH, SCHED_IDLE |
idle_sched_class | The idle task (idle-task); each processor has one idle task | None |
A process's scheduling policy is stored in its policy member:
/*
* Scheduling policies
*/
#define SCHED_NORMAL 0
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
#define SCHED_DEADLINE 6
struct task_struct {
    unsigned int policy;
};
Note that SCHED_IDLE, a policy for normal processes, is unrelated to the idle task. The idle task belongs to idle_sched_class and is handled by a separate kernel mechanism; each processor has one idle thread, thread 0. The idle scheduling class has the lowest priority, and the idle thread is scheduled only when no other process is runnable.
The kernel source also notes this:
/*
* idle-task scheduling class.
*
* (NOTE: these are not related to SCHED_IDLE tasks which are
* handled in sched/fair.c)
*/
SCHED_BATCH and SCHED_IDLE processes are also handled by the completely fair scheduler and are meant for relatively unimportant work. SCHED_BATCH is for non-interactive, CPU-bound batch processes (they do not preempt SCHED_NORMAL processes); such processes can run quietly in the background, and their priority can be lowered so they don't disturb interactive processes. SCHED_IDLE processes are normal processes with the lowest weight of all normal processes; again, this policy has nothing to do with the idle task and is not used to schedule it.
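As a userspace illustration (a minimal sketch, not from the kernel walkthrough above), a process can request one of these policies through the standard sched_setscheduler() wrapper; the non-real-time policies require a static priority of 0:
#define _GNU_SOURCE  /* for SCHED_BATCH / SCHED_IDLE in <sched.h> */
#include <sched.h>
#include <stdio.h>

int main(void)
{
    struct sched_param param = { .sched_priority = 0 };

    /* pid 0 means the calling process; mark it as a CPU-bound batch job */
    if (sched_setscheduler(0, SCHED_BATCH, &param) == -1) {
        perror("sched_setscheduler");
        return 1;
    }
    printf("policy is now %d (SCHED_BATCH)\n", sched_getscheduler(0));
    return 0;
}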
The stop scheduling class (stop_sched_class) and the idle scheduling class (idle_sched_class) each have just one kernel thread per processor and therefore need no separate runqueue.
dl_sched_class, rt_sched_class and fair_sched_class, by contrast, each have their own per-class runqueue embedded in every processor's runqueue:
struct rq {
struct cfs_rq cfs;
struct rt_rq rt;
struct dl_rq dl;
struct task_struct *curr, *idle, *stop;
};
(Flowchart of the flow above omitted; image from 极客时间: 趣谈 Linux 操作系统.)
2. pick_next_task_fair
We mainly care about the case where all tasks on the processor's runqueue are normal tasks, i.e. all of them sit on the CFS runqueue, so we now analyze the CFS class's pick_next_task implementation: pick_next_task_fair.
The source is as follows:
static struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
struct task_struct *p;
int new_tasks;
(1)
// With group scheduling configured, walk down the task-group hierarchy,
// picking an entity at each level.
again:
#ifdef CONFIG_FAIR_GROUP_SCHED
if (!cfs_rq->nr_running)
goto idle;
if (prev->sched_class != &fair_sched_class)
goto simple;
/*
* Because of the set_next_buddy() in dequeue_task_fair() it is rather
* likely that a next task is from the same cgroup as the current.
*
* Therefore attempt to avoid putting and setting the entire cgroup
* hierarchy, only change the part that actually changes.
*/
do {
struct sched_entity *curr = cfs_rq->curr;
/*
* Since we got here without doing put_prev_entity() we also
* have to consider cfs_rq->curr. If it is still a runnable
* entity, update_curr() will update its vruntime, otherwise
* forget we've ever seen it.
*/
if (curr) {
if (curr->on_rq)
update_curr(cfs_rq);
else
curr = NULL;
/*
* This call to check_cfs_rq_runtime() will do the
* throttle and dequeue its entity in the parent(s).
* Therefore the 'simple' nr_running test will indeed
* be correct.
*/
if (unlikely(check_cfs_rq_runtime(cfs_rq)))
goto simple;
}
se = pick_next_entity(cfs_rq, curr);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
p = task_of(se);
/*
* Since we haven't yet done put_prev_entity and if the selected task
* is a different task than we started out with, try and touch the
* least amount of cfs_rqs.
*/
if (prev != p) {
struct sched_entity *pse = &prev->se;
while (!(cfs_rq = is_same_group(se, pse))) {
int se_depth = se->depth;
int pse_depth = pse->depth;
if (se_depth <= pse_depth) {
put_prev_entity(cfs_rq_of(pse), pse);
pse = parent_entity(pse);
}
if (se_depth >= pse_depth) {
set_next_entity(cfs_rq_of(se), se);
se = parent_entity(se);
}
}
put_prev_entity(cfs_rq, pse);
set_next_entity(cfs_rq, se);
}
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
return p;
simple:
cfs_rq = &rq->cfs;
#endif
(2)
// If the CFS runqueue has no runnable tasks, jump to the idle label.
if (!cfs_rq->nr_running)
goto idle;
(3)
// Put the current process, i.e. the one being scheduled out, back onto the
// CFS runqueue, because the running process is not kept on the CFS runqueue.
put_prev_task(rq, prev);
(4)
// Pick the entity with the smallest vruntime, i.e. the leftmost node of the
// red-black tree, as the next entity to run; remove it from the CFS runqueue
// (the red-black tree) and point the CFS runqueue's curr member at it.
do {
se = pick_next_entity(cfs_rq, NULL);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
(5)
// Get the task that owns the selected scheduling entity.
p = task_of(se);
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
return p;
(6)
// No runnable tasks on the CFS runqueue.
idle:
/*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
* further scheduler activity on it and we're being very careful to
* re-start the picking loop.
*/
lockdep_unpin_lock(&rq->lock, cookie);
//kick off idle load balancing
new_tasks = idle_balance(rq);
lockdep_repin_lock(&rq->lock, cookie);
/*
* Because idle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
* must re-start the pick_next_entity() loop.
*/
//A return value below 0 means a runnable task exists in a higher-priority
//scheduling class, so return RETRY_TASK: the caller must pick from the
//higher-priority classes (stop_sched_class/dl_sched_class/rt_sched_class).
if (new_tasks < 0)
return RETRY_TASK;
//A return value above 0 means some tasks were pulled over; jump back to the again label and pick anew.
if (new_tasks > 0)
goto again;
//A return value of 0 means no task could be pulled; return NULL, which tells
//the caller to fall through to the lower-priority class (idle_sched_class).
return NULL;
}
(1) When CONFIG_FAIR_GROUP_SCHED is configured and the scheduling entity can be a task_group, the code must walk down the task-group hierarchy, picking an entity at each level, to find the next task to run. We won't discuss group scheduling here; just note that the unit of scheduling can be either a single task or a task group.
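For reference, these are the group-scheduling fields of struct sched_entity that the do/while loop relies on (abridged from include/linux/sched.h):
struct sched_entity {
    /* ... */
#ifdef CONFIG_FAIR_GROUP_SCHED
    int depth;                   /* depth in the group hierarchy */
    struct sched_entity *parent; /* entity of the parent group */
    /* rq on which this entity is (to be) queued: */
    struct cfs_rq *cfs_rq;
    /* rq "owned" by this entity/group: */
    struct cfs_rq *my_q;
#endif
};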
(2) If the CFS runqueue has no runnable tasks, jump to the idle label. Reaching the idle label means the local CPU's runqueue has no runnable process, so idle_balance() is called to start load balancing.
(3) put_prev_task puts the currently running process (prev, i.e. curr), the one about to be descheduled, back onto the CFS runqueue, because the running process is not kept on the CFS runqueue. Since it is now being scheduled out, its scheduling entity must be re-inserted into the CFS runqueue, i.e. into the red-black tree.
put_prev_task(rq, prev);
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
prev->sched_class->put_prev_task(rq, prev);
}
/*
* All the scheduling class methods:
*/
const struct sched_class fair_sched_class = {
.next = &idle_sched_class,
.put_prev_task = put_prev_task_fair,
};
/*
* Account for a descheduled task:
*/
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
//Walk the entity hierarchy starting at prev->se
for_each_sched_entity(se) {
//Get the CFS runqueue this entity is queued on
cfs_rq = cfs_rq_of(se);
//Put the entity prev->se back onto its CFS runqueue
put_prev_entity(cfs_rq, se);
}
}
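for_each_sched_entity() walks up through the parent group entities when group scheduling is configured, and runs exactly once otherwise (from kernel/sched/fair.c):
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
        for (; se; se = se->parent)
#else
#define for_each_sched_entity(se) \
        for (; se; se = NULL)
#endif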
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
/*
* If still on the runqueue then deactivate_task()
* was not called and update_curr() has to be done:
*/
if (prev->on_rq)
update_curr(cfs_rq);
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
check_spread(cfs_rq, prev);
//Re-insert the previously running entity into the CFS runqueue, i.e. the red-black tree
if (prev->on_rq) {
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
update_load_avg(prev, 0);
}
//The previous current is back on the CFS runqueue, so cfs_rq->curr now points to NULL
cfs_rq->curr = NULL;
}
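update_curr(), called above, charges the elapsed runtime to the entity and advances its vruntime, which is what determines where __enqueue_entity() will place it in the tree (abridged from kernel/sched/fair.c):
static void update_curr(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;
    u64 now = rq_clock_task(rq_of(cfs_rq));
    u64 delta_exec;

    if (unlikely(!curr))
        return;

    /* wall-clock time run since exec_start was last stamped */
    delta_exec = now - curr->exec_start;
    curr->exec_start = now;
    curr->sum_exec_runtime += delta_exec;
    /* weight the delta by the entity's load to get virtual runtime */
    curr->vruntime += calc_delta_fair(delta_exec, curr);
    update_min_vruntime(cfs_rq);
    /* ... statistics and bandwidth accounting elided ... */
}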
Inserting the scheduling entity into the red-black tree:
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
struct rb_node *parent = NULL;
struct sched_entity *entry;
int leftmost = 1;
/*
* Find the right place in the rbtree:
*/
//Find the right place in the red-black tree for the entity being enqueued
while (*link) {
parent = *link;
entry = rb_entry(parent, struct sched_entity, run_node);
/*
* We dont care about collisions. Nodes with
* the same key stay together.
*/
if (entity_before(se, entry)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
leftmost = 0;
}
}
/*
* Maintain a cache of leftmost tree entries (it is frequently
* used):
*/
//Maintain a cache of the leftmost node (the most frequently used one: when picking the smallest-vruntime entity we just read cfs_rq->rb_leftmost)
if (leftmost)
cfs_rq->rb_leftmost = &se->run_node;
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}
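The ordering key used by entity_before() above is vruntime (from kernel/sched/fair.c):
static inline int entity_before(struct sched_entity *a,
                                struct sched_entity *b)
{
    return (s64)(a->vruntime - b->vruntime) < 0;
}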
(4) From the CFS runqueue, pick the entity with the smallest vruntime, i.e. the leftmost node of the red-black tree, as the next entity to run, and remove the selected entity from the CFS runqueue, i.e. from the red-black tree.
When the group scheduling option (CONFIG_FAIR_GROUP_SCHED) is not configured, group_cfs_rq() returns NULL, so the loop in the function above runs exactly once.
#ifdef CONFIG_FAIR_GROUP_SCHED
/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
return grp->my_q;
}
#else /* !CONFIG_FAIR_GROUP_SCHED */
/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
return NULL;
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
//Pick the entity with the smallest vruntime, i.e. the leftmost node of the
//red-black tree, as the next entity to run; remove it from the CFS runqueue
//(the red-black tree) and point the CFS runqueue's curr member at it.
do {
se = pick_next_entity(cfs_rq, NULL);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
/*
* Pick the next process, keeping these things in mind, in this order:
* 1) keep things fair between processes/task groups
* 2) pick the "next" process, since someone really wants that to run
* 3) pick the "last" process, for cache locality
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
//Pick the leftmost node of the red-black tree
struct sched_entity *left = __pick_first_entity(cfs_rq);
struct sched_entity *se;
/*
* If curr is set we have to see if its left of the leftmost entity
* still in the tree, provided there was anything in the tree at all.
*/
if (!left || (curr && entity_before(curr, left)))
left = curr;
se = left; /* ideally we run the leftmost entity */
......
return se;
}
Picking the leftmost node of the red-black tree:
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
//The CFS runqueue already caches the tree's leftmost node, so just read it
struct rb_node *left = cfs_rq->rb_leftmost;
if (!left)
return NULL;
return rb_entry(left, struct sched_entity, run_node);
}
Since the CFS runqueue caches the leftmost node of the red-black tree, it can be fetched directly; there is no need to walk the tree to find it:
/* CFS-related fields in a runqueue */
struct cfs_rq {
    struct rb_root tasks_timeline;  /* the red-black tree of entities */
    struct rb_node *rb_leftmost;    /* cached leftmost (smallest-vruntime) node */
};
set_next_entity removes the selected scheduling entity from the CFS runqueue, i.e. from the red-black tree, and points the CFS runqueue's curr member at the process selected to run next. It also records the entity's sum_exec_runtime in prev_sum_exec_runtime; the difference between the two later tells the scheduler how long the entity has actually run since it was picked.
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/* 'current' is not kept within the tree. */
if (se->on_rq) {
/*
* Any task has to be enqueued before it get to execute on
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
update_stats_wait_end(cfs_rq, se);
//Remove the selected entity from the CFS runqueue (the red-black tree)
__dequeue_entity(cfs_rq, se);
update_load_avg(se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
//Point the CFS runqueue's curr member at the process selected to run next
cfs_rq->curr = se;
/*
* Track our maximum slice length, if the CPU's load is at
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
schedstat_set(se->statistics.slice_max,
max((u64)schedstat_val(se->statistics.slice_max),
se->sum_exec_runtime - se->prev_sum_exec_runtime));
}
//Record sum_exec_runtime in prev_sum_exec_runtime so the scheduler can later compute how long the entity has actually run since being picked
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
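To see what prev_sum_exec_runtime is for: on every scheduler tick, check_preempt_tick() subtracts it from sum_exec_runtime to get the time the entity has run in its current slice (abridged from kernel/sched/fair.c):
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
    unsigned long ideal_runtime, delta_exec;

    ideal_runtime = sched_slice(cfs_rq, curr);
    /* runtime consumed since set_next_entity() picked this entity */
    delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
    if (delta_exec > ideal_runtime) {
        /* slice used up: ask for a reschedule */
        resched_curr(rq_of(cfs_rq));
        /* ... */
        return;
    }
    /* ... */
}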
Removing the about-to-run scheduling entity from the red-black tree:
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->rb_leftmost == &se->run_node) {
struct rb_node *next_node;
next_node = rb_next(&se->run_node);
cfs_rq->rb_leftmost = next_node;
}
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
(5) Get the task that owns the selected scheduling entity:
p = task_of(se);
#ifdef CONFIG_FAIR_GROUP_SCHED
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
static inline struct task_struct *task_of(struct sched_entity *se)
{
    SCHED_WARN_ON(!entity_is_task(se));
    return container_of(se, struct task_struct, se);
}
#else /* !CONFIG_FAIR_GROUP_SCHED */
#define entity_is_task(se) 1
static inline struct task_struct *task_of(struct sched_entity *se)
{
    return container_of(se, struct task_struct, se);
}
#endif
(6) The idle label: if the CFS runqueue has no runnable tasks, start load balancing. Reaching the idle label means the local CPU's runqueue has no runnable process, so idle_balance() is called; the load-balancing machinery tries to pull some runnable processes over from other CPUs. If idle_balance() returns a value below 0, a runnable task exists in a higher-priority scheduling class, so RETRY_TASK is returned; if it returns a value above 0, some tasks were successfully pulled over, so the code jumps back to the again label to pick anew; if it returns 0, no task could be pulled and NULL is returned.
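The negative return value comes from the tail of idle_balance() itself: after rq->lock was dropped and re-acquired, it checks whether tasks of a higher-priority class have appeared (abridged from kernel/sched/fair.c):
static int idle_balance(struct rq *this_rq)
{
    int pulled_task = 0;

    /* ... try to pull runnable tasks over from busier CPUs ... */
out:
    /* Is there a task of a higher priority class? */
    if (this_rq->nr_running != this_rq->cfs.h_nr_running)
        pulled_task = -1;

    return pulled_task;
}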
Returning RETRY_TASK tells the caller to pick the target process from the higher-priority scheduling classes (stop_sched_class/dl_sched_class/rt_sched_class).
Returning NULL tells the caller to get the target process from the lower-priority class (idle_sched_class).
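RETRY_TASK itself is just a sentinel pointer value (from kernel/sched/sched.h):
#define RETRY_TASK ((void *)-1UL)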
Note: the currently running process curr is not on the CFS runqueue, so the red-black tree does not hold the running process's scheduling entity; struct cfs_rq does, however, keep a curr pointer to the running process, through which it can be reached directly.
So when pick_next_task -> pick_next_task_fair selects the next process to run, the currently running process's scheduling entity (curr, which is prev in the code, the process being switched out) is added back into the red-black tree, the entity of the process selected to run (next in the code) is removed from the tree, and struct cfs_rq's curr is pointed at next.
/* CFS-related fields in a runqueue */
struct cfs_rq {
/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr;
};
References
Linux 4.10.0
极客时间:趣谈Linux操作系统
基于龙芯的Linux内核探索解析
Linux内核深度解析
https://kernel.blog.csdn.net/article/details/52068016