Table of Contents
- Preface
- 1. pick_next_task
- 2. pick_next_task_fair
- References
Preface
When the kernel executes the __schedule function to switch tasks, __schedule calls pick_next_task to have the scheduler pick the most suitable process to run from the runqueue, as shown below:
static void __sched notrace __schedule(bool preempt)
{
    struct task_struct *prev, *next;
    struct pin_cookie cookie;
    struct rq *rq;
    int cpu;

    cpu = smp_processor_id();
    rq = cpu_rq(cpu);
    prev = rq->curr;
    /* ... */
    cookie = lockdep_pin_lock(&rq->lock);
    /* ... */
    next = pick_next_task(rq, prev, cookie);
    /* ... */
}
Next we analyze the pick_next_task function.
1. pick_next_task
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
const struct sched_class *class = &fair_sched_class;
struct task_struct *p;
/*
* Optimization: we know that if all tasks are in
* the fair class we can call that function directly:
*/
(1)
// If the scheduling class of the currently running process (the one about to
// be switched out) is the completely fair scheduling class, and the number of
// runnable tasks on the processor runqueue struct rq equals the number of
// runnable tasks on the CFS runqueue, then every runnable task on this
// processor is on the CFS runqueue, so call the CFS class's pick_next_task
// directly.
if (likely(prev->sched_class == class &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq, prev, cookie);
if (unlikely(p == RETRY_TASK))
goto again;
(2)
// If the CFS pick returned no task (NULL), let idle_sched_class's
// pick_next_task choose the next task.
/* assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
p = idle_sched_class.pick_next_task(rq, prev, cookie);
return p;
}
(3)
// If some of the runqueue's tasks sit on other scheduling classes' runqueues,
// walk the scheduling classes from highest to lowest priority, calling each
// class's pick_next_task hook in turn.
again:
for_each_class(class) {
p = class->pick_next_task(rq, prev, cookie);
if (p) {
if (unlikely(p == RETRY_TASK))
goto again;
return p;
}
}
BUG(); /* the idle class will always have a runnable task */
}
(1) This is an optimization, annotated with likely(): the common case is that all tasks on the processor's runqueue belong to the completely fair scheduling class. If the currently running process (the one about to be switched out) uses the completely fair scheduling class, and the number of runnable tasks on the processor runqueue struct rq equals the number of runnable tasks on the CFS runqueue, then every runnable task on this processor is on the CFS runqueue, so the CFS class's pick_next_task is called directly.
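The two counters compared in the likely() test live in the runqueue structures; abridged from kernel/sched/sched.h (h_nr_running counts runnable tasks across the whole group hierarchy of this CFS runqueue):
struct rq {
    unsigned int nr_running;    /* runnable tasks of all classes on this CPU */
    /* ... */
    struct cfs_rq cfs;          /* the CFS runqueue embedded in rq */
    /* ... */
};

/* CFS-related fields in a runqueue */
struct cfs_rq {
    unsigned int nr_running;    /* entities queued directly on this cfs_rq */
    unsigned int h_nr_running;  /* tasks in this hierarchy ("h" = hierarchical) */
    /* ... */
};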
const struct sched_class fair_sched_class;
/*
* All the scheduling class methods:
*/
const struct sched_class fair_sched_class = {
.next = &idle_sched_class,
.pick_next_task = pick_next_task_fair,
};
(2) If the CFS pick returned NULL, idle_sched_class's pick_next_task selects the next task. It simply returns the runqueue's idle task; each processor has one idle task.
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
const struct sched_class idle_sched_class = {
/* .next is NULL */
.pick_next_task = pick_next_task_idle,
};
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
put_prev_task(rq, prev);
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
return rq->idle;
}
struct rq {
struct task_struct *idle;
};
(3) If some of the runqueue's tasks sit on other scheduling classes' runqueues, walk the scheduling classes from highest to lowest priority and call each class's pick_next_task hook in turn.
#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
extern const struct sched_class stop_sched_class;  // highest priority
extern const struct sched_class dl_sched_class;
extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;  // lowest priority
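The walk terminates because each class's .next member points at the next lower-priority class; abridged from the respective kernel/sched/ source files:
/* kernel/sched/stop_task.c */
const struct sched_class stop_sched_class = {
    .next = &dl_sched_class,
    /* ... */
};

/* kernel/sched/deadline.c */
const struct sched_class dl_sched_class = {
    .next = &rt_sched_class,
    /* ... */
};

/* kernel/sched/rt.c */
const struct sched_class rt_sched_class = {
    .next = &fair_sched_class,
    /* ... */
};
fair_sched_class.next points at idle_sched_class (shown earlier), and idle_sched_class leaves .next unset (NULL), which ends the loop.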
again:
for_each_class(class) {
p = class->pick_next_task(rq, prev, cookie);
if (p) {
if (unlikely(p == RETRY_TASK))
goto again;
return p;
}
}
The table below lists the scheduling classes from highest to lowest priority:
Scheduling class | Scheduled entities | Scheduling policies |
---|---|---|
stop_sched_class | Stop tasks (e.g. the migration thread); each processor has one migration thread, used to move processes from the current processor to another | None |
dl_sched_class | Deadline processes (also a kind of real-time process) | SCHED_DEADLINE |
rt_sched_class | Real-time processes | SCHED_FIFO, SCHED_RR |
fair_sched_class | Normal processes | SCHED_NORMAL, SCHED_BATCH, SCHED_IDLE |
idle_sched_class | The idle task (idle-task); each processor has one idle task | None |
A process's scheduling policy is stored in its policy member:
/*
* Scheduling policies
*/
#define SCHED_NORMAL 0
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
#define SCHED_DEADLINE 6
struct task_struct {
    unsigned int policy;
};
Note that SCHED_IDLE, a policy for normal processes, is unrelated to the idle task. The idle task belongs to idle_sched_class and is handled by a separate kernel mechanism; each processor has one idle thread, thread 0. The idle scheduling class has the lowest priority, and the idle thread is scheduled only when no other process is runnable.
The kernel source also notes this:
/*
* idle-task scheduling class.
*
* (NOTE: these are not related to SCHED_IDLE tasks which are
* handled in sched/fair.c)
*/
SCHED_BATCH and SCHED_IDLE processes are also handled by the completely fair scheduler and are meant for relatively unimportant work. SCHED_BATCH is for non-interactive, CPU-bound batch processes (they do not preempt SCHED_NORMAL processes); such processes can run quietly in the background, and their priority can be lowered so they don't disturb interactive processes. SCHED_IDLE processes are normal processes with the lowest weight of all normal processes; again, this policy has nothing to do with the idle task and is not used to schedule it.
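As a userspace illustration (a minimal sketch, not from the kernel walkthrough above), a process can request one of these policies through the standard sched_setscheduler() wrapper; the non-real-time policies require a static priority of 0:
#define _GNU_SOURCE  /* for SCHED_BATCH / SCHED_IDLE in <sched.h> */
#include <sched.h>
#include <stdio.h>

int main(void)
{
    struct sched_param param = { .sched_priority = 0 };

    /* pid 0 means the calling process; mark it as a CPU-bound batch job */
    if (sched_setscheduler(0, SCHED_BATCH, &param) == -1) {
        perror("sched_setscheduler");
        return 1;
    }
    printf("policy is now %d (SCHED_BATCH)\n", sched_getscheduler(0));
    return 0;
}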
The stop scheduling class (stop_sched_class) and the idle scheduling class (idle_sched_class) each have just one kernel thread per processor and therefore need no separate runqueue.
dl_sched_class, rt_sched_class and fair_sched_class, by contrast, each have their own per-class runqueue embedded in every processor's runqueue:
struct rq {
struct cfs_rq cfs;
struct rt_rq rt;
struct dl_rq dl;
struct task_struct *curr, *idle, *stop;
};
(Flowchart of the flow above omitted; image from 极客时间: 趣谈 Linux 操作系统.)
2. pick_next_task_fair
We mainly care about the case where all tasks on the processor's runqueue are normal tasks, i.e. all of them sit on the CFS runqueue, so we now analyze the CFS class's pick_next_task implementation: pick_next_task_fair.
The source is as follows:
static struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
struct task_struct *p;
int new_tasks;
(1)
// With group scheduling configured, walk down the task-group hierarchy,
// picking an entity at each level.
again:
#ifdef CONFIG_FAIR_GROUP_SCHED
if (!cfs_rq->nr_running)
goto idle;
if (prev->sched_class != &fair_sched_class)
goto simple;
/*
* Because of the set_next_buddy() in dequeue_task_fair() it is rather
* likely that a next task is from the same cgroup as the current.
*
* Therefore attempt to avoid putting and setting the entire cgroup
* hierarchy, only change the part that actually changes.
*/
do {
struct sched_entity *curr = cfs_rq->curr;
/*
* Since we got here without doing put_prev_entity() we also
* have to consider cfs_rq->curr. If it is still a runnable
* entity, update_curr() will update its vruntime, otherwise
* forget we've ever seen it.
*/
if (curr) {
if (curr->on_rq)
update_curr(cfs_rq);
else
curr = NULL;
/*
* This call to check_cfs_rq_runtime() will do the
* throttle and dequeue its entity in the parent(s).
* Therefore the 'simple' nr_running test will indeed
* be correct.
*/
if (unlikely(check_cfs_rq_runtime(cfs_rq)))
goto simple;
}
se = pick_next_entity(cfs_rq, curr);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
p = task_of(se);
/*
* Since we haven't yet done put_prev_entity and if the selected task
* is a different task than we started out with, try and touch the
* least amount of cfs_rqs.
*/
if (prev != p) {
struct sched_entity *pse = &prev->se;
while (!(cfs_rq = is_same_group(se, pse))) {
int se_depth = se->depth;
int pse_depth = pse->depth;
if (se_depth <= pse_depth) {
put_prev_entity(cfs_rq_of(pse), pse);
pse = parent_entity(pse);
}
if (se_depth >= pse_depth) {
set_next_entity(cfs_rq_of(se), se);
se = parent_entity(se);
}
}
put_prev_entity(cfs_rq, pse);
set_next_entity(cfs_rq, se);
}
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
return p;
simple:
cfs_rq = &rq->cfs;
#endif
(2)
// If the CFS runqueue has no runnable tasks, jump to the idle label.
if (!cfs_rq->nr_running)
goto idle;
(3)
// Put the current process, i.e. the one being scheduled out, back onto the
// CFS runqueue, because the running process is not kept on the CFS runqueue.
put_prev_task(rq, prev);
(4)
// Pick the entity with the smallest vruntime, i.e. the leftmost node of the
// red-black tree, as the next entity to run; remove it from the CFS runqueue
// (the red-black tree) and point the CFS runqueue's curr member at it.
do {
se = pick_next_entity(cfs_rq, NULL);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
(5)
// Get the task that owns the selected scheduling entity.
p = task_of(se);
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
return p;
(6)
// No runnable tasks on the CFS runqueue.
idle:
/*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
* further scheduler activity on it and we're being very careful to
* re-start the picking loop.
*/
lockdep_unpin_lock(&rq->lock, cookie);
//kick off idle load balancing
new_tasks = idle_balance(rq);
lockdep_repin_lock(&rq->lock, cookie);
/*
* Because idle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
* must re-start the pick_next_entity() loop.
*/
//A return value below 0 means a runnable task exists in a higher-priority
//scheduling class, so return RETRY_TASK: the caller must pick from the
//higher-priority classes (stop_sched_class/dl_sched_class/rt_sched_class).
if (new_tasks < 0)
return RETRY_TASK;
//A return value above 0 means some tasks were pulled over; jump back to the again label and pick anew.
if (new_tasks > 0)
goto again;
//A return value of 0 means no task could be pulled; return NULL, which tells
//the caller to fall through to the lower-priority class (idle_sched_class).
return NULL;
}
(1) When CONFIG_FAIR_GROUP_SCHED is configured and the scheduling entity can be a task_group, the code must walk down the task-group hierarchy, picking an entity at each level, to find the next task to run. We won't discuss group scheduling here; just note that the unit of scheduling can be either a single task or a task group.
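For reference, these are the group-scheduling fields of struct sched_entity that the do/while loop relies on (abridged from include/linux/sched.h):
struct sched_entity {
    /* ... */
#ifdef CONFIG_FAIR_GROUP_SCHED
    int depth;                   /* depth in the group hierarchy */
    struct sched_entity *parent; /* entity of the parent group */
    /* rq on which this entity is (to be) queued: */
    struct cfs_rq *cfs_rq;
    /* rq "owned" by this entity/group: */
    struct cfs_rq *my_q;
#endif
};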
(2) If the CFS runqueue has no runnable tasks, jump to the idle label. Reaching the idle label means the local CPU's runqueue has no runnable process, so idle_balance() is called to start load balancing.
(3) put_prev_task puts the currently running process (prev, i.e. curr), the one about to be descheduled, back onto the CFS runqueue, because the running process is not kept on the CFS runqueue. Since it is now being scheduled out, its scheduling entity must be re-inserted into the CFS runqueue, i.e. into the red-black tree.
put_prev_task(rq, prev);
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
prev->sched_class->put_prev_task(rq, prev);
}
/*
* All the scheduling class methods:
*/
const struct sched_class fair_sched_class = {
.next = &idle_sched_class,
.put_prev_task = put_prev_task_fair,
};
/*
* Account for a descheduled task:
*/
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
//Walk the entity hierarchy starting at prev->se
for_each_sched_entity(se) {
//Get the CFS runqueue this entity is queued on
cfs_rq = cfs_rq_of(se);
//Put the entity prev->se back onto its CFS runqueue
put_prev_entity(cfs_rq, se);
}
}
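for_each_sched_entity() walks up through the parent group entities when group scheduling is configured, and runs exactly once otherwise (from kernel/sched/fair.c):
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
        for (; se; se = se->parent)
#else
#define for_each_sched_entity(se) \
        for (; se; se = NULL)
#endif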
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
/*
* If still on the runqueue then deactivate_task()
* was not called and update_curr() has to be done:
*/
if (prev->on_rq)
update_curr(cfs_rq);
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
check_spread(cfs_rq, prev);
//Re-insert the previously running entity into the CFS runqueue, i.e. the red-black tree
if (prev->on_rq) {
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
update_load_avg(prev, 0);
}
//The previous current is back on the CFS runqueue, so cfs_rq->curr now points to NULL
cfs_rq->curr = NULL;
}
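update_curr(), called above, charges the elapsed runtime to the entity and advances its vruntime, which is what determines where __enqueue_entity() will place it in the tree (abridged from kernel/sched/fair.c):
static void update_curr(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;
    u64 now = rq_clock_task(rq_of(cfs_rq));
    u64 delta_exec;

    if (unlikely(!curr))
        return;

    /* wall-clock time run since exec_start was last stamped */
    delta_exec = now - curr->exec_start;
    curr->exec_start = now;
    curr->sum_exec_runtime += delta_exec;
    /* weight the delta by the entity's load to get virtual runtime */
    curr->vruntime += calc_delta_fair(delta_exec, curr);
    update_min_vruntime(cfs_rq);
    /* ... statistics and bandwidth accounting elided ... */
}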
Inserting the scheduling entity into the red-black tree:
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
struct rb_node *parent = NULL;
struct sched_entity *entry;
int leftmost = 1;
/*
* Find the right place in the rbtree:
*/
//Find the right place in the red-black tree for the entity being enqueued
while (*link) {
parent = *link;
entry = rb_entry(parent, struct sched_entity, run_node);
/*
* We dont care about collisions. Nodes with
* the same key stay together.
*/
if (entity_before(se, entry)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
leftmost = 0;
}
}
/*
* Maintain a cache of leftmost tree entries (it is frequently
* used):
*/
//Maintain a cache of the leftmost node (the most frequently used one: when picking the smallest-vruntime entity we just read cfs_rq->rb_leftmost)
if (leftmost)
cfs_rq->rb_leftmost = &se->run_node;
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}
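The ordering key used by entity_before() above is vruntime (from kernel/sched/fair.c):
static inline int entity_before(struct sched_entity *a,
                                struct sched_entity *b)
{
    return (s64)(a->vruntime - b->vruntime) < 0;
}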
(4) From the CFS runqueue, pick the entity with the smallest vruntime, i.e. the leftmost node of the red-black tree, as the next entity to run, and remove the selected entity from the CFS runqueue, i.e. from the red-black tree.
When the group scheduling option (CONFIG_FAIR_GROUP_SCHED) is not configured, group_cfs_rq() returns NULL, so the loop in the function above runs exactly once.
#ifdef CONFIG_FAIR_GROUP_SCHED
/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
return grp->my_q;
}
#else /* !CONFIG_FAIR_GROUP_SCHED */
/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
return NULL;
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
//Pick the entity with the smallest vruntime, i.e. the leftmost node of the
//red-black tree, as the next entity to run; remove it from the CFS runqueue
//(the red-black tree) and point the CFS runqueue's curr member at it.
do {
se = pick_next_entity(cfs_rq, NULL);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
/*
* Pick the next process, keeping these things in mind, in this order:
* 1) keep things fair between processes/task groups
* 2) pick the "next" process, since someone really wants that to run
* 3) pick the "last" process, for cache locality
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
//Pick the leftmost node of the red-black tree
struct sched_entity *left = __pick_first_entity(cfs_rq);
struct sched_entity *se;
/*
* If curr is set we have to see if its left of the leftmost entity
* still in the tree, provided there was anything in the tree at all.
*/
if (!left || (curr && entity_before(curr, left)))
left = curr;
se = left; /* ideally we run the leftmost entity */
......
return se;
}
Picking the leftmost node of the red-black tree:
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
//The CFS runqueue already caches the tree's leftmost node, so just read it
struct rb_node *left = cfs_rq->rb_leftmost;
if (!left)
return NULL;
return rb_entry(left, struct sched_entity, run_node);
}
Since the CFS runqueue caches the leftmost node of the red-black tree, it can be fetched directly; there is no need to walk the tree to find it:
/* CFS-related fields in a runqueue */
struct cfs_rq {
    struct rb_root tasks_timeline;  /* the red-black tree of entities */
    struct rb_node *rb_leftmost;    /* cached leftmost (smallest-vruntime) node */
};
set_next_entity removes the selected scheduling entity from the CFS runqueue, i.e. from the red-black tree, and points the CFS runqueue's curr member at the process selected to run next. It also records the entity's sum_exec_runtime in prev_sum_exec_runtime; the difference between the two later tells the scheduler how long the entity has actually run since it was picked.
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/* 'current' is not kept within the tree. */
if (se->on_rq) {
/*
* Any task has to be enqueued before it get to execute on
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
update_stats_wait_end(cfs_rq, se);
//Remove the selected entity from the CFS runqueue (the red-black tree)
__dequeue_entity(cfs_rq, se);
update_load_avg(se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
//Point the CFS runqueue's curr member at the process selected to run next
cfs_rq->curr = se;
/*
* Track our maximum slice length, if the CPU's load is at
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
schedstat_set(se->statistics.slice_max,
max((u64)schedstat_val(se->statistics.slice_max),
se->sum_exec_runtime - se->prev_sum_exec_runtime));
}
//Record sum_exec_runtime in prev_sum_exec_runtime so the scheduler can later compute how long the entity has actually run since being picked
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
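To see what prev_sum_exec_runtime is for: on every scheduler tick, check_preempt_tick() subtracts it from sum_exec_runtime to get the time the entity has run in its current slice (abridged from kernel/sched/fair.c):
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
    unsigned long ideal_runtime, delta_exec;

    ideal_runtime = sched_slice(cfs_rq, curr);
    /* runtime consumed since set_next_entity() picked this entity */
    delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
    if (delta_exec > ideal_runtime) {
        /* slice used up: ask for a reschedule */
        resched_curr(rq_of(cfs_rq));
        /* ... */
        return;
    }
    /* ... */
}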
Removing the about-to-run scheduling entity from the red-black tree:
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->rb_leftmost == &se->run_node) {
struct rb_node *next_node;
next_node = rb_next(&se->run_node);
cfs_rq->rb_leftmost = next_node;
}
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
(5) Get the task that owns the selected scheduling entity:
p = task_of(se);
#ifdef CONFIG_FAIR_GROUP_SCHED
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
static inline struct task_struct *task_of(struct sched_entity *se)
{
    SCHED_WARN_ON(!entity_is_task(se));
    return container_of(se, struct task_struct, se);
}
#else /* !CONFIG_FAIR_GROUP_SCHED */
#define entity_is_task(se) 1
static inline struct task_struct *task_of(struct sched_entity *se)
{
    return container_of(se, struct task_struct, se);
}
#endif
(6) The idle label: if the CFS runqueue has no runnable tasks, start load balancing. Reaching the idle label means the local CPU's runqueue has no runnable process, so idle_balance() is called; the load-balancing machinery tries to pull some runnable processes over from other CPUs. If idle_balance() returns a value below 0, a runnable task exists in a higher-priority scheduling class, so RETRY_TASK is returned; if it returns a value above 0, some tasks were successfully pulled over, so the code jumps back to the again label to pick anew; if it returns 0, no task could be pulled and NULL is returned.
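The negative return value comes from the tail of idle_balance() itself: after rq->lock was dropped and re-acquired, it checks whether tasks of a higher-priority class have appeared (abridged from kernel/sched/fair.c):
static int idle_balance(struct rq *this_rq)
{
    int pulled_task = 0;

    /* ... try to pull runnable tasks over from busier CPUs ... */
out:
    /* Is there a task of a higher priority class? */
    if (this_rq->nr_running != this_rq->cfs.h_nr_running)
        pulled_task = -1;

    return pulled_task;
}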
Returning RETRY_TASK tells the caller to pick the target process from the higher-priority scheduling classes (stop_sched_class/dl_sched_class/rt_sched_class).
Returning NULL tells the caller to get the target process from the lower-priority class (idle_sched_class).
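RETRY_TASK itself is just a sentinel pointer value (from kernel/sched/sched.h):
#define RETRY_TASK ((void *)-1UL)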
Note: the currently running process curr is not on the CFS runqueue, so the red-black tree does not hold the running process's scheduling entity; struct cfs_rq does, however, keep a curr pointer to the running process, through which it can be reached directly.
So when pick_next_task -> pick_next_task_fair selects the next process to run, the currently running process's scheduling entity (curr, which is prev in the code, the process being switched out) is added back into the red-black tree, the entity of the process selected to run (next in the code) is removed from the tree, and struct cfs_rq's curr is pointed at next.
/* CFS-related fields in a runqueue */
struct cfs_rq {
/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr;
};
References
Linux 4.10.0
极客时间:趣谈Linux操作系统
基于龙芯的Linux内核探索解析
Linux内核深度解析
https://kernel.blog.csdn.net/article/details/52068016