These are my reading notes on 《Linux内核设计与实现》 (Linux Kernel Development).
What is process scheduling?
Process scheduling is a Linux kernel subsystem that decides which runnable process should get the processor next and for how long. Put differently, the scheduler is the part of the kernel that divides the finite resource of processor time among runnable processes.
Programs are broadly I/O-bound or CPU-bound. For ordinary SCHED_NORMAL tasks the Linux kernel defaults to the Completely Fair Scheduler (CFS). CFS does not turn the nice value into a fixed time slice; instead, the nice value acts as a weight that sets the proportion of processor time a process receives. CFS lets each process run for a while, rotates through the runnable set, and always picks the process that has run the least (in weighted terms) as the next one to run, rather than handing out slices and tracking how long each process may still run.
In other words, CFS drops the notion of a time slice and replaces it with runtime accounting, as the toy sketch below illustrates.
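To make the rule concrete, here is a toy user-space model (entirely illustrative, not kernel code; the weights are the kernel's sched_prio_to_weight[] values for nice -5, 0 and +5): always run the task with the smallest weighted runtime, then charge it weighted time.

#include <stdio.h>
#include <stdint.h>

#define NTASKS 3
#define NICE_0_LOAD 1024

struct toy_task {
	const char *name;
	unsigned long weight; /* derived from the nice value, as in the kernel */
	uint64_t vruntime;    /* weighted runtime, in ns */
};

int main(void)
{
	struct toy_task tasks[NTASKS] = {
		{ "A (nice -5)", 3121, 0 },
		{ "B (nice  0)", 1024, 0 },
		{ "C (nice +5)",  335, 0 },
	};
	uint64_t tick = 1000000; /* let the chosen task run 1 ms per step */
	int step, i, min;

	for (step = 0; step < 9; step++) {
		/* pick the task with the smallest vruntime, as CFS does */
		min = 0;
		for (i = 1; i < NTASKS; i++)
			if (tasks[i].vruntime < tasks[min].vruntime)
				min = i;
		/* charge weighted runtime: delta * NICE_0_LOAD / weight */
		tasks[min].vruntime += tick * NICE_0_LOAD / tasks[min].weight;
		printf("step %d: ran %s\n", step, tasks[min].name);
	}
	return 0;
}

The heavy (low-nice) task gets picked far more often, because each real millisecond charges it less virtual time; that is the whole trick behind CFS's proportional fairness.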
CFS tracks the real accounting with the scheduler entity structure, struct sched_entity:
struct sched_entity {
/* For load-balancing: */
struct load_weight load;
struct rb_node run_node;
struct list_head group_node;
unsigned int on_rq;
u64 exec_start; /* timestamp when this entity last started running */
u64 sum_exec_runtime; /* total real time this entity has run */
u64 vruntime; /* weighted virtual runtime: the CFS ordering key */
u64 prev_sum_exec_runtime; /* sum_exec_runtime saved when the entity was last picked */
u64 nr_migrations;
#ifdef CONFIG_FAIR_GROUP_SCHED
int depth;
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq;
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
/* cached value of my_q->h_nr_running */
unsigned long runnable_weight;
#endif
#ifdef CONFIG_SMP
/*
* Per entity load average tracking.
*
* Put into separate cache line so it does not
* collide with read-mostly values above.
*/
struct sched_avg avg;
#endif
};
This structure is embedded in struct task_struct as the member se:
struct sched_entity se;
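Because se is embedded directly in the task structure, the kernel can map from a scheduling entity back to its owning task with container_of(); fair.c wraps this up as task_of(), essentially:

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}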
CFS uses vruntime to record how long a process has run in weighted (virtual) time, which is also the measure of how long it still deserves to run relative to the others.
The accounting itself is implemented by update_curr(), defined in kernel/sched/fair.c (sched_fair.c in the kernel the book describes):
/*
* Update the current task's runtime statistics.
*/
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
u64 now = rq_clock_task(rq_of(cfs_rq));
u64 delta_exec;
if (unlikely(!curr))
return;
/* time the current task has run since exec_start was last updated */
delta_exec = now - curr->exec_start;
if (unlikely((s64)delta_exec <= 0))
return;
curr->exec_start = now;
if (schedstat_enabled()) {
struct sched_statistics *stats;
stats = __schedstats_from_se(curr);
__schedstat_set(stats->exec_max,
max(delta_exec, stats->exec_max));
}
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cgroup_account_cputime(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
update_curr() computes how long the current process has just executed and stores that in delta_exec. It then hands the runtime to calc_delta_fair() (the older kernel in the book does this in __update_curr()), which weights it by the process's load weight; the weighted value, not the raw wall-clock delta, is what gets added to vruntime.
/*
* delta /= w
*/
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
if (unlikely(se->load.weight != NICE_0_LOAD))
delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
return delta;
}
/*
* delta_exec * weight / lw.weight
* OR
* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
*
* Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
* we're guaranteed shift stays positive because inv_weight is guaranteed to
* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
*
* Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
* weight/lw.weight <= 1, and therefore our shift will also be positive.
*/
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
u64 fact = scale_load_down(weight);
u32 fact_hi = (u32)(fact >> 32);
int shift = WMULT_SHIFT;
int fs;
__update_inv_weight(lw);
if (unlikely(fact_hi)) {
fs = fls(fact_hi);
shift -= fs;
fact >>= fs;
}
fact = mul_u32_u32(fact, lw->inv_weight);
fact_hi = (u32)(fact >> 32);
if (fact_hi) {
fs = fls(fact_hi);
shift -= fs;
fact >>= fs;
}
return mul_u64_u32_shr(delta_exec, fact, shift);
}
calc_delta_fair() computes the virtual-runtime increment for fair scheduling. If the entity's load weight is the default NICE_0_LOAD it returns delta unchanged, skipping the arithmetic entirely for the common nice-0 case; otherwise it calls __calc_delta() to rescale it.
__calc_delta() evaluates delta_exec * weight / lw->weight without performing a division at runtime: lw->inv_weight holds the precomputed fixed-point inverse 2^32 / lw->weight, so a multiply followed by a right shift of WMULT_SHIFT does the job, and the shift is reduced whenever the intermediate factor would overflow 32 bits.
The net effect: a heavily weighted (low-nice) process accumulates vruntime more slowly than real time, and a lightly weighted one more quickly. That is exactly how CFS biases processor share by nice value.
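A rough numeric check of that fixed-point scheme (a standalone sketch using the kernel's constants; 3121 is the sched_prio_to_weight[] entry for nice -5):

#include <stdio.h>
#include <stdint.h>

#define WMULT_CONST 0xffffffffU /* as in kernel/sched/fair.c */
#define WMULT_SHIFT 32
#define NICE_0_LOAD 1024ULL

int main(void)
{
	uint64_t delta_exec = 10000000; /* 10 ms of real runtime, in ns */
	unsigned long weight = 3121;    /* load weight of a nice -5 task */

	/* __update_inv_weight: inv_weight = (2^32 - 1) / weight */
	uint32_t inv_weight = WMULT_CONST / weight;

	/* __calc_delta: (delta_exec * NICE_0_LOAD * inv_weight) >> 32 */
	uint64_t fact = NICE_0_LOAD * inv_weight; /* small enough not to overflow here */
	uint64_t vdelta = (delta_exec * fact) >> WMULT_SHIFT;

	/* prints ~3281000 ns: 10 ms scaled by 1024/3121, about 3.28 ms */
	printf("10 ms real time -> vruntime += %llu ns\n",
	       (unsigned long long)vdelta);
	return 0;
}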
When CFS needs the next process to run, it picks the one with the smallest vruntime. This is the core of the CFS algorithm.
CFS stores all runnable entities in a red-black tree; each node is a runnable process, and the key the tree is ordered by is vruntime.
How is the minimum found? Starting from the root, keep following left children: the leftmost node is the one with the smallest vruntime.
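In recent kernels the tree root is an rb_root_cached, which caches that leftmost node, so the smallest-vruntime entity is available in O(1); fair.c exposes it as __pick_first_entity() (quoted from a recent kernel):

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);

	if (!left)
		return NULL;

	return __node_2_se(left);
}

The related helper __pick_next_entity(), shown next, returns instead the in-order successor of a given entity: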
static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
struct rb_node *next = rb_next(&se->run_node);
if (!next)
return NULL;
return __node_2_se(next);
}
When a task becomes runnable, for example on fork() or wakeup, enqueue_entity() is triggered; it inserts the entity into the red-black tree via __enqueue_entity():
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
}
static __always_inline struct rb_node *
rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
bool (*less)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_root.rb_node;
struct rb_node *parent = NULL;
bool leftmost = true;
while (*link) {
parent = *link;
if (less(node, parent)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
leftmost = false;
}
}
rb_link_node(node, parent, link);
rb_insert_color_cached(node, tree, leftmost);
return leftmost ? node : NULL;
}
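The less() callback used here, __entity_less(), orders entities by vruntime; roughly (from a recent fair.c), with the signed subtraction keeping the comparison correct even if vruntime wraps around:

static inline bool entity_before(struct sched_entity *a, struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}

static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
{
	return entity_before(__node_2_se(a), __node_2_se(b));
}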
When a process blocks or terminates, dequeue_entity() removes it from the red-black tree:
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
/*
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - Subtract its load from the cfs_rq->runnable_avg.
* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
update_load_avg(cfs_rq, se, UPDATE_TG);
se_update_runnable(se);
update_stats_dequeue_fair(cfs_rq, se, flags);
clear_buddies(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
/*
* Normalize after update_curr(); which will also have moved
* min_vruntime if @se is the one holding it back. But before doing
* update_min_vruntime() again, which will discount @se's position and
* can move min_vruntime forward still more.
*/
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
update_cfs_group(se);
/*
* Now advance min_vruntime if @se was the entity holding it back,
* except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
* put back on, and if we advance min_vruntime, we'll be placed back
* further than we started -- ie. we'll be penalized.
*/
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
if (cfs_rq->nr_running == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);
}
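The actual tree removal, __dequeue_entity(), is a thin wrapper around rb_erase_cached(), which also repairs the cached leftmost pointer when needed (a sketch matching 5.x-era kernels; newer releases have grown extra bookkeeping here):

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}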
The main entry point into the scheduler is schedule(), defined in kernel/sched/core.c (kernel/sched.c in the kernel the book covers). Its most important job is choosing which process runs next; it delegates that to pick_next_task(), whose fast path is __pick_next_task():
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
/*
* Optimization: we know that if all tasks are in the fair class we can
* call that function directly, but only if the @prev task wasn't of a
* higher scheduling class, because otherwise those lose the
* opportunity to pull in more work from other CPUs.
*/
if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto restart;
/* Assume the next prioritized class is idle_sched_class */
if (!p) {
put_prev_task(rq, prev);
p = pick_next_task_idle(rq);
}
return p;
}

restart:
put_prev_task_balance(rq, prev, rf);

for_each_class(class) {
p = class->pick_next_task(rq);
if (p)
return p;
}

BUG(); /* The idle class should always have a runnable task */
}
A sleeping (blocked) process is in a special non-runnable state.
The process marks itself as sleeping, removes itself from the red-black tree of runnable tasks, and puts itself on a wait queue.
A typical pattern for sleeping on a wait queue (q points to the wait queue head; condition is the event being waited for):
DEFINE_WAIT(wait);
add_wait_queue(q, &wait);
while (!condition) {
prepare_to_wait(q, &wait, TASK_INTERRUPTIBLE);
if (signal_pending(current))
break; /* woken by a signal: handle it, e.g. return -ERESTARTSYS */
schedule();
}
finish_wait(q, &wait);
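The waking side is symmetric: whatever code makes condition true calls wake_up() on the same queue, which sets a sleeping task back to TASK_RUNNING and puts it back on the runqueue (sketch):

condition = 1; /* the awaited event has occurred */
wake_up(q);    /* wake the tasks sleeping on this wait queue */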
A context switch, switching from one runnable process to another, is handled by context_switch(), defined in kernel/sched/core.c (kernel/sched.c in the book's kernel). It does two basic jobs:
1) switch_mm() switches the virtual memory mapping from the previous process to the new one.
2) switch_to() switches the processor state from the previous process to the new one, which includes saving and restoring the stack and register contents.
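A heavily condensed sketch of context_switch() (simplified from kernel/sched/core.c; the real function also handles membarrier ordering, architecture hooks and other details omitted here):

static struct rq *context_switch(struct rq *rq, struct task_struct *prev,
                                 struct task_struct *next, struct rq_flags *rf)
{
	prepare_task_switch(rq, prev, next);

	/* 1) switch the address space; kernel threads borrow prev's mm */
	if (!next->mm)
		enter_lazy_tlb(prev->active_mm, next);
	else
		switch_mm_irqs_off(prev->active_mm, next->mm, next);

	/* 2) switch registers and kernel stack; execution resumes in next */
	switch_to(prev, next, prev);
	barrier();

	return finish_task_switch(prev);
}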
The kernel provides a need_resched flag to indicate that a reschedule is pending: some other process deserves the processor, so schedule() should be invoked as soon as possible.
The periodic scheduler tick, scheduler_tick(), sets the flag when the current process ought to be preempted.
try_to_wake_up() sets it when a process of higher priority than the currently running one becomes runnable.
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}
static inline void clear_tsk_need_resched(struct task_struct *tsk)
{
clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}
static inline int test_tsk_need_resched(struct task_struct *tsk)
{
return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
}
static __always_inline bool need_resched(void)
{
return unlikely(tif_need_resched());
}
#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
#define test_thread_flag(flag) \
test_ti_thread_flag(current_thread_info(), flag)
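The flag is checked at well-defined points: on return to user space, on return from interrupt, and at explicit preemption points inside the kernel. The essence of such a voluntary preemption point is simply:

/* sketch of a voluntary preemption point, in the spirit of cond_resched() */
if (need_resched())
	schedule();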
As a user-space footnote, the POSIX scheduling API lets a program opt into the real-time policies that sit above CFS. The demo below switches itself to SCHED_FIFO, raises its priority, pins itself to CPU 1, and then burns CPU for a bounded number of iterations:
#define _GNU_SOURCE /* must precede the includes, for CPU_* and sched_setaffinity */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sched.h>
#define MY_PRIORITY (49) /* target real-time priority */
#define MAX_ITER (10000000) /* iterations of the busy loop */
int main(int argc, char *argv[]) {
struct sched_param my_params;
volatile long i;
/* switch to the SCHED_FIFO real-time policy with a modest priority */
my_params.sched_priority = 2;
if (sched_setscheduler(getpid(), SCHED_FIFO, &my_params) == -1) {
perror("sched_setscheduler failed");
exit(EXIT_FAILURE);
}
/* raise the priority afterwards via sched_setparam() */
my_params.sched_priority = MY_PRIORITY;
if (sched_setparam(0, &my_params) == -1) {
perror("sched_setparam failed");
exit(EXIT_FAILURE);
}
/* pin the process to CPU core 1 */
cpu_set_t my_set;
CPU_ZERO(&my_set);
CPU_SET(1, &my_set);
if (sched_setaffinity(0, sizeof(cpu_set_t), &my_set) == -1) {
perror("sched_setaffinity failed");
exit(EXIT_FAILURE);
}
/* the "real-time work": a bounded busy loop */
for (i = 0; i < MAX_ITER; i++)
;
return 0;
}
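Two practical notes: sched_setscheduler() with a real-time policy requires CAP_SYS_NICE, so the demo must run as root (or with that capability); and because a SCHED_FIFO task is only preempted by higher-priority real-time tasks, an unbounded busy loop would monopolize CPU 1 indefinitely, which is why the loop is bounded by MAX_ITER.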