当前位置：首页 > article >正文

Linux内核源码分析-进程调度(二)-常用数据结构

article 2025/2/27 21:05:43

cfs调度器常用数据结构

struct task_struct
struct sched_entity
struct rq
struct cfs_rq

struct task_struct

Linux通过struct task_struct结构体描述每一个进程。task_struct包含很多进程相关的信息（例如，优先级、进程状态以及调度实体等）。简略版的task_struct如下：

// 进程描述符
struct task_struct {
	/* -1 unrunnable, 0 runnable, >0 stopped: */
	volatile long			state;  // 进程状态标志
#endif
	/*
	 * /* task_struct::on_rq states: */
     * #define TASK_ON_RQ_QUEUED	1  // 表示进程正在就绪队列中运行。
     * #define TASK_ON_RQ_MIGRATING	2  // 表示处于迁移过程中的进程，可能不在就绪队列中。
	 */
	int				on_rq;  // 调度实体是否在就绪队列中，0代表不存在，1代表存在。

    // 进程调度策略和优先级
    /*
	  限期进程的优先级比实时进程要高,实时进程的优先级比普通进程要高.
	        限期进程的优先级是-1;
			实时进程的优先级0-99, 优先级数值越大,表示优先级越高;
			普通进程的优先级为: 100-139, 优先级数值越小,表示优先级越高, 可通过修改nice值改变普通进程的优先级,  优先级等于120加上nice值.                                                                                                                                                                                                                                          ,想
	*/
    /*
	  static_prio：普通进程的静态优先级，值越小优先级越高。静态优先级是进程启动时分配的优先级。
	               可以用nice()或者sched_setscheduler()系统调用更改，否则在运行期间一直保持不变。
	  rt_priority：实时进程的优先级，值介于[0,99]之间。注意：rt_priority是值越大优先级越高
	  normal_prio：基于前两个参数ststic_prio和rt_priority计算出来的。
	               可以这样理解：static_prio和rt_priority分别代表普通进程和实时进程的“静态（原生）”的优先级，代表进程的固有属性。
				   由于普通进程是值越小优先级越高，而实时进程是值越大优先级越高，需要统一成值越小优先级越高，因此普通进程的normal_prio等于static_prio，而实时进程的normal_prio等于99-rt_priority。
	  prio：进程的有效优先级，也称动态优先级。
	        顾名思义，在系统中需要判断进程优先级时用的便是该参数，调度考虑的优先级也是它。对于实时进程来说，有效优先级prio通常就等于normal_prio。
			进程可以临时提高优先级，通过改变prio的值实现，有效优先级的提升不影响进程的静态优先级。子进程的有效优先级继承自父进程的静态优先级，而不是父进程的有效优先级（父进程的优先级临时提高了，该特性不会遗传给子进程）。
	*/

	int				prio;   // 调度优先级(数值越小, 优先级越高)
	int				static_prio;   // 静态优先级
	int				normal_prio;  // 正常优先级(数值越小, 优先级越高)
	unsigned int			rt_priority;  // 实时优先级,用来记录实时进程的用户空间的静态优先级。

	const struct sched_class	*sched_class;  // 表示该进程所属的调度器类
	struct sched_entity		se;  // 普通进程调度类实体
	struct sched_rt_entity		rt;  // 实时进程调度类实体
#ifdef CONFIG_CGROUP_SCHED
	struct task_group		*sched_task_group;  // 组调度
#endif
	struct sched_dl_entity		dl;  // deadline调度类实体
}

struct sched_entity

但是，每一个调度类并不直接管理task_struct，而是管理调度实体，因此引入了调度实体（sched_entity）的概念。CFS调度器通过sched_entity实现PELT(per entity load tracking)，即se级别的负载跟踪。调度实体记录权重信息、运行时间信息、负载信息等。

// A schedulable entity. Two kinds exist: (1) a task entity, paired one-to-one with a process; (2) a group entity, one per scheduling group, where a group may contain several processes (i.e. the group se contains multiple task se).
struct sched_entity {
	/* For load-balancing: */
	struct load_weight		load;  // weight information, used when computing virtual time

	/*
	 * runnable_weight is the runnable weight; the concept exists mainly for group se.
	 * For a task se, runnable_weight is equal to the task's weight.
	 * For a group se, runnable_weight is always less than or equal to weight
	 * (per the article: because a group se may also contain task se in the running state).
	 */
	unsigned long			runnable_weight;  // share of the weight held among all runnable tasks
	struct rb_node			run_node;  // node through which the entity is inserted into the CFS red-black tree. Each CFS run queue keeps a red-black tree of ready entities waiting to run; run_node is the hook into it.
	struct list_head		group_node;  // list hook onto the cfs_tasks list in rq (the article also describes it as identifying the entity's scheduling group -- NOTE(review): confirm)
	unsigned int			on_rq;  // whether the entity is on a run queue: set to 1 when enqueued, 0 when dequeued

	u64				exec_start;  // time at which the entity last started running
	u64				sum_exec_runtime;  // total CPU time consumed so far; this is real (not virtual) time
	u64				vruntime;  // virtual runtime
	u64				prev_sum_exec_runtime;  // snapshot of sum_exec_runtime; per the article it is saved when the process is taken off the CPU -- NOTE(review): confirm the exact point against the kernel version in use

	u64				nr_migrations;

	struct sched_statistics		statistics;

#ifdef CONFIG_FAIR_GROUP_SCHED  // only present when group scheduling is enabled
	int				depth;  // nesting depth of the se: a task se sitting at the top level of the CPU rq has depth 0; a task se inside a group se has depth 1
	struct sched_entity		*parent;  // when this entity is a task se, parent points to the group se that contains it
	/* rq on which this entity is (to be) queued: */
	struct cfs_rq			*cfs_rq;  // run queue this entity belongs to; for an entity inside a group, the cfs_rq of that group se
	/* rq "owned" by this entity/group: */
	struct cfs_rq			*my_q;  // run queue owned by a group se
#endif

#ifdef CONFIG_SMP
	/*
	 * Per entity load average tracking.
	 *
	 * Put into separate cache line so it does not
	 * collide with read-mostly values above.
	 */
	struct sched_avg		avg;  // load information
#endif
};

struct rq

// Every CPU has one global run queue. It is per-cpu, i.e. each CPU has its own struct rq. An rq contains the cfs, rt and dl run queues. (Abridged definition.)

struct rq { 
	unsigned int		nr_running;  // number of runnable processes on this rq
	struct cfs_rq		cfs;  // cfs run queue
	struct rt_rq		rt;  // rt run queue
	struct dl_rq		dl;  // dl run queue
};

struct cfs_rq

CFS调度器的就绪队列，简称cfs就绪队列，管理就绪态的struct sched_entity调度实体，后续可以通过pick_next_task接口从cfs就绪队列中选择最适合运行的调度实体（虚拟时间最小的调度实体）。
cfs就绪队列包含队列权重、负载、运行统计等信息。

/* CFS-related fields in a runqueue */
// Run queue of the CFS scheduler: holds the ready sched_entity objects plus queue weight, load and run statistics.
struct cfs_rq {
	struct load_weight	load;  // run-queue weight, i.e. the sum of the weights of all entities managed by this queue
	unsigned long		runnable_weight;  // introduced for group se; part of the entity weight, the sum of the weights of the runnable entities on the rq. For a task se, runnable_weight is identical to the se's weight.
	unsigned int		nr_running;  // number of entities on this run queue, counting both task se and group se (e.g. 9 task se + 1 group se gives nr_running = 10)

	// Number of actual task entities in the hierarchy below this queue (e.g. with 9 task se and 1 group se on this queue, where the group se itself contains 9 task se, h_nr_running is 18).
	unsigned int		h_nr_running;  /* SCHED_{NORMAL,BATCH,IDLE} */  
	unsigned int		idle_h_nr_running; /* SCHED_IDLE */

	u64			exec_clock;

	/*
	 * Every process on a cfs_rq (cfs run queue) has a virtual clock, vruntime.
	 * While a process runs, its vruntime keeps growing as time passes (tick by tick).
	 * The vruntime of a process that does not get to run stays unchanged. The scheduler always
	 * picks the process with the smallest vruntime (the leftmost node of the red-black tree).
	 * This is what "completely fair" refers to.
	 * To distinguish priorities, the vruntime of a higher-priority process grows more slowly,
	 * so that it receives more run time.
	*/
	u64			min_vruntime;  // tracks the minimum virtual runtime of all entities on this run queue
#ifndef CONFIG_64BIT
	u64			min_vruntime_copy;
#endif

	struct rb_root_cached	tasks_timeline;  // red-black tree ordered by virtual runtime (the smallest vruntime is the leftmost node)

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	//  sched_entity: an entity that the kernel can schedule
	struct sched_entity	*curr;  // schedulable entity of the process currently running on this run queue
	struct sched_entity	*next;
	struct sched_entity	*last;
	struct sched_entity	*skip;

#ifdef	CONFIG_SCHED_DEBUG
	unsigned int		nr_spread_over;
#endif

#ifdef CONFIG_SMP
	/*
	 * CFS load tracking
	 */
	struct sched_avg	avg;  // load information
#ifndef CONFIG_64BIT
	u64			load_last_update_time_copy;
#endif


    /*
	 * When a task exits, or is migrated to another CPU after wake-up, the load it contributed
	 * has to be removed from the cfs rq of the CPU it used to be on.
	 * Because of rq lock-holding constraints, the load to be removed is first recorded in this
	 * `removed` member and applied later at a suitable time. It is recorded by
	 * remove_entity_load_avg() and actually subtracted by update_cfs_rq_load_avg().
	 */
	struct {
		raw_spinlock_t	lock ____cacheline_aligned;
		int		nr;
		unsigned long	load_avg;
		unsigned long	util_avg;
		unsigned long	runnable_sum;
	} removed;

#ifdef CONFIG_FAIR_GROUP_SCHED

    /*
	 * The load value that this group cfs_rq has already synchronised into the task group's load_avg.
	 * tg is a globally shared variable accessed by several CPUs at once; to avoid heavy contention,
	 * the load contribution of a group cfs_rq (which changes whenever it runs) is not written to
	 * tg->load_avg (the task group's total load) immediately.
	 * It is synchronised to tg->load_avg again only once the difference has grown large enough.
	 */
	unsigned long		tg_load_avg_contrib;
	long			propagate;
	long			prop_runnable_sum;

	/*
	 *   h_load = weight * f(tg)
	 *
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
	 */
	unsigned long		h_load;
	u64			last_h_load_update;
	struct sched_entity	*h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct rq		*rq;	/* CPU runqueue to which this cfs_rq is attached, i.e. the per-CPU global run queue that holds this cfs queue */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
	 * This list is used during load balance.
	 */
	int			on_list;
	struct list_head	leaf_cfs_rq_list;

	// tg points to the task group that the entities on this run queue belong to.
	struct task_group	*tg;	/* group that "owns" this runqueue */  

#ifdef CONFIG_CFS_BANDWIDTH
	int			runtime_enabled;
	s64			runtime_remaining;

	u64			throttled_clock;
	u64			throttled_clock_task;
	u64			throttled_clock_task_time;
	int			throttled;
	int			throttle_count;
	struct list_head	throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};