安卓内核内存回收
背景
上层业务常常需要内核提供一些定制的内存回收接口,内核层自身也可能想对内存机制做优化,这两种情况都需要在原有内存回收机制上做一些hook操作。因此,了解Linux内核的内存回收流程很重要。
本章节主要讲kswapd线程:当空闲内存低于low watermark时,kswapd会被唤醒并开始回收内存。
内存回收步骤
- kswapd初始化
1、设置每次swap的page数
2、创建kswapd线程,多个numa节点对应多个线程。
- 执行kswapd()
1、进入for死循环
2、唤醒kcompactd线程整理内存碎片后,进入睡眠。
3、被唤醒后,执行回收
内存回收详细步骤
- kswapd初始化
// Init-time setup for kswapd: configures swap clustering, then starts one kswapd
// thread for each node visited by for_each_node_state(..., N_MEMORY).
// Abridged excerpt: the declaration of nid and the return statement are not shown.
static int __init kswapd_init(void) {
swap_setup();// sets page_cluster, which determines how many pages are swapped in/out at a time (2^page_cluster)
for_each_node_state(nid, N_MEMORY)// iterate over the NUMA nodes in the N_MEMORY state
kswapd_run(nid);// create the kswapd thread for this NUMA node
}
// Starts the kswapd kernel thread for node `nid`; the thread runs kswapd() with
// pgdat as its argument, is named "kswapd<nid>", and is stored in pgdat->kswapd.
// NOTE(review): the excerpt does not show how pgdat is obtained from nid —
// presumably NODE_DATA(nid); confirm against the kernel source.
void kswapd_run(int nid) { // create the kswapd thread for the given node id
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
}
- 执行kswapd()
// kswapd spends its whole lifetime in this function.
// Thread body for one node: loops forever, alternating between an attempt to
// sleep and a reclaim pass whose result is stored in reclaim_order.
// Abridged excerpt: alloc_order, reclaim_order and highest_zoneidx are not
// declared or updated here.
static int kswapd(void *p) {
pg_data_t *pgdat = (pg_data_t *)p; // each NUMA node describes its physical memory layout with a pg_data_t
for ( ; ; ) {
// decide whether kswapd should go to sleep and, if so, give up the CPU
kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, highest_zoneidx);
// reclaim pages
reclaim_order = balance_pgdat(pgdat, alloc_order, highest_zoneidx);
}
}
// Try to put kswapd to sleep.
// Two stages: a short timed nap of HZ/10 ticks first, then an untimed sleep only
// if the nap ran to completion and prepare_kswapd_sleep() still agrees.
// Abridged excerpt: the declarations of `wait` and `remaining` are not shown.
// NOTE(review): `remaining` is read at the second `if` even when the first branch
// was skipped — presumably it is initialised to 0 in the full source; confirm.
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, unsigned int highest_zoneidx) {
// add the kswapd thread to this memory node's wait queue
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
// take a short nap; the purpose is to wake the memory compaction thread so it can defragment memory
if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);// wake the memory compaction thread
remaining = schedule_timeout(HZ/10);// short nap; the return value is kept in `remaining`
// leave the wait queue and re-join it before the second sleep check
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
}
// only sleep for real when `remaining` is zero and the sleep check passes again
if (!remaining && prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
if (!kthread_should_stop())
schedule();// real sleep: give up the CPU
}
finish_wait(&pgdat->kswapd_wait, &wait);
}
// Do the actual page reclaim for one node.
// Builds a scan_control from the requested order and calls kswapd_shrink_node()
// repeatedly while sc.priority >= 1.
// Abridged excerpt: raise_priority is not declared, nothing visible here changes
// sc.priority, and no return statement is shown.
// NOTE(review): presumably the full source initialises and lowers sc.priority
// between passes — confirm against the kernel source.
static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) {
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.order = order,
.may_unmap = 1,
}; // controls the page reclaim pass
do {
if (kswapd_shrink_node(pgdat, &sc)) // core reclaim function
raise_priority = false;
} while (sc.priority >= 1);
}
// Computes a reclaim target for the node, runs shrink_node(), and reports
// whether the number of pages scanned reached that target.
// Abridged excerpt: the declarations of z and zone are not shown.
static bool kswapd_shrink_node(pg_data_t *pgdat, struct scan_control *sc) {
sc->nr_to_reclaim = 0;
// sum a per-zone target over zones 0..sc->reclaim_idx
for (z = 0; z <= sc->reclaim_idx; z++) {
zone = pgdat->node_zones + z;
// zones for which managed_zone() is false contribute nothing
if (!managed_zone(zone))
continue;
sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);// accumulate the number of pages we hope to reclaim
}
shrink_node(pgdat, sc);// reclaim from this memory node
// true when at least nr_to_reclaim pages were scanned
return sc->nr_scanned >= sc->nr_to_reclaim;
}
// Reclaims from one node, repeating the pass for as long as
// should_continue_reclaim() returns true.
// Abridged excerpt: reclaim_state and the local nr_reclaimed are not declared
// or assigned here.
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) {
again:
shrink_node_memcgs(pgdat, sc);// call down into the function that does the actual reclaim
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;// count the reclaimed slab pages as reclaimed
reclaim_state->reclaimed_slab = 0;// reset so the same slab pages are not added again
}
if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc))// reclaim has not met the requirement yet, go again
goto again;
}
// Walks the memcgs returned by mem_cgroup_iter() and, for each one, shrinks
// its LRU lists on this node and then its slab caches.
// Abridged excerpt: the declarations of memcg and target_memcg are not shown.
static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) {
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);// the LRU lists for this memcg on this NUMA node
shrink_lruvec(lruvec, sc);// LRU reclaim
shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);// slab reclaim
} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));// advance to the next memcg; stop when the iterator returns NULL
}
// Scans the evictable LRU lists of one lruvec in batches of at most
// SWAP_CLUSTER_MAX pages and adds the number reclaimed to sc->nr_reclaimed.
// Abridged excerpt: lru, nr_to_scan and proportional_reclaim are not declared
// here, and percentage / nr_scanned are declared but unused.
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) {
unsigned long nr[NR_LRU_LISTS];// pages still to scan per LRU list: nr[0]=inactive anon, nr[1]=active anon, nr[2]=inactive file, nr[3]=active file
unsigned long nr_reclaimed = 0;// pages reclaimed by this call so far
unsigned long nr_to_reclaim = sc->nr_to_reclaim;// pages we hope to reclaim
get_scan_count(lruvec, sc, nr);// fill nr[] with the scan counts
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) {// loop while any of these three LRU lists still has pages to scan
unsigned long nr_anon, nr_file, percentage;
unsigned long nr_scanned;
for_each_evictable_lru(lru) {// visit every evictable LRU list
if (nr[lru]) {
nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
nr[lru] -= nr_to_scan; // pages left to scan on this list
nr_reclaimed += shrink_list(lru, nr_to_scan, lruvec, sc);// reclaim from this list
}
}
if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)// target not met yet, or proportional_reclaim is set: keep going
continue;
// once the target is met, leave the loop as soon as either the anon or the file side has nothing left to scan
nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
if (!nr_file || !nr_anon)
break;
}
sc->nr_reclaimed += nr_reclaimed;// add this call's reclaimed count to sc
}
源码
参考kernel 6.1