内存规整是Mel Gormal开发防止内存碎片anti-fragmen pach补丁的第二个部分Avoiding fragmentation with page clustering v27 [LWN.net],主要用于解决当系统长时间运行之后,造成比较碎化内存时,通过内存规整将处于可以移动MOVE的类型内存,重新进行页迁移 整合出较大连续物理内存:
内存规整机制原因比较简单:
- 利用类似快慢指针技巧,当需要对一个zone进行内存规整时,使用migrate_pfn和free_pfn两个遍历;
- migrate_pfn 为从zone 头部开始进行扫描,依次扫描出已经被分配出去但是可以进行页迁移的页面
- free_pfn:为从zone 尾部开始扫描,依次扫描出空闲page。
- 当每次扫描结束后,将migrate_pfn扫描出的可迁移页面 依次迁移到free_pfn 空闲page中。
- 当free_pfn和migrate_pfn 两个遇见相等时说明内存规整完毕。
- 这样规整之后,zone前半部分可以需要迁移的页面被迁移到zone后半部分空闲page中, 这样前半部分会空闲出大块连续物理内存,供下次申请内存使用。
内存规整技术是页迁移技术的一个比较重要的使用场景,帮助系统整理出连续物理内存。
触发时机内存规整触发时机主要有以下三种:
- 通过/proc/sys/vm/compact_memory 由用户根据手动触发,如果是NUMA系统则还可以通过/sys/devices/system/node/node
/compact 触发 - kcompatd线程类似与kswapd线程,内存水位不够时,会触发kcompactd线程进行异步内存规整。
- 慢速申请内存通道,说明内存压力过大,__alloc_pages_slowpath会通过__alloc_pages_direct_compact 进行同步内存规整。
由于内存规整是以zone为单位进行扫描,因此不像kswapd由于pgdata中相关数据,内存规整主要涉及到struct zone中数据用于记录内存规整的进度:
struct zone {
... ...
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
unsigned long compact_cached_free_pfn; //用于记录从尾部开始扫描的空闲page的位置
unsigned long compact_cached_migrate_pfn[2];//该数组用于控制异步和同步两种memory compact场景所从头部开始扫描的页迁移位置
unsigned long compact_init_migrate_pfn; //内存规整页迁移起始地址
unsigned long compact_init_free_pfn; //内存规整的空闲free起始地址
#endif
} ____cacheline_internodealigned_in_smp;
struct compact_control
struct compact_control结构类似与kswapd中的struct scan_control, 该结构主要用于内存规整时内部使用的数据结构,同时还可以控制内存规整起始位置,以及策略等。
struct compact_control {
struct list_head freepages;
struct list_head migratepages;
unsigned int nr_freepages;
unsigned int nr_migratepages;
unsigned long free_pfn;
unsigned long migrate_pfn;
unsigned long fast_start_pfn;
struct zone *zone;
unsigned long total_migrate_scanned;
unsigned long total_free_scanned;
unsigned short fast_search_fail;
short search_order;
const gfp_t gfp_mask;
int order;
int migratetype;
const unsigned int alloc_flags;
const int highest_zoneidx;
enum migrate_mode mode;
bool ignore_skip_hint;
bool no_set_skip_hint;
bool ignore_block_suitable;
bool direct_compaction;
bool whole_zone;
bool contended;
bool rescan;
bool alloc_contig;
};
主要成员说明:
- struct list_head freepages: 空闲页链表,表明页面要迁移到目的页即空闲页链表,处于该链表中的空闲页,被isolate孤立出来,防止同时被buddy给其他进程使用。
- struct list_head migratepages:所要迁移的页面链表,用于记录本次所需要迁移的页面,处于该链表中的空闲页,被isolate孤立出来,防止页面被swap out到磁盘或者page cache被释放等场景。
- unsigned int nr_freepages: 记录freepages中有多少个空闲页 被isolate孤立出来。
- unsigned int nr_migratepages: 记录migratepages中有多少个页面要进行迁移,并被isolate孤立出来
- unsigned long free_pfn: 从尾部开始扫描的空闲起始页帧号,即本次扫描zone,从尾部开始扫描寻找空闲页的起始位置。
- unsigned long migrate_pfn:本次扫描,从头部往尾部开始扫描的 起始位置,从该位置开始寻找符合要求的页进行迁移。
- unsigned long fast_start_pfn:用于快速线性扫描的起始位置
- struct zone *zone:所要扫描的zone
- unsigned long total_migrate_scanned:已经扫描并做页迁移的页数目
- unsigned long total_free_scanned:已经扫描用用作空闲页 作为页前面目的的数目
- const gfp_t gfp_mask: gfp mask
- short search_order: 扫描是开始的order 即一次性做页迁移的数目
- int order: 扫描时所需要的至少order 页数目。
- int migratetype:页迁移类型
- const int highest_zoneidx: 最高zone,扫描的zone范围‘
- enum migrate_mode mode: 是同步还是异步模式,即是通过kcompact线程进行内存规整,还是通过直接方式进行内存规整
- bool direct_compaction: 如果为false,则是通过kscompact线程 或者 /proc手动触发触发
- bool whole_zone:是否一次性扫描整个zone.
compact_zone()函数是实施内存规整的核心函数,不管是哪种触发方式最终都会通过compact_zone 规整指定的zone进行内存规整,
调用关系按照compact_zone触发关系调用关系图如下:
上述四种触发方式,最终都是靠compact_zone实现页迁移功能
compact_zone源码
static enum compact_result
compact_zone(struct compact_control *cc, struct capture_control *capc)
{
enum compact_result ret;
unsigned long start_pfn = cc->zone->zone_start_pfn;
unsigned long end_pfn = zone_end_pfn(cc->zone);
unsigned long last_migrated_pfn;
const bool sync = cc->mode != MIGRATE_ASYNC;
bool update_cached;
cc->total_migrate_scanned = 0;
cc->total_free_scanned = 0;
cc->nr_migratepages = 0;
cc->nr_freepages = 0;
INIT_LIST_HEAD(&cc->freepages);
INIT_LIST_HEAD(&cc->migratepages);
cc->migratetype = gfp_migratetype(cc->gfp_mask);
ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
cc->highest_zoneidx);
if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
return ret;
VM_BUG_ON(ret != COMPACT_CONTINUE);
if (compaction_restarting(cc->zone, cc->order))
__reset_isolation_suitable(cc->zone);
cc->fast_start_pfn = 0;
if (cc->whole_zone) {
cc->migrate_pfn = start_pfn;
cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
} else {
cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync];
cc->free_pfn = cc->zone->compact_cached_free_pfn;
if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
cc->zone->compact_cached_free_pfn = cc->free_pfn;
}
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
cc->migrate_pfn = start_pfn;
cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn)
cc->whole_zone = true;
}
last_migrated_pfn = 0;
update_cached = !sync &&
cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1];
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync);
migrate_prep_local();
while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
int err;
unsigned long start_pfn = cc->migrate_pfn;
cc->rescan = false;
if (pageblock_start_pfn(last_migrated_pfn) ==
pageblock_start_pfn(start_pfn)) {
cc->rescan = true;
}
switch (isolate_migratepages(cc)) {
case ISOLATE_ABORT:
ret = COMPACT_CONTENDED;
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
goto out;
case ISOLATE_NONE:
if (update_cached) {
cc->zone->compact_cached_migrate_pfn[1] =
cc->zone->compact_cached_migrate_pfn[0];
}
goto check_drain;
case ISOLATE_SUCCESS:
update_cached = false;
last_migrated_pfn = start_pfn;
;
}
err = migrate_pages(&cc->migratepages, compaction_alloc,
compaction_free, (unsigned long)cc, cc->mode,
MR_COMPACTION);
trace_mm_compaction_migratepages(cc->nr_migratepages, err,
&cc->migratepages);
cc->nr_migratepages = 0;
if (err) {
putback_movable_pages(&cc->migratepages);
if (err == -ENOMEM && !compact_scanners_met(cc)) {
ret = COMPACT_CONTENDED;
goto out;
}
if (cc->direct_compaction &&
(cc->mode == MIGRATE_ASYNC)) {
cc->migrate_pfn = block_end_pfn(
cc->migrate_pfn - 1, cc->order);
last_migrated_pfn = 0;
}
}
check_drain:
if (cc->order > 0 && last_migrated_pfn) {
unsigned long current_block_start =
block_start_pfn(cc->migrate_pfn, cc->order);
if (last_migrated_pfn < current_block_start) {
lru_add_drain_cpu_zone(cc->zone);
last_migrated_pfn = 0;
}
}
if (capc && capc->page) {
ret = COMPACT_SUCCESS;
break;
}
}
out:
if (cc->nr_freepages > 0) {
unsigned long free_pfn = release_freepages(&cc->freepages);
cc->nr_freepages = 0;
VM_BUG_ON(free_pfn == 0);
free_pfn = pageblock_start_pfn(free_pfn);
if (free_pfn > cc->zone->compact_cached_free_pfn)
cc->zone->compact_cached_free_pfn = free_pfn;
}
count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);
trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync, ret);
return ret;
}
compact_zone流程
该函数整理处理思路相对比较清晰:
- 按照指定的zone进行内存规整
- 对compact_control结构中的一些值进行初始化
- compaction_suitable:根据实际内存水位情况判断是否有必要做内存规整,因为内存规整操作比较耗时,如果空闲内存处于较高情况,则没有必要触发内存规整,最终是通过__compaction_suitable实现对内存水位判断
- compact_finished: 用于处理当前zone是否扫描完毕,zone扫描时使用了处理上的技巧。内存规整时分别从zone头部扫描migrate_pfn和从zone尾部扫描free_pfn, 当migrate_pfn与free_pfn相遇时,则认为扫描完毕。最终是调用__compact_finished函数
- isolate_migratepages: 将扫描出的要迁移的页进行孤立isolate,防止在迁移过程中,该页面被释放 或者swap out等,也是做页迁移之前必须准备工作,可以详见《linux那些事之页迁移(page migratiom)》
- migrate_pages:将孤立出来的页面进行页迁移。
- 当zone扫描内存规整完毕之后,需要free_pfn 等信息保存到zone中 用于记录本次扫描位置,空闲页等信息,以便用于下一次内存规整时使用。
- compact_result 为内存规整结果
__compaction_suitable用于判断当前zone是否可以做内存规整:
static enum compact_result __compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags,
int highest_zoneidx,
unsigned long wmark_target)
{
unsigned long watermark;
if (is_via_compact_memory(order))
return COMPACT_CONTINUE;
watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
alloc_flags))
return COMPACT_SUCCESS;
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
ALLOC_CMA, wmark_target))
return COMPACT_SKIPPED;
return COMPACT_CONTINUE;
}
主要有以下几种情况:
- is_via_compact_memory: 是否通过/proc/sys/vm/compact_memory 手动强制进行内存规整,如果强制内存规整,则直接返回COMPACT_CONTINUE,继续后续步骤进行内存规整
- zone_watermark_ok: 当作内存water mark满足order,则没有必要做内存规整,能够申请oder 内存成功,返回COMPACT_SUCCESS
- 当内存不满足时,则查看是否可以通过内存压缩满足内存申请,如果能够通过内存规整满足,则返回COMPACT_SUCCESS进行内存规整,如果判断出当前内存不能够通过内存压缩满足分配要求,则说明内存碎片化不严重,没有必要进行内存压缩发跳过此次内存规整操作,返回 COMPACT_SKIPPED。
compact_result为此次内存规整之后的结果:
enum compact_result {
COMPACT_NOT_SUITABLE_ZONE,
COMPACT_SKIPPED,
COMPACT_DEFERRED,
COMPACT_INACTIVE = COMPACT_DEFERRED,
COMPACT_NO_SUITABLE_PAGE,
COMPACT_CONTINUE,
COMPACT_COMPLETE,
COMPACT_PARTIAL_SKIPPED,
COMPACT_CONTENDED,
COMPACT_SUCCESS,
};
- COMPACT_NOT_SUITABLE_ZONE:该zone不适合做内存规整
- COMPACT_SKIPPED:内存规整要求不满足,跳过该zone
- COMPACT_DEFERRED:由于之前的一些错误导致内存规整退出
- COMPACT_INACTIVE:上次内存规整未激活
- COMPACT_NO_SUITABLE_PAGE:没有合适的物理页做内存规整
- COMPACT_CONTINUE:下一个页面块pageblock继续做内存规整
- COMPACT_COMPLETE:zone都以及做内存规整扫描完毕,但是没有合适页面做内存规整
- COMPACT_PARTIAL_SKIPPED:直接内存规整已经扫描了部分zone页面,但是仍然没有合适页面做内存规整
- COMPACT_CONTENDED:由于某些锁竞争导致内存规整退出
- COMPACT_SUCCESS:当前zone内存规整满足页面分配要求,可以退出
__compact_finished用于判断当前内存压缩扫描是否完成:
static enum compact_result __compact_finished(struct compact_control *cc)
{
unsigned int order;
const int migratetype = cc->migratetype;
int ret;
if (compact_scanners_met(cc)) {
reset_cached_positions(cc->zone);
if (cc->direct_compaction)
cc->zone->compact_blockskip_flush = true;
if (cc->whole_zone)
return COMPACT_COMPLETE;
else
return COMPACT_PARTIAL_SKIPPED;
}
if (is_via_compact_memory(cc->order))
return COMPACT_CONTINUE;
if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
return COMPACT_CONTINUE;
ret = COMPACT_NO_SUITABLE_PAGE;
for (order = cc->order; order < MAX_ORDER; order++) {
struct free_area *area = &cc->zone->free_area[order];
bool can_steal;
if (!free_area_empty(area, migratetype))
return COMPACT_SUCCESS;
#ifdef CONFIG_CMA
if (migratetype == MIGRATE_MOVABLE &&
!free_area_empty(area, MIGRATE_CMA))
return COMPACT_SUCCESS;
#endif
if (find_suitable_fallback(area, order, migratetype,
true, &can_steal) != -1) {
if (migratetype == MIGRATE_MOVABLE)
return COMPACT_SUCCESS;
if (cc->mode == MIGRATE_ASYNC ||
IS_ALIGNED(cc->migrate_pfn,
pageblock_nr_pages)) {
return COMPACT_SUCCESS;
}
ret = COMPACT_CONTINUE;
break;
}
}
if (cc->contended || fatal_signal_pending(current))
ret = COMPACT_CONTENDED;
return ret;
}
- compact_scanners_met: cc->free_pfn和cc->migrate_pfn是否相遇,如果说明已经扫描完毕,返回true
- compact_scanners_met 返回true,如果cc->whole_zone为true说明需要扫描整个zone,并且已经扫描完毕了,返回COMPACT_COMPLETE。如果ccf->whole_zone为false,说明不需要扫描整个zone,但是此时已经扫描完整个zone,说明扫描完整个zone还没有满足后续内存分配要求,返回COMPACT_PARTIAL_SKIPPED。
- compact_scanners_met 返回false, is_via_compact_memory()为ture说明是/proc/sys/vm/compact_memory手动触发,需要继续扫描zone下一个pageblock,返回COMPACT_CONTINUE。
- 如果cc->migrate_pfn 不是pageblock对齐,则需要继续扫描
- 接下来一个比较长的循环出来,主要是需要判断后续扫描是否还有空闲page 用来做迁移目的页,如果有则可以继续做迁移,如果没有则返回COMPACT_NO_SUITABLE_PAGE
isolate_migratepages函数处理比较长,整体处理思路就是继续扫描,将扫描出合适的物理页做迁移,并将需要做迁移的页孤立出来,加入到cc->migratepages中:
static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
{
unsigned long block_start_pfn;
unsigned long block_end_pfn;
unsigned long low_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
(cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
bool fast_find_block;
//通过快速通道找到所要迁移扫描的起始pfn
low_pfn = fast_find_migrateblock(cc);
//计算出扫描结束page pfn,要求按照pageblock对齐
block_start_pfn = pageblock_start_pfn(low_pfn);
if (block_start_pfn < cc->zone->zone_start_pfn)
block_start_pfn = cc->zone->zone_start_pfn;
//快速找到的pfn是否成功
fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail;
block_end_pfn = pageblock_end_pfn(low_pfn);
//对low_pfn起始的pageblock内物理页面做扫描,将合适的做迁移的页面孤立出来
for (; block_end_pfn <= cc->free_pfn;
fast_find_block = false,
low_pfn = block_end_pfn,
block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
//如果长时间循环,需要进行放权,给其他进程得到调度机会
if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
cond_resched();
//检查是否在同一个pageblock内
page = pageblock_pfn_to_page(block_start_pfn,
block_end_pfn, cc->zone);
if (!page)
continue;
//如果最近孤立页面失败,则不进行再次尝试,仅仅做检查
if (IS_ALIGNED(low_pfn, pageblock_nr_pages) &&
!fast_find_block && !isolation_suitable(cc, page))
continue;
//如果是异步规整,则仅规整move页面且非huge page
if (!suitable_migration_source(cc, page)) {
update_cached_migrate(cc, block_end_pfn);
continue;
}
//将[low_pfn,block_end_pfn)中的符合内存规整要求的页面孤立出来,将孤立出来的页面加入到cc->migratepages中
low_pfn = isolate_migratepages_block(cc, low_pfn,
block_end_pfn, isolate_mode);
if (!low_pfn)
return ISOLATE_ABORT;
//孤立页面成功或者失败都不在继续
break;
}
//记录下次重新扫描的做页迁移的起始pfn
cc->migrate_pfn = low_pfn;
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
migrate_pages
将上述合适做页迁移的页面,且已经孤立到cc->migratepages中页面做页迁移:
err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode,MR_COMPACTION)
关于页迁移详细信息可以参考《linux那些事之页迁移(page migratiom)》



