Linux内核源代码情景分析-交换分区

浏览数：33 / 时间：2015年06月20日

在Linux内核源代码情景分析-共享内存中，共享内存，当内存紧张时是换出到交换分区。

在Linux内核源代码情景分析-mmap后，文件与虚拟区间建立映射中，文件映射的页面，当内存紧张时是换出到硬盘上的文件中。

这里的交换分区，就是swap分区，记得给电脑安装ubuntu时，就有一项是swap分区。

交换分区和文件的区别是：

文件是在一个具体的文件系统之下的，交换分区没有这个必要，它可能是一个裸分区，不需要文件系统。

需要说明一点，并不是所有从物理内存中交换出来的数据都会被放到Swap中（如果这样的话，Swap就会不堪重负），有相当一部分数据被直接交换到文件系统。例如，有的程序会打开一些文件，对文件进行读写（其实每个程序都至少要打开一个文件，那就是运行程序本身），当需要将这些程序的内存空间交换出去时，就没有必要将文件部分的数据放到Swap空间中了，而可以直接将其放到文件里去。但是那些用malloc和new函数生成的对象的数据则不同，它们需要Swap空间，因为它们在文件系统中没有相应的“储备”文件，因此被称作“匿名”(Anonymous)内存数据。这类数据还包括堆栈中的一些状态和变量数据等。所以说，Swap空间是“匿名”数据的交换空间。

总结：

当内存紧张时：

共享内存区，malloc和new函数生成的对象的数据，堆栈中的一些状态和变量数据都被换出到swap分区。

文件映射的页面，换出到硬盘上的文件中。

现在来分析最后一种情况，页面中存放的是堆栈中的一些状态和变量数据。

一、堆栈页面的换入

由于pte_none(entry)为true，所以调用do_no_page()。

/*
 * Handle a page fault on one page-table entry.
 *
 * The entry is read under mm->page_table_lock and the fault is dispatched:
 *   - not present and pte_none()      -> do_no_page()
 *   - not present but not pte_none()  -> do_swap_page(), with the swap entry
 *                                        decoded from the pte
 *   - present, write access, pte not writable -> do_wp_page()
 *   - otherwise the entry is marked young (and dirty on a write access),
 *     re-established, the lock is released and 1 is returned.
 *
 * In the scenario of this section (first touch of a stack page)
 * pte_none(entry) is true, so do_no_page() is the path taken.
 *
 * NOTE(review): on the do_wp_page() path the lock is still held when the
 * call is made; presumably do_wp_page() releases it - not visible here.
 */
static inline int handle_pte_fault(struct mm_struct *mm,
	struct vm_area_struct * vma, unsigned long address,
	int write_access, pte_t * pte)
{
	pte_t entry;

	/*
	 * We need the page table lock to synchronize with kswapd
	 * and the SMP-safe atomic PTE updates.
	 */
	spin_lock(&mm->page_table_lock);
	entry = *pte;
	if (!pte_present(entry)) {
		/*
		 * If it truly wasn't present, we know that kswapd
		 * and the PTE updates will not touch it later. So
		 * drop the lock.
		 */
		spin_unlock(&mm->page_table_lock);
		if (pte_none(entry))
			return do_no_page(mm, vma, address, write_access, pte);
		return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);
	}

	if (write_access) {
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address, pte, entry);

		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	establish_pte(vma, address, pte, entry);
	spin_unlock(&mm->page_table_lock);
	return 1;
}

/*
 * Handle a fault on an empty (pte_none) page-table entry.
 *
 * If the vma has no vm_ops or no nopage operation, the fault is handed to
 * do_anonymous_page(). Otherwise the vma's nopage operation supplies the
 * page, which is then mapped at page_table.
 *
 * Returns:
 *    0  nopage returned NULL (no page available, SIGBUS per the comment below)
 *   -1  nopage returned NOPAGE_OOM
 *    2  page mapped (major fault)
 *   or whatever do_anonymous_page() returns on the anonymous path.
 */
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
	unsigned long address, int write_access, pte_t *page_table)
{
	struct page * new_page;
	pte_t entry;

	if (!vma->vm_ops || !vma->vm_ops->nopage)/* taken here: nopage is NULL for the stack vma */
		return do_anonymous_page(mm, vma, page_table, write_access, address);

	/*
	 * The third argument is "no_share", which tells the low-level code
	 * to copy, not share the page even if sharing is possible.  It's
	 * essentially an early COW detection.
	 */
	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
	if (new_page == NULL)	/* no page was available -- SIGBUS */
		return 0;
	if (new_page == NOPAGE_OOM)
		return -1;
	++mm->rss;
	/*
	 * This silly early PAGE_DIRTY setting removes a race
	 * due to the bad i386 page protection. But it's valid
	 * for other architectures too.
	 *
	 * Note that if write_access is true, we either now have
	 * an exclusive copy of the page, or this is a shared mapping,
	 * so we can make it writable and dirty to avoid having to
	 * handle that later.
	 */
	flush_page_to_ram(new_page);
	flush_icache_page(vma, new_page);
	entry = mk_pte(new_page, vma->vm_page_prot);
	if (write_access) {
		entry = pte_mkwrite(pte_mkdirty(entry));
	} else if (page_count(new_page) > 1 &&
		   !(vma->vm_flags & VM_SHARED))
		/* read fault on a private mapping of a page with other users: map it read-only */
		entry = pte_wrprotect(entry);
	set_pte(page_table, entry);
	/* no need to invalidate: a not-present page shouldn't be cached */
	update_mmu_cache(vma, address, entry);
	return 2;	/* Major fault */
}

由于vma->vm_ops->nopage为NULL，所以执行do_anonymous_page，代码如下：

/*
 * Map a page for a fault on an anonymous (no nopage operation) region.
 *
 * The default pte is a write-protected mapping of ZERO_PAGE(addr). On a
 * write access a fresh page is allocated and cleared instead, mapped
 * writable and dirty, and mm->rss is incremented.
 *
 * Returns -1 if the page allocation fails, otherwise 1 (minor fault).
 */
static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
{
	struct page *page = NULL;
	pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
	if (write_access) {
		page = alloc_page(GFP_HIGHUSER);/* allocate a physical page */
		if (!page)
			return -1;
		clear_user_highpage(page, addr);
		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
		mm->rss++;
		flush_page_to_ram(page);
	}
	set_pte(page_table, entry);/* install the mapping */
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, addr, entry);
	return 1;	/* Minor fault */
}

二、堆栈页面的换出

换入后page并未加入到任何队列。

refill_inactive_scan和swap_out，把活跃的页面变成不活跃脏的页面。挑选的原则是最近没有被访问，且age已经减到0（代码中page->age > 0时直接out_failed）。这里的活跃页面是先加入到对应的队列中，再脱离。详见swap_out调用try_to_swap_out的代码分析。

page_launder，把不活跃脏的页面变成不活跃干净的页面。

try_to_swap_out，代码如下：

/*
 * Try to unmap the page referenced by *page_table from address space mm.
 *
 * Lines consisting of "......" mark code that the article elided.
 *
 * A page is only unmapped when its pte is present, the page is valid and
 * not reserved, it has not been accessed since the last scan, its age has
 * dropped to 0 and the page lock can be taken. The pte is then cleared and:
 *   - if the page is already in the swap cache, the pte is replaced by its
 *     swap entry (taken from page->index);
 *   - if the pte was clean, or the page has a mapping, the pte is dropped;
 *   - otherwise a swap entry is allocated, the page is added to the swap
 *     cache, marked dirty, and the pte is replaced by the swap entry.
 *
 * Returns 1 when mm->swap_cnt is exhausted, 0 on every other path shown.
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	swp_entry_t entry;
	struct page * page;
	int onlist;

	pte = *page_table;
	if (!pte_present(pte))	/* is the physical page resident in memory? */
		goto out_failed;
	page = pte_page(pte);
	if ((!VALID_PAGE(page)) || PageReserved(page))
		goto out_failed;

	if (!mm->swap_cnt)
		return 1;

	mm->swap_cnt--;	/* one fewer page left to examine for this mm */

	onlist = PageActive(page);
	/* Don't look at this pte if it's been accessed recently. */
	if (ptep_test_and_clear_young(page_table)) {	/* accessed since last scan: age it up and give up */
		age_page_up(page);
		goto out_failed;
	}
	if (!onlist)
		/* The page is still mapped, so it can't be freeable... */
		age_page_down_ageonly(page);

	......
	if (page->age > 0)	/* age has not reached 0 yet: give up */
		goto out_failed;

	if (TryLockPage(page))
		goto out_failed;

	......
	/*
	 * Reaching here means the page was not accessed recently and its
	 * age is 0: clear the page-table entry.
	 */
	pte = ptep_get_and_clear(page_table);
	flush_tlb_page(vma, address);

	......
	/*
	 * Page already in the swap cache: reuse the swap entry stored in
	 * page->index. Not taken in this walk-through, because the page has
	 * not yet been added to swapper_space.
	 */
	if (PageSwapCache(page)) {
		entry.val = page->index;
		if (pte_dirty(pte))
			set_page_dirty(page);
set_swap_pte:
		swap_duplicate(entry);
		set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
		UnlockPage(page);
		mm->rss--;
		deactivate_page(page);
		page_cache_release(page);
out_failed:
		return 0;
	}
	flush_cache_page(vma, address);
	if (!pte_dirty(pte))	/* not taken in this walk-through */
		goto drop_pte;

	/*
	 * Ok, it's really dirty. That means that
	 * we should either create a new swap cache
	 * entry for it, or we should write it back
	 * to its own backing store.
	 */
	if (page->mapping) {	/* not taken in this walk-through either */
		set_page_dirty(page);
		goto drop_pte;
	}

	/*
	 * This is a dirty, swappable page.  First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = get_swap_page();	/* taken here: allocate a swap entry */
	if (!entry.val)
		goto out_unlock_restore; /* No swap space left */

	/* Add it to the swap cache and mark it dirty */
	add_to_swap_cache(page, entry);	/* links the page into the swap cache queues */
	set_page_dirty(page);
	goto set_swap_pte;	/* continue at set_swap_pte above */

out_unlock_restore:
	/* No swap entry available: put the original pte back and unlock. */
	set_pte(page_table, pte);
	UnlockPage(page);
	return 0;	/* the page was not swapped out */
}

add_to_swap_cache，代码如下：

/*
 * Add a locked page to the swap cache under the given swap entry.
 *
 * BUG()s if the page is not locked, if PageTestandSetSwapCache() reports
 * a non-zero result (presumably: the swap-cache flag was already set -
 * confirm against the macro), or if the page already has a mapping.
 * Clears the PG_error and PG_arch_1 flag bits, sets PG_uptodate, and then
 * inserts the page into swapper_space with entry.val as its index.
 */
void add_to_swap_cache(struct page *page, swp_entry_t entry)
{
	unsigned long flags;

#ifdef SWAP_CACHE_INFO
	swap_cache_add_total++;
#endif
	if (!PageLocked(page))
		BUG();
	if (PageTestandSetSwapCache(page))
		BUG();
	if (page->mapping)
		BUG();
	/* drop the error/arch_1 bits and mark the page up to date */
	flags = page->flags & ~((1 << PG_error) | (1 << PG_arch_1));
	page->flags = flags | (1 << PG_uptodate);
	add_to_page_cache_locked(page, &swapper_space, entry.val);
}

/*
 * Insert an already-locked page into the page cache of `mapping` at `index`.
 *
 * BUG()s if the page is not locked. Calls page_cache_get() on the page
 * (presumably taking a reference - confirm against its definition), then,
 * under pagecache_lock, records the index and adds the page to the
 * mapping's inode queue, the page hash queue and the LRU cache.
 */
void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
{
	if (!PageLocked(page))
		BUG();

	page_cache_get(page);
	spin_lock(&pagecache_lock);
	page->index = index;
	add_page_to_inode_queue(mapping, page);
	add_page_to_hash_queue(page, page_hash(mapping, index));
	lru_cache_add(page);
	spin_unlock(&pagecache_lock);
}

此时：

page->list链入mapping->clean_pages；

page->next_hash和page->pprev_hash链入全局的Hash表；

page->lru链入了全局的active_list；

然后返回到try_to_swap_out，继续执行goto set_swap_pte，跳转后执行的代码如下：

/* Excerpt from try_to_swap_out(): the tail reached via "goto set_swap_pte". */
set_swap_pte:
		swap_duplicate(entry);
		set_pte(page_table, swp_entry_to_pte(entry));/* the pte now holds the swap entry, i.e. the on-disk location */
drop_pte:
		UnlockPage(page);
		mm->rss--;
		deactivate_page(page);/* move from active_list to the inactive list */
		page_cache_release(page);

此时：

page->list链入mapping->dirty_pages/clean_pages；

page->next_hash和page->pprev_hash链入全局的Hash表；

page->lru链入了全局的inactive_dirty_list；

三、堆栈页面恢复映射

1、对于不活跃脏的页面和不活跃干净的页面，如果发生缺页中断，会调用do_swap_page()。

/*
 * Handle a page fault on one page-table entry (same function as quoted
 * earlier in the article, annotated for the swap-in scenario).
 *
 * Here the pte is not present but is not empty either: it holds the swap
 * entry written by try_to_swap_out(), so the fault is dispatched to
 * do_swap_page() with the entry decoded by pte_to_swp_entry().
 */
static inline int handle_pte_fault(struct mm_struct *mm,
	struct vm_area_struct * vma, unsigned long address,
	int write_access, pte_t * pte)
{
	pte_t entry;

	/*
	 * We need the page table lock to synchronize with kswapd
	 * and the SMP-safe atomic PTE updates.
	 */
	spin_lock(&mm->page_table_lock);
	entry = *pte;
	if (!pte_present(entry)) {
		/*
		 * If it truly wasn't present, we know that kswapd
		 * and the PTE updates will not touch it later. So
		 * drop the lock.
		 */
		spin_unlock(&mm->page_table_lock);
		if (pte_none(entry))/* false here: the pte holds a value (a swap entry) */
			return do_no_page(mm, vma, address, write_access, pte);
		return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);/* the pte is non-empty, so this call is taken */
	}

	if (write_access) {
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address, pte, entry);

		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	establish_pte(vma, address, pte, entry);
	spin_unlock(&mm->page_table_lock);
	return 1;
}

因为页表项（pte）不为空，其中保存的是指向交换分区的交换项，所以开始执行do_swap_page()。

/*
 * Handle a fault on a pte that holds a swap entry.
 *
 * First looks the page up in the swap cache. If it is not there, the page
 * is read in (with read-ahead) under the kernel lock; -1 is returned if
 * that read yields no page. The page is then mapped at page_table with
 * the vma's protection, mm->rss is incremented and the swap entry is
 * released with swap_free(). The mapping is made writable and dirty only
 * for a write access to a page that is_page_shared() reports as unshared.
 *
 * Returns 1 (minor fault) on success, -1 on failure.
 */
static int do_swap_page(struct mm_struct * mm,
	struct vm_area_struct * vma, unsigned long address,
	pte_t * page_table, swp_entry_t entry, int write_access)
{
	struct page *page = lookup_swap_cache(entry);
	pte_t pte;

	if (!page) {
		/* not in the swap cache: read it in */
		lock_kernel();
		swapin_readahead(entry);
		page = read_swap_cache(entry);
		unlock_kernel();
		if (!page)
			return -1;

		flush_page_to_ram(page);
		flush_icache_page(vma, page);
	}

	mm->rss++;

	pte = mk_pte(page, vma->vm_page_prot);

	/*
	 * Freeze the "shared"ness of the page, ie page_count + swap_count.
	 * Must lock page before transferring our swap count to already
	 * obtained page count.
	 */
	lock_page(page);
	swap_free(entry);
	if (write_access && !is_page_shared(page))
		pte = pte_mkwrite(pte_mkdirty(pte));
	UnlockPage(page);

	set_pte(page_table, pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, pte);
	return 1;	/* Minor fault */
}

在do_swap_page中，lookup_swap_cache，会在全局的Hash表找到对应的页面，并且引用计数加1，变成2，但还没有移到活跃队列中。什么时候转移到活跃队列中呢？

答案在，page_launder和reclaim_page中。

page_launder：

/* Excerpt from page_launder(): a page that is still in use is moved from the inactive dirty list back to the active list. */
if (PageTestandClearReferenced(page) || page->age > 0 ||   // in this scenario page_count(page) > 1, tested on the next line 
                (!page->buffers && page_count(page) > 1) ||  
                page_ramdisk(page)) {  
            del_page_from_inactive_dirty_list(page);  
            add_page_to_active_list(page);  
            continue;  
}

reclaim_page：

/* Excerpt from reclaim_page(): a page that is still in use is moved from the inactive clean list back to the active list. */
if (PageTestandClearReferenced(page) || page->age > 0 ||  
                (!page->buffers && page_count(page) > 1)) {// in this scenario the reference count is greater than 1
            del_page_from_inactive_clean_list(page);  
            add_page_to_active_list(page);  
            continue;  
}

2、如果reclaim_page已经回收了这个不活跃干净的页面，那么它所有的链表关系都已被清除，但使用计数仍然为1。

由于lookup_swap_cache返回NULL，所以接下来的流程就是我们原来分析过的一篇文章，Linux内核源代码情景分析-内存管理之用户页面的换入。

郑重声明：本站内容如果来自互联网及其他传播媒体，其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享，并不代表本站赞同其观点和对其真实性负责，也不构成任何其他建议。

Linux内核源代码情景分析-交换分区