标题: 2.6版linux kernel缺页处理分析 [打印本页]
作者: linuxarm 时间: 2006-8-13 23:24 标题: 2.6版linux kernel缺页处理分析
2.6版linux kernel缺页处理分析
作者:董磊鋆
先看下几个比较重要的概念:
存储器映射,在分配物理页面时要用到。以下摘自<深入理解linux内核>:
A memory region can be associated with some portion of either a regular file in a disk-based filesystem or a block device file. This means that an access to a byte within a page of the memory region is translated by the kernel into an operation on the corresponding byte of the file. This technique is called memory mapping.
进程地址空间的组成:每个进程都有自己的内存描述符,用结构体mm_struct表示,它指向线性区。所谓线性区,就是进程地址空间中一段连续的虚拟地址范围(通过mmap、brk等方式建立,而不是用vmalloc分配的,vmalloc分配的是内核自己的虚拟地址空间),用结构体vm_area_struct表示。每个mm_struct都指向若干个vm_area_struct,线性区中的虚拟地址再通过页表映射到具体的物理内存。下图是进程内存使用示意图:
正是因为进程看到的都是虚拟地址,而出于效率的考虑,并不是所有虚拟地址都对应有物理内存,所以在对没有对应物理内存的虚拟地址进行读写时,就会产生缺页的问题。
大致原理,大家可以参考<深入理解linux内核>,具体代码和解释如下:
static int
do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{这些参数的意义是:addr缺页中的地址,进程对该地址的读写由于没有对应物理页面而出错。Fsr 是权限的掩码,regs是进入处理时的寄存器集。
struct task_struct tsk;/*指针,用来指向请求缺页处理的进程*/
struct mm_struct *mm;/*进程内存区域结构*/
int fault;
tsk = current;/*指向当前发生页错误的进程*/
mm = tsk->mm;/*获取发生页错误的进程的内存结构
/*
如果是内核进程或者是中断处理,则它们没有线性区,
因而也没有上下文处理,mm为空
*/
if (in_interrupt() || !mm)
goto no_context;
/*获取mm 信号量*/
down_read(&mm->mmap_sem);
fault = __do_page_fault(mm, addr, fsr, tsk);
/*
*加_的函数代表子系统内部使用的,外部不要调用,这里开始就和2.4不一样了
*在这里,请直接看后面的__do_page_fault说明,然后再回过头来看下面的代码。
*/
up_read(&mm->mmap_sem);/*释放mm semaphore*/
/*
* 先处理成功分配的情况,直接返回0
*/
if (fault > 0)
return 0;
/*
*有内存,但是无法分配给请求进程,只能杀死进程
*/
if (fault == 0)/*other error in handle_mm...*/
goto do_sigbus;
/*
* If we are in kernel mode at this point, we
* have no context to handle this fault with.
*/
if (!user_mode(regs))
goto no_context;
if (fault == VM_FAULT_OOM) {
/*没有内存了,杀进程,这是处理缺页的原则之一*/
printk("VM: killing process %s", tsk->comm);
do_exit(SIGKILL);
} else
__do_user_fault(tsk, addr, fsr, fault == VM_FAULT_
BADACCESS ?SEGV_ACCERR : SEGV_MAPERR, regs);
return 0;
/*
* We ran out of memory, or some other thing happened to us that made
* us unable to handle the page fault gracefully.
*/
do_sigbus:
/*
* Send a sigbus, regardless of whether we were in kernel
* or user mode.
*/
tsk->thread.address = addr;
tsk->thread.error_code = fsr;
tsk->thread.trap_no = 14;
force_sig(SIGBUS, tsk);
#ifdef CONFIG_DEBUG_USER
if (user_debug & UDBG_BUS) {
printk(KERN_DEBUG "%s: sigbus at 0x%08lx, pc=0x%08lx\n",
current->comm, addr, instruction_pointer(regs));
}
#endif
/* Kernel mode? Handle exceptions or die */
if (user_mode(regs))
return 0;
no_context:
__do_kernel_fault(mm, addr, fsr, regs);
return 0;
}
作者: linuxarm 时间: 2006-8-13 23:25
static int
__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,struct task_struct *tsk)
{
struct vm_area_struct vma;/*线性区结构*/
int fault, mask;
vma = find_vma(mm, addr);
/*查找缺页地址是不是在线性区的范围内,下面是对find_vma 的解释*/.
/* The find_vma( ) function acts on two parameters: the address mm of a process memory descriptor and a linear address addr. It locates the first memory region whose vm_end field is greater than addr and returns the address of its descriptor; if no such region exists, it returns a NULL pointer.
*/
fault = VM_FAULT_BADMAP;
if (!vma)/*在缺页地址后没有线性区,因此地址不时有效的缺页请求地址*/
goto out;
if (vma->vm_start > addr)
/*如果整个线性区在请求地址的后方,请求地址不在线性区之内,则另外处 理。
*/
goto check_stack;
/*
*到这里,请求缺爷的进程,已经排除内核,中断进程,而且请求的地址 *也在进程的线性区之内。可以按照正常处理。
*/
good_area:
/*判别对缺页的请求权限 */
if (fsr & (1 << 11)) /*写权限?*/
mask = VM_WRITE;
/*prototype in vm_area_struct;将权限掩码设置成写权限*/
else
mask = VM_READ|VM_EXEC;
/*将权限掩码设置成读和执行权限*/
fault = VM_FAULT_BADACCESS;
/*将错误号码预设成VM_FAULT_BADACCESS */
if (!(vma->vm_flags & mask))
/*
vma->vm_flags是线性区的读写权限,如果线性区(整个线性区并不一定 都分配有物理页面)的权限不符合要求的权限,只能退出。
*/
goto out;
/*
* If for any reason at all we couldn't handle
* the fault, make sure we exit gracefully rather
* than endlessly redo the fault.
*/
survive:
/*到这里,请求缺爷的进程,已经排除内核,中断进程,而且请求的地址
*也在进程的线性区之内。在读写权限也正确。则尝试分配物理页面。
*handle_mm_fault( ) function is invoked to allocate a new page frame:
*The handle_mm_fault( ) function returns 1 or 2 if it succeeded in allocating
*a new page frame for the process. The value 1 indicates that the Page Fault
*has been handled without blocking the current process; this kind of Page
*Fault is called minor fault. The value 2 indicates that the Page Fault forced
*the current process to sleep (most likely because time was spent while
*filling the page frame assigned to the process with data read from disk); a
*Page Fault that blocks the current process is called a major fault. The
*function can also returns -1 (for not enough memory) or 0 (for any other *error).
*/
fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, fsr & (1 << 11));
/*
* 先处理正常情况 – 分配成功和除内存用尽之外的错误
* #define VM_FAULT_OOM (-1)
* #define VM_FAULT_SIGBUS 0
* #define VM_FAULT_MINOR 1 no block
* #define VM_FAULT_MAJOR 2 blocked
*/
switch (fault) {
case VM_FAULT_MAJOR:/*页面分配了,但是进程被挂起*/
tsk->maj_flt++;
return fault;/*直接返回执行结果*/
case VM_FAULT_MINOR: :/*页面分配了,进程没有被挂起*/
tsk->min_flt++;
/*不在这里直接返回,在后面返回,这个我觉得在这里返回也一样*/
case VM_FAULT_SIGBUS:
return fault;
/*如果是非内存用尽之外的错误,直接返回执行结果*/
}
if (tsk->pid != 1)/*是否是init进程*/
goto out;
/*
作者: linuxarm 时间: 2006-8-13 23:25
*执行到这里,还有handle_mm_fault 返回-1的情况没有处理。
*VM_FAULT_MAJOR和VM_FAULT_SIGBUS的情况就直接返回了, *VM_FAULT_MINOR时,如果不是init进程请求缺页处理的话,也直 *接返回。
*但是,如果是init进程请求缺页,而且handle_mm_fault返回 *VM_FAULT_MINOR呢?我觉得是通过下面的循环再到 *handle_mm_fault分配一把物理页,但是由于这时候addr已经有物理 *页面,没必要也无法再分配物理页,因此handle_mm_fault会返回 *VM_FAULT_SIGBUS,然后直接return。我觉得这样分析是有问题的, *因为在返回VM_FAULT_SIGBUS到do_page_fault后,进程回被杀死, *Init进程能被杀死吗?
*/
/*
* If we are out of memory for pid1,
* sleep for a while and retry
*/
yield();
goto survive;
check_stack:
if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr))/*expand_stack 0 is ok expand both stack and mm of process*/
goto good_area;
out:
return fault;
}
/*
 * handle_mm_fault - make sure the page-table levels for @address exist and
 * hand the fault over to handle_pte_fault().
 *
 * @mm:           memory descriptor of the faulting task.
 * @vma:          memory region containing @address.
 * @address:      faulting address (the caller passes it masked with
 *                PAGE_MASK).
 * @write_access: non-zero when the faulting access was a write.
 *
 * Returns VM_FAULT_SIGBUS for a hugetlb region, VM_FAULT_OOM when the
 * middle directory or the page table cannot be allocated, and otherwise
 * whatever handle_pte_fault() returns.
 *
 * By the time we get here, we already hold the mm semaphore
 */
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                    unsigned long address, int write_access)
{
    pgd_t *pgd;     /* entry in the page global directory */
    pmd_t *pmd;     /* entry in the page middle directory */

    __set_current_state(TASK_RUNNING);
    pgd = pgd_offset(mm, address);  /* locate the pgd entry for address */

    inc_page_state(pgfault);

    if (is_vm_hugetlb_page(vma))
        return VM_FAULT_SIGBUS;     /* mapping truncation does this. */

    /*
     * We need the page table lock to synchronize with kswapd
     * and the SMP-safe atomic PTE updates.
     */
    spin_lock(&mm->page_table_lock);
    pmd = pmd_alloc(mm, pgd, address);      /* allocate a middle directory */
    if (pmd) {
        pte_t *pte = pte_alloc_map(mm, pmd, address);   /* allocate a page table */

        /*
         * On this path the lock is still held when handle_pte_fault() is
         * entered; its final path calls spin_unlock() itself.
         * NOTE(review): the do_*_page() helpers it may return through are
         * not visible here; presumably they drop the lock too - confirm.
         */
        if (pte)
            return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
    }
    spin_unlock(&mm->page_table_lock);
    return VM_FAULT_OOM;
}
/*
 * handle_pte_fault - resolve a fault once the page-table entry is known.
 *
 * @mm:           memory descriptor of the faulting task.
 * @vma:          memory region containing @address.
 * @address:      faulting address.
 * @write_access: non-zero when the faulting access was a write.
 * @pte:          pointer to the page-table entry for @address.
 * @pmd:          pointer to the middle-directory entry for @address.
 *
 * Dispatches to do_no_page(), do_file_page(), do_swap_page() or
 * do_wp_page() and returns their result. When the page is already present
 * and no write-protect handling is needed, it updates the entry's flags,
 * releases mm->page_table_lock and returns VM_FAULT_MINOR.
 */
static inline int handle_pte_fault(struct mm_struct *mm,
        struct vm_area_struct *vma, unsigned long address,
        int write_access, pte_t *pte, pmd_t *pmd)
{
    pte_t entry;

    entry = *pte;   /* read the page-table entry */
    if (!pte_present(entry)) {
        /* The page is not in physical memory. */

        /* Empty entry: handle as a page that was never mapped. */
        if (pte_none(entry))
            return do_no_page(mm, vma, address, write_access, pte, pmd);

        /*
         * int pte_file(pte_t) is an inline function newly added in 2.6 to
         * tell whether the page is mapped to a file.
         *
         * Return true if the pte is a "file pte". This is where you'll
         * need to use the magical reserved bit to distinguish this from a
         * swapped out pte.
         *   static inline int pte_file(pte_t pte)
         *   { return (pte).pte_low & _PAGE_FILE; }
         *   #define _PAGE_FILE 0x040    (set: pagecache, unset: swap)
         */
        if (pte_file(entry))
            /* file-backed (page cache) page */
            return do_file_page(mm, vma, address, write_access, pte, pmd);

        /* Otherwise the page was swapped out: swap it back in. */
        return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
    }

    if (write_access) {
        /* The fault is an access-permission problem on a write. */

        /*
         * Per the original annotation, pte_write() returns 1 only when
         * both the read and write permissions are set. If the entry is
         * not writable, do the write-protect handling.
         */
        if (!pte_write(entry))
            return do_wp_page(mm, vma, address, pte, pmd, entry);

        entry = pte_mkdirty(entry);     /* set the dirty flag */
    }

    /*
     * The page is in memory: not an empty entry, not swapped out, not a
     * file pte.
     */
    entry = pte_mkyoung(entry);         /* set the accessed flag */
    /*
     * #define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty)
     * do {
     *         set_pte(__ptep, __entry);            write the value into the entry
     *         flush_tlb_page(__vma, __address);    flush the TLB
     * } while (0)
     */
    ptep_set_access_flags(vma, address, pte, entry, write_access);

    /*
     * Update external MMU information; per the original annotation this
     * does nothing on x86 and ARM.
     */
    update_mmu_cache(vma, address, entry);
    pte_unmap(pte);     /* #define pte_unmap(pte) do { } while (0) */
    spin_unlock(&mm->page_table_lock);
    return VM_FAULT_MINOR;
}
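暂时就分析这么点,框图和原理在书上已经有了。关于init分配页返回1的情况,我的确无法解释为什么要这么安排,2.6.11;2.6.12都是这么处理,实在想不通。(补充说明:__do_page_fault的switch中,case VM_FAULT_MINOR在tsk->min_flt++之后没有return或break,会直接落入case VM_FAULT_SIGBUS的return fault,因此handle_mm_fault返回1时函数已经返回,不会执行到后面对init进程的判断;只有返回VM_FAULT_OOM(-1)时才会走到那里,此时非init进程直接返回错误,init进程则yield之后重试。)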
欢迎光临 电子技术论坛_中国专业的电子工程师学习交流社区-中电网技术论坛 (http://bbs.eccn.com/) |
Powered by Discuz! 7.0.0 |