do_fork->copy_process->copy_mm

static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
	struct mm_struct * mm, *oldmm;
	int retval;

	tsk->min_flt = tsk->maj_flt = 0;
	tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
	tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
#endif

	tsk->mm = NULL;
	tsk->active_mm = NULL;

	/*
	 * Are we cloning a kernel thread?
	 *
	 * We need to steal a active VM for that..
	 */
	oldmm = current->mm; 
	if (!oldmm)
		return 0;

	if (clone_flags & CLONE_VM) {
		atomic_inc(&oldmm->mm_users);
		mm = oldmm;
		goto good_mm;
	}

	retval = -ENOMEM;
	mm = dup_mm(tsk);
	if (!mm)
		goto fail_nomem;

good_mm:
	/* Initializing for Swap token stuff */
	mm->token_priority = 0;
	mm->last_interval = 0;
	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
		atomic_inc(&mm->oom_disable_count);

	tsk->mm = mm;
	tsk->active_mm = mm;
	return 0;

fail_nomem:
	return retval;
}
Two points deserve attention in this code. The first is where the mm and active_mm pointers in task_struct end up pointing, which was already discussed in an earlier article. Here we focus on the second point, which is the core of this function:

mm = dup_mm(tsk);

The code of dup_mm is as follows:

/*
 * Allocate a new mm structure and copy contents from the
 * mm structure of the passed in task structure.
 */
struct mm_struct *dup_mm(struct task_struct *tsk)
{
	struct mm_struct *mm, *oldmm = current->mm;
	int err;

	if (!oldmm)
		return NULL;

	mm = allocate_mm();  // allocate an mm_struct from the dedicated slab cache
	if (!mm)
		goto fail_nomem;

	memcpy(mm, oldmm, sizeof(*mm)); // copy the parent's mm_struct into the child's
	mm_init_cpumask(mm);

	/* Initializing for Swap token stuff */
	mm->token_priority = 0;
	mm->last_interval = 0;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	mm->pmd_huge_pte = NULL;
#endif

	if (!mm_init(mm, tsk)) 
		goto fail_nomem;

	if (init_new_context(tsk, mm))
		goto fail_nocontext;

	dup_mm_exe_file(oldmm, mm);

	err = dup_mmap(mm, oldmm);
	if (err)
		goto free_pt;

	mm->hiwater_rss = get_mm_rss(mm);
	mm->hiwater_vm = mm->total_vm;

	if (mm->binfmt && !try_module_get(mm->binfmt->module))
		goto free_pt;

	return mm;

free_pt:
	/* don't put binfmt in mmput, we haven't got module yet */
	mm->binfmt = NULL;
	mmput(mm);

fail_nomem:
	return NULL;

fail_nocontext:
	/*
	 * If init_new_context() failed, we cannot use mmput() to free the mm
	 * because it calls destroy_context()
	 */
	mm_free_pgd(mm);
	free_mm(mm);
	return NULL;
}
Let's first look at the key operation in this function, mm_init(mm, tsk):

static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
{
	atomic_set(&mm->mm_users, 1);
	atomic_set(&mm->mm_count, 1);
	init_rwsem(&mm->mmap_sem);
	INIT_LIST_HEAD(&mm->mmlist);
	mm->flags = (current->mm) ?
		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
	mm->core_state = NULL;
	mm->nr_ptes = 0;
	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
	spin_lock_init(&mm->page_table_lock);
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	mm->cached_hole_size = ~0UL;
	mm_init_aio(mm);
	mm_init_owner(mm, p);
	atomic_set(&mm->oom_disable_count, 0);  

        /* the code above initializes mm's data members */
        /* the key part is mm_alloc_pgd(mm) below */

	if (likely(!mm_alloc_pgd(mm))) {  // allocate a pgd and copy the kernel-space entries into it
		mm->def_flags = 0;
		mmu_notifier_mm_init(mm);
		return mm;
	}

	free_mm(mm);
	return NULL;
}
The key operation is mm_alloc_pgd(mm), which can be traced down to pgd_alloc:

pgd_t *pgd_alloc(struct mm_struct *mm)
{
  /* PAE is not enabled here, so none of the pmd-related operations concern us */
	pgd_t *pgd;
	pmd_t *pmds[PREALLOCATED_PMDS];

	pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); // allocate one page for the pgd

	if (pgd == NULL)
		goto out;

	mm->pgd = pgd;  // point mm->pgd at the newly allocated pgd page

	if (preallocate_pmds(pmds) != 0)
		goto out_free_pgd;

	if (paravirt_pgd_alloc(mm) != 0)
		goto out_free_pmds;

	/*
	 * Make sure that pre-populating the pmds is atomic with
	 * respect to anything walking the pgd_list, so that they
	 * never see a partially populated pgd.
	 */
	spin_lock(&pgd_lock);

	pgd_ctor(mm, pgd);   // key code: copy the page-directory entries that map kernel space into the corresponding entries of the new pgd
	pgd_prepopulate_pmd(mm, pgd, pmds);

	spin_unlock(&pgd_lock);

	return pgd;

out_free_pmds:
	free_pmds(pmds);
out_free_pgd:
	free_page((unsigned long)pgd);
out:
	return NULL;
}
The key part of this function is

	pgd_ctor(mm, pgd);   // key code: copy the page-directory entries that map kernel space into the corresponding entries of the new pgd

Let's take a look at it:
static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
	/* If the pgd points to a shared pagetable level (either the
	   ptes in non-PAE, or shared PMD in PAE), then just copy the
	   references from swapper_pg_dir. */
	if (PAGETABLE_LEVELS == 2 ||
	    (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
	    PAGETABLE_LEVELS == 4) {
		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,    // the entries that map kernel space
				swapper_pg_dir + KERNEL_PGD_BOUNDARY,  
				KERNEL_PGD_PTRS);
	}

	/* list required to sync kernel mapping updates */
	if (!SHARED_KERNEL_PMD) {
		pgd_set_mm(pgd, mm);
		pgd_list_add(pgd);
	}
}
The key code is:

clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,    // the entries that map kernel space
				swapper_pg_dir + KERNEL_PGD_BOUNDARY,  
				KERNEL_PGD_PTRS);

The source of clone_pgd_range:

/*
 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
 *
 *  dst - pointer to pgd range anwhere on a pgd page
 *  src - ""
 *  count - the number of pgds to copy.
 *
 * dst and src can be on the same page, but the range must not overlap,
 * and must not cross a page boundary.
 */
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
       memcpy(dst, src, count * sizeof(pgd_t));
}

Now look at the arguments passed to clone_pgd_range.

The first argument:

pgd + KERNEL_PGD_BOUNDARY

pgd is the base address of the newly allocated page directory. Tracing the macros shows that KERNEL_PGD_BOUNDARY = 768 (on x86-32 with the 3G/1G split), so this points at entry 768 of the pgd, which is exactly where kernel space begins.
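For reference, a minimal sketch of the arithmetic (illustration only; the constants below assume x86-32 without PAE, where each page-directory entry maps 4 MB; in the kernel the real values come from pgd_index(PAGE_OFFSET) and PTRS_PER_PGD):

/*
 * User-space sketch, not kernel code: compute where kernel space starts
 * in the page directory for a 3G/1G split on x86-32 without PAE.
 */
#include <stdio.h>

#define PAGE_OFFSET   0xC0000000UL   /* start of kernel space */
#define PGDIR_SHIFT   22             /* each pgd entry maps 4 MB */
#define PTRS_PER_PGD  1024

int main(void)
{
	unsigned long kernel_pgd_boundary = PAGE_OFFSET >> PGDIR_SHIFT;       /* = 768 */
	unsigned long kernel_pgd_ptrs = PTRS_PER_PGD - kernel_pgd_boundary;   /* = 256 */

	printf("KERNEL_PGD_BOUNDARY = %lu\n", kernel_pgd_boundary);
	printf("KERNEL_PGD_PTRS     = %lu\n", kernel_pgd_ptrs);
	return 0;
}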

The second argument:

swapper_pg_dir + KERNEL_PGD_BOUNDARY

First, what is swapper_pg_dir? (For a detailed explanation see: http://blog.csdn.net/sunnybeike/article/details/6897819)

swapper_pg_dir is simply a page directory. It is loaded into cr3 during kernel initialization to describe the initial memory mappings, and after the init process starts it becomes the page directory of the idle kernel thread: /sbin/init is exec'ed by a kernel thread named init, which was itself created by do_fork from the original kernel context (the later idle thread), and do_fork allocates a fresh page directory for each new process. So swapper_pg_dir is used directly only by idle and by kernel threads.

Its more important role, however, is to serve as the template for the kernel-space mappings. In Linux, kernel space is shared by all processes: every process uses the same kernel mappings. So whenever a new process is created, the entries of swapper_pg_dir from index 768 onward are copied into the new process's page directory from index 768 onward, representing kernel space. In addition, when virtual memory above 3G+896M is manipulated, only the mappings in swapper_pg_dir are updated; when another process later touches those addresses it takes a page fault, and the fault handler synchronizes its page directory with swapper_pg_dir.

So the second argument points at entry 768 of the master page-directory template, i.e. at the page-directory entries that map kernel space.

The third argument:

		KERNEL_PGD_PTRS

As the name suggests, this is the number of kernel page-directory entries (1024 - 768 = 256 on x86-32 with the 3G/1G split).

So the whole purpose of clone_pgd_range here is to initialize the kernel-space part of the new page directory.

Returning to dup_mm, the next important function is:

/*
 * we do not have to muck with descriptors here, that is
 * done in switch_mm() as needed.
 */
int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
{
	struct mm_struct *old_mm;
	int retval = 0;

	mutex_init(&mm->context.lock); // initialize the context lock
	mm->context.size = 0;
	old_mm = current->mm;
	if (old_mm && old_mm->context.size > 0) {
		mutex_lock(&old_mm->context.lock);
		retval = copy_ldt(&mm->context, &old_mm->context);
		mutex_unlock(&old_mm->context.lock);
	}
	return retval;
}
Clearly, the important operation here is:

copy_ldt(&mm->context, &old_mm->context);

However, an LDT only exists in special cases such as VM86 mode (or processes that call modify_ldt()), so we will not look into this operation further.

Now look at the next function:

static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
{
	/* It's safe to write the exe_file pointer without exe_file_lock because
	 * this is called during fork when the task is not yet in /proc */
	newmm->exe_file = get_mm_exe_file(oldmm);
}

First, the exe_file field. It is actually defined in mm_struct (not task_struct), as follows:

    /* store ref to file /proc/<pid>/exe symlink points to */    
     struct file *exe_file;

As the comment says, it points to the executable file of the process. So dup_mm_exe_file simply makes the child's mm->exe_file reference the parent's executable file.
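As a small illustration, the file referenced by mm->exe_file is what the /proc/<pid>/exe symlink resolves to. A user-space sketch (not kernel code) that reads it for the current process:

/* User-space sketch: /proc/self/exe resolves to the executable tracked by mm->exe_file. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[4096];
	ssize_t len = readlink("/proc/self/exe", path, sizeof(path) - 1);

	if (len < 0) {
		perror("readlink");
		return 1;
	}
	path[len] = '\0';
	printf("running executable: %s\n", path);
	return 0;
}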

The last important function is dup_mmap:

static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
	struct rb_node **rb_link, *rb_parent;
	int retval;
	unsigned long charge;
	struct mempolicy *pol;

	down_write(&oldmm->mmap_sem);
	flush_cache_dup_mm(oldmm);
	/*
	 * Not linked in yet - no deadlock potential:
	 */
	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);

	mm->locked_vm = 0;
	mm->mmap = NULL;
	mm->mmap_cache = NULL;
	mm->free_area_cache = oldmm->mmap_base;
	mm->cached_hole_size = ~0UL;
	mm->map_count = 0;
	cpumask_clear(mm_cpumask(mm));
	mm->mm_rb = RB_ROOT;
	rb_link = &mm->mm_rb.rb_node;
	rb_parent = NULL;
	pprev = &mm->mmap;
	retval = ksm_fork(mm, oldmm);
	if (retval)
		goto out;
	retval = khugepaged_fork(mm, oldmm);
	if (retval)
		goto out;

	prev = NULL;
	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {  // loop over the parent's vm_area_structs and copy them along with their page mappings
		struct file *file;

		if (mpnt->vm_flags & VM_DONTCOPY) {  //do not copy this vm on fork.
			long pages = vma_pages(mpnt); // number of pages covered by this vma
			mm->total_vm -= pages;   // subtract the uncopied pages from the child's total
			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
								-pages); // adjust the per-type VM statistics (exec/stack/shared/reserved) for the skipped pages
			continue;
		}
		charge = 0;
		if (mpnt->vm_flags & VM_ACCOUNT) {
			unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
			if (security_vm_enough_memory(len))
				goto fail_nomem;
			charge = len;
		}
		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
		if (!tmp)
			goto fail_nomem;
		*tmp = *mpnt;  // copy the entire vm_area_struct of the current vma
		INIT_LIST_HEAD(&tmp->anon_vma_chain);
		pol = mpol_dup(vma_policy(mpnt));
		retval = PTR_ERR(pol);
		if (IS_ERR(pol))
			goto fail_nomem_policy;
		vma_set_policy(tmp, pol);
		tmp->vm_mm = mm;
		if (anon_vma_fork(tmp, mpnt))
			goto fail_nomem_anon_vma_fork;
		tmp->vm_flags &= ~VM_LOCKED;
		tmp->vm_next = tmp->vm_prev = NULL;
		file = tmp->vm_file;
		if (file) {
			struct inode *inode = file->f_path.dentry->d_inode;
			struct address_space *mapping = file->f_mapping;

			get_file(file);
			if (tmp->vm_flags & VM_DENYWRITE)
				atomic_dec(&inode->i_writecount);
			mutex_lock(&mapping->i_mmap_mutex);
			if (tmp->vm_flags & VM_SHARED)
				mapping->i_mmap_writable++;
			flush_dcache_mmap_lock(mapping);
			/* insert tmp into the share list, just after mpnt */
			vma_prio_tree_add(tmp, mpnt);
			flush_dcache_mmap_unlock(mapping);
			mutex_unlock(&mapping->i_mmap_mutex);
		}

		/*
		 * Clear hugetlb-related page reserves for children. This only
		 * affects MAP_PRIVATE mappings. Faults generated by the child
		 * are not guaranteed to succeed, even if read-only
		 */
		if (is_vm_hugetlb_page(tmp))
			reset_vma_resv_huge_pages(tmp);

		/*
		 * Link in the new vma and copy the page table entries.
		 */
		*pprev = tmp;
		pprev = &tmp->vm_next;
		tmp->vm_prev = prev;
		prev = tmp;

		__vma_link_rb(mm, tmp, rb_link, rb_parent);
		rb_link = &tmp->vm_rb.rb_right;
		rb_parent = &tmp->vm_rb;

		mm->map_count++;
		retval = copy_page_range(mm, oldmm, mpnt);  // this call is the one that deserves the most attention in the whole loop

		if (tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		if (retval)
			goto out;
	}
	/* a new mm has just been created */
	arch_dup_mmap(oldmm, mm);
	retval = 0;
out:
	up_write(&mm->mmap_sem);
	flush_tlb_mm(oldmm);
	up_write(&oldmm->mmap_sem);
	return retval;
fail_nomem_anon_vma_fork:
	mpol_put(pol);
fail_nomem_policy:
	kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
	retval = -ENOMEM;
	vm_unacct_memory(charge);
	goto out;
}
Now let's look at copy_page_range:

This function walks and copies the mappings level by level, from page-directory entries down to page-table entries.

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
		if (!vma->anon_vma)
			return 0;
	}

	if (is_vm_hugetlb_page(vma))  // huge pages (supported by some architectures) take a separate path, which we do not need here
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(is_pfn_mapping(vma))) {  // raw PFN mappings are also not our concern here
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_vma_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	if (is_cow_mapping(vma->vm_flags))  
		mmu_notifier_invalidate_range_start(src_mm, addr, end);

       /* the code below is what we really need to pay attention to */
	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);   // page-directory entry for addr in dst_mm's page directory
	src_pgd = pgd_offset(src_mm, addr);   // page-directory entry for addr in src_mm's page directory
	do {
		next = pgd_addr_end(addr, end);  // address where the next page-directory entry begins (clamped to end)
		if (pgd_none_or_clear_bad(src_pgd)) // nothing to do if the mapping is not present; a bad entry is cleared
			continue;
		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,  
					    vma, addr, next))) {  // the key operation: copy the page tables this directory entry points to
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);  // keep iterating until all directory entries covering this vma are done

	if (is_cow_mapping(vma->vm_flags))
		mmu_notifier_invalidate_range_end(src_mm,
						  vma->vm_start, end);
	return ret;
}
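As a side note, here is a minimal sketch (illustration only; constants assume the classic x86-32 two-level layout without PAE) of how a virtual address splits into the indices that this walk iterates over:

/* User-space sketch: decompose a 32-bit virtual address into pgd index, pte index and page offset. */
#include <stdio.h>

#define PGDIR_SHIFT  22
#define PAGE_SHIFT   12
#define PTRS_PER_PTE 1024

int main(void)
{
	unsigned long addr = 0x080481a4UL;	/* arbitrary example address */

	unsigned long pgd_index = addr >> PGDIR_SHIFT;                        /* bits 31..22 */
	unsigned long pte_index = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);  /* bits 21..12 */
	unsigned long offset = addr & ((1UL << PAGE_SHIFT) - 1);              /* bits 11..0  */

	printf("pgd index = %lu, pte index = %lu, page offset = 0x%lx\n",
	       pgd_index, pte_index, offset);
	return 0;
}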
copy_pud_range handles the next paging level: it walks the PUD entries and calls down through copy_pmd_range to copy_pte_range, which copies the actual page-table entries. On x86-32 without PAE the PUD and PMD levels are folded away, so this effectively steps straight from the page directory to the page tables:

/* copying the lower levels is structured just like copying the page directory */
static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

copy_mm is thus essentially the copying of the virtual-memory page directory and page tables. It touches on memory management, and many details have not been covered clearly here; I will revisit them in detail after studying memory management.

At this point we know that copy_mm does the following:

1. Copy the kernel-space page-directory entries from swapper_pg_dir.

2. Copy the user-space page-directory entries and page tables from the parent. Logically this would also mean copying the memory pages themselves, but thanks to copy-on-write not a single page is actually copied: the page-table entries for writable user pages are marked read-only in both parent and child, while read-only user pages are simply shared.
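A minimal user-space sketch illustrating the effect of this copy-on-write behaviour: the child's write takes a write fault and gets its own private copy of the page, while the parent keeps seeing the original value.

/* User-space sketch: demonstrate that a write after fork() only affects the writer's copy. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int value = 42;
	pid_t pid = fork();

	if (pid < 0) {
		perror("fork");
		exit(EXIT_FAILURE);
	}

	if (pid == 0) {			/* child */
		value = 100;		/* write fault: the page is copied for the child */
		printf("child:  value = %d\n", value);	/* prints 100 */
		_exit(0);
	}

	waitpid(pid, NULL, 0);
	printf("parent: value = %d\n", value);		/* still prints 42 */
	return 0;
}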
