linux内核mount系统调用源码分析

@Author:  Gordon Wu

@Time : 2014/12

0.摘要

mount是Linux很常见的命令,本文将从用户空间的命令行开始,一步一步切入到内核的源代码,解释一个文件系统是如果挂载的。本文基于linux 2.6.32

1.SYSCALL_DEFINE5, 系统调用

Linux kernel通过系统调用的方式为用户提供陷入到内核,mount的系统调用是SYSCALL_DEFINE5,位于fs/namespace.c:
主要完成把一些安装信息从用户空间拷贝到内核空间, 见代码的中文注释。
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	int ret;
	char *kernel_type;
	char *kernel_dir;
	char *kernel_dev;
	unsigned long data_page;

	ret = copy_mount_string(type, &kernel_type);       // 把用户空间的挂载类型复制到内核
	if (ret < 0)
		goto out_type;

	kernel_dir = getname(dir_name);                    //通过kmem_cache_alloc从names_cachep一个PATH_MAX大小的内核空间,把用户空间的dir_name复制过去
	if (IS_ERR(kernel_dir)) {
		ret = PTR_ERR(kernel_dir);
		goto out_dir;
	}

	ret = copy_mount_string(dev_name, &kernel_dev);     //复制挂载的设备过去
	if (ret < 0)
		goto out_dev;

	ret = copy_mount_options(data, &data_page);
	if (ret < 0)
		goto out_data;

	ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags,   
		(void *) data_page);                           //数据准备好,开始do_mount

	free_page(data_page);
out_data:
	kfree(kernel_dev);
out_dev:
	putname(kernel_dir);
out_dir:
	kfree(kernel_type);
out_type:
	return ret;
}

2. do_mount

验证挂载的选项Flags,以及获取并填充相应挂载目录dir_name的路径Path结构。 再根据挂载选项Flags来判断,挂载的动作。

/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
long do_mount(char *dev_name, char *dir_name, char *type_page,
		  unsigned long flags, void *data_page)
{
	struct path path;
	int retval = 0;
	int mnt_flags = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks */

	if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))           //挂载目录的有效性, memchr是判断dir_name是否在0~PAGE_SIZE(用户空间)中
		return -EINVAL;                                                  

	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

	/* Default to relatime unless overriden */
	if (!(flags & MS_NOATIME))
		mnt_flags |= MNT_RELATIME;

	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;                                           //忽略suid和sgid位的影响
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;                                         //不允许访问设备专用文件
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;                                    //不允许执行程序
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;                                  //下面三个标志是关于是否更新文件或目录的atime
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;
	if (flags & MS_STRICTATIME)
		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
	if (flags & MS_RDONLY)                                            //只读标志
		mnt_flags |= MNT_READONLY;
                                                                            //相应的一些标志已经备份到mnt_flags了,flags去除相应位
	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
		   MS_STRICTATIME);
 
	/* ... and get the mountpoint */
	retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);                  //!!根据dir_name,获取挂载目录的路径信息path
	if (retval)
		return retval;

	retval = security_sb_mount(dev_name, &path,
				   type_page, flags, data_page);
	if (retval)
		goto dput_out;
        <span style="color:#FF0000;">/*根据不同选项,进行下面五种不同的挂载*/</span>
	if (flags & MS_REMOUNT)
		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
				    data_page);
	else if (flags & MS_BIND)
		retval = do_loopback(&path, dev_name, flags & MS_REC);
	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		retval = do_change_type(&path, flags);
	else if (flags & MS_MOVE)
		retval = do_move_mount(&path, dev_name);
	else
		retval = do_new_mount(&path, type_page, flags, mnt_flags,                //do_new_mount是最常用的挂载,path: 挂载目录,dev_name:挂载设备
				      dev_name, data_page);
dput_out:
	path_put(&path);
	return retval;
}

3. do_new_mount

利用do_kern_mount为用户空间生成一个新的挂载,并do_add_mount把新安装加入到命名空间树上

/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 */  // *path: 挂载目录, *name:挂载设备,*type:挂载文件系统类型
static int do_new_mount(struct path *path, char *type, int flags,           
			int mnt_flags, char *name, void *data)
{
	struct vfsmount *mnt;                  //包含已挂载文件系统的信息。

	if (!type)
		return -EINVAL;

	/* we need capabilities... */           // root权限,才能挂载
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

        //内核锁
	lock_kernel();
	mnt = do_kern_mount(type, flags, name, data);        //完成mnt信息的填充。
	unlock_kernel();
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

	return do_add_mount(mnt, path, mnt_flags, NULL);     //添加到命名空间树上
}

4. do_kern_mount

4.1 do_kern_mount函数首先会调用get_fs_type来查看内核是否注册了参数type所指的文件系统,对于内核源码下fs目录下的所有文件系统都会通过调用register_filesystem来注册这个文件系统,其实就是添加到内核文件系统链表中,get_fs_type会将参数type字符串跟内核链表中所有已经注册的文件系统结构体file_system_type的name成员向比较,如果找到,则说明内核已经注册了相应文件系统,并且返回相应文件系统注册的file_system_type结构体。后面的挂载过程需要使用到这个结构体中的成员。[1]

struct vfsmount *
do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{
	struct file_system_type *type = get_fs_type(fstype);    //获取挂载文件系统的类型结构,见4.2,4.3简介
	struct vfsmount *mnt;
	if (!type)
		return ERR_PTR(-ENODEV);
	mnt = vfs_kern_mount(type, flags, name, data);           
	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
	    !mnt->mnt_sb->s_subtype)
		mnt = fs_set_subtype(mnt, fstype);
	put_filesystem(type);
	return mnt;
}
4.2 struct file_system_type结构体,下面的get_sb函数会在下面的挂载过程中用到,用来完成具体文件系统挂载的操作。

struct file_system_type {
	const char *name;
	int fs_flags;
	int (*get_sb) (struct file_system_type *, int,
		       const char *, void *, struct vfsmount *);
	void (*kill_sb) (struct super_block *);
	struct module *owner;
	struct file_system_type * next;
	struct list_head fs_supers;

	struct lock_class_key s_lock_key;
	struct lock_class_key s_umount_key;

	struct lock_class_key i_lock_key;
	struct lock_class_key i_mutex_key;
	struct lock_class_key i_mutex_dir_key;
	struct lock_class_key i_alloc_sem_key;
};
4.3 get_fs_type用到的最主要的函数: 就是从已挂载文件系统队列file_systems中查找,返回相应name类型的file_system_type.

static struct file_system_type **find_filesystem(const char *name, unsigned len)
{
	struct file_system_type **p;
	for (p=&file_systems; *p; p=&(*p)->next)
		if (strlen((*p)->name) == len &&
		    strncmp((*p)->name, name, len) == 0)
			break;
	return p;
}


5. vfs_kern_mount

具体文件系统file_system_type的get_sb成员函数是关键,它填充vfsmnt结构体的super_block结构体,是写文件系统的第一步。

struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct vfsmount *mnt;
	char *secdata = NULL;
	int error;

	if (!type)
		return ERR_PTR(-ENODEV);

	error = -ENOMEM;
	mnt = alloc_vfsmnt(name);      //alloc_vfsmnt以设备名为参数,为mnt函数分配一个空间,初始化mnt的基本信息,包括mnt->mnt_devname,以及一些list
	if (!mnt)
		goto out;

	if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
		secdata = alloc_secdata();
		if (!secdata)
			goto out_mnt;

		error = security_sb_copy_data(data, secdata);
		if (error)
			goto out_free_secdata;
	}

	error = type->get_sb(type, flags, name, data, mnt);      //调用具体文件系统file_system_type的get_sb函数,填充vfsmnt结构体的super_block结构体
	if (error < 0)
		goto out_free_secdata;
	BUG_ON(!mnt->mnt_sb);

 	error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
 	if (error)
 		goto out_sb;

	/*
	 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
	 * but s_maxbytes was an unsigned long long for many releases. Throw
	 * this warning for a little while to try and catch filesystems that
	 * violate this rule. This warning should be either removed or
	 * converted to a BUG() in 2.6.34.
	 */
	WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
		"negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes);

	mnt->mnt_mountpoint = mnt->mnt_root;                   //
	mnt->mnt_parent = mnt;
	up_write(&mnt->mnt_sb->s_umount);
	free_secdata(secdata);
	return mnt;
out_sb:
	dput(mnt->mnt_root);
	deactivate_locked_super(mnt->mnt_sb);
out_free_secdata:
	free_secdata(secdata);
out_mnt:
	free_vfsmnt(mnt);
out:
	return ERR_PTR(error);
}

5.2 ext2_get_sb, 以ext2文件系统为例,通过ext2_fill_super来填充super_block的信息。

static int ext2_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
	return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt);
}
5.3  get_sb_bdev

获取相应块设备的super_block,通过ext2_fill_super来完成super_block信息的填充。

int get_sb_bdev(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data,
	int (*fill_super)(struct super_block *, void *, int),
	struct vfsmount *mnt)
{
	struct block_device *bdev;          //块设备信息
	struct super_block *s;
	fmode_t mode = FMODE_READ;
	int error = 0;

	if (!(flags & MS_RDONLY))
		mode |= FMODE_WRITE;

	bdev = open_bdev_exclusive(dev_name, mode, fs_type);    //根据dev_name,去读取块设备信息。
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	/*
	 * once the super is inserted into the list by sget, s_umount
	 * will protect the lockfs code from trying to start a snapshot
	 * while we are mounting
	 */
	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (bdev->bd_fsfreeze_count > 0) {
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		error = -EBUSY;
		goto error_bdev;
	}
	s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);     //根据bdev,去读取相应的super_block
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	if (IS_ERR(s))
		goto error_s;

	if (s->s_root) {
		if ((flags ^ s->s_flags) & MS_RDONLY) {
			deactivate_locked_super(s);
			error = -EBUSY;
			goto error_bdev;
		}

		close_bdev_exclusive(bdev, mode);
	} else {
		char b[BDEVNAME_SIZE];

		s->s_flags = flags;
		s->s_mode = mode;
		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
		sb_set_blocksize(s, block_size(bdev));
		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);        //具体文件系统的fill_super,这里是ext2_fill_super,完成对super_block各个域的初始化
		if (error) {
			deactivate_locked_super(s);
			goto error;
		}

		s->s_flags |= MS_ACTIVE;
		bdev->bd_super = s;
	}

	simple_set_mnt(mnt, s);
	return 0;

error_s:
	error = PTR_ERR(s);
error_bdev:
	close_bdev_exclusive(bdev, mode);
error:
	return error;
}

6.do_new_mount->do_add_mount

回到do_new_mount的do_add_mount函数,把mnt挂载到命名空间上。do_add_mount将新挂载的文件系统(由vfsmnt表示)添加到系统的命名空间结构体的已挂载文件系统链表中,命名空间是指系统中以挂载文件系统树,每个进程的PCB中都有namespace成员来表示该进程的命名空间,大多数的进程共享同一个命名空间,所以如果在一个进程中将磁盘挂载到系统中,在另一个进程也是可以看到的,这就是由命名空间来实现的[1]

*
 * add a mount into a namespace's mount tree
 * - provide the option of adding the new mount to an expiration list
 */
int do_add_mount(struct vfsmount *newmnt, struct path *path,
		 int mnt_flags, struct list_head *fslist)
{
	int err;

	down_write(&namespace_sem);
	/* Something was mounted here while we slept */             /*挂载目录可能是已挂载的,follow_down可以把path->mnt和path->dentry指向上一层挂载*/
	while (d_mountpoint(path->dentry) &&
	       follow_down(path))
		;
	err = -EINVAL;
	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
		goto unlock;

	/* Refuse the same filesystem on the same mount point */
	err = -EBUSY;
	if (path->mnt->mnt_sb == newmnt->mnt_sb &&
	    path->mnt->mnt_root == path->dentry)
		goto unlock;

	err = -EINVAL;
	if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
		goto unlock;

	newmnt->mnt_flags = mnt_flags;
	if ((err = graft_tree(newmnt, path)))
		goto unlock;

	if (fslist) /* add to the specified expiration list */                  /*添加*/
		list_add_tail(&newmnt->mnt_expire, fslist);

	up_write(&namespace_sem);
	return 0;

unlock:
	up_write(&namespace_sem);
	mntput(newmnt);
	return err;
}

参考博文: [1]:  http://blog.csdn.net/skyflying2012/article/details/9748133 

你可能感兴趣的:(kernel,mount,FileSystem)