@Author: Gordon Wu
@Time : 2014/12
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) { int ret; char *kernel_type; char *kernel_dir; char *kernel_dev; unsigned long data_page; ret = copy_mount_string(type, &kernel_type); // 把用户空间的挂载类型复制到内核 if (ret < 0) goto out_type; kernel_dir = getname(dir_name); //通过kmem_cache_alloc从names_cachep一个PATH_MAX大小的内核空间,把用户空间的dir_name复制过去 if (IS_ERR(kernel_dir)) { ret = PTR_ERR(kernel_dir); goto out_dir; } ret = copy_mount_string(dev_name, &kernel_dev); //复制挂载的设备过去 if (ret < 0) goto out_dev; ret = copy_mount_options(data, &data_page); if (ret < 0) goto out_data; ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags, (void *) data_page); //数据准备好,开始do_mount free_page(data_page); out_data: kfree(kernel_dev); out_dev: putname(kernel_dir); out_dir: kfree(kernel_type); out_type: return ret; }
验证挂载的选项Flags,以及获取并填充相应挂载目录dir_name的路径Path结构。 再根据挂载选项Flags来判断,挂载的动作。
/* * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to * be given to the mount() call (ie: read-only, no-dev, no-suid etc). * * data is a (void *) that can point to any structure up to * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent * information (or be NULL). * * Pre-0.97 versions of mount() didn't have a flags word. * When the flags word was introduced its top half was required * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. * Therefore, if this magic number is present, it carries no information * and must be discarded. */ long do_mount(char *dev_name, char *dir_name, char *type_page, unsigned long flags, void *data_page) { struct path path; int retval = 0; int mnt_flags = 0; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; /* Basic sanity checks */ if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) //挂载目录的有效性, memchr是判断dir_name是否在0~PAGE_SIZE(用户空间)中 return -EINVAL; if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) mnt_flags |= MNT_RELATIME; /* Separate the per-mountpoint flags */ if (flags & MS_NOSUID) mnt_flags |= MNT_NOSUID; //忽略suid和sgid位的影响 if (flags & MS_NODEV) mnt_flags |= MNT_NODEV; //不允许访问设备专用文件 if (flags & MS_NOEXEC) mnt_flags |= MNT_NOEXEC; //不允许执行程序 if (flags & MS_NOATIME) mnt_flags |= MNT_NOATIME; //下面三个标志是关于是否更新文件或目录的atime if (flags & MS_NODIRATIME) mnt_flags |= MNT_NODIRATIME; if (flags & MS_STRICTATIME) mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) //只读标志 mnt_flags |= MNT_READONLY; //相应的一些标志已经备份到mnt_flags了,flags去除相应位 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | MS_STRICTATIME); /* ... and get the mountpoint */ retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); //!!根据dir_name,获取挂载目录的路径信息path if (retval) return retval; retval = security_sb_mount(dev_name, &path, type_page, flags, data_page); if (retval) goto dput_out; <span style="color:#FF0000;">/*根据不同选项,进行下面五种不同的挂载*/</span> if (flags & MS_REMOUNT) retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) retval = do_loopback(&path, dev_name, flags & MS_REC); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&path, flags); else if (flags & MS_MOVE) retval = do_move_mount(&path, dev_name); else retval = do_new_mount(&path, type_page, flags, mnt_flags, //do_new_mount是最常用的挂载,path: 挂载目录,dev_name:挂载设备 dev_name, data_page); dput_out: path_put(&path); return retval; }
利用do_kern_mount为用户空间生成一个新的挂载,并do_add_mount把新安装加入到命名空间树上
/* * create a new mount for userspace and request it to be added into the * namespace's tree */ // *path: 挂载目录, *name:挂载设备,*type:挂载文件系统类型 static int do_new_mount(struct path *path, char *type, int flags, int mnt_flags, char *name, void *data) { struct vfsmount *mnt; //包含已挂载文件系统的信息。 if (!type) return -EINVAL; /* we need capabilities... */ // root权限,才能挂载 if (!capable(CAP_SYS_ADMIN)) return -EPERM; //内核锁 lock_kernel(); mnt = do_kern_mount(type, flags, name, data); //完成mnt信息的填充。 unlock_kernel(); if (IS_ERR(mnt)) return PTR_ERR(mnt); return do_add_mount(mnt, path, mnt_flags, NULL); //添加到命名空间树上 }
4.1 do_kern_mount函数首先会调用get_fs_type来查看内核是否注册了参数type所指的文件系统,对于内核源码下fs目录下的所有文件系统都会通过调用register_filesystem来注册这个文件系统,其实就是添加到内核文件系统链表中,get_fs_type会将参数type字符串跟内核链表中所有已经注册的文件系统结构体file_system_type的name成员向比较,如果找到,则说明内核已经注册了相应文件系统,并且返回相应文件系统注册的file_system_type结构体。后面的挂载过程需要使用到这个结构体中的成员。[1]
struct vfsmount * do_kern_mount(const char *fstype, int flags, const char *name, void *data) { struct file_system_type *type = get_fs_type(fstype); //获取挂载文件系统的类型结构,见4.2,4.3简介 struct vfsmount *mnt; if (!type) return ERR_PTR(-ENODEV); mnt = vfs_kern_mount(type, flags, name, data); if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && !mnt->mnt_sb->s_subtype) mnt = fs_set_subtype(mnt, fstype); put_filesystem(type); return mnt; }4.2 struct file_system_type结构体,下面的get_sb函数会在下面的挂载过程中用到,用来完成具体文件系统挂载的操作。
struct file_system_type { const char *name; int fs_flags; int (*get_sb) (struct file_system_type *, int, const char *, void *, struct vfsmount *); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; struct list_head fs_supers; struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; struct lock_class_key i_mutex_dir_key; struct lock_class_key i_alloc_sem_key; };4.3 get_fs_type用到的最主要的函数: 就是从已挂载文件系统队列file_systems中查找,返回相应name类型的file_system_type.
static struct file_system_type **find_filesystem(const char *name, unsigned len) { struct file_system_type **p; for (p=&file_systems; *p; p=&(*p)->next) if (strlen((*p)->name) == len && strncmp((*p)->name, name, len) == 0) break; return p; }
具体文件系统file_system_type的get_sb成员函数是关键,它填充vfsmnt结构体的super_block结构体,是写文件系统的第一步。
struct vfsmount * vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { struct vfsmount *mnt; char *secdata = NULL; int error; if (!type) return ERR_PTR(-ENODEV); error = -ENOMEM; mnt = alloc_vfsmnt(name); //alloc_vfsmnt以设备名为参数,为mnt函数分配一个空间,初始化mnt的基本信息,包括mnt->mnt_devname,以及一些list if (!mnt) goto out; if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { secdata = alloc_secdata(); if (!secdata) goto out_mnt; error = security_sb_copy_data(data, secdata); if (error) goto out_free_secdata; } error = type->get_sb(type, flags, name, data, mnt); //调用具体文件系统file_system_type的get_sb函数,填充vfsmnt结构体的super_block结构体 if (error < 0) goto out_free_secdata; BUG_ON(!mnt->mnt_sb); error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); if (error) goto out_sb; /* * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE * but s_maxbytes was an unsigned long long for many releases. Throw * this warning for a little while to try and catch filesystems that * violate this rule. This warning should be either removed or * converted to a BUG() in 2.6.34. */ WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes); mnt->mnt_mountpoint = mnt->mnt_root; // mnt->mnt_parent = mnt; up_write(&mnt->mnt_sb->s_umount); free_secdata(secdata); return mnt; out_sb: dput(mnt->mnt_root); deactivate_locked_super(mnt->mnt_sb); out_free_secdata: free_secdata(secdata); out_mnt: free_vfsmnt(mnt); out: return ERR_PTR(error); }
5.2 ext2_get_sb, 以ext2文件系统为例,通过ext2_fill_super来填充super_block的信息。
static int ext2_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt); }5.3 get_sb_bdev
获取相应块设备的super_block,通过ext2_fill_super来完成super_block信息的填充。
int get_sb_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int), struct vfsmount *mnt) { struct block_device *bdev; //块设备信息 struct super_block *s; fmode_t mode = FMODE_READ; int error = 0; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; bdev = open_bdev_exclusive(dev_name, mode, fs_type); //根据dev_name,去读取块设备信息。 if (IS_ERR(bdev)) return PTR_ERR(bdev); /* * once the super is inserted into the list by sget, s_umount * will protect the lockfs code from trying to start a snapshot * while we are mounting */ mutex_lock(&bdev->bd_fsfreeze_mutex); if (bdev->bd_fsfreeze_count > 0) { mutex_unlock(&bdev->bd_fsfreeze_mutex); error = -EBUSY; goto error_bdev; } s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); //根据bdev,去读取相应的super_block mutex_unlock(&bdev->bd_fsfreeze_mutex); if (IS_ERR(s)) goto error_s; if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { deactivate_locked_super(s); error = -EBUSY; goto error_bdev; } close_bdev_exclusive(bdev, mode); } else { char b[BDEVNAME_SIZE]; s->s_flags = flags; s->s_mode = mode; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); //具体文件系统的fill_super,这里是ext2_fill_super,完成对super_block各个域的初始化 if (error) { deactivate_locked_super(s); goto error; } s->s_flags |= MS_ACTIVE; bdev->bd_super = s; } simple_set_mnt(mnt, s); return 0; error_s: error = PTR_ERR(s); error_bdev: close_bdev_exclusive(bdev, mode); error: return error; }
回到do_new_mount的do_add_mount函数,把mnt挂载到命名空间上。do_add_mount将新挂载的文件系统(由vfsmnt表示)添加到系统的命名空间结构体的已挂载文件系统链表中,命名空间是指系统中以挂载文件系统树,每个进程的PCB中都有namespace成员来表示该进程的命名空间,大多数的进程共享同一个命名空间,所以如果在一个进程中将磁盘挂载到系统中,在另一个进程也是可以看到的,这就是由命名空间来实现的[1]。
* * add a mount into a namespace's mount tree * - provide the option of adding the new mount to an expiration list */ int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags, struct list_head *fslist) { int err; down_write(&namespace_sem); /* Something was mounted here while we slept */ /*挂载目录可能是已挂载的,follow_down可以把path->mnt和path->dentry指向上一层挂载*/ while (d_mountpoint(path->dentry) && follow_down(path)) ; err = -EINVAL; if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) goto unlock; /* Refuse the same filesystem on the same mount point */ err = -EBUSY; if (path->mnt->mnt_sb == newmnt->mnt_sb && path->mnt->mnt_root == path->dentry) goto unlock; err = -EINVAL; if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode)) goto unlock; newmnt->mnt_flags = mnt_flags; if ((err = graft_tree(newmnt, path))) goto unlock; if (fslist) /* add to the specified expiration list */ /*添加*/ list_add_tail(&newmnt->mnt_expire, fslist); up_write(&namespace_sem); return 0; unlock: up_write(&namespace_sem); mntput(newmnt); return err; }