Linux虚拟文件系统(VFS)是位于设备驱动程序之上的一个抽象层,致力于为应用程序提供统一的文件操作接口。虚拟文件系统中各个数据结构之间的关系比较复杂,我画了一张各数据结构之间的关系图,放在 http://download.csdn.net/detail/lonewolfxw/4588935 ,该图清晰地给出了各结构之间的关系。
Linux虚拟文件系统最核心的结构是dentry缓存。每次查找一个路径时,先在dentry缓存中查找是否有对应的项。例如执行cd /home/lonewolf时,先解析目录结构,查找home对应的dentry是否在缓存中,若不在则从底层设备中读取;然后查找lonewolf对应的dentry,依次进行,找到了dentry也就找到了文件对应的inode。因此,dentry缓存与文件目录树相同,按照树的方式组织,并且将已经得到的dentry项加入dentry_hashtable缓存起来,避免每次都从低速的底层设备中读取,哈希表用来快速地查找。当然,这只是一个简略的查找过程,实际的过程相当复杂,需要处理很多情况,包括权限检查、特殊目录'.'和'..'、装载点以及符号链接等,下面结合link_path_walk的代码说明:
/*
* Name resolution.
* This is the basic name resolution function, turning a pathname into
* the final dentry. We expect 'base' to be positive and a directory.
*
* Returns 0 and nd will have valid dentry and mnt on success.
* Returns error and drops reference to input namei data on failure.
*
* @name: pathname to resolve; leading '/' characters are skipped.
* @nd:   lookup state. On entry it describes the starting point of the walk;
*        on success nd->last / nd->last_type describe the final component,
*        which is NOT walked here but left for the caller.
*
* Note (from the original annotation, not visible in this excerpt): the
* caller initialises @nd before calling - from fs_struct->root when the
* pathname starts with '/', otherwise from fs_struct->pwd (the current
* working directory of the process).
*
* If @name consists only of '/' characters the function returns 0 at once.
*/
static int link_path_walk(const char *name, struct nameidata *nd)
{
struct path next;
int err;
/* Skip any leading slashes, then parse the real path components. */
while (*name=='/')
name++;
if (!*name)
return 0;
/* At this point we know we have a real path component. */
for(;;) {
unsigned long hash;
struct qstr this;
unsigned int c;
int type;
/*
* Permission check before descending. Presumably verifies search
* (execute) permission on the directory in @nd - confirm against
* may_lookup(), which is not shown here.
*/
err = may_lookup(nd);
if (err)
break;
/* Parse the next component; 'this' receives its name, length and hash. */
this.name = name;
c = *(const unsigned char *)name;
hash = init_name_hash();
do {
name++;
/* Fold the component into the hash one character at a time. */
hash = partial_name_hash(c, hash);
c = *(const unsigned char *)name;
/* Stop at end of string or at the '/' that starts the next component. */
} while (c && (c != '/'));
this.len = name - (const char *) this.name;
this.hash = end_name_hash(hash);
/* 'type' classifies the component so that "." and ".." get special handling. */
type = LAST_NORM;
if (this.name[0] == '.') switch (this.len) {
case 2:
if (this.name[1] == '.') {
type = LAST_DOTDOT;
nd->flags |= LOOKUP_JUMPED;
}
break;
case 1:
type = LAST_DOT;
}
if (likely(type == LAST_NORM)) {
struct dentry *parent = nd->path.dentry;
nd->flags &= ~LOOKUP_JUMPED;
/* Let the parent's d_hash operation rewrite the hash when it provides one. */
if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
err = parent->d_op->d_hash(parent, nd->inode,
&this);
if (err < 0)
break;
}
}
/*
* If the string has ended, or only '/' characters remain after this
* component, then this is the last component of the path. It is not
* passed through walk_component() here (it need not be a directory);
* control jumps to last_component and the caller of link_path_walk()
* deals with the final component.
*/
if (!c)
goto last_component;
while (*++name == '/');
if (!*name)
goto last_component;
/*
* Resolve the intermediate component relative to the current position.
* walk_component() returns a negative error, 0 on success (with @nd
* advanced to the component just resolved, which then serves as the
* parent for the next iteration), or 1 when the component is a symbolic
* link that must be followed.
* Per the original annotation (details not visible in this excerpt): the
* lookup consults dentry_hashtable keyed by the parent dentry and
* this.hash, falls back to inode_operations->lookup on a miss, and
* follows mount points up to the topmost mounted filesystem.
*/
err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
if (err < 0)
return err;
if (err) {
/* err == 1: the component is a symbolic link; follow it. */
err = nested_symlink(&next, nd);
if (err)
return err;
}
/*
* Every component except the last must be something we can look up
* inside (presumably a directory - confirm against can_lookup());
* otherwise fail with -ENOTDIR. The last component never reaches this
* check because it jumps straight to last_component above.
*/
if (can_lookup(nd->inode))
continue;
err = -ENOTDIR;
break;
/* here ends the main loop */
last_component:
/* Hand the unresolved final component back to the caller. */
nd->last = this;
nd->last_type = type;
return 0;
}
/* Reached only via 'break': release the walk state and report the error. */
terminate_walk(nd);
return err;
}
static inline int walk_component(struct nameidata *nd, struct path *path, struct qstr *name, int type, int follow) { struct inode *inode; int err; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */ if (unlikely(type != LAST_NORM)) return handle_dots(nd, type); /*处理目录是'.'和'..’的情况,'.'很好处理,直接跳过就可以了,'..'稍微麻烦,因为当前目录有可能是一个装载点,跳到上一级目录就要切换文件系统*/ err = do_lookup(nd, name, path, &inode);/*这个从dentry缓存中查找,找不到就从底层设备中找,并且会处理装载点的情况*/ if (unlikely(err)) { terminate_walk(nd); return err; } if (!inode) {//没有找到dentry,则表示文件不存在 path_to_nameidata(path, nd); terminate_walk(nd); return -ENOENT; } if (should_follow_link(inode, follow)) {//检查是否要跟踪符号链接,若是返回1,有nested_symlink处理 if (nd->flags & LOOKUP_RCU) { if (unlikely(unlazy_walk(nd, path->dentry))) { terminate_walk(nd); return -ECHILD; } } BUG_ON(inode != path->dentry->d_inode); return 1; } path_to_nameidata(path, nd);//将找到的path放到nd中返回 nd->inode = inode; return 0; }
static inline int handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { if (nd->flags & LOOKUP_RCU) { if (follow_dotdot_rcu(nd)) return -ECHILD; } else follow_dotdot(nd); } return 0; } static int follow_dotdot_rcu(struct nameidata *nd) { set_root_rcu(nd); //获得当前进程的根文件系统的path while (1) { if (nd->path.dentry == nd->root.dentry && nd->path.mnt == nd->root.mnt) { /*如果在根路径上执行"..",没有意义直接跳过就可以了*/ break; } if (nd->path.dentry != nd->path.mnt->mnt_root) { /*不是装载点的根目录,就直接获得dentry的parent就可以了*/ struct dentry *old = nd->path.dentry; struct dentry *parent = old->d_parent; unsigned seq; seq = read_seqcount_begin(&parent->d_seq); if (read_seqcount_retry(&old->d_seq, nd->seq)) goto failed; nd->path.dentry = parent; nd->seq = seq; break; } if (!follow_up_rcu(&nd->path)) break; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } follow_mount_rcu(nd); nd->inode = nd->path.dentry->d_inode; return 0; failed: nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; rcu_read_unlock(); br_read_unlock(vfsmount_lock); return -ECHILD; }
/* * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to * be given to the mount() call (ie: read-only, no-dev, no-suid etc). * * data is a (void *) that can point to any structure up to * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent * information (or be NULL). * * Pre-0.97 versions of mount() didn't have a flags word. * When the flags word was introduced its top half was required * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. * Therefore, if this magic number is present, it carries no information * and must be discarded. */ long do_mount(char *dev_name, char *dir_name, char *type_page, unsigned long flags, void *data_page) { struct path path; int retval = 0; int mnt_flags = 0; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; /* Basic sanity checks */ if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) //验证目录的名字 return -EINVAL; if (data_page) //特定文件系统的私有项,大小为一页 ((char *)data_page)[PAGE_SIZE - 1] = 0; /* ... 
and get the mountpoint */ retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); //上文讲的查找装载点的路径 if (retval) return retval; retval = security_sb_mount(dev_name, &path, type_page, flags, data_page); //直接调用security_ops->sb_mount,若成功直接返回 if (retval) goto dput_out; /*通过flag配置装载选项,下面是一个多路选择器,根据不同的装载选项调用不同的函数*/ /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) mnt_flags |= MNT_RELATIME; /* Separate the per-mountpoint flags */ if (flags & MS_NOSUID) mnt_flags |= MNT_NOSUID; if (flags & MS_NODEV) mnt_flags |= MNT_NODEV; if (flags & MS_NOEXEC) mnt_flags |= MNT_NOEXEC; if (flags & MS_NOATIME) mnt_flags |= MNT_NOATIME; if (flags & MS_NODIRATIME) mnt_flags |= MNT_NODIRATIME; if (flags & MS_STRICTATIME) mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | MS_STRICTATIME); if (flags & MS_REMOUNT) retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) retval = do_loopback(&path, dev_name, flags & MS_REC); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&path, flags); else if (flags & MS_MOVE) retval = do_move_mount(&path, dev_name); else retval = do_new_mount(&path, type_page, flags, mnt_flags, dev_name, data_page); dput_out: path_put(&path); return retval; }装载新的文件系统通过do_new_mount来处理:
/* * create a new mount for userspace and request it to be added into the * namespace's tree */ static int do_new_mount(struct path *path, char *type, int flags, int mnt_flags, char *name, void *data) { struct vfsmount *mnt; int err; if (!type) return -EINVAL; /* we need capabilities... */ if (!capable(CAP_SYS_ADMIN)) //权限检查 return -EPERM; mnt = do_kern_mount(type, flags, name, data); /*新建一个vfsmount实例,并通过特定文件系统的操作装载到系统系统中,返回装载点的根目录*/ if (IS_ERR(mnt)) return PTR_ERR(mnt); err = do_add_mount(mnt, path, mnt_flags); /*将vfsmount加入到vfsmount树中, 设置相关的数据结构的选项*/ if (err) mntput(mnt); return err; }do_add_mount通过各种检查,例如一个挂载点不能重复挂载其自身在相同的挂载点、挂载点必须是目录等,最后执行namespace.c中的attach_recursive_mnt处理各种数据结构:
static int attach_recursive_mnt(struct vfsmount *source_mnt, struct path *path, struct path *parent_path) { LIST_HEAD(tree_list); struct vfsmount *dest_mnt = path->mnt; struct dentry *dest_dentry = path->dentry; struct vfsmount *child, *p; int err; if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) goto out; } err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list); /*处理从属装载和共享装载,将相关的vfsmount通过tree_list返回*/ if (err) goto out_cleanup_ids; br_write_lock(vfsmount_lock); if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); } if (parent_path) { /*如果source_mnt之前装载在@parent_path,要迁移到@path上,则先从parent_path中移除,然后增加到@path路径上,移除包括从parent_path的vfsmount的子装载点中移除和从mount_hashtable中移除,因为mount_hashtable是通过父装载点的vfsmount和子装载的dentry来计算哈希值的*/ detach_mnt(source_mnt, parent_path); attach_mnt(source_mnt, path); touch_mnt_namespace(parent_path->mnt->mnt_ns); } else { /*parent_path为空表示新挂载项,则设置source_mnt的mnt_parent、mnt_root、mnt_mountpoint,然后将其增加到父装载点的子装载点链表中,并将其加入哈希表*/ mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); commit_tree(source_mnt); } /*处理所有的从属装载和共享装载*/ list_for_each_entry_safe(child, p, &tree_list, mnt_hash) { list_del_init(&child->mnt_hash); commit_tree(child); } br_write_unlock(vfsmount_lock); return 0; out_cleanup_ids: if (IS_MNT_SHARED(dest_mnt)) cleanup_group_ids(source_mnt, NULL); out: return err; }至此,装载文件系统基本完成了。