This part adds some supplementary notes on the kernel initialization of the Linux virtual file system (VFS).
Regarding shrinkers: during initialization both the inode cache and the dentry cache register a shrinker of their own, which the VM later calls to shrink the cache under memory pressure. The two registrations work in essentially the same way.
The shrinker data structure:
/*
 * A callback you can register to apply pressure to ageable caches.
 *
 * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'.  It should
 * look through the least-recently-used 'nr_to_scan' entries and
 * attempt to free them up.  It should return the number of objects
 * which remain in the cache.  If it returns -1, it means it cannot do
 * any scanning at this time (eg. there is a risk of deadlock).
 *
 * The 'gfpmask' refers to the allocation we are currently trying to
 * fulfil.
 *
 * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
 * querying the cache size, so a fastpath for that case is appropriate.
 */
struct shrinker {
	int (*shrink)(int nr_to_scan, gfp_t gfp_mask);
	int seeks;	/* seeks to recreate an obj */

	/* These are for internal use */
	struct list_head list;
	long nr;	/* objs pending delete */
};
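To make the contract above concrete, here is a minimal sketch of how a cache with its own LRU could hook into this interface in this kernel version. The names my_cache_shrink, my_cache_evict, my_cache_count and my_shrinker are hypothetical illustrations, not kernel code; the point is the three cases the callback has to handle: the nr_to_scan == 0 size query, bailing out with -1 when __GFP_FS is not allowed, and otherwise reporting how many objects remain.

/* Hypothetical example of a shrink callback for the two-argument
 * interface shown above; my_cache_evict()/my_cache_count() are assumed
 * helpers of the imaginary cache, not kernel functions. */
static int my_cache_shrink(int nr_to_scan, gfp_t gfp_mask)
{
	if (nr_to_scan) {
		/* avoid re-entering the filesystem from an FS-internal allocation */
		if (!(gfp_mask & __GFP_FS))
			return -1;
		my_cache_evict(nr_to_scan);	/* drop up to nr_to_scan LRU objects */
	}
	/* nr_to_scan == 0 is a pure size query: report reclaimable objects,
	 * scaled the same way the VFS caches do */
	return (my_cache_count() / 100) * sysctl_vfs_cache_pressure;
}

static struct shrinker my_shrinker = {
	.shrink	= my_cache_shrink,
	.seeks	= DEFAULT_SEEKS,
};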
1. Registering the dentry cache shrinker
start_kernel()->vfs_caches_init()->dcache_init()->register_shrinker(&dcache_shrinker);
/*
 * Add a shrinker callback to be called from the vm
 */
void register_shrinker(struct shrinker *shrinker)
{
	shrinker->nr = 0;
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	up_write(&shrinker_rwsem);
}
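register_shrinker() only links the shrinker onto a global list; the pressure is applied later from the page-reclaim path, which walks that list and calls each ->shrink() callback, and a subsystem that goes away again is expected to call unregister_shrinker(). The following is a deliberately simplified sketch of that consumer side; it is NOT the real shrink_slab() code, which additionally scales the scan count by the number of scanned pages and by the shrinker's seeks value.

/* Simplified illustration of how the VM consumes shrinker_list;
 * not the actual shrink_slab() implementation. */
struct shrinker *shrinker;

down_read(&shrinker_rwsem);
list_for_each_entry(shrinker, &shrinker_list, list) {
	int objects = shrinker->shrink(0, gfp_mask);	/* size query */

	if (objects > 0)
		shrinker->shrink(min(objects, 128), gfp_mask);	/* scan a batch */
}
up_read(&shrinker_rwsem);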
The shrinker being registered, and its callback, are defined here:
static struct shrinker dcache_shrinker = {
	.shrink = shrink_dcache_memory,
	.seeks = DEFAULT_SEEKS,
};
/*
 * Scan `nr' dentries and return the number which remain.
 *
 * We need to avoid reentering the filesystem if the caller is performing a
 * GFP_NOFS allocation attempt.  One example deadlock is:
 *
 * ext2_new_block->getblk->GFP->shrink_dcache_memory->prune_dcache->
 * prune_one_dentry->dput->dentry_iput->iput->inode->i_sb->s_op->put_inode->
 * ext2_discard_prealloc->ext2_free_blocks->lock_super->DEADLOCK.
 *
 * In this case we return -1 to tell the caller that we baled.
 */
static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
{
	if (nr) {
		if (!(gfp_mask & __GFP_FS))
			return -1;
		prune_dcache(nr);	/* shrink the cache by the requested amount */
	}
	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
}
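A quick numeric example of the return value: with the default sysctl_vfs_cache_pressure of 100 and, say, 20000 unused dentries, the callback reports (20000 / 100) * 100 = 20000 reclaimable objects, i.e. roughly nr_unused itself; tuning vfs_cache_pressure down to 50 halves the reported figure to 10000, so the VM shrinks the dcache less aggressively, while values above 100 have the opposite effect.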
/**
 * prune_dcache - shrink the dcache
 * @count: number of entries to try to free
 *
 * Shrink the dcache. This is done when we need more memory, or simply when we
 * need to unmount something (at which point we need to unuse all dentries).
 *
 * This function may fail to free any resources if all the dentries are in use.
 */
/* shrink the dcache; count is the number of entries we try to free */
static void prune_dcache(int count)
{
	struct super_block *sb;
	int w_count;
	int unused = dentry_stat.nr_unused;
	int prune_ratio;
	int pruned;

	if (unused == 0 || count == 0)
		return;
	spin_lock(&dcache_lock);
restart:
	if (count >= unused)
		prune_ratio = 1;	/* prune ratio */
	else
		prune_ratio = unused / count;
	spin_lock(&sb_lock);
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (sb->s_nr_dentry_unused == 0)
			continue;
		sb->s_count++;
		/* Now, we reclaim unused dentrins with fairness.
		 * We reclaim them same percentage from each superblock.
		 * We calculate number of dentries to scan on this sb
		 * as follows, but the implementation is arranged to avoid
		 * overflows:
		 * number of dentries to scan on this sb =
		 * count * (number of dentries on this sb /
		 * number of dentries in the machine)
		 */
		spin_unlock(&sb_lock);
		/* use the prune ratio to compute how many to free from this sb */
		if (prune_ratio != 1)
			w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1;
		else
			w_count = sb->s_nr_dentry_unused;
		pruned = w_count;
		/*
		 * We need to be sure this filesystem isn't being unmounted,
		 * otherwise we could race with generic_shutdown_super(), and
		 * end up holding a reference to an inode while the filesystem
		 * is unmounted.  So we try to get s_umount, and make sure
		 * s_root isn't NULL.
		 */
		if (down_read_trylock(&sb->s_umount)) {
			if ((sb->s_root != NULL) &&
			    (!list_empty(&sb->s_dentry_lru))) {
				spin_unlock(&dcache_lock);
				/* the actual freeing work */
				__shrink_dcache_sb(sb, &w_count,
						DCACHE_REFERENCED);
				pruned -= w_count;
				spin_lock(&dcache_lock);
			}
			up_read(&sb->s_umount);
		}
		spin_lock(&sb_lock);
		count -= pruned;
		/*
		 * restart only when sb is no longer on the list and
		 * we have more work to do.
		 */
		if (__put_super_and_need_restart(sb) && count > 0) {
			spin_unlock(&sb_lock);
			goto restart;
		}
	}
	spin_unlock(&sb_lock);
	spin_unlock(&dcache_lock);
}
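A quick numeric example of the fairness calculation: if count = 100 and there are unused = 1000 unused dentries system-wide, then prune_ratio = 1000 / 100 = 10, and a superblock with s_nr_dentry_unused = 50 is asked to scan w_count = 50 / 10 + 1 = 6 dentries. Every superblock therefore gives up roughly the same fraction (about a tenth here) of its unused dentries, instead of the first superblock on the list being drained completely.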
/*
 * Shrink the dentry LRU on a given superblock.
 * @sb   : superblock to shrink dentry LRU.
 * @count: If count is NULL, we prune all dentries on superblock.
 * @flags: If flags is non-zero, we need to do special processing based on
 * which flags are set. This means we don't need to maintain multiple
 * similar copies of this loop.
 */
static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
{
	LIST_HEAD(referenced);
	LIST_HEAD(tmp);
	struct dentry *dentry;
	int cnt = 0;

	BUG_ON(!sb);
	BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
	spin_lock(&dcache_lock);
	if (count != NULL)
		/* called from prune_dcache() and shrink_dcache_parent() */
		cnt = *count;	/* used below */
restart:
	if (count == NULL)
		list_splice_init(&sb->s_dentry_lru, &tmp);
	else {
		while (!list_empty(&sb->s_dentry_lru)) {
			dentry = list_entry(sb->s_dentry_lru.prev,
					struct dentry, d_lru);
			BUG_ON(dentry->d_sb != sb);
			spin_lock(&dentry->d_lock);
			/*
			 * If we are honouring the DCACHE_REFERENCED flag and
			 * the dentry has this flag set, don't free it. Clear
			 * the flag and put it back on the LRU.
			 */
			/* clear the DCACHE_REFERENCED bit and let the entry go
			 * back onto the LRU (via the referenced list) */
			if ((flags & DCACHE_REFERENCED) &&
			    (dentry->d_flags & DCACHE_REFERENCED)) {
				dentry->d_flags &= ~DCACHE_REFERENCED;
				list_move(&dentry->d_lru, &referenced);
				spin_unlock(&dentry->d_lock);
			} else {
				/* remove from the d_lru list and add to tmp */
				list_move_tail(&dentry->d_lru, &tmp);
				spin_unlock(&dentry->d_lock);
				cnt--;		/* one fewer to scan */
				if (!cnt)	/* stop once it reaches 0 */
					break;
			}
			cond_resched_lock(&dcache_lock);
		}
	}
	/* walk tmp; its entries are the ones moved over above */
	while (!list_empty(&tmp)) {
		dentry = list_entry(tmp.prev, struct dentry, d_lru);
		/* unlink from tmp, reinitialize d_lru and update statistics */
		dentry_lru_del_init(dentry);
		spin_lock(&dentry->d_lock);
		/*
		 * We found an inuse dentry which was not removed from
		 * the LRU because of laziness during lookup.  Do not free
		 * it - just keep it off the LRU list.
		 */
		if (atomic_read(&dentry->d_count)) {
			spin_unlock(&dentry->d_lock);
			continue;
		}
		/* free the dentry and, where possible, its ancestors */
		prune_one_dentry(dentry);
		/* dentry->d_lock was dropped in prune_one_dentry() */
		cond_resched_lock(&dcache_lock);
	}
	if (count == NULL && !list_empty(&sb->s_dentry_lru))
		goto restart;
	if (count != NULL)
		*count = cnt;
	if (!list_empty(&referenced))
		list_splice(&referenced, &sb->s_dentry_lru);
	spin_unlock(&dcache_lock);
}
static void dentry_lru_del_init(struct dentry *dentry)
{
	if (likely(!list_empty(&dentry->d_lru))) {
		list_del_init(&dentry->d_lru);		/* unlink and reinitialize dentry->d_lru */
		dentry->d_sb->s_nr_dentry_unused--;	/* one fewer unused dentry on this sb */
		dentry_stat.nr_unused--;		/* update the global statistics */
	}
}
/*
 * Throw away a dentry - free the inode, dput the parent.  This requires that
 * the LRU list has already been removed.
 *
 * Try to prune ancestors as well.  This is necessary to prevent
 * quadratic behavior of shrink_dcache_parent(), but is also expected
 * to be beneficial in reducing dentry cache fragmentation.
 */
static void prune_one_dentry(struct dentry * dentry)
	__releases(dentry->d_lock)
	__releases(dcache_lock)
	__acquires(dcache_lock)
{
	__d_drop(dentry);
	dentry = d_kill(dentry);	/* free the dentry, returning its parent */

	/*
	 * Prune ancestors.  Locking is simpler than in dput(),
	 * because dcache_lock needs to be taken anyway.
	 */
	spin_lock(&dcache_lock);
	while (dentry) {
		if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock))
			return;

		if (dentry->d_op && dentry->d_op->d_delete)
			dentry->d_op->d_delete(dentry);
		dentry_lru_del_init(dentry);
		__d_drop(dentry);
		dentry = d_kill(dentry);
		spin_lock(&dcache_lock);
	}
}
/**
 * d_kill - kill dentry and return parent
 * @dentry: dentry to kill
 *
 * The dentry must already be unhashed and removed from the LRU.
 *
 * If this is the root of the dentry tree, return NULL.
 */
static struct dentry *d_kill(struct dentry *dentry)
	__releases(dentry->d_lock)
	__releases(dcache_lock)
{
	struct dentry *parent;

	list_del(&dentry->d_u.d_child);	/* unlink from the parent's list of children */
	dentry_stat.nr_dentry--;	/* update statistics; For d_free, below */
	/* drops the locks, at that point nobody can reach this dentry */
	dentry_iput(dentry);		/* drop the reference to the inode */
	if (IS_ROOT(dentry))
		parent = NULL;
	else
		parent = dentry->d_parent;
	d_free(dentry);			/* free the dentry itself */
	return parent;
}
/*
 * Release the dentry's inode, using the filesystem
 * d_iput() operation if defined.
 */
static void dentry_iput(struct dentry * dentry)
	__releases(dentry->d_lock)
	__releases(dcache_lock)
{
	struct inode *inode = dentry->d_inode;
	if (inode) {
		dentry->d_inode = NULL;
		list_del_init(&dentry->d_alias);	/* remove from the inode's list of alias dentries */
		spin_unlock(&dentry->d_lock);
		spin_unlock(&dcache_lock);
		if (!inode->i_nlink)			/* the inode has no hard links left */
			fsnotify_inoderemove(inode);
		if (dentry->d_op && dentry->d_op->d_iput)
			dentry->d_op->d_iput(dentry, inode);
		else
			iput(inode);			/* drop the inode reference */
	} else {
		spin_unlock(&dentry->d_lock);
		spin_unlock(&dcache_lock);
	}
}
2. Registering the inode cache shrinker
start_kernel()->vfs_caches_init()->inode_init()->register_shrinker(&icache_shrinker);
The shrinker passed as the argument is defined below:
static struct shrinker icache_shrinker = {
	.shrink = shrink_icache_memory,
	.seeks = DEFAULT_SEEKS,
};
static int shrink_icache_memory(int nr, gfp_t gfp_mask)
{
	if (nr) {
		/*
		 * Nasty deadlock avoidance.  We may hold various FS locks,
		 * and we don't want to recurse into the FS that called us
		 * in clear_inode() and friends..
		 */
		if (!(gfp_mask & __GFP_FS))
			return -1;
		prune_icache(nr);
	}
	return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
}
/*
 * Scan `goal' inodes on the unused list for freeable ones. They are moved to
 * a temporary list and then are freed outside inode_lock by dispose_list().
 *
 * Any inodes which are pinned purely because of attached pagecache have their
 * pagecache removed.  We expect the final iput() on that inode to add it to
 * the front of the inode_unused list.  So look for it there and if the
 * inode is still freeable, proceed.  The right inode is found 99.9% of the
 * time in testing on a 4-way.
 *
 * If the inode has metadata buffers attached to mapping->private_list then
 * try to remove them.
 */
static void prune_icache(int nr_to_scan)
{
	LIST_HEAD(freeable);	/* temporary list that collects the inodes which can be freed */
	int nr_pruned = 0;
	int nr_scanned;
	unsigned long reap = 0;

	down_read(&iprune_sem);
	spin_lock(&inode_lock);
	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
		struct inode *inode;

		if (list_empty(&inode_unused))
			break;

		inode = list_entry(inode_unused.prev, struct inode, i_list);

		if (inode->i_state || atomic_read(&inode->i_count)) {
			/* still busy: rotate it to the head of inode_unused */
			list_move(&inode->i_list, &inode_unused);
			continue;
		}
		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
			__iget(inode);	/* take a reference; the inode moves to the in-use list */
			spin_unlock(&inode_lock);
			if (remove_inode_buffers(inode))	/* drop all buffers attached to the inode */
				reap += invalidate_mapping_pages(&inode->i_data,
								0, -1);
			iput(inode);
			spin_lock(&inode_lock);

			if (inode != list_entry(inode_unused.next,
						struct inode, i_list))
				continue;	/* wrong inode or list_empty */
			if (!can_unuse(inode))
				continue;
		}
		/* move to the freeable list */
		list_move(&inode->i_list, &freeable);
		WARN_ON(inode->i_state & I_NEW);
		inode->i_state |= I_FREEING;
		nr_pruned++;	/* number of inodes moved to the freeable list */
	}
	inodes_stat.nr_unused -= nr_pruned;	/* update statistics */
	if (current_is_kswapd())
		__count_vm_events(KSWAPD_INODESTEAL, reap);
	else
		__count_vm_events(PGINODESTEAL, reap);
	spin_unlock(&inode_lock);

	dispose_list(&freeable);	/* actually free everything on the freeable list */
	up_read(&iprune_sem);
}
3. Registering the deferred-free routine for file descriptor tables
/* file descriptor table */
struct fdtable {
	unsigned int max_fds;		/* maximum number of file structures the table can hold */
	struct file ** fd;		/* current fd array: one entry per open file */
	fd_set *close_on_exec;		/* set of fds that are closed on exec() */
	fd_set *open_fds;		/* set of fds currently open */
	struct rcu_head rcu;
	struct fdtable *next;
};
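The reason the table is freed through RCU plus a workqueue (set up in the next step) is that readers look it up locklessly. A minimal sketch of the reader side, assuming the files_fdtable() accessor of this kernel era (an illustration, not a quote of fs/file.c):

/* Lockless fd lookup under RCU; fd and current->files are assumed valid. */
struct fdtable *fdt;
struct file *filp = NULL;

rcu_read_lock();
fdt = files_fdtable(current->files);		/* rcu_dereference of files->fdt */
if (fd < fdt->max_fds)
	filp = rcu_dereference(fdt->fd[fd]);	/* NULL if that fd is not open */
rcu_read_unlock();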
start_kernel()->vfs_caches_init()->files_init()->files_defer_init()->fdtable_defer_list_init()->INIT_WORK(&fddef->wq, free_fdtable_work);
static void free_fdtable_work(struct work_struct *work)
{
	struct fdtable_defer *f =
		container_of(work, struct fdtable_defer, wq);
	struct fdtable *fdt;

	spin_lock_bh(&f->lock);
	fdt = f->next;
	f->next = NULL;
	spin_unlock_bh(&f->lock);
	while (fdt) {	/* walk the deferred list and free each fdtable */
		struct fdtable *next = fdt->next;

		vfree(fdt->fd);
		free_fdset(fdt);
		kfree(fdt);
		fdt = next;
	}
}
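The producer side of this deferred list lives in the fdtable free path: when an fdtable whose ->fd array was allocated with vmalloc() has to be dropped from a context where vfree() may not be called, the table is chained onto the per-CPU fdtable_defer structure and the work item above is scheduled. A hedged sketch of that pattern (defer_free_fdtable is a hypothetical name, not the exact fs/file.c function):

static void defer_free_fdtable(struct fdtable *fdt)
{
	struct fdtable_defer *fddef = &get_cpu_var(fdtable_defer_list);

	spin_lock(&fddef->lock);
	fdt->next = fddef->next;	/* chain onto the per-CPU deferred list */
	fddef->next = fdt;
	schedule_work(&fddef->wq);	/* free_fdtable_work() will run later in process context */
	spin_unlock(&fddef->lock);
	put_cpu_var(fdtable_defer_list);
}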
4. sysfs filesystem initialization
start_kernel()->vfs_caches_init()->mnt_init()->sysfs_init()
int __init sysfs_init(void)
{
	int err = -ENOMEM;

	sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache",
					      sizeof(struct sysfs_dirent),
					      0, 0, NULL);
	if (!sysfs_dir_cachep)
		goto out;

	/* initialize sysfs's backing_dev_info */
	err = sysfs_inode_init();
	if (err)
		goto out_err;

	/* register the filesystem type */
	err = register_filesystem(&sysfs_fs_type);
	if (!err) {
		/* create the internal sysfs mount */
		sysfs_mount = kern_mount(&sysfs_fs_type);
		if (IS_ERR(sysfs_mount)) {
			printk(KERN_ERR "sysfs: could not mount!\n");
			err = PTR_ERR(sysfs_mount);
			sysfs_mount = NULL;
			unregister_filesystem(&sysfs_fs_type);
			goto out_err;
		}
	} else
		goto out_err;
out:
	return err;
out_err:
	kmem_cache_destroy(sysfs_dir_cachep);
	sysfs_dir_cachep = NULL;
	goto out;
}
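The register_filesystem() + kern_mount() sequence used here is the standard pattern for purely in-kernel filesystems. Below is a hedged sketch of the same pattern with a made-up filesystem type; the myfs_* names and the trivial fill_super stub are assumptions for illustration, not the real sysfs code.

/* Minimal sketch: an in-kernel filesystem registered and mounted internally. */
static int myfs_fill_super(struct super_block *sb, void *data, int silent)
{
	/* a real filesystem would set sb->s_op and build the root dentry here */
	return -ENOMEM;	/* placeholder so the sketch is self-contained */
}

static int myfs_get_sb(struct file_system_type *fs_type, int flags,
		       const char *dev_name, void *data, struct vfsmount *mnt)
{
	return get_sb_single(fs_type, flags, data, myfs_fill_super, mnt);
}

static struct file_system_type myfs_fs_type = {
	.name		= "myfs",
	.get_sb		= myfs_get_sb,
	.kill_sb	= kill_anon_super,
};

static struct vfsmount *myfs_mount;

static int __init myfs_init(void)
{
	int err = register_filesystem(&myfs_fs_type);
	if (err)
		return err;

	myfs_mount = kern_mount(&myfs_fs_type);	/* internal, not user-visible, mount */
	if (IS_ERR(myfs_mount)) {
		err = PTR_ERR(myfs_mount);
		myfs_mount = NULL;
		unregister_filesystem(&myfs_fs_type);
	}
	return err;
}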