文件数据的写入
【write】
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
struct file *file;
ssize_t ret = -EBADF;
int fput_needed;
file = fget_light(fd, &fput_needed);
if (file) {
loff_t pos = file_pos_read(file);
ret = vfs_write(file, buf, count, &pos);
file_pos_write(file, pos);
fput_light(file, fput_needed);
}
return ret;
}
函数fget_light根据文件描述符fd在当前进程的文件打开列表中找到file对象。函数file_pos_read获取对文件操作的当前位置。函数vfs_write完成文件数据写的实际工作。函数file_pos_write重新设置文件操作的当前位置。
【write--->vfs_write】
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
return -EINVAL;
if (unlikely(!access_ok(VERIFY_READ, buf, count)))
return -EFAULT;
ret = rw_verify_area(WRITE, file, pos, count);
if (ret >= 0) {
count = ret;
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos);
else
ret = do_sync_write(file, buf, count, pos);
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
}
inc_syscw(current);
}
return ret;
}
首先检测用户空间提供的数据区是否可读,然后检测文件操作区是否可写。通过上述检测之后调用文件所在文件系统的操作函数进行数据写入。
【write--->vfs_write--->do_sync_write】
ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
struct kiocb kiocb;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
for (;;) {
ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
if (ret != -EIOCBRETRY)
break;
wait_on_retry_sync_kiocb(&kiocb);
}
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
return ret;
}
初始化一个用户空间缓存管理结构iovec和数据读取参数管理结构kiocb。然后调用函数filp->f_op->aio_write进行数据的实际写入。在ext3文件系统中,文件操作函数集初始化如下:
const struct file_operations ext3_file_operations = {
......
.aio_write = generic_file_aio_write,
......
};
【write--->vfs_write--->do_sync_write--->generic_file_aio_write】
ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct blk_plug plug;
ssize_t ret;
BUG_ON(iocb->ki_pos != pos);
mutex_lock(&inode->i_mutex);
blk_start_plug(&plug);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
if (ret > 0 || ret == -EIOCBQUEUED) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
if (err < 0 && ret > 0)
ret = err;
}
blk_finish_plug(&plug);
return ret;
}
函数__generic_file_aio_write实现文件异步写入的主要工作。如果file中置O_DSYNC标志或inode中置标志MS_SYNCHRONOUS都会在函数generic_write_sync中将缓存中的数据写回设备。
【write--->vfs_write--->do_sync_write--->generic_file_aio_write--->__generic_file_aio_write】
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
size_t ocount; /* original count */
size_t count; /* after file limit checks */
struct inode *inode = mapping->host;
loff_t pos;
ssize_t written;
ssize_t err;
ocount = 0;
err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
if (err)
return err;
count = ocount;
pos = *ppos;
vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
current->backing_dev_info = mapping->backing_dev_info;
written = 0;
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
goto out;
if (count == 0)
goto out;
err = file_remove_suid(file);
if (err)
goto out;
file_update_time(file);
检测用户空间数据段是否可读。对文件读写权限写入位置和数据大小的检查。如果文件设置的setuid则将该标志移出。
if (unlikely(file->f_flags & O_DIRECT)) {
loff_t endbyte;
ssize_t written_buffered;
written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
ppos, count, ocount);
if (written < 0 || written == count)
goto out;
pos += written;
count -= written;
written_buffered = generic_file_buffered_write(iocb, iov,
nr_segs, pos, ppos, count,
written);
if (written_buffered < 0) {
err = written_buffered;
goto out;
}
endbyte = pos + written_buffered - written - 1;
err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
if (err == 0) {
written = written_buffered;
invalidate_mapping_pages(mapping,
pos >> PAGE_CACHE_SHIFT,
endbyte >> PAGE_CACHE_SHIFT);
} else {
}
如果设置了标志O_DIRECT,调用函数generic_file_direct_write将数据直接写入设备。如果直接写入数据出错或者没有将数据完全写入则调用函数generic_file_buffered_write将数据(剩余数据)写入缓存,然后调用函数filemap_write_and_wait_range将缓存中的数据写入设备。
} else {
written = generic_file_buffered_write(iocb, iov, nr_segs,
pos, ppos, count, written);
如果没有设置标志O_DIRECT,数据将被写入缓存并提交写数据请求。
}
out:
current->backing_dev_info = NULL;
return written ? written : err;
}