ramfs


writeシステムコールは、file->f_op->write()で書き込み引数を設定してfile->f_op->aio_write()をコールし、file->f_mapping->page_tree->rnode->slots[]の該当インデックスに設定されているページキャッシュをバッファとして、そこにデータを書き込みます。ブロックデバイスへの書き込みは、ファイルシステムに係るfile->f_op->fsync()により、file->f_mapping->page_tree->rnode->slots[]のページキャッシュがブロックデバイスに書き出されることで行われます。ramfsはブロックデバイスを有しないファイルシステムのため、file->f_op->fsyncは何もせずに0を返すだけのnoop_fsync()（実質的に未実装）となっています。

file->f_mapping->page_tree->rnode->slots[]のインデックスに対応するページキャッシュの取得は、struct address_space_operationsのwrite_beginコールバックで行われます。かかるインデックスにページキャッシュが設定されていなければ、新たにページキャッシュを取得し、file->f_mapping->page_tree->rnode->slots[]に設定します。

実装イメージ

/*
 * User-space side of the example: open the file read/write and hand
 * cnt (= strlen(data)) bytes of data to write(2).
 * NOTE(review): 'file' is declared here but not used by this fragment.
 */
struct files_struct *file;
int                 fd;

fd    = open(fname, O_RDWR);
write(fd, data, cnt=strlen(data));
ーーーーーーーーーーーーーーーーーーーーーーーーーーー
/*
 * Number of index bits consumed per radix-tree level: 4 or 6 inside the
 * kernel (depending on CONFIG_BASE_SMALL), 3 when built outside it.
 */
#ifdef __KERNEL__
#define RADIX_TREE_MAP_SHIFT    (CONFIG_BASE_SMALL ? 4 : 6)
#else
#define RADIX_TREE_MAP_SHIFT    3       /* For more stressful testing */
#endif

/* 2^RADIX_TREE_MAP_SHIFT, and the matching mask for the low index bits. */
#define RADIX_TREE_MAP_SIZE     (1UL << RADIX_TREE_MAP_SHIFT)
#define RADIX_TREE_MAP_MASK     (RADIX_TREE_MAP_SIZE-1)

/* Number of longs needed to hold RADIX_TREE_MAP_SIZE bits (rounded up). */
#define RADIX_TREE_TAG_LONGS    \
       ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* Page size used in this example: 1 << 12 = 4096 bytes. */
#define PAGE_SHIFT      12
#define PAGE_SIZE       (1 << PAGE_SHIFT)
#define PAGE_CACHE_SIZE         PAGE_SIZE

struct files_struct *file;
int                 fd;

file  = current->files->fdt->fd[fd];
pos   = file->f_pos;
index = (pos >> shift) & RADIX_TREE_MAP_MASK;
page  = file->f_mapping->page_tree->rnode->slots[index];

if (!page) {
   page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
   file->f_mapping->page_tree->rnode->slots[index] = page;
}

offset = (pos & (PAGE_CACHE_SIZE - 1));
wr_cnt = PAGE_CACHE_SIZE - offset;
strcpy(page + offset, data, PAGE_CACHE_SIZE - cnt);

カーネル実装

/*
 * Address-space (page cache) callbacks used by ramfs. All entries are
 * generic helpers; write_begin is simple_write_begin().
 */
const struct address_space_operations ramfs_aops = {
       .readpage       = simple_readpage,
       .write_begin    = simple_write_begin,
       .write_end      = simple_write_end,
       .set_page_dirty = __set_page_dirty_no_writeback,
};

/*
 * File operations for regular ramfs files. .write is do_sync_write(),
 * which forwards to .aio_write (generic_file_aio_write); .fsync is
 * noop_fsync(), which only returns 0.
 */
const struct file_operations ramfs_file_operations = {
       .read           = do_sync_read,
       .aio_read       = generic_file_aio_read,
       .write          = do_sync_write,
       .aio_write      = generic_file_aio_write,
       .mmap           = generic_file_mmap,
       .fsync          = noop_fsync,
       .splice_read    = generic_file_splice_read,
       .splice_write   = generic_file_splice_write,
       .llseek         = generic_file_llseek,
};

/* fsync implementation that does nothing: ignores all arguments, returns 0. */
int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
       return 0;
}

/*
 * write(2) entry point. Resolves fd to a file, reads the current file
 * position, calls vfs_write() and stores the updated position back.
 * Returns vfs_write()'s result, or -EBADF when fd resolves to no file.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
               size_t, count)
{
       struct fd f = fdget(fd);
       ssize_t ret = -EBADF;

       if (f.file) {
               /* Work on a local copy of the position; written back below. */
               loff_t pos = file_pos_read(f.file);
               ret = vfs_write(f.file, buf, count, &pos);
               file_pos_write(f.file, pos);
               fdput(f);
       }

       return ret;
}

/*
 * Validate a write request and dispatch it to the file's write method.
 *
 * Returns -EBADF if the file is not open for writing, -EINVAL if it has
 * neither ->write nor ->aio_write, -EFAULT if the user buffer fails
 * access_ok(), a negative value from rw_verify_area(), or otherwise the
 * result of the write method.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
       ssize_t ret;

       if (!(file->f_mode & FMODE_WRITE))
               return -EBADF;
       if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
               return -EINVAL;
       if (unlikely(!access_ok(VERIFY_READ, buf, count)))
               return -EFAULT;

       ret = rw_verify_area(WRITE, file, pos, count);
       if (ret >= 0) {
               /* A non-negative result from rw_verify_area() replaces count. */
               count = ret;
               /* Prefer ->write; fall back to the sync wrapper around ->aio_write. */
               if (file->f_op->write)
                       ret = file->f_op->write(file, buf, count, pos);
               else
                       ret = do_sync_write(file, buf, count, pos);
               /* Only a write that transferred data is notified and accounted. */
               if (ret > 0) {
                       fsnotify_modify(file);
                       add_wchar(current, ret);
               }
               inc_syscw(current);
       }

       return ret;
}

/*
 * Synchronous wrapper around ->aio_write: wraps the user buffer in a
 * one-element iovec and a kiocb, calls ->aio_write, waits for completion
 * where needed, and copies the resulting position back to *ppos.
 */
ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
       struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
       struct kiocb kiocb;
       ssize_t ret;

       init_sync_kiocb(&kiocb, filp);
       kiocb.ki_pos = *ppos;
       kiocb.ki_left = len;
       kiocb.ki_nbytes = len;

       /* Re-issue the request for as long as it reports -EIOCBRETRY. */
       for (;;) {
               ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
               if (ret != -EIOCBRETRY)
                       break;
               wait_on_retry_sync_kiocb(&kiocb);
       }

       /* Request was queued: wait for it and take its result. */
       if (-EIOCBQUEUED == ret)
               ret = wait_on_sync_kiocb(&kiocb);
       *ppos = kiocb.ki_pos;
       return ret;
}

/*
 * Generic ->aio_write: runs __generic_file_aio_write() under the inode's
 * i_mutex, bracketed by sb_start_write()/sb_end_write(), then calls
 * generic_write_sync() when data was written or the request was queued.
 */
ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
               unsigned long nr_segs, loff_t pos)
{
       struct file *file = iocb->ki_filp;
       struct inode *inode = file->f_mapping->host;
       ssize_t ret;

       /* The explicit pos argument must match the position in the kiocb. */
       BUG_ON(iocb->ki_pos != pos);

       sb_start_write(inode->i_sb);
       mutex_lock(&inode->i_mutex);
       ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
       mutex_unlock(&inode->i_mutex);

       if (ret > 0 || ret == -EIOCBQUEUED) {
               ssize_t err;

               /* A sync error replaces ret only when ret was a byte count. */
               err = generic_write_sync(file, pos, ret);
               if (err < 0 && ret > 0)
                       ret = err;
       }
       sb_end_write(inode->i_sb);
       return ret;
}

/*
 * Core of the generic write path. Validates the iovec and the write
 * range, updates suid state and timestamps, then writes either through
 * the direct-I/O path (O_DIRECT, with a buffered fallback for whatever
 * was not written directly) or through generic_file_buffered_write().
 * Returns the number of bytes written if non-zero, otherwise err.
 */
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                                 unsigned long nr_segs, loff_t *ppos)
{
       struct file *file = iocb->ki_filp;
       struct address_space * mapping = file->f_mapping;
       size_t ocount;          /* original count */
       size_t count;           /* after file limit checks */
       struct inode    *inode = mapping->host;
       loff_t          pos;
       ssize_t         written;
       ssize_t         err;

       ocount = 0;
       err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
       if (err)
               return err;

       count = ocount;
       pos = *ppos;

       /* Set for the duration of the write; cleared again at 'out'. */
       current->backing_dev_info = mapping->backing_dev_info;
       written = 0;

       err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
       if (err)
               goto out;

       /* Nothing left to write after the checks. */
       if (count == 0)
               goto out;

       err = file_remove_suid(file);
       if (err)
               goto out;

       err = file_update_time(file);
       if (err)
               goto out;

       if (unlikely(file->f_flags & O_DIRECT)) {
               loff_t endbyte;
               ssize_t written_buffered;

               written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
                                                       ppos, count, ocount);
               /* Done on error or when the direct write covered everything. */
               if (written < 0 || written == count)
                       goto out;

               /* Write the remainder through the page cache. */
               pos += written;
               count -= written;
               written_buffered = generic_file_buffered_write(iocb, iov,
                                               nr_segs, pos, ppos, count,
                                               written);

               if (written_buffered < 0) {
                       err = written_buffered;
                       goto out;
               }

               /*
                * Write out the range that went through the page cache and,
                * on success, invalidate those page-cache pages.
                */
               endbyte = pos + written_buffered - written - 1;
               err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
               if (err == 0) {
                       written = written_buffered;
                       invalidate_mapping_pages(mapping,
                                                pos >> PAGE_CACHE_SHIFT,
                                                endbyte >> PAGE_CACHE_SHIFT);
               }
       } else {
               written = generic_file_buffered_write(iocb, iov, nr_segs,
                               pos, ppos, count, written);
       }
out:
       current->backing_dev_info = NULL;
       return written ? written : err;
}

/*
 * Buffered (page cache) write: sets up an iov_iter over the user
 * segments and lets generic_perform_write() copy the data. On success
 * the byte count is added to 'written' and *ppos is advanced.
 * Returns 'written' if non-zero, otherwise the status.
 */
ssize_t generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
               unsigned long nr_segs, loff_t pos, loff_t *ppos, size_t count, ssize_t written)
{
       struct file *file = iocb->ki_filp;
       ssize_t status;
       struct iov_iter i;

       iov_iter_init(&i, iov, nr_segs, count, written);
       status = generic_perform_write(file, &i, pos);

       if (likely(status >= 0)) {
               written += status;
               *ppos = pos + status;
       }
       
       return written ? written : status;
}

/* fsync implementation that does nothing: ignores all arguments, returns 0. */
int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
       return 0;
}

/*
 * Address-space (page cache) callbacks used by ramfs. The write_begin
 * and write_end hooks are the ones invoked by generic_perform_write().
 */
const struct address_space_operations ramfs_aops = {
       .readpage       = simple_readpage,
       .write_begin    = simple_write_begin,
       .write_end      = simple_write_end,
       .set_page_dirty = __set_page_dirty_no_writeback,
};

/*
 * Copy the data described by iov_iter 'i' into the file's page cache,
 * one page at a time, starting at file offset 'pos'. For each page it
 * calls a_ops->write_begin() to obtain the page, copies the user data,
 * and calls a_ops->write_end().
 * Returns the number of bytes written if non-zero, otherwise the last
 * status.
 */
static ssize_t generic_perform_write(struct file *file,
                               struct iov_iter *i, loff_t pos)
{
       struct address_space *mapping = file->f_mapping;
       const struct address_space_operations *a_ops = mapping->a_ops;
       long status = 0;
       ssize_t written = 0;
       unsigned int flags = 0;

       /* Caller runs with the KERNEL_DS address limit. */
       if (segment_eq(get_fs(), KERNEL_DS))
               flags |= AOP_FLAG_UNINTERRUPTIBLE;

       do {
               struct page *page;
               unsigned long offset;   /* Offset into pagecache page */
               unsigned long bytes;    /* Bytes to write to page */
               size_t copied;          /* Bytes copied from user */
               void *fsdata;

               /* Limit this round to the rest of the page or of the data. */
               offset = (pos & (PAGE_CACHE_SIZE - 1));
               bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                                               iov_iter_count(i));

again:
               /* Fault the source in first; the copy below disables page faults. */
               if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
                       status = -EFAULT;
                       break;
               }

               /* Filesystem hook supplies the page-cache page via &page. */
               status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                                               &page, &fsdata);
               if (unlikely(status))
                       break;

               if (mapping_writably_mapped(mapping))
                       flush_dcache_page(page);

               pagefault_disable();
               copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
               pagefault_enable();
               flush_dcache_page(page);

               mark_page_accessed(page);
               /* write_end() returns a negative error or the byte count kept. */
               status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                               page, fsdata);
               if (unlikely(status < 0))
                       break;
               copied = status;

               cond_resched();

               iov_iter_advance(i, copied);
               /* Nothing copied: retry, limited to the current iovec segment. */
               if (unlikely(copied == 0)) {
                       bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                                               iov_iter_single_seg_count(i));
                       goto again;
               }
               pos += copied;
               written += copied;

               balance_dirty_pages_ratelimited(mapping);
               if (fatal_signal_pending(current)) {
                       status = -EINTR;
                       break;
               }
       } while (iov_iter_count(i));

       return written ? written : status;
}

/*
 * write_begin hook: obtain the page-cache page covering 'pos' and return
 * it through *pagep. For a partial write into a page that is not
 * up to date, the parts of the page outside [from, from + len) are
 * zeroed. Returns 0, or -ENOMEM when no page could be obtained.
 */
int simple_write_begin(struct file *file, struct address_space *mapping,
                       loff_t pos, unsigned len, unsigned flags,
                       struct page **pagep, void **fsdata)
{
       struct page *page;
       pgoff_t index;

       /* File offset -> page index. */
       index = pos >> PAGE_CACHE_SHIFT;

       page = grab_cache_page_write_begin(mapping, index, flags);
       if (!page)
               return -ENOMEM;

       *pagep = page;

       if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
               unsigned from = pos & (PAGE_CACHE_SIZE - 1);

               zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
       }
       return 0;
}

/*
 * Find the page at 'index' in the mapping, or allocate one and insert it
 * into the page cache. Returns the page, or NULL when allocation or
 * insertion fails (insertion failing with -EEXIST is retried).
 */
struct page *grab_cache_page_write_begin(struct address_space *mapping,
                                       pgoff_t index, unsigned flags)
{
       int status;
       gfp_t gfp_mask;
       struct page *page;
       gfp_t gfp_notmask = 0;

       gfp_mask = mapping_gfp_mask(mapping);
       if (mapping_cap_account_dirty(mapping))
               gfp_mask |= __GFP_WRITE;
       /* AOP_FLAG_NOFS: strip __GFP_FS from the allocation masks below. */
       if (flags & AOP_FLAG_NOFS)
               gfp_notmask = __GFP_FS;
repeat:
       page = find_lock_page(mapping, index);
       if (page)
               goto found;

       page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
       if (!page)
               return NULL;
       status = add_to_page_cache_lru(page, mapping, index,
                                               GFP_KERNEL & ~gfp_notmask);
       if (unlikely(status)) {
               page_cache_release(page);
               /* The slot got filled in the meantime: look it up again. */
               if (status == -EEXIST)
                       goto repeat;
               return NULL;
       }
found:
       wait_on_page_writeback(page);
       return page;
}


/*
 * File operations for regular ramfs files: generic read/write helpers
 * for data access and noop_fsync() for .fsync.
 */
const struct file_operations ramfs_file_operations = {
       .read           = do_sync_read,
       .aio_read       = generic_file_aio_read,
       .write          = do_sync_write,
       .aio_write      = generic_file_aio_write,
       .mmap           = generic_file_mmap,
       .fsync          = noop_fsync,
       .splice_read    = generic_file_splice_read,
       .splice_write   = generic_file_splice_write,
       .llseek         = generic_file_llseek,
};

/*
 * Generic ->aio_write: runs __generic_file_aio_write() under the inode's
 * i_mutex, bracketed by sb_start_write()/sb_end_write(), then calls
 * generic_write_sync() when data was written or the request was queued.
 */
ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
               unsigned long nr_segs, loff_t pos)
{
       struct file *file = iocb->ki_filp;
       struct inode *inode = file->f_mapping->host;
       ssize_t ret;

       /* The explicit pos argument must match the position in the kiocb. */
       BUG_ON(iocb->ki_pos != pos);

       sb_start_write(inode->i_sb);
       mutex_lock(&inode->i_mutex);
       ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
       mutex_unlock(&inode->i_mutex);

       if (ret > 0 || ret == -EIOCBQUEUED) {
               ssize_t err;

               /* A sync error replaces ret only when ret was a byte count. */
               err = generic_write_sync(file, pos, ret);
               if (err < 0 && ret > 0)
                       ret = err;
       }
       sb_end_write(inode->i_sb);
       return ret;
}

/*
 * Core of the generic write path. Validates the iovec and the write
 * range, updates suid state and timestamps, then writes either through
 * the direct-I/O path (O_DIRECT, with a buffered fallback for whatever
 * was not written directly) or through generic_file_buffered_write().
 * Returns the number of bytes written if non-zero, otherwise err.
 */
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                                 unsigned long nr_segs, loff_t *ppos)
{
       struct file *file = iocb->ki_filp;
       struct address_space * mapping = file->f_mapping;
       size_t ocount;          /* original count */
       size_t count;           /* after file limit checks */
       struct inode    *inode = mapping->host;
       loff_t          pos;
       ssize_t         written;
       ssize_t         err;

       ocount = 0;
       err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
       if (err)
               return err;

       count = ocount;
       pos = *ppos;

       /* Set for the duration of the write; cleared again at 'out'. */
       current->backing_dev_info = mapping->backing_dev_info;
       written = 0;

       err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
       if (err)
               goto out;

       /* Nothing left to write after the checks. */
       if (count == 0)
               goto out;

       err = file_remove_suid(file);
       if (err)
               goto out;

       err = file_update_time(file);
       if (err)
               goto out;

       if (unlikely(file->f_flags & O_DIRECT)) {
               loff_t endbyte;
               ssize_t written_buffered;

               written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
                                                       ppos, count, ocount);
               /* Done on error or when the direct write covered everything. */
               if (written < 0 || written == count)
                       goto out;

               /* Write the remainder through the page cache. */
               pos += written;
               count -= written;
               written_buffered = generic_file_buffered_write(iocb, iov,
                                               nr_segs, pos, ppos, count,
                                               written);
               if (written_buffered < 0) {
                       err = written_buffered;
                       goto out;
               }

               /*
                * Write out the range that went through the page cache and,
                * on success, invalidate those page-cache pages.
                */
               endbyte = pos + written_buffered - written - 1;
               err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
               if (err == 0) {
                       written = written_buffered;
                       invalidate_mapping_pages(mapping,
                                                pos >> PAGE_CACHE_SHIFT,
                                                endbyte >> PAGE_CACHE_SHIFT);
               } 
       } else {
               written = generic_file_buffered_write(iocb, iov, nr_segs,
                               pos, ppos, count, written);
       }
out:
       current->backing_dev_info = NULL;
       return written ? written : err;
}

/*
 * Buffered (page cache) write: sets up an iov_iter over the user
 * segments and lets generic_perform_write() copy the data. On success
 * the byte count is added to 'written' and *ppos is advanced.
 * Returns 'written' if non-zero, otherwise the status.
 */
ssize_t generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
               unsigned long nr_segs, loff_t pos, loff_t *ppos,
               size_t count, ssize_t written)
{
       struct file *file = iocb->ki_filp;
       ssize_t status;
       struct iov_iter i;

       iov_iter_init(&i, iov, nr_segs, count, written);
       status = generic_perform_write(file, &i, pos);

       if (likely(status >= 0)) {
               written += status;
               *ppos = pos + status;
       }
       
       return written ? written : status;
}

/*
 * write_begin hook: obtain the page-cache page covering 'pos' and return
 * it through *pagep. For a partial write into a page that is not
 * up to date, the parts of the page outside [from, from + len) are
 * zeroed. Returns 0, or -ENOMEM when no page could be obtained.
 */
int simple_write_begin(struct file *file, struct address_space *mapping,
                       loff_t pos, unsigned len, unsigned flags,
                       struct page **pagep, void **fsdata)
{
       struct page *page;
       pgoff_t index;

       /* File offset -> page index. */
       index = pos >> PAGE_CACHE_SHIFT;

       page = grab_cache_page_write_begin(mapping, index, flags);
       if (!page)
               return -ENOMEM;

       *pagep = page;

       if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
               unsigned from = pos & (PAGE_CACHE_SIZE - 1);

               zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
       }
       return 0;
}

/*
 * Find the page at 'index' in the mapping, or allocate one and insert it
 * into the page cache via add_to_page_cache_lru(). Returns the page, or
 * NULL when allocation or insertion fails (-EEXIST is retried).
 */
struct page *grab_cache_page_write_begin(struct address_space *mapping,
                                       pgoff_t index, unsigned flags)
{
       int status;
       gfp_t gfp_mask;
       struct page *page;
       gfp_t gfp_notmask = 0;

       gfp_mask = mapping_gfp_mask(mapping);
       if (mapping_cap_account_dirty(mapping))
               gfp_mask |= __GFP_WRITE;
       /* AOP_FLAG_NOFS: strip __GFP_FS from the allocation masks below. */
       if (flags & AOP_FLAG_NOFS)
               gfp_notmask = __GFP_FS;
repeat:
       page = find_lock_page(mapping, index);
       if (page)
               goto found;

       page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
       if (!page)
               return NULL;
       status = add_to_page_cache_lru(page, mapping, index,
                                               GFP_KERNEL & ~gfp_notmask);
       if (unlikely(status)) {
               page_cache_release(page);
               /* The slot got filled in the meantime: look it up again. */
               if (status == -EEXIST)
                       goto repeat;
               return NULL;
       }
found:
       wait_on_page_writeback(page);
       return page;
}

/*
 * Insert the page into the page cache and, if that succeeded, also hand
 * it to lru_cache_add_file(). Returns add_to_page_cache()'s result.
 */
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                               pgoff_t offset, gfp_t gfp_mask)
{
       int ret;

       ret = add_to_page_cache(page, mapping, offset, gfp_mask);
       if (ret == 0)
               lru_cache_add_file(page);
       return ret;
}

/*
 * Mark the page locked and insert it with add_to_page_cache_locked();
 * the lock bit is cleared again if the insertion fails.
 */
static inline int add_to_page_cache(struct page *page,
               struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
       int error;

       __set_page_locked(page);
       error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
       if (unlikely(error))
               __clear_page_locked(page);
       return error;
}

/*
 * Insert an already-locked page into mapping->page_tree at 'offset'.
 * Charges the page to the memory cgroup, preloads the radix tree, then
 * inserts under mapping->tree_lock. On failure the charge and the extra
 * page reference are dropped again. Returns 0 or a negative error.
 */
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
               pgoff_t offset, gfp_t gfp_mask)
{
       int error;

       VM_BUG_ON(!PageLocked(page));
       VM_BUG_ON(PageSwapBacked(page));

       error = mem_cgroup_cache_charge(page, current->mm,
                                       gfp_mask & GFP_RECLAIM_MASK);
       if (error)
               goto out;

       error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
       if (error == 0) {
               /* Take a page reference and bind the page to the mapping. */
               page_cache_get(page);
               page->mapping = mapping;
               page->index = offset;

               spin_lock_irq(&mapping->tree_lock);
               error = radix_tree_insert(&mapping->page_tree, offset, page);
               if (likely(!error)) {
                       mapping->nrpages++;
                       __inc_zone_page_state(page, NR_FILE_PAGES);
                       spin_unlock_irq(&mapping->tree_lock);
               } else {
                       /* Insertion failed: undo the binding, charge and reference. */
                       page->mapping = NULL;
                       /* Leave page->index set: truncation relies upon it */
                       spin_unlock_irq(&mapping->tree_lock);
                       mem_cgroup_uncharge_cache_page(page);
                       page_cache_release(page);
               }
               radix_tree_preload_end();
       } else
               mem_cgroup_uncharge_cache_page(page);
out:
       return error;
}

/*
 * Insert 'item' at 'index' in the radix tree. The tree is extended when
 * 'index' exceeds what the current height can address, and missing
 * interior nodes are allocated while walking down.
 * Returns 0 on success, -EEXIST if the slot is already occupied,
 * -ENOMEM if a node cannot be allocated, or radix_tree_extend()'s error.
 */
int radix_tree_insert(struct radix_tree_root *root,
                       unsigned long index, void *item)
{
       struct radix_tree_node *node = NULL, *slot;
       unsigned int height, shift;
       int offset;
       int error;

       BUG_ON(radix_tree_is_indirect_ptr(item));

       if (index > radix_tree_maxindex(root->height)) {
               error = radix_tree_extend(root, index);
               if (error)
                       return error;
       }

       slot = indirect_to_ptr(root->rnode);

       /* Start with the index bits belonging to the topmost level. */
       height = root->height;
       shift = (height-1) * RADIX_TREE_MAP_SHIFT;

       offset = 0; 
       while (height > 0) {
               if (slot == NULL) {
                       /* Allocate the missing node and link it to its parent or the root. */
                       if (!(slot = radix_tree_node_alloc(root)))
                               return -ENOMEM;
                       slot->height = height;
                       slot->parent = node;
                       if (node) {
                               rcu_assign_pointer(node->slots[offset], slot);
                               node->count++;
                       } else
                               rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
               }

               /* Descend one level, using RADIX_TREE_MAP_SHIFT bits of index. */
               offset = (index >> shift) & RADIX_TREE_MAP_MASK;
               node = slot;
               slot = node->slots[offset];
               shift -= RADIX_TREE_MAP_SHIFT;
               height--;
       }

       if (slot != NULL)
               return -EEXIST;

       if (node) {
               node->count++;
               rcu_assign_pointer(node->slots[offset], item);
               BUG_ON(tag_get(node, 0, offset));
               BUG_ON(tag_get(node, 1, offset));
       } else {
               /* Height 0: the item is stored directly in root->rnode. */
               rcu_assign_pointer(root->rnode, item);
               BUG_ON(root_tag_get(root, 0));
               BUG_ON(root_tag_get(root, 1));
       }

       return 0;
}

/* fsync implementation that does nothing: ignores all arguments, returns 0. */
int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
       return 0;
}
他のファイルシステムの実装との違いは、ブロックデバイスに書き込む.fsyncが実装されているかどうかであり、read/writeに係る実装は、ramfsと同様にfile->f_mapping->page_tree->rnode->slots[]のページキャッシュを介して行われます。
/*
 * File operations for ext3 regular files. The read/write entries are
 * generic helpers (do_sync_write / generic_file_aio_write among them),
 * while .fsync is the ext3-specific ext3_sync_file().
 */
const struct file_operations ext3_file_operations = {
       .llseek         = generic_file_llseek,
       .read           = do_sync_read,
       .write          = do_sync_write,
       .aio_read       = generic_file_aio_read,
       .aio_write      = generic_file_aio_write,
       .unlocked_ioctl = ext3_ioctl,
#ifdef CONFIG_COMPAT
       .compat_ioctl   = ext3_compat_ioctl,
#endif
       .mmap           = generic_file_mmap,
       .open           = dquot_file_open,
       .release        = ext3_release_file,
       .fsync          = ext3_sync_file,
       .splice_read    = generic_file_splice_read,
       .splice_write   = generic_file_splice_write,
};

/*
 * ext3 fsync: writes out and waits for the page-cache range
 * [start, end], then commits the relevant journal transaction and,
 * where needed, issues a flush to the block device.
 * Returns 0 on a read-only filesystem, otherwise the first error met.
 */
int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
       struct inode *inode = file->f_mapping->host;
       struct ext3_inode_info *ei = EXT3_I(inode);
       journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
       int ret, needs_barrier = 0;
       tid_t commit_tid;

       trace_ext3_sync_file_enter(file, datasync);

       if (inode->i_sb->s_flags & MS_RDONLY)
               return 0;

       /* Push the dirty page-cache pages of the range out and wait for them. */
       ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
       if (ret)
               goto out;

       J_ASSERT(ext3_journal_current_handle() == NULL);

       /* Inodes with journalled data: forcing a commit is all that is done. */
       if (ext3_should_journal_data(inode)) {
               ret = ext3_force_commit(inode->i_sb);
               goto out;
       }

       /* Pick the transaction id to wait for, depending on datasync. */
       if (datasync)
               commit_tid = atomic_read(&ei->i_datasync_tid);
       else
               commit_tid = atomic_read(&ei->i_sync_tid);

       /* Flush explicitly only if barriers are on and the commit won't send one. */
       if (test_opt(inode->i_sb, BARRIER) &&
           !journal_trans_will_send_data_barrier(journal, commit_tid))
               needs_barrier = 1;
       log_start_commit(journal, commit_tid);
       ret = log_wait_commit(journal, commit_tid);

       if (needs_barrier) {
               int err;

               /* A flush error is reported only if the commit itself succeeded. */
               err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
               if (!ret)
                       ret = err;
       }
out:
       trace_ext3_sync_file_exit(inode, ret);
       return ret;
}

/*
 * Core of the generic write path. Validates the iovec and the write
 * range, updates suid state and timestamps, then writes either through
 * the direct-I/O path (O_DIRECT, with a buffered fallback for whatever
 * was not written directly) or through generic_file_buffered_write().
 * Returns the number of bytes written if non-zero, otherwise err.
 */
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                                 unsigned long nr_segs, loff_t *ppos)
{
       struct file *file = iocb->ki_filp;
       struct address_space * mapping = file->f_mapping;
       size_t ocount;          /* original count */
       size_t count;           /* after file limit checks */
       struct inode    *inode = mapping->host;
       loff_t          pos;
       ssize_t         written;
       ssize_t         err;

       ocount = 0;
       err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
       if (err)
               return err;

       count = ocount;
       pos = *ppos;

       /* Set for the duration of the write; cleared again at 'out'. */
       current->backing_dev_info = mapping->backing_dev_info;
       written = 0;

       err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
       if (err)
               goto out;

       /* Nothing left to write after the checks. */
       if (count == 0)
               goto out;

       err = file_remove_suid(file);
       if (err)
               goto out;

       err = file_update_time(file);
       if (err)
               goto out;

       if (unlikely(file->f_flags & O_DIRECT)) {
               loff_t endbyte;
               ssize_t written_buffered;

               written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
                                                       ppos, count, ocount);
               /* Done on error or when the direct write covered everything. */
               if (written < 0 || written == count)
                       goto out;

               /* Write the remainder through the page cache. */
               pos += written;
               count -= written;
               written_buffered = generic_file_buffered_write(iocb, iov,
                                               nr_segs, pos, ppos, count,
                                               written);
               if (written_buffered < 0) {
                       err = written_buffered;
                       goto out;
               }

               /*
                * Write out the range that went through the page cache and,
                * on success, invalidate those page-cache pages.
                */
               endbyte = pos + written_buffered - written - 1;
               err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
               if (err == 0) {
                       written = written_buffered;
                       invalidate_mapping_pages(mapping,
                                                pos >> PAGE_CACHE_SHIFT,
                                                endbyte >> PAGE_CACHE_SHIFT);
               } else {
                       /*
                        * Intentionally empty: 'written' keeps the count from
                        * the direct write, which is what gets returned below.
                        */
               }
       } else {
               written = generic_file_buffered_write(iocb, iov, nr_segs,
                               pos, ppos, count, written);
       }
out:
       current->backing_dev_info = NULL;
       return written ? written : err;
}

/*
 * Generic ->aio_write: runs __generic_file_aio_write() under the inode's
 * i_mutex, bracketed by sb_start_write()/sb_end_write(), then calls
 * generic_write_sync() when data was written or the request was queued.
 */
ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
               unsigned long nr_segs, loff_t pos)
{
       struct file *file = iocb->ki_filp;
       struct inode *inode = file->f_mapping->host;
       ssize_t ret;

       /* The explicit pos argument must match the position in the kiocb. */
       BUG_ON(iocb->ki_pos != pos);

       sb_start_write(inode->i_sb);
       mutex_lock(&inode->i_mutex);
       ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
       mutex_unlock(&inode->i_mutex);

       if (ret > 0 || ret == -EIOCBQUEUED) {
               ssize_t err;

               /* A sync error replaces ret only when ret was a byte count. */
               err = generic_write_sync(file, pos, ret);
               if (err < 0 && ret > 0)
                       ret = err;
       }
       sb_end_write(inode->i_sb);
       return ret;
}

/*
 * Buffered (page cache) write: sets up an iov_iter over the user
 * segments and lets generic_perform_write() copy the data. On success
 * the byte count is added to 'written' and *ppos is advanced.
 * Returns 'written' if non-zero, otherwise the status.
 */
ssize_t generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
               unsigned long nr_segs, loff_t pos, loff_t *ppos,
               size_t count, ssize_t written)
{
       struct file *file = iocb->ki_filp;
       ssize_t status;
       struct iov_iter i;

       iov_iter_init(&i, iov, nr_segs, count, written);
       status = generic_perform_write(file, &i, pos);

       if (likely(status >= 0)) {
               written += status;
               *ppos = pos + status;
       }

       return written ? written : status;
}

/*
 * Copy the data described by iov_iter 'i' into the file's page cache,
 * one page at a time, starting at file offset 'pos'. For each page it
 * calls a_ops->write_begin() to obtain the page, copies the user data,
 * and calls a_ops->write_end().
 * Returns the number of bytes written if non-zero, otherwise the last
 * status.
 */
static ssize_t generic_perform_write(struct file *file,
                               struct iov_iter *i, loff_t pos)
{
       struct address_space *mapping = file->f_mapping;
       const struct address_space_operations *a_ops = mapping->a_ops;
       long status = 0;
       ssize_t written = 0;
       unsigned int flags = 0;

       /* Caller runs with the KERNEL_DS address limit. */
       if (segment_eq(get_fs(), KERNEL_DS))
               flags |= AOP_FLAG_UNINTERRUPTIBLE;

       do {
               struct page *page;
               unsigned long offset;   /* Offset into pagecache page */
               unsigned long bytes;    /* Bytes to write to page */
               size_t copied;          /* Bytes copied from user */
               void *fsdata;

               /* Limit this round to the rest of the page or of the data. */
               offset = (pos & (PAGE_CACHE_SIZE - 1));
               bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                                               iov_iter_count(i));

again:
               /* Fault the source in first; the copy below disables page faults. */
               if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
                       status = -EFAULT;
                       break;
               }

               /* Filesystem hook supplies the page-cache page via &page. */
               status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                                               &page, &fsdata);
               if (unlikely(status))
                       break;

               if (mapping_writably_mapped(mapping))
                       flush_dcache_page(page);

               pagefault_disable();
               copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
               pagefault_enable();
               flush_dcache_page(page);

               mark_page_accessed(page);
               /* write_end() returns a negative error or the byte count kept. */
               status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                               page, fsdata);
               if (unlikely(status < 0))
                       break;
               copied = status;

               cond_resched();

               iov_iter_advance(i, copied);
               /* Nothing copied: retry, limited to the current iovec segment. */
               if (unlikely(copied == 0)) {
                       bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                                               iov_iter_single_seg_count(i));
                       goto again;
               }
               pos += copied;
               written += copied;

               balance_dirty_pages_ratelimited(mapping);
               if (fatal_signal_pending(current)) {
                       status = -EINTR;
                       break;
               }
       } while (iov_iter_count(i));

       return written ? written : status;
}

/*
 * Buffered (page cache) write: sets up an iov_iter over the user
 * segments and lets generic_perform_write() copy the data. On success
 * the byte count is added to 'written' and *ppos is advanced.
 * Returns 'written' if non-zero, otherwise the status.
 */
ssize_t generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
               unsigned long nr_segs, loff_t pos, loff_t *ppos,
               size_t count, ssize_t written)
{
       struct file *file = iocb->ki_filp;
       ssize_t status;
       struct iov_iter i;

       iov_iter_init(&i, iov, nr_segs, count, written);
       status = generic_perform_write(file, &i, pos);

       if (likely(status >= 0)) {
               written += status;
               *ppos = pos + status;
       }
       
       return written ? written : status;
}

備考

ramfsはブロックデバイスを参照しませんが、mountシステムコールは、デバイス名がNULLでなければ、(通常はfile->f_op->fsyncの更新先となる)ブロックファイルの名称をmnt->mnt_devnameに設定します。そのため、ブロックデバイスを運用しないramfsのmountでも、mountコマンドの引数として適当な名称をブロックデバイス名の位置に指定する必要があります。
[root@north a]# mount -t ramfs babakaka /mnt7
struct vfsmount mnt->mnt_devname = kstrdup("babakaka", GFP_KERNEL);
[root@north a]# mount | grep ramfs
babakaka on /mnt7 type ramfs (rw,relatime)

/*
 * Create a new mount of filesystem 'type' and attach it at 'path'.
 * 'name' is passed through to do_kern_mount(). Returns -EINVAL when no
 * type is given, the error encoded in the mount pointer, or
 * do_add_mount()'s result.
 */
static int do_new_mount(struct path *path, const char *type, int flags,
                       int mnt_flags, const char *name, void *data)
{
       struct vfsmount *mnt;
       int err;

       if (!type)
               return -EINVAL;

       mnt = do_kern_mount(type, flags, name, data);
       if (IS_ERR(mnt))
               return PTR_ERR(mnt);

       /* Attaching failed: drop the reference to the new mount. */
       err = do_add_mount(real_mount(mnt), path, mnt_flags);
       if (err)
               mntput(mnt);
       return err;
}

/*
 * Look up the filesystem type named 'fstype' and mount it through
 * vfs_kern_mount(). Returns the new vfsmount, ERR_PTR(-ENODEV) when the
 * type is unknown, or whatever error pointer vfs_kern_mount() /
 * fs_set_subtype() produce.
 */
struct vfsmount *
do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{
       struct file_system_type *type = get_fs_type(fstype);
       struct vfsmount *mnt;
       if (!type)
               return ERR_PTR(-ENODEV);
       mnt = vfs_kern_mount(type, flags, name, data);
       /* Filesystems with subtypes: record it if the superblock has none yet. */
       if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
           !mnt->mnt_sb->s_subtype)
               mnt = fs_set_subtype(mnt, fstype);
       /* Drop the reference taken by get_fs_type(). */
       put_filesystem(type);
       return mnt;
}

/*
 * Allocate and initialise a struct mount. When 'name' is non-NULL a
 * copy of it is stored in mnt->mnt_devname. Returns the new mount, or
 * NULL if any allocation step fails (everything acquired so far is
 * released again).
 */
static struct mount *alloc_vfsmnt(const char *name)
{
       struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
       if (mnt) {
               int err;

               err = mnt_alloc_id(mnt);
               if (err)
                       goto out_free_cache;

               /* The device name is just a duplicated string. */
               if (name) {
                       mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
                       if (!mnt->mnt_devname)
                               goto out_free_id;
               }

#ifdef CONFIG_SMP
               /* SMP: per-CPU counters; start with a count of 1. */
               mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
               if (!mnt->mnt_pcp)
                       goto out_free_devname;

               this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
               mnt->mnt_count = 1;
               mnt->mnt_writers = 0;
#endif

               INIT_LIST_HEAD(&mnt->mnt_hash);
               INIT_LIST_HEAD(&mnt->mnt_child);
               INIT_LIST_HEAD(&mnt->mnt_mounts);
               INIT_LIST_HEAD(&mnt->mnt_list);
               INIT_LIST_HEAD(&mnt->mnt_expire);
               INIT_LIST_HEAD(&mnt->mnt_share);
               INIT_LIST_HEAD(&mnt->mnt_slave_list);
               INIT_LIST_HEAD(&mnt->mnt_slave);
#ifdef CONFIG_FSNOTIFY
               INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
       }
       return mnt;

       /* Error unwinding, in reverse order of acquisition. */
#ifdef CONFIG_SMP
out_free_devname:
       kfree(mnt->mnt_devname);
#endif
out_free_id:
       mnt_free_id(mnt);
out_free_cache:
       kmem_cache_free(mnt_cache, mnt);
       return NULL;
}


最終更新 2017/11/05 16:29:46 - north
(2010/06/02 18:21:57 作成)


検索

アクセス数
3712750
最近のコメント
コアダンプファイル - sakaia
list_head構造体 - yocto_no_yomikata
勧告ロックと強制ロック - wataash
LKMからのファイル出力 - 重松 宏昌
kprobe - ななし
ksetの実装 - スーパーコピー
カーネルスレッドとは - ノース
カーネルスレッドとは - nbyst
asmlinkageってなに? - ノース
asmlinkageってなに? - よろしく
Adsense
広告情報が設定されていません。