X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=fs%2Focfs2%2Ffile.c;fp=fs%2Focfs2%2Ffile.c;h=5a3d1fd4d28a04cefa4bc8d6b0158e129341460d;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=8a4048b55fdc41d5dda4dc5e8b9908e4a765cc64;hpb=76828883507a47dae78837ab5dec5a5b4513c667;p=linux-2.6.git diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8a4048b55..5a3d1fd4d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -30,6 +30,9 @@ #include #include #include +#include +#include +#include #define MLOG_MASK_PREFIX ML_INODE #include @@ -44,6 +47,7 @@ #include "file.h" #include "sysfile.h" #include "inode.h" +#include "ioctl.h" #include "journal.h" #include "mmap.h" #include "suballoc.h" @@ -64,7 +68,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) struct ocfs2_inode_info *oi = OCFS2_I(inode); mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, - file->f_dentry->d_name.len, file->f_dentry->d_name.name); + file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); spin_lock(&oi->ip_lock); @@ -94,8 +98,8 @@ static int ocfs2_file_release(struct inode *inode, struct file *file) struct ocfs2_inode_info *oi = OCFS2_I(inode); mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, - file->f_dentry->d_name.len, - file->f_dentry->d_name.name); + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name); spin_lock(&oi->ip_lock); if (!--oi->ip_open_count) @@ -132,7 +136,77 @@ bail: return (err < 0) ? -EIO : 0; } -int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, +int ocfs2_should_update_atime(struct inode *inode, + struct vfsmount *vfsmnt) +{ + struct timespec now; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return 0; + + if ((inode->i_flags & S_NOATIME) || + ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) + return 0; + + /* + * We can be called with no vfsmnt structure - NFSD will + * sometimes do this. + * + * Note that our action here is different than touch_atime() - + * if we can't tell whether this is a noatime mount, then we + * don't know whether to trust the value of s_atime_quantum. + */ + if (vfsmnt == NULL) + return 0; + + if ((vfsmnt->mnt_flags & MNT_NOATIME) || + ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) + return 0; + + if (vfsmnt->mnt_flags & MNT_RELATIME) { + if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || + (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) + return 1; + + return 0; + } + + now = CURRENT_TIME; + if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) + return 0; + else + return 1; +} + +int ocfs2_update_inode_atime(struct inode *inode, + struct buffer_head *bh) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle; + + mlog_entry_void(); + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (handle == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + inode->i_atime = CURRENT_TIME; + ret = ocfs2_mark_inode_dirty(handle, inode, bh); + if (ret < 0) + mlog_errno(ret); + + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + mlog_exit(ret); + return ret; +} + +int ocfs2_set_inode_size(handle_t *handle, struct inode *inode, struct buffer_head *fe_bh, u64 new_i_size) @@ -161,10 +235,9 @@ static int ocfs2_simple_size_update(struct inode *inode, { int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_journal_handle *handle = NULL; + handle_t *handle = NULL; - handle = ocfs2_start_trans(osb, NULL, - OCFS2_INODE_UPDATE_CREDITS); + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (handle == NULL) { ret = -ENOMEM; mlog_errno(ret); @@ -176,7 +249,7 @@ static int ocfs2_simple_size_update(struct inode *inode, if (ret < 0) mlog_errno(ret); - ocfs2_commit_trans(handle); + ocfs2_commit_trans(osb, handle); out: return ret; } @@ -187,14 +260,14 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, u64 new_i_size) { int status; - struct ocfs2_journal_handle *handle; + handle_t *handle; mlog_entry_void(); /* TODO: This needs to actually orphan the inode in this * transaction. */ - handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -205,7 +278,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, if (status < 0) mlog_errno(status); - ocfs2_commit_trans(handle); + ocfs2_commit_trans(osb, handle); out: mlog_exit(status); return status; @@ -220,8 +293,9 @@ static int ocfs2_truncate_file(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_truncate_context *tc = NULL; - mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n", - OCFS2_I(inode)->ip_blkno, new_i_size); + mlog_entry("(inode = %llu, new_i_size = %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)new_i_size); truncate_inode_pages(inode->i_mapping, new_i_size); @@ -233,29 +307,43 @@ static int ocfs2_truncate_file(struct inode *inode, } mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), - "Inode %"MLFu64", inode i_size = %lld != di " - "i_size = %"MLFu64", i_flags = 0x%x\n", - OCFS2_I(inode)->ip_blkno, + "Inode %llu, inode i_size = %lld != di " + "i_size = %llu, i_flags = 0x%x\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), - le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags)); + (unsigned long long)le64_to_cpu(fe->i_size), + le32_to_cpu(fe->i_flags)); if (new_i_size > le64_to_cpu(fe->i_size)) { - mlog(0, "asked to truncate file with size (%"MLFu64") " - "to size (%"MLFu64")!\n", - le64_to_cpu(fe->i_size), new_i_size); + mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", + (unsigned long long)le64_to_cpu(fe->i_size), + (unsigned long long)new_i_size); status = -EINVAL; mlog_errno(status); goto bail; } - mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n", - le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size); + mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", + (unsigned long long)le64_to_cpu(fe->i_blkno), + (unsigned long long)le64_to_cpu(fe->i_size), + (unsigned long long)new_i_size); /* lets handle the simple truncate cases before doing any more * cluster locking. */ if (new_i_size == le64_to_cpu(fe->i_size)) goto bail; + /* This forces other nodes to sync and drop their pages. Do + * this even if we have a truncate without allocation change - + * ocfs2 cluster sizes can be much greater than page size, so + * we have to truncate them anyway. */ + status = ocfs2_data_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + ocfs2_data_unlock(inode, 1); + if (le32_to_cpu(fe->i_clusters) == ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", @@ -268,14 +356,6 @@ static int ocfs2_truncate_file(struct inode *inode, goto bail; } - /* This forces other nodes to sync and drop their pages */ - status = ocfs2_data_lock(inode, 1); - if (status < 0) { - mlog_errno(status); - goto bail; - } - ocfs2_data_unlock(inode, 1); - /* alright, we're going to need to do a full blown alloc size * change. Orphan the inode so that recovery can complete the * truncate if necessary. This does the task of marking @@ -319,7 +399,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, struct inode *inode, u32 clusters_to_add, struct buffer_head *fe_bh, - struct ocfs2_journal_handle *handle, + handle_t *handle, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret) @@ -378,8 +458,8 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, } block = ocfs2_clusters_to_blocks(osb->sb, bit_off); - mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n", - num_bits, bit_off, OCFS2_I(inode)->ip_blkno); + mlog(0, "Allocating %u clusters at block %u for inode %llu\n", + num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, num_bits, meta_ac); if (status < 0) { @@ -424,7 +504,7 @@ static int ocfs2_extend_allocation(struct inode *inode, u32 prev_clusters; struct buffer_head *bh = NULL; struct ocfs2_dinode *fe = NULL; - struct ocfs2_journal_handle *handle = NULL; + handle_t *handle = NULL; struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; enum ocfs2_alloc_restarted why; @@ -449,18 +529,11 @@ static int ocfs2_extend_allocation(struct inode *inode, restart_all: BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); - mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, " + mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, " "clusters_to_add = %u\n", - OCFS2_I(inode)->ip_blkno, i_size_read(inode), + (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), fe->i_clusters, clusters_to_add); - handle = ocfs2_alloc_handle(osb); - if (handle == NULL) { - status = -ENOMEM; - mlog_errno(status); - goto leave; - } - num_free_extents = ocfs2_num_free_extents(osb, inode, fe); @@ -471,10 +544,7 @@ restart_all: } if (!num_free_extents) { - status = ocfs2_reserve_new_metadata(osb, - handle, - fe, - &meta_ac); + status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -482,10 +552,7 @@ restart_all: } } - status = ocfs2_reserve_clusters(osb, - handle, - clusters_to_add, - &data_ac); + status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -500,7 +567,7 @@ restart_all: drop_alloc_sem = 1; credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); - handle = ocfs2_start_trans(osb, handle, credits); + handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); handle = NULL; @@ -569,8 +636,8 @@ restarted_transaction: } } - mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n", - fe->i_clusters, fe->i_size); + mlog(0, "fe: i_clusters = %u, i_size=%llu\n", + fe->i_clusters, (unsigned long long)fe->i_size); mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", OCFS2_I(inode)->ip_clusters, i_size_read(inode)); @@ -580,7 +647,7 @@ leave: drop_alloc_sem = 0; } if (handle) { - ocfs2_commit_trans(handle); + ocfs2_commit_trans(osb, handle); handle = NULL; } if (data_ac) { @@ -606,7 +673,8 @@ leave: /* Some parts of this taken from generic_cont_expand, which turned out * to be too fragile to do exactly what we need without us having to - * worry about recursive locking in ->commit_write(). */ + * worry about recursive locking in ->prepare_write() and + * ->commit_write(). */ static int ocfs2_write_zero_page(struct inode *inode, u64 size) { @@ -614,7 +682,7 @@ static int ocfs2_write_zero_page(struct inode *inode, struct page *page; unsigned long index; unsigned int offset; - struct ocfs2_journal_handle *handle = NULL; + handle_t *handle = NULL; int ret; offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ @@ -634,7 +702,7 @@ static int ocfs2_write_zero_page(struct inode *inode, goto out; } - ret = ocfs2_prepare_write(NULL, page, offset, offset); + ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); if (ret < 0) { mlog_errno(ret); goto out_unlock; @@ -658,7 +726,7 @@ static int ocfs2_write_zero_page(struct inode *inode, ret = 0; if (handle) - ocfs2_commit_trans(handle); + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out_unlock: unlock_page(page); page_cache_release(page); @@ -682,19 +750,38 @@ static int ocfs2_zero_extend(struct inode *inode, } start_off += sb->s_blocksize; + + /* + * Very large extends have the potential to lock up + * the cpu for extended periods of time. + */ + cond_resched(); } out: return ret; } +/* + * A tail_to_skip value > 0 indicates that we're being called from + * ocfs2_file_aio_write(). This has the following implications: + * + * - we don't want to update i_size + * - di_bh will be NULL, which is fine because it's only used in the + * case where we want to update i_size. + * - ocfs2_zero_extend() will then only be filling the hole created + * between i_size and the start of the write. + */ static int ocfs2_extend_file(struct inode *inode, struct buffer_head *di_bh, - u64 new_i_size) + u64 new_i_size, + size_t tail_to_skip) { int ret = 0; u32 clusters_to_add; + BUG_ON(!tail_to_skip && !di_bh); + /* setattr sometimes calls us like this. */ if (new_i_size == 0) goto out; @@ -706,28 +793,49 @@ static int ocfs2_extend_file(struct inode *inode, clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - OCFS2_I(inode)->ip_clusters; + /* + * protect the pages that ocfs2_zero_extend is going to be + * pulling into the page cache.. we do this before the + * metadata extend so that we don't get into the situation + * where we've extended the metadata but can't get the data + * lock to zero. + */ + ret = ocfs2_data_lock(inode, 1); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + if (clusters_to_add) { ret = ocfs2_extend_allocation(inode, clusters_to_add); if (ret < 0) { mlog_errno(ret); - goto out; - } - - ret = ocfs2_zero_extend(inode, new_i_size); - if (ret < 0) { - mlog_errno(ret); - goto out; + goto out_unlock; } - } + } - /* No allocation required, we just use this helper to - * do a trivial update of i_size. */ - ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); + /* + * Call this even if we don't add any clusters to the tree. We + * still need to zero the area between the old i_size and the + * new i_size. + */ + ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); if (ret < 0) { mlog_errno(ret); - goto out; + goto out_unlock; } + if (!tail_to_skip) { + /* We're being called from ocfs2_setattr() which wants + * us to update i_size */ + ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); + if (ret < 0) + mlog_errno(ret); + } + +out_unlock: + ocfs2_data_unlock(inode, 1); + out: return ret; } @@ -739,7 +847,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) struct super_block *sb = inode->i_sb; struct ocfs2_super *osb = OCFS2_SB(sb); struct buffer_head *bh = NULL; - struct ocfs2_journal_handle *handle = NULL; + handle_t *handle = NULL; mlog_entry("(0x%p, '%.*s')\n", dentry, dentry->d_name.len, dentry->d_name.name); @@ -750,13 +858,15 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) mlog(0, "uid change: %d\n", attr->ia_uid); if (attr->ia_valid & ATTR_GID) mlog(0, "gid change: %d\n", attr->ia_gid); + if (attr->ia_valid & ATTR_TAG) + mlog(0, "tag change: %d\n", attr->ia_tag); if (attr->ia_valid & ATTR_SIZE) mlog(0, "size change...\n"); if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) mlog(0, "time change...\n"); #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ - | ATTR_GID | ATTR_UID | ATTR_MODE) + | ATTR_GID | ATTR_UID | ATTR_TAG | ATTR_MODE) if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); return 0; @@ -775,7 +885,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - status = ocfs2_meta_lock(inode, NULL, &bh, 1); + status = ocfs2_meta_lock(inode, &bh, 1); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -786,7 +896,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (i_size_read(inode) > attr->ia_size) status = ocfs2_truncate_file(inode, bh, attr->ia_size); else - status = ocfs2_extend_file(inode, bh, attr->ia_size); + status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -795,7 +905,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -813,7 +923,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) mlog_errno(status); bail_commit: - ocfs2_commit_trans(handle); + ocfs2_commit_trans(osb, handle); bail_unlock: ocfs2_meta_unlock(inode, 1); bail_unlock_rw: @@ -856,19 +966,39 @@ bail: return err; } +int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) +{ + int ret; + + mlog_entry_void(); + + ret = ocfs2_meta_lock(inode, NULL, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = generic_permission(inode, mask, NULL); + + ocfs2_meta_unlock(inode, 0); +out: + mlog_exit(ret); + return ret; +} + static int ocfs2_write_remove_suid(struct inode *inode) { int ret; struct buffer_head *bh = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_journal_handle *handle; + handle_t *handle; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di; - mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno, - inode->i_mode); + mlog_entry("(Inode %llu, mode 0%o)\n", + (unsigned long long)oi->ip_blkno, inode->i_mode); - handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (handle == NULL) { ret = -ENOMEM; mlog_errno(ret); @@ -901,77 +1031,29 @@ static int ocfs2_write_remove_suid(struct inode *inode) out_bh: brelse(bh); out_trans: - ocfs2_commit_trans(handle); + ocfs2_commit_trans(osb, handle); out: mlog_exit(ret); return ret; } -static inline int ocfs2_write_should_remove_suid(struct inode *inode) -{ - mode_t mode = inode->i_mode; - - if (!capable(CAP_FSETID)) { - if (unlikely(mode & S_ISUID)) - return 1; - - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - return 1; - } - return 0; -} - -static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, - const char __user *buf, - size_t count, - loff_t pos) +static int ocfs2_prepare_inode_for_write(struct dentry *dentry, + loff_t *ppos, + size_t count, + int appending) { - struct iovec local_iov = { .iov_base = (void __user *)buf, - .iov_len = count }; - int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0; + int ret = 0, meta_level = appending; + struct inode *inode = dentry->d_inode; u32 clusters; - struct file *filp = iocb->ki_filp; - struct inode *inode = filp->f_dentry->d_inode; loff_t newsize, saved_pos; - mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, - (unsigned int)count, - filp->f_dentry->d_name.len, - filp->f_dentry->d_name.name); - - /* happy write of zero bytes */ - if (count == 0) - return 0; - - if (!inode) { - mlog(0, "bad inode\n"); - return -EIO; - } - - mutex_lock(&inode->i_mutex); - /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ - if (filp->f_flags & O_DIRECT) { - have_alloc_sem = 1; - down_read(&inode->i_alloc_sem); - } - - /* concurrent O_DIRECT writes are allowed */ - rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; - ret = ocfs2_rw_lock(inode, rw_level); - if (ret < 0) { - rw_level = -1; - mlog_errno(ret); - goto out; - } - /* * We sample i_size under a read level meta lock to see if our write * is extending the file, if it is we back off and get a write level * meta lock. */ - meta_level = (filp->f_flags & O_APPEND) ? 1 : 0; for(;;) { - ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level); + ret = ocfs2_meta_lock(inode, NULL, meta_level); if (ret < 0) { meta_level = -1; mlog_errno(ret); @@ -987,7 +1069,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, * inode. There's also the dinode i_size state which * can be lost via setattr during extending writes (we * set inode->i_size at the end of a write. */ - if (ocfs2_write_should_remove_suid(inode)) { + if (should_remove_suid(dentry)) { if (meta_level == 0) { ocfs2_meta_unlock(inode, meta_level); meta_level = 1; @@ -997,17 +1079,17 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, ret = ocfs2_write_remove_suid(inode); if (ret < 0) { mlog_errno(ret); - goto out; + goto out_unlock; } } /* work on a copy of ppos until we're sure that we won't have * to recalculate it due to relocking. */ - if (filp->f_flags & O_APPEND) { + if (appending) { saved_pos = i_size_read(inode); mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); } else { - saved_pos = iocb->ki_pos; + saved_pos = *ppos; } newsize = count + saved_pos; @@ -1042,33 +1124,71 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, if (!clusters) break; - ret = ocfs2_extend_allocation(inode, clusters); + ret = ocfs2_extend_file(inode, NULL, newsize, count); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); - goto out; - } - - /* Fill any holes which would've been created by this - * write. If we're O_APPEND, this will wind up - * (correctly) being a noop. */ - ret = ocfs2_zero_extend(inode, (u64) newsize - count); - if (ret < 0) { - mlog_errno(ret); - goto out; + goto out_unlock; } break; } - /* ok, we're done with i_size and alloc work */ - iocb->ki_pos = saved_pos; + if (appending) + *ppos = saved_pos; + +out_unlock: ocfs2_meta_unlock(inode, meta_level); - meta_level = -1; + +out: + return ret; +} + +static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, + loff_t pos) +{ + int ret, rw_level, have_alloc_sem = 0; + struct file *filp = iocb->ki_filp; + struct inode *inode = filp->f_path.dentry->d_inode; + int appending = filp->f_flags & O_APPEND ? 1 : 0; + + mlog_entry("(0x%p, %u, '%.*s')\n", filp, + (unsigned int)nr_segs, + filp->f_path.dentry->d_name.len, + filp->f_path.dentry->d_name.name); + + /* happy write of zero bytes */ + if (iocb->ki_left == 0) + return 0; + + mutex_lock(&inode->i_mutex); + /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ + if (filp->f_flags & O_DIRECT) { + have_alloc_sem = 1; + down_read(&inode->i_alloc_sem); + } + + /* concurrent O_DIRECT writes are allowed */ + rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; + ret = ocfs2_rw_lock(inode, rw_level); + if (ret < 0) { + rw_level = -1; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, + iocb->ki_left, appending); + if (ret < 0) { + mlog_errno(ret); + goto out; + } /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb); - ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); + ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); @@ -1089,8 +1209,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, } out: - if (meta_level != -1) - ocfs2_meta_unlock(inode, meta_level); if (have_alloc_sem) up_read(&inode->i_alloc_sem); if (rw_level != -1) @@ -1101,19 +1219,90 @@ out: return ret; } +static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, + loff_t *ppos, + size_t len, + unsigned int flags) +{ + int ret; + struct inode *inode = out->f_path.dentry->d_inode; + + mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, + (unsigned int)len, + out->f_path.dentry->d_name.len, + out->f_path.dentry->d_name.name); + + inode_double_lock(inode, pipe->inode); + + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + /* ok, we're done with i_size and alloc work */ + ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); + +out_unlock: + ocfs2_rw_unlock(inode, 1); +out: + inode_double_unlock(inode, pipe->inode); + + mlog_exit(ret); + return ret; +} + +static ssize_t ocfs2_file_splice_read(struct file *in, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + int ret = 0; + struct inode *inode = in->f_path.dentry->d_inode; + + mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, + (unsigned int)len, + in->f_path.dentry->d_name.len, + in->f_path.dentry->d_name.name); + + /* + * See the comment in ocfs2_file_aio_read() + */ + ret = ocfs2_meta_lock(inode, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + ocfs2_meta_unlock(inode, 0); + + ret = generic_file_splice_read(in, ppos, pipe, len, flags); + +bail: + mlog_exit(ret); + return ret; +} + static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, - char __user *buf, - size_t count, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { - int ret = 0, rw_level = -1, have_alloc_sem = 0; + int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; struct file *filp = iocb->ki_filp; - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; - mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, - (unsigned int)count, - filp->f_dentry->d_name.len, - filp->f_dentry->d_name.name); + mlog_entry("(0x%p, %u, '%.*s')\n", filp, + (unsigned int)nr_segs, + filp->f_path.dentry->d_name.len, + filp->f_path.dentry->d_name.name); if (!inode) { ret = -EINVAL; @@ -1139,7 +1328,23 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, ocfs2_iocb_set_rw_locked(iocb); } - ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos); + /* + * We're fine letting folks race truncates and extending + * writes with read across the cluster, just like they can + * locally. Hence no rw_lock during read. + * + * Take and drop the meta data lock to update inode fields + * like i_size. This allows the checks down below + * generic_file_aio_read() a chance of actually working. + */ + ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + ocfs2_meta_unlock(inode, lock_level); + + ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); if (ret == -EINVAL) mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); @@ -1165,14 +1370,17 @@ bail: struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, + .sync_flags = ocfs2_sync_flags, + .permission = ocfs2_permission, }; struct inode_operations ocfs2_special_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, + .permission = ocfs2_permission, }; -struct file_operations ocfs2_fops = { +const struct file_operations ocfs2_fops = { .read = do_sync_read, .write = do_sync_write, .sendfile = generic_file_sendfile, @@ -1182,10 +1390,14 @@ struct file_operations ocfs2_fops = { .open = ocfs2_file_open, .aio_read = ocfs2_file_aio_read, .aio_write = ocfs2_file_aio_write, + .ioctl = ocfs2_ioctl, + .splice_read = ocfs2_file_splice_read, + .splice_write = ocfs2_file_splice_write, }; -struct file_operations ocfs2_dops = { +const struct file_operations ocfs2_dops = { .read = generic_read_dir, .readdir = ocfs2_readdir, .fsync = ocfs2_sync_file, + .ioctl = ocfs2_ioctl, };