X-Git-Url: http://git.onelab.eu/?p=linux-2.6.git;a=blobdiff_plain;f=fs%2Focfs2%2Faops.c;fp=fs%2Focfs2%2Faops.c;h=93628b02ef5d8899658834e093195e9a702a8693;hp=8f4467a930a548648c33b5cf075a4b022989fd31;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hpb=76828883507a47dae78837ab5dec5a5b4513c667 diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 8f4467a93..93628b02e 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -74,8 +74,8 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, fe = (struct ocfs2_dinode *) bh->b_data; if (!OCFS2_IS_VALID_DINODE(fe)) { - mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n", - fe->i_blkno, 7, fe->i_signature); + mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)fe->i_blkno, 7, fe->i_signature); goto bail; } @@ -162,8 +162,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, NULL); if (err) { mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " - "%"MLFu64", NULL)\n", err, inode, - (unsigned long long)iblock, p_blkno); + "%llu, NULL)\n", err, inode, (unsigned long long)iblock, + (unsigned long long)p_blkno); goto bail; } @@ -171,13 +171,15 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, if (bh_result->b_blocknr == 0) { err = -EIO; - mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" " - "blkno=(%"MLFu64")\n", (unsigned long long)iblock, - p_blkno, OCFS2_I(inode)->ip_blkno); + mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n", + (unsigned long long)iblock, + (unsigned long long)p_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); } past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); - mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof); + mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, + (unsigned long long)past_eof); if (create && (iblock >= past_eof)) set_buffer_new(bh_result); @@ -198,7 +200,7 @@ static int ocfs2_readpage(struct file *file, struct page *page) mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); - ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page); + ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); if (ret != 0) { if (ret == AOP_TRUNCATED_PAGE) unlock = 0; @@ -274,30 +276,42 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) return ret; } +/* This can also be called from ocfs2_write_zero_page() which has done + * it's own cluster locking. */ +int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + int ret; + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + + ret = block_prepare_write(page, from, to, ocfs2_get_block); + + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + return ret; +} + /* * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called * from loopback. It must be able to perform its own locking around * ocfs2_get_block(). */ -int ocfs2_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ocfs2_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) { struct inode *inode = page->mapping->host; int ret; mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); - ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page); + ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); if (ret != 0) { mlog_errno(ret); goto out; } - down_read(&OCFS2_I(inode)->ip_alloc_sem); - - ret = block_prepare_write(page, from, to, ocfs2_get_block); - - up_read(&OCFS2_I(inode)->ip_alloc_sem); + ret = ocfs2_prepare_write_nolock(inode, page, from, to); ocfs2_meta_unlock(inode, 0); out: @@ -341,16 +355,16 @@ static int walk_page_buffers( handle_t *handle, return ret; } -struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode, +handle_t *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, unsigned from, unsigned to) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_journal_handle *handle = NULL; + handle_t *handle = NULL; int ret = 0; - handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (!handle) { ret = -ENOMEM; mlog_errno(ret); @@ -358,7 +372,7 @@ struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode, } if (ocfs2_should_order_data(inode)) { - ret = walk_page_buffers(handle->k_handle, + ret = walk_page_buffers(handle, page_buffers(page), from, to, NULL, ocfs2_journal_dirty_data); @@ -368,7 +382,7 @@ struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode, out: if (ret) { if (handle) - ocfs2_commit_trans(handle); + ocfs2_commit_trans(osb, handle); handle = ERR_PTR(ret); } return handle; @@ -377,31 +391,28 @@ out: static int ocfs2_commit_write(struct file *file, struct page *page, unsigned from, unsigned to) { - int ret, extending = 0, locklevel = 0; - loff_t new_i_size; + int ret; struct buffer_head *di_bh = NULL; struct inode *inode = page->mapping->host; - struct ocfs2_journal_handle *handle = NULL; + handle_t *handle = NULL; + struct ocfs2_dinode *di; mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); /* NOTE: ocfs2_file_aio_write has ensured that it's safe for - * us to sample inode->i_size here without the metadata lock: + * us to continue here without rechecking the I/O against + * changed inode values. * * 1) We're currently holding the inode alloc lock, so no * nodes can change it underneath us. * * 2) We've had to take the metadata lock at least once - * already to check for extending writes, hence insuring - * that our current copy is also up to date. + * already to check for extending writes, suid removal, etc. + * The meta data update code then ensures that we don't get a + * stale inode allocation image (i_size, i_clusters, etc). */ - new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - if (new_i_size > i_size_read(inode)) { - extending = 1; - locklevel = 1; - } - ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page); + ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page); if (ret != 0) { mlog_errno(ret); goto out; @@ -413,23 +424,20 @@ static int ocfs2_commit_write(struct file *file, struct page *page, goto out_unlock_meta; } - if (extending) { - handle = ocfs2_start_walk_page_trans(inode, page, from, to); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; - goto out_unlock_data; - } + handle = ocfs2_start_walk_page_trans(inode, page, from, to); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_unlock_data; + } - /* Mark our buffer early. We'd rather catch this error up here - * as opposed to after a successful commit_write which would - * require us to set back inode->i_size. */ - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out_commit; - } + /* Mark our buffer early. We'd rather catch this error up here + * as opposed to after a successful commit_write which would + * require us to set back inode->i_size. */ + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; } /* might update i_size */ @@ -439,37 +447,28 @@ static int ocfs2_commit_write(struct file *file, struct page *page, goto out_commit; } - if (extending) { - loff_t size = (u64) i_size_read(inode); - struct ocfs2_dinode *di = - (struct ocfs2_dinode *)di_bh->b_data; + di = (struct ocfs2_dinode *)di_bh->b_data; - /* ocfs2_mark_inode_dirty is too heavy to use here. */ - inode->i_blocks = ocfs2_align_bytes_to_sectors(size); - inode->i_ctime = inode->i_mtime = CURRENT_TIME; + /* ocfs2_mark_inode_dirty() is too heavy to use here. */ + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); + di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - di->i_size = cpu_to_le64(size); - di->i_ctime = di->i_mtime = - cpu_to_le64(inode->i_mtime.tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = - cpu_to_le32(inode->i_mtime.tv_nsec); + inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode))); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret < 0) { - mlog_errno(ret); - goto out_commit; - } + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; } - BUG_ON(extending && (i_size_read(inode) != new_i_size)); - out_commit: - if (handle) - ocfs2_commit_trans(handle); + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out_unlock_data: ocfs2_data_unlock(inode, 1); out_unlock_meta: - ocfs2_meta_unlock(inode, locklevel); + ocfs2_meta_unlock(inode, 1); out: if (di_bh) brelse(di_bh); @@ -491,7 +490,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) * accessed concurrently from multiple nodes. */ if (!INODE_JOURNAL(inode)) { - err = ocfs2_meta_lock(inode, NULL, NULL, 0); + err = ocfs2_meta_lock(inode, NULL, 0); if (err) { if (err != -ENOENT) mlog_errno(err); @@ -538,32 +537,35 @@ bail: * fs_count, map_bh, dio->rw == WRITE); */ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, - unsigned long max_blocks, struct buffer_head *bh_result, int create) { int ret; - u64 vbo_max; /* file offset, max_blocks from iblock */ - u64 p_blkno; + u64 p_blkno, inode_blocks; int contig_blocks; - unsigned char blocksize_bits; - - if (!inode || !bh_result) { - mlog(ML_ERROR, "inode or bh_result is null\n"); - return -EIO; - } - - blocksize_bits = inode->i_sb->s_blocksize_bits; + unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; /* This function won't even be called if the request isn't all * nicely aligned and of the right size, so there's no need * for us to check any of that. */ - vbo_max = ((u64)iblock + max_blocks) << blocksize_bits; - spin_lock(&OCFS2_I(inode)->ip_lock); - if ((iblock + max_blocks) > - ocfs2_clusters_to_blocks(inode->i_sb, - OCFS2_I(inode)->ip_clusters)) { + inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb, + OCFS2_I(inode)->ip_clusters); + + /* + * For a read which begins past the end of file, we return a hole. + */ + if (!create && (iblock >= inode_blocks)) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + ret = 0; + goto bail; + } + + /* + * Any write past EOF is not allowed because we'd be extending. + */ + if (create && (iblock + max_blocks) > inode_blocks) { spin_unlock(&OCFS2_I(inode)->ip_lock); ret = -EIO; goto bail; @@ -603,7 +605,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, ssize_t bytes, void *private) { - struct inode *inode = iocb->ki_filp->f_dentry->d_inode; + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); @@ -619,20 +621,40 @@ static ssize_t ocfs2_direct_IO(int rw, unsigned long nr_segs) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_dentry->d_inode->i_mapping->host; + struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; int ret; mlog_entry_void(); + + /* + * We get PR data locks even for O_DIRECT. This allows + * concurrent O_DIRECT I/O but doesn't let O_DIRECT with + * extending and buffered zeroing writes race. If they did + * race then the buffered zeroing could be written back after + * the O_DIRECT I/O. It's one thing to tell people not to mix + * buffered and O_DIRECT writes, but expecting them to + * understand that file extension is also an implicit buffered + * write is too much. By getting the PR we force writeback of + * the buffered zeroing before proceeding. + */ + ret = ocfs2_data_lock(inode, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + ocfs2_data_unlock(inode, 0); + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io); +out: mlog_exit(ret); return ret; } -struct address_space_operations ocfs2_aops = { +const struct address_space_operations ocfs2_aops = { .readpage = ocfs2_readpage, .writepage = ocfs2_writepage, .prepare_write = ocfs2_prepare_write,