fedora core 6 1.2949 + vserver 2.2.0

[linux-2.6.git] / fs / ocfs2 / aops.c
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c

index 8f4467a..93628b0 100644 (file)
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -74,8 +74,8 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
         fe = (struct ocfs2_dinode *) bh->b_data;
  
         if (!OCFS2_IS_VALID_DINODE(fe)) {
-               mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
-                    fe->i_blkno, 7, fe->i_signature);
+               mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
+                    (unsigned long long)fe->i_blkno, 7, fe->i_signature);
                 goto bail;
         }
  
@@ -162,8 +162,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
                                           NULL);
         if (err) {
                 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
-                    "%"MLFu64", NULL)\n", err, inode,
-                    (unsigned long long)iblock, p_blkno);
+                    "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
+                    (unsigned long long)p_blkno);
                 goto bail;
         }
  
@@ -171,13 +171,15 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
  
         if (bh_result->b_blocknr == 0) {
                 err = -EIO;
-               mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" "
-                    "blkno=(%"MLFu64")\n", (unsigned long long)iblock,
-                    p_blkno, OCFS2_I(inode)->ip_blkno);
+               mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
+                    (unsigned long long)iblock,
+                    (unsigned long long)p_blkno,
+                    (unsigned long long)OCFS2_I(inode)->ip_blkno);
         }
  
         past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-       mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof);
+       mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
+            (unsigned long long)past_eof);
  
         if (create && (iblock >= past_eof))
                 set_buffer_new(bh_result);
@@ -198,7 +200,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
  
         mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
  
-       ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+       ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
         if (ret != 0) {
                 if (ret == AOP_TRUNCATED_PAGE)
                         unlock = 0;
@@ -274,30 +276,42 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
         return ret;
  }
  
+/* Runs block_prepare_write() under ip_alloc_sem (read).  Can also be called
+ * from ocfs2_write_zero_page(), which has done its own cluster locking. */
+int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
+                              unsigned from, unsigned to)
+{
+       int ret;
+
+       down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+       ret = block_prepare_write(page, from, to, ocfs2_get_block);
+
+       up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+       return ret;
+}
+
  /*
   * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
   * from loopback.  It must be able to perform its own locking around
   * ocfs2_get_block().
   */
-int ocfs2_prepare_write(struct file *file, struct page *page,
-                       unsigned from, unsigned to)
+static int ocfs2_prepare_write(struct file *file, struct page *page,
+                              unsigned from, unsigned to)
  {
         struct inode *inode = page->mapping->host;
         int ret;
  
         mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
  
-       ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+       ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
         if (ret != 0) {
                 mlog_errno(ret);
                 goto out;
         }
  
-       down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-       ret = block_prepare_write(page, from, to, ocfs2_get_block);
-
-       up_read(&OCFS2_I(inode)->ip_alloc_sem);
+       ret = ocfs2_prepare_write_nolock(inode, page, from, to);
  
         ocfs2_meta_unlock(inode, 0);
  out:
@@ -341,16 +355,16 @@ static int walk_page_buffers(     handle_t *handle,
         return ret;
  }
  
-struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
                                                          struct page *page,
                                                          unsigned from,
                                                          unsigned to)
  {
         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-       struct ocfs2_journal_handle *handle = NULL;
+       handle_t *handle = NULL;
         int ret = 0;
  
-       handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+       handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
         if (!handle) {
                 ret = -ENOMEM;
                 mlog_errno(ret);
@@ -358,7 +372,7 @@ struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
         }
  
         if (ocfs2_should_order_data(inode)) {
-               ret = walk_page_buffers(handle->k_handle,
+               ret = walk_page_buffers(handle,
                                         page_buffers(page),
                                         from, to, NULL,
                                         ocfs2_journal_dirty_data);
@@ -368,7 +382,7 @@ struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
  out:
         if (ret) {
                 if (handle)
-                       ocfs2_commit_trans(handle);
+                       ocfs2_commit_trans(osb, handle);
                 handle = ERR_PTR(ret);
         }
         return handle;
@@ -377,31 +391,28 @@ out:
  static int ocfs2_commit_write(struct file *file, struct page *page,
                               unsigned from, unsigned to)
  {
-       int ret, extending = 0, locklevel = 0;
-       loff_t new_i_size;
+       int ret;
         struct buffer_head *di_bh = NULL;
         struct inode *inode = page->mapping->host;
-       struct ocfs2_journal_handle *handle = NULL;
+       handle_t *handle = NULL;
+       struct ocfs2_dinode *di;
  
         mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
  
         /* NOTE: ocfs2_file_aio_write has ensured that it's safe for
-        * us to sample inode->i_size here without the metadata lock:
+        * us to continue here without rechecking the I/O against
+        * changed inode values.
          *
          * 1) We're currently holding the inode alloc lock, so no
          *    nodes can change it underneath us.
          *
          * 2) We've had to take the metadata lock at least once
-        *    already to check for extending writes, hence insuring
-        *    that our current copy is also up to date.
+        *    already to check for extending writes, suid removal, etc.
+        *    The meta data update code then ensures that we don't get a
+        *    stale inode allocation image (i_size, i_clusters, etc).
          */
-       new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-       if (new_i_size > i_size_read(inode)) {
-               extending = 1;
-               locklevel = 1;
-       }
  
-       ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
+       ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page);
         if (ret != 0) {
                 mlog_errno(ret);
                 goto out;
@@ -413,23 +424,20 @@ static int ocfs2_commit_write(struct file *file, struct page *page,
                 goto out_unlock_meta;
         }
  
-       if (extending) {
-               handle = ocfs2_start_walk_page_trans(inode, page, from, to);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       handle = NULL;
-                       goto out_unlock_data;
-               }
+       handle = ocfs2_start_walk_page_trans(inode, page, from, to);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out_unlock_data;
+       }
  
-               /* Mark our buffer early. We'd rather catch this error up here
-                * as opposed to after a successful commit_write which would
-                * require us to set back inode->i_size. */
-               ret = ocfs2_journal_access(handle, inode, di_bh,
-                                          OCFS2_JOURNAL_ACCESS_WRITE);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       goto out_commit;
-               }
+       /* Mark our buffer early. We'd rather catch this error up here
+        * as opposed to after a successful commit_write which would
+        * require us to set back inode->i_size. */
+       ret = ocfs2_journal_access(handle, inode, di_bh,
+                                  OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out_commit;
         }
  
         /* might update i_size */
@@ -439,37 +447,28 @@ static int ocfs2_commit_write(struct file *file, struct page *page,
                 goto out_commit;
         }
  
-       if (extending) {
-               loff_t size = (u64) i_size_read(inode);
-               struct ocfs2_dinode *di =
-                       (struct ocfs2_dinode *)di_bh->b_data;
+       di = (struct ocfs2_dinode *)di_bh->b_data;
  
-               /* ocfs2_mark_inode_dirty is too heavy to use here. */
-               inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
-               inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+       /* ocfs2_mark_inode_dirty() is too heavy to use here. */
+       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+       di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+       di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
  
-               di->i_size = cpu_to_le64(size);
-               di->i_ctime = di->i_mtime = 
-                               cpu_to_le64(inode->i_mtime.tv_sec);
-               di->i_ctime_nsec = di->i_mtime_nsec = 
-                               cpu_to_le32(inode->i_mtime.tv_nsec);
+       inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
+       di->i_size = cpu_to_le64((u64)i_size_read(inode));
  
-               ret = ocfs2_journal_dirty(handle, di_bh);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       goto out_commit;
-               }
+       ret = ocfs2_journal_dirty(handle, di_bh);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out_commit;
         }
  
-       BUG_ON(extending && (i_size_read(inode) != new_i_size));
-
  out_commit:
-       if (handle)
-               ocfs2_commit_trans(handle);
+       ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
  out_unlock_data:
         ocfs2_data_unlock(inode, 1);
  out_unlock_meta:
-       ocfs2_meta_unlock(inode, locklevel);
+       ocfs2_meta_unlock(inode, 1);
  out:
         if (di_bh)
                 brelse(di_bh);
@@ -491,7 +490,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
          * accessed concurrently from multiple nodes.
          */
         if (!INODE_JOURNAL(inode)) {
-               err = ocfs2_meta_lock(inode, NULL, NULL, 0);
+               err = ocfs2_meta_lock(inode, NULL, 0);
                 if (err) {
                         if (err != -ENOENT)
                                 mlog_errno(err);
@@ -538,32 +537,35 @@ bail:
   *                                     fs_count, map_bh, dio->rw == WRITE);
   */
  static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
-                                    unsigned long max_blocks,
                                      struct buffer_head *bh_result, int create)
  {
         int ret;
-       u64 vbo_max; /* file offset, max_blocks from iblock */
-       u64 p_blkno;
+       u64 p_blkno, inode_blocks;
         int contig_blocks;
-       unsigned char blocksize_bits;
-
-       if (!inode || !bh_result) {
-               mlog(ML_ERROR, "inode or bh_result is null\n");
-               return -EIO;
-       }
-
-       blocksize_bits = inode->i_sb->s_blocksize_bits;
+       unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
+       unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
  
         /* This function won't even be called if the request isn't all
          * nicely aligned and of the right size, so there's no need
          * for us to check any of that. */
  
-       vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
-
         spin_lock(&OCFS2_I(inode)->ip_lock);
-       if ((iblock + max_blocks) >
-           ocfs2_clusters_to_blocks(inode->i_sb,
-                                    OCFS2_I(inode)->ip_clusters)) {
+       inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb,
+                                               OCFS2_I(inode)->ip_clusters);
+
+       /*
+        * For a read which begins past the end of file, we return a hole.
+        */
+       if (!create && (iblock >= inode_blocks)) {
+               spin_unlock(&OCFS2_I(inode)->ip_lock);
+               ret = 0;
+               goto bail;
+       }
+
+       /*
+        * Any write past EOF is not allowed because we'd be extending.
+        */
+       if (create && (iblock + max_blocks) > inode_blocks) {
                 spin_unlock(&OCFS2_I(inode)->ip_lock);
                 ret = -EIO;
                 goto bail;
@@ -603,7 +605,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
                              ssize_t bytes,
                              void *private)
  {
-       struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
+       struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
  
         /* this io's submitter should not have unlocked this before we could */
         BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
@@ -619,20 +621,40 @@ static ssize_t ocfs2_direct_IO(int rw,
                                unsigned long nr_segs)
  {
         struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
+       struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
         int ret;
  
         mlog_entry_void();
+
+       /*
+        * We get PR data locks even for O_DIRECT.  This allows
+        * concurrent O_DIRECT I/O but doesn't let O_DIRECT with
+        * extending and buffered zeroing writes race.  If they did
+        * race then the buffered zeroing could be written back after
+        * the O_DIRECT I/O.  It's one thing to tell people not to mix
+        * buffered and O_DIRECT writes, but expecting them to
+        * understand that file extension is also an implicit buffered
+        * write is too much.  By getting the PR we force writeback of
+        * the buffered zeroing before proceeding.
+        */
+       ret = ocfs2_data_lock(inode, 0);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out;
+       }
+       ocfs2_data_unlock(inode, 0);
+
         ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
                                             inode->i_sb->s_bdev, iov, offset,
                                             nr_segs, 
                                             ocfs2_direct_IO_get_blocks,
                                             ocfs2_dio_end_io);
+out:
         mlog_exit(ret);
         return ret;
  }
  
-struct address_space_operations ocfs2_aops = {
+const struct address_space_operations ocfs2_aops = {
         .readpage       = ocfs2_readpage,
         .writepage      = ocfs2_writepage,
         .prepare_write  = ocfs2_prepare_write,