fedora core 6 1.2949 + vserver 2.2.0
[linux-2.6.git] / fs / fs-writeback.c
index 5f3ccf9..a4b142a 100644 (file)
@@ -22,8 +22,7 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
-
-extern struct super_block *blockdev_superblock;
+#include "internal.h"
 
 /**
  *     __mark_inode_dirty -    internal function
@@ -133,10 +132,11 @@ out:
 
 EXPORT_SYMBOL(__mark_inode_dirty);
 
-static void write_inode(struct inode *inode, int sync)
+static int write_inode(struct inode *inode, int sync)
 {
        if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
-               inode->i_sb->s_op->write_inode(inode, sync);
+               return inode->i_sb->s_op->write_inode(inode, sync);
+       return 0;
 }
 
 /*
@@ -170,8 +170,11 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
        ret = do_writepages(mapping, wbc);
 
        /* Don't write the inode if only I_DIRTY_PAGES was set */
-       if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
-               write_inode(inode, wait);
+       if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+               int err = write_inode(inode, wait);
+               if (ret == 0)
+                       ret = err;
+       }
 
        if (wait) {
                int err = filemap_fdatawait(mapping);
@@ -213,8 +216,9 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
                } else if (inode->i_state & I_DIRTY) {
                        /*
                         * Someone redirtied the inode while were writing back
-                        * the pages: nothing to do.
+                        * the pages.
                         */
+                       list_move(&inode->i_list, &sb->s_dirty);
                } else if (atomic_read(&inode->i_count)) {
                        /*
                         * The inode is clean, inuse
@@ -232,26 +236,49 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 }
 
 /*
- * Write out an inode's dirty pages.  Called under inode_lock.
+ * Write out an inode's dirty pages.  Called under inode_lock.  Either the
+ * caller has ref on the inode (either via __iget or via syscall against an fd)
+ * or the inode has I_WILL_FREE set (via generic_forget_inode)
  */
 static int
-__writeback_single_inode(struct inode *inode,
-                       struct writeback_control *wbc)
+__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 {
+       wait_queue_head_t *wqh;
+
+       if (!atomic_read(&inode->i_count))
+               WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
+       else
+               WARN_ON(inode->i_state & I_WILL_FREE);
+
        if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_LOCK)) {
+               struct address_space *mapping = inode->i_mapping;
+               int ret;
+
                list_move(&inode->i_list, &inode->i_sb->s_dirty);
-               return 0;
+
+               /*
+                * Even if we don't actually write the inode itself here,
+                * we can at least start some of the data writeout..
+                */
+               spin_unlock(&inode_lock);
+               ret = do_writepages(mapping, wbc);
+               spin_lock(&inode_lock);
+               return ret;
        }
 
        /*
         * It's a data-integrity sync.  We must wait.
         */
-       while (inode->i_state & I_LOCK) {
-               __iget(inode);
-               spin_unlock(&inode_lock);
-               __wait_on_inode(inode);
-               iput(inode);
-               spin_lock(&inode_lock);
+       if (inode->i_state & I_LOCK) {
+               DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LOCK);
+
+               wqh = bit_waitqueue(&inode->i_state, __I_LOCK);
+               do {
+                       spin_unlock(&inode_lock);
+                       __wait_on_bit(wqh, &wq, inode_wait,
+                                                       TASK_UNINTERRUPTIBLE);
+                       spin_lock(&inode_lock);
+               } while (inode->i_state & I_LOCK);
        }
        return __sync_single_inode(inode, wbc);
 }
@@ -301,27 +328,33 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
                struct backing_dev_info *bdi = mapping->backing_dev_info;
                long pages_skipped;
 
-               if (bdi->memory_backed) {
-                       if (sb == blockdev_superblock) {
+               if (!bdi_cap_writeback_dirty(bdi)) {
+                       list_move(&inode->i_list, &sb->s_dirty);
+                       if (sb_is_blkdev_sb(sb)) {
                                /*
                                 * Dirty memory-backed blockdev: the ramdisk
-                                * driver does this.
+                                * driver does this.  Skip just this inode
                                 */
-                               list_move(&inode->i_list, &sb->s_dirty);
                                continue;
                        }
+                       /*
+                        * Dirty memory-backed inode against a filesystem other
+                        * than the kernel-internal bdev filesystem.  Skip the
+                        * entire superblock.
+                        */
+                       break;
                }
 
                if (wbc->nonblocking && bdi_write_congested(bdi)) {
                        wbc->encountered_congestion = 1;
-                       if (sb != blockdev_superblock)
+                       if (!sb_is_blkdev_sb(sb))
                                break;          /* Skip a congested fs */
                        list_move(&inode->i_list, &sb->s_dirty);
                        continue;               /* Skip a congested blockdev */
                }
 
                if (wbc->bdi && bdi != wbc->bdi) {
-                       if (sb != blockdev_superblock)
+                       if (!sb_is_blkdev_sb(sb))
                                break;          /* fs has the wrong queue */
                        list_move(&inode->i_list, &sb->s_dirty);
                        continue;               /* blockdev has wrong queue */
@@ -359,6 +392,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
                }
                spin_unlock(&inode_lock);
                iput(inode);
+               cond_resched();
                spin_lock(&inode_lock);
                if (wbc->nr_to_write <= 0)
                        break;
@@ -390,7 +424,7 @@ writeback_inodes(struct writeback_control *wbc)
 {
        struct super_block *sb;
 
-       spin_lock(&inode_lock);
+       might_sleep();
        spin_lock(&sb_lock);
 restart:
        sb = sb_entry(super_blocks.prev);
@@ -405,19 +439,21 @@ restart:
                         * be unmounted by the time it is released.
                         */
                        if (down_read_trylock(&sb->s_umount)) {
-                               if (sb->s_root)
+                               if (sb->s_root) {
+                                       spin_lock(&inode_lock);
                                        sync_sb_inodes(sb, wbc);
+                                       spin_unlock(&inode_lock);
+                               }
                                up_read(&sb->s_umount);
                        }
                        spin_lock(&sb_lock);
-                       if (__put_super(sb))
+                       if (__put_super_and_need_restart(sb))
                                goto restart;
                }
                if (wbc->nr_to_write <= 0)
                        break;
        }
        spin_unlock(&sb_lock);
-       spin_unlock(&inode_lock);
 }
 
 /*
@@ -435,9 +471,11 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 {
        struct writeback_control wbc = {
                .sync_mode      = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+               .range_start    = 0,
+               .range_end      = LLONG_MAX,
        };
-       unsigned long nr_dirty = read_page_state(nr_dirty);
-       unsigned long nr_unstable = read_page_state(nr_unstable);
+       unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+       unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
 
        wbc.nr_to_write = nr_dirty + nr_unstable +
                        (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
@@ -462,34 +500,9 @@ static void set_sb_syncing(int val)
        spin_unlock(&sb_lock);
 }
 
-/*
- * Find a superblock with inodes that need to be synced
- */
-static struct super_block *get_super_to_sync(void)
-{
-       struct super_block *sb;
-restart:
-       spin_lock(&sb_lock);
-       sb = sb_entry(super_blocks.prev);
-       for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) {
-               if (sb->s_syncing)
-                       continue;
-               sb->s_syncing = 1;
-               sb->s_count++;
-               spin_unlock(&sb_lock);
-               down_read(&sb->s_umount);
-               if (!sb->s_root) {
-                       drop_super(sb);
-                       goto restart;
-               }
-               return sb;
-       }
-       spin_unlock(&sb_lock);
-       return NULL;
-}
-
 /**
- * sync_inodes
+ * sync_inodes - writes all inodes to disk
+ * @wait: wait for completion
  *
  * sync_inodes() goes through each super block's dirty inode list, writes the
  * inodes out, waits on the writeout and puts the inodes back on the normal
@@ -506,50 +519,72 @@ restart:
  * outstanding dirty inodes, the writeback goes block-at-a-time within the
  * filesystem's write_inode().  This is extremely slow.
  */
-void sync_inodes(int wait)
+static void __sync_inodes(int wait)
 {
        struct super_block *sb;
 
-       set_sb_syncing(0);
-       while ((sb = get_super_to_sync()) != NULL) {
-               sync_inodes_sb(sb, 0);
-               sync_blockdev(sb->s_bdev);
-               drop_super(sb);
+       spin_lock(&sb_lock);
+restart:
+       list_for_each_entry(sb, &super_blocks, s_list) {
+               if (sb->s_syncing)
+                       continue;
+               sb->s_syncing = 1;
+               sb->s_count++;
+               spin_unlock(&sb_lock);
+               down_read(&sb->s_umount);
+               if (sb->s_root) {
+                       sync_inodes_sb(sb, wait);
+                       sync_blockdev(sb->s_bdev);
+               }
+               up_read(&sb->s_umount);
+               spin_lock(&sb_lock);
+               if (__put_super_and_need_restart(sb))
+                       goto restart;
        }
+       spin_unlock(&sb_lock);
+}
+
+void sync_inodes(int wait)
+{
+       set_sb_syncing(0);
+       __sync_inodes(0);
+
        if (wait) {
                set_sb_syncing(0);
-               while ((sb = get_super_to_sync()) != NULL) {
-                       sync_inodes_sb(sb, 1);
-                       sync_blockdev(sb->s_bdev);
-                       drop_super(sb);
-               }
+               __sync_inodes(1);
        }
 }
 
 /**
- *     write_inode_now -       write an inode to disk
- *     @inode: inode to write to disk
- *     @sync: whether the write should be synchronous or not
+ * write_inode_now     -       write an inode to disk
+ * @inode: inode to write to disk
+ * @sync: whether the write should be synchronous or not
+ *
+ * This function commits an inode to disk immediately if it is dirty. This is
+ * primarily needed by knfsd.
  *
- *     This function commits an inode to disk immediately if it is
- *     dirty. This is primarily needed by knfsd.
+ * The caller must either have a ref on the inode or must have set I_WILL_FREE.
  */
-void write_inode_now(struct inode *inode, int sync)
+int write_inode_now(struct inode *inode, int sync)
 {
+       int ret;
        struct writeback_control wbc = {
                .nr_to_write = LONG_MAX,
                .sync_mode = WB_SYNC_ALL,
+               .range_start = 0,
+               .range_end = LLONG_MAX,
        };
 
-       if (inode->i_mapping->backing_dev_info->memory_backed)
-               return;
+       if (!mapping_cap_writeback_dirty(inode->i_mapping))
+               wbc.nr_to_write = 0;
 
+       might_sleep();
        spin_lock(&inode_lock);
-       __writeback_single_inode(inode, &wbc);
+       ret = __writeback_single_inode(inode, &wbc);
        spin_unlock(&inode_lock);
        if (sync)
                wait_on_inode(inode);
+       return ret;
 }
 EXPORT_SYMBOL(write_inode_now);
 
@@ -578,13 +613,14 @@ EXPORT_SYMBOL(sync_inode);
 /**
  * generic_osync_inode - flush all dirty data for a given inode to disk
  * @inode: inode to write
+ * @mapping: the address_space that should be flushed
  * @what:  what to write and wait upon
  *
  * This can be called by file_write functions for files which have the
  * O_SYNC flag set, to flush dirty writes to disk.
  *
  * @what is a bitmask, specifying which part of the inode's data should be
- * written and waited upon:
+ * written and waited upon.
  *
  *    OSYNC_DATA:     i_mapping's dirty data
  *    OSYNC_METADATA: the buffers at i_mapping->private_list
@@ -597,7 +633,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
        int need_write_inode_now = 0;
        int err2;
 
-       current->flags |= PF_SYNCWRITE;
        if (what & OSYNC_DATA)
                err = filemap_fdatawrite(mapping);
        if (what & (OSYNC_METADATA|OSYNC_DATA)) {
@@ -610,7 +645,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
                if (!err)
                        err = err2;
        }
-       current->flags &= ~PF_SYNCWRITE;
 
        spin_lock(&inode_lock);
        if ((inode->i_state & I_DIRTY) &&
@@ -618,8 +652,11 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
                need_write_inode_now = 1;
        spin_unlock(&inode_lock);
 
-       if (need_write_inode_now)
-               write_inode_now(inode, 1);
+       if (need_write_inode_now) {
+               err2 = write_inode_now(inode, 1);
+               if (!err)
+                       err = err2;
+       }
        else
                wait_on_inode(inode);
 
@@ -647,8 +684,9 @@ int writeback_acquire(struct backing_dev_info *bdi)
 
 /**
  * writeback_in_progress: determine whether there is writeback in progress
- *                        against a backing device.
  * @bdi: the device's backing_dev_info structure.
+ *
+ * Determine whether there is writeback in progress against a backing device.
  */
 int writeback_in_progress(struct backing_dev_info *bdi)
 {