fedora core 6 1.2949 + vserver 2.2.0
[linux-2.6.git] / fs / jbd / commit.c
index a540c7f..be4648b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * linux/fs/commit.c
+ * linux/fs/jbd/commit.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
  *
@@ -93,6 +93,188 @@ static int inverted_lock(journal_t *journal, struct buffer_head *bh)
        return 1;
 }
 
+/* Done it all: now write the commit record.  We should have
+ * cleaned up our previous buffers by now, so if we are in abort
+ * mode we can now just skip the rest of the journal write
+ * entirely.
+ *
+ * Returns 1 if the journal needs to be aborted or 0 on success
+ */
+static int journal_write_commit_record(journal_t *journal,
+                                       transaction_t *commit_transaction)
+{
+       struct journal_head *descriptor;
+       struct buffer_head *bh;
+       int i, ret;
+       int barrier_done = 0;
+
+       if (is_journal_aborted(journal))
+               return 0;
+
+       descriptor = journal_get_descriptor_buffer(journal);
+       if (!descriptor)
+               return 1;
+
+       bh = jh2bh(descriptor);
+
+       /* AKPM: buglet - add `i' to tmp! */
+       for (i = 0; i < bh->b_size; i += 512) {
+               journal_header_t *tmp = (journal_header_t*)bh->b_data;
+               tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
+               tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
+               tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+       }
+
+       JBUFFER_TRACE(descriptor, "write commit block");
+       set_buffer_dirty(bh);
+       if (journal->j_flags & JFS_BARRIER) {
+               set_buffer_ordered(bh);
+               barrier_done = 1;
+       }
+       ret = sync_dirty_buffer(bh);
+       /* is it possible for another commit to fail at roughly
+        * the same time as this one?  If so, we don't want to
+        * trust the barrier flag in the super, but instead want
+        * to remember if we sent a barrier request
+        */
+       if (ret == -EOPNOTSUPP && barrier_done) {
+               char b[BDEVNAME_SIZE];
+
+               printk(KERN_WARNING
+                       "JBD: barrier-based sync failed on %s - "
+                       "disabling barriers\n",
+                       bdevname(journal->j_dev, b));
+               spin_lock(&journal->j_state_lock);
+               journal->j_flags &= ~JFS_BARRIER;
+               spin_unlock(&journal->j_state_lock);
+
+               /* And try again, without the barrier */
+               clear_buffer_ordered(bh);
+               set_buffer_uptodate(bh);
+               set_buffer_dirty(bh);
+               ret = sync_dirty_buffer(bh);
+       }
+       put_bh(bh);             /* One for getblk() */
+       journal_put_journal_head(descriptor);
+
+       return (ret == -EIO);
+}
+
+static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+{
+       int i;
+
+       for (i = 0; i < bufs; i++) {
+               wbuf[i]->b_end_io = end_buffer_write_sync;
+               /* We use-up our safety reference in submit_bh() */
+               submit_bh(WRITE, wbuf[i]);
+       }
+}
+
+/*
+ *  Submit all the data buffers to disk
+ */
+static void journal_submit_data_buffers(journal_t *journal,
+                               transaction_t *commit_transaction)
+{
+       struct journal_head *jh;
+       struct buffer_head *bh;
+       int locked;
+       int bufs = 0;
+       struct buffer_head **wbuf = journal->j_wbuf;
+
+       /*
+        * Whenever we unlock the journal and sleep, things can get added
+        * onto ->t_sync_datalist, so we have to keep looping back to
+        * write_out_data until we *know* that the list is empty.
+        *
+        * Cleanup any flushed data buffers from the data list.  Even in
+        * abort mode, we want to flush this out as soon as possible.
+        */
+write_out_data:
+       cond_resched();
+       spin_lock(&journal->j_list_lock);
+
+       while (commit_transaction->t_sync_datalist) {
+               jh = commit_transaction->t_sync_datalist;
+               bh = jh2bh(jh);
+               locked = 0;
+
+               /* Get reference just to make sure buffer does not disappear
+                * when we are forced to drop various locks */
+               get_bh(bh);
+               /* If the buffer is dirty, we need to submit IO and hence
+                * we need the buffer lock. We try to lock the buffer without
+                * blocking. If we fail, we need to drop j_list_lock and do
+                * blocking lock_buffer().
+                */
+               if (buffer_dirty(bh)) {
+                       if (test_set_buffer_locked(bh)) {
+                               BUFFER_TRACE(bh, "needs blocking lock");
+                               spin_unlock(&journal->j_list_lock);
+                               /* Write out all data to prevent deadlocks */
+                               journal_do_submit_data(wbuf, bufs);
+                               bufs = 0;
+                               lock_buffer(bh);
+                               spin_lock(&journal->j_list_lock);
+                       }
+                       locked = 1;
+               }
+               /* We have to get bh_state lock. Again out of order, sigh. */
+               if (!inverted_lock(journal, bh)) {
+                       jbd_lock_bh_state(bh);
+                       spin_lock(&journal->j_list_lock);
+               }
+               /* Someone already cleaned up the buffer? */
+               if (!buffer_jbd(bh)
+                       || jh->b_transaction != commit_transaction
+                       || jh->b_jlist != BJ_SyncData) {
+                       jbd_unlock_bh_state(bh);
+                       if (locked)
+                               unlock_buffer(bh);
+                       BUFFER_TRACE(bh, "already cleaned up");
+                       put_bh(bh);
+                       continue;
+               }
+               if (locked && test_clear_buffer_dirty(bh)) {
+                       BUFFER_TRACE(bh, "needs writeout, adding to array");
+                       wbuf[bufs++] = bh;
+                       __journal_file_buffer(jh, commit_transaction,
+                                               BJ_Locked);
+                       jbd_unlock_bh_state(bh);
+                       if (bufs == journal->j_wbufsize) {
+                               spin_unlock(&journal->j_list_lock);
+                               journal_do_submit_data(wbuf, bufs);
+                               bufs = 0;
+                               goto write_out_data;
+                       }
+               } else if (!locked && buffer_locked(bh)) {
+                       __journal_file_buffer(jh, commit_transaction,
+                                               BJ_Locked);
+                       jbd_unlock_bh_state(bh);
+                       put_bh(bh);
+               } else {
+                       BUFFER_TRACE(bh, "writeout complete: unfile");
+                       __journal_unfile_buffer(jh);
+                       jbd_unlock_bh_state(bh);
+                       if (locked)
+                               unlock_buffer(bh);
+                       journal_remove_journal_head(bh);
+                       /* Once for our safety reference, once for
+                        * journal_remove_journal_head() */
+                       put_bh(bh);
+                       put_bh(bh);
+               }
+
+               if (lock_need_resched(&journal->j_list_lock)) {
+                       spin_unlock(&journal->j_list_lock);
+                       goto write_out_data;
+               }
+       }
+       spin_unlock(&journal->j_list_lock);
+       journal_do_submit_data(wbuf, bufs);
+}
+
 /*
  * journal_commit_transaction
  *
@@ -103,7 +285,7 @@ void journal_commit_transaction(journal_t *journal)
 {
        transaction_t *commit_transaction;
        struct journal_head *jh, *new_jh, *descriptor;
-       struct buffer_head *wbuf[64];
+       struct buffer_head **wbuf = journal->j_wbuf;
        int bufs;
        int flags;
        int err;
@@ -194,10 +376,8 @@ void journal_commit_transaction(journal_t *journal)
                        struct buffer_head *bh = jh2bh(jh);
 
                        jbd_lock_bh_state(bh);
-                       if (jh->b_committed_data) {
-                               kfree(jh->b_committed_data);
-                               jh->b_committed_data = NULL;
-                       }
+                       jbd_slab_free(jh->b_committed_data, bh->b_size);
+                       jh->b_committed_data = NULL;
                        jbd_unlock_bh_state(bh);
                }
                journal_refile_buffer(journal, jh);
@@ -229,83 +409,32 @@ void journal_commit_transaction(journal_t *journal)
        jbd_debug (3, "JBD: commit phase 2\n");
 
        /*
-        * Now start flushing things to disk, in the order they appear
-        * on the transaction lists.  Data blocks go first.
-        */
-
-       err = 0;
-       /*
-        * Whenever we unlock the journal and sleep, things can get added
-        * onto ->t_sync_datalist, so we have to keep looping back to
-        * write_out_data until we *know* that the list is empty.
+        * First, drop modified flag: all accesses to the buffers
+        * will be tracked for a new transaction only -bzzz
         */
-       bufs = 0;
-       /*
-        * Cleanup any flushed data buffers from the data list.  Even in
-        * abort mode, we want to flush this out as soon as possible.
-        */
-write_out_data:
-       cond_resched();
        spin_lock(&journal->j_list_lock);
-
-       while (commit_transaction->t_sync_datalist) {
-               struct buffer_head *bh;
-
-               jh = commit_transaction->t_sync_datalist;
-               commit_transaction->t_sync_datalist = jh->b_tnext;
-               bh = jh2bh(jh);
-               if (buffer_locked(bh)) {
-                       BUFFER_TRACE(bh, "locked");
-                       if (!inverted_lock(journal, bh))
-                               goto write_out_data;
-                       __journal_unfile_buffer(jh);
-                       __journal_file_buffer(jh, commit_transaction,
-                                               BJ_Locked);
-                       jbd_unlock_bh_state(bh);
-                       if (need_resched()) {
-                               spin_unlock(&journal->j_list_lock);
-                               goto write_out_data;
-                       }
-               } else {
-                       if (buffer_dirty(bh)) {
-                               BUFFER_TRACE(bh, "start journal writeout");
-                               get_bh(bh);
-                               wbuf[bufs++] = bh;
-                               if (bufs == ARRAY_SIZE(wbuf)) {
-                                       jbd_debug(2, "submit %d writes\n",
-                                                       bufs);
-                                       spin_unlock(&journal->j_list_lock);
-                                       ll_rw_block(WRITE, bufs, wbuf);
-                                       journal_brelse_array(wbuf, bufs);
-                                       bufs = 0;
-                                       goto write_out_data;
-                               }
-                       } else {
-                               BUFFER_TRACE(bh, "writeout complete: unfile");
-                               if (!inverted_lock(journal, bh))
-                                       goto write_out_data;
-                               __journal_unfile_buffer(jh);
-                               jbd_unlock_bh_state(bh);
-                               journal_remove_journal_head(bh);
-                               put_bh(bh);
-                               if (need_resched()) {
-                                       spin_unlock(&journal->j_list_lock);
-                                       goto write_out_data;
-                               }
-                       }
-               }
+       if (commit_transaction->t_buffers) {
+               new_jh = jh = commit_transaction->t_buffers->b_tnext;
+               do {
+                       J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
+                                       new_jh->b_modified == 0);
+                       new_jh->b_modified = 0;
+                       new_jh = new_jh->b_tnext;
+               } while (new_jh != jh);
        }
+       spin_unlock(&journal->j_list_lock);
 
-       if (bufs) {
-               spin_unlock(&journal->j_list_lock);
-               ll_rw_block(WRITE, bufs, wbuf);
-               journal_brelse_array(wbuf, bufs);
-               spin_lock(&journal->j_list_lock);
-       }
+       /*
+        * Now start flushing things to disk, in the order they appear
+        * on the transaction lists.  Data blocks go first.
+        */
+       err = 0;
+       journal_submit_data_buffers(journal, commit_transaction);
 
        /*
         * Wait for all previously submitted IO to complete.
         */
+       spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_locked_list) {
                struct buffer_head *bh;
 
@@ -333,14 +462,13 @@ write_out_data:
                        jbd_unlock_bh_state(bh);
                }
                put_bh(bh);
-               if (need_resched()) {
-                       spin_unlock(&journal->j_list_lock);
-                       cond_resched();
-                       spin_lock(&journal->j_list_lock);
-               }
+               cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
 
+       if (err)
+               __journal_abort_hard(journal);
+
        journal_write_revoke_records(journal, commit_transaction);
 
        jbd_debug(3, "JBD: commit phase 2\n");
@@ -405,9 +533,9 @@ write_out_data:
                        jbd_debug(4, "JBD: got buffer %llu (%p)\n",
                                (unsigned long long)bh->b_blocknr, bh->b_data);
                        header = (journal_header_t *)&bh->b_data[0];
-                       header->h_magic     = htonl(JFS_MAGIC_NUMBER);
-                       header->h_blocktype = htonl(JFS_DESCRIPTOR_BLOCK);
-                       header->h_sequence  = htonl(commit_transaction->t_tid);
+                       header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
+                       header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
+                       header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 
                        tagp = &bh->b_data[sizeof(journal_header_t)];
                        space_left = bh->b_size - sizeof(journal_header_t);
@@ -473,8 +601,8 @@ write_out_data:
                        tag_flag |= JFS_FLAG_SAME_UUID;
 
                tag = (journal_block_tag_t *) tagp;
-               tag->t_blocknr = htonl(jh2bh(jh)->b_blocknr);
-               tag->t_flags = htonl(tag_flag);
+               tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
+               tag->t_flags = cpu_to_be32(tag_flag);
                tagp += sizeof(journal_block_tag_t);
                space_left -= sizeof(journal_block_tag_t);
 
@@ -488,7 +616,7 @@ write_out_data:
                /* If there's no more to do, or if the descriptor is full,
                   let the IO rip! */
 
-               if (bufs == ARRAY_SIZE(wbuf) ||
+               if (bufs == journal->j_wbufsize ||
                    commit_transaction->t_buffers == NULL ||
                    space_left < sizeof(journal_block_tag_t) + 16) {
 
@@ -498,7 +626,7 @@ write_out_data:
                            submitting the IOs.  "tag" still points to
                            the last tag we set up. */
 
-                       tag->t_flags |= htonl(JFS_FLAG_LAST_TAG);
+                       tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
 
 start_journal_io:
                        for (i = 0; i < bufs; i++) {
@@ -545,6 +673,8 @@ wait_for_iobuf:
                        wait_on_buffer(bh);
                        goto wait_for_iobuf;
                }
+               if (cond_resched())
+                       goto wait_for_iobuf;
 
                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;
@@ -579,7 +709,7 @@ wait_for_iobuf:
                journal_file_buffer(jh, commit_transaction, BJ_Forget);
                /* Wake up any transactions which were waiting for this
                   IO to complete */
-               wake_up_buffer(bh);
+               wake_up_bit(&bh->b_state, BH_Unshadow);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
        }
@@ -599,6 +729,8 @@ wait_for_iobuf:
                        wait_on_buffer(bh);
                        goto wait_for_ctlbuf;
                }
+               if (cond_resched())
+                       goto wait_for_ctlbuf;
 
                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;
@@ -613,75 +745,17 @@ wait_for_iobuf:
 
        jbd_debug(3, "JBD: commit phase 6\n");
 
-       if (is_journal_aborted(journal))
-               goto skip_commit;
-
-       /* Done it all: now write the commit record.  We should have
-        * cleaned up our previous buffers by now, so if we are in abort
-        * mode we can now just skip the rest of the journal write
-        * entirely. */
+       if (journal_write_commit_record(journal, commit_transaction))
+               err = -EIO;
 
-       descriptor = journal_get_descriptor_buffer(journal);
-       if (!descriptor) {
+       if (err)
                __journal_abort_hard(journal);
-               goto skip_commit;
-       }
-
-       /* AKPM: buglet - add `i' to tmp! */
-       for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) {
-               journal_header_t *tmp =
-                       (journal_header_t*)jh2bh(descriptor)->b_data;
-               tmp->h_magic = htonl(JFS_MAGIC_NUMBER);
-               tmp->h_blocktype = htonl(JFS_COMMIT_BLOCK);
-               tmp->h_sequence = htonl(commit_transaction->t_tid);
-       }
-
-       JBUFFER_TRACE(descriptor, "write commit block");
-       {
-               struct buffer_head *bh = jh2bh(descriptor);
-
-               set_buffer_dirty(bh);
-               sync_dirty_buffer(bh);
-               if (unlikely(!buffer_uptodate(bh)))
-                       err = -EIO;
-               put_bh(bh);             /* One for getblk() */
-               journal_put_journal_head(descriptor);
-       }
 
        /* End of a transaction!  Finally, we can do checkpoint
            processing: any buffers committed as a result of this
            transaction can be removed from any checkpoint list it was on
            before. */
 
-skip_commit: /* The journal should be unlocked by now. */
-
-       if (err)
-               __journal_abort_hard(journal);
-
-       /*
-        * Call any callbacks that had been registered for handles in this
-        * transaction.  It is up to the callback to free any allocated
-        * memory.
-        *
-        * The spinlocking (t_jcb_lock) here is surely unnecessary...
-        */
-       spin_lock(&commit_transaction->t_jcb_lock);
-       if (!list_empty(&commit_transaction->t_jcb)) {
-               struct list_head *p, *n;
-               int error = is_journal_aborted(journal);
-
-               list_for_each_safe(p, n, &commit_transaction->t_jcb) {
-                       struct journal_callback *jcb;
-
-                       jcb = list_entry(p, struct journal_callback, jcb_list);
-                       list_del(p);
-                       spin_unlock(&commit_transaction->t_jcb_lock);
-                       jcb->jcb_func(jcb, error);
-                       spin_lock(&commit_transaction->t_jcb_lock);
-               }
-       }
-       spin_unlock(&commit_transaction->t_jcb_lock);
-
        jbd_debug(3, "JBD: commit phase 7\n");
 
        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
@@ -691,11 +765,18 @@ skip_commit: /* The journal should be unlocked by now. */
        J_ASSERT(commit_transaction->t_shadow_list == NULL);
        J_ASSERT(commit_transaction->t_log_list == NULL);
 
+restart_loop:
+       /*
+        * As there are other places (journal_unmap_buffer()) adding buffers
+        * to this list we have to be careful and hold the j_list_lock.
+        */
+       spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;
 
                jh = commit_transaction->t_forget;
+               spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
@@ -712,14 +793,14 @@ skip_commit: /* The journal should be unlocked by now. */
                 * Otherwise, we can just throw away the frozen data now.
                 */
                if (jh->b_committed_data) {
-                       kfree(jh->b_committed_data);
+                       jbd_slab_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
                        }
                } else if (jh->b_frozen_data) {
-                       kfree(jh->b_frozen_data);
+                       jbd_slab_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
                }
 
@@ -757,13 +838,42 @@ skip_commit: /* The journal should be unlocked by now. */
                        jbd_unlock_bh_state(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
-                       J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-                       __journal_unfile_buffer(jh);
-                       jbd_unlock_bh_state(bh);
-                       journal_remove_journal_head(bh);  /* needs a brelse */
-                       release_buffer_page(bh);
+                       /* The buffer on BJ_Forget list and not jbddirty means
+                        * it has been freed by this transaction and hence it
+                        * could not have been reallocated until this
+                        * transaction has committed. *BUT* it could be
+                        * reallocated once we have written all the data to
+                        * disk and before we process the buffer on BJ_Forget
+                        * list. */
+                       JBUFFER_TRACE(jh, "refile or unfile freed buffer");
+                       __journal_refile_buffer(jh);
+                       if (!jh->b_transaction) {
+                               jbd_unlock_bh_state(bh);
+                                /* needs a brelse */
+                               journal_remove_journal_head(bh);
+                               release_buffer_page(bh);
+                       } else
+                               jbd_unlock_bh_state(bh);
                }
+               cond_resched_lock(&journal->j_list_lock);
+       }
+       spin_unlock(&journal->j_list_lock);
+       /*
+        * This is a bit sleazy.  We borrow j_list_lock to protect
+        * journal->j_committing_transaction in __journal_remove_checkpoint.
+        * Really, __journal_remove_checkpoint should be using j_state_lock but
+        * it's a bit hassle to hold that across __journal_remove_checkpoint
+        */
+       spin_lock(&journal->j_state_lock);
+       spin_lock(&journal->j_list_lock);
+       /*
+        * Now recheck if some buffers did not get attached to the transaction
+        * while the lock was dropped...
+        */
+       if (commit_transaction->t_forget) {
                spin_unlock(&journal->j_list_lock);
+               spin_unlock(&journal->j_state_lock);
+               goto restart_loop;
        }
 
        /* Done with this transaction! */
@@ -772,14 +882,6 @@ skip_commit: /* The journal should be unlocked by now. */
 
        J_ASSERT(commit_transaction->t_state == T_COMMIT);
 
-       /*
-        * This is a bit sleazy.  We borrow j_list_lock to protect
-        * journal->j_committing_transaction in __journal_remove_checkpoint.
-        * Really, __jornal_remove_checkpoint should be using j_state_lock but
-        * it's a bit hassle to hold that across __journal_remove_checkpoint
-        */
-       spin_lock(&journal->j_state_lock);
-       spin_lock(&journal->j_list_lock);
        commit_transaction->t_state = T_FINISHED;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;