diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 1c029b209..be4648bc7 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/commit.c
+ * linux/fs/jbd/commit.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
  *
@@ -93,6 +93,188 @@ static int inverted_lock(journal_t *journal, struct buffer_head *bh)
         return 1;
 }

+/* Done it all: now write the commit record.  We should have
+ * cleaned up our previous buffers by now, so if we are in abort
+ * mode we can now just skip the rest of the journal write
+ * entirely.
+ *
+ * Returns 1 if the journal needs to be aborted or 0 on success
+ */
+static int journal_write_commit_record(journal_t *journal,
+                                       transaction_t *commit_transaction)
+{
+        struct journal_head *descriptor;
+        struct buffer_head *bh;
+        int i, ret;
+        int barrier_done = 0;
+
+        if (is_journal_aborted(journal))
+                return 0;
+
+        descriptor = journal_get_descriptor_buffer(journal);
+        if (!descriptor)
+                return 1;
+
+        bh = jh2bh(descriptor);
+
+        /* AKPM: buglet - add `i' to tmp! */
+        for (i = 0; i < bh->b_size; i += 512) {
+                journal_header_t *tmp = (journal_header_t*)bh->b_data;
+                tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
+                tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
+                tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+        }
+
+        JBUFFER_TRACE(descriptor, "write commit block");
+        set_buffer_dirty(bh);
+        if (journal->j_flags & JFS_BARRIER) {
+                set_buffer_ordered(bh);
+                barrier_done = 1;
+        }
+        ret = sync_dirty_buffer(bh);
+        /* is it possible for another commit to fail at roughly
+         * the same time as this one?  If so, we don't want to
+         * trust the barrier flag in the super, but instead want
+         * to remember if we sent a barrier request
+         */
+        if (ret == -EOPNOTSUPP && barrier_done) {
+                char b[BDEVNAME_SIZE];
+
+                printk(KERN_WARNING
+                       "JBD: barrier-based sync failed on %s - "
+                       "disabling barriers\n",
+                       bdevname(journal->j_dev, b));
+                spin_lock(&journal->j_state_lock);
+                journal->j_flags &= ~JFS_BARRIER;
+                spin_unlock(&journal->j_state_lock);
+
+                /* And try again, without the barrier */
+                clear_buffer_ordered(bh);
+                set_buffer_uptodate(bh);
+                set_buffer_dirty(bh);
+                ret = sync_dirty_buffer(bh);
+        }
+        put_bh(bh);             /* One for getblk() */
+        journal_put_journal_head(descriptor);
+
+        return (ret == -EIO);
+}
+
+static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+{
+        int i;
+
+        for (i = 0; i < bufs; i++) {
+                wbuf[i]->b_end_io = end_buffer_write_sync;
+                /* We use-up our safety reference in submit_bh() */
+                submit_bh(WRITE, wbuf[i]);
+        }
+}
+
+/*
+ * Submit all the data buffers to disk
+ */
+static void journal_submit_data_buffers(journal_t *journal,
+                                        transaction_t *commit_transaction)
+{
+        struct journal_head *jh;
+        struct buffer_head *bh;
+        int locked;
+        int bufs = 0;
+        struct buffer_head **wbuf = journal->j_wbuf;
+
+        /*
+         * Whenever we unlock the journal and sleep, things can get added
+         * onto ->t_sync_datalist, so we have to keep looping back to
+         * write_out_data until we *know* that the list is empty.
+         *
+         * Cleanup any flushed data buffers from the data list.  Even in
+         * abort mode, we want to flush this out as soon as possible.
+         */
+write_out_data:
+        cond_resched();
+        spin_lock(&journal->j_list_lock);
+
+        while (commit_transaction->t_sync_datalist) {
+                jh = commit_transaction->t_sync_datalist;
+                bh = jh2bh(jh);
+                locked = 0;
+
+                /* Get reference just to make sure buffer does not disappear
+                 * when we are forced to drop various locks */
+                get_bh(bh);
+                /* If the buffer is dirty, we need to submit IO and hence
+                 * we need the buffer lock. We try to lock the buffer without
+                 * blocking. If we fail, we need to drop j_list_lock and do
+                 * blocking lock_buffer().
+                 */
+                if (buffer_dirty(bh)) {
+                        if (test_set_buffer_locked(bh)) {
+                                BUFFER_TRACE(bh, "needs blocking lock");
+                                spin_unlock(&journal->j_list_lock);
+                                /* Write out all data to prevent deadlocks */
+                                journal_do_submit_data(wbuf, bufs);
+                                bufs = 0;
+                                lock_buffer(bh);
+                                spin_lock(&journal->j_list_lock);
+                        }
+                        locked = 1;
+                }
+                /* We have to get bh_state lock. Again out of order, sigh. */
+                if (!inverted_lock(journal, bh)) {
+                        jbd_lock_bh_state(bh);
+                        spin_lock(&journal->j_list_lock);
+                }
+                /* Someone already cleaned up the buffer? */
+                if (!buffer_jbd(bh)
+                        || jh->b_transaction != commit_transaction
+                        || jh->b_jlist != BJ_SyncData) {
+                        jbd_unlock_bh_state(bh);
+                        if (locked)
+                                unlock_buffer(bh);
+                        BUFFER_TRACE(bh, "already cleaned up");
+                        put_bh(bh);
+                        continue;
+                }
+                if (locked && test_clear_buffer_dirty(bh)) {
+                        BUFFER_TRACE(bh, "needs writeout, adding to array");
+                        wbuf[bufs++] = bh;
+                        __journal_file_buffer(jh, commit_transaction,
+                                              BJ_Locked);
+                        jbd_unlock_bh_state(bh);
+                        if (bufs == journal->j_wbufsize) {
+                                spin_unlock(&journal->j_list_lock);
+                                journal_do_submit_data(wbuf, bufs);
+                                bufs = 0;
+                                goto write_out_data;
+                        }
+                } else if (!locked && buffer_locked(bh)) {
+                        __journal_file_buffer(jh, commit_transaction,
+                                              BJ_Locked);
+                        jbd_unlock_bh_state(bh);
+                        put_bh(bh);
+                } else {
+                        BUFFER_TRACE(bh, "writeout complete: unfile");
+                        __journal_unfile_buffer(jh);
+                        jbd_unlock_bh_state(bh);
+                        if (locked)
+                                unlock_buffer(bh);
+                        journal_remove_journal_head(bh);
+                        /* Once for our safety reference, once for
+                         * journal_remove_journal_head() */
+                        put_bh(bh);
+                        put_bh(bh);
+                }
+
+                if (lock_need_resched(&journal->j_list_lock)) {
+                        spin_unlock(&journal->j_list_lock);
+                        goto write_out_data;
+                }
+        }
+        spin_unlock(&journal->j_list_lock);
+        journal_do_submit_data(wbuf, bufs);
+}
+
 /*
  * journal_commit_transaction
  *
@@ -103,7 +285,7 @@ void journal_commit_transaction(journal_t *journal)
 {
         transaction_t *commit_transaction;
         struct journal_head *jh, *new_jh, *descriptor;
-        struct buffer_head *wbuf[64];
+        struct buffer_head **wbuf = journal->j_wbuf;
         int bufs;
         int flags;
         int err;
@@ -194,10 +376,8 @@ void journal_commit_transaction(journal_t *journal)
                         struct buffer_head *bh = jh2bh(jh);

                         jbd_lock_bh_state(bh);
-                        if (jh->b_committed_data) {
-                                kfree(jh->b_committed_data);
-                                jh->b_committed_data = NULL;
-                        }
+                        jbd_slab_free(jh->b_committed_data, bh->b_size);
+                        jh->b_committed_data = NULL;
                         jbd_unlock_bh_state(bh);
                 }
                 journal_refile_buffer(journal, jh);
@@ -229,83 +409,32 @@ void journal_commit_transaction(journal_t *journal)
         jbd_debug (3, "JBD: commit phase 2\n");

         /*
-         * Now start flushing things to disk, in the order they appear
-         * on the transaction lists.  Data blocks go first.
-         */
-
-        err = 0;
-        /*
-         * Whenever we unlock the journal and sleep, things can get added
-         * onto ->t_sync_datalist, so we have to keep looping back to
-         * write_out_data until we *know* that the list is empty.
-         */
-        bufs = 0;
-        /*
-         * Cleanup any flushed data buffers from the data list.  Even in
-         * abort mode, we want to flush this out as soon as possible.
+         * First, drop modified flag: all accesses to the buffers
+         * will be tracked for a new transaction only -bzzz
          */
-write_out_data:
-        cond_resched();
         spin_lock(&journal->j_list_lock);
-
-        while (commit_transaction->t_sync_datalist) {
-                struct buffer_head *bh;
-
-                jh = commit_transaction->t_sync_datalist;
-                commit_transaction->t_sync_datalist = jh->b_tnext;
-                bh = jh2bh(jh);
-                if (buffer_locked(bh)) {
-                        BUFFER_TRACE(bh, "locked");
-                        if (!inverted_lock(journal, bh))
-                                goto write_out_data;
-                        __journal_unfile_buffer(jh);
-                        __journal_file_buffer(jh, commit_transaction,
-                                              BJ_Locked);
-                        jbd_unlock_bh_state(bh);
-                        if (need_resched()) {
-                                spin_unlock(&journal->j_list_lock);
-                                goto write_out_data;
-                        }
-                } else {
-                        if (buffer_dirty(bh)) {
-                                BUFFER_TRACE(bh, "start journal writeout");
-                                get_bh(bh);
-                                wbuf[bufs++] = bh;
-                                if (bufs == ARRAY_SIZE(wbuf)) {
-                                        jbd_debug(2, "submit %d writes\n",
-                                                  bufs);
-                                        spin_unlock(&journal->j_list_lock);
-                                        ll_rw_block(WRITE, bufs, wbuf);
-                                        journal_brelse_array(wbuf, bufs);
-                                        bufs = 0;
-                                        goto write_out_data;
-                                }
-                        } else {
-                                BUFFER_TRACE(bh, "writeout complete: unfile");
-                                if (!inverted_lock(journal, bh))
-                                        goto write_out_data;
-                                __journal_unfile_buffer(jh);
-                                jbd_unlock_bh_state(bh);
-                                journal_remove_journal_head(bh);
-                                put_bh(bh);
-                                if (need_resched()) {
-                                        spin_unlock(&journal->j_list_lock);
-                                        goto write_out_data;
-                                }
-                        }
-                }
+        if (commit_transaction->t_buffers) {
+                new_jh = jh = commit_transaction->t_buffers->b_tnext;
+                do {
+                        J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
+                                    new_jh->b_modified == 0);
+                        new_jh->b_modified = 0;
+                        new_jh = new_jh->b_tnext;
+                } while (new_jh != jh);
         }
+        spin_unlock(&journal->j_list_lock);

-        if (bufs) {
-                spin_unlock(&journal->j_list_lock);
-                ll_rw_block(WRITE, bufs, wbuf);
-                journal_brelse_array(wbuf, bufs);
-                spin_lock(&journal->j_list_lock);
-        }
+        /*
+         * Now start flushing things to disk, in the order they appear
+         * on the transaction lists.  Data blocks go first.
+         */
+        err = 0;
+        journal_submit_data_buffers(journal, commit_transaction);

         /*
          * Wait for all previously submitted IO to complete.
          */
+        spin_lock(&journal->j_list_lock);
         while (commit_transaction->t_locked_list) {
                 struct buffer_head *bh;

@@ -333,14 +462,13 @@ write_out_data:
                         jbd_unlock_bh_state(bh);
                 }
                 put_bh(bh);
-                if (need_resched()) {
-                        spin_unlock(&journal->j_list_lock);
-                        cond_resched();
-                        spin_lock(&journal->j_list_lock);
-                }
+                cond_resched_lock(&journal->j_list_lock);
         }
         spin_unlock(&journal->j_list_lock);

+        if (err)
+                __journal_abort_hard(journal);
+
         journal_write_revoke_records(journal, commit_transaction);

         jbd_debug(3, "JBD: commit phase 2\n");
@@ -362,7 +490,7 @@ write_out_data:
          */
         commit_transaction->t_state = T_COMMIT;

-        descriptor = 0;
+        descriptor = NULL;
         bufs = 0;

         while (commit_transaction->t_buffers) {

@@ -405,14 +533,15 @@ write_out_data:
                 jbd_debug(4, "JBD: got buffer %llu (%p)\n",
                           (unsigned long long)bh->b_blocknr, bh->b_data);
                 header = (journal_header_t *)&bh->b_data[0];
-                header->h_magic = htonl(JFS_MAGIC_NUMBER);
-                header->h_blocktype = htonl(JFS_DESCRIPTOR_BLOCK);
-                header->h_sequence = htonl(commit_transaction->t_tid);
+                header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
+                header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
+                header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

                 tagp = &bh->b_data[sizeof(journal_header_t)];
                 space_left = bh->b_size - sizeof(journal_header_t);
                 first_tag = 1;
-                set_bit(BH_JWrite, &bh->b_state);
+                set_buffer_jwrite(bh);
+                set_buffer_dirty(bh);
                 wbuf[bufs++] = bh;

                 /* Record it so that we can wait for IO
@@ -472,8 +601,8 @@ write_out_data:
                         tag_flag |= JFS_FLAG_SAME_UUID;

                 tag = (journal_block_tag_t *) tagp;
-                tag->t_blocknr = htonl(jh2bh(jh)->b_blocknr);
-                tag->t_flags = htonl(tag_flag);
+                tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
+                tag->t_flags = cpu_to_be32(tag_flag);
                 tagp += sizeof(journal_block_tag_t);
                 space_left -= sizeof(journal_block_tag_t);

@@ -487,7 +616,7 @@ write_out_data:
                 /* If there's no more to do, or if the descriptor is full,
                    let the IO rip! */

-                if (bufs == ARRAY_SIZE(wbuf) ||
+                if (bufs == journal->j_wbufsize ||
                     commit_transaction->t_buffers == NULL ||
                     space_left < sizeof(journal_block_tag_t) + 16) {

@@ -497,12 +626,12 @@ write_out_data:
                            submitting the IOs.  "tag" still points to
                            the last tag we set up. */

-                        tag->t_flags |= htonl(JFS_FLAG_LAST_TAG);
+                        tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);

start_journal_io:
                         for (i = 0; i < bufs; i++) {
                                 struct buffer_head *bh = wbuf[i];
-                                set_buffer_locked(bh);
+                                lock_buffer(bh);
                                 clear_buffer_dirty(bh);
                                 set_buffer_uptodate(bh);
                                 bh->b_end_io = journal_end_buffer_io_sync;
@@ -544,6 +673,8 @@ wait_for_iobuf:
                         wait_on_buffer(bh);
                         goto wait_for_iobuf;
                 }
+                if (cond_resched())
+                        goto wait_for_iobuf;

                 if (unlikely(!buffer_uptodate(bh)))
                         err = -EIO;
@@ -578,7 +709,7 @@ wait_for_iobuf:
                 journal_file_buffer(jh, commit_transaction, BJ_Forget);
                 /* Wake up any transactions which were waiting for this
                    IO to complete */
-                wake_up_buffer(bh);
+                wake_up_bit(&bh->b_state, BH_Unshadow);
                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
                 __brelse(bh);
         }
@@ -598,6 +729,8 @@ wait_for_iobuf:
                         wait_on_buffer(bh);
                         goto wait_for_ctlbuf;
                 }
+                if (cond_resched())
+                        goto wait_for_ctlbuf;

                 if (unlikely(!buffer_uptodate(bh)))
                         err = -EIO;
@@ -612,74 +745,17 @@ wait_for_iobuf:

         jbd_debug(3, "JBD: commit phase 6\n");

-        if (is_journal_aborted(journal))
-                goto skip_commit;
-
-        /* Done it all: now write the commit record.  We should have
-         * cleaned up our previous buffers by now, so if we are in abort
-         * mode we can now just skip the rest of the journal write
-         * entirely. */
+        if (journal_write_commit_record(journal, commit_transaction))
+                err = -EIO;

-        descriptor = journal_get_descriptor_buffer(journal);
-        if (!descriptor) {
+        if (err)
                 __journal_abort_hard(journal);
-                goto skip_commit;
-        }
-
-        /* AKPM: buglet - add `i' to tmp! */
-        for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) {
-                journal_header_t *tmp =
-                        (journal_header_t*)jh2bh(descriptor)->b_data;
-                tmp->h_magic = htonl(JFS_MAGIC_NUMBER);
-                tmp->h_blocktype = htonl(JFS_COMMIT_BLOCK);
-                tmp->h_sequence = htonl(commit_transaction->t_tid);
-        }
-
-        JBUFFER_TRACE(descriptor, "write commit block");
-        {
-                struct buffer_head *bh = jh2bh(descriptor);
-                set_buffer_uptodate(bh);
-                sync_dirty_buffer(bh);
-                if (unlikely(!buffer_uptodate(bh)))
-                        err = -EIO;
-                put_bh(bh);             /* One for getblk() */
-                journal_put_journal_head(descriptor);
-        }

         /* End of a transaction!  Finally, we can do checkpoint
            processing: any buffers committed as a result of this
            transaction can be removed from any checkpoint list it was on
            before. */

-skip_commit: /* The journal should be unlocked by now. */
-
-        if (err)
-                __journal_abort_hard(journal);
-
-        /*
-         * Call any callbacks that had been registered for handles in this
-         * transaction.  It is up to the callback to free any allocated
-         * memory.
-         *
-         * The spinlocking (t_jcb_lock) here is surely unnecessary...
-         */
-        spin_lock(&commit_transaction->t_jcb_lock);
-        if (!list_empty(&commit_transaction->t_jcb)) {
-                struct list_head *p, *n;
-                int error = is_journal_aborted(journal);
-
-                list_for_each_safe(p, n, &commit_transaction->t_jcb) {
-                        struct journal_callback *jcb;
-
-                        jcb = list_entry(p, struct journal_callback, jcb_list);
-                        list_del(p);
-                        spin_unlock(&commit_transaction->t_jcb_lock);
-                        jcb->jcb_func(jcb, error);
-                        spin_lock(&commit_transaction->t_jcb_lock);
-                }
-        }
-        spin_unlock(&commit_transaction->t_jcb_lock);
-
         jbd_debug(3, "JBD: commit phase 7\n");

         J_ASSERT(commit_transaction->t_sync_datalist == NULL);
@@ -689,11 +765,18 @@ skip_commit: /* The journal should be unlocked by now. */
         J_ASSERT(commit_transaction->t_shadow_list == NULL);
         J_ASSERT(commit_transaction->t_log_list == NULL);

+restart_loop:
+        /*
+         * As there are other places (journal_unmap_buffer()) adding buffers
+         * to this list we have to be careful and hold the j_list_lock.
+         */
+        spin_lock(&journal->j_list_lock);
         while (commit_transaction->t_forget) {
                 transaction_t *cp_transaction;
                 struct buffer_head *bh;

                 jh = commit_transaction->t_forget;
+                spin_unlock(&journal->j_list_lock);
                 bh = jh2bh(jh);
                 jbd_lock_bh_state(bh);
                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
@@ -710,14 +793,14 @@ skip_commit: /* The journal should be unlocked by now. */
                  * Otherwise, we can just throw away the frozen data now.
                  */
                 if (jh->b_committed_data) {
-                        kfree(jh->b_committed_data);
+                        jbd_slab_free(jh->b_committed_data, bh->b_size);
                         jh->b_committed_data = NULL;
                         if (jh->b_frozen_data) {
                                 jh->b_committed_data = jh->b_frozen_data;
                                 jh->b_frozen_data = NULL;
                         }
                 } else if (jh->b_frozen_data) {
-                        kfree(jh->b_frozen_data);
+                        jbd_slab_free(jh->b_frozen_data, bh->b_size);
                         jh->b_frozen_data = NULL;
                 }

@@ -755,13 +838,42 @@ skip_commit: /* The journal should be unlocked by now. */
                         jbd_unlock_bh_state(bh);
                 } else {
                         J_ASSERT_BH(bh, !buffer_dirty(bh));
-                        J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-                        __journal_unfile_buffer(jh);
-                        jbd_unlock_bh_state(bh);
-                        journal_remove_journal_head(bh);  /* needs a brelse */
-                        release_buffer_page(bh);
+                        /* The buffer on BJ_Forget list and not jbddirty means
+                         * it has been freed by this transaction and hence it
+                         * could not have been reallocated until this
+                         * transaction has committed. *BUT* it could be
+                         * reallocated once we have written all the data to
+                         * disk and before we process the buffer on BJ_Forget
+                         * list. */
+                        JBUFFER_TRACE(jh, "refile or unfile freed buffer");
+                        __journal_refile_buffer(jh);
+                        if (!jh->b_transaction) {
+                                jbd_unlock_bh_state(bh);
+                                /* needs a brelse */
+                                journal_remove_journal_head(bh);
+                                release_buffer_page(bh);
+                        } else
+                                jbd_unlock_bh_state(bh);
                 }
+                cond_resched_lock(&journal->j_list_lock);
+        }
+        spin_unlock(&journal->j_list_lock);
+        /*
+         * This is a bit sleazy.  We borrow j_list_lock to protect
+         * journal->j_committing_transaction in __journal_remove_checkpoint.
+         * Really, __journal_remove_checkpoint should be using j_state_lock but
+         * it's a bit hassle to hold that across __journal_remove_checkpoint
+         */
+        spin_lock(&journal->j_state_lock);
+        spin_lock(&journal->j_list_lock);
+        /*
+         * Now recheck if some buffers did not get attached to the transaction
+         * while the lock was dropped...
+         */
+        if (commit_transaction->t_forget) {
                 spin_unlock(&journal->j_list_lock);
+                spin_unlock(&journal->j_state_lock);
+                goto restart_loop;
         }

         /* Done with this transaction! */
@@ -770,14 +882,6 @@ skip_commit: /* The journal should be unlocked by now. */

         J_ASSERT(commit_transaction->t_state == T_COMMIT);

-        /*
-         * This is a bit sleazy.  We borrow j_list_lock to protect
-         * journal->j_committing_transaction in __journal_remove_checkpoint.
-         * Really, __jornal_remove_checkpoint should be using j_state_lock but
-         * it's a bit hassle to hold that across __journal_remove_checkpoint
-         */
-        spin_lock(&journal->j_state_lock);
-        spin_lock(&journal->j_list_lock);
         commit_transaction->t_state = T_FINISHED;
         J_ASSERT(commit_transaction == journal->j_committing_transaction);
         journal->j_commit_sequence = commit_transaction->t_tid;
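
The subtle part of the new journal_write_commit_record() above is the barrier fallback:
the commit block is first issued as an ordered (barrier) write, and if the device rejects
that with -EOPNOTSUPP the record must still reach disk as a plain write, with barriers
disabled journal-wide so later commits stop requesting them.  The function trusts its
local barrier_done flag rather than rechecking journal->j_flags, because another commit
failing at the same time could clear the shared flag concurrently.  The following is a
minimal, self-contained user-space C sketch of that retry pattern only; example_journal,
submit_commit_block() and EXAMPLE_USE_BARRIER are hypothetical stand-ins for journal_t,
the ordered sync_dirty_buffer() call and JFS_BARRIER, not real kernel interfaces.

/* Sketch only: mirrors the retry logic of journal_write_commit_record(). */
#include <errno.h>
#include <stdio.h>

#define EXAMPLE_USE_BARRIER 0x1          /* stand-in for JFS_BARRIER */

struct example_journal {
        unsigned int flags;              /* stand-in for journal->j_flags */
};

/* Stand-in for sync_dirty_buffer() on an ordered buffer.  Pretend the
 * device rejects barrier requests, as old hardware without cache-flush
 * support would. */
static int submit_commit_block(int use_barrier)
{
        return use_barrier ? -EOPNOTSUPP : 0;
}

static int write_commit_record(struct example_journal *journal)
{
        int barrier_done = 0;
        int ret;

        if (journal->flags & EXAMPLE_USE_BARRIER)
                barrier_done = 1;        /* remember that *we* sent a barrier */

        ret = submit_commit_block(barrier_done);

        /* Trust the local barrier_done flag, not the shared journal flag:
         * another commit failing at the same time may already have cleared
         * the shared flag, and we must still retry our own write. */
        if (ret == -EOPNOTSUPP && barrier_done) {
                fprintf(stderr, "barrier-based sync failed - disabling barriers\n");
                journal->flags &= ~EXAMPLE_USE_BARRIER;
                ret = submit_commit_block(0);   /* retry as a plain write */
        }
        return ret;
}

int main(void)
{
        struct example_journal j = { .flags = EXAMPLE_USE_BARRIER };

        if (write_commit_record(&j) == 0)
                printf("commit record written, barriers now %s\n",
                       (j.flags & EXAMPLE_USE_BARRIER) ? "on" : "off");
        return 0;
}

Built stand-alone, the sketch prints the fallback warning once and then reports that
barriers are off, which is exactly the behaviour the patch wants: one degraded commit,
and no further barrier attempts against hardware that cannot honour them.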