fs/jbd/commit.c

   1 /*
   2  * linux/fs/commit.c
   3  *
   4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5  *
   6  * Copyright 1998 Red Hat corp --- All Rights Reserved
   7  *
   8  * This file is part of the Linux kernel and is made available under
   9  * the terms of the GNU General Public License, version 2, or at your
  10  * option, any later version, incorporated herein by reference.
  11  *
  12  * Journal commit routines for the generic filesystem journaling code;
  13  * part of the ext2fs journaling system.
  14  */
  15
  16 #include <linux/time.h>
  17 #include <linux/fs.h>
  18 #include <linux/jbd.h>
  19 #include <linux/errno.h>
  20 #include <linux/slab.h>
  21 #include <linux/mm.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/smp_lock.h>
  24
  25 /*
  26  * Default IO end handler for temporary BJ_IO buffer_heads.
  27  */
  28 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  29 {
  30         BUFFER_TRACE(bh, "");
  31         if (uptodate)
  32                 set_buffer_uptodate(bh);
  33         else
  34                 clear_buffer_uptodate(bh);
  35         unlock_buffer(bh);
  36 }
  37
  38 /*
  39  * When an ext3-ordered file is truncated, it is possible that many pages are
  40  * not sucessfully freed, because they are attached to a committing transaction.
  41  * After the transaction commits, these pages are left on the LRU, with no
  42  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  43  * by the VM, but their apparent absence upsets the VM accounting, and it makes
  44  * the numbers in /proc/meminfo look odd.
  45  *
  46  * So here, we have a buffer which has just come off the forget list.  Look to
  47  * see if we can strip all buffers from the backing page.
  48  *
  49  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  50  * caller provided us with a ref against the buffer, and we drop that here.
  51  */
  52 static void release_buffer_page(struct buffer_head *bh)
  53 {
  54         struct page *page;
  55
  56         if (buffer_dirty(bh))
  57                 goto nope;
  58         if (atomic_read(&bh->b_count) != 1)
  59                 goto nope;
  60         page = bh->b_page;
  61         if (!page)
  62                 goto nope;
  63         if (page->mapping)
  64                 goto nope;
  65
  66         /* OK, it's a truncated page */
  67         if (TestSetPageLocked(page))
  68                 goto nope;
  69
  70         page_cache_get(page);
  71         __brelse(bh);
  72         try_to_free_buffers(page);
  73         unlock_page(page);
  74         page_cache_release(page);
  75         return;
  76
  77 nope:
  78         __brelse(bh);
  79 }
  80
  81 /*
  82  * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
  83  * held.  For ranking reasons we must trylock.  If we lose, schedule away and
  84  * return 0.  j_list_lock is dropped in this case.
  85  */
  86 static int inverted_lock(journal_t *journal, struct buffer_head *bh)
  87 {
  88         if (!jbd_trylock_bh_state(bh)) {
  89                 spin_unlock(&journal->j_list_lock);
  90                 schedule();
  91                 return 0;
  92         }
  93         return 1;
  94 }
  95
  96 /*
  97  * journal_commit_transaction
  98  *
  99  * The primary function for committing a transaction to the log.  This
 100  * function is called by the journal thread to begin a complete commit.
 101  */
 102 void journal_commit_transaction(journal_t *journal)
 103 {
 104         transaction_t *commit_transaction;
 105         struct journal_head *jh, *new_jh, *descriptor;
 106         struct buffer_head *wbuf[64];
 107         int bufs;
 108         int flags;
 109         int err;
 110         unsigned long blocknr;
 111         char *tagp = NULL;
 112         journal_header_t *header;
 113         journal_block_tag_t *tag = NULL;
 114         int space_left = 0;
 115         int first_tag = 0;
 116         int tag_flag;
 117         int i;
 118
 119         /*
 120          * First job: lock down the current transaction and wait for
 121          * all outstanding updates to complete.
 122          */
 123
 124 #ifdef COMMIT_STATS
 125         spin_lock(&journal->j_list_lock);
 126         summarise_journal_usage(journal);
 127         spin_unlock(&journal->j_list_lock);
 128 #endif
 129
 130         /* Do we need to erase the effects of a prior journal_flush? */
 131         if (journal->j_flags & JFS_FLUSHED) {
 132                 jbd_debug(3, "super block updated\n");
 133                 journal_update_superblock(journal, 1);
 134         } else {
 135                 jbd_debug(3, "superblock not updated\n");
 136         }
 137
 138         J_ASSERT(journal->j_running_transaction != NULL);
 139         J_ASSERT(journal->j_committing_transaction == NULL);
 140
 141         commit_transaction = journal->j_running_transaction;
 142         J_ASSERT(commit_transaction->t_state == T_RUNNING);
 143
 144         jbd_debug(1, "JBD: starting commit of transaction %d\n",
 145                         commit_transaction->t_tid);
 146
 147         spin_lock(&journal->j_state_lock);
 148         commit_transaction->t_state = T_LOCKED;
 149
 150         spin_lock(&commit_transaction->t_handle_lock);
 151         while (commit_transaction->t_updates) {
 152                 DEFINE_WAIT(wait);
 153
 154                 prepare_to_wait(&journal->j_wait_updates, &wait,
 155                                         TASK_UNINTERRUPTIBLE);
 156                 if (commit_transaction->t_updates) {
 157                         spin_unlock(&commit_transaction->t_handle_lock);
 158                         spin_unlock(&journal->j_state_lock);
 159                         schedule();
 160                         spin_lock(&journal->j_state_lock);
 161                         spin_lock(&commit_transaction->t_handle_lock);
 162                 }
 163                 finish_wait(&journal->j_wait_updates, &wait);
 164         }
 165         spin_unlock(&commit_transaction->t_handle_lock);
 166
 167         J_ASSERT (commit_transaction->t_outstanding_credits <=
 168                         journal->j_max_transaction_buffers);
 169
 170         /*
 171          * First thing we are allowed to do is to discard any remaining
 172          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 173          * that there are no such buffers: if a large filesystem
 174          * operation like a truncate needs to split itself over multiple
 175          * transactions, then it may try to do a journal_restart() while
 176          * there are still BJ_Reserved buffers outstanding.  These must
 177          * be released cleanly from the current transaction.
 178          *
 179          * In this case, the filesystem must still reserve write access
 180          * again before modifying the buffer in the new transaction, but
 181          * we do not require it to remember exactly which old buffers it
 182          * has reserved.  This is consistent with the existing behaviour
 183          * that multiple journal_get_write_access() calls to the same
 184          * buffer are perfectly permissible.
 185          */
 186         while (commit_transaction->t_reserved_list) {
 187                 jh = commit_transaction->t_reserved_list;
 188                 JBUFFER_TRACE(jh, "reserved, unused: refile");
 189                 /*
 190                  * A journal_get_undo_access()+journal_release_buffer() may
 191                  * leave undo-committed data.
 192                  */
 193                 if (jh->b_committed_data) {
 194                         struct buffer_head *bh = jh2bh(jh);
 195
 196                         jbd_lock_bh_state(bh);
 197                         if (jh->b_committed_data) {
 198                                 kfree(jh->b_committed_data);
 199                                 jh->b_committed_data = NULL;
 200                         }
 201                         jbd_unlock_bh_state(bh);
 202                 }
 203                 journal_refile_buffer(journal, jh);
 204         }
 205
 206         /*
 207          * Now try to drop any written-back buffers from the journal's
 208          * checkpoint lists.  We do this *before* commit because it potentially
 209          * frees some memory
 210          */
 211         spin_lock(&journal->j_list_lock);
 212         __journal_clean_checkpoint_list(journal);
 213         spin_unlock(&journal->j_list_lock);
 214
 215         jbd_debug (3, "JBD: commit phase 1\n");
 216
 217         /*
 218          * Switch to a new revoke table.
 219          */
 220         journal_switch_revoke_table(journal);
 221
 222         commit_transaction->t_state = T_FLUSH;
 223         journal->j_committing_transaction = commit_transaction;
 224         journal->j_running_transaction = NULL;
 225         commit_transaction->t_log_start = journal->j_head;
 226         wake_up(&journal->j_wait_transaction_locked);
 227         spin_unlock(&journal->j_state_lock);
 228
 229         jbd_debug (3, "JBD: commit phase 2\n");
 230
 231         /*
 232          * Now start flushing things to disk, in the order they appear
 233          * on the transaction lists.  Data blocks go first.
 234          */
 235
 236         err = 0;
 237         /*
 238          * Whenever we unlock the journal and sleep, things can get added
 239          * onto ->t_sync_datalist, so we have to keep looping back to
 240          * write_out_data until we *know* that the list is empty.
 241          */
 242         bufs = 0;
 243         /*
 244          * Cleanup any flushed data buffers from the data list.  Even in
 245          * abort mode, we want to flush this out as soon as possible.
 246          */
 247 write_out_data:
 248         cond_resched();
 249         spin_lock(&journal->j_list_lock);
 250
 251         while (commit_transaction->t_sync_datalist) {
 252                 struct buffer_head *bh;
 253
 254                 jh = commit_transaction->t_sync_datalist;
 255                 commit_transaction->t_sync_datalist = jh->b_tnext;
 256                 bh = jh2bh(jh);
 257                 if (buffer_locked(bh)) {
 258                         BUFFER_TRACE(bh, "locked");
 259                         if (!inverted_lock(journal, bh))
 260                                 goto write_out_data;
 261                         __journal_unfile_buffer(jh);
 262                         __journal_file_buffer(jh, commit_transaction,
 263                                                 BJ_Locked);
 264                         jbd_unlock_bh_state(bh);
 265                         if (lock_need_resched(&journal->j_list_lock)) {
 266                                 spin_unlock(&journal->j_list_lock);
 267                                 goto write_out_data;
 268                         }
 269                 } else {
 270                         if (buffer_dirty(bh)) {
 271                                 BUFFER_TRACE(bh, "start journal writeout");
 272                                 get_bh(bh);
 273                                 wbuf[bufs++] = bh;
 274                                 if (bufs == ARRAY_SIZE(wbuf)) {
 275                                         jbd_debug(2, "submit %d writes\n",
 276                                                         bufs);
 277                                         spin_unlock(&journal->j_list_lock);
 278                                         ll_rw_block(WRITE, bufs, wbuf);
 279                                         journal_brelse_array(wbuf, bufs);
 280                                         bufs = 0;
 281                                         goto write_out_data;
 282                                 }
 283                         } else {
 284                                 BUFFER_TRACE(bh, "writeout complete: unfile");
 285                                 if (!inverted_lock(journal, bh))
 286                                         goto write_out_data;
 287                                 __journal_unfile_buffer(jh);
 288                                 jbd_unlock_bh_state(bh);
 289                                 journal_remove_journal_head(bh);
 290                                 put_bh(bh);
 291                                 if (lock_need_resched(&journal->j_list_lock)) {
 292                                         spin_unlock(&journal->j_list_lock);
 293                                         goto write_out_data;
 294                                 }
 295                         }
 296                 }
 297         }
 298
 299         if (bufs) {
 300                 spin_unlock(&journal->j_list_lock);
 301                 ll_rw_block(WRITE, bufs, wbuf);
 302                 journal_brelse_array(wbuf, bufs);
 303                 spin_lock(&journal->j_list_lock);
 304         }
 305
 306         /*
 307          * Wait for all previously submitted IO to complete.
 308          */
 309         while (commit_transaction->t_locked_list) {
 310                 struct buffer_head *bh;
 311
 312                 jh = commit_transaction->t_locked_list->b_tprev;
 313                 bh = jh2bh(jh);
 314                 get_bh(bh);
 315                 if (buffer_locked(bh)) {
 316                         spin_unlock(&journal->j_list_lock);
 317                         wait_on_buffer(bh);
 318                         if (unlikely(!buffer_uptodate(bh)))
 319                                 err = -EIO;
 320                         spin_lock(&journal->j_list_lock);
 321                 }
 322                 if (!inverted_lock(journal, bh)) {
 323                         put_bh(bh);
 324                         spin_lock(&journal->j_list_lock);
 325                         continue;
 326                 }
 327                 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
 328                         __journal_unfile_buffer(jh);
 329                         jbd_unlock_bh_state(bh);
 330                         journal_remove_journal_head(bh);
 331                         put_bh(bh);
 332                 } else {
 333                         jbd_unlock_bh_state(bh);
 334                 }
 335                 put_bh(bh);
 336                 cond_resched_lock(&journal->j_list_lock);
 337         }
 338         spin_unlock(&journal->j_list_lock);
 339
 340         if (err)
 341                 __journal_abort_hard(journal);
 342
 343         journal_write_revoke_records(journal, commit_transaction);
 344
 345         jbd_debug(3, "JBD: commit phase 2\n");
 346
 347         /*
 348          * If we found any dirty or locked buffers, then we should have
 349          * looped back up to the write_out_data label.  If there weren't
 350          * any then journal_clean_data_list should have wiped the list
 351          * clean by now, so check that it is in fact empty.
 352          */
 353         J_ASSERT (commit_transaction->t_sync_datalist == NULL);
 354
 355         jbd_debug (3, "JBD: commit phase 3\n");
 356
 357         /*
 358          * Way to go: we have now written out all of the data for a
 359          * transaction!  Now comes the tricky part: we need to write out
 360          * metadata.  Loop over the transaction's entire buffer list:
 361          */
 362         commit_transaction->t_state = T_COMMIT;
 363
 364         descriptor = NULL;
 365         bufs = 0;
 366         while (commit_transaction->t_buffers) {
 367
 368                 /* Find the next buffer to be journaled... */
 369
 370                 jh = commit_transaction->t_buffers;
 371
 372                 /* If we're in abort mode, we just un-journal the buffer and
 373                    release it for background writing. */
 374
 375                 if (is_journal_aborted(journal)) {
 376                         JBUFFER_TRACE(jh, "journal is aborting: refile");
 377                         journal_refile_buffer(journal, jh);
 378                         /* If that was the last one, we need to clean up
 379                          * any descriptor buffers which may have been
 380                          * already allocated, even if we are now
 381                          * aborting. */
 382                         if (!commit_transaction->t_buffers)
 383                                 goto start_journal_io;
 384                         continue;
 385                 }
 386
 387                 /* Make sure we have a descriptor block in which to
 388                    record the metadata buffer. */
 389
 390                 if (!descriptor) {
 391                         struct buffer_head *bh;
 392
 393                         J_ASSERT (bufs == 0);
 394
 395                         jbd_debug(4, "JBD: get descriptor\n");
 396
 397                         descriptor = journal_get_descriptor_buffer(journal);
 398                         if (!descriptor) {
 399                                 __journal_abort_hard(journal);
 400                                 continue;
 401                         }
 402
 403                         bh = jh2bh(descriptor);
 404                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
 405                                 (unsigned long long)bh->b_blocknr, bh->b_data);
 406                         header = (journal_header_t *)&bh->b_data[0];
 407                         header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
 408                         header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
 409                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 410
 411                         tagp = &bh->b_data[sizeof(journal_header_t)];
 412                         space_left = bh->b_size - sizeof(journal_header_t);
 413                         first_tag = 1;
 414                         set_buffer_jwrite(bh);
 415                         set_buffer_dirty(bh);
 416                         wbuf[bufs++] = bh;
 417
 418                         /* Record it so that we can wait for IO
 419                            completion later */
 420                         BUFFER_TRACE(bh, "ph3: file as descriptor");
 421                         journal_file_buffer(descriptor, commit_transaction,
 422                                         BJ_LogCtl);
 423                 }
 424
 425                 /* Where is the buffer to be written? */
 426
 427                 err = journal_next_log_block(journal, &blocknr);
 428                 /* If the block mapping failed, just abandon the buffer
 429                    and repeat this loop: we'll fall into the
 430                    refile-on-abort condition above. */
 431                 if (err) {
 432                         __journal_abort_hard(journal);
 433                         continue;
 434                 }
 435
 436                 /*
 437                  * start_this_handle() uses t_outstanding_credits to determine
 438                  * the free space in the log, but this counter is changed
 439                  * by journal_next_log_block() also.
 440                  */
 441                 commit_transaction->t_outstanding_credits--;
 442
 443                 /* Bump b_count to prevent truncate from stumbling over
 444                    the shadowed buffer!  @@@ This can go if we ever get
 445                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 446                 atomic_inc(&jh2bh(jh)->b_count);
 447
 448                 /* Make a temporary IO buffer with which to write it out
 449                    (this will requeue both the metadata buffer and the
 450                    temporary IO buffer). new_bh goes on BJ_IO*/
 451
 452                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 453                 /*
 454                  * akpm: journal_write_metadata_buffer() sets
 455                  * new_bh->b_transaction to commit_transaction.
 456                  * We need to clean this up before we release new_bh
 457                  * (which is of type BJ_IO)
 458                  */
 459                 JBUFFER_TRACE(jh, "ph3: write metadata");
 460                 flags = journal_write_metadata_buffer(commit_transaction,
 461                                                       jh, &new_jh, blocknr);
 462                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 463                 wbuf[bufs++] = jh2bh(new_jh);
 464
 465                 /* Record the new block's tag in the current descriptor
 466                    buffer */
 467
 468                 tag_flag = 0;
 469                 if (flags & 1)
 470                         tag_flag |= JFS_FLAG_ESCAPE;
 471                 if (!first_tag)
 472                         tag_flag |= JFS_FLAG_SAME_UUID;
 473
 474                 tag = (journal_block_tag_t *) tagp;
 475                 tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
 476                 tag->t_flags = cpu_to_be32(tag_flag);
 477                 tagp += sizeof(journal_block_tag_t);
 478                 space_left -= sizeof(journal_block_tag_t);
 479
 480                 if (first_tag) {
 481                         memcpy (tagp, journal->j_uuid, 16);
 482                         tagp += 16;
 483                         space_left -= 16;
 484                         first_tag = 0;
 485                 }
 486
 487                 /* If there's no more to do, or if the descriptor is full,
 488                    let the IO rip! */
 489
 490                 if (bufs == ARRAY_SIZE(wbuf) ||
 491                     commit_transaction->t_buffers == NULL ||
 492                     space_left < sizeof(journal_block_tag_t) + 16) {
 493
 494                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
 495
 496                         /* Write an end-of-descriptor marker before
 497                            submitting the IOs.  "tag" still points to
 498                            the last tag we set up. */
 499
 500                         tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
 501
 502 start_journal_io:
 503                         for (i = 0; i < bufs; i++) {
 504                                 struct buffer_head *bh = wbuf[i];
 505                                 lock_buffer(bh);
 506                                 clear_buffer_dirty(bh);
 507                                 set_buffer_uptodate(bh);
 508                                 bh->b_end_io = journal_end_buffer_io_sync;
 509                                 submit_bh(WRITE, bh);
 510                         }
 511                         cond_resched();
 512
 513                         /* Force a new descriptor to be generated next
 514                            time round the loop. */
 515                         descriptor = NULL;
 516                         bufs = 0;
 517                 }
 518         }
 519
 520         /* Lo and behold: we have just managed to send a transaction to
 521            the log.  Before we can commit it, wait for the IO so far to
 522            complete.  Control buffers being written are on the
 523            transaction's t_log_list queue, and metadata buffers are on
 524            the t_iobuf_list queue.
 525
 526            Wait for the buffers in reverse order.  That way we are
 527            less likely to be woken up until all IOs have completed, and
 528            so we incur less scheduling load.
 529         */
 530
 531         jbd_debug(3, "JBD: commit phase 4\n");
 532
 533         /*
 534          * akpm: these are BJ_IO, and j_list_lock is not needed.
 535          * See __journal_try_to_free_buffer.
 536          */
 537 wait_for_iobuf:
 538         while (commit_transaction->t_iobuf_list != NULL) {
 539                 struct buffer_head *bh;
 540
 541                 jh = commit_transaction->t_iobuf_list->b_tprev;
 542                 bh = jh2bh(jh);
 543                 if (buffer_locked(bh)) {
 544                         wait_on_buffer(bh);
 545                         goto wait_for_iobuf;
 546                 }
 547                 if (cond_resched())
 548                         goto wait_for_iobuf;
 549
 550                 if (unlikely(!buffer_uptodate(bh)))
 551                         err = -EIO;
 552
 553                 clear_buffer_jwrite(bh);
 554
 555                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 556                 journal_unfile_buffer(journal, jh);
 557
 558                 /*
 559                  * ->t_iobuf_list should contain only dummy buffer_heads
 560                  * which were created by journal_write_metadata_buffer().
 561                  */
 562                 BUFFER_TRACE(bh, "dumping temporary bh");
 563                 journal_put_journal_head(jh);
 564                 __brelse(bh);
 565                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 566                 free_buffer_head(bh);
 567
 568                 /* We also have to unlock and free the corresponding
 569                    shadowed buffer */
 570                 jh = commit_transaction->t_shadow_list->b_tprev;
 571                 bh = jh2bh(jh);
 572                 clear_bit(BH_JWrite, &bh->b_state);
 573                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
 574
 575                 /* The metadata is now released for reuse, but we need
 576                    to remember it against this transaction so that when
 577                    we finally commit, we can do any checkpointing
 578                    required. */
 579                 JBUFFER_TRACE(jh, "file as BJ_Forget");
 580                 journal_file_buffer(jh, commit_transaction, BJ_Forget);
 581                 /* Wake up any transactions which were waiting for this
 582                    IO to complete */
 583                 wake_up_bit(&bh->b_state, BH_Unshadow);
 584                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
 585                 __brelse(bh);
 586         }
 587
 588         J_ASSERT (commit_transaction->t_shadow_list == NULL);
 589
 590         jbd_debug(3, "JBD: commit phase 5\n");
 591
 592         /* Here we wait for the revoke record and descriptor record buffers */
 593  wait_for_ctlbuf:
 594         while (commit_transaction->t_log_list != NULL) {
 595                 struct buffer_head *bh;
 596
 597                 jh = commit_transaction->t_log_list->b_tprev;
 598                 bh = jh2bh(jh);
 599                 if (buffer_locked(bh)) {
 600                         wait_on_buffer(bh);
 601                         goto wait_for_ctlbuf;
 602                 }
 603                 if (cond_resched())
 604                         goto wait_for_ctlbuf;
 605
 606                 if (unlikely(!buffer_uptodate(bh)))
 607                         err = -EIO;
 608
 609                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 610                 clear_buffer_jwrite(bh);
 611                 journal_unfile_buffer(journal, jh);
 612                 journal_put_journal_head(jh);
 613                 __brelse(bh);           /* One for getblk */
 614                 /* AKPM: bforget here */
 615         }
 616
 617         jbd_debug(3, "JBD: commit phase 6\n");
 618
 619         if (is_journal_aborted(journal))
 620                 goto skip_commit;
 621
 622         /* Done it all: now write the commit record.  We should have
 623          * cleaned up our previous buffers by now, so if we are in abort
 624          * mode we can now just skip the rest of the journal write
 625          * entirely. */
 626
 627         descriptor = journal_get_descriptor_buffer(journal);
 628         if (!descriptor) {
 629                 __journal_abort_hard(journal);
 630                 goto skip_commit;
 631         }
 632
 633         /* AKPM: buglet - add `i' to tmp! */
 634         for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) {
 635                 journal_header_t *tmp =
 636                         (journal_header_t*)jh2bh(descriptor)->b_data;
 637                 tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
 638                 tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
 639                 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 640         }
 641
 642         JBUFFER_TRACE(descriptor, "write commit block");
 643         {
 644                 struct buffer_head *bh = jh2bh(descriptor);
 645                 int ret;
 646                 int barrier_done = 0;
 647
 648                 set_buffer_dirty(bh);
 649                 if (journal->j_flags & JFS_BARRIER) {
 650                         set_buffer_ordered(bh);
 651                         barrier_done = 1;
 652                 }
 653                 ret = sync_dirty_buffer(bh);
 654                 /* is it possible for another commit to fail at roughly
 655                  * the same time as this one?  If so, we don't want to
 656                  * trust the barrier flag in the super, but instead want
 657                  * to remember if we sent a barrier request
 658                  */
 659                 if (ret == -EOPNOTSUPP && barrier_done) {
 660                         char b[BDEVNAME_SIZE];
 661
 662                         printk(KERN_WARNING
 663                                 "JBD: barrier-based sync failed on %s - "
 664                                 "disabling barriers\n",
 665                                 bdevname(journal->j_dev, b));
 666                         spin_lock(&journal->j_state_lock);
 667                         journal->j_flags &= ~JFS_BARRIER;
 668                         spin_unlock(&journal->j_state_lock);
 669
 670                         /* And try again, without the barrier */
 671                         clear_buffer_ordered(bh);
 672                         set_buffer_uptodate(bh);
 673                         set_buffer_dirty(bh);
 674                         ret = sync_dirty_buffer(bh);
 675                 }
 676                 if (unlikely(ret == -EIO))
 677                         err = -EIO;
 678                 put_bh(bh);             /* One for getblk() */
 679                 journal_put_journal_head(descriptor);
 680         }
 681
 682         /* End of a transaction!  Finally, we can do checkpoint
 683            processing: any buffers committed as a result of this
 684            transaction can be removed from any checkpoint list it was on
 685            before. */
 686
 687 skip_commit: /* The journal should be unlocked by now. */
 688
 689         if (err)
 690                 __journal_abort_hard(journal);
 691
 692         jbd_debug(3, "JBD: commit phase 7\n");
 693
 694         J_ASSERT(commit_transaction->t_sync_datalist == NULL);
 695         J_ASSERT(commit_transaction->t_buffers == NULL);
 696         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 697         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 698         J_ASSERT(commit_transaction->t_shadow_list == NULL);
 699         J_ASSERT(commit_transaction->t_log_list == NULL);
 700
 701 restart_loop:
 702         while (commit_transaction->t_forget) {
 703                 transaction_t *cp_transaction;
 704                 struct buffer_head *bh;
 705
 706                 jh = commit_transaction->t_forget;
 707                 bh = jh2bh(jh);
 708                 jbd_lock_bh_state(bh);
 709                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
 710                         jh->b_transaction == journal->j_running_transaction);
 711
 712                 /*
 713                  * If there is undo-protected committed data against
 714                  * this buffer, then we can remove it now.  If it is a
 715                  * buffer needing such protection, the old frozen_data
 716                  * field now points to a committed version of the
 717                  * buffer, so rotate that field to the new committed
 718                  * data.
 719                  *
 720                  * Otherwise, we can just throw away the frozen data now.
 721                  */
 722                 if (jh->b_committed_data) {
 723                         kfree(jh->b_committed_data);
 724                         jh->b_committed_data = NULL;
 725                         if (jh->b_frozen_data) {
 726                                 jh->b_committed_data = jh->b_frozen_data;
 727                                 jh->b_frozen_data = NULL;
 728                         }
 729                 } else if (jh->b_frozen_data) {
 730                         kfree(jh->b_frozen_data);
 731                         jh->b_frozen_data = NULL;
 732                 }
 733
 734                 spin_lock(&journal->j_list_lock);
 735                 cp_transaction = jh->b_cp_transaction;
 736                 if (cp_transaction) {
 737                         JBUFFER_TRACE(jh, "remove from old cp transaction");
 738                         __journal_remove_checkpoint(jh);
 739                 }
 740
 741                 /* Only re-checkpoint the buffer_head if it is marked
 742                  * dirty.  If the buffer was added to the BJ_Forget list
 743                  * by journal_forget, it may no longer be dirty and
 744                  * there's no point in keeping a checkpoint record for
 745                  * it. */
 746
 747                 /* A buffer which has been freed while still being
 748                  * journaled by a previous transaction may end up still
 749                  * being dirty here, but we want to avoid writing back
 750                  * that buffer in the future now that the last use has
 751                  * been committed.  That's not only a performance gain,
 752                  * it also stops aliasing problems if the buffer is left
 753                  * behind for writeback and gets reallocated for another
 754                  * use in a different page. */
 755                 if (buffer_freed(bh)) {
 756                         clear_buffer_freed(bh);
 757                         clear_buffer_jbddirty(bh);
 758                 }
 759
 760                 if (buffer_jbddirty(bh)) {
 761                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
 762                         __journal_insert_checkpoint(jh, commit_transaction);
 763                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 764                         __journal_refile_buffer(jh);
 765                         jbd_unlock_bh_state(bh);
 766                 } else {
 767                         J_ASSERT_BH(bh, !buffer_dirty(bh));
 768                         J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
 769                         __journal_unfile_buffer(jh);
 770                         jbd_unlock_bh_state(bh);
 771                         journal_remove_journal_head(bh);  /* needs a brelse */
 772                         release_buffer_page(bh);
 773                 }
 774                 spin_unlock(&journal->j_list_lock);
 775                 if (cond_resched())
 776                         goto restart_loop;
 777         }
 778
 779         /* Done with this transaction! */
 780
 781         jbd_debug(3, "JBD: commit phase 8\n");
 782
 783         J_ASSERT(commit_transaction->t_state == T_COMMIT);
 784
 785         /*
 786          * This is a bit sleazy.  We borrow j_list_lock to protect
 787          * journal->j_committing_transaction in __journal_remove_checkpoint.
 788          * Really, __journal_remove_checkpoint should be using j_state_lock but
 789          * it's a bit hassle to hold that across __journal_remove_checkpoint
 790          */
 791         spin_lock(&journal->j_state_lock);
 792         spin_lock(&journal->j_list_lock);
 793         commit_transaction->t_state = T_FINISHED;
 794         J_ASSERT(commit_transaction == journal->j_committing_transaction);
 795         journal->j_commit_sequence = commit_transaction->t_tid;
 796         journal->j_committing_transaction = NULL;
 797         spin_unlock(&journal->j_state_lock);
 798
 799         if (commit_transaction->t_checkpoint_list == NULL) {
 800                 __journal_drop_transaction(journal, commit_transaction);
 801         } else {
 802                 if (journal->j_checkpoint_transactions == NULL) {
 803                         journal->j_checkpoint_transactions = commit_transaction;
 804                         commit_transaction->t_cpnext = commit_transaction;
 805                         commit_transaction->t_cpprev = commit_transaction;
 806                 } else {
 807                         commit_transaction->t_cpnext =
 808                                 journal->j_checkpoint_transactions;
 809                         commit_transaction->t_cpprev =
 810                                 commit_transaction->t_cpnext->t_cpprev;
 811                         commit_transaction->t_cpnext->t_cpprev =
 812                                 commit_transaction;
 813                         commit_transaction->t_cpprev->t_cpnext =
 814                                 commit_transaction;
 815                 }
 816         }
 817         spin_unlock(&journal->j_list_lock);
 818
 819         jbd_debug(1, "JBD: commit %d complete, head %d\n",
 820                   journal->j_commit_sequence, journal->j_tail_sequence);
 821
 822         wake_up(&journal->j_wait_done_commit);
 823 }