[linux-2.6.git] fs/jbd/commit.c
/*
 * linux/fs/jbd/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads: called at IO
 * completion, it records whether the write succeeded and unlocks the buffer.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
        unlock_buffer(bh);
}

/*
 * When an ext3-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
        struct page *page;

        if (buffer_dirty(bh))
                goto nope;
        if (atomic_read(&bh->b_count) != 1)
                goto nope;
        page = bh->b_page;
        if (!page)
                goto nope;
        if (page->mapping)
                goto nope;

        /* OK, it's a truncated page */
        if (TestSetPageLocked(page))
                goto nope;

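        /*
         * Pin the page, then drop the caller's buffer reference so that
         * try_to_free_buffers() sees b_count == 0 and can strip the page's
         * buffers.  The page reference keeps the page itself alive until
         * we unlock and release it below.
         */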
        page_cache_get(page);
        __brelse(bh);
        try_to_free_buffers(page);
        unlock_page(page);
        page_cache_release(page);
        return;

nope:
        __brelse(bh);
}

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  The lock ranking puts the bh_state lock before j_list_lock, so with
 * j_list_lock already held we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
        if (!jbd_trylock_bh_state(bh)) {
                spin_unlock(&journal->j_list_lock);
                schedule();
                return 0;
        }
        return 1;
}

/*
 * journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void journal_commit_transaction(journal_t *journal)
{
        transaction_t *commit_transaction;
        struct journal_head *jh, *new_jh, *descriptor;
        struct buffer_head *wbuf[64];
        int bufs;
        int flags;
        int err;
        unsigned long blocknr;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
        int i;

        /*
         * First job: lock down the current transaction and wait for
         * all outstanding updates to complete.
         */

#ifdef COMMIT_STATS
        spin_lock(&journal->j_list_lock);
        summarise_journal_usage(journal);
        spin_unlock(&journal->j_list_lock);
#endif

        /* Do we need to erase the effects of a prior journal_flush? */
        if (journal->j_flags & JFS_FLUSHED) {
                jbd_debug(3, "super block updated\n");
                journal_update_superblock(journal, 1);
        } else {
                jbd_debug(3, "superblock not updated\n");
        }

        J_ASSERT(journal->j_running_transaction != NULL);
        J_ASSERT(journal->j_committing_transaction == NULL);

        commit_transaction = journal->j_running_transaction;
        J_ASSERT(commit_transaction->t_state == T_RUNNING);

        jbd_debug(1, "JBD: starting commit of transaction %d\n",
                        commit_transaction->t_tid);

        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;

        spin_lock(&commit_transaction->t_handle_lock);
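        /*
         * Standard sleep/wakeup dance: register on j_wait_updates before
         * re-checking t_updates, so a journal_stop() which drops the count
         * and issues its wakeup cannot slip in between the test and the
         * schedule() and leave us sleeping forever.
         */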
        while (commit_transaction->t_updates) {
                DEFINE_WAIT(wait);

                prepare_to_wait(&journal->j_wait_updates, &wait,
                                        TASK_UNINTERRUPTIBLE);
                if (commit_transaction->t_updates) {
                        spin_unlock(&commit_transaction->t_handle_lock);
                        spin_unlock(&journal->j_state_lock);
                        schedule();
                        spin_lock(&journal->j_state_lock);
                        spin_lock(&commit_transaction->t_handle_lock);
                }
                finish_wait(&journal->j_wait_updates, &wait);
        }
        spin_unlock(&commit_transaction->t_handle_lock);

        J_ASSERT (commit_transaction->t_outstanding_credits <=
                        journal->j_max_transaction_buffers);

        /*
         * First thing we are allowed to do is to discard any remaining
         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
         * that there are no such buffers: if a large filesystem
         * operation like a truncate needs to split itself over multiple
         * transactions, then it may try to do a journal_restart() while
         * there are still BJ_Reserved buffers outstanding.  These must
         * be released cleanly from the current transaction.
         *
         * In this case, the filesystem must still reserve write access
         * again before modifying the buffer in the new transaction, but
         * we do not require it to remember exactly which old buffers it
         * has reserved.  This is consistent with the existing behaviour
         * that multiple journal_get_write_access() calls to the same
         * buffer are perfectly permissible.
         */
        while (commit_transaction->t_reserved_list) {
                jh = commit_transaction->t_reserved_list;
                JBUFFER_TRACE(jh, "reserved, unused: refile");
                /*
                 * A journal_get_undo_access()+journal_release_buffer() may
                 * leave undo-committed data.
                 */
                if (jh->b_committed_data) {
                        struct buffer_head *bh = jh2bh(jh);

                        jbd_lock_bh_state(bh);
                        if (jh->b_committed_data) {
                                kfree(jh->b_committed_data);
                                jh->b_committed_data = NULL;
                        }
                        jbd_unlock_bh_state(bh);
                }
                journal_refile_buffer(journal, jh);
        }

        /*
         * Now try to drop any written-back buffers from the journal's
         * checkpoint lists.  We do this *before* commit because it
         * potentially frees some memory.
         */
        spin_lock(&journal->j_list_lock);
        __journal_clean_checkpoint_list(journal);
        spin_unlock(&journal->j_list_lock);

        jbd_debug(3, "JBD: commit phase 1\n");

        /*
         * Switch to a new revoke table.
         */
        journal_switch_revoke_table(journal);

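        /*
         * Flip the transaction into the committing state.  With
         * j_running_transaction set back to NULL, the next journal_start()
         * will open a fresh running transaction, and anyone blocked
         * waiting for this (locked) transaction is woken below.
         */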
        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);

        jbd_debug(3, "JBD: commit phase 2\n");

        /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */

        err = 0;
        /*
         * Whenever we unlock the journal and sleep, things can get added
         * onto ->t_sync_datalist, so we have to keep looping back to
         * write_out_data until we *know* that the list is empty.
         */
        bufs = 0;
        /*
         * Cleanup any flushed data buffers from the data list.  Even in
         * abort mode, we want to flush this out as soon as possible.
         */
write_out_data:
        cond_resched();
        spin_lock(&journal->j_list_lock);

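        /*
         * Each data buffer is in one of three states: under IO (locked),
         * dirty and awaiting submission, or already written back.  Locked
         * buffers are parked on BJ_Locked so we can wait for them later;
         * dirty ones are batched into wbuf[] and submitted; clean ones are
         * simply unfiled from the transaction.
         */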
        while (commit_transaction->t_sync_datalist) {
                struct buffer_head *bh;

                jh = commit_transaction->t_sync_datalist;
                commit_transaction->t_sync_datalist = jh->b_tnext;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        BUFFER_TRACE(bh, "locked");
                        if (!inverted_lock(journal, bh))
                                goto write_out_data;
                        __journal_unfile_buffer(jh);
                        __journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        if (need_resched()) {
                                spin_unlock(&journal->j_list_lock);
                                goto write_out_data;
                        }
                } else {
                        if (buffer_dirty(bh)) {
                                BUFFER_TRACE(bh, "start journal writeout");
                                get_bh(bh);
                                wbuf[bufs++] = bh;
                                if (bufs == ARRAY_SIZE(wbuf)) {
                                        jbd_debug(2, "submit %d writes\n",
                                                        bufs);
                                        spin_unlock(&journal->j_list_lock);
                                        ll_rw_block(WRITE, bufs, wbuf);
                                        journal_brelse_array(wbuf, bufs);
                                        bufs = 0;
                                        goto write_out_data;
                                }
                        } else {
                                BUFFER_TRACE(bh, "writeout complete: unfile");
                                if (!inverted_lock(journal, bh))
                                        goto write_out_data;
                                __journal_unfile_buffer(jh);
                                jbd_unlock_bh_state(bh);
                                journal_remove_journal_head(bh);
                                put_bh(bh);
                                if (need_resched()) {
                                        spin_unlock(&journal->j_list_lock);
                                        goto write_out_data;
                                }
                        }
                }
        }

        if (bufs) {
                spin_unlock(&journal->j_list_lock);
                ll_rw_block(WRITE, bufs, wbuf);
                journal_brelse_array(wbuf, bufs);
                spin_lock(&journal->j_list_lock);
        }

        /*
         * Wait for all previously submitted IO to complete.
         */
        while (commit_transaction->t_locked_list) {
                struct buffer_head *bh;

                jh = commit_transaction->t_locked_list->b_tprev;
                bh = jh2bh(jh);
                get_bh(bh);
                if (buffer_locked(bh)) {
                        spin_unlock(&journal->j_list_lock);
                        wait_on_buffer(bh);
                        if (unlikely(!buffer_uptodate(bh)))
                                err = -EIO;
                        spin_lock(&journal->j_list_lock);
                }
                if (!inverted_lock(journal, bh)) {
                        put_bh(bh);
                        spin_lock(&journal->j_list_lock);
                        continue;
                }
                if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
                        __journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        journal_remove_journal_head(bh);
                        put_bh(bh);
                } else {
                        jbd_unlock_bh_state(bh);
                }
                put_bh(bh);
                if (need_resched()) {
                        spin_unlock(&journal->j_list_lock);
                        cond_resched();
                        spin_lock(&journal->j_list_lock);
                }
        }
        spin_unlock(&journal->j_list_lock);

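        /*
         * Flush this transaction's revoke table to the log.  Revoke
         * records prevent journal recovery from replaying stale copies of
         * blocks which were subsequently deleted or reused.
         */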
        journal_write_revoke_records(journal, commit_transaction);

        /*
         * If we found any dirty or locked buffers, then we should have
         * looped back up to the write_out_data label.  If there weren't
         * any then journal_clean_data_list should have wiped the list
         * clean by now, so check that it is in fact empty.
         */
        J_ASSERT (commit_transaction->t_sync_datalist == NULL);

        jbd_debug(3, "JBD: commit phase 3\n");

        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
        commit_transaction->t_state = T_COMMIT;

        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {

                /* Find the next buffer to be journaled... */

                jh = commit_transaction->t_buffers;

                /* If we're in abort mode, we just un-journal the buffer and
                   release it for background writing. */

                if (is_journal_aborted(journal)) {
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
                        journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
                         * any descriptor buffers which may have been
                         * already allocated, even if we are now
                         * aborting. */
                        if (!commit_transaction->t_buffers)
                                goto start_journal_io;
                        continue;
                }

                /* Make sure we have a descriptor block in which to
                   record the metadata buffer. */
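                /*
                 * On-disk layout of a descriptor block: a journal_header_t
                 * (magic, blocktype, sequence) followed by an array of
                 * journal_block_tag_t entries, one per journaled buffer.
                 * The first tag is followed by the journal's 16-byte UUID;
                 * later tags carry JFS_FLAG_SAME_UUID instead.
                 */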

                if (!descriptor) {
                        struct buffer_head *bh;

                        J_ASSERT (bufs == 0);

                        jbd_debug(4, "JBD: get descriptor\n");

                        descriptor = journal_get_descriptor_buffer(journal);
                        if (!descriptor) {
                                __journal_abort_hard(journal);
                                continue;
                        }

                        bh = jh2bh(descriptor);
                        jbd_debug(4, "JBD: got buffer %llu (%p)\n",
                                (unsigned long long)bh->b_blocknr, bh->b_data);
                        header = (journal_header_t *)&bh->b_data[0];
                        header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
                        header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
                        header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

                        tagp = &bh->b_data[sizeof(journal_header_t)];
                        space_left = bh->b_size - sizeof(journal_header_t);
                        first_tag = 1;
                        set_buffer_jwrite(bh);
                        set_buffer_dirty(bh);
                        wbuf[bufs++] = bh;

                        /* Record it so that we can wait for IO
                           completion later */
                        BUFFER_TRACE(bh, "ph3: file as descriptor");
                        journal_file_buffer(descriptor, commit_transaction,
                                        BJ_LogCtl);
                }

                /* Where is the buffer to be written? */

                err = journal_next_log_block(journal, &blocknr);
                /* If the block mapping failed, just abandon the buffer
                   and repeat this loop: we'll fall into the
                   refile-on-abort condition above. */
                if (err) {
                        __journal_abort_hard(journal);
                        continue;
                }

                /*
                 * start_this_handle() uses t_outstanding_credits to determine
                 * the free space in the log, but this counter is changed
                 * by journal_next_log_block() also.
                 */
                commit_transaction->t_outstanding_credits--;

                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
                atomic_inc(&jh2bh(jh)->b_count);

                /* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer).  new_bh goes on BJ_IO. */

                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
                /*
                 * akpm: journal_write_metadata_buffer() sets
                 * new_bh->b_transaction to commit_transaction.
                 * We need to clean this up before we release new_bh
                 * (which is of type BJ_IO)
                 */
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = journal_write_metadata_buffer(commit_transaction,
                                                      jh, &new_jh, blocknr);
                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
                wbuf[bufs++] = jh2bh(new_jh);

                /* Record the new block's tag in the current descriptor
                   buffer */

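                /*
                 * Bit 0 of the value returned by
                 * journal_write_metadata_buffer() flags an "escaped" block:
                 * one whose first four bytes happened to match
                 * JFS_MAGIC_NUMBER.  Those bytes are zeroed in the journal
                 * copy, so recovery cannot mistake it for a journal control
                 * block, and JFS_FLAG_ESCAPE tells recovery to restore them.
                 */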
                tag_flag = 0;
                if (flags & 1)
                        tag_flag |= JFS_FLAG_ESCAPE;
                if (!first_tag)
                        tag_flag |= JFS_FLAG_SAME_UUID;

                tag = (journal_block_tag_t *) tagp;
                tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
                tag->t_flags = cpu_to_be32(tag_flag);
                tagp += sizeof(journal_block_tag_t);
                space_left -= sizeof(journal_block_tag_t);

                if (first_tag) {
                        memcpy (tagp, journal->j_uuid, 16);
                        tagp += 16;
                        space_left -= 16;
                        first_tag = 0;
                }

                /* If there's no more to do, or if the descriptor is full
                   (no room left for another tag plus a 16-byte UUID),
                   let the IO rip! */

                if (bufs == ARRAY_SIZE(wbuf) ||
                    commit_transaction->t_buffers == NULL ||
                    space_left < sizeof(journal_block_tag_t) + 16) {

                        jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

                        /* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

                        tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);

start_journal_io:
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
                                submit_bh(WRITE, bh);
                        }
                        cond_resched();

                        /* Force a new descriptor to be generated next
                           time round the loop. */
                        descriptor = NULL;
                        bufs = 0;
                }
        }

        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

           Wait for the buffers in reverse order.  That way we are
           less likely to be woken up until all IOs have completed, and
           so we incur less scheduling load.
        */

        jbd_debug(3, "JBD: commit phase 4\n");

        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
         * See __journal_try_to_free_buffer.
         */
wait_for_iobuf:
        while (commit_transaction->t_iobuf_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_iobuf_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_iobuf;
                }

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                clear_buffer_jwrite(bh);

                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
                journal_unfile_buffer(journal, jh);

                /*
                 * ->t_iobuf_list should contain only dummy buffer_heads
                 * which were created by journal_write_metadata_buffer().
                 */
                BUFFER_TRACE(bh, "dumping temporary bh");
                journal_put_journal_head(jh);
                __brelse(bh);
                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
                free_buffer_head(bh);

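                /*
                 * journal_write_metadata_buffer() files the IO buffer and
                 * its shadow in matching order, so the tail of
                 * t_shadow_list is the shadow partner of the IO buffer we
                 * just freed.
                 */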
                /* We also have to unlock and free the corresponding
                   shadowed buffer */
                jh = commit_transaction->t_shadow_list->b_tprev;
                bh = jh2bh(jh);
                clear_bit(BH_JWrite, &bh->b_state);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));

                /* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                journal_file_buffer(jh, commit_transaction, BJ_Forget);
                /* Wake up any transactions which were waiting for this
                   IO to complete */
                wake_up_buffer(bh);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
        }

        J_ASSERT (commit_transaction->t_shadow_list == NULL);

        jbd_debug(3, "JBD: commit phase 5\n");

        /* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
        while (commit_transaction->t_log_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_log_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_ctlbuf;
                }

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
                clear_buffer_jwrite(bh);
                journal_unfile_buffer(journal, jh);
                journal_put_journal_head(jh);
                __brelse(bh);           /* One for getblk */
                /* AKPM: bforget here */
        }

        jbd_debug(3, "JBD: commit phase 6\n");

        if (is_journal_aborted(journal))
                goto skip_commit;

        /* Done it all: now write the commit record.  We should have
         * cleaned up our previous buffers by now, so if we are in abort
         * mode we can now just skip the rest of the journal write
         * entirely. */

        descriptor = journal_get_descriptor_buffer(journal);
        if (!descriptor) {
                __journal_abort_hard(journal);
                goto skip_commit;
        }

        /* Stamp a commit header into each 512-byte sector of the block */
        for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) {
                journal_header_t *tmp =
                        (journal_header_t *)(jh2bh(descriptor)->b_data + i);
                tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
                tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
                tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
        }

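        /*
         * The commit block is written synchronously: the transaction only
         * becomes durable once this record is on disk, after all of the
         * descriptor and metadata blocks above it.  Where the device
         * supports barriers, an ordered write keeps it from being
         * reordered ahead of the preceding journal IO.
         */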
        JBUFFER_TRACE(descriptor, "write commit block");
        {
                struct buffer_head *bh = jh2bh(descriptor);
                int ret;
                int barrier_done = 0;

                set_buffer_dirty(bh);
                if (journal->j_flags & JFS_BARRIER) {
                        set_buffer_ordered(bh);
                        barrier_done = 1;
                }
                ret = sync_dirty_buffer(bh);
                /* is it possible for another commit to fail at roughly
                 * the same time as this one?  If so, we don't want to
                 * trust the barrier flag in the super, but instead want
                 * to remember if we sent a barrier request
                 */
                if (ret == -EOPNOTSUPP && barrier_done) {
                        char b[BDEVNAME_SIZE];

                        printk(KERN_WARNING
                                "JBD: barrier-based sync failed on %s - "
                                "disabling barriers\n",
                                bdevname(journal->j_dev, b));
                        spin_lock(&journal->j_state_lock);
                        journal->j_flags &= ~JFS_BARRIER;
                        spin_unlock(&journal->j_state_lock);

                        /* And try again, without the barrier */
                        clear_buffer_ordered(bh);
                        set_buffer_uptodate(bh);
                        set_buffer_dirty(bh);
                        ret = sync_dirty_buffer(bh);
                }
                if (unlikely(ret == -EIO))
                        err = -EIO;
                put_bh(bh);             /* One for getblk() */
                journal_put_journal_head(descriptor);
        }

        /* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint lists they
           were on before. */

skip_commit: /* The journal should be unlocked by now. */

        if (err)
                __journal_abort_hard(journal);

        /*
         * Call any callbacks that had been registered for handles in this
         * transaction.  It is up to the callback to free any allocated
         * memory.
         *
         * The spinlocking (t_jcb_lock) here is surely unnecessary...
         */
        spin_lock(&commit_transaction->t_jcb_lock);
        if (!list_empty(&commit_transaction->t_jcb)) {
                struct list_head *p, *n;
                int error = is_journal_aborted(journal);

                list_for_each_safe(p, n, &commit_transaction->t_jcb) {
                        struct journal_callback *jcb;

                        jcb = list_entry(p, struct journal_callback, jcb_list);
                        list_del(p);
                        spin_unlock(&commit_transaction->t_jcb_lock);
                        jcb->jcb_func(jcb, error);
                        spin_lock(&commit_transaction->t_jcb_lock);
                }
        }
        spin_unlock(&commit_transaction->t_jcb_lock);

        jbd_debug(3, "JBD: commit phase 7\n");

        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
        J_ASSERT(commit_transaction->t_shadow_list == NULL);
        J_ASSERT(commit_transaction->t_log_list == NULL);

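        /*
         * Walk the forget list: for each buffer, drop any undo/frozen
         * data the transaction was holding, detach it from any older
         * checkpoint, and either queue it for checkpointing against this
         * transaction (if it is still jbd-dirty) or release it entirely.
         */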
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;

                jh = commit_transaction->t_forget;
                bh = jh2bh(jh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
                        jh->b_transaction == journal->j_running_transaction);

                /*
                 * If there is undo-protected committed data against
                 * this buffer, then we can remove it now.  If it is a
                 * buffer needing such protection, the old frozen_data
                 * field now points to a committed version of the
                 * buffer, so rotate that field to the new committed
                 * data.
                 *
                 * Otherwise, we can just throw away the frozen data now.
                 */
                if (jh->b_committed_data) {
                        kfree(jh->b_committed_data);
                        jh->b_committed_data = NULL;
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
                        }
                } else if (jh->b_frozen_data) {
                        kfree(jh->b_frozen_data);
                        jh->b_frozen_data = NULL;
                }

                spin_lock(&journal->j_list_lock);
                cp_transaction = jh->b_cp_transaction;
                if (cp_transaction) {
                        JBUFFER_TRACE(jh, "remove from old cp transaction");
                        __journal_remove_checkpoint(jh);
                }

                /* Only re-checkpoint the buffer_head if it is marked
                 * dirty.  If the buffer was added to the BJ_Forget list
                 * by journal_forget, it may no longer be dirty and
                 * there's no point in keeping a checkpoint record for
                 * it. */

                /* A buffer which has been freed while still being
                 * journaled by a previous transaction may end up still
                 * being dirty here, but we want to avoid writing back
                 * that buffer in the future now that the last use has
                 * been committed.  That's not only a performance gain,
                 * it also stops aliasing problems if the buffer is left
                 * behind for writeback and gets reallocated for another
                 * use in a different page. */
                if (buffer_freed(bh)) {
                        clear_buffer_freed(bh);
                        clear_buffer_jbddirty(bh);
                }

                if (buffer_jbddirty(bh)) {
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __journal_insert_checkpoint(jh, commit_transaction);
                        JBUFFER_TRACE(jh, "refile for checkpoint writeback");
                        __journal_refile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
                        J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
                        __journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        journal_remove_journal_head(bh);  /* needs a brelse */
                        release_buffer_page(bh);
                }
                spin_unlock(&journal->j_list_lock);
        }

        /* Done with this transaction! */

        jbd_debug(3, "JBD: commit phase 8\n");

        J_ASSERT(commit_transaction->t_state == T_COMMIT);

        /*
         * This is a bit sleazy.  We borrow j_list_lock to protect
         * journal->j_committing_transaction in __journal_remove_checkpoint.
         * Really, __journal_remove_checkpoint should be using j_state_lock,
         * but it's a bit of a hassle to hold that across
         * __journal_remove_checkpoint.
         */
        spin_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        commit_transaction->t_state = T_FINISHED;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
        spin_unlock(&journal->j_state_lock);

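        /*
         * If nothing in this transaction needs checkpointing, it can be
         * dropped right away.  Otherwise link it into the journal's
         * circular list of transactions awaiting checkpoint; the list is
         * doubly linked via t_cpnext/t_cpprev, and a lone transaction
         * points at itself.
         */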
        if (commit_transaction->t_checkpoint_list == NULL) {
                __journal_drop_transaction(journal, commit_transaction);
        } else {
                if (journal->j_checkpoint_transactions == NULL) {
                        journal->j_checkpoint_transactions = commit_transaction;
                        commit_transaction->t_cpnext = commit_transaction;
                        commit_transaction->t_cpprev = commit_transaction;
                } else {
                        commit_transaction->t_cpnext =
                                journal->j_checkpoint_transactions;
                        commit_transaction->t_cpprev =
                                commit_transaction->t_cpnext->t_cpprev;
                        commit_transaction->t_cpnext->t_cpprev =
                                commit_transaction;
                        commit_transaction->t_cpprev->t_cpnext =
                                commit_transaction;
                }
        }
        spin_unlock(&journal->j_list_lock);

        jbd_debug(1, "JBD: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);

        wake_up(&journal->j_wait_done_commit);
}