Merge to Fedora kernel-2.6.18-1.2224_FC5 patched with stable patch-2.6.18.1-vs2.0...
[linux-2.6.git] / fs / jbd / checkpoint.c
1 /*
2  * linux/fs/jbd/checkpoint.c
3  * 
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5  *
6  * Copyright 1999 Red Hat Software --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Checkpoint routines for the generic filesystem journaling code.  
13  * Part of the ext2fs journaling system.  
14  *
15  * Checkpointing is the process of ensuring that a section of the log is
16  * committed fully to disk, so that that portion of the log can be
17  * reused.
18  */
19
20 #include <linux/time.h>
21 #include <linux/fs.h>
22 #include <linux/jbd.h>
23 #include <linux/errno.h>
24 #include <linux/slab.h>
25
26 /*
27  * Unlink a buffer from a transaction checkpoint list.
28  *
29  * Called with j_list_lock held.
30  */
31 static inline void __buffer_unlink_first(struct journal_head *jh)
32 {
33         transaction_t *transaction = jh->b_cp_transaction;
34
35         jh->b_cpnext->b_cpprev = jh->b_cpprev;
36         jh->b_cpprev->b_cpnext = jh->b_cpnext;
37         if (transaction->t_checkpoint_list == jh) {
38                 transaction->t_checkpoint_list = jh->b_cpnext;
39                 if (transaction->t_checkpoint_list == jh)
40                         transaction->t_checkpoint_list = NULL;
41         }
42 }
43
44 /*
45  * Unlink a buffer from a transaction checkpoint(io) list.
46  *
47  * Called with j_list_lock held.
48  */
49 static inline void __buffer_unlink(struct journal_head *jh)
50 {
51         transaction_t *transaction = jh->b_cp_transaction;
52
53         __buffer_unlink_first(jh);
54         if (transaction->t_checkpoint_io_list == jh) {
55                 transaction->t_checkpoint_io_list = jh->b_cpnext;
56                 if (transaction->t_checkpoint_io_list == jh)
57                         transaction->t_checkpoint_io_list = NULL;
58         }
59 }
60
61 /*
62  * Move a buffer from the checkpoint list to the checkpoint io list
63  *
64  * Called with j_list_lock held
65  */
66 static inline void __buffer_relink_io(struct journal_head *jh)
67 {
68         transaction_t *transaction = jh->b_cp_transaction;
69
70         __buffer_unlink_first(jh);
71
72         if (!transaction->t_checkpoint_io_list) {
73                 jh->b_cpnext = jh->b_cpprev = jh;
74         } else {
75                 jh->b_cpnext = transaction->t_checkpoint_io_list;
76                 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
77                 jh->b_cpprev->b_cpnext = jh;
78                 jh->b_cpnext->b_cpprev = jh;
79         }
80         transaction->t_checkpoint_io_list = jh;
81 }
82
83 /*
84  * Try to release a checkpointed buffer from its transaction.
85  * Returns 1 if we released it and 2 if we also released the
86  * whole transaction.
87  *
88  * Requires j_list_lock
89  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
90  */
91 static int __try_to_free_cp_buf(struct journal_head *jh)
92 {
93         int ret = 0;
94         struct buffer_head *bh = jh2bh(jh);
95
96         if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
97                 JBUFFER_TRACE(jh, "remove from checkpoint list");
98                 ret = __journal_remove_checkpoint(jh) + 1;
99                 jbd_unlock_bh_state(bh);
100                 journal_remove_journal_head(bh);
101                 BUFFER_TRACE(bh, "release");
102                 __brelse(bh);
103         } else {
104                 jbd_unlock_bh_state(bh);
105         }
106         return ret;
107 }
108
109 /*
110  * __log_wait_for_space: wait until there is space in the journal.
111  *
112  * Called under j-state_lock *only*.  It will be unlocked if we have to wait
113  * for a checkpoint to free up some space in the log.
114  */
115 void __log_wait_for_space(journal_t *journal)
116 {
117         int nblocks;
118         assert_spin_locked(&journal->j_state_lock);
119
120         nblocks = jbd_space_needed(journal);
121         while (__log_space_left(journal) < nblocks) {
122                 if (journal->j_flags & JFS_ABORT)
123                         return;
124                 spin_unlock(&journal->j_state_lock);
125                 mutex_lock(&journal->j_checkpoint_mutex);
126
127                 /*
128                  * Test again, another process may have checkpointed while we
129                  * were waiting for the checkpoint lock
130                  */
131                 spin_lock(&journal->j_state_lock);
132                 nblocks = jbd_space_needed(journal);
133                 if (__log_space_left(journal) < nblocks) {
134                         spin_unlock(&journal->j_state_lock);
135                         log_do_checkpoint(journal);
136                         spin_lock(&journal->j_state_lock);
137                 }
138                 mutex_unlock(&journal->j_checkpoint_mutex);
139         }
140 }
141
142 /*
143  * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
144  * The caller must restart a list walk.  Wait for someone else to run
145  * jbd_unlock_bh_state().
146  */
147 static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
148 {
149         get_bh(bh);
150         spin_unlock(&journal->j_list_lock);
151         jbd_lock_bh_state(bh);
152         jbd_unlock_bh_state(bh);
153         put_bh(bh);
154 }
155
156 /*
157  * Clean up transaction's list of buffers submitted for io.
158  * We wait for any pending IO to complete and remove any clean
159  * buffers. Note that we take the buffers in the opposite ordering
160  * from the one in which they were submitted for IO.
161  *
162  * Called with j_list_lock held.
163  */
164 static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
165 {
166         struct journal_head *jh;
167         struct buffer_head *bh;
168         tid_t this_tid;
169         int released = 0;
170
171         this_tid = transaction->t_tid;
172 restart:
173         /* Did somebody clean up the transaction in the meanwhile? */
174         if (journal->j_checkpoint_transactions != transaction ||
175                         transaction->t_tid != this_tid)
176                 return;
177         while (!released && transaction->t_checkpoint_io_list) {
178                 jh = transaction->t_checkpoint_io_list;
179                 bh = jh2bh(jh);
180                 if (!jbd_trylock_bh_state(bh)) {
181                         jbd_sync_bh(journal, bh);
182                         spin_lock(&journal->j_list_lock);
183                         goto restart;
184                 }
185                 if (buffer_locked(bh)) {
186                         atomic_inc(&bh->b_count);
187                         spin_unlock(&journal->j_list_lock);
188                         jbd_unlock_bh_state(bh);
189                         wait_on_buffer(bh);
190                         /* the journal_head may have gone by now */
191                         BUFFER_TRACE(bh, "brelse");
192                         __brelse(bh);
193                         spin_lock(&journal->j_list_lock);
194                         goto restart;
195                 }
196                 /*
197                  * Now in whatever state the buffer currently is, we know that
198                  * it has been written out and so we can drop it from the list
199                  */
200                 released = __journal_remove_checkpoint(jh);
201                 jbd_unlock_bh_state(bh);
202                 journal_remove_journal_head(bh);
203                 __brelse(bh);
204         }
205 }
206
207 #define NR_BATCH        64
208
209 static void
210 __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
211 {
212         int i;
213
214         ll_rw_block(SWRITE, *batch_count, bhs);
215         for (i = 0; i < *batch_count; i++) {
216                 struct buffer_head *bh = bhs[i];
217                 clear_buffer_jwrite(bh);
218                 BUFFER_TRACE(bh, "brelse");
219                 __brelse(bh);
220         }
221         *batch_count = 0;
222 }
223
224 /*
225  * Try to flush one buffer from the checkpoint list to disk.
226  *
227  * Return 1 if something happened which requires us to abort the current
228  * scan of the checkpoint list.  
229  *
230  * Called with j_list_lock held and drops it if 1 is returned
231  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
232  */
233 static int __process_buffer(journal_t *journal, struct journal_head *jh,
234                         struct buffer_head **bhs, int *batch_count)
235 {
236         struct buffer_head *bh = jh2bh(jh);
237         int ret = 0;
238
239         if (buffer_locked(bh)) {
240                 atomic_inc(&bh->b_count);
241                 spin_unlock(&journal->j_list_lock);
242                 jbd_unlock_bh_state(bh);
243                 wait_on_buffer(bh);
244                 /* the journal_head may have gone by now */
245                 BUFFER_TRACE(bh, "brelse");
246                 __brelse(bh);
247                 ret = 1;
248         } else if (jh->b_transaction != NULL) {
249                 transaction_t *t = jh->b_transaction;
250                 tid_t tid = t->t_tid;
251
252                 spin_unlock(&journal->j_list_lock);
253                 jbd_unlock_bh_state(bh);
254                 log_start_commit(journal, tid);
255                 log_wait_commit(journal, tid);
256                 ret = 1;
257         } else if (!buffer_dirty(bh)) {
258                 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
259                 BUFFER_TRACE(bh, "remove from checkpoint");
260                 __journal_remove_checkpoint(jh);
261                 spin_unlock(&journal->j_list_lock);
262                 jbd_unlock_bh_state(bh);
263                 journal_remove_journal_head(bh);
264                 __brelse(bh);
265                 ret = 1;
266         } else {
267                 /*
268                  * Important: we are about to write the buffer, and
269                  * possibly block, while still holding the journal lock.
270                  * We cannot afford to let the transaction logic start
271                  * messing around with this buffer before we write it to
272                  * disk, as that would break recoverability.  
273                  */
274                 BUFFER_TRACE(bh, "queue");
275                 get_bh(bh);
276                 J_ASSERT_BH(bh, !buffer_jwrite(bh));
277                 set_buffer_jwrite(bh);
278                 bhs[*batch_count] = bh;
279                 __buffer_relink_io(jh);
280                 jbd_unlock_bh_state(bh);
281                 (*batch_count)++;
282                 if (*batch_count == NR_BATCH) {
283                         spin_unlock(&journal->j_list_lock);
284                         __flush_batch(journal, bhs, batch_count);
285                         ret = 1;
286                 }
287         }
288         return ret;
289 }
290
291 /*
292  * Perform an actual checkpoint. We take the first transaction on the
293  * list of transactions to be checkpointed and send all its buffers
294  * to disk. We submit larger chunks of data at once.
295  * 
296  * The journal should be locked before calling this function.
297  */
298 int log_do_checkpoint(journal_t *journal)
299 {
300         transaction_t *transaction;
301         tid_t this_tid;
302         int result;
303
304         jbd_debug(1, "Start checkpoint\n");
305
306         /* 
307          * First thing: if there are any transactions in the log which
308          * don't need checkpointing, just eliminate them from the
309          * journal straight away.  
310          */
311         result = cleanup_journal_tail(journal);
312         jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
313         if (result <= 0)
314                 return result;
315
316         /*
317          * OK, we need to start writing disk blocks.  Take one transaction
318          * and write it.
319          */
320         spin_lock(&journal->j_list_lock);
321         if (!journal->j_checkpoint_transactions)
322                 goto out;
323         transaction = journal->j_checkpoint_transactions;
324         this_tid = transaction->t_tid;
325 restart:
326         /*
327          * If someone cleaned up this transaction while we slept, we're
328          * done (maybe it's a new transaction, but it fell at the same
329          * address).
330          */
331         if (journal->j_checkpoint_transactions == transaction &&
332                         transaction->t_tid == this_tid) {
333                 int batch_count = 0;
334                 struct buffer_head *bhs[NR_BATCH];
335                 struct journal_head *jh;
336                 int retry = 0;
337
338                 while (!retry && transaction->t_checkpoint_list) {
339                         struct buffer_head *bh;
340
341                         jh = transaction->t_checkpoint_list;
342                         bh = jh2bh(jh);
343                         if (!jbd_trylock_bh_state(bh)) {
344                                 jbd_sync_bh(journal, bh);
345                                 retry = 1;
346                                 break;
347                         }
348                         retry = __process_buffer(journal, jh, bhs,&batch_count);
349                         if (!retry && lock_need_resched(&journal->j_list_lock)){
350                                 spin_unlock(&journal->j_list_lock);
351                                 retry = 1;
352                                 break;
353                         }
354                 }
355
356                 if (batch_count) {
357                         if (!retry) {
358                                 spin_unlock(&journal->j_list_lock);
359                                 retry = 1;
360                         }
361                         __flush_batch(journal, bhs, &batch_count);
362                 }
363
364                 if (retry) {
365                         spin_lock(&journal->j_list_lock);
366                         goto restart;
367                 }
368                 /*
369                  * Now we have cleaned up the first transaction's checkpoint
370                  * list. Let's clean up the second one
371                  */
372                 __wait_cp_io(journal, transaction);
373         }
374 out:
375         spin_unlock(&journal->j_list_lock);
376         result = cleanup_journal_tail(journal);
377         if (result < 0)
378                 return result;
379         return 0;
380 }
381
382 /*
383  * Check the list of checkpoint transactions for the journal to see if
384  * we have already got rid of any since the last update of the log tail
385  * in the journal superblock.  If so, we can instantly roll the
386  * superblock forward to remove those transactions from the log.
387  * 
388  * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
389  * 
390  * Called with the journal lock held.
391  *
392  * This is the only part of the journaling code which really needs to be
393  * aware of transaction aborts.  Checkpointing involves writing to the
394  * main filesystem area rather than to the journal, so it can proceed
395  * even in abort state, but we must not update the journal superblock if
396  * we have an abort error outstanding.
397  */
398
399 int cleanup_journal_tail(journal_t *journal)
400 {
401         transaction_t * transaction;
402         tid_t           first_tid;
403         unsigned long   blocknr, freed;
404
405         /* OK, work out the oldest transaction remaining in the log, and
406          * the log block it starts at. 
407          * 
408          * If the log is now empty, we need to work out which is the
409          * next transaction ID we will write, and where it will
410          * start. */
411
412         spin_lock(&journal->j_state_lock);
413         spin_lock(&journal->j_list_lock);
414         transaction = journal->j_checkpoint_transactions;
415         if (transaction) {
416                 first_tid = transaction->t_tid;
417                 blocknr = transaction->t_log_start;
418         } else if ((transaction = journal->j_committing_transaction) != NULL) {
419                 first_tid = transaction->t_tid;
420                 blocknr = transaction->t_log_start;
421         } else if ((transaction = journal->j_running_transaction) != NULL) {
422                 first_tid = transaction->t_tid;
423                 blocknr = journal->j_head;
424         } else {
425                 first_tid = journal->j_transaction_sequence;
426                 blocknr = journal->j_head;
427         }
428         spin_unlock(&journal->j_list_lock);
429         J_ASSERT(blocknr != 0);
430
431         /* If the oldest pinned transaction is at the tail of the log
432            already then there's not much we can do right now. */
433         if (journal->j_tail_sequence == first_tid) {
434                 spin_unlock(&journal->j_state_lock);
435                 return 1;
436         }
437
438         /* OK, update the superblock to recover the freed space.
439          * Physical blocks come first: have we wrapped beyond the end of
440          * the log?  */
441         freed = blocknr - journal->j_tail;
442         if (blocknr < journal->j_tail)
443                 freed = freed + journal->j_last - journal->j_first;
444
445         jbd_debug(1,
446                   "Cleaning journal tail from %d to %d (offset %lu), "
447                   "freeing %lu\n",
448                   journal->j_tail_sequence, first_tid, blocknr, freed);
449
450         journal->j_free += freed;
451         journal->j_tail_sequence = first_tid;
452         journal->j_tail = blocknr;
453         spin_unlock(&journal->j_state_lock);
454         if (!(journal->j_flags & JFS_ABORT))
455                 journal_update_superblock(journal, 1);
456         return 0;
457 }
458
459
460 /* Checkpoint list management */
461
462 /*
463  * journal_clean_one_cp_list
464  *
465  * Find all the written-back checkpoint buffers in the given list and release them.
466  *
467  * Called with the journal locked.
468  * Called with j_list_lock held.
469  * Returns number of bufers reaped (for debug)
470  */
471
472 static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
473 {
474         struct journal_head *last_jh;
475         struct journal_head *next_jh = jh;
476         int ret, freed = 0;
477
478         *released = 0;
479         if (!jh)
480                 return 0;
481
482         last_jh = jh->b_cpprev;
483         do {
484                 jh = next_jh;
485                 next_jh = jh->b_cpnext;
486                 /* Use trylock because of the ranking */
487                 if (jbd_trylock_bh_state(jh2bh(jh))) {
488                         ret = __try_to_free_cp_buf(jh);
489                         if (ret) {
490                                 freed++;
491                                 if (ret == 2) {
492                                         *released = 1;
493                                         return freed;
494                                 }
495                         }
496                 }
497                 /*
498                  * This function only frees up some memory
499                  * if possible so we dont have an obligation
500                  * to finish processing. Bail out if preemption
501                  * requested:
502                  */
503                 if (need_resched())
504                         return freed;
505         } while (jh != last_jh);
506
507         return freed;
508 }
509
510 /*
511  * journal_clean_checkpoint_list
512  *
513  * Find all the written-back checkpoint buffers in the journal and release them.
514  *
515  * Called with the journal locked.
516  * Called with j_list_lock held.
517  * Returns number of buffers reaped (for debug)
518  */
519
520 int __journal_clean_checkpoint_list(journal_t *journal)
521 {
522         transaction_t *transaction, *last_transaction, *next_transaction;
523         int ret = 0;
524         int released;
525
526         transaction = journal->j_checkpoint_transactions;
527         if (!transaction)
528                 goto out;
529
530         last_transaction = transaction->t_cpprev;
531         next_transaction = transaction;
532         do {
533                 transaction = next_transaction;
534                 next_transaction = transaction->t_cpnext;
535                 ret += journal_clean_one_cp_list(transaction->
536                                 t_checkpoint_list, &released);
537                 /*
538                  * This function only frees up some memory if possible so we
539                  * dont have an obligation to finish processing. Bail out if
540                  * preemption requested:
541                  */
542                 if (need_resched())
543                         goto out;
544                 if (released)
545                         continue;
546                 /*
547                  * It is essential that we are as careful as in the case of
548                  * t_checkpoint_list with removing the buffer from the list as
549                  * we can possibly see not yet submitted buffers on io_list
550                  */
551                 ret += journal_clean_one_cp_list(transaction->
552                                 t_checkpoint_io_list, &released);
553                 if (need_resched())
554                         goto out;
555         } while (transaction != last_transaction);
556 out:
557         return ret;
558 }
559
560 /* 
561  * journal_remove_checkpoint: called after a buffer has been committed
562  * to disk (either by being write-back flushed to disk, or being
563  * committed to the log).
564  *
565  * We cannot safely clean a transaction out of the log until all of the
566  * buffer updates committed in that transaction have safely been stored
567  * elsewhere on disk.  To achieve this, all of the buffers in a
568  * transaction need to be maintained on the transaction's checkpoint
569  * lists until they have been rewritten, at which point this function is
570  * called to remove the buffer from the existing transaction's
571  * checkpoint lists.
572  *
573  * The function returns 1 if it frees the transaction, 0 otherwise.
574  *
575  * This function is called with the journal locked.
576  * This function is called with j_list_lock held.
577  * This function is called with jbd_lock_bh_state(jh2bh(jh))
578  */
579
580 int __journal_remove_checkpoint(struct journal_head *jh)
581 {
582         transaction_t *transaction;
583         journal_t *journal;
584         int ret = 0;
585
586         JBUFFER_TRACE(jh, "entry");
587
588         if ((transaction = jh->b_cp_transaction) == NULL) {
589                 JBUFFER_TRACE(jh, "not on transaction");
590                 goto out;
591         }
592         journal = transaction->t_journal;
593
594         __buffer_unlink(jh);
595         jh->b_cp_transaction = NULL;
596
597         if (transaction->t_checkpoint_list != NULL ||
598             transaction->t_checkpoint_io_list != NULL)
599                 goto out;
600         JBUFFER_TRACE(jh, "transaction has no more buffers");
601
602         /*
603          * There is one special case to worry about: if we have just pulled the
604          * buffer off a committing transaction's forget list, then even if the
605          * checkpoint list is empty, the transaction obviously cannot be
606          * dropped!
607          *
608          * The locking here around j_committing_transaction is a bit sleazy.
609          * See the comment at the end of journal_commit_transaction().
610          */
611         if (transaction == journal->j_committing_transaction) {
612                 JBUFFER_TRACE(jh, "belongs to committing transaction");
613                 goto out;
614         }
615
616         /* OK, that was the last buffer for the transaction: we can now
617            safely remove this transaction from the log */
618
619         __journal_drop_transaction(journal, transaction);
620
621         /* Just in case anybody was waiting for more transactions to be
622            checkpointed... */
623         wake_up(&journal->j_wait_logspace);
624         ret = 1;
625 out:
626         JBUFFER_TRACE(jh, "exit");
627         return ret;
628 }
629
630 /*
631  * journal_insert_checkpoint: put a committed buffer onto a checkpoint
632  * list so that we know when it is safe to clean the transaction out of
633  * the log.
634  *
635  * Called with the journal locked.
636  * Called with j_list_lock held.
637  */
638 void __journal_insert_checkpoint(struct journal_head *jh, 
639                                transaction_t *transaction)
640 {
641         JBUFFER_TRACE(jh, "entry");
642         J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
643         J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
644
645         jh->b_cp_transaction = transaction;
646
647         if (!transaction->t_checkpoint_list) {
648                 jh->b_cpnext = jh->b_cpprev = jh;
649         } else {
650                 jh->b_cpnext = transaction->t_checkpoint_list;
651                 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
652                 jh->b_cpprev->b_cpnext = jh;
653                 jh->b_cpnext->b_cpprev = jh;
654         }
655         transaction->t_checkpoint_list = jh;
656 }
657
658 /*
659  * We've finished with this transaction structure: adios...
660  * 
661  * The transaction must have no links except for the checkpoint by this
662  * point.
663  *
664  * Called with the journal locked.
665  * Called with j_list_lock held.
666  */
667
668 void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
669 {
670         assert_spin_locked(&journal->j_list_lock);
671         if (transaction->t_cpnext) {
672                 transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
673                 transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
674                 if (journal->j_checkpoint_transactions == transaction)
675                         journal->j_checkpoint_transactions =
676                                 transaction->t_cpnext;
677                 if (journal->j_checkpoint_transactions == transaction)
678                         journal->j_checkpoint_transactions = NULL;
679         }
680
681         J_ASSERT(transaction->t_state == T_FINISHED);
682         J_ASSERT(transaction->t_buffers == NULL);
683         J_ASSERT(transaction->t_sync_datalist == NULL);
684         J_ASSERT(transaction->t_forget == NULL);
685         J_ASSERT(transaction->t_iobuf_list == NULL);
686         J_ASSERT(transaction->t_shadow_list == NULL);
687         J_ASSERT(transaction->t_log_list == NULL);
688         J_ASSERT(transaction->t_checkpoint_list == NULL);
689         J_ASSERT(transaction->t_checkpoint_io_list == NULL);
690         J_ASSERT(transaction->t_updates == 0);
691         J_ASSERT(journal->j_committing_transaction != transaction);
692         J_ASSERT(journal->j_running_transaction != transaction);
693
694         jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
695         kfree(transaction);
696 }