1 * orphan_list_check_on_destroy_inode.patch
2 http://github.com/caglar10ur/linux-2.6.27.y/commit/56ccd891a32e6409700786737953906426512ff7
3 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=a6c15c2b0fbfd5c0a84f5f0e1e3f20f85d2b8692
5 * don-t-read-inode-block-if-buf-has-write-error.patch
6 http://github.com/caglar10ur/linux-2.6.27.y/commit/1e7d951fa2bd7d49121aff0ec7eb0331e3d5eeb5
7 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=95450f5a7e53d5752ce1a0d0b8282e10fe745ae0
9 * error-in-ext3_lookup-if-corruption-found.patch
10 http://github.com/caglar10ur/linux-2.6.27.y/commit/f1d52243f69747ee601d671ec1b98a7363ce0597
11 Filesystem errors should be logged and not silently ignored
13 * fix-accessing-freed-memory-in-ext3_abort.patch
14 http://github.com/caglar10ur/linux-2.6.27.y/commit/f863ab8749fca8d167b357357ed7492c1c6d530d
15 http://git.kernel.org/?p=linux/kernel/git/stable/linux-2.6.32.y.git;a=commitdiff_plain;h=44d6f78756560e95903de239e10f8a40a6eae444
17 * make_fdatasync_not_sync_metadata.patch
18 http://github.com/caglar10ur/linux-2.6.27.y/commit/1d29962ae5187764989ede9ec0d0777d2f489345
19 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=3d61f75eefedf75914ab4453c67aaa2ee64bcf93
21 * add_checks_for_errors_from_jbd.patch
22 http://github.com/caglar10ur/linux-2.6.27.y/commit/f38c319731f010b1c36c25ed591f79bcb557d2f2
23 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=2d7c820e56ce83b23daee9eb5343730fb309418e
25 * add_missing_error_checks.patch
26 http://github.com/caglar10ur/linux-2.6.27.y/commit/8f0e6faf83721162a77cb5df5c483e4799bea22b
27 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=cbe5f466f6995e10a10c7ae66d6dc8608f08a6b8
29 The original patch does the following two things:
31 (1) stop aborting the journal on file data write errors; instead,
32 just call printk() and set AS_EIO on the appropriate address_space
34 (2) add missing error checks for file data writes
36 This patch does only (2).
38 * dont_dirty_original_metadata_buffer_on_abort.patch
39 http://github.com/caglar10ur/linux-2.6.27.y/commit/d003fa1a88c857795ca7e102eefbf26c9088aa66
40 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=7ad7445f60fe4d46c4c9d2a9463db180d2a3b270
42 * fix_commit_code_to_properly_abort_journal.patch
43 http://github.com/caglar10ur/linux-2.6.27.y/commit/232632e87d9bc83b89d97f98f311d67d45e0e6dd
44 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=7a266e75cf5a1efd20d084408a1b7f1a185496dd
46 * fix_journal_overflow_issues.patch
47 http://github.com/caglar10ur/linux-2.6.27.y/commit/74361d7e55f28847e3b7eda4a4563d02ab001537
48 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=5bc833feaa8b2236265764e7e81f44937be46eda
49 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=5b9a499d77e9dd39c9e6611ea10c56a31604f274
51 * fix_typo_in_recovery_code.patch
52 http://github.com/caglar10ur/linux-2.6.27.y/commit/da999401f4bedf317b2e7dcd3c9163b1a433ba3c
53 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=439aeec639d7c57f3561054a6d315c40fd24bb74
55 * jbd-properly-dispose-of-unmapped-data-buffers.patch
56 http://github.com/caglar10ur/linux-2.6.27.y/commit/841d34d702702c85c7b8cc31a185e48ce3ca0a8e
57 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=fc80c44277b3c92d808b73e9d40e120229aa4b6a
59 * jdb-abort-when-failed-to-log-metadata-buffers.patch
60 http://github.com/caglar10ur/linux-2.6.27.y/commit/7de4ddac8020dcb2078b7237650e972ecfd112cf
61 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=44519faf22ad6ce924ad0352d3dc200d9e0b66e8
63 * fix-assertion-failure-in-fs-jbd-checkpoint.patch
64 http://github.com/caglar10ur/linux-2.6.27.y/commit/268ff9d67ed3269e5d84914aabd30d06ee89f563
65 http://git.kernel.org/?p=linux/kernel/git/stable/linux-2.6.32.y.git;a=commitdiff_plain;h=d4beaf4ab5f89496f2bcf67db62ad95d99bfeff6
67 * fix-error-handling-for-checkpoint-io.patch
68 http://github.com/caglar10ur/linux-2.6.27.y/commit/e1ef6b77a95b8e02255dfa02fef06e2231e92645
69 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=4afe978530702c934dfdb11f54073136818b2119
71 * jbd-test-BH_Write_EIO-to-detect-errors-on-metadata.patch
72 http://github.com/caglar10ur/linux-2.6.27.y/commit/1a8ede62acc03d3b2baa98d02189685a4e30044f
73 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=9f818b4ac04f53458d0354950b4f229f54be4dbf
75 * handle-corrupted-orphan-list-at-mount.patch
76 http://github.com/caglar10ur/linux-2.6.27.y/commit/6003003452a5faaa0b2d1deb6356ebf8d4e2fe3f
77 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=ae76dd9a6b5bbe5315fb7028e03f68f75b8538f3
80 diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
81 index b0615c8..841f0f7 100644
84 @@ -73,6 +73,9 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
88 + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
92 * The VFS has written the file data. If the inode is unaltered
93 * then we need not start a commit.
94 diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
95 index 9885ff8..d586377 100644
96 --- a/fs/ext3/ialloc.c
97 +++ b/fs/ext3/ialloc.c
98 @@ -658,14 +658,15 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
99 unsigned long max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
100 unsigned long block_group;
102 - struct buffer_head *bitmap_bh = NULL;
103 + struct buffer_head *bitmap_bh;
104 struct inode *inode = NULL;
107 /* Error cases - e2fsck has already cleaned up for us */
109 ext3_warning(sb, __FUNCTION__,
110 "bad orphan ino %lu! e2fsck was run?", ino);
115 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
116 @@ -674,38 +675,58 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
118 ext3_warning(sb, __FUNCTION__,
119 "inode bitmap error for orphan %lu", ino);
124 /* Having the inode bit set should be a 100% indicator that this
125 * is a valid orphan (no e2fsck run on fs). Orphans also include
126 * inodes that were being truncated, so we can't check i_nlink==0.
128 - if (!ext3_test_bit(bit, bitmap_bh->b_data) ||
129 - !(inode = iget(sb, ino)) || is_bad_inode(inode) ||
130 - NEXT_ORPHAN(inode) > max_ino) {
131 - ext3_warning(sb, __FUNCTION__,
132 - "bad orphan inode %lu! e2fsck was run?", ino);
133 - printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n",
134 - bit, (unsigned long long)bitmap_bh->b_blocknr,
135 - ext3_test_bit(bit, bitmap_bh->b_data));
136 - printk(KERN_NOTICE "inode=%p\n", inode);
138 - printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
139 - is_bad_inode(inode));
140 - printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
141 - NEXT_ORPHAN(inode));
142 - printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
144 + if (!ext3_test_bit(bit, bitmap_bh->b_data))
147 + inode = iget(sb, ino);
152 + * If the orphan has i_nlink > 0 then it should be able to be
153 + * truncated, otherwise it won't be removed from the orphan list
154 + * during processing and an infinite loop will result.
156 + if (inode->i_nlink && !ext3_can_truncate(inode))
159 + if (NEXT_ORPHAN(inode) > max_ino)
165 + err = PTR_ERR(inode);
168 + ext3_warning(sb, __FUNCTION__,
169 + "bad orphan inode %lu! e2fsck was run?", ino);
170 + printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n",
171 + bit, (unsigned long long)bitmap_bh->b_blocknr,
172 + ext3_test_bit(bit, bitmap_bh->b_data));
173 + printk(KERN_NOTICE "inode=%p\n", inode);
175 + printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
176 + is_bad_inode(inode));
177 + printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
178 + NEXT_ORPHAN(inode));
179 + printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
180 + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
181 /* Avoid freeing blocks if we got a bad deleted inode */
182 - if (inode && inode->i_nlink == 0)
183 + if (inode->i_nlink == 0)
193 + return ERR_PTR(err);
196 unsigned long ext3_count_free_inodes (struct super_block * sb)
197 diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
198 index 0698ce5..cede457 100644
199 --- a/fs/ext3/inode.c
200 +++ b/fs/ext3/inode.c
201 @@ -2189,6 +2189,19 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
205 +int ext3_can_truncate(struct inode *inode)
207 + if (IS_APPEND(inode) || IS_IXORUNLINK(inode))
209 + if (S_ISREG(inode->i_mode))
211 + if (S_ISDIR(inode->i_mode))
213 + if (S_ISLNK(inode->i_mode))
214 + return !ext3_inode_is_fast_symlink(inode);
221 @@ -2233,12 +2246,7 @@ void ext3_truncate(struct inode *inode)
222 unsigned blocksize = inode->i_sb->s_blocksize;
225 - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
226 - S_ISLNK(inode->i_mode)))
228 - if (ext3_inode_is_fast_symlink(inode))
230 - if (IS_APPEND(inode) || IS_IXORUNLINK(inode))
231 + if (!ext3_can_truncate(inode))
235 @@ -2462,6 +2470,16 @@ static int __ext3_get_inode_loc(struct inode *inode,
237 if (!buffer_uptodate(bh)) {
241 + * If the buffer has the write error flag, we have failed
242 + * to write out another inode in the same block. In this
243 + * case, we don't have to read the block because we may
244 + * read the old inode data successfully.
246 + if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
247 + set_buffer_uptodate(bh);
249 if (buffer_uptodate(bh)) {
250 /* someone brought it uptodate while we waited */
252 diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
253 index 2e910db..127733b 100644
254 --- a/fs/ext3/ioctl.c
255 +++ b/fs/ext3/ioctl.c
256 @@ -215,7 +215,7 @@ flags_err:
257 case EXT3_IOC_GROUP_EXTEND: {
258 ext3_fsblk_t n_blocks_count;
259 struct super_block *sb = inode->i_sb;
263 if (!capable(CAP_SYS_RESOURCE))
265 @@ -229,15 +229,17 @@ flags_err:
267 err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
268 journal_lock_updates(EXT3_SB(sb)->s_journal);
269 - journal_flush(EXT3_SB(sb)->s_journal);
270 + err2 = journal_flush(EXT3_SB(sb)->s_journal);
271 journal_unlock_updates(EXT3_SB(sb)->s_journal);
277 case EXT3_IOC_GROUP_ADD: {
278 struct ext3_new_group_data input;
279 struct super_block *sb = inode->i_sb;
283 if (!capable(CAP_SYS_RESOURCE))
285 @@ -252,8 +254,10 @@ flags_err:
287 err = ext3_group_add(sb, &input);
288 journal_lock_updates(EXT3_SB(sb)->s_journal);
289 - journal_flush(EXT3_SB(sb)->s_journal);
290 + err2 = journal_flush(EXT3_SB(sb)->s_journal);
291 journal_unlock_updates(EXT3_SB(sb)->s_journal);
297 diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
298 index acbfa15..a2a3d92 100644
299 --- a/fs/ext3/namei.c
300 +++ b/fs/ext3/namei.c
301 @@ -1053,6 +1053,16 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
304 return ERR_PTR(-EACCES);
306 + if (is_bad_inode(inode)) {
307 + /* if bad because unlinked, something has gone wrong */
308 + if (!inode->i_nlink && printk_ratelimit())
309 + ext3_error(inode->i_sb, __FUNCTION__, "unlinked inode %lu in dir #%lu", inode->i_ino, dir->i_ino);
312 + return ERR_PTR(-ENOENT);
315 dx_propagate_tag(nd, inode);
317 return d_splice_alias(inode, dentry);
318 @@ -1089,6 +1099,11 @@ struct dentry *ext3_get_parent(struct dentry *child)
320 return ERR_PTR(-EACCES);
322 + if (is_bad_inode(inode)) {
324 + return ERR_PTR(-ENOENT);
327 parent = d_alloc_anon(inode);
330 diff --git a/fs/ext3/super.c b/fs/ext3/super.c
331 index 22244a2..ce186bc 100644
332 --- a/fs/ext3/super.c
333 +++ b/fs/ext3/super.c
334 @@ -279,7 +279,8 @@ void ext3_abort (struct super_block * sb, const char * function,
335 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
336 sb->s_flags |= MS_RDONLY;
337 EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
338 - journal_abort(EXT3_SB(sb)->s_journal, -EIO);
339 + if (EXT3_SB(sb)->s_journal)
340 + journal_abort(EXT3_SB(sb)->s_journal, -EIO);
343 void ext3_warning (struct super_block * sb, const char * function,
344 @@ -388,10 +389,14 @@ static void ext3_put_super (struct super_block * sb)
346 struct ext3_sb_info *sbi = EXT3_SB(sb);
347 struct ext3_super_block *es = sbi->s_es;
351 ext3_xattr_put_super(sb);
352 - journal_destroy(sbi->s_journal);
353 + err = journal_destroy(sbi->s_journal);
354 + sbi->s_journal = NULL;
356 + ext3_abort(sb, __func__, "Couldn't clean up the journal");
358 if (!(sb->s_flags & MS_RDONLY)) {
359 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
360 es->s_state = cpu_to_le16(sbi->s_mount_state);
361 @@ -2161,13 +2166,15 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
362 journal_t *journal = EXT3_SB(sb)->s_journal;
364 journal_lock_updates(journal);
365 - journal_flush(journal);
366 + if (journal_flush(journal) < 0)
368 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
369 sb->s_flags & MS_RDONLY) {
370 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
372 ext3_commit_super(sb, es, 1);
375 journal_unlock_updates(journal);
378 @@ -2269,6 +2276,13 @@ static void ext3_write_super_lockfs(struct super_block *sb)
379 journal_lock_updates(journal);
380 journal_flush(journal);
383 + * We don't want to clear needs_recovery flag when we failed
384 + * to flush the journal.
386 + if (journal_flush(journal) < 0)
389 /* Journal blocked and flushed, clear needs_recovery flag. */
390 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
391 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
392 diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
393 index 47552d4..803392f 100644
394 --- a/fs/jbd/checkpoint.c
395 +++ b/fs/jbd/checkpoint.c
396 @@ -93,7 +93,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
398 struct buffer_head *bh = jh2bh(jh);
400 - if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
401 + if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
402 + !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
403 JBUFFER_TRACE(jh, "remove from checkpoint list");
404 ret = __journal_remove_checkpoint(jh) + 1;
405 jbd_unlock_bh_state(bh);
406 @@ -160,21 +161,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
407 * buffers. Note that we take the buffers in the opposite ordering
408 * from the one in which they were submitted for IO.
410 + * Return 0 on success, and return <0 if some buffers have failed
411 + * to be written out.
413 * Called with j_list_lock held.
415 -static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
416 +static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
418 struct journal_head *jh;
419 struct buffer_head *bh;
424 this_tid = transaction->t_tid;
426 /* Did somebody clean up the transaction in the meanwhile? */
427 if (journal->j_checkpoint_transactions != transaction ||
428 transaction->t_tid != this_tid)
431 while (!released && transaction->t_checkpoint_io_list) {
432 jh = transaction->t_checkpoint_io_list;
434 @@ -194,6 +199,9 @@ restart:
435 spin_lock(&journal->j_list_lock);
438 + if (unlikely(buffer_write_io_error(bh)))
442 * Now in whatever state the buffer currently is, we know that
443 * it has been written out and so we can drop it from the list
444 @@ -203,6 +211,8 @@ restart:
445 journal_remove_journal_head(bh);
453 @@ -226,7 +236,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
454 * Try to flush one buffer from the checkpoint list to disk.
456 * Return 1 if something happened which requires us to abort the current
457 - * scan of the checkpoint list.
458 + * scan of the checkpoint list. Return <0 if the buffer has failed to
461 * Called with j_list_lock held and drops it if 1 is returned
462 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
463 @@ -256,6 +267,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
464 log_wait_commit(journal, tid);
466 } else if (!buffer_dirty(bh)) {
468 + if (unlikely(buffer_write_io_error(bh)))
470 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
471 BUFFER_TRACE(bh, "remove from checkpoint");
472 __journal_remove_checkpoint(jh);
473 @@ -263,7 +277,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
474 jbd_unlock_bh_state(bh);
475 journal_remove_journal_head(bh);
480 * Important: we are about to write the buffer, and
481 @@ -295,6 +308,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
482 * to disk. We submit larger chunks of data at once.
484 * The journal should be locked before calling this function.
485 + * Called with j_checkpoint_mutex held.
487 int log_do_checkpoint(journal_t *journal)
489 @@ -318,6 +332,7 @@ int log_do_checkpoint(journal_t *journal)
490 * OK, we need to start writing disk blocks. Take one transaction
494 spin_lock(&journal->j_list_lock);
495 if (!journal->j_checkpoint_transactions)
497 @@ -334,7 +349,7 @@ restart:
499 struct buffer_head *bhs[NR_BATCH];
500 struct journal_head *jh;
502 + int retry = 0, err;
504 while (!retry && transaction->t_checkpoint_list) {
505 struct buffer_head *bh;
506 @@ -347,6 +362,8 @@ restart:
509 retry = __process_buffer(journal, jh, bhs,&batch_count);
510 + if (retry < 0 && !result)
512 if (!retry && lock_need_resched(&journal->j_list_lock)){
513 spin_unlock(&journal->j_list_lock);
515 @@ -370,14 +387,18 @@ restart:
516 * Now we have cleaned up the first transaction's checkpoint
517 * list. Let's clean up the second one
519 - __wait_cp_io(journal, transaction);
520 + err = __wait_cp_io(journal, transaction);
525 spin_unlock(&journal->j_list_lock);
526 - result = cleanup_journal_tail(journal);
530 + journal_abort(journal, result);
532 + result = cleanup_journal_tail(journal);
534 + return (result < 0) ? result : 0;
538 @@ -393,8 +414,9 @@ out:
539 * This is the only part of the journaling code which really needs to be
540 * aware of transaction aborts. Checkpointing involves writing to the
541 * main filesystem area rather than to the journal, so it can proceed
542 - * even in abort state, but we must not update the journal superblock if
543 - * we have an abort error outstanding.
544 + * even in abort state, but we must not update the super block if
545 + * checkpointing may have failed. Otherwise, we would lose some metadata
546 + * buffers which should be written-back to the filesystem.
549 int cleanup_journal_tail(journal_t *journal)
550 @@ -403,6 +425,9 @@ int cleanup_journal_tail(journal_t *journal)
552 unsigned long blocknr, freed;
554 + if (is_journal_aborted(journal))
557 /* OK, work out the oldest transaction remaining in the log, and
558 * the log block it starts at.
560 @@ -602,15 +627,15 @@ int __journal_remove_checkpoint(struct journal_head *jh)
563 * There is one special case to worry about: if we have just pulled the
564 - * buffer off a committing transaction's forget list, then even if the
565 - * checkpoint list is empty, the transaction obviously cannot be
567 + * buffer off a running or committing transaction's checkpoint list,
568 + * then even if the checkpoint list is empty, the transaction obviously
569 + * cannot be dropped!
571 - * The locking here around j_committing_transaction is a bit sleazy.
572 + * The locking here around t_state is a bit sleazy.
573 * See the comment at the end of journal_commit_transaction().
575 - if (transaction == journal->j_committing_transaction) {
576 - JBUFFER_TRACE(jh, "belongs to committing transaction");
577 + if (transaction->t_state != T_FINISHED) {
578 + JBUFFER_TRACE(jh, "belongs to running/committing transaction");
582 diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
583 index a003d50..0d2785d 100644
584 --- a/fs/jbd/commit.c
585 +++ b/fs/jbd/commit.c
586 @@ -36,7 +36,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
589 * When an ext3-ordered file is truncated, it is possible that many pages are
590 - * not sucessfully freed, because they are attached to a committing transaction.
591 + * not successfully freed, because they are attached to a committing transaction.
592 * After the transaction commits, these pages are left on the LRU, with no
593 * ->mapping, and with attached buffers. These pages are trivially reclaimable
594 * by the VM, but their apparent absence upsets the VM accounting, and it makes
595 @@ -45,8 +45,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
596 * So here, we have a buffer which has just come off the forget list. Look to
597 * see if we can strip all buffers from the backing page.
599 - * Called under lock_journal(), and possibly under journal_datalist_lock. The
600 - * caller provided us with a ref against the buffer, and we drop that here.
601 + * Called under journal->j_list_lock. The caller provided us with a ref
602 + * against the buffer, and we drop that here.
604 static void release_buffer_page(struct buffer_head *bh)
606 @@ -78,6 +78,19 @@ nope:
610 + * Decrement reference counter for data buffer. If it has been marked
611 + * 'BH_Freed', release it and the page to which it belongs if possible.
613 +static void release_data_buffer(struct buffer_head *bh)
615 + if (buffer_freed(bh)) {
616 + clear_buffer_freed(bh);
617 + release_buffer_page(bh);
623 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
624 * held. For ranking reasons we must trylock. If we lose, schedule away and
625 * return 0. j_list_lock is dropped in this case.
626 @@ -173,7 +186,7 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
628 * Submit all the data buffers to disk
630 -static void journal_submit_data_buffers(journal_t *journal,
631 +static int journal_submit_data_buffers(journal_t *journal,
632 transaction_t *commit_transaction)
634 struct journal_head *jh;
635 @@ -181,6 +194,7 @@ static void journal_submit_data_buffers(journal_t *journal,
638 struct buffer_head **wbuf = journal->j_wbuf;
642 * Whenever we unlock the journal and sleep, things can get added
643 @@ -232,7 +246,7 @@ write_out_data:
646 BUFFER_TRACE(bh, "already cleaned up");
648 + release_data_buffer(bh);
651 if (locked && test_clear_buffer_dirty(bh)) {
652 @@ -254,15 +268,17 @@ write_out_data:
655 BUFFER_TRACE(bh, "writeout complete: unfile");
656 + if (unlikely(buffer_write_io_error(bh)))
658 __journal_unfile_buffer(jh);
659 jbd_unlock_bh_state(bh);
662 journal_remove_journal_head(bh);
663 - /* Once for our safety reference, once for
664 + /* One for our safety reference, other for
665 * journal_remove_journal_head() */
668 + release_data_buffer(bh);
671 if (lock_need_resched(&journal->j_list_lock)) {
672 @@ -272,6 +288,8 @@ write_out_data:
674 spin_unlock(&journal->j_list_lock);
675 journal_do_submit_data(wbuf, bufs);
681 @@ -408,27 +426,10 @@ void journal_commit_transaction(journal_t *journal)
682 jbd_debug (3, "JBD: commit phase 2\n");
685 - * First, drop modified flag: all accesses to the buffers
686 - * will be tracked for a new trasaction only -bzzz
688 - spin_lock(&journal->j_list_lock);
689 - if (commit_transaction->t_buffers) {
690 - new_jh = jh = commit_transaction->t_buffers->b_tnext;
692 - J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
693 - new_jh->b_modified == 0);
694 - new_jh->b_modified = 0;
695 - new_jh = new_jh->b_tnext;
696 - } while (new_jh != jh);
698 - spin_unlock(&journal->j_list_lock);
701 * Now start flushing things to disk, in the order they appear
702 * on the transaction lists. Data blocks go first.
705 - journal_submit_data_buffers(journal, commit_transaction);
706 + err = journal_submit_data_buffers(journal, commit_transaction);
709 * Wait for all previously submitted IO to complete.
710 @@ -443,10 +444,11 @@ void journal_commit_transaction(journal_t *journal)
711 if (buffer_locked(bh)) {
712 spin_unlock(&journal->j_list_lock);
714 - if (unlikely(!buffer_uptodate(bh)))
716 spin_lock(&journal->j_list_lock);
718 + if (unlikely(!buffer_uptodate(bh)))
721 if (!inverted_lock(journal, bh)) {
723 spin_lock(&journal->j_list_lock);
724 @@ -460,18 +462,16 @@ void journal_commit_transaction(journal_t *journal)
726 jbd_unlock_bh_state(bh);
729 + release_data_buffer(bh);
730 cond_resched_lock(&journal->j_list_lock);
732 spin_unlock(&journal->j_list_lock);
735 - __journal_abort_hard(journal);
736 + journal_abort(journal, err);
738 journal_write_revoke_records(journal, commit_transaction);
740 - jbd_debug(3, "JBD: commit phase 2\n");
743 * If we found any dirty or locked buffers, then we should have
744 * looped back up to the write_out_data label. If there weren't
745 @@ -489,6 +489,9 @@ void journal_commit_transaction(journal_t *journal)
747 commit_transaction->t_state = T_COMMIT;
749 + J_ASSERT(commit_transaction->t_nr_buffers <=
750 + commit_transaction->t_outstanding_credits);
754 while (commit_transaction->t_buffers) {
755 @@ -498,9 +501,10 @@ void journal_commit_transaction(journal_t *journal)
756 jh = commit_transaction->t_buffers;
758 /* If we're in abort mode, we just un-journal the buffer and
759 - release it for background writing. */
762 if (is_journal_aborted(journal)) {
763 + clear_buffer_jbddirty(jh2bh(jh));
764 JBUFFER_TRACE(jh, "journal is aborting: refile");
765 journal_refile_buffer(journal, jh);
766 /* If that was the last one, we need to clean up
767 @@ -524,7 +528,7 @@ void journal_commit_transaction(journal_t *journal)
769 descriptor = journal_get_descriptor_buffer(journal);
771 - __journal_abort_hard(journal);
772 + journal_abort(journal, -EIO);
776 @@ -557,7 +561,7 @@ void journal_commit_transaction(journal_t *journal)
777 and repeat this loop: we'll fall into the
778 refile-on-abort condition above. */
780 - __journal_abort_hard(journal);
781 + journal_abort(journal, err);
785 @@ -742,13 +746,16 @@ wait_for_iobuf:
786 /* AKPM: bforget here */
790 + journal_abort(journal, err);
792 jbd_debug(3, "JBD: commit phase 6\n");
794 if (journal_write_commit_record(journal, commit_transaction))
798 - __journal_abort_hard(journal);
799 + journal_abort(journal, err);
801 /* End of a transaction! Finally, we can do checkpoint
802 processing: any buffers committed as a result of this
803 @@ -832,6 +839,8 @@ restart_loop:
804 if (buffer_jbddirty(bh)) {
805 JBUFFER_TRACE(jh, "add to new checkpointing trans");
806 __journal_insert_checkpoint(jh, commit_transaction);
807 + if (is_journal_aborted(journal))
808 + clear_buffer_jbddirty(bh);
809 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
810 __journal_refile_buffer(jh);
811 jbd_unlock_bh_state(bh);
812 @@ -858,10 +867,10 @@ restart_loop:
814 spin_unlock(&journal->j_list_lock);
816 - * This is a bit sleazy. We borrow j_list_lock to protect
817 - * journal->j_committing_transaction in __journal_remove_checkpoint.
818 - * Really, __journal_remove_checkpoint should be using j_state_lock but
819 - * it's a bit hassle to hold that across __journal_remove_checkpoint
820 + * This is a bit sleazy. We use j_list_lock to protect transition
821 + * of a transaction into T_FINISHED state and calling
822 + * __journal_drop_transaction(). Otherwise we could race with
823 + * other checkpointing code processing the transaction...
825 spin_lock(&journal->j_state_lock);
826 spin_lock(&journal->j_list_lock);
827 diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
828 index 46fe743..8e937fc 100644
829 --- a/fs/jbd/journal.c
830 +++ b/fs/jbd/journal.c
831 @@ -1128,9 +1128,12 @@ recovery_error:
833 * Release a journal_t structure once it is no longer in use by the
835 + * Return <0 if we couldn't clean up the journal.
837 -void journal_destroy(journal_t *journal)
838 +int journal_destroy(journal_t *journal)
842 /* Wait for the commit thread to wake up and die. */
843 journal_kill_thread(journal);
845 @@ -1153,11 +1156,16 @@ void journal_destroy(journal_t *journal)
846 J_ASSERT(journal->j_checkpoint_transactions == NULL);
847 spin_unlock(&journal->j_list_lock);
849 - /* We can now mark the journal as empty. */
850 - journal->j_tail = 0;
851 - journal->j_tail_sequence = ++journal->j_transaction_sequence;
852 if (journal->j_sb_buffer) {
853 - journal_update_superblock(journal, 1);
854 + if (!is_journal_aborted(journal)) {
855 + /* We can now mark the journal as empty. */
856 + journal->j_tail = 0;
857 + journal->j_tail_sequence =
858 + ++journal->j_transaction_sequence;
859 + journal_update_superblock(journal, 1);
863 brelse(journal->j_sb_buffer);
866 @@ -1167,6 +1175,8 @@ void journal_destroy(journal_t *journal)
867 journal_destroy_revoke(journal);
868 kfree(journal->j_wbuf);
875 @@ -1366,10 +1376,16 @@ int journal_flush(journal_t *journal)
876 spin_lock(&journal->j_list_lock);
877 while (!err && journal->j_checkpoint_transactions != NULL) {
878 spin_unlock(&journal->j_list_lock);
879 + mutex_lock(&journal->j_checkpoint_mutex);
880 err = log_do_checkpoint(journal);
881 + mutex_unlock(&journal->j_checkpoint_mutex);
882 spin_lock(&journal->j_list_lock);
884 spin_unlock(&journal->j_list_lock);
886 + if (is_journal_aborted(journal))
889 cleanup_journal_tail(journal);
891 /* Finally, mark the journal as really needing no recovery.
892 @@ -1391,7 +1407,7 @@ int journal_flush(journal_t *journal)
893 J_ASSERT(journal->j_head == journal->j_tail);
894 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
895 spin_unlock(&journal->j_state_lock);
901 diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
902 index 2a5f4b8..66ae0e5 100644
903 --- a/fs/jbd/recovery.c
904 +++ b/fs/jbd/recovery.c
905 @@ -223,7 +223,7 @@ do { \
907 int journal_recover(journal_t *journal)
911 journal_superblock_t * sb;
913 struct recovery_info info;
914 @@ -261,7 +261,10 @@ int journal_recover(journal_t *journal)
915 journal->j_transaction_sequence = ++info.end_transaction;
917 journal_clear_revoke(journal);
918 - sync_blockdev(journal->j_fs_dev);
919 + err2 = sync_blockdev(journal->j_fs_dev);
926 @@ -478,7 +481,7 @@ static int do_one_pass(journal_t *journal,
927 memcpy(nbh->b_data, obh->b_data,
928 journal->j_blocksize);
929 if (flags & JFS_FLAG_ESCAPE) {
930 - *((__be32 *)bh->b_data) =
931 + *((__be32 *)nbh->b_data) =
932 cpu_to_be32(JFS_MAGIC_NUMBER);
935 diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
936 index 772b653..1b9a804 100644
937 --- a/fs/jbd/transaction.c
938 +++ b/fs/jbd/transaction.c
939 @@ -600,6 +600,13 @@ repeat:
940 jh->b_next_transaction == transaction)
944 + * this is the first time this transaction is touching this buffer,
945 + * reset the modified flag
947 + jh->b_modified = 0;
951 * If there is already a copy-out version of this buffer, then we don't
952 * need to make another one
953 @@ -812,9 +819,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
955 if (jh->b_transaction == NULL) {
956 jh->b_transaction = transaction;
958 + /* first access by this transaction */
959 + jh->b_modified = 0;
961 JBUFFER_TRACE(jh, "file as BJ_Reserved");
962 __journal_file_buffer(jh, transaction, BJ_Reserved);
963 } else if (jh->b_transaction == journal->j_committing_transaction) {
964 + /* first access by this transaction */
965 + jh->b_modified = 0;
966 JBUFFER_TRACE(jh, "set next transaction");
967 jh->b_next_transaction = transaction;
969 @@ -1213,6 +1226,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
970 struct journal_head *jh;
971 int drop_reserve = 0;
973 + int was_modified = 0;
975 BUFFER_TRACE(bh, "entry");
977 @@ -1231,6 +1245,9 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
981 + /* keep track of whether or not this transaction modified us */
982 + was_modified = jh->b_modified;
985 * The buffer's going from the transaction, we must drop
986 * all references -bzzz
987 @@ -1248,7 +1265,12 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
989 JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
993 + * we only want to drop a reference if this transaction
994 + * modified the buffer
1000 * We are no longer going to journal this buffer.
1001 @@ -1288,7 +1310,12 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
1002 if (jh->b_next_transaction) {
1003 J_ASSERT(jh->b_next_transaction == transaction);
1004 jh->b_next_transaction = NULL;
1007 + * only drop a reference if this transaction modified
1015 @@ -2058,7 +2085,7 @@ void __journal_refile_buffer(struct journal_head *jh)
1016 jh->b_transaction = jh->b_next_transaction;
1017 jh->b_next_transaction = NULL;
1018 __journal_file_buffer(jh, jh->b_transaction,
1019 - was_dirty ? BJ_Metadata : BJ_Reserved);
1020 + jh->b_modified ? BJ_Metadata : BJ_Reserved);
1021 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
1024 diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
1025 index ff56e39..8c51469 100644
1026 --- a/include/linux/ext3_fs.h
1027 +++ b/include/linux/ext3_fs.h
1028 @@ -827,6 +827,7 @@ extern void ext3_discard_reservation (struct inode *);
1029 extern void ext3_dirty_inode(struct inode *);
1030 extern int ext3_change_inode_journal_flag(struct inode *, int);
1031 extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *);
1032 +extern int ext3_can_truncate(struct inode *inode);
1033 extern void ext3_truncate (struct inode *);
1034 extern void ext3_set_inode_flags(struct inode *);
1035 extern void ext3_get_inode_flags(struct ext3_inode_info *);
1036 diff --git a/include/linux/jbd.h b/include/linux/jbd.h
1037 index 4527375..6bc0e4f 100644
1038 --- a/include/linux/jbd.h
1039 +++ b/include/linux/jbd.h
1040 @@ -446,6 +446,8 @@ struct transaction_s
1042 * Transaction's current state
1043 * [no locking - only kjournald alters this]
1044 + * [j_list_lock] guards transition of a transaction into T_FINISHED
1045 + * state and subsequent call of __journal_drop_transaction()
1046 * FIXME: needs barriers
1047 * KLUDGE: [use j_state_lock]
1049 @@ -924,7 +926,7 @@ extern int journal_set_features
1050 (journal_t *, unsigned long, unsigned long, unsigned long);
1051 extern int journal_create (journal_t *);
1052 extern int journal_load (journal_t *journal);
1053 -extern void journal_destroy (journal_t *);
1054 +extern int journal_destroy (journal_t *);
1055 extern int journal_recover (journal_t *journal);
1056 extern int journal_wipe (journal_t *, int);
1057 extern int journal_skip_recovery (journal_t *);