/*
 *  linux/fs/ext3/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Goal-directed block allocation by Stephen Tweedie
 *	(sct@redhat.com), 1993, 1998
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
 */
25 #include <linux/module.h>
27 #include <linux/time.h>
28 #include <linux/ext3_jbd.h>
29 #include <linux/jbd.h>
30 #include <linux/smp_lock.h>
31 #include <linux/highuid.h>
32 #include <linux/pagemap.h>
33 #include <linux/quotaops.h>
34 #include <linux/string.h>
35 #include <linux/buffer_head.h>
36 #include <linux/writeback.h>
37 #include <linux/mpage.h>
38 #include <linux/uio.h>
43 * Test whether an inode is a fast symlink.
45 static inline int ext3_inode_is_fast_symlink(struct inode *inode)
47 int ea_blocks = EXT3_I(inode)->i_file_acl ?
48 (inode->i_sb->s_blocksize >> 9) : 0;
50 return (S_ISLNK(inode->i_mode) &&
51 inode->i_blocks - ea_blocks == 0);
54 /* The ext3 forget function must perform a revoke if we are freeing data
55 * which has been journaled. Metadata (eg. indirect blocks) must be
56 * revoked in all cases.
58 * "bh" may be NULL: a metadata block may have been freed from memory
59 * but there may still be a record of it in the journal, and that record
60 * still needs to be revoked.
63 int ext3_forget(handle_t *handle, int is_metadata,
64 struct inode *inode, struct buffer_head *bh,
71 BUFFER_TRACE(bh, "enter");
73 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
75 bh, is_metadata, inode->i_mode,
76 test_opt(inode->i_sb, DATA_FLAGS));
78 /* Never use the revoke function if we are doing full data
79 * journaling: there is no need to, and a V1 superblock won't
80 * support it. Otherwise, only skip the revoke on un-journaled
83 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
84 (!is_metadata && !ext3_should_journal_data(inode))) {
86 BUFFER_TRACE(bh, "call journal_forget");
87 ext3_journal_forget(handle, bh);
93 * data!=journal && (is_metadata || should_journal_data(inode))
95 BUFFER_TRACE(bh, "call ext3_journal_revoke");
96 err = ext3_journal_revoke(handle, blocknr, bh);
98 ext3_abort(inode->i_sb, __FUNCTION__,
99 "error %d when attempting revoke", err);
100 BUFFER_TRACE(bh, "exit");
105 * Work out how many blocks we need to progress with the next chunk of a
106 * truncate transaction.
109 static unsigned long blocks_for_truncate(struct inode *inode)
111 unsigned long needed;
113 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
115 /* Give ourselves just enough room to cope with inodes in which
116 * i_blocks is corrupt: we've seen disk corruptions in the past
117 * which resulted in random data in an inode which looked enough
118 * like a regular file for ext3 to try to delete it. Things
119 * will go a bit crazy if that happens, but at least we should
120 * try not to panic the whole kernel. */
124 /* But we need to bound the transaction so we don't overflow the
126 if (needed > EXT3_MAX_TRANS_DATA)
127 needed = EXT3_MAX_TRANS_DATA;
129 return EXT3_DATA_TRANS_BLOCKS + needed;
133 * Truncate transactions can be complex and absolutely huge. So we need to
134 * be able to restart the transaction at a conventient checkpoint to make
135 * sure we don't overflow the journal.
137 * start_transaction gets us a new handle for a truncate transaction,
138 * and extend_transaction tries to extend the existing one a bit. If
139 * extend fails, we need to propagate the failure up and restart the
140 * transaction in the top-level truncate loop. --sct
143 static handle_t *start_transaction(struct inode *inode)
147 result = ext3_journal_start(inode, blocks_for_truncate(inode));
151 ext3_std_error(inode->i_sb, PTR_ERR(result));
156 * Try to extend this transaction for the purposes of truncation.
158 * Returns 0 if we managed to create more room. If we can't create more
159 * room, and the transaction must be restarted we return 1.
161 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
163 if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
165 if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
171 * Restart the transaction associated with *handle. This does a commit,
172 * so before we call here everything must be consistently dirtied against
175 static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
177 jbd_debug(2, "restarting handle %p\n", handle);
178 return ext3_journal_restart(handle, blocks_for_truncate(inode));
/*
 * Called at each iput().
 *
 * The inode may be "bad" if ext3_read_inode() saw an error from
 * ext3_get_inode(), so we need to check that to avoid freeing random disk
 * blocks.
 */
void ext3_put_inode(struct inode *inode)
{
	if (!is_bad_inode(inode))
		ext3_discard_prealloc(inode);
}
195 * Called at the last iput() if i_nlink is zero.
197 void ext3_delete_inode (struct inode * inode)
201 if (is_bad_inode(inode))
204 handle = start_transaction(inode);
205 if (IS_ERR(handle)) {
206 /* If we're going to skip the normal cleanup, we still
207 * need to make sure that the in-core orphan linked list
208 * is properly cleaned up. */
209 ext3_orphan_del(NULL, inode);
217 ext3_truncate(inode);
219 * Kill off the orphan record which ext3_truncate created.
220 * AKPM: I think this can be inside the above `if'.
221 * Note that ext3_orphan_del() has to be able to cope with the
222 * deletion of a non-existent orphan - this is because we don't
223 * know if ext3_truncate() actually created an orphan record.
224 * (Well, we could do this if we need to, but heck - it works)
226 ext3_orphan_del(handle, inode);
227 EXT3_I(inode)->i_dtime = get_seconds();
230 * One subtle ordering requirement: if anything has gone wrong
231 * (transaction abort, IO errors, whatever), then we can still
232 * do these next steps (the fs will already have been marked as
233 * having errors), but we can't free the inode if the mark_dirty
236 if (ext3_mark_inode_dirty(handle, inode))
237 /* If that failed, just do the required in-core inode clear. */
240 ext3_free_inode(handle, inode);
241 ext3_journal_stop(handle);
244 clear_inode(inode); /* We must guarantee clearing of inode... */
/*
 * Release any blocks the inode has preallocated but not yet used.
 * Compiled out unless EXT3_PREALLOCATE is configured.
 */
void ext3_discard_prealloc (struct inode * inode)
{
#ifdef EXT3_PREALLOCATE
	struct ext3_inode_info *ei = EXT3_I(inode);
	/* Writer: ->i_prealloc* */
	if (ei->i_prealloc_count) {
		unsigned short total = ei->i_prealloc_count;
		unsigned long block = ei->i_prealloc_block;
		ei->i_prealloc_count = 0;
		ei->i_prealloc_block = 0;
		/* Writer: end */
		ext3_free_blocks (inode, block, total);
	}
#endif
}
263 static int ext3_alloc_block (handle_t *handle,
264 struct inode * inode, unsigned long goal, int *err)
266 unsigned long result;
268 #ifdef EXT3_PREALLOCATE
270 static unsigned long alloc_hits, alloc_attempts;
272 struct ext3_inode_info *ei = EXT3_I(inode);
273 /* Writer: ->i_prealloc* */
274 if (ei->i_prealloc_count &&
275 (goal == ei->i_prealloc_block ||
276 goal + 1 == ei->i_prealloc_block))
278 result = ei->i_prealloc_block++;
279 ei->i_prealloc_count--;
281 ext3_debug ("preallocation hit (%lu/%lu).\n",
282 ++alloc_hits, ++alloc_attempts);
284 ext3_discard_prealloc (inode);
285 ext3_debug ("preallocation miss (%lu/%lu).\n",
286 alloc_hits, ++alloc_attempts);
287 if (S_ISREG(inode->i_mode))
288 result = ext3_new_block (inode, goal,
289 &ei->i_prealloc_count,
290 &ei->i_prealloc_block, err);
292 result = ext3_new_block(inode, goal, NULL, NULL, err);
294 * AKPM: this is somewhat sticky. I'm not surprised it was
295 * disabled in 2.2's ext3. Need to integrate b_committed_data
296 * guarding with preallocation, if indeed preallocation is
301 result = ext3_new_block(handle, inode, goal, NULL, NULL, err);
310 struct buffer_head *bh;
313 static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v)
315 p->key = *(p->p = v);
319 static inline int verify_chain(Indirect *from, Indirect *to)
321 while (from <= to && from->key == *from->p)
327 * ext3_block_to_path - parse the block number into array of offsets
328 * @inode: inode in question (we are only interested in its superblock)
329 * @i_block: block number to be parsed
330 * @offsets: array to store the offsets in
331 * @boundary: set this non-zero if the referred-to block is likely to be
332 * followed (on disk) by an indirect block.
334 * To store the locations of file's data ext3 uses a data structure common
335 * for UNIX filesystems - tree of pointers anchored in the inode, with
336 * data blocks at leaves and indirect blocks in intermediate nodes.
337 * This function translates the block number into path in that tree -
338 * return value is the path length and @offsets[n] is the offset of
339 * pointer to (n+1)th node in the nth one. If @block is out of range
340 * (negative or too large) warning is printed and zero returned.
342 * Note: function doesn't find node addresses, so no IO is needed. All
343 * we need to know is the capacity of indirect blocks (taken from the
348 * Portability note: the last comparison (check that we fit into triple
349 * indirect block) is spelled differently, because otherwise on an
350 * architecture with 32-bit longs and 8Kb pages we might get into trouble
351 * if our filesystem had 8Kb blocks. We might use long long, but that would
352 * kill us on x86. Oh, well, at least the sign propagation does not matter -
353 * i_block would have to be negative in the very beginning, so we would not
357 static int ext3_block_to_path(struct inode *inode,
358 long i_block, int offsets[4], int *boundary)
360 int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
361 int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
362 const long direct_blocks = EXT3_NDIR_BLOCKS,
363 indirect_blocks = ptrs,
364 double_blocks = (1 << (ptrs_bits * 2));
369 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
370 } else if (i_block < direct_blocks) {
371 offsets[n++] = i_block;
372 final = direct_blocks;
373 } else if ( (i_block -= direct_blocks) < indirect_blocks) {
374 offsets[n++] = EXT3_IND_BLOCK;
375 offsets[n++] = i_block;
377 } else if ((i_block -= indirect_blocks) < double_blocks) {
378 offsets[n++] = EXT3_DIND_BLOCK;
379 offsets[n++] = i_block >> ptrs_bits;
380 offsets[n++] = i_block & (ptrs - 1);
382 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
383 offsets[n++] = EXT3_TIND_BLOCK;
384 offsets[n++] = i_block >> (ptrs_bits * 2);
385 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
386 offsets[n++] = i_block & (ptrs - 1);
389 ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
392 *boundary = (i_block & (ptrs - 1)) == (final - 1);
397 * ext3_get_branch - read the chain of indirect blocks leading to data
398 * @inode: inode in question
399 * @depth: depth of the chain (1 - direct pointer, etc.)
400 * @offsets: offsets of pointers in inode/indirect blocks
401 * @chain: place to store the result
402 * @err: here we store the error value
404 * Function fills the array of triples <key, p, bh> and returns %NULL
405 * if everything went OK or the pointer to the last filled triple
406 * (incomplete one) otherwise. Upon the return chain[i].key contains
407 * the number of (i+1)-th block in the chain (as it is stored in memory,
408 * i.e. little-endian 32-bit), chain[i].p contains the address of that
409 * number (it points into struct inode for i==0 and into the bh->b_data
410 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
411 * block for i>0 and NULL for i==0. In other words, it holds the block
412 * numbers of the chain, addresses they were taken from (and where we can
413 * verify that chain did not change) and buffer_heads hosting these
416 * Function stops when it stumbles upon zero pointer (absent block)
417 * (pointer to last triple returned, *@err == 0)
418 * or when it gets an IO error reading an indirect block
419 * (ditto, *@err == -EIO)
420 * or when it notices that chain had been changed while it was reading
421 * (ditto, *@err == -EAGAIN)
422 * or when it reads all @depth-1 indirect blocks successfully and finds
423 * the whole chain, all way to the data (returns %NULL, *err == 0).
425 static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
426 Indirect chain[4], int *err)
428 struct super_block *sb = inode->i_sb;
430 struct buffer_head *bh;
433 /* i_data is not going away, no lock needed */
434 add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
438 bh = sb_bread(sb, le32_to_cpu(p->key));
441 /* Reader: pointers */
442 if (!verify_chain(chain, p))
444 add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
462 * ext3_find_near - find a place for allocation with sufficient locality
464 * @ind: descriptor of indirect block.
466 * This function returns the prefered place for block allocation.
467 * It is used when heuristic for sequential allocation fails.
469 * + if there is a block to the left of our position - allocate near it.
470 * + if pointer will live in indirect block - allocate near that block.
471 * + if pointer will live in inode - allocate in the same
474 * In the latter case we colour the starting block by the callers PID to
475 * prevent it from clashing with concurrent allocations for a different inode
476 * in the same block group. The PID is used here so that functionally related
477 * files will be close-by on-disk.
479 * Caller must make sure that @ind is valid and will stay that way.
482 static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
484 struct ext3_inode_info *ei = EXT3_I(inode);
485 u32 *start = ind->bh ? (u32*) ind->bh->b_data : ei->i_data;
487 unsigned long bg_start;
488 unsigned long colour;
490 /* Try to find previous block */
491 for (p = ind->p - 1; p >= start; p--)
493 return le32_to_cpu(*p);
495 /* No such thing, so let's try location of indirect block */
497 return ind->bh->b_blocknr;
500 * It is going to be refered from inode itself? OK, just put it into
501 * the same cylinder group then.
503 bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
504 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
505 colour = (current->pid % 16) *
506 (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
507 return bg_start + colour;
511 * ext3_find_goal - find a prefered place for allocation.
513 * @block: block we want
514 * @chain: chain of indirect blocks
515 * @partial: pointer to the last triple within a chain
516 * @goal: place to store the result.
518 * Normally this function find the prefered place for block allocation,
519 * stores it in *@goal and returns zero. If the branch had been changed
520 * under us we return -EAGAIN.
523 static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
524 Indirect *partial, unsigned long *goal)
526 struct ext3_inode_info *ei = EXT3_I(inode);
527 /* Writer: ->i_next_alloc* */
528 if (block == ei->i_next_alloc_block + 1) {
529 ei->i_next_alloc_block++;
530 ei->i_next_alloc_goal++;
533 /* Reader: pointers, ->i_next_alloc* */
534 if (verify_chain(chain, partial)) {
536 * try the heuristic for sequential allocation,
537 * failing that at least try to get decent locality.
539 if (block == ei->i_next_alloc_block)
540 *goal = ei->i_next_alloc_goal;
542 *goal = ext3_find_near(inode, partial);
550 * ext3_alloc_branch - allocate and set up a chain of blocks.
552 * @num: depth of the chain (number of blocks to allocate)
553 * @offsets: offsets (in the blocks) to store the pointers to next.
554 * @branch: place to store the chain in.
556 * This function allocates @num blocks, zeroes out all but the last one,
557 * links them into chain and (if we are synchronous) writes them to disk.
558 * In other words, it prepares a branch that can be spliced onto the
559 * inode. It stores the information about that chain in the branch[], in
560 * the same format as ext3_get_branch() would do. We are calling it after
561 * we had read the existing part of chain and partial points to the last
562 * triple of that (one with zero ->key). Upon the exit we have the same
563 * picture as after the successful ext3_get_block(), excpet that in one
564 * place chain is disconnected - *branch->p is still zero (we did not
565 * set the last link), but branch->key contains the number that should
566 * be placed into *branch->p to fill that gap.
568 * If allocation fails we free all blocks we've allocated (and forget
569 * their buffer_heads) and return the error value the from failed
570 * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
571 * as described above and return 0.
574 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
580 int blocksize = inode->i_sb->s_blocksize;
584 int parent = ext3_alloc_block(handle, inode, goal, &err);
586 branch[0].key = cpu_to_le32(parent);
588 for (n = 1; n < num; n++) {
589 struct buffer_head *bh;
590 /* Allocate the next block */
591 int nr = ext3_alloc_block(handle, inode, parent, &err);
594 branch[n].key = cpu_to_le32(nr);
598 * Get buffer_head for parent block, zero it out
599 * and set the pointer to new one, then send
602 bh = sb_getblk(inode->i_sb, parent);
605 BUFFER_TRACE(bh, "call get_create_access");
606 err = ext3_journal_get_create_access(handle, bh);
613 memset(bh->b_data, 0, blocksize);
614 branch[n].p = (u32*) bh->b_data + offsets[n];
615 *branch[n].p = branch[n].key;
616 BUFFER_TRACE(bh, "marking uptodate");
617 set_buffer_uptodate(bh);
620 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
621 err = ext3_journal_dirty_metadata(handle, bh);
631 /* Allocation failed, free what we already allocated */
632 for (i = 1; i < keys; i++) {
633 BUFFER_TRACE(branch[i].bh, "call journal_forget");
634 ext3_journal_forget(handle, branch[i].bh);
636 for (i = 0; i < keys; i++)
637 ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
642 * ext3_splice_branch - splice the allocated branch onto inode.
644 * @block: (logical) number of block we are adding
645 * @chain: chain of indirect blocks (with a missing link - see
647 * @where: location of missing link
648 * @num: number of blocks we are adding
650 * This function verifies that chain (up to the missing link) had not
651 * changed, fills the missing link and does all housekeeping needed in
652 * inode (->i_blocks, etc.). In case of success we end up with the full
653 * chain to new block and return 0. Otherwise (== chain had been changed)
654 * we free the new blocks (forgetting their buffer_heads, indeed) and
658 static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
659 Indirect chain[4], Indirect *where, int num)
663 struct ext3_inode_info *ei = EXT3_I(inode);
666 * If we're splicing into a [td]indirect block (as opposed to the
667 * inode) then we need to get write access to the [td]indirect block
671 BUFFER_TRACE(where->bh, "get_write_access");
672 err = ext3_journal_get_write_access(handle, where->bh);
676 /* Verify that place we are splicing to is still there and vacant */
678 /* Writer: pointers, ->i_next_alloc* */
679 if (!verify_chain(chain, where-1) || *where->p)
685 *where->p = where->key;
686 ei->i_next_alloc_block = block;
687 ei->i_next_alloc_goal = le32_to_cpu(where[num-1].key);
690 /* We are done with atomic stuff, now do the rest of housekeeping */
692 inode->i_ctime = CURRENT_TIME;
693 ext3_mark_inode_dirty(handle, inode);
695 /* had we spliced it onto indirect block? */
698 * akpm: If we spliced it onto an indirect block, we haven't
699 * altered the inode. Note however that if it is being spliced
700 * onto an indirect block at the very end of the file (the
701 * file is growing) then we *will* alter the inode to reflect
702 * the new i_size. But that is not done here - it is done in
703 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
705 jbd_debug(5, "splicing indirect only\n");
706 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
707 err = ext3_journal_dirty_metadata(handle, where->bh);
712 * OK, we spliced it into the inode itself on a direct block.
713 * Inode was dirtied above.
715 jbd_debug(5, "splicing direct\n");
721 * AKPM: if where[i].bh isn't part of the current updating
722 * transaction then we explode nastily. Test this code path.
724 jbd_debug(1, "the chain changed: try again\n");
728 for (i = 1; i < num; i++) {
729 BUFFER_TRACE(where[i].bh, "call journal_forget");
730 ext3_journal_forget(handle, where[i].bh);
732 /* For the normal collision cleanup case, we free up the blocks.
733 * On genuine filesystem errors we don't even think about doing
736 for (i = 0; i < num; i++)
737 ext3_free_blocks(handle, inode,
738 le32_to_cpu(where[i].key), 1);
743 * Allocation strategy is simple: if we have to allocate something, we will
744 * have to go the whole way to leaf. So let's do it before attaching anything
745 * to tree, set linkage between the newborn blocks, write them if sync is
746 * required, recheck the path, free and repeat if check fails, otherwise
747 * set the last missing link (that will protect us from any truncate-generated
748 * removals - all blocks on the path are immune now) and possibly force the
749 * write on the parent block.
750 * That has a nice additional property: no special recovery from the failed
751 * allocations is needed - we simply release blocks and do not touch anything
752 * reachable from inode.
754 * akpm: `handle' can be NULL if create == 0.
756 * The BKL may not be held on entry here. Be sure to take it early.
760 ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
761 struct buffer_head *bh_result, int create, int extend_disksize)
770 int depth = ext3_block_to_path(inode, iblock, offsets, &boundary);
771 struct ext3_inode_info *ei = EXT3_I(inode);
773 J_ASSERT(handle != NULL || create == 0);
779 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
781 /* Simplest case - block found, no allocation needed */
783 clear_buffer_new(bh_result);
785 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
787 set_buffer_boundary(bh_result);
788 /* Clean up and exit */
789 partial = chain+depth-1; /* the whole chain */
793 /* Next simple case - plain lookup or failed read of indirect block */
794 if (!create || err == -EIO) {
796 while (partial > chain) {
797 BUFFER_TRACE(partial->bh, "call brelse");
801 BUFFER_TRACE(bh_result, "returned");
807 * Indirect block might be removed by truncate while we were
808 * reading it. Handling of that case (forget what we've got and
809 * reread) is taken out of the main path.
815 down(&ei->truncate_sem);
816 if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0) {
817 up(&ei->truncate_sem);
821 left = (chain + depth) - partial;
824 * Block out ext3_truncate while we alter the tree
826 err = ext3_alloc_branch(handle, inode, left, goal,
827 offsets+(partial-chain), partial);
829 /* The ext3_splice_branch call will free and forget any buffers
830 * on the new chain if there is a failure, but that risks using
831 * up transaction credits, especially for bitmaps where the
832 * credits cannot be returned. Can we handle this somehow? We
833 * may need to return -EAGAIN upwards in the worst case. --sct */
835 err = ext3_splice_branch(handle, inode, iblock, chain,
837 /* i_disksize growing is protected by truncate_sem
838 * don't forget to protect it if you're about to implement
839 * concurrent ext3_get_block() -bzzz */
840 if (!err && extend_disksize && inode->i_size > ei->i_disksize)
841 ei->i_disksize = inode->i_size;
842 up(&ei->truncate_sem);
848 set_buffer_new(bh_result);
852 while (partial > chain) {
853 jbd_debug(1, "buffer chain changed, retrying\n");
854 BUFFER_TRACE(partial->bh, "brelsing");
861 static int ext3_get_block(struct inode *inode, sector_t iblock,
862 struct buffer_head *bh_result, int create)
864 handle_t *handle = NULL;
868 handle = ext3_journal_current_handle();
869 J_ASSERT(handle != 0);
871 ret = ext3_get_block_handle(handle, inode, iblock,
872 bh_result, create, 1);
876 #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
879 ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
880 unsigned long max_blocks, struct buffer_head *bh_result,
883 handle_t *handle = journal_current_handle();
887 goto get_block; /* A read */
889 if (handle->h_transaction->t_state == T_LOCKED) {
891 * Huge direct-io writes can hold off commits for long
892 * periods of time. Let this commit run.
894 ext3_journal_stop(handle);
895 handle = ext3_journal_start(inode, DIO_CREDITS);
897 ret = PTR_ERR(handle);
901 if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
903 * Getting low on buffer credits...
905 ret = ext3_journal_extend(handle, DIO_CREDITS);
908 * Couldn't extend the transaction. Start a new one.
910 ret = ext3_journal_restart(handle, DIO_CREDITS);
916 ret = ext3_get_block_handle(handle, inode, iblock,
917 bh_result, create, 0);
918 bh_result->b_size = (1 << inode->i_blkbits);
923 * `handle' can be NULL if create is zero
925 struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
926 long block, int create, int * errp)
928 struct buffer_head dummy;
931 J_ASSERT(handle != NULL || create == 0);
934 dummy.b_blocknr = -1000;
935 buffer_trace_init(&dummy.b_history);
936 *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
937 if (!*errp && buffer_mapped(&dummy)) {
938 struct buffer_head *bh;
939 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
940 if (buffer_new(&dummy)) {
941 J_ASSERT(create != 0);
942 J_ASSERT(handle != 0);
944 /* Now that we do not always journal data, we
945 should keep in mind whether this should
946 always journal the new buffer as metadata.
947 For now, regular file writes use
948 ext3_get_block instead, so it's not a
951 BUFFER_TRACE(bh, "call get_create_access");
952 fatal = ext3_journal_get_create_access(handle, bh);
953 if (!fatal && !buffer_uptodate(bh)) {
954 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
955 set_buffer_uptodate(bh);
958 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
959 err = ext3_journal_dirty_metadata(handle, bh);
963 BUFFER_TRACE(bh, "not a new buffer");
975 struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
976 int block, int create, int *err)
978 struct buffer_head * bh;
981 prev_blocks = inode->i_blocks;
983 bh = ext3_getblk (handle, inode, block, create, err);
986 #ifdef EXT3_PREALLOCATE
988 * If the inode has grown, and this is a directory, then use a few
989 * more of the preallocated blocks to keep directory fragmentation
990 * down. The preallocated blocks are guaranteed to be contiguous.
993 S_ISDIR(inode->i_mode) &&
994 inode->i_blocks > prev_blocks &&
995 EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
996 EXT3_FEATURE_COMPAT_DIR_PREALLOC)) {
998 struct buffer_head *tmp_bh;
1001 EXT3_I(inode)->i_prealloc_count &&
1002 i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks;
1005 * ext3_getblk will zero out the contents of the
1008 tmp_bh = ext3_getblk(handle, inode,
1009 block+i, create, err);
1018 if (buffer_uptodate(bh))
1020 ll_rw_block (READ, 1, &bh);
1021 wait_on_buffer (bh);
1022 if (buffer_uptodate(bh))
1029 static int walk_page_buffers( handle_t *handle,
1030 struct buffer_head *head,
1034 int (*fn)( handle_t *handle,
1035 struct buffer_head *bh))
1037 struct buffer_head *bh;
1038 unsigned block_start, block_end;
1039 unsigned blocksize = head->b_size;
1041 struct buffer_head *next;
1043 for ( bh = head, block_start = 0;
1044 ret == 0 && (bh != head || !block_start);
1045 block_start = block_end, bh = next)
1047 next = bh->b_this_page;
1048 block_end = block_start + blocksize;
1049 if (block_end <= from || block_start >= to) {
1050 if (partial && !buffer_uptodate(bh))
1054 err = (*fn)(handle, bh);
1062 * To preserve ordering, it is essential that the hole instantiation and
1063 * the data write be encapsulated in a single transaction. We cannot
1064 * close off a transaction and start a new one between the ext3_get_block()
1065 * and the commit_write(). So doing the journal_start at the start of
1066 * prepare_write() is the right place.
1068 * Also, this function can nest inside ext3_writepage() ->
1069 * block_write_full_page(). In that case, we *know* that ext3_writepage()
1070 * has generated enough buffer credits to do the whole page. So we won't
1071 * block on the journal in that case, which is good, because the caller may
1074 * By accident, ext3 can be reentered when a transaction is open via
1075 * quota file writes. If we were to commit the transaction while thus
1076 * reentered, there can be a deadlock - we would be holding a quota
1077 * lock, and the commit would never complete if another thread had a
1078 * transaction open and was blocking on the quota lock - a ranking
1081 * So what we do is to rely on the fact that journal_stop/journal_start
1082 * will _not_ run commit under these circumstances because handle->h_ref
1083 * is elevated. We'll still have enough credits for the tiny quotafile
1087 static int do_journal_get_write_access(handle_t *handle,
1088 struct buffer_head *bh)
1090 if (!buffer_mapped(bh) || buffer_freed(bh))
1092 return ext3_journal_get_write_access(handle, bh);
1095 static int ext3_prepare_write(struct file *file, struct page *page,
1096 unsigned from, unsigned to)
1098 struct inode *inode = page->mapping->host;
1099 int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
1104 handle = ext3_journal_start(inode, needed_blocks);
1105 if (IS_ERR(handle)) {
1106 ret = PTR_ERR(handle);
1109 ret = block_prepare_write(page, from, to, ext3_get_block);
1111 goto prepare_write_failed;
1113 if (ext3_should_journal_data(inode)) {
1114 ret = walk_page_buffers(handle, page_buffers(page),
1115 from, to, NULL, do_journal_get_write_access);
1117 prepare_write_failed:
1119 ext3_journal_stop(handle);
1120 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1127 ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1129 int err = journal_dirty_data(handle, bh);
1131 ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1136 /* For commit_write() in data=journal mode */
1137 static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1139 if (!buffer_mapped(bh) || buffer_freed(bh))
1141 set_buffer_uptodate(bh);
1142 return ext3_journal_dirty_metadata(handle, bh);
1146 * We need to pick up the new inode size which generic_commit_write gave us
1147 * `file' can be NULL - eg, when called from page_symlink().
1149 * ext3 never places buffers on inode->i_mapping->private_list. metadata
1150 * buffers are managed internally.
1153 static int ext3_ordered_commit_write(struct file *file, struct page *page,
1154 unsigned from, unsigned to)
1156 handle_t *handle = ext3_journal_current_handle();
1157 struct inode *inode = page->mapping->host;
1160 ret = walk_page_buffers(handle, page_buffers(page),
1161 from, to, NULL, ext3_journal_dirty_data);
1165 * generic_commit_write() will run mark_inode_dirty() if i_size
1166 * changes. So let's piggyback the i_disksize mark_inode_dirty
1171 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1172 if (new_i_size > EXT3_I(inode)->i_disksize)
1173 EXT3_I(inode)->i_disksize = new_i_size;
1174 ret = generic_commit_write(file, page, from, to);
1176 ret2 = ext3_journal_stop(handle);
1182 static int ext3_writeback_commit_write(struct file *file, struct page *page,
1183 unsigned from, unsigned to)
1185 handle_t *handle = ext3_journal_current_handle();
1186 struct inode *inode = page->mapping->host;
1190 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1191 if (new_i_size > EXT3_I(inode)->i_disksize)
1192 EXT3_I(inode)->i_disksize = new_i_size;
1193 ret = generic_commit_write(file, page, from, to);
1194 ret2 = ext3_journal_stop(handle);
1200 static int ext3_journalled_commit_write(struct file *file,
1201 struct page *page, unsigned from, unsigned to)
1203 handle_t *handle = ext3_journal_current_handle();
1204 struct inode *inode = page->mapping->host;
1210 * Here we duplicate the generic_commit_write() functionality
1212 pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1214 ret = walk_page_buffers(handle, page_buffers(page), from,
1215 to, &partial, commit_write_fn);
1217 SetPageUptodate(page);
1218 if (pos > inode->i_size)
1219 i_size_write(inode, pos);
1220 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1221 if (inode->i_size > EXT3_I(inode)->i_disksize) {
1222 EXT3_I(inode)->i_disksize = inode->i_size;
1223 ret2 = ext3_mark_inode_dirty(handle, inode);
1227 ret2 = ext3_journal_stop(handle);
1234 * bmap() is special. It gets used by applications such as lilo and by
1235 * the swapper to find the on-disk block of a specific piece of data.
1237 * Naturally, this is dangerous if the block concerned is still in the
1238 * journal. If somebody makes a swapfile on an ext3 data-journaling
1239 * filesystem and enables swap, then they may get a nasty shock when the
1240 * data getting swapped to that swapfile suddenly gets overwritten by
1241 * the original zero's written out previously to the journal and
1242 * awaiting writeback in the kernel's buffer cache.
1244 * So, if we see any bmap calls here on a modified, data-journaled file,
1245 * take extra steps to flush any blocks which might be in the cache.
1247 static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1249 struct inode *inode = mapping->host;
1253 if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
1255 * This is a REALLY heavyweight approach, but the use of
1256 * bmap on dirty files is expected to be extremely rare:
1257 * only if we run lilo or swapon on a freshly made file
1258 * do we expect this to happen.
1260 * (bmap requires CAP_SYS_RAWIO so this does not
1261 * represent an unprivileged user DOS attack --- we'd be
1262 * in trouble if mortal users could trigger this path at
1265 * NB. EXT3_STATE_JDATA is not set on files other than
1266 * regular files. If somebody wants to bmap a directory
1267 * or symlink and gets confused because the buffer
1268 * hasn't yet been flushed to disk, they deserve
1269 * everything they get.
1272 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
1273 journal = EXT3_JOURNAL(inode);
1274 journal_lock_updates(journal);
1275 err = journal_flush(journal);
1276 journal_unlock_updates(journal);
1282 return generic_block_bmap(mapping,block,ext3_get_block);
1285 static int bget_one(handle_t *handle, struct buffer_head *bh)
1291 static int bput_one(handle_t *handle, struct buffer_head *bh)
1297 static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1299 if (buffer_mapped(bh))
1300 return ext3_journal_dirty_data(handle, bh);
1305 * Note that we always start a transaction even if we're not journalling
1306 * data. This is to preserve ordering: any hole instantiation within
1307 * __block_write_full_page -> ext3_get_block() should be journalled
1308 * along with the data so we don't crash and then get metadata which
1309 * refers to old data.
1311 * In all journalling modes block_write_full_page() will start the I/O.
1315 * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1320 * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1322 * Same applies to ext3_get_block(). We will deadlock on various things like
1323 * lock_journal and i_truncate_sem.
1325 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1328 * 16May01: If we're reentered then journal_current_handle() will be
1329 * non-zero. We simply *return*.
1331 * 1 July 2001: @@@ FIXME:
1332 * In journalled data mode, a data buffer may be metadata against the
1333 * current transaction. But the same file is part of a shared mapping
1334 * and someone does a writepage() on it.
1336 * We will move the buffer onto the async_data list, but *after* it has
1337 * been dirtied. So there's a small window where we have dirty data on
1340 * Note that this only applies to the last partial page in the file. The
1341 * bit which block_write_full_page() uses prepare/commit for. (That's
1342 * broken code anyway: it's wrong for msync()).
1344 * It's a rare case: affects the final partial page, for journalled data
1345 * where the file is subject to both write() and writepage() in the same
1346 * transaction.  To fix it we'll need a custom block_write_full_page().
1347 * We'll probably need that anyway for journalling writepage() output.
1349 * We don't honour synchronous mounts for writepage(). That would be
1350 * disastrous. Any write() or metadata operation will sync the fs for
1353 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1354 * we don't need to open a transaction here.
1356 static int ext3_ordered_writepage(struct page *page,
1357 struct writeback_control *wbc)
1359 struct inode *inode = page->mapping->host;
1360 struct buffer_head *page_bufs;
1361 handle_t *handle = NULL;
1365 J_ASSERT(PageLocked(page));
1368 * We give up here if we're reentered, because it might be for a
1369 * different filesystem.
1371 if (ext3_journal_current_handle())
1374 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1376 if (IS_ERR(handle)) {
1377 ret = PTR_ERR(handle);
1381 if (!page_has_buffers(page)) {
1382 create_empty_buffers(page, inode->i_sb->s_blocksize,
1383 (1 << BH_Dirty)|(1 << BH_Uptodate));
1385 page_bufs = page_buffers(page);
1386 walk_page_buffers(handle, page_bufs, 0,
1387 PAGE_CACHE_SIZE, NULL, bget_one);
1389 ret = block_write_full_page(page, ext3_get_block, wbc);
1392 * The page can become unlocked at any point now, and
1393 * truncate can then come in and change things. So we
1394 * can't touch *page from now on. But *page_bufs is
1395 * safe due to elevated refcount.
1399 * And attach them to the current transaction. But only if
1400 * block_write_full_page() succeeded. Otherwise they are unmapped,
1401 * and generally junk.
1404 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1405 NULL, journal_dirty_data_fn);
1409 walk_page_buffers(handle, page_bufs, 0,
1410 PAGE_CACHE_SIZE, NULL, bput_one);
1411 err = ext3_journal_stop(handle);
1417 redirty_page_for_writepage(wbc, page);
1422 static int ext3_writeback_writepage(struct page *page,
1423 struct writeback_control *wbc)
1425 struct inode *inode = page->mapping->host;
1426 handle_t *handle = NULL;
1430 if (ext3_journal_current_handle())
1433 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1434 if (IS_ERR(handle)) {
1435 ret = PTR_ERR(handle);
1439 ret = block_write_full_page(page, ext3_get_block, wbc);
1440 err = ext3_journal_stop(handle);
1446 redirty_page_for_writepage(wbc, page);
1451 static int ext3_journalled_writepage(struct page *page,
1452 struct writeback_control *wbc)
1454 struct inode *inode = page->mapping->host;
1455 handle_t *handle = NULL;
1459 if (ext3_journal_current_handle())
1462 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1463 if (IS_ERR(handle)) {
1464 ret = PTR_ERR(handle);
1468 if (!page_has_buffers(page) || PageChecked(page)) {
1470 * It's mmapped pagecache. Add buffers and journal it. There
1471 * doesn't seem much point in redirtying the page here.
1473 ClearPageChecked(page);
1474 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1478 ret = walk_page_buffers(handle, page_buffers(page), 0,
1479 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1481 err = walk_page_buffers(handle, page_buffers(page), 0,
1482 PAGE_CACHE_SIZE, NULL, commit_write_fn);
1485 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1489 * It may be a page full of checkpoint-mode buffers. We don't
1490 * really know unless we go poke around in the buffer_heads.
1491 * But block_write_full_page will do the right thing.
1493 ret = block_write_full_page(page, ext3_get_block, wbc);
1495 err = ext3_journal_stop(handle);
1502 redirty_page_for_writepage(wbc, page);
1508 static int ext3_readpage(struct file *file, struct page *page)
1510 return mpage_readpage(page, ext3_get_block);
1514 ext3_readpages(struct file *file, struct address_space *mapping,
1515 struct list_head *pages, unsigned nr_pages)
1517 return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1520 static int ext3_invalidatepage(struct page *page, unsigned long offset)
1522 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1525 * If it's a full truncate we just forget about the pending dirtying
1528 ClearPageChecked(page);
1530 return journal_invalidatepage(journal, page, offset);
1533 static int ext3_releasepage(struct page *page, int wait)
1535 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1537 WARN_ON(PageChecked(page));
1538 return journal_try_to_free_buffers(journal, page, wait);
1542 * If the O_DIRECT write will extend the file then add this inode to the
1543 * orphan list. So recovery will truncate it back to the original size
1544 * if the machine crashes during the write.
1546 * If the O_DIRECT write is intantiating holes inside i_size and the machine
1547 * crashes then stale disk data _may_ be exposed inside the file.
1549 static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1550 const struct iovec *iov, loff_t offset,
1551 unsigned long nr_segs)
1553 struct file *file = iocb->ki_filp;
1554 struct inode *inode = file->f_mapping->host;
1555 struct ext3_inode_info *ei = EXT3_I(inode);
1556 handle_t *handle = NULL;
1559 size_t count = iov_length(iov, nr_segs);
1562 loff_t final_size = offset + count;
1564 handle = ext3_journal_start(inode, DIO_CREDITS);
1565 if (IS_ERR(handle)) {
1566 ret = PTR_ERR(handle);
1569 if (final_size > inode->i_size) {
1570 ret = ext3_orphan_add(handle, inode);
1574 ei->i_disksize = inode->i_size;
1578 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1580 ext3_direct_io_get_blocks, NULL);
1587 ext3_orphan_del(handle, inode);
1588 if (orphan && ret > 0) {
1589 loff_t end = offset + ret;
1590 if (end > inode->i_size) {
1591 ei->i_disksize = end;
1592 i_size_write(inode, end);
1593 err = ext3_mark_inode_dirty(handle, inode);
1598 err = ext3_journal_stop(handle);
/*
 * Pages can be marked dirty completely asynchronously from ext3's journalling
 * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
 * much here because ->set_page_dirty is called under VFS locks.  The page is
 * not necessarily locked.
 *
 * We cannot just dirty the page and leave attached buffers clean, because the
 * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
 * or jbddirty because all the journalling code will explode.
 *
 * So what we do is to mark the page "pending dirty" and next time writepage
 * is called, propagate that into the buffers appropriately.
 */
static int ext3_journalled_set_page_dirty(struct page *page)
{
	SetPageChecked(page);
	return __set_page_dirty_nobuffers(page);
}
1625 static struct address_space_operations ext3_ordered_aops = {
1626 .readpage = ext3_readpage,
1627 .readpages = ext3_readpages,
1628 .writepage = ext3_ordered_writepage,
1629 .sync_page = block_sync_page,
1630 .prepare_write = ext3_prepare_write,
1631 .commit_write = ext3_ordered_commit_write,
1633 .invalidatepage = ext3_invalidatepage,
1634 .releasepage = ext3_releasepage,
1635 .direct_IO = ext3_direct_IO,
1638 static struct address_space_operations ext3_writeback_aops = {
1639 .readpage = ext3_readpage,
1640 .readpages = ext3_readpages,
1641 .writepage = ext3_writeback_writepage,
1642 .sync_page = block_sync_page,
1643 .prepare_write = ext3_prepare_write,
1644 .commit_write = ext3_writeback_commit_write,
1646 .invalidatepage = ext3_invalidatepage,
1647 .releasepage = ext3_releasepage,
1648 .direct_IO = ext3_direct_IO,
1651 static struct address_space_operations ext3_journalled_aops = {
1652 .readpage = ext3_readpage,
1653 .readpages = ext3_readpages,
1654 .writepage = ext3_journalled_writepage,
1655 .sync_page = block_sync_page,
1656 .prepare_write = ext3_prepare_write,
1657 .commit_write = ext3_journalled_commit_write,
1658 .set_page_dirty = ext3_journalled_set_page_dirty,
1660 .invalidatepage = ext3_invalidatepage,
1661 .releasepage = ext3_releasepage,
1664 void ext3_set_aops(struct inode *inode)
1666 if (ext3_should_order_data(inode))
1667 inode->i_mapping->a_ops = &ext3_ordered_aops;
1668 else if (ext3_should_writeback_data(inode))
1669 inode->i_mapping->a_ops = &ext3_writeback_aops;
1671 inode->i_mapping->a_ops = &ext3_journalled_aops;
1675 * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1676 * up to the end of the block which corresponds to `from'.
1677 * This required during truncate. We need to physically zero the tail end
1678 * of that block so it doesn't yield old data if the file is later grown.
1680 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1681 struct address_space *mapping, loff_t from)
1683 unsigned long index = from >> PAGE_CACHE_SHIFT;
1684 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1685 unsigned blocksize, iblock, length, pos;
1686 struct inode *inode = mapping->host;
1687 struct buffer_head *bh;
1691 blocksize = inode->i_sb->s_blocksize;
1692 length = blocksize - (offset & (blocksize - 1));
1693 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1695 if (!page_has_buffers(page))
1696 create_empty_buffers(page, blocksize, 0);
1698 /* Find the buffer that contains "offset" */
1699 bh = page_buffers(page);
1701 while (offset >= pos) {
1702 bh = bh->b_this_page;
1708 if (buffer_freed(bh)) {
1709 BUFFER_TRACE(bh, "freed: skip");
1713 if (!buffer_mapped(bh)) {
1714 BUFFER_TRACE(bh, "unmapped");
1715 ext3_get_block(inode, iblock, bh, 0);
1716 /* unmapped? It's a hole - nothing to do */
1717 if (!buffer_mapped(bh)) {
1718 BUFFER_TRACE(bh, "still unmapped");
1723 /* Ok, it's mapped. Make sure it's up-to-date */
1724 if (PageUptodate(page))
1725 set_buffer_uptodate(bh);
1727 if (!buffer_uptodate(bh)) {
1729 ll_rw_block(READ, 1, &bh);
1731 /* Uhhuh. Read error. Complain and punt. */
1732 if (!buffer_uptodate(bh))
1736 if (ext3_should_journal_data(inode)) {
1737 BUFFER_TRACE(bh, "get write access");
1738 err = ext3_journal_get_write_access(handle, bh);
1743 kaddr = kmap_atomic(page, KM_USER0);
1744 memset(kaddr + offset, 0, length);
1745 flush_dcache_page(page);
1746 kunmap_atomic(kaddr, KM_USER0);
1748 BUFFER_TRACE(bh, "zeroed end of block");
1751 if (ext3_should_journal_data(inode)) {
1752 err = ext3_journal_dirty_metadata(handle, bh);
1754 if (ext3_should_order_data(inode))
1755 err = ext3_journal_dirty_data(handle, bh);
1756 mark_buffer_dirty(bh);
1761 page_cache_release(page);
1766 * Probably it should be a library function... search for first non-zero word
1767 * or memcmp with zero_page, whatever is better for particular architecture.
1770 static inline int all_zeroes(u32 *p, u32 *q)
1779 * ext3_find_shared - find the indirect blocks for partial truncation.
1780 * @inode: inode in question
1781 * @depth: depth of the affected branch
1782 * @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1783 * @chain: place to store the pointers to partial indirect blocks
1784 * @top: place to the (detached) top of branch
1786 * This is a helper function used by ext3_truncate().
1788 * When we do truncate() we may have to clean the ends of several
1789 * indirect blocks but leave the blocks themselves alive. Block is
1790 * partially truncated if some data below the new i_size is refered
1791 * from it (and it is on the path to the first completely truncated
1792 * data block, indeed). We have to free the top of that path along
1793 * with everything to the right of the path. Since no allocation
1794 * past the truncation point is possible until ext3_truncate()
1795 * finishes, we may safely do the latter, but top of branch may
1796 * require special attention - pageout below the truncation point
1797 * might try to populate it.
1799 * We atomically detach the top of branch from the tree, store the
1800 * block number of its root in *@top, pointers to buffer_heads of
1801 * partially truncated blocks - in @chain[].bh and pointers to
1802 * their last elements that should not be removed - in
1803 * @chain[].p. Return value is the pointer to last filled element
1806 * The work left to caller to do the actual freeing of subtrees:
1807 * a) free the subtree starting from *@top
1808 * b) free the subtrees whose roots are stored in
1809 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1810 * c) free the subtrees growing from the inode past the @chain[0].
1811 * (no partially truncated stuff there). */
1813 static Indirect *ext3_find_shared(struct inode *inode,
1819 Indirect *partial, *p;
1823 /* Make k index the deepest non-null offest + 1 */
1824 for (k = depth; k > 1 && !offsets[k-1]; k--)
1826 partial = ext3_get_branch(inode, k, offsets, chain, &err);
1827 /* Writer: pointers */
1829 partial = chain + k-1;
1831 * If the branch acquired continuation since we've looked at it -
1832 * fine, it should all survive and (new) top doesn't belong to us.
1834 if (!partial->key && *partial->p)
1837 for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--)
1840 * OK, we've found the last block that must survive. The rest of our
1841 * branch should be detached before unlocking. However, if that rest
1842 * of branch is all ours and does not grow immediately from the inode
1843 * it's easier to cheat and just decrement partial->p.
1845 if (p == chain + k - 1 && p > chain) {
1849 /* Nope, don't do this in ext3. Must leave the tree intact */
1858 brelse(partial->bh);
1866 * Zero a number of block pointers in either an inode or an indirect block.
1867 * If we restart the transaction we must again get write access to the
1868 * indirect block for further modification.
1870 * We release `count' blocks on disk, but (last - first) may be greater
1871 * than `count' because there can be holes in there.
1874 ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
1875 unsigned long block_to_free, unsigned long count,
1876 u32 *first, u32 *last)
1879 if (try_to_extend_transaction(handle, inode)) {
1881 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1882 ext3_journal_dirty_metadata(handle, bh);
1884 ext3_mark_inode_dirty(handle, inode);
1885 ext3_journal_test_restart(handle, inode);
1887 BUFFER_TRACE(bh, "retaking write access");
1888 ext3_journal_get_write_access(handle, bh);
1893 * Any buffers which are on the journal will be in memory. We find
1894 * them on the hash table so journal_revoke() will run journal_forget()
1895 * on them. We've already detached each block from the file, so
1896 * bforget() in journal_forget() should be safe.
1898 * AKPM: turn on bforget in journal_forget()!!!
1900 for (p = first; p < last; p++) {
1901 u32 nr = le32_to_cpu(*p);
1903 struct buffer_head *bh;
1906 bh = sb_find_get_block(inode->i_sb, nr);
1907 ext3_forget(handle, 0, inode, bh, nr);
1911 ext3_free_blocks(handle, inode, block_to_free, count);
1915 * ext3_free_data - free a list of data blocks
1916 * @handle: handle for this transaction
1917 * @inode: inode we are dealing with
1918 * @this_bh: indirect buffer_head which contains *@first and *@last
1919 * @first: array of block numbers
1920 * @last: points immediately past the end of array
1922 * We are freeing all blocks refered from that array (numbers are stored as
1923 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1925 * We accumulate contiguous runs of blocks to free. Conveniently, if these
1926 * blocks are contiguous then releasing them at one time will only affect one
1927 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1928 * actually use a lot of journal space.
1930 * @this_bh will be %NULL if @first and @last point into the inode's direct
1933 static void ext3_free_data(handle_t *handle, struct inode *inode,
1934 struct buffer_head *this_bh, u32 *first, u32 *last)
1936 unsigned long block_to_free = 0; /* Starting block # of a run */
1937 unsigned long count = 0; /* Number of blocks in the run */
1938 u32 *block_to_free_p = NULL; /* Pointer into inode/ind
1941 unsigned long nr; /* Current block # */
1942 u32 *p; /* Pointer into inode/ind
1943 for current block */
1946 if (this_bh) { /* For indirect block */
1947 BUFFER_TRACE(this_bh, "get_write_access");
1948 err = ext3_journal_get_write_access(handle, this_bh);
1949 /* Important: if we can't update the indirect pointers
1950 * to the blocks, we can't free them. */
1955 for (p = first; p < last; p++) {
1956 nr = le32_to_cpu(*p);
1958 /* accumulate blocks to free if they're contiguous */
1961 block_to_free_p = p;
1963 } else if (nr == block_to_free + count) {
1966 ext3_clear_blocks(handle, inode, this_bh,
1968 count, block_to_free_p, p);
1970 block_to_free_p = p;
1977 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
1978 count, block_to_free_p, p);
1981 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
1982 ext3_journal_dirty_metadata(handle, this_bh);
1987 * ext3_free_branches - free an array of branches
1988 * @handle: JBD handle for this transaction
1989 * @inode: inode we are dealing with
1990 * @parent_bh: the buffer_head which contains *@first and *@last
1991 * @first: array of block numbers
1992 * @last: pointer immediately past the end of array
1993 * @depth: depth of the branches to free
1995 * We are freeing all blocks refered from these branches (numbers are
1996 * stored as little-endian 32-bit) and updating @inode->i_blocks
1999 static void ext3_free_branches(handle_t *handle, struct inode *inode,
2000 struct buffer_head *parent_bh,
2001 u32 *first, u32 *last, int depth)
2006 if (is_handle_aborted(handle))
2010 struct buffer_head *bh;
2011 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2013 while (--p >= first) {
2014 nr = le32_to_cpu(*p);
2016 continue; /* A hole */
2018 /* Go read the buffer for the next level down */
2019 bh = sb_bread(inode->i_sb, nr);
2022 * A read failure? Report error and clear slot
2026 ext3_error(inode->i_sb, "ext3_free_branches",
2027 "Read failure, inode=%ld, block=%ld",
2032 /* This zaps the entire block. Bottom up. */
2033 BUFFER_TRACE(bh, "free child branches");
2034 ext3_free_branches(handle, inode, bh, (u32*)bh->b_data,
2035 (u32*)bh->b_data + addr_per_block,
2039 * We've probably journalled the indirect block several
2040 * times during the truncate. But it's no longer
2041 * needed and we now drop it from the transaction via
2044 * That's easy if it's exclusively part of this
2045 * transaction. But if it's part of the committing
2046 * transaction then journal_forget() will simply
2047 * brelse() it. That means that if the underlying
2048 * block is reallocated in ext3_get_block(),
2049 * unmap_underlying_metadata() will find this block
2050 * and will try to get rid of it. damn, damn.
2052 * If this block has already been committed to the
2053 * journal, a revoke record will be written. And
2054 * revoke records must be emitted *before* clearing
2055 * this block's bit in the bitmaps.
2057 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2060 * Everything below this this pointer has been
2061 * released. Now let this top-of-subtree go.
2063 * We want the freeing of this indirect block to be
2064 * atomic in the journal with the updating of the
2065 * bitmap block which owns it. So make some room in
2068 * We zero the parent pointer *after* freeing its
2069 * pointee in the bitmaps, so if extend_transaction()
2070 * for some reason fails to put the bitmap changes and
2071 * the release into the same transaction, recovery
2072 * will merely complain about releasing a free block,
2073 * rather than leaking blocks.
2075 if (is_handle_aborted(handle))
2077 if (try_to_extend_transaction(handle, inode)) {
2078 ext3_mark_inode_dirty(handle, inode);
2079 ext3_journal_test_restart(handle, inode);
2082 ext3_free_blocks(handle, inode, nr, 1);
2086 * The block which we have just freed is
2087 * pointed to by an indirect block: journal it
2089 BUFFER_TRACE(parent_bh, "get_write_access");
2090 if (!ext3_journal_get_write_access(handle,
2093 BUFFER_TRACE(parent_bh,
2094 "call ext3_journal_dirty_metadata");
2095 ext3_journal_dirty_metadata(handle,
2101 /* We have reached the bottom of the tree. */
2102 BUFFER_TRACE(parent_bh, "free data blocks");
2103 ext3_free_data(handle, inode, parent_bh, first, last);
2110 * We block out ext3_get_block() block instantiations across the entire
2111 * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2112 * simultaneously on behalf of the same inode.
2114 * As we work through the truncate and commmit bits of it to the journal there
2115 * is one core, guiding principle: the file's tree must always be consistent on
2116 * disk. We must be able to restart the truncate after a crash.
2118 * The file's tree may be transiently inconsistent in memory (although it
2119 * probably isn't), but whenever we close off and commit a journal transaction,
2120 * the contents of (the filesystem + the journal) must be consistent and
2121 * restartable. It's pretty simple, really: bottom up, right to left (although
2122 * left-to-right works OK too).
2124 * Note that at recovery time, journal replay occurs *before* the restart of
2125 * truncate against the orphan inode list.
2127 * The committed inode has the new, desired i_size (which is the same as
2128 * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
2129 * that this inode's truncate did not complete and it will again call
2130 * ext3_truncate() to have another go. So there will be instantiated blocks
2131 * to the right of the truncation point in a crashed ext3 filesystem. But
2132 * that's fine - as long as they are linked from the inode, the post-crash
2133 * ext3_truncate() run will find them and release them.
2136 void ext3_truncate(struct inode * inode)
2139 struct ext3_inode_info *ei = EXT3_I(inode);
2140 u32 *i_data = ei->i_data;
2141 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2142 struct address_space *mapping = inode->i_mapping;
2149 unsigned blocksize = inode->i_sb->s_blocksize;
2152 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2153 S_ISLNK(inode->i_mode)))
2155 if (ext3_inode_is_fast_symlink(inode))
2157 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2160 ext3_discard_prealloc(inode);
2163 * We have to lock the EOF page here, because lock_page() nests
2164 * outside journal_start().
2166 if ((inode->i_size & (blocksize - 1)) == 0) {
2167 /* Block boundary? Nothing to do */
2170 page = grab_cache_page(mapping,
2171 inode->i_size >> PAGE_CACHE_SHIFT);
2176 handle = start_transaction(inode);
2177 if (IS_ERR(handle)) {
2179 clear_highpage(page);
2180 flush_dcache_page(page);
2182 page_cache_release(page);
2184 return; /* AKPM: return what? */
2187 last_block = (inode->i_size + blocksize-1)
2188 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2191 ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2193 n = ext3_block_to_path(inode, last_block, offsets, NULL);
2195 goto out_stop; /* error */
2198 * OK. This truncate is going to happen. We add the inode to the
2199 * orphan list, so that if this truncate spans multiple transactions,
2200 * and we crash, we will resume the truncate when the filesystem
2201 * recovers. It also marks the inode dirty, to catch the new size.
2203 * Implication: the file must always be in a sane, consistent
2204 * truncatable state while each transaction commits.
2206 if (ext3_orphan_add(handle, inode))
2210 * The orphan list entry will now protect us from any crash which
2211 * occurs before the truncate completes, so it is now safe to propagate
2212 * the new, shorter inode size (held for now in i_size) into the
2213 * on-disk inode. We do this via i_disksize, which is the value which
2214 * ext3 *really* writes onto the disk inode.
2216 ei->i_disksize = inode->i_size;
2219 * From here we block out all ext3_get_block() callers who want to
2220 * modify the block allocation tree.
2222 down(&ei->truncate_sem);
2224 if (n == 1) { /* direct blocks */
2225 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2226 i_data + EXT3_NDIR_BLOCKS);
2230 partial = ext3_find_shared(inode, n, offsets, chain, &nr);
2231 /* Kill the top of shared branch (not detached) */
2233 if (partial == chain) {
2234 /* Shared branch grows from the inode */
2235 ext3_free_branches(handle, inode, NULL,
2236 &nr, &nr+1, (chain+n-1) - partial);
2239 * We mark the inode dirty prior to restart,
2240 * and prior to stop. No need for it here.
2243 /* Shared branch grows from an indirect block */
2244 BUFFER_TRACE(partial->bh, "get_write_access");
2245 ext3_free_branches(handle, inode, partial->bh,
2247 partial->p+1, (chain+n-1) - partial);
2250 /* Clear the ends of indirect blocks on the shared branch */
2251 while (partial > chain) {
2252 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2253 (u32*)partial->bh->b_data + addr_per_block,
2254 (chain+n-1) - partial);
2255 BUFFER_TRACE(partial->bh, "call brelse");
2256 brelse (partial->bh);
2260 /* Kill the remaining (whole) subtrees */
2261 switch (offsets[0]) {
2263 nr = i_data[EXT3_IND_BLOCK];
2265 ext3_free_branches(handle, inode, NULL,
2267 i_data[EXT3_IND_BLOCK] = 0;
2269 case EXT3_IND_BLOCK:
2270 nr = i_data[EXT3_DIND_BLOCK];
2272 ext3_free_branches(handle, inode, NULL,
2274 i_data[EXT3_DIND_BLOCK] = 0;
2276 case EXT3_DIND_BLOCK:
2277 nr = i_data[EXT3_TIND_BLOCK];
2279 ext3_free_branches(handle, inode, NULL,
2281 i_data[EXT3_TIND_BLOCK] = 0;
2283 case EXT3_TIND_BLOCK:
2286 up(&ei->truncate_sem);
2287 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2288 ext3_mark_inode_dirty(handle, inode);
2290 /* In a multi-transaction truncate, we only make the final
2291 * transaction synchronous */
2296 * If this was a simple ftruncate(), and the file will remain alive
2297 * then we need to clear up the orphan record which we created above.
2298 * However, if this was a real unlink then we were called by
2299 * ext3_delete_inode(), and we allow that function to clean up the
2300 * orphan info for us.
2303 ext3_orphan_del(handle, inode);
2305 ext3_journal_stop(handle);
2308 static unsigned long ext3_get_inode_block(struct super_block *sb,
2309 unsigned long ino, struct ext3_iloc *iloc)
2311 unsigned long desc, group_desc, block_group;
2312 unsigned long offset, block;
2313 struct buffer_head *bh;
2314 struct ext3_group_desc * gdp;
2316 if ((ino != EXT3_ROOT_INO &&
2317 ino != EXT3_JOURNAL_INO &&
2318 ino < EXT3_FIRST_INO(sb)) ||
2320 EXT3_SB(sb)->s_es->s_inodes_count)) {
2321 ext3_error (sb, "ext3_get_inode_block",
2322 "bad inode number: %lu", ino);
2325 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2326 if (block_group >= EXT3_SB(sb)->s_groups_count) {
2327 ext3_error (sb, "ext3_get_inode_block",
2328 "group >= groups count");
2331 group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
2332 desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
2333 bh = EXT3_SB(sb)->s_group_desc[group_desc];
2335 ext3_error (sb, "ext3_get_inode_block",
2336 "Descriptor not loaded");
2340 gdp = (struct ext3_group_desc *) bh->b_data;
2342 * Figure out the offset within the block group inode table
2344 offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2345 EXT3_INODE_SIZE(sb);
2346 block = le32_to_cpu(gdp[desc].bg_inode_table) +
2347 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2349 iloc->block_group = block_group;
2350 iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2355 * ext3_get_inode_loc returns with an extra refcount against the inode's
2356 * underlying buffer_head on success. If `in_mem' is false then we're purely
2357 * trying to determine the inode's location on-disk and no read need be
/*
 * Locate (and normally read) the buffer containing an inode's on-disk
 * copy.  On success the caller holds an extra reference on iloc->bh.
 * With in_mem != 0 we may skip the read entirely when the inode bitmap
 * shows every other inode in the block is free.
 */
2360 static int ext3_get_inode_loc(struct inode *inode,
2361 struct ext3_iloc *iloc, int in_mem)
2363 unsigned long block;
2364 struct buffer_head *bh;
2366 block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2370 bh = sb_getblk(inode->i_sb, block);
2372 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2373 "unable to read inode block - "
2374 "inode=%lu, block=%lu", inode->i_ino, block);
2377 if (!buffer_uptodate(bh)) {
2379 if (buffer_uptodate(bh)) {
2380 /* someone brought it uptodate while we waited */
2385 /* we can't skip I/O if inode is on a disk only */
2387 struct buffer_head *bitmap_bh;
2388 struct ext3_group_desc *desc;
2389 int inodes_per_buffer;
2390 int inode_offset, i;
2395 * If this is the only valid inode in the block we
2396 * need not read the block.
2398 block_group = (inode->i_ino - 1) /
2399 EXT3_INODES_PER_GROUP(inode->i_sb);
2400 inodes_per_buffer = bh->b_size /
2401 EXT3_INODE_SIZE(inode->i_sb);
2402 inode_offset = ((inode->i_ino - 1) %
2403 EXT3_INODES_PER_GROUP(inode->i_sb));
/* First inode of the buffer-sized window containing our inode. */
2404 start = inode_offset & ~(inodes_per_buffer - 1);
2406 /* Is the inode bitmap in cache? */
2407 desc = ext3_get_group_desc(inode->i_sb,
2412 bitmap_bh = sb_getblk(inode->i_sb,
2413 le32_to_cpu(desc->bg_inode_bitmap))
2418 * If the inode bitmap isn't in cache then the
2419 * optimisation may end up performing two reads instead
2420 * of one, so skip it.
2422 if (!buffer_uptodate(bitmap_bh)) {
/* Scan the window: any in-use inode other than ours forces a read. */
2426 for (i = start; i < start + inodes_per_buffer; i++) {
2427 if (i == inode_offset)
2429 if (ext3_test_bit(i, bitmap_bh->b_data))
2433 if (i == start + inodes_per_buffer) {
2434 /* all other inodes are free, so skip I/O */
2435 memset(bh->b_data, 0, bh->b_size);
2436 set_buffer_uptodate(bh);
2444 * There are other valid inodes in the buffer so we must
2445 * read the block from disk
2448 bh->b_end_io = end_buffer_read_sync;
2449 submit_bh(READ, bh);
2451 if (!buffer_uptodate(bh)) {
2452 ext3_error(inode->i_sb, "ext3_get_inode_loc",
2453 "unable to read inode block - "
2454 "inode=%lu, block=%lu",
2455 inode->i_ino, block);
2465 void ext3_set_inode_flags(struct inode *inode)
2467 unsigned int flags = EXT3_I(inode)->i_flags;
2469 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2470 if (flags & EXT3_SYNC_FL)
2471 inode->i_flags |= S_SYNC;
2472 if (flags & EXT3_APPEND_FL)
2473 inode->i_flags |= S_APPEND;
2474 if (flags & EXT3_IMMUTABLE_FL)
2475 inode->i_flags |= S_IMMUTABLE;
2476 if (flags & EXT3_NOATIME_FL)
2477 inode->i_flags |= S_NOATIME;
2478 if (flags & EXT3_DIRSYNC_FL)
2479 inode->i_flags |= S_DIRSYNC;
/*
 * Fill an in-core inode from its on-disk ext3 representation: decode
 * little-endian fields, reject dead inodes (nlink==0 outside orphan
 * recovery), and wire up the inode/file operations for the file type.
 * On any failure the inode is marked bad via make_bad_inode().
 */
2482 void ext3_read_inode(struct inode * inode)
2484 struct ext3_iloc iloc;
2485 struct ext3_inode *raw_inode;
2486 struct ext3_inode_info *ei = EXT3_I(inode);
2487 struct buffer_head *bh;
2490 #ifdef CONFIG_EXT3_FS_POSIX_ACL
2491 ei->i_acl = EXT3_ACL_NOT_CACHED;
2492 ei->i_default_acl = EXT3_ACL_NOT_CACHED;
2494 if (ext3_get_inode_loc(inode, &iloc, 0))
2497 raw_inode = ext3_raw_inode(&iloc);
2498 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
/* uid/gid are stored as 16-bit low halves plus optional high halves. */
2499 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2500 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2501 if(!(test_opt (inode->i_sb, NO_UID32))) {
2502 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2503 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2505 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2506 inode->i_size = le32_to_cpu(raw_inode->i_size);
2507 inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2508 inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2509 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
/* On-disk timestamps are whole seconds; zero the nanosecond parts. */
2510 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2513 ei->i_next_alloc_block = 0;
2514 ei->i_next_alloc_goal = 0;
2515 ei->i_dir_start_lookup = 0;
2516 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2517 /* We now have enough fields to check if the inode was active or not.
2518 * This is needed because nfsd might try to access dead inodes
2519 * the test is the same one that e2fsck uses
2520 * NeilBrown 1999oct15
2522 if (inode->i_nlink == 0) {
2523 if (inode->i_mode == 0 ||
2524 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2525 /* this inode is deleted */
2529 /* The only unlinked inodes we let through here have
2530 * valid i_mode and are being read by the orphan
2531 * recovery code: that's fine, we're about to complete
2532 * the process of deleting those. */
2534 inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
2535 * (for stat), not the fs block
2537 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2538 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2539 #ifdef EXT3_FRAGMENTS
2540 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2541 ei->i_frag_no = raw_inode->i_frag;
2542 ei->i_frag_size = raw_inode->i_fsize;
2544 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
/* i_dir_acl and i_size_high share on-disk storage: directories get an
 * ACL, regular files get the upper 32 bits of a 64-bit size. */
2545 if (!S_ISREG(inode->i_mode)) {
2546 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2549 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2551 ei->i_disksize = inode->i_size;
2552 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2553 #ifdef EXT3_PREALLOCATE
2554 ei->i_prealloc_count = 0;
2556 ei->i_block_group = iloc.block_group;
2559 * NOTE! The in-memory inode i_data array is in little-endian order
2560 * even on big-endian machines: we do NOT byteswap the block numbers!
2562 for (block = 0; block < EXT3_N_BLOCKS; block++)
2563 ei->i_data[block] = raw_inode->i_block[block];
2564 INIT_LIST_HEAD(&ei->i_orphan);
/* Select inode/file/address-space operations by file type. */
2566 if (S_ISREG(inode->i_mode)) {
2567 inode->i_op = &ext3_file_inode_operations;
2568 inode->i_fop = &ext3_file_operations;
2569 ext3_set_aops(inode);
2570 } else if (S_ISDIR(inode->i_mode)) {
2571 inode->i_op = &ext3_dir_inode_operations;
2572 inode->i_fop = &ext3_dir_operations;
2573 } else if (S_ISLNK(inode->i_mode)) {
2574 if (ext3_inode_is_fast_symlink(inode))
2575 inode->i_op = &ext3_fast_symlink_inode_operations;
2577 inode->i_op = &ext3_symlink_inode_operations;
2578 ext3_set_aops(inode);
2581 inode->i_op = &ext3_special_inode_operations;
/* Device nodes: i_block[0] holds an old-style dev_t, otherwise
 * i_block[1] holds the new-style encoding. */
2582 if (raw_inode->i_block[0])
2583 init_special_inode(inode, inode->i_mode,
2584 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2586 init_special_inode(inode, inode->i_mode,
2587 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2590 ext3_set_inode_flags(inode);
2594 make_bad_inode(inode);
2599 * Post the struct inode info into an on-disk inode location in the
2600 * buffer-cache. This gobbles the caller's reference to the
2601 * buffer_head in the inode location struct.
2603 * The caller must have write access to iloc->bh.
/*
 * Copy the in-core inode back into its on-disk buffer (iloc->bh) and
 * mark that buffer dirty in the journal.  The caller must already hold
 * write access to iloc->bh (see ext3_reserve_inode_write()); the bh
 * reference in *iloc is consumed by this path.
 */
2605 static int ext3_do_update_inode(handle_t *handle,
2606 struct inode *inode,
2607 struct ext3_iloc *iloc)
2609 struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
2610 struct ext3_inode_info *ei = EXT3_I(inode);
2611 struct buffer_head *bh = iloc->bh;
2612 int err = 0, rc, block;
2614 /* For fields not tracked in the in-memory inode,
2615 * initialise them to zero for new inodes. */
2616 if (ei->i_state & EXT3_STATE_NEW)
2617 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2619 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2620 if(!(test_opt(inode->i_sb, NO_UID32))) {
2621 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
2622 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
2624 * Fix up interoperability with old kernels. Otherwise, old inodes get
2625 * re-used with the upper 16 bits of the uid/gid intact
2628 raw_inode->i_uid_high =
2629 cpu_to_le16(high_16_bits(inode->i_uid));
2630 raw_inode->i_gid_high =
2631 cpu_to_le16(high_16_bits(inode->i_gid));
2633 raw_inode->i_uid_high = 0;
2634 raw_inode->i_gid_high = 0;
/* NO_UID32 mount option: squash ids that don't fit in 16 bits. */
2637 raw_inode->i_uid_low =
2638 cpu_to_le16(fs_high2lowuid(inode->i_uid));
2639 raw_inode->i_gid_low =
2640 cpu_to_le16(fs_high2lowgid(inode->i_gid));
2641 raw_inode->i_uid_high = 0;
2642 raw_inode->i_gid_high = 0;
2644 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
/* i_disksize, not i_size: only blocks actually on disk are exposed. */
2645 raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2646 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2647 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2648 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2649 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2650 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2651 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2652 #ifdef EXT3_FRAGMENTS
2653 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2654 raw_inode->i_frag = ei->i_frag_no;
2655 raw_inode->i_fsize = ei->i_frag_size;
2657 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
2658 if (!S_ISREG(inode->i_mode)) {
2659 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2661 raw_inode->i_size_high =
2662 cpu_to_le32(ei->i_disksize >> 32);
/* First >2GB file on the fs: set the LARGE_FILE RO_COMPAT feature in
 * the superblock so old kernels refuse to mount it read-write. */
2663 if (ei->i_disksize > 0x7fffffffULL) {
2664 struct super_block *sb = inode->i_sb;
2665 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
2666 EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
2667 EXT3_SB(sb)->s_es->s_rev_level ==
2668 cpu_to_le32(EXT3_GOOD_OLD_REV)) {
2669 /* If this is the first large file
2670 * created, add a flag to the superblock.
2672 err = ext3_journal_get_write_access(handle,
2673 EXT3_SB(sb)->s_sbh);
2676 ext3_update_dynamic_rev(sb);
2677 EXT3_SET_RO_COMPAT_FEATURE(sb,
2678 EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2681 err = ext3_journal_dirty_metadata(handle,
2682 EXT3_SB(sb)->s_sbh);
2686 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
/* Device nodes: old-style dev_t in i_block[0], new-style in i_block[1];
 * everything else stores the block map verbatim (already little-endian). */
2687 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
2688 if (old_valid_dev(inode->i_rdev)) {
2689 raw_inode->i_block[0] =
2690 cpu_to_le32(old_encode_dev(inode->i_rdev));
2691 raw_inode->i_block[1] = 0;
2693 raw_inode->i_block[0] = 0;
2694 raw_inode->i_block[1] =
2695 cpu_to_le32(new_encode_dev(inode->i_rdev));
2696 raw_inode->i_block[2] = 0;
2698 } else for (block = 0; block < EXT3_N_BLOCKS; block++)
2699 raw_inode->i_block[block] = ei->i_data[block];
2701 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2702 rc = ext3_journal_dirty_metadata(handle, bh);
2705 ei->i_state &= ~EXT3_STATE_NEW;
2709 ext3_std_error(inode->i_sb, err);
2714 * ext3_write_inode()
2716 * We are called from a few places:
2718 * - Within generic_file_write() for O_SYNC files.
2719 * Here, there will be no transaction running. We wait for any running
2720 * transaction to commit.
2722 * - Within sys_sync(), kupdate and such.
2723 * We wait on commit, if told to.
2725 * - Within prune_icache() (PF_MEMALLOC == true)
2726 * Here we simply return. We can't afford to block kswapd on the
2729 * In all cases it is actually safe for us to return without doing anything,
2730 * because the inode has been copied into a raw inode buffer in
2731 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
2734 * Note that we are absolutely dependent upon all inode dirtiers doing the
2735 * right thing: they *must* call mark_inode_dirty() after dirtying info in
2736 * which we are interested.
2738 * It would be a bug for them to not do this. The code:
2740 * mark_inode_dirty(inode)
2742 * inode->i_size = expr;
2744 * is in error because a kswapd-driven write_inode() could occur while
2745 * `stuff()' is running, and the new i_size will be lost. Plus the inode
2746 * will no longer be on the superblock's dirty inode list.
/*
 * VFS ->write_inode.  The raw inode was already copied out by
 * ext3_mark_inode_dirty(), so we only need to push the journal:
 * bail out under memory reclaim (PF_MEMALLOC), warn if called with a
 * transaction open, otherwise force a commit.
 * NOTE(review): lines are elided here; presumably the commit is only
 * forced when `wait` is set — confirm against the full source.
 */
2748 void ext3_write_inode(struct inode *inode, int wait)
2750 if (current->flags & PF_MEMALLOC)
2753 if (ext3_journal_current_handle()) {
2754 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2762 ext3_force_commit(inode->i_sb);
2768 * Called from notify_change.
2770 * We want to trap VFS attempts to truncate the file as soon as
2771 * possible. In particular, we want to make sure that when the VFS
2772 * shrinks i_size, we put the inode on the orphan list and modify
2773 * i_disksize immediately, so that during the subsequent flushing of
2774 * dirty pages and freeing of disk blocks, we can guarantee that any
2775 * commit will leave the blocks being flushed in an unused state on
2776 * disk. (On recovery, the inode will get truncated and the blocks will
2777 * be freed, so we have a strong guarantee that no future commit will
2778 * leave these blocks visible to the user.)
2780 * Called with inode->sem down.
/*
 * VFS ->setattr for ext3.  Performs quota transfer for uid/gid changes
 * and, for size-reducing truncates of regular files, puts the inode on
 * the orphan list and shrinks i_disksize *before* the truncate, so a
 * crash mid-truncate is recoverable.  Called with inode->sem held.
 */
2782 int ext3_setattr(struct dentry *dentry, struct iattr *attr)
2784 struct inode *inode = dentry->d_inode;
2786 const unsigned int ia_valid = attr->ia_valid;
2788 error = inode_change_ok(inode, attr);
2792 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2793 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2796 /* (user+group)*(old+new) structure, inode write (sb,
2797 * inode block, ? - but truncate inode update has it) */
2798 handle = ext3_journal_start(inode, 4*EXT3_QUOTA_INIT_BLOCKS+3);
2799 if (IS_ERR(handle)) {
2800 error = PTR_ERR(handle);
/* Transfer quota charges to the new owner before changing ids. */
2803 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2805 ext3_journal_stop(handle);
2808 /* Update corresponding info in inode so that everything is in
2809 * one transaction */
2810 if (attr->ia_valid & ATTR_UID)
2811 inode->i_uid = attr->ia_uid;
2812 if (attr->ia_valid & ATTR_GID)
2813 inode->i_gid = attr->ia_gid;
2814 error = ext3_mark_inode_dirty(handle, inode);
2815 ext3_journal_stop(handle);
/* Shrinking a regular file: orphan-list it so recovery can finish an
 * interrupted truncate, and commit the smaller i_disksize first. */
2818 if (S_ISREG(inode->i_mode) &&
2819 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2822 handle = ext3_journal_start(inode, 3);
2823 if (IS_ERR(handle)) {
2824 error = PTR_ERR(handle);
2828 error = ext3_orphan_add(handle, inode);
2829 EXT3_I(inode)->i_disksize = attr->ia_size;
2830 rc = ext3_mark_inode_dirty(handle, inode);
2833 ext3_journal_stop(handle);
2836 rc = inode_setattr(inode, attr);
2838 /* If inode_setattr's call to ext3_truncate failed to get a
2839 * transaction handle at all, we need to clean up the in-core
2840 * orphan list manually. */
2842 ext3_orphan_del(NULL, inode);
2844 if (!rc && (ia_valid & ATTR_MODE))
2845 rc = ext3_acl_chmod(inode);
2848 ext3_std_error(inode->i_sb, error);
2856 * akpm: how many blocks doth make a writepage()?
2858 * With N blocks per page, it may be:
2863 * N+5 bitmap blocks (from the above)
2864 * N+5 group descriptor summary blocks
2867 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
2869 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
2871 * With ordered or writeback data it's the same, less the N data blocks.
2873 * If the inode's direct blocks can hold an integral number of pages then a
2874 * page cannot straddle two indirect blocks, and we can only touch one indirect
2875 * and dindirect block, and the "5" above becomes "3".
2877 * This still overestimates under most circumstances. If we were to pass the
2878 * start and end offsets in here as well we could do block_to_path() on each
2879 * block and work out the exact number of indirects which are touched. Pah.
/*
 * Worst-case number of journal credits needed to write one page:
 * data blocks per page plus the indirect/bitmap/group-descriptor
 * blocks they may touch ("5" shrinks to "3" when pages cannot
 * straddle an indirect block), tripled for data journalling,
 * doubled otherwise, plus quota overhead.
 */
2882 int ext3_writepage_trans_blocks(struct inode *inode)
2884 int bpp = ext3_journal_blocks_per_page(inode);
2885 int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
2888 if (ext3_should_journal_data(inode))
2889 ret = 3 * (bpp + indirects) + 2;
2891 ret = 2 * (bpp + indirects) + 2;
2894 /* We know that structure was already allocated during DQUOT_INIT so
2895 * we will be updating only the data blocks + inodes */
2896 ret += 2*EXT3_QUOTA_TRANS_BLOCKS;
2903 * The caller must have previously called ext3_reserve_inode_write().
2904 * Given this, we know that the caller already has write access to iloc->bh.
/*
 * Flush the in-core inode into the previously reserved on-disk
 * location.  The caller must have called ext3_reserve_inode_write()
 * first, so write access to iloc->bh is already held; the bh
 * reference is consumed here.
 */
2906 int ext3_mark_iloc_dirty(handle_t *handle,
2907 struct inode *inode, struct ext3_iloc *iloc)
2911 /* the do_update_inode consumes one bh->b_count */
2914 /* ext3_do_update_inode() does journal_dirty_metadata */
2915 err = ext3_do_update_inode(handle, inode, iloc);
2921 * On success, we end up with an outstanding reference count against
2922 * iloc->bh. This _must_ be cleaned up later.
/* Locate the inode's buffer (read skipped — in_mem=1) and obtain
 * journal write access to it, so a later ext3_mark_iloc_dirty() can
 * safely modify the raw inode. */
2926 ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
2927 struct ext3_iloc *iloc)
2931 err = ext3_get_inode_loc(inode, iloc, 1);
2933 BUFFER_TRACE(iloc->bh, "get_write_access");
2934 err = ext3_journal_get_write_access(handle, iloc->bh);
2941 ext3_std_error(inode->i_sb, err);
2946 * akpm: What we do here is to mark the in-core inode as clean
2947 * with respect to inode dirtiness (it may still be data-dirty).
2948 * This means that the in-core inode may be reaped by prune_icache
2949 * without having to perform any I/O. This is a very good thing,
2950 * because *any* task may call prune_icache - even ones which
2951 * have a transaction open against a different journal.
2953 * Is this cheating? Not really. Sure, we haven't written the
2954 * inode out, but prune_icache isn't a user-visible syncing function.
2955 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
2956 * we start and wait on commits.
2958 * Is this efficient/effective? Well, we're being nice to the system
2959 * by cleaning up our inodes proactively so they can be reaped
2960 * without I/O. But we are potentially leaving up to five seconds'
2961 * worth of inodes floating about which prune_icache wants us to
2962 * write out. One way to fix that would be to get prune_icache()
2963 * to do a write_super() to free up some memory. It has the desired
/*
 * Copy the in-core inode into its journalled on-disk buffer:
 * reserve write access to the inode's block, then flush the inode
 * into it.  After this the in-core inode is clean w.r.t. inode
 * dirtiness and may be reaped by prune_icache without I/O.
 */
2966 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
2968 struct ext3_iloc iloc;
2972 err = ext3_reserve_inode_write(handle, inode, &iloc);
2974 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2979 * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
2981 * We're really interested in the case where a file is being extended.
2982 * i_size has been changed by generic_commit_write() and we thus need
2983 * to include the updated inode in the current transaction.
2985 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
2986 * are allocated to the file.
2988 * If the inode is marked synchronous, we don't honour that here - doing
2989 * so would cause a commit on atime updates, which we don't bother doing.
2990 * We handle synchronous inodes at the highest possible level.
/*
 * VFS ->dirty_inode hook: start a small (2-credit) transaction just to
 * journal the updated inode.  If the task already holds a handle it
 * must belong to the same journal — a mismatch means a transaction is
 * open against a different filesystem, which is a hard bug (EMERG).
 */
2992 void ext3_dirty_inode(struct inode *inode)
2994 handle_t *current_handle = ext3_journal_current_handle();
2997 handle = ext3_journal_start(inode, 2);
3000 if (current_handle &&
3001 current_handle->h_transaction != handle->h_transaction) {
3002 /* This task has a transaction open against a different fs */
3003 printk(KERN_EMERG "%s: transactions do not match!\n",
3006 jbd_debug(5, "marking dirty. outer handle=%p\n",
3008 ext3_mark_inode_dirty(handle, inode);
3010 ext3_journal_stop(handle);
3017 * Bind an inode's backing buffer_head into this transaction, to prevent
3018 * it from being flushed to disk early. Unlike
3019 * ext3_reserve_inode_write, this leaves behind no bh reference and
3020 * returns no iloc structure, so the caller needs to repeat the iloc
3021 * lookup to mark the inode dirty later.
/*
 * Bind the inode's backing buffer into this transaction so it is not
 * flushed to disk early.  Unlike ext3_reserve_inode_write() no bh
 * reference or iloc survives the call; callers must redo the iloc
 * lookup to mark the inode dirty later.
 */
3024 ext3_pin_inode(handle_t *handle, struct inode *inode)
3026 struct ext3_iloc iloc;
3030 err = ext3_get_inode_loc(inode, &iloc, 1);
3032 BUFFER_TRACE(iloc.bh, "get_write_access");
3033 err = journal_get_write_access(handle, iloc.bh);
3035 err = ext3_journal_dirty_metadata(handle,
3040 ext3_std_error(inode->i_sb, err);
/*
 * Switch an inode's data-journalling mode on (val != 0) or off.  The
 * journal must be drained and flushed under journal_lock_updates()
 * before the flag flips, otherwise a journalled data block could be
 * deleted without being revoked and corrupt data on replay.
 * NOTE(review): this function continues past the end of this extract.
 */
3045 int ext3_change_inode_journal_flag(struct inode *inode, int val)
3052 * We have to be very careful here: changing a data block's
3053 * journaling status dynamically is dangerous. If we write a
3054 * data block to the journal, change the status and then delete
3055 * that block, we risk forgetting to revoke the old log record
3056 * from the journal and so a subsequent replay can corrupt data.
3057 * So, first we make sure that the journal is empty and that
3058 * nobody is changing anything.
3061 journal = EXT3_JOURNAL(inode);
3062 if (is_journal_aborted(journal) || IS_RDONLY(inode))
/* Quiesce the journal: no running updates, everything checkpointed. */
3065 journal_lock_updates(journal);
3066 journal_flush(journal);
3069 * OK, there are no updates running now, and all cached data is
3070 * synced to disk. We are now in a completely consistent state
3071 * which doesn't have anything in the journal, and we know that
3072 * no filesystem updates are running, so it is safe to modify
3073 * the inode's in-core data-journaling state flag now.
3077 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3079 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
/* The aops differ between journalled and non-journalled data modes. */
3080 ext3_set_aops(inode);
3082 journal_unlock_updates(journal);
3084 /* Finally we can mark the inode as dirty. */
3086 handle = ext3_journal_start(inode, 1);
3088 return PTR_ERR(handle);
3090 err = ext3_mark_inode_dirty(handle, inode);
3092 ext3_journal_stop(handle);
3093 ext3_std_error(inode->i_sb, err);