X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=fs%2Fext3%2Finode.c;h=23d956ff0870ecc23f6679675256300fbff63f98;hb=16c70f8c1b54b61c3b951b6fb220df250fe09b32;hp=da40fc7c6264da9fd8dc658658cda30b4fee94ed;hpb=9bf4aaab3e101692164d49b7ca357651eb691cb6;p=linux-2.6.git diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index da40fc7c6..23d956ff0 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -40,19 +40,21 @@ #include "xattr.h" #include "acl.h" +static int ext3_writepage_trans_blocks(struct inode *inode); + /* * Test whether an inode is a fast symlink. */ -static inline int ext3_inode_is_fast_symlink(struct inode *inode) +static int ext3_inode_is_fast_symlink(struct inode *inode) { int ea_blocks = EXT3_I(inode)->i_file_acl ? (inode->i_sb->s_blocksize >> 9) : 0; - return (S_ISLNK(inode->i_mode) && - inode->i_blocks - ea_blocks == 0); + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } -/* The ext3 forget function must perform a revoke if we are freeing data +/* + * The ext3 forget function must perform a revoke if we are freeing data * which has been journaled. Metadata (eg. indirect blocks) must be * revoked in all cases. * @@ -60,13 +62,13 @@ static inline int ext3_inode_is_fast_symlink(struct inode *inode) * but there may still be a record of it in the journal, and that record * still needs to be revoked. */ - -int ext3_forget(handle_t *handle, int is_metadata, - struct inode *inode, struct buffer_head *bh, - int blocknr) +int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t blocknr) { int err; + might_sleep(); + BUFFER_TRACE(bh, "enter"); jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " @@ -83,7 +85,7 @@ int ext3_forget(handle_t *handle, int is_metadata, (!is_metadata && !ext3_should_journal_data(inode))) { if (bh) { BUFFER_TRACE(bh, "call journal_forget"); - ext3_journal_forget(handle, bh); + return ext3_journal_forget(handle, bh); } return 0; } @@ -101,10 +103,9 @@ int ext3_forget(handle_t *handle, int is_metadata, } /* - * Work out how many blocks we need to progress with the next chunk of a + * Work out how many blocks we need to proceed with the next chunk of a * truncate transaction. */ - static unsigned long blocks_for_truncate(struct inode *inode) { unsigned long needed; @@ -125,7 +126,7 @@ static unsigned long blocks_for_truncate(struct inode *inode) if (needed > EXT3_MAX_TRANS_DATA) needed = EXT3_MAX_TRANS_DATA; - return EXT3_DATA_TRANS_BLOCKS + needed; + return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed; } /* @@ -138,7 +139,6 @@ static unsigned long blocks_for_truncate(struct inode *inode) * extend fails, we need to propagate the failure up and restart the * transaction in the top-level truncate loop. --sct */ - static handle_t *start_transaction(struct inode *inode) { handle_t *result; @@ -177,21 +177,6 @@ static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) return ext3_journal_restart(handle, blocks_for_truncate(inode)); } -/* - * Called at each iput() - * - * The inode may be "bad" if ext3_read_inode() saw an error from - * ext3_get_inode(), so we need to check that to avoid freeing random disk - * blocks. - */ -void ext3_put_inode(struct inode *inode) -{ - if (!is_bad_inode(inode)) - ext3_discard_prealloc(inode); -} - -static void ext3_truncate_nocheck (struct inode *inode); - /* * Called at the last iput() if i_nlink is zero. */ @@ -199,14 +184,18 @@ void ext3_delete_inode (struct inode * inode) { handle_t *handle; + truncate_inode_pages(&inode->i_data, 0); + if (is_bad_inode(inode)) goto no_delete; handle = start_transaction(inode); if (IS_ERR(handle)) { - /* If we're going to skip the normal cleanup, we still - * need to make sure that the in-core orphan linked list - * is properly cleaned up. */ + /* + * If we're going to skip the normal cleanup, we still need to + * make sure that the in-core orphan linked list is properly + * cleaned up. + */ ext3_orphan_del(NULL, inode); goto no_delete; } @@ -215,7 +204,7 @@ void ext3_delete_inode (struct inode * inode) handle->h_sync = 1; inode->i_size = 0; if (inode->i_blocks) - ext3_truncate_nocheck(inode); + ext3_truncate(inode); /* * Kill off the orphan record which ext3_truncate created. * AKPM: I think this can be inside the above `if'. @@ -245,79 +234,19 @@ no_delete: clear_inode(inode); /* We must guarantee clearing of inode... */ } -void ext3_discard_prealloc (struct inode * inode) -{ -#ifdef EXT3_PREALLOCATE - struct ext3_inode_info *ei = EXT3_I(inode); - /* Writer: ->i_prealloc* */ - if (ei->i_prealloc_count) { - unsigned short total = ei->i_prealloc_count; - unsigned long block = ei->i_prealloc_block; - ei->i_prealloc_count = 0; - ei->i_prealloc_block = 0; - /* Writer: end */ - ext3_free_blocks (inode, block, total); - } -#endif -} - -static int ext3_alloc_block (handle_t *handle, - struct inode * inode, unsigned long goal, int *err) -{ - unsigned long result; - -#ifdef EXT3_PREALLOCATE -#ifdef EXT3FS_DEBUG - static unsigned long alloc_hits, alloc_attempts; -#endif - struct ext3_inode_info *ei = EXT3_I(inode); - /* Writer: ->i_prealloc* */ - if (ei->i_prealloc_count && - (goal == ei->i_prealloc_block || - goal + 1 == ei->i_prealloc_block)) - { - result = ei->i_prealloc_block++; - ei->i_prealloc_count--; - /* Writer: end */ - ext3_debug ("preallocation hit (%lu/%lu).\n", - ++alloc_hits, ++alloc_attempts); - } else { - ext3_discard_prealloc (inode); - ext3_debug ("preallocation miss (%lu/%lu).\n", - alloc_hits, ++alloc_attempts); - if (S_ISREG(inode->i_mode)) - result = ext3_new_block (inode, goal, - &ei->i_prealloc_count, - &ei->i_prealloc_block, err); - else - result = ext3_new_block(inode, goal, NULL, NULL, err); - /* - * AKPM: this is somewhat sticky. I'm not surprised it was - * disabled in 2.2's ext3. Need to integrate b_committed_data - * guarding with preallocation, if indeed preallocation is - * effective. - */ - } -#else - result = ext3_new_block(handle, inode, goal, NULL, NULL, err); -#endif - return result; -} - - typedef struct { - u32 *p; - u32 key; + __le32 *p; + __le32 key; struct buffer_head *bh; } Indirect; -static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v) +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) { p->key = *(p->p = v); p->bh = bh; } -static inline int verify_chain(Indirect *from, Indirect *to) +static int verify_chain(Indirect *from, Indirect *to) { while (from <= to && from->key == *from->p) from++; @@ -387,10 +316,10 @@ static int ext3_block_to_path(struct inode *inode, offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else { - ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big"); + ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big"); } if (boundary) - *boundary = (i_block & (ptrs - 1)) == (final - 1); + *boundary = final - 1 - (i_block & (ptrs - 1)); return n; } @@ -442,7 +371,7 @@ static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets, /* Reader: pointers */ if (!verify_chain(chain, p)) goto changed; - add_chain(++p, bh, (u32*)bh->b_data + *++offsets); + add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); /* Reader: end */ if (!p->key) goto no_block; @@ -479,30 +408,29 @@ no_block: * * Caller must make sure that @ind is valid and will stay that way. */ - -static unsigned long ext3_find_near(struct inode *inode, Indirect *ind) +static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind) { struct ext3_inode_info *ei = EXT3_I(inode); - u32 *start = ind->bh ? (u32*) ind->bh->b_data : ei->i_data; - u32 *p; - unsigned long bg_start; - unsigned long colour; + __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; + __le32 *p; + ext3_fsblk_t bg_start; + ext3_grpblk_t colour; /* Try to find previous block */ - for (p = ind->p - 1; p >= start; p--) + for (p = ind->p - 1; p >= start; p--) { if (*p) return le32_to_cpu(*p); + } /* No such thing, so let's try location of indirect block */ if (ind->bh) return ind->bh->b_blocknr; /* - * It is going to be refered from inode itself? OK, just put it into - * the same cylinder group then. + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. */ - bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + - le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); + bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group); colour = (current->pid % 16) * (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); return bg_start + colour; @@ -517,51 +445,143 @@ static unsigned long ext3_find_near(struct inode *inode, Indirect *ind) * @goal: place to store the result. * * Normally this function find the prefered place for block allocation, - * stores it in *@goal and returns zero. If the branch had been changed - * under us we return -EAGAIN. + * stores it in *@goal and returns zero. */ -static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4], - Indirect *partial, unsigned long *goal) +static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block, + Indirect chain[4], Indirect *partial) { - struct ext3_inode_info *ei = EXT3_I(inode); - /* Writer: ->i_next_alloc* */ - if (block == ei->i_next_alloc_block + 1) { - ei->i_next_alloc_block++; - ei->i_next_alloc_goal++; + struct ext3_block_alloc_info *block_i; + + block_i = EXT3_I(inode)->i_block_alloc_info; + + /* + * try the heuristic for sequential allocation, + * failing that at least try to get decent locality. + */ + if (block_i && (block == block_i->last_alloc_logical_block + 1) + && (block_i->last_alloc_physical_block != 0)) { + return block_i->last_alloc_physical_block + 1; } - /* Writer: end */ - /* Reader: pointers, ->i_next_alloc* */ - if (verify_chain(chain, partial)) { - /* - * try the heuristic for sequential allocation, - * failing that at least try to get decent locality. - */ - if (block == ei->i_next_alloc_block) - *goal = ei->i_next_alloc_goal; - if (!*goal) - *goal = ext3_find_near(inode, partial); - return 0; + + return ext3_find_near(inode, partial); +} + +/** + * ext3_blks_to_allocate: Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks, + int blocks_to_boundary) +{ + unsigned long count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext3_alloc_blocks: multiple allocate blocks needed for a branch + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @blks: on return it will store the total number of allocated + * direct blocks + */ +static int ext3_alloc_blocks(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int indirect_blks, int blks, + ext3_fsblk_t new_blocks[4], int *err) +{ + int target, i; + unsigned long count = 0; + int index = 0; + ext3_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + target = blks + indirect_blks; + + while (1) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext3_new_blocks(handle,inode,goal,&count,err); + if (*err) + goto failed_out; + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + + if (count > 0) + break; } - /* Reader: end */ - return -EAGAIN; + + /* save the new block number for the first direct block */ + new_blocks[index] = current_block; + + /* total number of blocks allocated for direct blocks */ + ret = count; + *err = 0; + return ret; +failed_out: + for (i = 0; i key). Upon the exit we have the same - * picture as after the successful ext3_get_block(), excpet that in one + * picture as after the successful ext3_get_block(), except that in one * place chain is disconnected - *branch->p is still zero (we did not * set the last link), but branch->key contains the number that should * be placed into *branch->p to fill that gap. @@ -571,98 +591,106 @@ static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4], * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain * as described above and return 0. */ - static int ext3_alloc_branch(handle_t *handle, struct inode *inode, - int num, - unsigned long goal, - int *offsets, - Indirect *branch) + int indirect_blks, int *blks, ext3_fsblk_t goal, + int *offsets, Indirect *branch) { int blocksize = inode->i_sb->s_blocksize; - int n = 0, keys = 0; + int i, n = 0; int err = 0; - int i; - int parent = ext3_alloc_block(handle, inode, goal, &err); - - branch[0].key = cpu_to_le32(parent); - if (parent) { - for (n = 1; n < num; n++) { - struct buffer_head *bh; - /* Allocate the next block */ - int nr = ext3_alloc_block(handle, inode, parent, &err); - if (!nr) - break; - branch[n].key = cpu_to_le32(nr); - keys = n+1; + struct buffer_head *bh; + int num; + ext3_fsblk_t new_blocks[4]; + ext3_fsblk_t current_block; - /* - * Get buffer_head for parent block, zero it out - * and set the pointer to new one, then send - * parent to disk. - */ - bh = sb_getblk(inode->i_sb, parent); - branch[n].bh = bh; - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - err = ext3_journal_get_create_access(handle, bh); - if (err) { - unlock_buffer(bh); - brelse(bh); - break; - } + num = ext3_alloc_blocks(handle, inode, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; - memset(bh->b_data, 0, blocksize); - branch[n].p = (u32*) bh->b_data + offsets[n]; - *branch[n].p = branch[n].key; - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext3_journal_get_create_access(handle, bh); + if (err) { unlock_buffer(bh); + brelse(bh); + goto failed; + } - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bh); - if (err) - break; - - parent = nr; + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if ( n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i=1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); } - } - if (n == num) - return 0; + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + goto failed; + } + *blks = num; + return err; +failed: /* Allocation failed, free what we already allocated */ - for (i = 1; i < keys; i++) { + for (i = 1; i <= n ; i++) { BUFFER_TRACE(branch[i].bh, "call journal_forget"); ext3_journal_forget(handle, branch[i].bh); } - for (i = 0; i < keys; i++) - ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); + for (i = 0; i i_blocks, etc.). In case of success we end up with the full - * chain to new block and return 0. Otherwise (== chain had been changed) - * we free the new blocks (forgetting their buffer_heads, indeed) and - * return -EAGAIN. + * ext3_splice_branch - splice the allocated branch onto inode. + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext3_alloc_branch) + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. */ - -static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block, - Indirect chain[4], Indirect *where, int num) +static int ext3_splice_branch(handle_t *handle, struct inode *inode, + long block, Indirect *where, int num, int blks) { int i; int err = 0; - struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_block_alloc_info *block_i; + ext3_fsblk_t current_block; + block_i = EXT3_I(inode)->i_block_alloc_info; /* * If we're splicing into a [td]indirect block (as opposed to the * inode) then we need to get write access to the [td]indirect block @@ -674,29 +702,40 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block, if (err) goto err_out; } - /* Verify that place we are splicing to is still there and vacant */ - - /* Writer: pointers, ->i_next_alloc* */ - if (!verify_chain(chain, where-1) || *where->p) - /* Writer: end */ - goto changed; - /* That's it */ *where->p = where->key; - ei->i_next_alloc_block = block; - ei->i_next_alloc_goal = le32_to_cpu(where[num-1].key); - /* Writer: end */ + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i ) = cpu_to_le32(current_block++); + } + + /* + * update the most recently allocated logical & physical block + * in i_block_alloc_info, to assist find the proper goal block for next + * allocation + */ + if (block_i) { + block_i->last_alloc_logical_block = block + blks - 1; + block_i->last_alloc_physical_block = + le32_to_cpu(where[num].key) + blks - 1; + } /* We are done with atomic stuff, now do the rest of housekeeping */ - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, inode); /* had we spliced it onto indirect block? */ if (where->bh) { /* - * akpm: If we spliced it onto an indirect block, we haven't + * If we spliced it onto an indirect block, we haven't * altered the inode. Note however that if it is being spliced * onto an indirect block at the very end of the file (the * file is growing) then we *will* alter the inode to reflect @@ -717,26 +756,14 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block, } return err; -changed: - /* - * AKPM: if where[i].bh isn't part of the current updating - * transaction then we explode nastily. Test this code path. - */ - jbd_debug(1, "the chain changed: try again\n"); - err = -EAGAIN; - err_out: - for (i = 1; i < num; i++) { + for (i = 1; i <= num; i++) { BUFFER_TRACE(where[i].bh, "call journal_forget"); ext3_journal_forget(handle, where[i].bh); + ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); } - /* For the normal collision cleanup case, we free up the blocks. - * On genuine filesystem errors we don't even think about doing - * that. */ - if (err == -EAGAIN) - for (i = 0; i < num; i++) - ext3_free_blocks(handle, inode, - le32_to_cpu(where[i].key), 1); + ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); + return err; } @@ -752,141 +779,184 @@ err_out: * allocations is needed - we simply release blocks and do not touch anything * reachable from inode. * - * akpm: `handle' can be NULL if create == 0. + * `handle' can be NULL if create == 0. * * The BKL may not be held on entry here. Be sure to take it early. + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. */ - -static int -ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create, int extend_disksize) +int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, + sector_t iblock, unsigned long maxblocks, + struct buffer_head *bh_result, + int create, int extend_disksize) { int err = -EIO; int offsets[4]; Indirect chain[4]; Indirect *partial; - unsigned long goal; - int left; - int boundary = 0; - int depth = ext3_block_to_path(inode, iblock, offsets, &boundary); + ext3_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; struct ext3_inode_info *ei = EXT3_I(inode); + int count = 0; + ext3_fsblk_t first_block = 0; + J_ASSERT(handle != NULL || create == 0); + depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); if (depth == 0) goto out; -reread: partial = ext3_get_branch(inode, depth, offsets, chain, &err); /* Simplest case - block found, no allocation needed */ if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); clear_buffer_new(bh_result); -got_it: - map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); - if (boundary) - set_buffer_boundary(bh_result); - /* Clean up and exit */ - partial = chain+depth-1; /* the whole chain */ - goto cleanup; + count++; + /*map more blocks*/ + while (count < maxblocks && count <= blocks_to_boundary) { + ext3_fsblk_t blk; + + if (!verify_chain(chain, partial)) { + /* + * Indirect block might be removed by + * truncate while we were reading it. + * Handling of that case: forget what we've + * got now. Flag the err as EAGAIN, so it + * will reread. + */ + err = -EAGAIN; + count = 0; + break; + } + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + if (err != -EAGAIN) + goto got_it; } /* Next simple case - plain lookup or failed read of indirect block */ - if (!create || err == -EIO) { -cleanup: + if (!create || err == -EIO) + goto cleanup; + + mutex_lock(&ei->truncate_mutex); + + /* + * If the indirect block is missing while we are reading + * the chain(ext3_get_branch() returns -EAGAIN err), or + * if the chain has been changed after we grab the semaphore, + * (either because another process truncated this branch, or + * another get_block allocated this branch) re-grab the chain to see if + * the request block has been allocated or not. + * + * Since we already block the truncate/other get_block + * at this point, we will have the current copy of the chain when we + * splice the branch into the tree. + */ + if (err == -EAGAIN || !verify_chain(chain, partial)) { while (partial > chain) { - BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); partial--; } - BUFFER_TRACE(bh_result, "returned"); -out: - return err; + partial = ext3_get_branch(inode, depth, offsets, chain, &err); + if (!partial) { + count++; + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + clear_buffer_new(bh_result); + goto got_it; + } } /* - * Indirect block might be removed by truncate while we were - * reading it. Handling of that case (forget what we've got and - * reread) is taken out of the main path. - */ - if (err == -EAGAIN) - goto changed; + * Okay, we need to do block allocation. Lazily initialize the block + * allocation info here if necessary + */ + if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) + ext3_init_block_alloc_info(inode); - goal = 0; - down(&ei->truncate_sem); - if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0) { - up(&ei->truncate_sem); - goto changed; - } + goal = ext3_find_goal(inode, iblock, chain, partial); - left = (chain + depth) - partial; + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext3_blks_to_allocate(partial, indirect_blks, + maxblocks, blocks_to_boundary); /* * Block out ext3_truncate while we alter the tree */ - err = ext3_alloc_branch(handle, inode, left, goal, - offsets+(partial-chain), partial); + err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, + offsets + (partial - chain), partial); - /* The ext3_splice_branch call will free and forget any buffers + /* + * The ext3_splice_branch call will free and forget any buffers * on the new chain if there is a failure, but that risks using * up transaction credits, especially for bitmaps where the * credits cannot be returned. Can we handle this somehow? We - * may need to return -EAGAIN upwards in the worst case. --sct */ + * may need to return -EAGAIN upwards in the worst case. --sct + */ if (!err) - err = ext3_splice_branch(handle, inode, iblock, chain, - partial, left); - /* i_disksize growing is protected by truncate_sem - * don't forget to protect it if you're about to implement - * concurrent ext3_get_block() -bzzz */ + err = ext3_splice_branch(handle, inode, iblock, + partial, indirect_blks, count); + /* + * i_disksize growing is protected by truncate_mutex. Don't forget to + * protect it if you're about to implement concurrent + * ext3_get_block() -bzzz + */ if (!err && extend_disksize && inode->i_size > ei->i_disksize) ei->i_disksize = inode->i_size; - up(&ei->truncate_sem); - if (err == -EAGAIN) - goto changed; + mutex_unlock(&ei->truncate_mutex); if (err) goto cleanup; set_buffer_new(bh_result); - goto got_it; - -changed: +got_it: + map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + if (count > blocks_to_boundary) + set_buffer_boundary(bh_result); + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: while (partial > chain) { - jbd_debug(1, "buffer chain changed, retrying\n"); - BUFFER_TRACE(partial->bh, "brelsing"); + BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); partial--; } - goto reread; -} - -static int ext3_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - handle_t *handle = NULL; - int ret; - - if (create) { - handle = ext3_journal_current_handle(); - J_ASSERT(handle != 0); - } - ret = ext3_get_block_handle(handle, inode, iblock, - bh_result, create, 1); - return ret; + BUFFER_TRACE(bh_result, "returned"); +out: + return err; } #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32) -static int -ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock, - unsigned long max_blocks, struct buffer_head *bh_result, - int create) +static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { handle_t *handle = journal_current_handle(); int ret = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - if (!handle) + if (!create) goto get_block; /* A read */ + if (max_blocks == 1) + goto get_block; /* A single block get */ + if (handle->h_transaction->t_state == T_LOCKED) { /* * Huge direct-io writes can hold off commits for long @@ -913,18 +983,22 @@ ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock, } get_block: - if (ret == 0) - ret = ext3_get_block_handle(handle, inode, iblock, - bh_result, create, 0); - bh_result->b_size = (1 << inode->i_blkbits); + if (ret == 0) { + ret = ext3_get_blocks_handle(handle, inode, iblock, + max_blocks, bh_result, create, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + } return ret; } /* * `handle' can be NULL if create is zero */ -struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode, - long block, int create, int * errp) +struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, + long block, int create, int *errp) { struct buffer_head dummy; int fatal = 0, err; @@ -934,25 +1008,41 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode, dummy.b_state = 0; dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); - *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); - if (!*errp && buffer_mapped(&dummy)) { + err = ext3_get_blocks_handle(handle, inode, block, 1, + &dummy, create, 1); + /* + * ext3_get_blocks_handle() returns number of blocks + * mapped. 0 in case of a HOLE. + */ + if (err > 0) { + if (err > 1) + WARN_ON(1); + err = 0; + } + *errp = err; + if (!err && buffer_mapped(&dummy)) { struct buffer_head *bh; bh = sb_getblk(inode->i_sb, dummy.b_blocknr); + if (!bh) { + *errp = -EIO; + goto err; + } if (buffer_new(&dummy)) { J_ASSERT(create != 0); J_ASSERT(handle != 0); - /* Now that we do not always journal data, we - should keep in mind whether this should - always journal the new buffer as metadata. - For now, regular file writes use - ext3_get_block instead, so it's not a - problem. */ + /* + * Now that we do not always journal data, we should + * keep in mind whether this should always journal the + * new buffer as metadata. For now, regular file + * writes use ext3_get_block instead, so it's not a + * problem. + */ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); fatal = ext3_journal_get_create_access(handle, bh); if (!fatal && !buffer_uptodate(bh)) { - memset(bh->b_data, 0, inode->i_sb->s_blocksize); + memset(bh->b_data,0,inode->i_sb->s_blocksize); set_buffer_uptodate(bh); } unlock_buffer(bh); @@ -970,59 +1060,25 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode, } return bh; } +err: return NULL; } -struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode, +struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, int block, int create, int *err) { struct buffer_head * bh; - int prev_blocks; - prev_blocks = inode->i_blocks; - - bh = ext3_getblk (handle, inode, block, create, err); + bh = ext3_getblk(handle, inode, block, create, err); if (!bh) return bh; -#ifdef EXT3_PREALLOCATE - /* - * If the inode has grown, and this is a directory, then use a few - * more of the preallocated blocks to keep directory fragmentation - * down. The preallocated blocks are guaranteed to be contiguous. - */ - if (create && - S_ISDIR(inode->i_mode) && - inode->i_blocks > prev_blocks && - EXT3_HAS_COMPAT_FEATURE(inode->i_sb, - EXT3_FEATURE_COMPAT_DIR_PREALLOC)) { - int i; - struct buffer_head *tmp_bh; - - for (i = 1; - EXT3_I(inode)->i_prealloc_count && - i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks; - i++) { - /* - * ext3_getblk will zero out the contents of the - * directory for us - */ - tmp_bh = ext3_getblk(handle, inode, - block+i, create, err); - if (!tmp_bh) { - brelse (bh); - return 0; - } - brelse (tmp_bh); - } - } -#endif if (buffer_uptodate(bh)) return bh; - ll_rw_block (READ, 1, &bh); - wait_on_buffer (bh); + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; - brelse (bh); + put_bh(bh); *err = -EIO; return NULL; } @@ -1084,9 +1140,8 @@ static int walk_page_buffers( handle_t *handle, * is elevated. We'll still have enough credits for the tiny quotafile * write. */ - -static int do_journal_get_write_access(handle_t *handle, - struct buffer_head *bh) +static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) { if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; @@ -1107,7 +1162,10 @@ retry: ret = PTR_ERR(handle); goto out; } - ret = block_prepare_write(page, from, to, ext3_get_block); + if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) + ret = nobh_prepare_write(page, from, to, ext3_get_block); + else + ret = block_prepare_write(page, from, to, ext3_get_block); if (ret) goto prepare_write_failed; @@ -1124,8 +1182,7 @@ out: return ret; } -static int -ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) +int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) { int err = journal_dirty_data(handle, bh); if (err) @@ -1150,7 +1207,6 @@ static int commit_write_fn(handle_t *handle, struct buffer_head *bh) * ext3 never places buffers on inode->i_mapping->private_list. metadata * buffers are managed internally. */ - static int ext3_ordered_commit_write(struct file *file, struct page *page, unsigned from, unsigned to) { @@ -1191,7 +1247,12 @@ static int ext3_writeback_commit_write(struct file *file, struct page *page, new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; if (new_i_size > EXT3_I(inode)->i_disksize) EXT3_I(inode)->i_disksize = new_i_size; - ret = generic_commit_write(file, page, from, to); + + if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) + ret = nobh_commit_write(file, page, from, to); + else + ret = generic_commit_write(file, page, from, to); + ret2 = ext3_journal_stop(handle); if (!ret) ret = ret2; @@ -1321,7 +1382,7 @@ static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... * * Same applies to ext3_get_block(). We will deadlock on various things like - * lock_journal and i_truncate_sem. + * lock_journal and i_truncate_mutex. * * Setting PF_MEMALLOC here doesn't work - too many internal memory * allocations fail. @@ -1355,7 +1416,7 @@ static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) * we don't need to open a transaction here. */ static int ext3_ordered_writepage(struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc) { struct inode *inode = page->mapping->host; struct buffer_head *page_bufs; @@ -1437,7 +1498,11 @@ static int ext3_writeback_writepage(struct page *page, goto out_fail; } - ret = block_write_full_page(page, ext3_get_block, wbc); + if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) + ret = nobh_writepage(page, ext3_get_block, wbc); + else + ret = block_write_full_page(page, ext3_get_block, wbc); + err = ext3_journal_stop(handle); if (!ret) ret = err; @@ -1466,16 +1531,18 @@ static int ext3_journalled_writepage(struct page *page, goto no_write; } - if (!page_has_buffers(page) || PageChecked(page)) { + if (!page_has_buffers(page) || PageFsMisc(page)) { /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. */ - ClearPageChecked(page); + ClearPageFsMisc(page); ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext3_get_block); - if (ret != 0) + if (ret != 0) { + ext3_journal_stop(handle); goto out_unlock; + } ret = walk_page_buffers(handle, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); @@ -1518,7 +1585,7 @@ ext3_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); } -static int ext3_invalidatepage(struct page *page, unsigned long offset) +static void ext3_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); @@ -1526,16 +1593,18 @@ static int ext3_invalidatepage(struct page *page, unsigned long offset) * If it's a full truncate we just forget about the pending dirtying */ if (offset == 0) - ClearPageChecked(page); + ClearPageFsMisc(page); - return journal_invalidatepage(journal, page, offset); + journal_invalidatepage(journal, page, offset); } -static int ext3_releasepage(struct page *page, int wait) +static int ext3_releasepage(struct page *page, gfp_t wait) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); - WARN_ON(PageChecked(page)); + WARN_ON(PageFsMisc(page)); + if (!page_has_buffers(page)) + return 0; return journal_try_to_free_buffers(journal, page, wait); } @@ -1578,22 +1647,32 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, - ext3_direct_io_get_blocks, NULL); + ext3_get_block, NULL); + + /* + * Reacquire the handle: ext3_get_block() can restart the transaction + */ + handle = journal_current_handle(); out_stop: if (handle) { int err; - if (orphan) + if (orphan && inode->i_nlink) ext3_orphan_del(handle, inode); if (orphan && ret > 0) { loff_t end = offset + ret; if (end > inode->i_size) { ei->i_disksize = end; i_size_write(inode, end); - err = ext3_mark_inode_dirty(handle, inode); - if (!ret) - ret = err; + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext3_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext3_mark_inode_dirty(handle, inode); } } err = ext3_journal_stop(handle); @@ -1619,11 +1698,11 @@ out: */ static int ext3_journalled_set_page_dirty(struct page *page) { - SetPageChecked(page); + SetPageFsMisc(page); return __set_page_dirty_nobuffers(page); } -static struct address_space_operations ext3_ordered_aops = { +static const struct address_space_operations ext3_ordered_aops = { .readpage = ext3_readpage, .readpages = ext3_readpages, .writepage = ext3_ordered_writepage, @@ -1634,9 +1713,10 @@ static struct address_space_operations ext3_ordered_aops = { .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, }; -static struct address_space_operations ext3_writeback_aops = { +static const struct address_space_operations ext3_writeback_aops = { .readpage = ext3_readpage, .readpages = ext3_readpages, .writepage = ext3_writeback_writepage, @@ -1647,9 +1727,10 @@ static struct address_space_operations ext3_writeback_aops = { .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, }; -static struct address_space_operations ext3_journalled_aops = { +static const struct address_space_operations ext3_journalled_aops = { .readpage = ext3_readpage, .readpages = ext3_readpages, .writepage = ext3_journalled_writepage, @@ -1681,18 +1762,32 @@ void ext3_set_aops(struct inode *inode) static int ext3_block_truncate_page(handle_t *handle, struct page *page, struct address_space *mapping, loff_t from) { - unsigned long index = from >> PAGE_CACHE_SHIFT; + ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); unsigned blocksize, iblock, length, pos; struct inode *inode = mapping->host; struct buffer_head *bh; - int err; + int err = 0; void *kaddr; blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + /* + * For "nobh" option, we can only work if we don't need to + * read-in the page - otherwise we create buffers to do the IO. + */ + if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && + ext3_should_writeback_data(inode) && PageUptodate(page)) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, length); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_page_dirty(page); + goto unlock; + } + if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); @@ -1768,7 +1863,7 @@ unlock: * or memcmp with zero_page, whatever is better for particular architecture. * Linus? */ -static inline int all_zeroes(u32 *p, u32 *q) +static inline int all_zeroes(__le32 *p, __le32 *q) { while (p < q) if (*p++) @@ -1811,11 +1906,8 @@ static inline int all_zeroes(u32 *p, u32 *q) * c) free the subtrees growing from the inode past the @chain[0]. * (no partially truncated stuff there). */ -static Indirect *ext3_find_shared(struct inode *inode, - int depth, - int offsets[4], - Indirect chain[4], - u32 *top) +static Indirect *ext3_find_shared(struct inode *inode, int depth, + int offsets[4], Indirect chain[4], __le32 *top) { Indirect *partial, *p; int k, err; @@ -1835,7 +1927,7 @@ static Indirect *ext3_find_shared(struct inode *inode, if (!partial->key && *partial->p) /* Writer: end */ goto no_top; - for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--) + for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) ; /* * OK, we've found the last block that must survive. The rest of our @@ -1854,8 +1946,7 @@ static Indirect *ext3_find_shared(struct inode *inode, } /* Writer: end */ - while(partial > p) - { + while(partial > p) { brelse(partial->bh); partial--; } @@ -1871,12 +1962,11 @@ no_top: * We release `count' blocks on disk, but (last - first) may be greater * than `count' because there can be holes in there. */ -static void -ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, - unsigned long block_to_free, unsigned long count, - u32 *first, u32 *last) +static void ext3_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext3_fsblk_t block_to_free, + unsigned long count, __le32 *first, __le32 *last) { - u32 *p; + __le32 *p; if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); @@ -1932,15 +2022,16 @@ ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, * block pointers. */ static void ext3_free_data(handle_t *handle, struct inode *inode, - struct buffer_head *this_bh, u32 *first, u32 *last) + struct buffer_head *this_bh, + __le32 *first, __le32 *last) { - unsigned long block_to_free = 0; /* Starting block # of a run */ + ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */ unsigned long count = 0; /* Number of blocks in the run */ - u32 *block_to_free_p = NULL; /* Pointer into inode/ind + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind corresponding to block_to_free */ - unsigned long nr; /* Current block # */ - u32 *p; /* Pointer into inode/ind + ext3_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind for current block */ int err; @@ -1999,10 +2090,10 @@ static void ext3_free_data(handle_t *handle, struct inode *inode, */ static void ext3_free_branches(handle_t *handle, struct inode *inode, struct buffer_head *parent_bh, - u32 *first, u32 *last, int depth) + __le32 *first, __le32 *last, int depth) { - unsigned long nr; - u32 *p; + ext3_fsblk_t nr; + __le32 *p; if (is_handle_aborted(handle)) return; @@ -2025,15 +2116,16 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, */ if (!bh) { ext3_error(inode->i_sb, "ext3_free_branches", - "Read failure, inode=%ld, block=%ld", + "Read failure, inode=%lu, block="E3FSBLK, inode->i_ino, nr); continue; } /* This zaps the entire block. Bottom up. */ BUFFER_TRACE(bh, "free child branches"); - ext3_free_branches(handle, inode, bh, (u32*)bh->b_data, - (u32*)bh->b_data + addr_per_block, + ext3_free_branches(handle, inode, bh, + (__le32*)bh->b_data, + (__le32*)bh->b_data + addr_per_block, depth); /* @@ -2133,18 +2225,17 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, * that's fine - as long as they are linked from the inode, the post-crash * ext3_truncate() run will find them and release them. */ - -void ext3_truncate_nocheck(struct inode * inode) +void ext3_truncate(struct inode *inode) { handle_t *handle; struct ext3_inode_info *ei = EXT3_I(inode); - u32 *i_data = ei->i_data; + __le32 *i_data = ei->i_data; int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); struct address_space *mapping = inode->i_mapping; int offsets[4]; Indirect chain[4]; Indirect *partial; - int nr = 0; + __le32 nr = 0; int n; long last_block; unsigned blocksize = inode->i_sb->s_blocksize; @@ -2155,8 +2246,8 @@ void ext3_truncate_nocheck(struct inode * inode) return; if (ext3_inode_is_fast_symlink(inode)) return; - - ext3_discard_prealloc(inode); + if (IS_APPEND(inode) || IS_IXORUNLINK(inode)) + return; /* * We have to lock the EOF page here, because lock_page() nests @@ -2218,7 +2309,7 @@ void ext3_truncate_nocheck(struct inode * inode) * From here we block out all ext3_get_block() callers who want to * modify the block allocation tree. */ - down(&ei->truncate_sem); + mutex_lock(&ei->truncate_mutex); if (n == 1) { /* direct blocks */ ext3_free_data(handle, inode, NULL, i_data+offsets[0], @@ -2249,7 +2340,7 @@ void ext3_truncate_nocheck(struct inode * inode) /* Clear the ends of indirect blocks on the shared branch */ while (partial > chain) { ext3_free_branches(handle, inode, partial->bh, partial->p + 1, - (u32*)partial->bh->b_data + addr_per_block, + (__le32*)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); BUFFER_TRACE(partial->bh, "call brelse"); brelse (partial->bh); @@ -2258,36 +2349,38 @@ void ext3_truncate_nocheck(struct inode * inode) do_indirects: /* Kill the remaining (whole) subtrees */ switch (offsets[0]) { - default: - nr = i_data[EXT3_IND_BLOCK]; - if (nr) { - ext3_free_branches(handle, inode, NULL, - &nr, &nr+1, 1); - i_data[EXT3_IND_BLOCK] = 0; - } - case EXT3_IND_BLOCK: - nr = i_data[EXT3_DIND_BLOCK]; - if (nr) { - ext3_free_branches(handle, inode, NULL, - &nr, &nr+1, 2); - i_data[EXT3_DIND_BLOCK] = 0; - } - case EXT3_DIND_BLOCK: - nr = i_data[EXT3_TIND_BLOCK]; - if (nr) { - ext3_free_branches(handle, inode, NULL, - &nr, &nr+1, 3); - i_data[EXT3_TIND_BLOCK] = 0; - } - case EXT3_TIND_BLOCK: - ; + default: + nr = i_data[EXT3_IND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT3_IND_BLOCK] = 0; + } + case EXT3_IND_BLOCK: + nr = i_data[EXT3_DIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT3_DIND_BLOCK] = 0; + } + case EXT3_DIND_BLOCK: + nr = i_data[EXT3_TIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT3_TIND_BLOCK] = 0; + } + case EXT3_TIND_BLOCK: + ; } - up(&ei->truncate_sem); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + ext3_discard_reservation(inode); + + mutex_unlock(&ei->truncate_mutex); + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, inode); - /* In a multi-transaction truncate, we only make the final - * transaction synchronous */ + /* + * In a multi-transaction truncate, we only make the final transaction + * synchronous + */ if (IS_SYNC(inode)) handle->h_sync = 1; out_stop: @@ -2304,29 +2397,30 @@ out_stop: ext3_journal_stop(handle); } -static unsigned long ext3_get_inode_block(struct super_block *sb, +static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, unsigned long ino, struct ext3_iloc *iloc) { unsigned long desc, group_desc, block_group; - unsigned long offset, block; + unsigned long offset; + ext3_fsblk_t block; struct buffer_head *bh; struct ext3_group_desc * gdp; - if ((ino != EXT3_ROOT_INO && - ino != EXT3_JOURNAL_INO && - ino < EXT3_FIRST_INO(sb)) || - ino > le32_to_cpu( - EXT3_SB(sb)->s_es->s_inodes_count)) { - ext3_error (sb, "ext3_get_inode_block", - "bad inode number: %lu", ino); + if (!ext3_valid_inum(sb, ino)) { + /* + * This error is already checked for in namei.c unless we are + * looking at an NFS filehandle, in which case no error + * report is needed + */ return 0; } + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); if (block_group >= EXT3_SB(sb)->s_groups_count) { - ext3_error (sb, "ext3_get_inode_block", - "group >= groups count"); + ext3_error(sb,"ext3_get_inode_block","group >= groups count"); return 0; } + smp_rmb(); group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb); desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1); bh = EXT3_SB(sb)->s_group_desc[group_desc]; @@ -2336,7 +2430,7 @@ static unsigned long ext3_get_inode_block(struct super_block *sb, return 0; } - gdp = (struct ext3_group_desc *) bh->b_data; + gdp = (struct ext3_group_desc *)bh->b_data; /* * Figure out the offset within the block group inode table */ @@ -2350,16 +2444,16 @@ static unsigned long ext3_get_inode_block(struct super_block *sb, return block; } -/* +/* * ext3_get_inode_loc returns with an extra refcount against the inode's - * underlying buffer_head on success. If `in_mem' is false then we're purely - * trying to determine the inode's location on-disk and no read need be - * performed. + * underlying buffer_head on success. If 'in_mem' is true, we have all + * data in memory that is needed to recreate the on-disk version of this + * inode. */ -static int ext3_get_inode_loc(struct inode *inode, +static int __ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc, int in_mem) { - unsigned long block; + ext3_fsblk_t block; struct buffer_head *bh; block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc); @@ -2370,7 +2464,8 @@ static int ext3_get_inode_loc(struct inode *inode, if (!bh) { ext3_error (inode->i_sb, "ext3_get_inode_loc", "unable to read inode block - " - "inode=%lu, block=%lu", inode->i_ino, block); + "inode=%lu, block="E3FSBLK, + inode->i_ino, block); return -EIO; } if (!buffer_uptodate(bh)) { @@ -2381,7 +2476,11 @@ static int ext3_get_inode_loc(struct inode *inode, goto has_buffer; } - /* we can't skip I/O if inode is on a disk only */ + /* + * If we have all information of the inode in memory and this + * is the only valid inode in the block, we need not read the + * block. + */ if (in_mem) { struct buffer_head *bitmap_bh; struct ext3_group_desc *desc; @@ -2390,10 +2489,6 @@ static int ext3_get_inode_loc(struct inode *inode, int block_group; int start; - /* - * If this is the only valid inode in the block we - * need not read the block. - */ block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb); inodes_per_buffer = bh->b_size / @@ -2440,8 +2535,9 @@ static int ext3_get_inode_loc(struct inode *inode, make_io: /* - * There are another valid inodes in the buffer so we must - * read the block from disk + * There are other valid inodes in the buffer, this inode + * has in-inode xattrs, or we don't have this inode in memory. + * Read the block from disk. */ get_bh(bh); bh->b_end_io = end_buffer_read_sync; @@ -2450,7 +2546,7 @@ make_io: if (!buffer_uptodate(bh)) { ext3_error(inode->i_sb, "ext3_get_inode_loc", "unable to read inode block - " - "inode=%lu, block=%lu", + "inode=%lu, block="E3FSBLK, inode->i_ino, block); brelse(bh); return -EIO; @@ -2461,34 +2557,88 @@ has_buffer: return 0; } -void ext3_truncate(struct inode * inode) +int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) { - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - ext3_truncate_nocheck(inode); + /* We have all inode data except xattrs in memory here. */ + return __ext3_get_inode_loc(inode, iloc, + !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)); } void ext3_set_inode_flags(struct inode *inode) { unsigned int flags = EXT3_I(inode)->i_flags; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - if (flags & EXT3_SYNC_FL) - inode->i_flags |= S_SYNC; - if (flags & EXT3_APPEND_FL) - inode->i_flags |= S_APPEND; + inode->i_flags &= ~(S_IMMUTABLE | S_IUNLINK | S_BARRIER | + S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); + if (flags & EXT3_IMMUTABLE_FL) inode->i_flags |= S_IMMUTABLE; if (flags & EXT3_IUNLINK_FL) inode->i_flags |= S_IUNLINK; if (flags & EXT3_BARRIER_FL) inode->i_flags |= S_BARRIER; + + if (flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT3_APPEND_FL) + inode->i_flags |= S_APPEND; if (flags & EXT3_NOATIME_FL) inode->i_flags |= S_NOATIME; if (flags & EXT3_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; } +int ext3_sync_flags(struct inode *inode) +{ + unsigned int oldflags, newflags; + int err = 0; + + oldflags = EXT3_I(inode)->i_flags; + newflags = oldflags & ~(EXT3_APPEND_FL | + EXT3_IMMUTABLE_FL | EXT3_IUNLINK_FL | + EXT3_BARRIER_FL | EXT3_NOATIME_FL | + EXT3_SYNC_FL | EXT3_DIRSYNC_FL); + + if (IS_APPEND(inode)) + newflags |= EXT3_APPEND_FL; + if (IS_IMMUTABLE(inode)) + newflags |= EXT3_IMMUTABLE_FL; + if (IS_IUNLINK(inode)) + newflags |= EXT3_IUNLINK_FL; + if (IS_BARRIER(inode)) + newflags |= EXT3_BARRIER_FL; + + /* we do not want to copy superblock flags */ + if (inode->i_flags & S_NOATIME) + newflags |= EXT3_NOATIME_FL; + if (inode->i_flags & S_SYNC) + newflags |= EXT3_SYNC_FL; + if (inode->i_flags & S_DIRSYNC) + newflags |= EXT3_DIRSYNC_FL; + + if (oldflags ^ newflags) { + handle_t *handle; + struct ext3_iloc iloc; + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (IS_SYNC(inode)) + handle->h_sync = 1; + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + EXT3_I(inode)->i_flags = newflags; + inode->i_ctime = CURRENT_TIME; + + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + flags_err: + ext3_journal_stop(handle); + } + return err; +} + void ext3_read_inode(struct inode * inode) { struct ext3_iloc iloc; @@ -2503,7 +2653,9 @@ void ext3_read_inode(struct inode * inode) ei->i_acl = EXT3_ACL_NOT_CACHED; ei->i_default_acl = EXT3_ACL_NOT_CACHED; #endif - if (ext3_get_inode_loc(inode, &iloc, 0)) + ei->i_block_alloc_info = NULL; + + if (__ext3_get_inode_loc(inode, &iloc, 0)) goto bad_inode; bh = iloc.bh; raw_inode = ext3_raw_inode(&iloc); @@ -2527,8 +2679,6 @@ void ext3_read_inode(struct inode * inode) inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; ei->i_state = 0; - ei->i_next_alloc_block = 0; - ei->i_next_alloc_goal = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. @@ -2548,9 +2698,6 @@ void ext3_read_inode(struct inode * inode) * recovery code: that's fine, we're about to complete * the process of deleting those. */ } - inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size - * (for stat), not the fs block - * size */ inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); ei->i_flags = le32_to_cpu(raw_inode->i_flags); #ifdef EXT3_FRAGMENTS @@ -2567,11 +2714,7 @@ void ext3_read_inode(struct inode * inode) } ei->i_disksize = inode->i_size; inode->i_generation = le32_to_cpu(raw_inode->i_generation); -#ifdef EXT3_PREALLOCATE - ei->i_prealloc_count = 0; -#endif ei->i_block_group = iloc.block_group; - /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! @@ -2580,6 +2723,31 @@ void ext3_read_inode(struct inode * inode) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); + if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && + EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { + /* + * When mke2fs creates big inodes it does not zero out + * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE, + * so ignore those first few inodes. + */ + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > + EXT3_INODE_SIZE(inode->i_sb)) + goto bad_inode; + if (ei->i_extra_isize == 0) { + /* The extra space is currently unused. Use it. */ + ei->i_extra_isize = sizeof(struct ext3_inode) - + EXT3_GOOD_OLD_INODE_SIZE; + } else { + __le32 *magic = (void *)raw_inode + + EXT3_GOOD_OLD_INODE_SIZE + + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) + ei->i_state |= EXT3_STATE_XATTR; + } + } else + ei->i_extra_isize = 0; + if (S_ISREG(inode->i_mode)) { inode->i_op = &ext3_file_inode_operations; inode->i_fop = &ext3_file_operations; @@ -2660,7 +2828,7 @@ static int ext3_do_update_inode(handle_t *handle, raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } -#ifdef CONFIG_INOXID_GID32 +#ifdef CONFIG_INOXID_INTERN raw_inode->i_raw_xid = cpu_to_le16(inode->i_xid); #endif raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); @@ -2720,6 +2888,9 @@ static int ext3_do_update_inode(handle_t *handle, } else for (block = 0; block < EXT3_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; + if (ei->i_extra_isize) + raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); rc = ext3_journal_dirty_metadata(handle, bh); if (!err) @@ -2767,59 +2938,21 @@ out_brelse: * `stuff()' is running, and the new i_size will be lost. Plus the inode * will no longer be on the superblock's dirty inode list. */ -void ext3_write_inode(struct inode *inode, int wait) +int ext3_write_inode(struct inode *inode, int wait) { if (current->flags & PF_MEMALLOC) - return; + return 0; if (ext3_journal_current_handle()) { jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n"); dump_stack(); - return; + return -EIO; } if (!wait) - return; - - ext3_force_commit(inode->i_sb); -} - -int ext3_setattr_flags(struct inode *inode, unsigned int flags) -{ - unsigned int oldflags, newflags; - int err = 0; - - oldflags = EXT3_I(inode)->i_flags; - newflags = oldflags & - ~(EXT3_IMMUTABLE_FL | EXT3_IUNLINK_FL | EXT3_BARRIER_FL); - if (flags & ATTR_FLAG_IMMUTABLE) - newflags |= EXT3_IMMUTABLE_FL; - if (flags & ATTR_FLAG_IUNLINK) - newflags |= EXT3_IUNLINK_FL; - if (flags & ATTR_FLAG_BARRIER) - newflags |= EXT3_BARRIER_FL; - - if (oldflags ^ newflags) { - handle_t *handle; - struct ext3_iloc iloc; - - handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); - if (IS_SYNC(inode)) - handle->h_sync = 1; - err = ext3_reserve_inode_write(handle, inode, &iloc); - if (err) - goto flags_err; - - EXT3_I(inode)->i_flags = newflags; - inode->i_ctime = CURRENT_TIME; + return 0; - err = ext3_mark_iloc_dirty(handle, inode, &iloc); - flags_err: - ext3_journal_stop(handle); - } - return err; + return ext3_force_commit(inode->i_sb); } /* @@ -2856,7 +2989,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext3_journal_start(inode, 4*EXT3_QUOTA_INIT_BLOCKS+3); + handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+ + EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; @@ -2872,9 +3006,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) inode->i_uid = attr->ia_uid; if (attr->ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; - if ((attr->ia_valid & ATTR_XID) - && inode->i_sb - && (inode->i_sb->s_flags & MS_TAGXID)) + if ((attr->ia_valid & ATTR_XID) && IS_TAGXID(inode)) inode->i_xid = attr->ia_xid; error = ext3_mark_inode_dirty(handle, inode); ext3_journal_stop(handle); @@ -2898,12 +3030,6 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) ext3_journal_stop(handle); } - if (ia_valid & ATTR_ATTR_FLAG) { - rc = ext3_setattr_flags(inode, attr->ia_attr_flags); - if (!error) - error = rc; - } - rc = inode_setattr(inode, attr); /* If inode_setattr's call to ext3_truncate failed to get a @@ -2924,7 +3050,7 @@ err_out: /* - * akpm: how many blocks doth make a writepage()? + * How many blocks doth make a writepage()? * * With N blocks per page, it may be: * N data blocks @@ -2950,7 +3076,7 @@ err_out: * block and work out the exact number of indirects which are touched. Pah. */ -int ext3_writepage_trans_blocks(struct inode *inode) +static int ext3_writepage_trans_blocks(struct inode *inode) { int bpp = ext3_journal_blocks_per_page(inode); int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; @@ -2964,7 +3090,7 @@ int ext3_writepage_trans_blocks(struct inode *inode) #ifdef CONFIG_QUOTA /* We know that structure was already allocated during DQUOT_INIT so * we will be updating only the data blocks + inodes */ - ret += 2*EXT3_QUOTA_TRANS_BLOCKS; + ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb); #endif return ret; @@ -2999,7 +3125,7 @@ ext3_reserve_inode_write(handle_t *handle, struct inode *inode, { int err = 0; if (handle) { - err = ext3_get_inode_loc(inode, iloc, 1); + err = ext3_get_inode_loc(inode, iloc); if (!err) { BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext3_journal_get_write_access(handle, iloc->bh); @@ -3014,8 +3140,8 @@ ext3_reserve_inode_write(handle_t *handle, struct inode *inode, } /* - * akpm: What we do here is to mark the in-core inode as clean - * with respect to inode dirtiness (it may still be data-dirty). + * What we do here is to mark the in-core inode as clean with respect to inode + * dirtiness (it may still be data-dirty). * This means that the in-core inode may be reaped by prune_icache * without having to perform any I/O. This is a very good thing, * because *any* task may call prune_icache - even ones which @@ -3039,6 +3165,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) struct ext3_iloc iloc; int err; + might_sleep(); err = ext3_reserve_inode_write(handle, inode, &iloc); if (!err) err = ext3_mark_iloc_dirty(handle, inode, &iloc); @@ -3046,7 +3173,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) } /* - * akpm: ext3_dirty_inode() is called from __mark_inode_dirty() + * ext3_dirty_inode() is called from __mark_inode_dirty() * * We're really interested in the case where a file is being extended. * i_size has been changed by generic_commit_write() and we thus need @@ -3082,7 +3209,7 @@ out: return; } -#ifdef AKPM +#if 0 /* * Bind an inode's backing buffer_head into this transaction, to prevent * it from being flushed to disk early. Unlike @@ -3090,14 +3217,13 @@ out: * returns no iloc structure, so the caller needs to repeat the iloc * lookup to mark the inode dirty later. */ -static inline int -ext3_pin_inode(handle_t *handle, struct inode *inode) +static int ext3_pin_inode(handle_t *handle, struct inode *inode) { struct ext3_iloc iloc; int err = 0; if (handle) { - err = ext3_get_inode_loc(inode, &iloc, 1); + err = ext3_get_inode_loc(inode, &iloc); if (!err) { BUFFER_TRACE(iloc.bh, "get_write_access"); err = journal_get_write_access(handle, iloc.bh);