1 /*
2  *  linux/fs/ext3/inode.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card (card@masi.ibp.fr)
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  *  from
10  *
11  *  linux/fs/minix/inode.c
12  *
13  *  Copyright (C) 1991, 1992  Linus Torvalds
14  *
15  *  Goal-directed block allocation by Stephen Tweedie
16  *      (sct@redhat.com), 1993, 1998
17  *  Big-endian to little-endian byte-swapping/bitmaps by
18  *        David S. Miller (davem@caip.rutgers.edu), 1995
19  *  64-bit file support on 64-bit platforms by Jakub Jelinek
20  *      (jj@sunsite.ms.mff.cuni.cz)
21  *
22  *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23  */
24
25 #include <linux/module.h>
26 #include <linux/fs.h>
27 #include <linux/time.h>
28 #include <linux/ext3_jbd.h>
29 #include <linux/jbd.h>
30 #include <linux/smp_lock.h>
31 #include <linux/highuid.h>
32 #include <linux/pagemap.h>
33 #include <linux/quotaops.h>
34 #include <linux/string.h>
35 #include <linux/buffer_head.h>
36 #include <linux/writeback.h>
37 #include <linux/mpage.h>
38 #include <linux/uio.h>
39 #include <linux/vserver/xid.h>
40 #include "xattr.h"
41 #include "acl.h"
42
43 static int ext3_writepage_trans_blocks(struct inode *inode);
44
45 /*
46  * Test whether an inode is a fast symlink.
47  */
48 static inline int ext3_inode_is_fast_symlink(struct inode *inode)
49 {
50         int ea_blocks = EXT3_I(inode)->i_file_acl ?
51                 (inode->i_sb->s_blocksize >> 9) : 0;
52
53         return (S_ISLNK(inode->i_mode) &&
54                 inode->i_blocks - ea_blocks == 0);
55 }
56
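/*
 * A worked note on the test above (a sketch; the 4KB figure is an
 * assumption for illustration): i_blocks counts 512-byte sectors, so a
 * single xattr block on a 4KB-block filesystem accounts for
 * s_blocksize >> 9 == 8 sectors.  A fast symlink keeps its target in
 * i_data and owns no data blocks, so i_blocks is either 0 (no EA block)
 * or exactly ea_blocks - in both cases the subtraction yields zero.
 */
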
57 /* The ext3 forget function must perform a revoke if we are freeing data
58  * which has been journaled.  Metadata (e.g. indirect blocks) must be
59  * revoked in all cases.
60  *
61  * "bh" may be NULL: a metadata block may have been freed from memory
62  * but there may still be a record of it in the journal, and that record
63  * still needs to be revoked.
64  */
65
66 int ext3_forget(handle_t *handle, int is_metadata,
67                        struct inode *inode, struct buffer_head *bh,
68                        int blocknr)
69 {
70         int err;
71
72         might_sleep();
73
74         BUFFER_TRACE(bh, "enter");
75
76         jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
77                   "data mode %lx\n",
78                   bh, is_metadata, inode->i_mode,
79                   test_opt(inode->i_sb, DATA_FLAGS));
80
81         /* Never use the revoke function if we are doing full data
82          * journaling: there is no need to, and a V1 superblock won't
83          * support it.  Otherwise, only skip the revoke on un-journaled
84          * data blocks. */
85
86         if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
87             (!is_metadata && !ext3_should_journal_data(inode))) {
88                 if (bh) {
89                         BUFFER_TRACE(bh, "call journal_forget");
90                         return ext3_journal_forget(handle, bh);
91                 }
92                 return 0;
93         }
94
95         /*
96          * data!=journal && (is_metadata || should_journal_data(inode))
97          */
98         BUFFER_TRACE(bh, "call ext3_journal_revoke");
99         err = ext3_journal_revoke(handle, blocknr, bh);
100         if (err)
101                 ext3_abort(inode->i_sb, __FUNCTION__,
102                            "error %d when attempting revoke", err);
103         BUFFER_TRACE(bh, "exit");
104         return err;
105 }
106
107 /*
108  * Work out how many blocks we need to progress with the next chunk of a
109  * truncate transaction.
110  */
111
112 static unsigned long blocks_for_truncate(struct inode *inode) 
113 {
114         unsigned long needed;
115
116         needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
117
118         /* Give ourselves just enough room to cope with inodes in which
119          * i_blocks is corrupt: we've seen disk corruptions in the past
120          * which resulted in random data in an inode which looked enough
121          * like a regular file for ext3 to try to delete it.  Things
122          * will go a bit crazy if that happens, but at least we should
123          * try not to panic the whole kernel. */
124         if (needed < 2)
125                 needed = 2;
126
127         /* But we need to bound the transaction so we don't overflow the
128          * journal. */
129         if (needed > EXT3_MAX_TRANS_DATA) 
130                 needed = EXT3_MAX_TRANS_DATA;
131
132         return EXT3_DATA_TRANS_BLOCKS + needed;
133 }
134
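/*
 * Illustrative arithmetic for blocks_for_truncate() above (assuming
 * 4KB blocks, i.e. s_blocksize_bits == 12): an inode with
 * i_blocks == 80 (512-byte sectors) gives needed = 80 >> (12 - 9) = 10,
 * so the handle is sized at EXT3_DATA_TRANS_BLOCKS + 10 credits.  Tiny
 * files are rounded up to 2 credits and huge ones clamped to
 * EXT3_MAX_TRANS_DATA.
 */
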
135 /* 
136  * Truncate transactions can be complex and absolutely huge.  So we need to
137  * be able to restart the transaction at a convenient checkpoint to make
138  * sure we don't overflow the journal.
139  *
140  * start_transaction gets us a new handle for a truncate transaction,
141  * and extend_transaction tries to extend the existing one a bit.  If
142  * extend fails, we need to propagate the failure up and restart the
143  * transaction in the top-level truncate loop. --sct 
144  */
145
146 static handle_t *start_transaction(struct inode *inode) 
147 {
148         handle_t *result;
149
150         result = ext3_journal_start(inode, blocks_for_truncate(inode));
151         if (!IS_ERR(result))
152                 return result;
153
154         ext3_std_error(inode->i_sb, PTR_ERR(result));
155         return result;
156 }
157
158 /*
159  * Try to extend this transaction for the purposes of truncation.
160  *
161  * Returns 0 if we managed to create more room.  If we can't create more
162  * room, the transaction must be restarted, and we return 1.
163  */
164 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
165 {
166         if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
167                 return 0;
168         if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
169                 return 0;
170         return 1;
171 }
172
173 /*
174  * Restart the transaction associated with *handle.  This does a commit,
175  * so before we call here everything must be consistently dirtied against
176  * this transaction.
177  */
178 static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
179 {
180         jbd_debug(2, "restarting handle %p\n", handle);
181         return ext3_journal_restart(handle, blocks_for_truncate(inode));
182 }
183
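/*
 * A sketch of how the two helpers above are meant to be paired in the
 * truncate path (illustrative, not a verbatim call site): dirty
 * everything against the current transaction first, then restart it
 * once credits run low.
 *
 *	if (try_to_extend_transaction(handle, inode)) {
 *		ext3_mark_inode_dirty(handle, inode);
 *		ext3_journal_test_restart(handle, inode);
 *	}
 */
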
184 static void ext3_truncate_nocheck (struct inode *inode);
185
186 /*
187  * Called at the last iput() if i_nlink is zero.
188  */
189 void ext3_delete_inode (struct inode * inode)
190 {
191         handle_t *handle;
192
193         if (is_bad_inode(inode))
194                 goto no_delete;
195
196         handle = start_transaction(inode);
197         if (IS_ERR(handle)) {
198                 /* If we're going to skip the normal cleanup, we still
199                  * need to make sure that the in-core orphan linked list
200                  * is properly cleaned up. */
201                 ext3_orphan_del(NULL, inode);
202                 goto no_delete;
203         }
204
205         if (IS_SYNC(inode))
206                 handle->h_sync = 1;
207         inode->i_size = 0;
208         if (inode->i_blocks)
209                 ext3_truncate_nocheck(inode);
210         /*
211          * Kill off the orphan record which ext3_truncate created.
212          * AKPM: I think this can be inside the above `if'.
213          * Note that ext3_orphan_del() has to be able to cope with the
214          * deletion of a non-existent orphan - this is because we don't
215          * know if ext3_truncate() actually created an orphan record.
216          * (Well, we could do this if we need to, but heck - it works)
217          */
218         ext3_orphan_del(handle, inode);
219         EXT3_I(inode)->i_dtime  = get_seconds();
220
221         /* 
222          * One subtle ordering requirement: if anything has gone wrong
223          * (transaction abort, IO errors, whatever), then we can still
224          * do these next steps (the fs will already have been marked as
225          * having errors), but we can't free the inode if the mark_dirty
226          * fails.  
227          */
228         if (ext3_mark_inode_dirty(handle, inode))
229                 /* If that failed, just do the required in-core inode clear. */
230                 clear_inode(inode);
231         else
232                 ext3_free_inode(handle, inode);
233         ext3_journal_stop(handle);
234         return;
235 no_delete:
236         clear_inode(inode);     /* We must guarantee clearing of inode... */
237 }
238
239 static int ext3_alloc_block (handle_t *handle,
240                         struct inode * inode, unsigned long goal, int *err)
241 {
242         unsigned long result;
243
244         result = ext3_new_block(handle, inode, goal, err);
245         return result;
246 }
247
248
249 typedef struct {
250         __le32  *p;
251         __le32  key;
252         struct buffer_head *bh;
253 } Indirect;
254
255 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
256 {
257         p->key = *(p->p = v);
258         p->bh = bh;
259 }
260
261 static inline int verify_chain(Indirect *from, Indirect *to)
262 {
263         while (from <= to && from->key == *from->p)
264                 from++;
265         return (from > to);
266 }
267
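/*
 * Example of the race verify_chain() catches (the block number is made
 * up): add_chain() records key == 1234 read through p; if a concurrent
 * truncate then frees that branch and zeroes *p, a later verify_chain()
 * sees *p == 0 != key and reports the chain as stale, forcing the
 * caller to re-read it.
 */
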
268 /**
269  *      ext3_block_to_path - parse the block number into array of offsets
270  *      @inode: inode in question (we are only interested in its superblock)
271  *      @i_block: block number to be parsed
272  *      @offsets: array to store the offsets in
273  *      @boundary: set this non-zero if the referred-to block is likely to be
274  *             followed (on disk) by an indirect block.
275  *
276  *      To store the locations of file's data ext3 uses a data structure common
277  *      for UNIX filesystems - tree of pointers anchored in the inode, with
278  *      data blocks at leaves and indirect blocks in intermediate nodes.
279  *      This function translates the block number into a path in that tree -
280  *      the return value is the path length and @offsets[n] is the offset of
281  *      the pointer to the (n+1)th node in the nth one. If @block is out of
282  *      range (negative or too large), a warning is printed and zero returned.
283  *
284  *      Note: function doesn't find node addresses, so no IO is needed. All
285  *      we need to know is the capacity of indirect blocks (taken from the
286  *      inode->i_sb).
287  */
288
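/*
 * Worked example for ext3_block_to_path() below (assumes 4KB blocks,
 * so ptrs == 1024 and EXT3_NDIR_BLOCKS == 12): i_block == 20 is past
 * the 12 direct slots, 20 - 12 = 8 < 1024, so the path is
 * offsets = { EXT3_IND_BLOCK, 8 } with depth 2.  i_block == 5000 lands
 * in the double-indirect tree: 5000 - 12 - 1024 = 3964, giving
 * offsets = { EXT3_DIND_BLOCK, 3964 >> 10 = 3, 3964 & 1023 = 892 }
 * and depth 3.
 */
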
289 /*
290  * Portability note: the last comparison (check that we fit into triple
291  * indirect block) is spelled differently, because otherwise on an
292  * architecture with 32-bit longs and 8Kb pages we might get into trouble
293  * if our filesystem had 8Kb blocks. We might use long long, but that would
294  * kill us on x86. Oh, well, at least the sign propagation does not matter -
295  * i_block would have to be negative in the very beginning, so we would not
296  * get there at all.
297  */
298
299 static int ext3_block_to_path(struct inode *inode,
300                         long i_block, int offsets[4], int *boundary)
301 {
302         int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
303         int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
304         const long direct_blocks = EXT3_NDIR_BLOCKS,
305                 indirect_blocks = ptrs,
306                 double_blocks = (1 << (ptrs_bits * 2));
307         int n = 0;
308         int final = 0;
309
310         if (i_block < 0) {
311                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
312         } else if (i_block < direct_blocks) {
313                 offsets[n++] = i_block;
314                 final = direct_blocks;
315         } else if ( (i_block -= direct_blocks) < indirect_blocks) {
316                 offsets[n++] = EXT3_IND_BLOCK;
317                 offsets[n++] = i_block;
318                 final = ptrs;
319         } else if ((i_block -= indirect_blocks) < double_blocks) {
320                 offsets[n++] = EXT3_DIND_BLOCK;
321                 offsets[n++] = i_block >> ptrs_bits;
322                 offsets[n++] = i_block & (ptrs - 1);
323                 final = ptrs;
324         } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
325                 offsets[n++] = EXT3_TIND_BLOCK;
326                 offsets[n++] = i_block >> (ptrs_bits * 2);
327                 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
328                 offsets[n++] = i_block & (ptrs - 1);
329                 final = ptrs;
330         } else {
331                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
332         }
333         if (boundary)
334                 *boundary = (i_block & (ptrs - 1)) == (final - 1);
335         return n;
336 }
337
338 /**
339  *      ext3_get_branch - read the chain of indirect blocks leading to data
340  *      @inode: inode in question
341  *      @depth: depth of the chain (1 - direct pointer, etc.)
342  *      @offsets: offsets of pointers in inode/indirect blocks
343  *      @chain: place to store the result
344  *      @err: here we store the error value
345  *
346  *      Function fills the array of triples <key, p, bh> and returns %NULL
347  *      if everything went OK or the pointer to the last filled triple
348  *      (incomplete one) otherwise. Upon the return chain[i].key contains
349  *      the number of (i+1)-th block in the chain (as it is stored in memory,
350  *      i.e. little-endian 32-bit), chain[i].p contains the address of that
351  *      number (it points into struct inode for i==0 and into the bh->b_data
352  *      for i>0) and chain[i].bh points to the buffer_head of i-th indirect
353  *      block for i>0 and NULL for i==0. In other words, it holds the block
354  *      numbers of the chain, addresses they were taken from (and where we can
355  *      verify that chain did not change) and buffer_heads hosting these
356  *      numbers.
357  *
358  *      Function stops when it stumbles upon zero pointer (absent block)
359  *              (pointer to last triple returned, *@err == 0)
360  *      or when it gets an IO error reading an indirect block
361  *              (ditto, *@err == -EIO)
362  *      or when it notices that chain had been changed while it was reading
363  *              (ditto, *@err == -EAGAIN)
364  *      or when it reads all @depth-1 indirect blocks successfully and finds
365  *      the whole chain, all way to the data (returns %NULL, *err == 0).
366  */
367 static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
368                                  Indirect chain[4], int *err)
369 {
370         struct super_block *sb = inode->i_sb;
371         Indirect *p = chain;
372         struct buffer_head *bh;
373
374         *err = 0;
375         /* i_data is not going away, no lock needed */
376         add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
377         if (!p->key)
378                 goto no_block;
379         while (--depth) {
380                 bh = sb_bread(sb, le32_to_cpu(p->key));
381                 if (!bh)
382                         goto failure;
383                 /* Reader: pointers */
384                 if (!verify_chain(chain, p))
385                         goto changed;
386                 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
387                 /* Reader: end */
388                 if (!p->key)
389                         goto no_block;
390         }
391         return NULL;
392
393 changed:
394         brelse(bh);
395         *err = -EAGAIN;
396         goto no_block;
397 failure:
398         *err = -EIO;
399 no_block:
400         return p;
401 }
402
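/*
 * Illustrative contents of the chain after a successful depth-3 walk
 * (block numbers omitted on purpose): chain[0].p points into
 * EXT3_I(inode)->i_data with chain[0].bh == NULL and chain[0].key
 * naming the double-indirect block; chain[1].p points into that
 * block's b_data and chain[1].key names an indirect block; chain[2].p
 * points into the indirect block and chain[2].key is the data block
 * itself.
 */
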
403 /**
404  *      ext3_find_near - find a place for allocation with sufficient locality
405  *      @inode: owner
406  *      @ind: descriptor of indirect block.
407  *
408  *      This function returns the preferred place for block allocation.
409  *      It is used when the heuristic for sequential allocation fails.
410  *      Rules are:
411  *        + if there is a block to the left of our position - allocate near it.
412  *        + if pointer will live in indirect block - allocate near that block.
413  *        + if pointer will live in inode - allocate in the same
414  *          cylinder group. 
415  *
416  * In the latter case we colour the starting block by the caller's PID to
417  * prevent it from clashing with concurrent allocations for a different inode
418  * in the same block group.   The PID is used here so that functionally related
419  * files will be close-by on-disk.
420  *
421  *      Caller must make sure that @ind is valid and will stay that way.
422  */
423
424 static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
425 {
426         struct ext3_inode_info *ei = EXT3_I(inode);
427         __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
428         __le32 *p;
429         unsigned long bg_start;
430         unsigned long colour;
431
432         /* Try to find previous block */
433         for (p = ind->p - 1; p >= start; p--)
434                 if (*p)
435                         return le32_to_cpu(*p);
436
437         /* No such thing, so let's try location of indirect block */
438         if (ind->bh)
439                 return ind->bh->b_blocknr;
440
441         /*
442          * Is it going to be referred to from the inode itself? OK, just put
443          * it into the same cylinder group then.
444          */
445         bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
446                 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
447         colour = (current->pid % 16) *
448                         (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
449         return bg_start + colour;
450 }
451
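/*
 * Example of the PID colouring above (hypothetical numbers): with
 * 32768 blocks per group, current->pid == 3107 gives
 * colour = (3107 % 16) * (32768 / 16) = 3 * 2048 = 6144, i.e. a start
 * 6144 blocks into the group, so up to 16 concurrent allocators are
 * spread across distinct sixteenths of the group.
 */
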
452 /**
453  *      ext3_find_goal - find a prefered place for allocation.
454  *      @inode: owner
455  *      @block:  block we want
456  *      @chain:  chain of indirect blocks
457  *      @partial: pointer to the last triple within a chain
458  *      @goal:  place to store the result.
459  *
460  *      Normally this function finds the preferred place for block allocation,
461  *      stores it in *@goal and returns zero.
462  */
463
464 static unsigned long ext3_find_goal(struct inode *inode, long block,
465                 Indirect chain[4], Indirect *partial)
466 {
467         struct ext3_block_alloc_info *block_i =  EXT3_I(inode)->i_block_alloc_info;
468
469         /*
470          * try the heuristic for sequential allocation,
471          * failing that at least try to get decent locality.
472          */
473         if (block_i && (block == block_i->last_alloc_logical_block + 1)
474                 && (block_i->last_alloc_physical_block != 0)) {
475                 return block_i->last_alloc_physical_block + 1;
476         }
477
478         return ext3_find_near(inode, partial);
479 }
480
481 /**
482  *      ext3_alloc_branch - allocate and set up a chain of blocks.
483  *      @inode: owner
484  *      @num: depth of the chain (number of blocks to allocate)
485  *      @offsets: offsets (in the blocks) to store the pointers to next.
486  *      @branch: place to store the chain in.
487  *
488  *      This function allocates @num blocks, zeroes out all but the last one,
489  *      links them into chain and (if we are synchronous) writes them to disk.
490  *      In other words, it prepares a branch that can be spliced onto the
491  *      inode. It stores the information about that chain in the branch[], in
492  *      the same format as ext3_get_branch() would do. We are calling it after
493  *      we have read the existing part of the chain and partial points to the
494  *      last triple of that (the one with zero ->key). Upon exit we have the
495  *      same picture as after a successful ext3_get_block(), except that in one
496  *      place chain is disconnected - *branch->p is still zero (we did not
497  *      set the last link), but branch->key contains the number that should
498  *      be placed into *branch->p to fill that gap.
499  *
500  *      If allocation fails we free all blocks we've allocated (and forget
501  *      their buffer_heads) and return the error value from the failed
502  *      ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
503  *      as described above and return 0.
504  */
505
506 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
507                              int num,
508                              unsigned long goal,
509                              int *offsets,
510                              Indirect *branch)
511 {
512         int blocksize = inode->i_sb->s_blocksize;
513         int n = 0, keys = 0;
514         int err = 0;
515         int i;
516         int parent = ext3_alloc_block(handle, inode, goal, &err);
517
518         branch[0].key = cpu_to_le32(parent);
519         if (parent) {
520                 for (n = 1; n < num; n++) {
521                         struct buffer_head *bh;
522                         /* Allocate the next block */
523                         int nr = ext3_alloc_block(handle, inode, parent, &err);
524                         if (!nr)
525                                 break;
526                         branch[n].key = cpu_to_le32(nr);
527                         keys = n+1;
528
529                         /*
530                          * Get buffer_head for parent block, zero it out
531                          * and set the pointer to new one, then send
532                          * parent to disk.  
533                          */
534                         bh = sb_getblk(inode->i_sb, parent);
535                         branch[n].bh = bh;
536                         lock_buffer(bh);
537                         BUFFER_TRACE(bh, "call get_create_access");
538                         err = ext3_journal_get_create_access(handle, bh);
539                         if (err) {
540                                 unlock_buffer(bh);
541                                 brelse(bh);
542                                 break;
543                         }
544
545                         memset(bh->b_data, 0, blocksize);
546                         branch[n].p = (__le32*) bh->b_data + offsets[n];
547                         *branch[n].p = branch[n].key;
548                         BUFFER_TRACE(bh, "marking uptodate");
549                         set_buffer_uptodate(bh);
550                         unlock_buffer(bh);
551
552                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
553                         err = ext3_journal_dirty_metadata(handle, bh);
554                         if (err)
555                                 break;
556
557                         parent = nr;
558                 }
559         }
560         if (n == num)
561                 return 0;
562
563         /* Allocation failed, free what we already allocated */
564         for (i = 1; i < keys; i++) {
565                 BUFFER_TRACE(branch[i].bh, "call journal_forget");
566                 ext3_journal_forget(handle, branch[i].bh);
567         }
568         for (i = 0; i < keys; i++)
569                 ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
570         return err;
571 }
572
573 /**
574  *      ext3_splice_branch - splice the allocated branch onto inode.
575  *      @inode: owner
576  *      @block: (logical) number of block we are adding
577  *      @chain: chain of indirect blocks (with a missing link - see
578  *              ext3_alloc_branch)
579  *      @where: location of missing link
580  *      @num:   number of blocks we are adding
581  *
582  *      This function fills the missing link and does all housekeeping needed in
583  *      inode (->i_blocks, etc.). In case of success we end up with the full
584  *      chain to new block and return 0.
585  */
586
587 static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
588                               Indirect chain[4], Indirect *where, int num)
589 {
590         int i;
591         int err = 0;
592         struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;
593
594         /*
595          * If we're splicing into a [td]indirect block (as opposed to the
596          * inode) then we need to get write access to the [td]indirect block
597          * before the splice.
598          */
599         if (where->bh) {
600                 BUFFER_TRACE(where->bh, "get_write_access");
601                 err = ext3_journal_get_write_access(handle, where->bh);
602                 if (err)
603                         goto err_out;
604         }
605         /* That's it */
606
607         *where->p = where->key;
608
609         /*
610          * update the most recently allocated logical & physical block
611          * in i_block_alloc_info, to assist in finding the proper goal block
612          * for the next allocation
613          */
614         if (block_i) {
615                 block_i->last_alloc_logical_block = block;
616                 block_i->last_alloc_physical_block = le32_to_cpu(where[num-1].key);
617         }
618
619         /* We are done with atomic stuff, now do the rest of housekeeping */
620
621         inode->i_ctime = CURRENT_TIME_SEC;
622         ext3_mark_inode_dirty(handle, inode);
623
624         /* had we spliced it onto indirect block? */
625         if (where->bh) {
626                 /*
627                  * akpm: If we spliced it onto an indirect block, we haven't
628                  * altered the inode.  Note however that if it is being spliced
629                  * onto an indirect block at the very end of the file (the
630                  * file is growing) then we *will* alter the inode to reflect
631                  * the new i_size.  But that is not done here - it is done in
632                  * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
633                  */
634                 jbd_debug(5, "splicing indirect only\n");
635                 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
636                 err = ext3_journal_dirty_metadata(handle, where->bh);
637                 if (err) 
638                         goto err_out;
639         } else {
640                 /*
641                  * OK, we spliced it into the inode itself on a direct block.
642                  * Inode was dirtied above.
643                  */
644                 jbd_debug(5, "splicing direct\n");
645         }
646         return err;
647
648 err_out:
649         for (i = 1; i < num; i++) {
650                 BUFFER_TRACE(where[i].bh, "call journal_forget");
651                 ext3_journal_forget(handle, where[i].bh);
652         }
653         return err;
654 }
655
656 /*
657  * Allocation strategy is simple: if we have to allocate something, we will
658  * have to go the whole way to leaf. So let's do it before attaching anything
659  * to tree, set linkage between the newborn blocks, write them if sync is
660  * required, recheck the path, free and repeat if check fails, otherwise
661  * set the last missing link (that will protect us from any truncate-generated
662  * removals - all blocks on the path are immune now) and possibly force the
663  * write on the parent block.
664  * That has a nice additional property: no special recovery from the failed
665  * allocations is needed - we simply release blocks and do not touch anything
666  * reachable from inode.
667  *
668  * akpm: `handle' can be NULL if create == 0.
669  *
670  * The BKL may not be held on entry here.  Be sure to take it early.
671  */
672
673 static int
674 ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
675                 struct buffer_head *bh_result, int create, int extend_disksize)
676 {
677         int err = -EIO;
678         int offsets[4];
679         Indirect chain[4];
680         Indirect *partial;
681         unsigned long goal;
682         int left;
683         int boundary = 0;
684         const int depth = ext3_block_to_path(inode, iblock, offsets, &boundary);
685         struct ext3_inode_info *ei = EXT3_I(inode);
686
687         J_ASSERT(handle != NULL || create == 0);
688
689         if (depth == 0)
690                 goto out;
691
692         partial = ext3_get_branch(inode, depth, offsets, chain, &err);
693
694         /* Simplest case - block found, no allocation needed */
695         if (!partial) {
696                 clear_buffer_new(bh_result);
697                 goto got_it;
698         }
699
700         /* Next simple case - plain lookup or failed read of indirect block */
701         if (!create || err == -EIO)
702                 goto cleanup;
703
704         down(&ei->truncate_sem);
705
706         /*
707          * If the indirect block is missing while we are reading
708          * the chain (ext3_get_branch() returned -EAGAIN), or if the chain
709          * has been changed after we grabbed the semaphore (either because
710          * another process truncated this branch, or another get_block
711          * allocated this branch), re-grab the chain to see if the requested
712          * block has been allocated or not.
713          *
714          * Since we have already blocked truncate and other get_block callers
715          * at this point, we will have the current copy of the chain when we
716          * splice the branch into the tree.
717          */
718         if (err == -EAGAIN || !verify_chain(chain, partial)) {
719                 while (partial > chain) {
720                         brelse(partial->bh);
721                         partial--;
722                 }
723                 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
724                 if (!partial) {
725                         up(&ei->truncate_sem);
726                         if (err)
727                                 goto cleanup;
728                         clear_buffer_new(bh_result);
729                         goto got_it;
730                 }
731         }
732
733         /*
734          * Okay, we need to do block allocation.  Lazily initialize the block
735          * allocation info here if necessary.
736          */
737         if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
738                 ext3_init_block_alloc_info(inode);
739
740         goal = ext3_find_goal(inode, iblock, chain, partial);
741
742         left = (chain + depth) - partial;
743
744         /*
745          * Block out ext3_truncate while we alter the tree
746          */
747         err = ext3_alloc_branch(handle, inode, left, goal,
748                                 offsets + (partial - chain), partial);
749
750         /*
751          * The ext3_splice_branch call will free and forget any buffers
752          * on the new chain if there is a failure, but that risks using
753          * up transaction credits, especially for bitmaps where the
754          * credits cannot be returned.  Can we handle this somehow?  We
755          * may need to return -EAGAIN upwards in the worst case.  --sct
756          */
757         if (!err)
758                 err = ext3_splice_branch(handle, inode, iblock, chain,
759                                          partial, left);
760         /*
761          * i_disksize growing is protected by truncate_sem.  Don't forget to
762          * protect it if you're about to implement concurrent
763          * ext3_get_block() -bzzz
764          */
765         if (!err && extend_disksize && inode->i_size > ei->i_disksize)
766                 ei->i_disksize = inode->i_size;
767         up(&ei->truncate_sem);
768         if (err)
769                 goto cleanup;
770
771         set_buffer_new(bh_result);
772 got_it:
773         map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
774         if (boundary)
775                 set_buffer_boundary(bh_result);
776         /* Clean up and exit */
777         partial = chain + depth - 1;    /* the whole chain */
778 cleanup:
779         while (partial > chain) {
780                 BUFFER_TRACE(partial->bh, "call brelse");
781                 brelse(partial->bh);
782                 partial--;
783         }
784         BUFFER_TRACE(bh_result, "returned");
785 out:
786         return err;
787 }
788
789 static int ext3_get_block(struct inode *inode, sector_t iblock,
790                         struct buffer_head *bh_result, int create)
791 {
792         handle_t *handle = NULL;
793         int ret;
794
795         if (create) {
796                 handle = ext3_journal_current_handle();
797                 J_ASSERT(handle != NULL);
798         }
799         ret = ext3_get_block_handle(handle, inode, iblock,
800                                 bh_result, create, 1);
801         return ret;
802 }
803
804 #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
805
806 static int
807 ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
808                 unsigned long max_blocks, struct buffer_head *bh_result,
809                 int create)
810 {
811         handle_t *handle = journal_current_handle();
812         int ret = 0;
813
814         if (!handle)
815                 goto get_block;         /* A read */
816
817         if (handle->h_transaction->t_state == T_LOCKED) {
818                 /*
819                  * Huge direct-io writes can hold off commits for long
820                  * periods of time.  Let this commit run.
821                  */
822                 ext3_journal_stop(handle);
823                 handle = ext3_journal_start(inode, DIO_CREDITS);
824                 if (IS_ERR(handle))
825                         ret = PTR_ERR(handle);
826                 goto get_block;
827         }
828
829         if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
830                 /*
831                  * Getting low on buffer credits...
832                  */
833                 ret = ext3_journal_extend(handle, DIO_CREDITS);
834                 if (ret > 0) {
835                         /*
836                          * Couldn't extend the transaction.  Start a new one.
837                          */
838                         ret = ext3_journal_restart(handle, DIO_CREDITS);
839                 }
840         }
841
842 get_block:
843         if (ret == 0)
844                 ret = ext3_get_block_handle(handle, inode, iblock,
845                                         bh_result, create, 0);
846         bh_result->b_size = (1 << inode->i_blkbits);
847         return ret;
848 }
849
850 /*
851  * `handle' can be NULL if create is zero
852  */
853 struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
854                                 long block, int create, int * errp)
855 {
856         struct buffer_head dummy;
857         int fatal = 0, err;
858
859         J_ASSERT(handle != NULL || create == 0);
860
861         dummy.b_state = 0;
862         dummy.b_blocknr = -1000;
863         buffer_trace_init(&dummy.b_history);
864         *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
865         if (!*errp && buffer_mapped(&dummy)) {
866                 struct buffer_head *bh;
867                 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
868                 if (buffer_new(&dummy)) {
869                         J_ASSERT(create != 0);
870                         J_ASSERT(handle != NULL);
871
872                         /* Now that we do not always journal data, we
873                            should keep in mind whether this should
874                            always journal the new buffer as metadata.
875                            For now, regular file writes use
876                            ext3_get_block instead, so it's not a
877                            problem. */
878                         lock_buffer(bh);
879                         BUFFER_TRACE(bh, "call get_create_access");
880                         fatal = ext3_journal_get_create_access(handle, bh);
881                         if (!fatal && !buffer_uptodate(bh)) {
882                                 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
883                                 set_buffer_uptodate(bh);
884                         }
885                         unlock_buffer(bh);
886                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
887                         err = ext3_journal_dirty_metadata(handle, bh);
888                         if (!fatal)
889                                 fatal = err;
890                 } else {
891                         BUFFER_TRACE(bh, "not a new buffer");
892                 }
893                 if (fatal) {
894                         *errp = fatal;
895                         brelse(bh);
896                         bh = NULL;
897                 }
898                 return bh;
899         }
900         return NULL;
901 }
902
903 struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
904                                int block, int create, int *err)
905 {
906         struct buffer_head * bh;
907
908         bh = ext3_getblk(handle, inode, block, create, err);
909         if (!bh)
910                 return bh;
911         if (buffer_uptodate(bh))
912                 return bh;
913         ll_rw_block(READ, 1, &bh);
914         wait_on_buffer(bh);
915         if (buffer_uptodate(bh))
916                 return bh;
917         put_bh(bh);
918         *err = -EIO;
919         return NULL;
920 }
921
922 static int walk_page_buffers(   handle_t *handle,
923                                 struct buffer_head *head,
924                                 unsigned from,
925                                 unsigned to,
926                                 int *partial,
927                                 int (*fn)(      handle_t *handle,
928                                                 struct buffer_head *bh))
929 {
930         struct buffer_head *bh;
931         unsigned block_start, block_end;
932         unsigned blocksize = head->b_size;
933         int err, ret = 0;
934         struct buffer_head *next;
935
936         for (   bh = head, block_start = 0;
937                 ret == 0 && (bh != head || !block_start);
938                 block_start = block_end, bh = next)
939         {
940                 next = bh->b_this_page;
941                 block_end = block_start + blocksize;
942                 if (block_end <= from || block_start >= to) {
943                         if (partial && !buffer_uptodate(bh))
944                                 *partial = 1;
945                         continue;
946                 }
947                 err = (*fn)(handle, bh);
948                 if (!ret)
949                         ret = err;
950         }
951         return ret;
952 }
953
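/*
 * Example use of walk_page_buffers() (a sketch mirroring the callers
 * below): apply a journalling operation to every buffer that overlaps
 * bytes [from, to) of the page.
 *
 *	walk_page_buffers(handle, page_buffers(page), from, to,
 *			  NULL, do_journal_get_write_access);
 */
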
954 /*
955  * To preserve ordering, it is essential that the hole instantiation and
956  * the data write be encapsulated in a single transaction.  We cannot
957  * close off a transaction and start a new one between the ext3_get_block()
958  * and the commit_write().  So doing the journal_start at the start of
959  * prepare_write() is the right place.
960  *
961  * Also, this function can nest inside ext3_writepage() ->
962  * block_write_full_page(). In that case, we *know* that ext3_writepage()
963  * has generated enough buffer credits to do the whole page.  So we won't
964  * block on the journal in that case, which is good, because the caller may
965  * be PF_MEMALLOC.
966  *
967  * By accident, ext3 can be reentered when a transaction is open via
968  * quota file writes.  If we were to commit the transaction while thus
969  * reentered, there can be a deadlock - we would be holding a quota
970  * lock, and the commit would never complete if another thread had a
971  * transaction open and was blocking on the quota lock - a ranking
972  * violation.
973  *
974  * So what we do is to rely on the fact that journal_stop/journal_start
975  * will _not_ run commit under these circumstances because handle->h_ref
976  * is elevated.  We'll still have enough credits for the tiny quotafile
977  * write.  
978  */
979
980 static int do_journal_get_write_access(handle_t *handle, 
981                                        struct buffer_head *bh)
982 {
983         if (!buffer_mapped(bh) || buffer_freed(bh))
984                 return 0;
985         return ext3_journal_get_write_access(handle, bh);
986 }
987
988 static int ext3_prepare_write(struct file *file, struct page *page,
989                               unsigned from, unsigned to)
990 {
991         struct inode *inode = page->mapping->host;
992         int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
993         handle_t *handle;
994         int retries = 0;
995
996 retry:
997         handle = ext3_journal_start(inode, needed_blocks);
998         if (IS_ERR(handle)) {
999                 ret = PTR_ERR(handle);
1000                 goto out;
1001         }
1002         if (test_opt(inode->i_sb, NOBH))
1003                 ret = nobh_prepare_write(page, from, to, ext3_get_block);
1004         else
1005                 ret = block_prepare_write(page, from, to, ext3_get_block);
1006         if (ret)
1007                 goto prepare_write_failed;
1008
1009         if (ext3_should_journal_data(inode)) {
1010                 ret = walk_page_buffers(handle, page_buffers(page),
1011                                 from, to, NULL, do_journal_get_write_access);
1012         }
1013 prepare_write_failed:
1014         if (ret)
1015                 ext3_journal_stop(handle);
1016         if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1017                 goto retry;
1018 out:
1019         return ret;
1020 }
1021
1022 int
1023 ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1024 {
1025         int err = journal_dirty_data(handle, bh);
1026         if (err)
1027                 ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1028                                                 bh, handle, err);
1029         return err;
1030 }
1031
1032 /* For commit_write() in data=journal mode */
1033 static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1034 {
1035         if (!buffer_mapped(bh) || buffer_freed(bh))
1036                 return 0;
1037         set_buffer_uptodate(bh);
1038         return ext3_journal_dirty_metadata(handle, bh);
1039 }
1040
1041 /*
1042  * We need to pick up the new inode size which generic_commit_write gave us.
1043  * `file' can be NULL - e.g., when called from page_symlink().
1044  *
1045  * ext3 never places buffers on inode->i_mapping->private_list.  Metadata
1046  * buffers are managed internally.
1047  */
1048
1049 static int ext3_ordered_commit_write(struct file *file, struct page *page,
1050                              unsigned from, unsigned to)
1051 {
1052         handle_t *handle = ext3_journal_current_handle();
1053         struct inode *inode = page->mapping->host;
1054         int ret = 0, ret2;
1055
1056         ret = walk_page_buffers(handle, page_buffers(page),
1057                 from, to, NULL, ext3_journal_dirty_data);
1058
1059         if (ret == 0) {
1060                 /*
1061                  * generic_commit_write() will run mark_inode_dirty() if i_size
1062                  * changes.  So let's piggyback the i_disksize mark_inode_dirty
1063                  * into that.
1064                  */
1065                 loff_t new_i_size;
1066
1067                 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1068                 if (new_i_size > EXT3_I(inode)->i_disksize)
1069                         EXT3_I(inode)->i_disksize = new_i_size;
1070                 ret = generic_commit_write(file, page, from, to);
1071         }
1072         ret2 = ext3_journal_stop(handle);
1073         if (!ret)
1074                 ret = ret2;
1075         return ret;
1076 }
1077
1078 static int ext3_writeback_commit_write(struct file *file, struct page *page,
1079                              unsigned from, unsigned to)
1080 {
1081         handle_t *handle = ext3_journal_current_handle();
1082         struct inode *inode = page->mapping->host;
1083         int ret = 0, ret2;
1084         loff_t new_i_size;
1085
1086         new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1087         if (new_i_size > EXT3_I(inode)->i_disksize)
1088                 EXT3_I(inode)->i_disksize = new_i_size;
1089
1090         if (test_opt(inode->i_sb, NOBH))
1091                 ret = nobh_commit_write(file, page, from, to);
1092         else
1093                 ret = generic_commit_write(file, page, from, to);
1094
1095         ret2 = ext3_journal_stop(handle);
1096         if (!ret)
1097                 ret = ret2;
1098         return ret;
1099 }
1100
1101 static int ext3_journalled_commit_write(struct file *file,
1102                         struct page *page, unsigned from, unsigned to)
1103 {
1104         handle_t *handle = ext3_journal_current_handle();
1105         struct inode *inode = page->mapping->host;
1106         int ret = 0, ret2;
1107         int partial = 0;
1108         loff_t pos;
1109
1110         /*
1111          * Here we duplicate the generic_commit_write() functionality
1112          */
1113         pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1114
1115         ret = walk_page_buffers(handle, page_buffers(page), from,
1116                                 to, &partial, commit_write_fn);
1117         if (!partial)
1118                 SetPageUptodate(page);
1119         if (pos > inode->i_size)
1120                 i_size_write(inode, pos);
1121         EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1122         if (inode->i_size > EXT3_I(inode)->i_disksize) {
1123                 EXT3_I(inode)->i_disksize = inode->i_size;
1124                 ret2 = ext3_mark_inode_dirty(handle, inode);
1125                 if (!ret) 
1126                         ret = ret2;
1127         }
1128         ret2 = ext3_journal_stop(handle);
1129         if (!ret)
1130                 ret = ret2;
1131         return ret;
1132 }
1133
1134 /* 
1135  * bmap() is special.  It gets used by applications such as lilo and by
1136  * the swapper to find the on-disk block of a specific piece of data.
1137  *
1138  * Naturally, this is dangerous if the block concerned is still in the
1139  * journal.  If somebody makes a swapfile on an ext3 data-journaling
1140  * filesystem and enables swap, then they may get a nasty shock when the
1141  * data getting swapped to that swapfile suddenly gets overwritten by
1142  * the original zeros written out previously to the journal and
1143  * awaiting writeback in the kernel's buffer cache. 
1144  *
1145  * So, if we see any bmap calls here on a modified, data-journaled file,
1146  * take extra steps to flush any blocks which might be in the cache. 
1147  */
1148 static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1149 {
1150         struct inode *inode = mapping->host;
1151         journal_t *journal;
1152         int err;
1153
1154         if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
1155                 /* 
1156                  * This is a REALLY heavyweight approach, but the use of
1157                  * bmap on dirty files is expected to be extremely rare:
1158                  * only if we run lilo or swapon on a freshly made file
1159                  * do we expect this to happen. 
1160                  *
1161                  * (bmap requires CAP_SYS_RAWIO so this does not
1162                  * represent an unprivileged user DoS attack --- we'd be
1163                  * in trouble if mortal users could trigger this path at
1164                  * will.) 
1165                  *
1166                  * NB. EXT3_STATE_JDATA is not set on files other than
1167                  * regular files.  If somebody wants to bmap a directory
1168                  * or symlink and gets confused because the buffer
1169                  * hasn't yet been flushed to disk, they deserve
1170                  * everything they get.
1171                  */
1172
1173                 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
1174                 journal = EXT3_JOURNAL(inode);
1175                 journal_lock_updates(journal);
1176                 err = journal_flush(journal);
1177                 journal_unlock_updates(journal);
1178
1179                 if (err)
1180                         return 0;
1181         }
1182
1183         return generic_block_bmap(mapping,block,ext3_get_block);
1184 }
1185
1186 static int bget_one(handle_t *handle, struct buffer_head *bh)
1187 {
1188         get_bh(bh);
1189         return 0;
1190 }
1191
1192 static int bput_one(handle_t *handle, struct buffer_head *bh)
1193 {
1194         put_bh(bh);
1195         return 0;
1196 }
1197
1198 static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1199 {
1200         if (buffer_mapped(bh))
1201                 return ext3_journal_dirty_data(handle, bh);
1202         return 0;
1203 }
1204
1205 /*
1206  * Note that we always start a transaction even if we're not journalling
1207  * data.  This is to preserve ordering: any hole instantiation within
1208  * __block_write_full_page -> ext3_get_block() should be journalled
1209  * along with the data so we don't crash and then get metadata which
1210  * refers to old data.
1211  *
1212  * In all journalling modes block_write_full_page() will start the I/O.
1213  *
1214  * Problem:
1215  *
1216  *      ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1217  *              ext3_writepage()
1218  *
1219  * Similar for:
1220  *
1221  *      ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1222  *
1223  * Same applies to ext3_get_block().  We will deadlock on various things like
1224  * lock_journal and i_truncate_sem.
1225  *
1226  * Setting PF_MEMALLOC here doesn't work - too many internal memory
1227  * allocations fail.
1228  *
1229  * 16May01: If we're reentered then journal_current_handle() will be
1230  *          non-zero. We simply *return*.
1231  *
1232  * 1 July 2001: @@@ FIXME:
1233  *   In journalled data mode, a data buffer may be metadata against the
1234  *   current transaction.  But the same file is part of a shared mapping
1235  *   and someone does a writepage() on it.
1236  *
1237  *   We will move the buffer onto the async_data list, but *after* it has
1238  *   been dirtied. So there's a small window where we have dirty data on
1239  *   BJ_Metadata.
1240  *
1241  *   Note that this only applies to the last partial page in the file.  The
1242  *   bit which block_write_full_page() uses prepare/commit for.  (That's
1243  *   broken code anyway: it's wrong for msync()).
1244  *
1245  *   It's a rare case: affects the final partial page, for journalled data
1246  *   where the file is subject to both write() and writepage() in the same
1247  *   transaction.  To fix it we'll need a custom block_write_full_page().
1248  *   We'll probably need that anyway for journalling writepage() output.
1249  *
1250  * We don't honour synchronous mounts for writepage().  That would be
1251  * disastrous.  Any write() or metadata operation will sync the fs for
1252  * us.
1253  *
1254  * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1255  * we don't need to open a transaction here.
1256  */
1257 static int ext3_ordered_writepage(struct page *page,
1258                         struct writeback_control *wbc)
1259 {
1260         struct inode *inode = page->mapping->host;
1261         struct buffer_head *page_bufs;
1262         handle_t *handle = NULL;
1263         int ret = 0;
1264         int err;
1265
1266         J_ASSERT(PageLocked(page));
1267
1268         /*
1269          * We give up here if we're reentered, because it might be for a
1270          * different filesystem.
1271          */
1272         if (ext3_journal_current_handle())
1273                 goto out_fail;
1274
1275         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1276
1277         if (IS_ERR(handle)) {
1278                 ret = PTR_ERR(handle);
1279                 goto out_fail;
1280         }
1281
1282         if (!page_has_buffers(page)) {
1283                 create_empty_buffers(page, inode->i_sb->s_blocksize,
1284                                 (1 << BH_Dirty)|(1 << BH_Uptodate));
1285         }
1286         page_bufs = page_buffers(page);
1287         walk_page_buffers(handle, page_bufs, 0,
1288                         PAGE_CACHE_SIZE, NULL, bget_one);
1289
1290         ret = block_write_full_page(page, ext3_get_block, wbc);
1291
1292         /*
1293          * The page can become unlocked at any point now, and
1294          * truncate can then come in and change things.  So we
1295          * can't touch *page from now on.  But *page_bufs is
1296          * safe due to elevated refcount.
1297          */
1298
1299         /*
1300          * And attach them to the current transaction.  But only if 
1301          * block_write_full_page() succeeded.  Otherwise they are unmapped,
1302          * and generally junk.
1303          */
1304         if (ret == 0) {
1305                 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1306                                         NULL, journal_dirty_data_fn);
1307                 if (!ret)
1308                         ret = err;
1309         }
1310         walk_page_buffers(handle, page_bufs, 0,
1311                         PAGE_CACHE_SIZE, NULL, bput_one);
1312         err = ext3_journal_stop(handle);
1313         if (!ret)
1314                 ret = err;
1315         return ret;
1316
1317 out_fail:
1318         redirty_page_for_writepage(wbc, page);
1319         unlock_page(page);
1320         return ret;
1321 }
1322
1323 static int ext3_writeback_writepage(struct page *page,
1324                                 struct writeback_control *wbc)
1325 {
1326         struct inode *inode = page->mapping->host;
1327         handle_t *handle = NULL;
1328         int ret = 0;
1329         int err;
1330
1331         if (ext3_journal_current_handle())
1332                 goto out_fail;
1333
1334         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1335         if (IS_ERR(handle)) {
1336                 ret = PTR_ERR(handle);
1337                 goto out_fail;
1338         }
1339
1340         if (test_opt(inode->i_sb, NOBH))
1341                 ret = nobh_writepage(page, ext3_get_block, wbc);
1342         else
1343                 ret = block_write_full_page(page, ext3_get_block, wbc);
1344
1345         err = ext3_journal_stop(handle);
1346         if (!ret)
1347                 ret = err;
1348         return ret;
1349
1350 out_fail:
1351         redirty_page_for_writepage(wbc, page);
1352         unlock_page(page);
1353         return ret;
1354 }
1355
1356 static int ext3_journalled_writepage(struct page *page,
1357                                 struct writeback_control *wbc)
1358 {
1359         struct inode *inode = page->mapping->host;
1360         handle_t *handle = NULL;
1361         int ret = 0;
1362         int err;
1363
1364         if (ext3_journal_current_handle())
1365                 goto no_write;
1366
1367         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1368         if (IS_ERR(handle)) {
1369                 ret = PTR_ERR(handle);
1370                 goto no_write;
1371         }
1372
1373         if (!page_has_buffers(page) || PageChecked(page)) {
1374                 /*
1375                  * It's mmapped pagecache.  Add buffers and journal it.  There
1376                  * doesn't seem much point in redirtying the page here.
1377                  */
1378                 ClearPageChecked(page);
1379                 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1380                                         ext3_get_block);
1381                 if (ret != 0)
1382                         goto out_unlock;
1383                 ret = walk_page_buffers(handle, page_buffers(page), 0,
1384                         PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1385
1386                 err = walk_page_buffers(handle, page_buffers(page), 0,
1387                                 PAGE_CACHE_SIZE, NULL, commit_write_fn);
1388                 if (ret == 0)
1389                         ret = err;
1390                 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1391                 unlock_page(page);
1392         } else {
1393                 /*
1394                  * It may be a page full of checkpoint-mode buffers.  We don't
1395                  * really know unless we go poke around in the buffer_heads.
1396                  * But block_write_full_page will do the right thing.
1397                  */
1398                 ret = block_write_full_page(page, ext3_get_block, wbc);
1399         }
1400         err = ext3_journal_stop(handle);
1401         if (!ret)
1402                 ret = err;
1403 out:
1404         return ret;
1405
1406 no_write:
1407         redirty_page_for_writepage(wbc, page);
1408 out_unlock:
1409         unlock_page(page);
1410         goto out;
1411 }
1412
1413 static int ext3_readpage(struct file *file, struct page *page)
1414 {
1415         return mpage_readpage(page, ext3_get_block);
1416 }
1417
1418 static int
1419 ext3_readpages(struct file *file, struct address_space *mapping,
1420                 struct list_head *pages, unsigned nr_pages)
1421 {
1422         return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1423 }
1424
1425 static int ext3_invalidatepage(struct page *page, unsigned long offset)
1426 {
1427         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1428
1429         /*
1430          * If it's a full truncate we just forget about the pending dirtying
1431          */
1432         if (offset == 0)
1433                 ClearPageChecked(page);
1434
1435         return journal_invalidatepage(journal, page, offset);
1436 }
1437
1438 static int ext3_releasepage(struct page *page, int wait)
1439 {
1440         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1441
1442         WARN_ON(PageChecked(page));
1443         if (!page_has_buffers(page))
1444                 return 0;
1445         return journal_try_to_free_buffers(journal, page, wait);
1446 }
1447
1448 /*
1449  * If the O_DIRECT write will extend the file then add this inode to the
1450  * orphan list.  So recovery will truncate it back to the original size
1451  * if the machine crashes during the write.
1452  *
1453  * If the O_DIRECT write is instantiating holes inside i_size and the machine
1454  * crashes then stale disk data _may_ be exposed inside the file.
1455  */
1456 static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1457                         const struct iovec *iov, loff_t offset,
1458                         unsigned long nr_segs)
1459 {
1460         struct file *file = iocb->ki_filp;
1461         struct inode *inode = file->f_mapping->host;
1462         struct ext3_inode_info *ei = EXT3_I(inode);
1463         handle_t *handle = NULL;
1464         ssize_t ret;
1465         int orphan = 0;
1466         size_t count = iov_length(iov, nr_segs);
1467
1468         if (rw == WRITE) {
1469                 loff_t final_size = offset + count;
1470
1471                 handle = ext3_journal_start(inode, DIO_CREDITS);
1472                 if (IS_ERR(handle)) {
1473                         ret = PTR_ERR(handle);
1474                         goto out;
1475                 }
1476                 if (final_size > inode->i_size) {
1477                         ret = ext3_orphan_add(handle, inode);
1478                         if (ret)
1479                                 goto out_stop;
1480                         orphan = 1;
1481                         ei->i_disksize = inode->i_size;
1482                 }
1483         }
1484
1485         ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 
1486                                  offset, nr_segs,
1487                                  ext3_direct_io_get_blocks, NULL);
1488
1489         /*
1490          * Reacquire the handle: ext3_direct_io_get_blocks() can restart the
1491          * transaction
1492          */
1493         handle = journal_current_handle();
1494
1495 out_stop:
1496         if (handle) {
1497                 int err;
1498
1499                 if (orphan && inode->i_nlink)
1500                         ext3_orphan_del(handle, inode);
1501                 if (orphan && ret > 0) {
1502                         loff_t end = offset + ret;
1503                         if (end > inode->i_size) {
1504                                 ei->i_disksize = end;
1505                                 i_size_write(inode, end);
1506                                 /*
1507                                  * We're going to return a positive `ret'
1508                                  * here due to non-zero-length I/O, so there's
1509                                  * no way of reporting error returns from
1510                                  * ext3_mark_inode_dirty() to userspace.  So
1511                                  * ignore it.
1512                                  */
1513                                 ext3_mark_inode_dirty(handle, inode);
1514                         }
1515                 }
1516                 err = ext3_journal_stop(handle);
1517                 if (ret == 0)
1518                         ret = err;
1519         }
1520 out:
1521         return ret;
1522 }
1523
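/*
 * A minimal userspace sketch of the case the comment above guards against:
 * an O_DIRECT write that extends the file.  Everything here (the path, the
 * 4096-byte alignment) is a hypothetical example, not part of ext3 itself.
 */
#if 0
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        void *buf;
        int fd = open("/tmp/dio-test", O_RDWR | O_CREAT | O_DIRECT, 0600);

        if (fd < 0)
                return 1;
        /* O_DIRECT generally wants block-aligned buffer, offset and length */
        if (posix_memalign(&buf, 4096, 4096))
                return 1;
        memset(buf, 0xab, 4096);
        /*
         * This write moves i_size from 0 to 4096.  If the machine crashed
         * after the block allocation but before the size update, the orphan
         * list entry added above lets recovery truncate back to the old size.
         */
        if (pwrite(fd, buf, 4096, 0) != 4096)
                return 1;
        free(buf);
        close(fd);
        return 0;
}
#endif
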
1524 /*
1525  * Pages can be marked dirty completely asynchronously from ext3's journalling
1526  * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
1527  * much here because ->set_page_dirty is called under VFS locks.  The page is
1528  * not necessarily locked.
1529  *
1530  * We cannot just dirty the page and leave attached buffers clean, because the
1531  * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
1532  * or jbddirty because all the journalling code will explode.
1533  *
1534  * So what we do is to mark the page "pending dirty" and next time writepage
1535  * is called, propagate that into the buffers appropriately.
1536  */
1537 static int ext3_journalled_set_page_dirty(struct page *page)
1538 {
1539         SetPageChecked(page);
1540         return __set_page_dirty_nobuffers(page);
1541 }
1542
1543 static struct address_space_operations ext3_ordered_aops = {
1544         .readpage       = ext3_readpage,
1545         .readpages      = ext3_readpages,
1546         .writepage      = ext3_ordered_writepage,
1547         .sync_page      = block_sync_page,
1548         .prepare_write  = ext3_prepare_write,
1549         .commit_write   = ext3_ordered_commit_write,
1550         .bmap           = ext3_bmap,
1551         .invalidatepage = ext3_invalidatepage,
1552         .releasepage    = ext3_releasepage,
1553         .direct_IO      = ext3_direct_IO,
1554 };
1555
1556 static struct address_space_operations ext3_writeback_aops = {
1557         .readpage       = ext3_readpage,
1558         .readpages      = ext3_readpages,
1559         .writepage      = ext3_writeback_writepage,
1560         .sync_page      = block_sync_page,
1561         .prepare_write  = ext3_prepare_write,
1562         .commit_write   = ext3_writeback_commit_write,
1563         .bmap           = ext3_bmap,
1564         .invalidatepage = ext3_invalidatepage,
1565         .releasepage    = ext3_releasepage,
1566         .direct_IO      = ext3_direct_IO,
1567 };
1568
1569 static struct address_space_operations ext3_journalled_aops = {
1570         .readpage       = ext3_readpage,
1571         .readpages      = ext3_readpages,
1572         .writepage      = ext3_journalled_writepage,
1573         .sync_page      = block_sync_page,
1574         .prepare_write  = ext3_prepare_write,
1575         .commit_write   = ext3_journalled_commit_write,
1576         .set_page_dirty = ext3_journalled_set_page_dirty,
1577         .bmap           = ext3_bmap,
1578         .invalidatepage = ext3_invalidatepage,
1579         .releasepage    = ext3_releasepage,
1580 };
1581
1582 void ext3_set_aops(struct inode *inode)
1583 {
1584         if (ext3_should_order_data(inode))
1585                 inode->i_mapping->a_ops = &ext3_ordered_aops;
1586         else if (ext3_should_writeback_data(inode))
1587                 inode->i_mapping->a_ops = &ext3_writeback_aops;
1588         else
1589                 inode->i_mapping->a_ops = &ext3_journalled_aops;
1590 }
1591
1592 /*
1593  * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1594  * up to the end of the block which corresponds to `from'.
1595  * This is required during truncate. We need to physically zero the tail end
1596  * of that block so it doesn't yield old data if the file is later grown.
1597  */
1598 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1599                 struct address_space *mapping, loff_t from)
1600 {
1601         unsigned long index = from >> PAGE_CACHE_SHIFT;
1602         unsigned offset = from & (PAGE_CACHE_SIZE-1);
1603         unsigned blocksize, iblock, length, pos;
1604         struct inode *inode = mapping->host;
1605         struct buffer_head *bh;
1606         int err = 0;
1607         void *kaddr;
1608
1609         blocksize = inode->i_sb->s_blocksize;
1610         length = blocksize - (offset & (blocksize - 1));
1611         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1612
1613         /*
1614          * For the "nobh" option, we can only work if we don't need to
1615          * read in the page - otherwise we create buffers to do the IO.
1616          */
1617         if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH)) {
1618                 if (PageUptodate(page)) {
1619                         kaddr = kmap_atomic(page, KM_USER0);
1620                         memset(kaddr + offset, 0, length);
1621                         flush_dcache_page(page);
1622                         kunmap_atomic(kaddr, KM_USER0);
1623                         set_page_dirty(page);
1624                         goto unlock;
1625                 }
1626         }
1627
1628         if (!page_has_buffers(page))
1629                 create_empty_buffers(page, blocksize, 0);
1630
1631         /* Find the buffer that contains "offset" */
1632         bh = page_buffers(page);
1633         pos = blocksize;
1634         while (offset >= pos) {
1635                 bh = bh->b_this_page;
1636                 iblock++;
1637                 pos += blocksize;
1638         }
1639
1640         err = 0;
1641         if (buffer_freed(bh)) {
1642                 BUFFER_TRACE(bh, "freed: skip");
1643                 goto unlock;
1644         }
1645
1646         if (!buffer_mapped(bh)) {
1647                 BUFFER_TRACE(bh, "unmapped");
1648                 ext3_get_block(inode, iblock, bh, 0);
1649                 /* unmapped? It's a hole - nothing to do */
1650                 if (!buffer_mapped(bh)) {
1651                         BUFFER_TRACE(bh, "still unmapped");
1652                         goto unlock;
1653                 }
1654         }
1655
1656         /* Ok, it's mapped. Make sure it's up-to-date */
1657         if (PageUptodate(page))
1658                 set_buffer_uptodate(bh);
1659
1660         if (!buffer_uptodate(bh)) {
1661                 err = -EIO;
1662                 ll_rw_block(READ, 1, &bh);
1663                 wait_on_buffer(bh);
1664                 /* Uhhuh. Read error. Complain and punt. */
1665                 if (!buffer_uptodate(bh))
1666                         goto unlock;
1667         }
1668
1669         if (ext3_should_journal_data(inode)) {
1670                 BUFFER_TRACE(bh, "get write access");
1671                 err = ext3_journal_get_write_access(handle, bh);
1672                 if (err)
1673                         goto unlock;
1674         }
1675
1676         kaddr = kmap_atomic(page, KM_USER0);
1677         memset(kaddr + offset, 0, length);
1678         flush_dcache_page(page);
1679         kunmap_atomic(kaddr, KM_USER0);
1680
1681         BUFFER_TRACE(bh, "zeroed end of block");
1682
1683         err = 0;
1684         if (ext3_should_journal_data(inode)) {
1685                 err = ext3_journal_dirty_metadata(handle, bh);
1686         } else {
1687                 if (ext3_should_order_data(inode))
1688                         err = ext3_journal_dirty_data(handle, bh);
1689                 mark_buffer_dirty(bh);
1690         }
1691
1692 unlock:
1693         unlock_page(page);
1694         page_cache_release(page);
1695         return err;
1696 }
1697
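/*
 * A worked example of the arithmetic in ext3_block_truncate_page() above,
 * with example geometry (4096-byte pages, 1024-byte blocks); the numbers
 * are illustrative, not from a real filesystem.
 */
#if 0
#include <stdio.h>

int main(void)
{
        unsigned long from = 5000;              /* the new i_size */
        unsigned page_size = 4096, blocksize = 1024;
        unsigned long index = from / page_size;         /* page 1 */
        unsigned offset = from % page_size;             /* byte 904 of page */
        unsigned length = blocksize - (offset % blocksize);     /* 120 */

        /* file bytes 5000..5119: the tail of the block holding `from' */
        printf("zero %u bytes at page %lu offset %u\n", length, index, offset);
        return 0;
}
#endif
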
1698 /*
1699  * Probably it should be a library function... search for first non-zero word
1700  * or memcmp with zero_page, whatever is better for a particular architecture.
1701  * Linus?
1702  */
1703 static inline int all_zeroes(__le32 *p, __le32 *q)
1704 {
1705         while (p < q)
1706                 if (*p++)
1707                         return 0;
1708         return 1;
1709 }
1710
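/*
 * One shape the "library function" asked for above could take: memcmp()
 * against a static zero buffer.  zero_buf and its size are hypothetical;
 * this is a sketch under the assumption that (q - p) never exceeds one
 * filesystem block.
 */
#if 0
static const char zero_buf[4096];

static inline int all_zeroes_memcmp(__le32 *p, __le32 *q)
{
        return memcmp(p, zero_buf, (char *)q - (char *)p) == 0;
}
#endif
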
1711 /**
1712  *      ext3_find_shared - find the indirect blocks for partial truncation.
1713  *      @inode:   inode in question
1714  *      @depth:   depth of the affected branch
1715  *      @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1716  *      @chain:   place to store the pointers to partial indirect blocks
1717  *      @top:     place to the (detached) top of branch
1718  *
1719  *      This is a helper function used by ext3_truncate().
1720  *
1721  *      When we do truncate() we may have to clean the ends of several
1722  *      indirect blocks but leave the blocks themselves alive. Block is
1723  *      partially truncated if some data below the new i_size is referred to
1724  *      from it (and it is on the path to the first completely truncated
1725  *      data block, indeed).  We have to free the top of that path along
1726  *      with everything to the right of the path. Since no allocation
1727  *      past the truncation point is possible until ext3_truncate()
1728  *      finishes, we may safely do the latter, but top of branch may
1729  *      require special attention - pageout below the truncation point
1730  *      might try to populate it.
1731  *
1732  *      We atomically detach the top of branch from the tree, store the
1733  *      block number of its root in *@top, pointers to buffer_heads of
1734  *      partially truncated blocks - in @chain[].bh and pointers to
1735  *      their last elements that should not be removed - in
1736  *      @chain[].p. Return value is the pointer to last filled element
1737  *      of @chain.
1738  *
1739  *      The actual freeing of the subtrees is left to the caller:
1740  *              a) free the subtree starting from *@top
1741  *              b) free the subtrees whose roots are stored in
1742  *                      (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1743  *              c) free the subtrees growing from the inode past the @chain[0].
1744  *                      (no partially truncated stuff there).  */
1745
1746 static Indirect *ext3_find_shared(struct inode *inode,
1747                                 int depth,
1748                                 int offsets[4],
1749                                 Indirect chain[4],
1750                                 __le32 *top)
1751 {
1752         Indirect *partial, *p;
1753         int k, err;
1754
1755         *top = 0;
1756         /* Make k index the deepest non-null offset + 1 */
1757         for (k = depth; k > 1 && !offsets[k-1]; k--)
1758                 ;
1759         partial = ext3_get_branch(inode, k, offsets, chain, &err);
1760         /* Writer: pointers */
1761         if (!partial)
1762                 partial = chain + k-1;
1763         /*
1764          * If the branch acquired a continuation since we last looked at it -
1765          * fine, it should all survive and the (new) top doesn't belong to us.
1766          */
1767         if (!partial->key && *partial->p)
1768                 /* Writer: end */
1769                 goto no_top;
1770         for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
1771                 ;
1772         /*
1773          * OK, we've found the last block that must survive. The rest of our
1774          * branch should be detached before unlocking. However, if that rest
1775          * of branch is all ours and does not grow immediately from the inode
1776          * it's easier to cheat and just decrement partial->p.
1777          */
1778         if (p == chain + k - 1 && p > chain) {
1779                 p->p--;
1780         } else {
1781                 *top = *p->p;
1782                 /* Nope, don't do this in ext3.  Must leave the tree intact */
1783 #if 0
1784                 *p->p = 0;
1785 #endif
1786         }
1787         /* Writer: end */
1788
1789         while (partial > p) {
1790                 brelse(partial->bh);
1791                 partial--;
1792         }
1794 no_top:
1795         return partial;
1796 }
1797
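/*
 * The "deepest non-null offset + 1" loop above, run on sample data: for a
 * branch path offsets[] = { 11, 3, 0, 0 } at depth 4, k ends up as 2, so
 * only the first two chain levels are fetched.  Standalone sketch.
 */
#if 0
#include <stdio.h>

int main(void)
{
        int offsets[4] = { 11, 3, 0, 0 };
        int depth = 4, k;

        for (k = depth; k > 1 && !offsets[k - 1]; k--)
                ;
        printf("k = %d\n", k);          /* prints "k = 2" */
        return 0;
}
#endif
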
1798 /*
1799  * Zero a number of block pointers in either an inode or an indirect block.
1800  * If we restart the transaction we must again get write access to the
1801  * indirect block for further modification.
1802  *
1803  * We release `count' blocks on disk, but (last - first) may be greater
1804  * than `count' because there can be holes in there.
1805  */
1806 static void
1807 ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
1808                 unsigned long block_to_free, unsigned long count,
1809                 __le32 *first, __le32 *last)
1810 {
1811         __le32 *p;
1812         if (try_to_extend_transaction(handle, inode)) {
1813                 if (bh) {
1814                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1815                         ext3_journal_dirty_metadata(handle, bh);
1816                 }
1817                 ext3_mark_inode_dirty(handle, inode);
1818                 ext3_journal_test_restart(handle, inode);
1819                 if (bh) {
1820                         BUFFER_TRACE(bh, "retaking write access");
1821                         ext3_journal_get_write_access(handle, bh);
1822                 }
1823         }
1824
1825         /*
1826          * Any buffers which are on the journal will be in memory. We find
1827          * them on the hash table so journal_revoke() will run journal_forget()
1828          * on them.  We've already detached each block from the file, so
1829          * bforget() in journal_forget() should be safe.
1830          *
1831          * AKPM: turn on bforget in journal_forget()!!!
1832          */
1833         for (p = first; p < last; p++) {
1834                 u32 nr = le32_to_cpu(*p);
1835                 if (nr) {
1836                         struct buffer_head *bh;
1837
1838                         *p = 0;
1839                         bh = sb_find_get_block(inode->i_sb, nr);
1840                         ext3_forget(handle, 0, inode, bh, nr);
1841                 }
1842         }
1843
1844         ext3_free_blocks(handle, inode, block_to_free, count);
1845 }
1846
1847 /**
1848  * ext3_free_data - free a list of data blocks
1849  * @handle:     handle for this transaction
1850  * @inode:      inode we are dealing with
1851  * @this_bh:    indirect buffer_head which contains *@first and *@last
1852  * @first:      array of block numbers
1853  * @last:       points immediately past the end of array
1854  *
1855  * We are freeing all blocks referred to from that array (numbers are stored as
1856  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1857  *
1858  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
1859  * blocks are contiguous then releasing them at one time will only affect one
1860  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1861  * actually use a lot of journal space.
1862  *
1863  * @this_bh will be %NULL if @first and @last point into the inode's direct
1864  * block pointers.
1865  */
1866 static void ext3_free_data(handle_t *handle, struct inode *inode,
1867                            struct buffer_head *this_bh,
1868                            __le32 *first, __le32 *last)
1869 {
1870         unsigned long block_to_free = 0;    /* Starting block # of a run */
1871         unsigned long count = 0;            /* Number of blocks in the run */ 
1872         __le32 *block_to_free_p = NULL;     /* Pointer into inode/ind
1873                                                corresponding to
1874                                                block_to_free */
1875         unsigned long nr;                   /* Current block # */
1876         __le32 *p;                          /* Pointer into inode/ind
1877                                                for current block */
1878         int err;
1879
1880         if (this_bh) {                          /* For indirect block */
1881                 BUFFER_TRACE(this_bh, "get_write_access");
1882                 err = ext3_journal_get_write_access(handle, this_bh);
1883                 /* Important: if we can't update the indirect pointers
1884                  * to the blocks, we can't free them. */
1885                 if (err)
1886                         return;
1887         }
1888
1889         for (p = first; p < last; p++) {
1890                 nr = le32_to_cpu(*p);
1891                 if (nr) {
1892                         /* accumulate blocks to free if they're contiguous */
1893                         if (count == 0) {
1894                                 block_to_free = nr;
1895                                 block_to_free_p = p;
1896                                 count = 1;
1897                         } else if (nr == block_to_free + count) {
1898                                 count++;
1899                         } else {
1900                                 ext3_clear_blocks(handle, inode, this_bh, 
1901                                                   block_to_free,
1902                                                   count, block_to_free_p, p);
1903                                 block_to_free = nr;
1904                                 block_to_free_p = p;
1905                                 count = 1;
1906                         }
1907                 }
1908         }
1909
1910         if (count > 0)
1911                 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
1912                                   count, block_to_free_p, p);
1913
1914         if (this_bh) {
1915                 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
1916                 ext3_journal_dirty_metadata(handle, this_bh);
1917         }
1918 }
1919
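/*
 * The run accumulation above in miniature: group a block-number array into
 * maximal contiguous runs, flushing one ext3_clear_blocks()-equivalent call
 * per run.  The block numbers are made up for illustration.
 */
#if 0
#include <stdio.h>

int main(void)
{
        unsigned long blocks[] = { 100, 101, 102, 0, 200, 201 };
        unsigned long start = 0, count = 0;
        int i;

        for (i = 0; i < 6; i++) {
                unsigned long nr = blocks[i];
                if (!nr)
                        continue;               /* a hole */
                if (count == 0) {
                        start = nr;
                        count = 1;
                } else if (nr == start + count) {
                        count++;                /* extends the current run */
                } else {
                        printf("free run [%lu, +%lu)\n", start, count);
                        start = nr;
                        count = 1;
                }
        }
        if (count)
                printf("free run [%lu, +%lu)\n", start, count);
        /* prints: free run [100, +3) and free run [200, +2) */
        return 0;
}
#endif
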
1920 /**
1921  *      ext3_free_branches - free an array of branches
1922  *      @handle: JBD handle for this transaction
1923  *      @inode: inode we are dealing with
1924  *      @parent_bh: the buffer_head which contains *@first and *@last
1925  *      @first: array of block numbers
1926  *      @last:  pointer immediately past the end of array
1927  *      @depth: depth of the branches to free
1928  *
1929  *      We are freeing all blocks referred to from these branches (numbers are
1930  *      stored as little-endian 32-bit) and updating @inode->i_blocks
1931  *      appropriately.
1932  */
1933 static void ext3_free_branches(handle_t *handle, struct inode *inode,
1934                                struct buffer_head *parent_bh,
1935                                __le32 *first, __le32 *last, int depth)
1936 {
1937         unsigned long nr;
1938         __le32 *p;
1939
1940         if (is_handle_aborted(handle))
1941                 return;
1942
1943         if (depth--) {
1944                 struct buffer_head *bh;
1945                 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1946                 p = last;
1947                 while (--p >= first) {
1948                         nr = le32_to_cpu(*p);
1949                         if (!nr)
1950                                 continue;               /* A hole */
1951
1952                         /* Go read the buffer for the next level down */
1953                         bh = sb_bread(inode->i_sb, nr);
1954
1955                         /*
1956                          * A read failure? Report error and clear slot
1957                          * (should be rare).
1958                          */
1959                         if (!bh) {
1960                                 ext3_error(inode->i_sb, "ext3_free_branches",
1961                                            "Read failure, inode=%ld, block=%ld",
1962                                            inode->i_ino, nr);
1963                                 continue;
1964                         }
1965
1966                         /* This zaps the entire block.  Bottom up. */
1967                         BUFFER_TRACE(bh, "free child branches");
1968                         ext3_free_branches(handle, inode, bh,
1969                                            (__le32*)bh->b_data,
1970                                            (__le32*)bh->b_data + addr_per_block,
1971                                            depth);
1972
1973                         /*
1974                          * We've probably journalled the indirect block several
1975                          * times during the truncate.  But it's no longer
1976                          * needed and we now drop it from the transaction via
1977                          * journal_revoke().
1978                          *
1979                          * That's easy if it's exclusively part of this
1980                          * transaction.  But if it's part of the committing
1981                          * transaction then journal_forget() will simply
1982                          * brelse() it.  That means that if the underlying
1983                          * block is reallocated in ext3_get_block(),
1984                          * unmap_underlying_metadata() will find this block
1985                          * and will try to get rid of it.  damn, damn.
1986                          *
1987                          * If this block has already been committed to the
1988                          * journal, a revoke record will be written.  And
1989                          * revoke records must be emitted *before* clearing
1990                          * this block's bit in the bitmaps.
1991                          */
1992                         ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
1993
1994                         /*
1995                          * Everything below this pointer has been
1996                          * released.  Now let this top-of-subtree go.
1997                          *
1998                          * We want the freeing of this indirect block to be
1999                          * atomic in the journal with the updating of the
2000                          * bitmap block which owns it.  So make some room in
2001                          * the journal.
2002                          *
2003                          * We zero the parent pointer *after* freeing its
2004                          * pointee in the bitmaps, so if extend_transaction()
2005                          * for some reason fails to put the bitmap changes and
2006                          * the release into the same transaction, recovery
2007                          * will merely complain about releasing a free block,
2008                          * rather than leaking blocks.
2009                          */
2010                         if (is_handle_aborted(handle))
2011                                 return;
2012                         if (try_to_extend_transaction(handle, inode)) {
2013                                 ext3_mark_inode_dirty(handle, inode);
2014                                 ext3_journal_test_restart(handle, inode);
2015                         }
2016
2017                         ext3_free_blocks(handle, inode, nr, 1);
2018
2019                         if (parent_bh) {
2020                                 /*
2021                                  * The block which we have just freed is
2022                                  * pointed to by an indirect block: journal it
2023                                  */
2024                                 BUFFER_TRACE(parent_bh, "get_write_access");
2025                                 if (!ext3_journal_get_write_access(handle,
2026                                                                    parent_bh)){
2027                                         *p = 0;
2028                                         BUFFER_TRACE(parent_bh,
2029                                         "call ext3_journal_dirty_metadata");
2030                                         ext3_journal_dirty_metadata(handle, 
2031                                                                     parent_bh);
2032                                 }
2033                         }
2034                 }
2035         } else {
2036                 /* We have reached the bottom of the tree. */
2037                 BUFFER_TRACE(parent_bh, "free data blocks");
2038                 ext3_free_data(handle, inode, parent_bh, first, last);
2039         }
2040 }
2041
2042 /*
2043  * ext3_truncate()
2044  *
2045  * We block out ext3_get_block() block instantiations across the entire
2046  * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2047  * simultaneously on behalf of the same inode.
2048  *
2049  * As we work through the truncate and commit bits of it to the journal there
2050  * is one core guiding principle: the file's tree must always be consistent on
2051  * disk.  We must be able to restart the truncate after a crash.
2052  *
2053  * The file's tree may be transiently inconsistent in memory (although it
2054  * probably isn't), but whenever we close off and commit a journal transaction,
2055  * the contents of (the filesystem + the journal) must be consistent and
2056  * restartable.  It's pretty simple, really: bottom up, right to left (although
2057  * left-to-right works OK too).
2058  *
2059  * Note that at recovery time, journal replay occurs *before* the restart of
2060  * truncate against the orphan inode list.
2061  *
2062  * The committed inode has the new, desired i_size (which is the same as
2063  * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
2064  * that this inode's truncate did not complete and it will again call
2065  * ext3_truncate() to have another go.  So there will be instantiated blocks
2066  * to the right of the truncation point in a crashed ext3 filesystem.  But
2067  * that's fine - as long as they are linked from the inode, the post-crash
2068  * ext3_truncate() run will find them and release them.
2069  */
2070
2071 void ext3_truncate_nocheck(struct inode * inode)
2072 {
2073         handle_t *handle;
2074         struct ext3_inode_info *ei = EXT3_I(inode);
2075         __le32 *i_data = ei->i_data;
2076         int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2077         struct address_space *mapping = inode->i_mapping;
2078         int offsets[4];
2079         Indirect chain[4];
2080         Indirect *partial;
2081         __le32 nr = 0;
2082         int n;
2083         long last_block;
2084         unsigned blocksize = inode->i_sb->s_blocksize;
2085         struct page *page;
2086
2087         if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2088             S_ISLNK(inode->i_mode)))
2089                 return;
2090         if (ext3_inode_is_fast_symlink(inode))
2091                 return;
2092
2093         /*
2094          * We have to lock the EOF page here, because lock_page() nests
2095          * outside journal_start().
2096          */
2097         if ((inode->i_size & (blocksize - 1)) == 0) {
2098                 /* Block boundary? Nothing to do */
2099                 page = NULL;
2100         } else {
2101                 page = grab_cache_page(mapping,
2102                                 inode->i_size >> PAGE_CACHE_SHIFT);
2103                 if (!page)
2104                         return;
2105         }
2106
2107         handle = start_transaction(inode);
2108         if (IS_ERR(handle)) {
2109                 if (page) {
2110                         clear_highpage(page);
2111                         flush_dcache_page(page);
2112                         unlock_page(page);
2113                         page_cache_release(page);
2114                 }
2115                 return;         /* AKPM: return what? */
2116         }
2117
2118         last_block = (inode->i_size + blocksize-1)
2119                                         >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2120
2121         if (page)
2122                 ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2123
2124         n = ext3_block_to_path(inode, last_block, offsets, NULL);
2125         if (n == 0)
2126                 goto out_stop;  /* error */
2127
2128         /*
2129          * OK.  This truncate is going to happen.  We add the inode to the
2130          * orphan list, so that if this truncate spans multiple transactions,
2131          * and we crash, we will resume the truncate when the filesystem
2132          * recovers.  It also marks the inode dirty, to catch the new size.
2133          *
2134          * Implication: the file must always be in a sane, consistent
2135          * truncatable state while each transaction commits.
2136          */
2137         if (ext3_orphan_add(handle, inode))
2138                 goto out_stop;
2139
2140         /*
2141          * The orphan list entry will now protect us from any crash which
2142          * occurs before the truncate completes, so it is now safe to propagate
2143          * the new, shorter inode size (held for now in i_size) into the
2144          * on-disk inode. We do this via i_disksize, which is the value which
2145          * ext3 *really* writes onto the disk inode.
2146          */
2147         ei->i_disksize = inode->i_size;
2148
2149         /*
2150          * From here we block out all ext3_get_block() callers who want to
2151          * modify the block allocation tree.
2152          */
2153         down(&ei->truncate_sem);
2154
2155         if (n == 1) {           /* direct blocks */
2156                 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2157                                i_data + EXT3_NDIR_BLOCKS);
2158                 goto do_indirects;
2159         }
2160
2161         partial = ext3_find_shared(inode, n, offsets, chain, &nr);
2162         /* Kill the top of shared branch (not detached) */
2163         if (nr) {
2164                 if (partial == chain) {
2165                         /* Shared branch grows from the inode */
2166                         ext3_free_branches(handle, inode, NULL,
2167                                            &nr, &nr+1, (chain+n-1) - partial);
2168                         *partial->p = 0;
2169                         /*
2170                          * We mark the inode dirty prior to restart,
2171                          * and prior to stop.  No need for it here.
2172                          */
2173                 } else {
2174                         /* Shared branch grows from an indirect block */
2175                         BUFFER_TRACE(partial->bh, "get_write_access");
2176                         ext3_free_branches(handle, inode, partial->bh,
2177                                         partial->p,
2178                                         partial->p+1, (chain+n-1) - partial);
2179                 }
2180         }
2181         /* Clear the ends of indirect blocks on the shared branch */
2182         while (partial > chain) {
2183                 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2184                                    (__le32*)partial->bh->b_data+addr_per_block,
2185                                    (chain+n-1) - partial);
2186                 BUFFER_TRACE(partial->bh, "call brelse");
2187                 brelse (partial->bh);
2188                 partial--;
2189         }
2190 do_indirects:
2191         /* Kill the remaining (whole) subtrees; each case falls through */
2192         switch (offsets[0]) {
2193                 default:
2194                         nr = i_data[EXT3_IND_BLOCK];
2195                         if (nr) {
2196                                 ext3_free_branches(handle, inode, NULL,
2197                                                    &nr, &nr+1, 1);
2198                                 i_data[EXT3_IND_BLOCK] = 0;
2199                         }
2200                 case EXT3_IND_BLOCK:
2201                         nr = i_data[EXT3_DIND_BLOCK];
2202                         if (nr) {
2203                                 ext3_free_branches(handle, inode, NULL,
2204                                                    &nr, &nr+1, 2);
2205                                 i_data[EXT3_DIND_BLOCK] = 0;
2206                         }
2207                 case EXT3_DIND_BLOCK:
2208                         nr = i_data[EXT3_TIND_BLOCK];
2209                         if (nr) {
2210                                 ext3_free_branches(handle, inode, NULL,
2211                                                    &nr, &nr+1, 3);
2212                                 i_data[EXT3_TIND_BLOCK] = 0;
2213                         }
2214                 case EXT3_TIND_BLOCK:
2215                         ;
2216         }
2217
2218         ext3_discard_reservation(inode);
2219
2220         up(&ei->truncate_sem);
2221         inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
2222         ext3_mark_inode_dirty(handle, inode);
2223
2224         /* In a multi-transaction truncate, we only make the final
2225          * transaction synchronous */
2226         if (IS_SYNC(inode))
2227                 handle->h_sync = 1;
2228 out_stop:
2229         /*
2230          * If this was a simple ftruncate(), and the file will remain alive
2231          * then we need to clear up the orphan record which we created above.
2232          * However, if this was a real unlink then we were called by
2233          * ext3_delete_inode(), and we allow that function to clean up the
2234          * orphan info for us.
2235          */
2236         if (inode->i_nlink)
2237                 ext3_orphan_del(handle, inode);
2238
2239         ext3_journal_stop(handle);
2240 }
2241
2242 static unsigned long ext3_get_inode_block(struct super_block *sb,
2243                 unsigned long ino, struct ext3_iloc *iloc)
2244 {
2245         unsigned long desc, group_desc, block_group;
2246         unsigned long offset, block;
2247         struct buffer_head *bh;
2248         struct ext3_group_desc * gdp;
2249
2250
2251         if ((ino != EXT3_ROOT_INO &&
2252                 ino != EXT3_JOURNAL_INO &&
2253                 ino != EXT3_RESIZE_INO &&
2254                 ino < EXT3_FIRST_INO(sb)) ||
2255                 ino > le32_to_cpu(
2256                         EXT3_SB(sb)->s_es->s_inodes_count)) {
2257                 ext3_error (sb, "ext3_get_inode_block",
2258                             "bad inode number: %lu", ino);
2259                 return 0;
2260         }
2261         block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2262         if (block_group >= EXT3_SB(sb)->s_groups_count) {
2263                 ext3_error (sb, "ext3_get_inode_block",
2264                             "group >= groups count");
2265                 return 0;
2266         }
2267         smp_rmb();
2268         group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
2269         desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
2270         bh = EXT3_SB(sb)->s_group_desc[group_desc];
2271         if (!bh) {
2272                 ext3_error (sb, "ext3_get_inode_block",
2273                             "Descriptor not loaded");
2274                 return 0;
2275         }
2276
2277         gdp = (struct ext3_group_desc *) bh->b_data;
2278         /*
2279          * Figure out the offset within the block group inode table
2280          */
2281         offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2282                 EXT3_INODE_SIZE(sb);
2283         block = le32_to_cpu(gdp[desc].bg_inode_table) +
2284                 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2285
2286         iloc->block_group = block_group;
2287         iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2288         return block;
2289 }
2290
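/*
 * A worked instance of the lookup above, with invented geometry: 8192
 * inodes per group, 128-byte inodes, 4096-byte blocks, and a made-up
 * inode-table start.  Real values come from the group descriptor.
 */
#if 0
#include <stdio.h>

int main(void)
{
        unsigned long ino = 12345;
        unsigned long inodes_per_group = 8192, inode_size = 128;
        unsigned long block_size = 4096, inode_table = 1060;    /* invented */
        unsigned long block_group = (ino - 1) / inodes_per_group;      /* 1 */
        unsigned long offset = ((ino - 1) % inodes_per_group) * inode_size;
        unsigned long block = inode_table + offset / block_size;

        printf("group %lu, block %lu, offset-in-block %lu\n",
               block_group, block, offset % block_size);
        /* prints: group 1, block 1189, offset-in-block 3072 */
        return 0;
}
#endif
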
2291 /*
2292  * ext3_get_inode_loc returns with an extra refcount against the inode's
2293  * underlying buffer_head on success. If 'in_mem' is true, we have all
2294  * data in memory that is needed to recreate the on-disk version of this
2295  * inode.
2296  */
2297 static int __ext3_get_inode_loc(struct inode *inode,
2298                                 struct ext3_iloc *iloc, int in_mem)
2299 {
2300         unsigned long block;
2301         struct buffer_head *bh;
2302
2303         block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2304         if (!block)
2305                 return -EIO;
2306
2307         bh = sb_getblk(inode->i_sb, block);
2308         if (!bh) {
2309                 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2310                                 "unable to read inode block - "
2311                                 "inode=%lu, block=%lu", inode->i_ino, block);
2312                 return -EIO;
2313         }
2314         if (!buffer_uptodate(bh)) {
2315                 lock_buffer(bh);
2316                 if (buffer_uptodate(bh)) {
2317                         /* someone brought it uptodate while we waited */
2318                         unlock_buffer(bh);
2319                         goto has_buffer;
2320                 }
2321
2322                 /*
2323                  * If we have all information of the inode in memory and this
2324                  * is the only valid inode in the block, we need not read the
2325                  * block.
2326                  */
2327                 if (in_mem) {
2328                         struct buffer_head *bitmap_bh;
2329                         struct ext3_group_desc *desc;
2330                         int inodes_per_buffer;
2331                         int inode_offset, i;
2332                         int block_group;
2333                         int start;
2334
2335                         block_group = (inode->i_ino - 1) /
2336                                         EXT3_INODES_PER_GROUP(inode->i_sb);
2337                         inodes_per_buffer = bh->b_size /
2338                                 EXT3_INODE_SIZE(inode->i_sb);
2339                         inode_offset = ((inode->i_ino - 1) %
2340                                         EXT3_INODES_PER_GROUP(inode->i_sb));
2341                         start = inode_offset & ~(inodes_per_buffer - 1);
2342
2343                         /* Is the inode bitmap in cache? */
2344                         desc = ext3_get_group_desc(inode->i_sb,
2345                                                 block_group, NULL);
2346                         if (!desc)
2347                                 goto make_io;
2348
2349                         bitmap_bh = sb_getblk(inode->i_sb,
2350                                         le32_to_cpu(desc->bg_inode_bitmap));
2351                         if (!bitmap_bh)
2352                                 goto make_io;
2353
2354                         /*
2355                          * If the inode bitmap isn't in cache then the
2356                          * optimisation may end up performing two reads instead
2357                          * of one, so skip it.
2358                          */
2359                         if (!buffer_uptodate(bitmap_bh)) {
2360                                 brelse(bitmap_bh);
2361                                 goto make_io;
2362                         }
2363                         for (i = start; i < start + inodes_per_buffer; i++) {
2364                                 if (i == inode_offset)
2365                                         continue;
2366                                 if (ext3_test_bit(i, bitmap_bh->b_data))
2367                                         break;
2368                         }
2369                         brelse(bitmap_bh);
2370                         if (i == start + inodes_per_buffer) {
2371                                 /* all other inodes are free, so skip I/O */
2372                                 memset(bh->b_data, 0, bh->b_size);
2373                                 set_buffer_uptodate(bh);
2374                                 unlock_buffer(bh);
2375                                 goto has_buffer;
2376                         }
2377                 }
2378
2379 make_io:
2380                 /*
2381                  * There are other valid inodes in the buffer, this inode
2382                  * has in-inode xattrs, or we don't have this inode in memory.
2383                  * Read the block from disk.
2384                  */
2385                 get_bh(bh);
2386                 bh->b_end_io = end_buffer_read_sync;
2387                 submit_bh(READ, bh);
2388                 wait_on_buffer(bh);
2389                 if (!buffer_uptodate(bh)) {
2390                         ext3_error(inode->i_sb, "ext3_get_inode_loc",
2391                                         "unable to read inode block - "
2392                                         "inode=%lu, block=%lu",
2393                                         inode->i_ino, block);
2394                         brelse(bh);
2395                         return -EIO;
2396                 }
2397         }
2398 has_buffer:
2399         iloc->bh = bh;
2400         return 0;
2401 }
2402
2403 int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2404 {
2405         /* We have all inode data except xattrs in memory here. */
2406         return __ext3_get_inode_loc(inode, iloc,
2407                 !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
2408 }
2409
2410 void ext3_truncate(struct inode * inode)
2411 {
2412         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2413                 return;
2414         ext3_truncate_nocheck(inode);
2415 }
2416
2417 void ext3_set_inode_flags(struct inode *inode)
2418 {
2419         unsigned int flags = EXT3_I(inode)->i_flags;
2420
2421         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_IUNLINK|S_BARRIER|S_NOATIME|S_DIRSYNC);
2422         if (flags & EXT3_SYNC_FL)
2423                 inode->i_flags |= S_SYNC;
2424         if (flags & EXT3_APPEND_FL)
2425                 inode->i_flags |= S_APPEND;
2426         if (flags & EXT3_IMMUTABLE_FL)
2427                 inode->i_flags |= S_IMMUTABLE;
2428         if (flags & EXT3_IUNLINK_FL)
2429                 inode->i_flags |= S_IUNLINK;
2430         if (flags & EXT3_BARRIER_FL)
2431                 inode->i_flags |= S_BARRIER;
2432         if (flags & EXT3_NOATIME_FL)
2433                 inode->i_flags |= S_NOATIME;
2434         if (flags & EXT3_DIRSYNC_FL)
2435                 inode->i_flags |= S_DIRSYNC;
2436 }
2437
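/*
 * Userspace sees the persistent flags translated above via the GETFLAGS
 * ioctl (EXT3_IOC_GETFLAGS in kernels of this era; later kernels expose the
 * same value as FS_IOC_GETFLAGS), which is what lsattr(1) uses.  The path
 * and header choice here are a hypothetical sketch.
 */
#if 0
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(void)
{
        long flags = 0;
        int fd = open("/tmp/somefile", O_RDONLY);

        if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
                return 1;
        printf("inode flags: %#lx\n", flags);
        close(fd);
        return 0;
}
#endif
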
2438 void ext3_read_inode(struct inode * inode)
2439 {
2440         struct ext3_iloc iloc;
2441         struct ext3_inode *raw_inode;
2442         struct ext3_inode_info *ei = EXT3_I(inode);
2443         struct buffer_head *bh;
2444         int block;
2445         uid_t uid;
2446         gid_t gid;
2447
2448 #ifdef CONFIG_EXT3_FS_POSIX_ACL
2449         ei->i_acl = EXT3_ACL_NOT_CACHED;
2450         ei->i_default_acl = EXT3_ACL_NOT_CACHED;
2451 #endif
2452         ei->i_block_alloc_info = NULL;
2453
2454         if (__ext3_get_inode_loc(inode, &iloc, 0))
2455                 goto bad_inode;
2456         bh = iloc.bh;
2457         raw_inode = ext3_raw_inode(&iloc);
2458         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2459         uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2460         gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2461         if(!(test_opt (inode->i_sb, NO_UID32))) {
2462                 uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2463                 gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2464         }
2465         inode->i_uid = INOXID_UID(XID_TAG(inode), uid, gid);
2466         inode->i_gid = INOXID_GID(XID_TAG(inode), uid, gid);
2467         inode->i_xid = INOXID_XID(XID_TAG(inode), uid, gid,
2468                 le16_to_cpu(raw_inode->i_raw_xid));
2469
2470         inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2471         inode->i_size = le32_to_cpu(raw_inode->i_size);
2472         inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2473         inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2474         inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2475         inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2476
2477         ei->i_state = 0;
2478         ei->i_dir_start_lookup = 0;
2479         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2480         /* We now have enough fields to check if the inode was active or not.
2481          * This is needed because nfsd might try to access dead inodes.
2482          * The test is the same one that e2fsck uses.
2483          * NeilBrown 1999oct15
2484          */
2485         if (inode->i_nlink == 0) {
2486                 if (inode->i_mode == 0 ||
2487                     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2488                         /* this inode is deleted */
2489                         brelse (bh);
2490                         goto bad_inode;
2491                 }
2492                 /* The only unlinked inodes we let through here have
2493                  * valid i_mode and are being read by the orphan
2494                  * recovery code: that's fine, we're about to complete
2495                  * the process of deleting those. */
2496         }
2497         inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
2498                                          * (for stat), not the fs block
2499                                          * size */  
2500         inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2501         ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2502 #ifdef EXT3_FRAGMENTS
2503         ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2504         ei->i_frag_no = raw_inode->i_frag;
2505         ei->i_frag_size = raw_inode->i_fsize;
2506 #endif
2507         ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2508         if (!S_ISREG(inode->i_mode)) {
2509                 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2510         } else {
2511                 inode->i_size |=
2512                         ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2513         }
2514         ei->i_disksize = inode->i_size;
2515         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2516         ei->i_block_group = iloc.block_group;
2517         /*
2518          * NOTE! The in-memory inode i_data array is in little-endian order
2519          * even on big-endian machines: we do NOT byteswap the block numbers!
2520          */
2521         for (block = 0; block < EXT3_N_BLOCKS; block++)
2522                 ei->i_data[block] = raw_inode->i_block[block];
2523         INIT_LIST_HEAD(&ei->i_orphan);
2524
2525         if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
2526             EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
2527                 /*
2528                  * When mke2fs creates big inodes it does not zero out
2529                  * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
2530                  * so ignore those first few inodes.
2531                  */
2532                 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2533                 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2534                     EXT3_INODE_SIZE(inode->i_sb))
2535                         goto bad_inode;
2536                 if (ei->i_extra_isize == 0) {
2537                         /* The extra space is currently unused. Use it. */
2538                         ei->i_extra_isize = sizeof(struct ext3_inode) -
2539                                             EXT3_GOOD_OLD_INODE_SIZE;
2540                 } else {
2541                         __le32 *magic = (void *)raw_inode +
2542                                         EXT3_GOOD_OLD_INODE_SIZE +
2543                                         ei->i_extra_isize;
2544                         if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2545                                  ei->i_state |= EXT3_STATE_XATTR;
2546                 }
2547         } else
2548                 ei->i_extra_isize = 0;
2549
2550         if (S_ISREG(inode->i_mode)) {
2551                 inode->i_op = &ext3_file_inode_operations;
2552                 inode->i_fop = &ext3_file_operations;
2553                 ext3_set_aops(inode);
2554         } else if (S_ISDIR(inode->i_mode)) {
2555                 inode->i_op = &ext3_dir_inode_operations;
2556                 inode->i_fop = &ext3_dir_operations;
2557         } else if (S_ISLNK(inode->i_mode)) {
2558                 if (ext3_inode_is_fast_symlink(inode))
2559                         inode->i_op = &ext3_fast_symlink_inode_operations;
2560                 else {
2561                         inode->i_op = &ext3_symlink_inode_operations;
2562                         ext3_set_aops(inode);
2563                 }
2564         } else {
2565                 inode->i_op = &ext3_special_inode_operations;
2566                 if (raw_inode->i_block[0])
2567                         init_special_inode(inode, inode->i_mode,
2568                            old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2569                 else 
2570                         init_special_inode(inode, inode->i_mode,
2571                            new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2572         }
2573         brelse (iloc.bh);
2574         ext3_set_inode_flags(inode);
2575         return;
2576
2577 bad_inode:
2578         make_bad_inode(inode);
2579         return;
2580 }
2581
2582 /*
2583  * Post the struct inode info into an on-disk inode location in the
2584  * buffer-cache.  This gobbles the caller's reference to the
2585  * buffer_head in the inode location struct.
2586  *
2587  * The caller must have write access to iloc->bh.
2588  */
2589 static int ext3_do_update_inode(handle_t *handle, 
2590                                 struct inode *inode, 
2591                                 struct ext3_iloc *iloc)
2592 {
2593         struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
2594         struct ext3_inode_info *ei = EXT3_I(inode);
2595         struct buffer_head *bh = iloc->bh;
2596         uid_t uid = XIDINO_UID(XID_TAG(inode), inode->i_uid, inode->i_xid);
2597         gid_t gid = XIDINO_GID(XID_TAG(inode), inode->i_gid, inode->i_xid);
2598         int err = 0, rc, block;
2599
2600         /* For fields not tracked in the in-memory inode,
2601          * initialise them to zero for new inodes. */
2602         if (ei->i_state & EXT3_STATE_NEW)
2603                 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2604
2605         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2606         if(!(test_opt(inode->i_sb, NO_UID32))) {
2607                 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid));
2608                 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid));
2609 /*
2610  * Fix up interoperability with old kernels. Otherwise, old inodes get
2611  * re-used with the upper 16 bits of the uid/gid intact
2612  */
2613                 if(!ei->i_dtime) {
2614                         raw_inode->i_uid_high =
2615                                 cpu_to_le16(high_16_bits(uid));
2616                         raw_inode->i_gid_high =
2617                                 cpu_to_le16(high_16_bits(gid));
2618                 } else {
2619                         raw_inode->i_uid_high = 0;
2620                         raw_inode->i_gid_high = 0;
2621                 }
2622         } else {
2623                 raw_inode->i_uid_low =
2624                         cpu_to_le16(fs_high2lowuid(uid));
2625                 raw_inode->i_gid_low =
2626                         cpu_to_le16(fs_high2lowgid(gid));
2627                 raw_inode->i_uid_high = 0;
2628                 raw_inode->i_gid_high = 0;
2629         }
2630 #ifdef CONFIG_INOXID_INTERN
2631         raw_inode->i_raw_xid = cpu_to_le16(inode->i_xid);
2632 #endif
2633         raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2634         raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2635         raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2636         raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2637         raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2638         raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2639         raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2640         raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2641 #ifdef EXT3_FRAGMENTS
2642         raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2643         raw_inode->i_frag = ei->i_frag_no;
2644         raw_inode->i_fsize = ei->i_frag_size;
2645 #endif
2646         raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
2647         if (!S_ISREG(inode->i_mode)) {
2648                 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2649         } else {
2650                 raw_inode->i_size_high =
2651                         cpu_to_le32(ei->i_disksize >> 32);
2652                 if (ei->i_disksize > 0x7fffffffULL) {
2653                         struct super_block *sb = inode->i_sb;
2654                         if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
2655                                         EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
2656                             EXT3_SB(sb)->s_es->s_rev_level ==
2657                                         cpu_to_le32(EXT3_GOOD_OLD_REV)) {
2658                                 /* If this is the first large file
2659                                  * created, add a flag to the superblock.
2660                                  */
2661                                 err = ext3_journal_get_write_access(handle,
2662                                                 EXT3_SB(sb)->s_sbh);
2663                                 if (err)
2664                                         goto out_brelse;
2665                                 ext3_update_dynamic_rev(sb);
2666                                 EXT3_SET_RO_COMPAT_FEATURE(sb,
2667                                         EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2668                                 sb->s_dirt = 1;
2669                                 handle->h_sync = 1;
2670                                 err = ext3_journal_dirty_metadata(handle,
2671                                                 EXT3_SB(sb)->s_sbh);
2672                         }
2673                 }
2674         }
2675         raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2676         if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
2677                 if (old_valid_dev(inode->i_rdev)) {
2678                         raw_inode->i_block[0] =
2679                                 cpu_to_le32(old_encode_dev(inode->i_rdev));
2680                         raw_inode->i_block[1] = 0;
2681                 } else {
2682                         raw_inode->i_block[0] = 0;
2683                         raw_inode->i_block[1] =
2684                                 cpu_to_le32(new_encode_dev(inode->i_rdev));
2685                         raw_inode->i_block[2] = 0;
2686                 }
2687         } else for (block = 0; block < EXT3_N_BLOCKS; block++)
2688                 raw_inode->i_block[block] = ei->i_data[block];
2689
2690         if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE)
2691                 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2692
2693         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2694         rc = ext3_journal_dirty_metadata(handle, bh);
2695         if (!err)
2696                 err = rc;
2697         ei->i_state &= ~EXT3_STATE_NEW;
2698
2699 out_brelse:
2700         brelse (bh);
2701         ext3_std_error(inode->i_sb, err);
2702         return err;
2703 }
2704
2705 /*
2706  * ext3_write_inode()
2707  *
2708  * We are called from a few places:
2709  *
2710  * - Within generic_file_write() for O_SYNC files.
2711  *   Here, there will be no transaction running. We wait for any running
2712  *   transaction to commit.
2713  *
2714  * - Within sys_sync(), kupdate and such.
2715  *   We wait on commit, if told to.
2716  *
2717  * - Within prune_icache() (PF_MEMALLOC == true)
2718  *   Here we simply return.  We can't afford to block kswapd on the
2719  *   journal commit.
2720  *
2721  * In all cases it is actually safe for us to return without doing anything,
2722  * because the inode has been copied into a raw inode buffer in
2723  * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
2724  * knfsd.
2725  *
2726  * Note that we are absolutely dependent upon all inode dirtiers doing the
2727  * right thing: they *must* call mark_inode_dirty() after dirtying info in
2728  * which we are interested.
2729  *
2730  * It would be a bug for them to not do this.  The code:
2731  *
2732  *      mark_inode_dirty(inode)
2733  *      stuff();
2734  *      inode->i_size = expr;
2735  *
2736  * is in error because a kswapd-driven write_inode() could occur while
2737  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
2738  * will no longer be on the superblock's dirty inode list.
2739  */
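/*
 * A corrected version of the example above (an illustrative sketch,
 * not code from this file): perform all updates first and mark the
 * inode dirty last, so that a write_inode() racing with `stuff()'
 * can never observe, or lose, a half-made change:
 *
 *      stuff();
 *      inode->i_size = expr;
 *      mark_inode_dirty(inode);
 */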
2740 int ext3_write_inode(struct inode *inode, int wait)
2741 {
2742         if (current->flags & PF_MEMALLOC)
2743                 return 0;
2744
2745         if (ext3_journal_current_handle()) {
2746                 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2747                 dump_stack();
2748                 return -EIO;
2749         }
2750
2751         if (!wait)
2752                 return 0;
2753
2754         return ext3_force_commit(inode->i_sb);
2755 }
2756
2757 int ext3_setattr_flags(struct inode *inode, unsigned int flags)
2758 {
2759         unsigned int oldflags, newflags;
2760         int err = 0;
2761
2762         oldflags = EXT3_I(inode)->i_flags;
2763         newflags = oldflags &
2764                 ~(EXT3_IMMUTABLE_FL | EXT3_IUNLINK_FL | EXT3_BARRIER_FL);
2765         if (flags & ATTR_FLAG_IMMUTABLE)
2766                 newflags |= EXT3_IMMUTABLE_FL;
2767         if (flags & ATTR_FLAG_IUNLINK)
2768                 newflags |= EXT3_IUNLINK_FL;
2769         if (flags & ATTR_FLAG_BARRIER)
2770                 newflags |= EXT3_BARRIER_FL;
2771
2772         if (oldflags ^ newflags) {
2773                 handle_t *handle;
2774                 struct ext3_iloc iloc;
2775
2776                 handle = ext3_journal_start(inode, 1);
2777                 if (IS_ERR(handle))
2778                         return PTR_ERR(handle);
2779                 if (IS_SYNC(inode))
2780                         handle->h_sync = 1;
2781                 err = ext3_reserve_inode_write(handle, inode, &iloc);
2782                 if (err)
2783                         goto flags_err;
2784
2785                 EXT3_I(inode)->i_flags = newflags;
2786                 inode->i_ctime = CURRENT_TIME;
2787
2788                 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2789         flags_err:
2790                 ext3_journal_stop(handle);
2791         }
2792         return err;
2793 }
2794
2795 /*
2796  * ext3_setattr()
2797  *
2798  * Called from notify_change.
2799  *
2800  * We want to trap VFS attempts to truncate the file as soon as
2801  * possible.  In particular, we want to make sure that when the VFS
2802  * shrinks i_size, we put the inode on the orphan list and modify
2803  * i_disksize immediately, so that during the subsequent flushing of
2804  * dirty pages and freeing of disk blocks, we can guarantee that any
2805  * commit will leave the blocks being flushed in an unused state on
2806  * disk.  (On recovery, the inode will get truncated and the blocks will
2807  * be freed, so we have a strong guarantee that no future commit will
2808  * leave these blocks visible to the user.)  
2809  *
2810  * Called with inode->sem down.
2811  */
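/*
 * In outline, the size-shrinking path below does (a sketch of the
 * code that follows, not an additional API):
 *
 *      handle = ext3_journal_start(inode, 3);
 *      ext3_orphan_add(handle, inode);
 *      EXT3_I(inode)->i_disksize = attr->ia_size;
 *      ext3_mark_inode_dirty(handle, inode);
 *      ext3_journal_stop(handle);
 *
 * after which inode_setattr() calls ext3_truncate() to actually free
 * the blocks; after a crash, recovery truncates the orphan for us.
 */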
2812 int ext3_setattr(struct dentry *dentry, struct iattr *attr)
2813 {
2814         struct inode *inode = dentry->d_inode;
2815         int error, rc = 0;
2816         const unsigned int ia_valid = attr->ia_valid;
2817
2818         error = inode_change_ok(inode, attr);
2819         if (error)
2820                 return error;
2821
2822         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2823                 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid) ||
2824                 (ia_valid & ATTR_XID && attr->ia_xid != inode->i_xid)) {
2825                 handle_t *handle;
2826
2827                 /* (user+group)*(old+new) structure, inode write (sb,
2828                  * inode block, ? - but truncate inode update has it) */
2829                 handle = ext3_journal_start(inode, 4*EXT3_QUOTA_INIT_BLOCKS+3);
2830                 if (IS_ERR(handle)) {
2831                         error = PTR_ERR(handle);
2832                         goto err_out;
2833                 }
2834                 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2835                 if (error) {
2836                         ext3_journal_stop(handle);
2837                         return error;
2838                 }
2839                 /* Update corresponding info in inode so that everything is in
2840                  * one transaction */
2841                 if (attr->ia_valid & ATTR_UID)
2842                         inode->i_uid = attr->ia_uid;
2843                 if (attr->ia_valid & ATTR_GID)
2844                         inode->i_gid = attr->ia_gid;
2845                 if ((attr->ia_valid & ATTR_XID)
2846                         && inode->i_sb
2847                         && (inode->i_sb->s_flags & MS_TAGXID))
2848                         inode->i_xid = attr->ia_xid;
2849                 error = ext3_mark_inode_dirty(handle, inode);
2850                 ext3_journal_stop(handle);
2851         }
2852
2853         if (S_ISREG(inode->i_mode) &&
2854             attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2855                 handle_t *handle;
2856
2857                 handle = ext3_journal_start(inode, 3);
2858                 if (IS_ERR(handle)) {
2859                         error = PTR_ERR(handle);
2860                         goto err_out;
2861                 }
2862
2863                 error = ext3_orphan_add(handle, inode);
2864                 EXT3_I(inode)->i_disksize = attr->ia_size;
2865                 rc = ext3_mark_inode_dirty(handle, inode);
2866                 if (!error)
2867                         error = rc;
2868                 ext3_journal_stop(handle);
2869         }
2870
2871         if (ia_valid & ATTR_ATTR_FLAG) {
2872                 rc = ext3_setattr_flags(inode, attr->ia_attr_flags);
2873                 if (!error)
2874                         error = rc;
2875         }
2876
2877         rc = inode_setattr(inode, attr);
2878
2879         /* If inode_setattr's call to ext3_truncate failed to get a
2880          * transaction handle at all, we need to clean up the in-core
2881          * orphan list manually. */
2882         if (inode->i_nlink)
2883                 ext3_orphan_del(NULL, inode);
2884
2885         if (!rc && (ia_valid & ATTR_MODE))
2886                 rc = ext3_acl_chmod(inode);
2887
2888 err_out:
2889         ext3_std_error(inode->i_sb, error);
2890         if (!error)
2891                 error = rc;
2892         return error;
2893 }
2894
2895
2896 /*
2897  * akpm: how many blocks doth make a writepage()?
2898  *
2899  * With N blocks per page, it may be:
2900  * N data blocks
2901  * 2 indirect blocks
2902  * 2 dindirect blocks
2903  * 1 tindirect block
2904  * N+5 bitmap blocks (from the above)
2905  * N+5 group descriptor summary blocks
2906  * 1 inode block
2907  * 1 superblock.
2908  * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
2909  *
2910  * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
2911  *
2912  * With ordered or writeback data it's the same, less the N data blocks.
2913  *
2914  * If the inode's direct blocks can hold an integral number of pages then a
2915  * page cannot straddle two indirect blocks, and we can only touch one indirect
2916  * and dindirect block, and the "5" above becomes "3".
2917  *
2918  * This still overestimates under most circumstances.  If we were to pass the
2919  * start and end offsets in here as well we could do block_to_path() on each
2920  * block and work out the exact number of indirects which are touched.  Pah.
2921  */
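/*
 * Worked example (illustrative numbers): with 4K pages and 1K blocks,
 * bpp = N = 4, and EXT3_NDIR_BLOCKS (12) is a multiple of bpp, so
 * indirects = 3 in the code below.  Full data journaling then reserves
 * 3 * (4 + 3) + 2 = 23 credits, ordered/writeback 2 * (4 + 3) + 2 = 16,
 * plus 2 * EXT3_QUOTA_TRANS_BLOCKS when CONFIG_QUOTA is enabled.
 */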
2922
2923 static int ext3_writepage_trans_blocks(struct inode *inode)
2924 {
2925         int bpp = ext3_journal_blocks_per_page(inode);
2926         int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
2927         int ret;
2928
2929         if (ext3_should_journal_data(inode))
2930                 ret = 3 * (bpp + indirects) + 2;
2931         else
2932                 ret = 2 * (bpp + indirects) + 2;
2933
2934 #ifdef CONFIG_QUOTA
2935         /* We know that the structure was already allocated during DQUOT_INIT so
2936          * we will be updating only the data blocks + inodes */
2937         ret += 2*EXT3_QUOTA_TRANS_BLOCKS;
2938 #endif
2939
2940         return ret;
2941 }
2942
2943 /*
2944  * The caller must have previously called ext3_reserve_inode_write().
2945  * Given this, we know that the caller already has write access to iloc->bh.
2946  */
2947 int ext3_mark_iloc_dirty(handle_t *handle,
2948                 struct inode *inode, struct ext3_iloc *iloc)
2949 {
2950         int err = 0;
2951
2952         /* the do_update_inode consumes one bh->b_count */
2953         get_bh(iloc->bh);
2954
2955         /* ext3_do_update_inode() does journal_dirty_metadata */
2956         err = ext3_do_update_inode(handle, inode, iloc);
2957         put_bh(iloc->bh);
2958         return err;
2959 }
2960
2961 /* 
2962  * On success, we end up with an outstanding reference count against
2963  * iloc->bh.  This _must_ be cleaned up later. 
2964  */
2965
2966 int
2967 ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 
2968                          struct ext3_iloc *iloc)
2969 {
2970         int err = 0;
2971         if (handle) {
2972                 err = ext3_get_inode_loc(inode, iloc);
2973                 if (!err) {
2974                         BUFFER_TRACE(iloc->bh, "get_write_access");
2975                         err = ext3_journal_get_write_access(handle, iloc->bh);
2976                         if (err) {
2977                                 brelse(iloc->bh);
2978                                 iloc->bh = NULL;
2979                         }
2980                 }
2981         }
2982         ext3_std_error(inode->i_sb, err);
2983         return err;
2984 }
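/*
 * Typical usage of the two helpers above (a sketch; see
 * ext3_mark_inode_dirty() below for the canonical caller):
 *
 *      struct ext3_iloc iloc;
 *
 *      err = ext3_reserve_inode_write(handle, inode, &iloc);
 *      if (!err) {
 *              ... update the in-core inode ...
 *              err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 *      }
 *
 * ext3_mark_iloc_dirty() consumes the bh reference which
 * ext3_reserve_inode_write() left in iloc->bh.
 */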
2985
2986 /*
2987  * akpm: What we do here is to mark the in-core inode as clean
2988  * with respect to inode dirtiness (it may still be data-dirty).
2989  * This means that the in-core inode may be reaped by prune_icache
2990  * without having to perform any I/O.  This is a very good thing,
2991  * because *any* task may call prune_icache - even ones which
2992  * have a transaction open against a different journal.
2993  *
2994  * Is this cheating?  Not really.  Sure, we haven't written the
2995  * inode out, but prune_icache isn't a user-visible syncing function.
2996  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
2997  * we start and wait on commits.
2998  *
2999  * Is this efficient/effective?  Well, we're being nice to the system
3000  * by cleaning up our inodes proactively so they can be reaped
3001  * without I/O.  But we are potentially leaving up to five seconds'
3002  * worth of inodes floating about which prune_icache wants us to
3003  * write out.  One way to fix that would be to get prune_icache()
3004  * to do a write_super() to free up some memory.  It has the desired
3005  * effect.
3006  */
3007 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3008 {
3009         struct ext3_iloc iloc;
3010         int err;
3011
3012         might_sleep();
3013         err = ext3_reserve_inode_write(handle, inode, &iloc);
3014         if (!err)
3015                 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
3016         return err;
3017 }
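/*
 * Caller pattern (illustrative; compare ext3_dirty_inode() below and
 * ext3_setattr_flags() above):
 *
 *      handle = ext3_journal_start(inode, 1);
 *      if (!IS_ERR(handle)) {
 *              err = ext3_mark_inode_dirty(handle, inode);
 *              ext3_journal_stop(handle);
 *      }
 */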
3018
3019 /*
3020  * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
3021  *
3022  * We're really interested in the case where a file is being extended.
3023  * i_size has been changed by generic_commit_write() and we thus need
3024  * to include the updated inode in the current transaction.
3025  *
3026  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
3027  * are allocated to the file.
3028  *
3029  * If the inode is marked synchronous, we don't honour that here - doing
3030  * so would cause a commit on atime updates, which we don't bother doing.
3031  * We handle synchronous inodes at the highest possible level.
3032  */
3033 void ext3_dirty_inode(struct inode *inode)
3034 {
3035         handle_t *current_handle = ext3_journal_current_handle();
3036         handle_t *handle;
3037
3038         handle = ext3_journal_start(inode, 2);
3039         if (IS_ERR(handle))
3040                 goto out;
3041         if (current_handle &&
3042                 current_handle->h_transaction != handle->h_transaction) {
3043                 /* This task has a transaction open against a different fs */
3044                 printk(KERN_EMERG "%s: transactions do not match!\n",
3045                        __FUNCTION__);
3046         } else {
3047                 jbd_debug(5, "marking dirty.  outer handle=%p\n",
3048                                 current_handle);
3049                 ext3_mark_inode_dirty(handle, inode);
3050         }
3051         ext3_journal_stop(handle);
3052 out:
3053         return;
3054 }
3055
3056 #ifdef AKPM
3057 /* 
3058  * Bind an inode's backing buffer_head into this transaction, to prevent
3059  * it from being flushed to disk early.  Unlike
3060  * ext3_reserve_inode_write, this leaves behind no bh reference and
3061  * returns no iloc structure, so the caller needs to repeat the iloc
3062  * lookup to mark the inode dirty later.
3063  */
3064 static inline int
3065 ext3_pin_inode(handle_t *handle, struct inode *inode)
3066 {
3067         struct ext3_iloc iloc;
3068
3069         int err = 0;
3070         if (handle) {
3071                 err = ext3_get_inode_loc(inode, &iloc);
3072                 if (!err) {
3073                         BUFFER_TRACE(iloc.bh, "get_write_access");
3074                         err = journal_get_write_access(handle, iloc.bh);
3075                         if (!err)
3076                                 err = ext3_journal_dirty_metadata(handle, 
3077                                                                   iloc.bh);
3078                         brelse(iloc.bh);
3079                 }
3080         }
3081         ext3_std_error(inode->i_sb, err);
3082         return err;
3083 }
3084 #endif
3085
3086 int ext3_change_inode_journal_flag(struct inode *inode, int val)
3087 {
3088         journal_t *journal;
3089         handle_t *handle;
3090         int err;
3091
3092         /*
3093          * We have to be very careful here: changing a data block's
3094          * journaling status dynamically is dangerous.  If we write a
3095          * data block to the journal, change the status and then delete
3096          * that block, we risk forgetting to revoke the old log record
3097          * from the journal and so a subsequent replay can corrupt data.
3098          * So, first we make sure that the journal is empty and that
3099          * nobody is changing anything.
3100          */
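        /*
         * Hence the order implemented below (a summary sketch):
         *
         *      journal_lock_updates(journal);    block new transactions
         *      journal_flush(journal);           empty the journal
         *      ... flip EXT3_JOURNAL_DATA_FL, swap aops ...
         *      journal_unlock_updates(journal);  let updates resume
         */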
3101
3102         journal = EXT3_JOURNAL(inode);
3103         if (is_journal_aborted(journal) || IS_RDONLY(inode))
3104                 return -EROFS;
3105
3106         journal_lock_updates(journal);
3107         journal_flush(journal);
3108
3109         /*
3110          * OK, there are no updates running now, and all cached data is
3111          * synced to disk.  We are now in a completely consistent state
3112          * which doesn't have anything in the journal, and we know that
3113          * no filesystem updates are running, so it is safe to modify
3114          * the inode's in-core data-journaling state flag now.
3115          */
3116
3117         if (val)
3118                 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3119         else
3120                 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
3121         ext3_set_aops(inode);
3122
3123         journal_unlock_updates(journal);
3124
3125         /* Finally we can mark the inode as dirty. */
3126
3127         handle = ext3_journal_start(inode, 1);
3128         if (IS_ERR(handle))
3129                 return PTR_ERR(handle);
3130
3131         err = ext3_mark_inode_dirty(handle, inode);
3132         handle->h_sync = 1;
3133         ext3_journal_stop(handle);
3134         ext3_std_error(inode->i_sb, err);
3135
3136         return err;
3137 }