909b7d9779f726ff6661e0eb5de3fba5c53ab47f
[linux-2.6.git] / fs / ext3 / inode.c
1 /*
2  *  linux/fs/ext3/inode.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card (card@masi.ibp.fr)
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  *  from
10  *
11  *  linux/fs/minix/inode.c
12  *
13  *  Copyright (C) 1991, 1992  Linus Torvalds
14  *
15  *  Goal-directed block allocation by Stephen Tweedie
16  *      (sct@redhat.com), 1993, 1998
17  *  Big-endian to little-endian byte-swapping/bitmaps by
18  *        David S. Miller (davem@caip.rutgers.edu), 1995
19  *  64-bit file support on 64-bit platforms by Jakub Jelinek
20  *      (jj@sunsite.ms.mff.cuni.cz)
21  *
22  *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23  */
24
25 #include <linux/module.h>
26 #include <linux/fs.h>
27 #include <linux/time.h>
28 #include <linux/ext3_jbd.h>
29 #include <linux/jbd.h>
30 #include <linux/smp_lock.h>
31 #include <linux/highuid.h>
32 #include <linux/pagemap.h>
33 #include <linux/quotaops.h>
34 #include <linux/string.h>
35 #include <linux/buffer_head.h>
36 #include <linux/writeback.h>
37 #include <linux/mpage.h>
38 #include <linux/uio.h>
39 #include "xattr.h"
40 #include "acl.h"
41
42 /*
43  * Test whether an inode is a fast symlink.
44  */
45 static inline int ext3_inode_is_fast_symlink(struct inode *inode)
46 {
47         int ea_blocks = EXT3_I(inode)->i_file_acl ?
48                 (inode->i_sb->s_blocksize >> 9) : 0;
49
50         return (S_ISLNK(inode->i_mode) &&
51                 inode->i_blocks - ea_blocks == 0);
52 }
53
54 /* The ext3 forget function must perform a revoke if we are freeing data
55  * which has been journaled.  Metadata (eg. indirect blocks) must be
56  * revoked in all cases. 
57  *
58  * "bh" may be NULL: a metadata block may have been freed from memory
59  * but there may still be a record of it in the journal, and that record
60  * still needs to be revoked.
61  */
62
63 int ext3_forget(handle_t *handle, int is_metadata,
64                        struct inode *inode, struct buffer_head *bh,
65                        int blocknr)
66 {
67         int err;
68
69         BUFFER_TRACE(bh, "enter");
70
71         jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
72                   "data mode %lx\n",
73                   bh, is_metadata, inode->i_mode,
74                   test_opt(inode->i_sb, DATA_FLAGS));
75
76         /* Never use the revoke function if we are doing full data
77          * journaling: there is no need to, and a V1 superblock won't
78          * support it.  Otherwise, only skip the revoke on un-journaled
79          * data blocks. */
80
81         if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
82             (!is_metadata && !ext3_should_journal_data(inode))) {
83                 if (bh) {
84                         BUFFER_TRACE(bh, "call journal_forget");
85                         ext3_journal_forget(handle, bh);
86                 }
87                 return 0;
88         }
89
90         /*
91          * data!=journal && (is_metadata || should_journal_data(inode))
92          */
93         BUFFER_TRACE(bh, "call ext3_journal_revoke");
94         err = ext3_journal_revoke(handle, blocknr, bh);
95         if (err)
96                 ext3_abort(inode->i_sb, __FUNCTION__,
97                            "error %d when attempting revoke", err);
98         BUFFER_TRACE(bh, "exit");
99         return err;
100 }
101
102 /*
103  * Work out how many blocks we need to progress with the next chunk of a
104  * truncate transaction.
105  */
106
107 static unsigned long blocks_for_truncate(struct inode *inode) 
108 {
109         unsigned long needed;
110
111         needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
112
113         /* Give ourselves just enough room to cope with inodes in which
114          * i_blocks is corrupt: we've seen disk corruptions in the past
115          * which resulted in random data in an inode which looked enough
116          * like a regular file for ext3 to try to delete it.  Things
117          * will go a bit crazy if that happens, but at least we should
118          * try not to panic the whole kernel. */
119         if (needed < 2)
120                 needed = 2;
121
122         /* But we need to bound the transaction so we don't overflow the
123          * journal. */
124         if (needed > EXT3_MAX_TRANS_DATA) 
125                 needed = EXT3_MAX_TRANS_DATA;
126
127         return EXT3_DATA_TRANS_BLOCKS + needed;
128 }
129
130 /* 
131  * Truncate transactions can be complex and absolutely huge.  So we need to
132  * be able to restart the transaction at a convenient checkpoint to make
133  * sure we don't overflow the journal.
134  *
135  * start_transaction gets us a new handle for a truncate transaction,
136  * and extend_transaction tries to extend the existing one a bit.  If
137  * extend fails, we need to propagate the failure up and restart the
138  * transaction in the top-level truncate loop. --sct 
139  */
140
141 static handle_t *start_transaction(struct inode *inode) 
142 {
143         handle_t *result;
144
145         result = ext3_journal_start(inode, blocks_for_truncate(inode));
146         if (!IS_ERR(result))
147                 return result;
148
149         ext3_std_error(inode->i_sb, PTR_ERR(result));
150         return result;
151 }
152
153 /*
154  * Try to extend this transaction for the purposes of truncation.
155  *
156  * Returns 0 if we managed to create more room.  If we can't create more
157  * room, and the transaction must be restarted we return 1.
158  */
159 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
160 {
161         if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
162                 return 0;
163         if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
164                 return 0;
165         return 1;
166 }
167
168 /*
169  * Restart the transaction associated with *handle.  This does a commit,
170  * so before we call here everything must be consistently dirtied against
171  * this transaction.
172  */
173 static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
174 {
175         jbd_debug(2, "restarting handle %p\n", handle);
176         return ext3_journal_restart(handle, blocks_for_truncate(inode));
177 }
178
179 /*
180  * Called at each iput()
181  *
182  * The inode may be "bad" if ext3_read_inode() saw an error from
183  * ext3_get_inode(), so we need to check that to avoid freeing random disk
184  * blocks.
185  */
void ext3_put_inode(struct inode *inode)
{
        /* Bad inodes are left completely untouched. */
        if (is_bad_inode(inode))
                return;

        ext3_discard_prealloc(inode);
}
191
192 /*
193  * Called at the last iput() if i_nlink is zero.
194  */
195 void ext3_delete_inode (struct inode * inode)
196 {
197         handle_t *handle;
198
199         if (is_bad_inode(inode))
200                 goto no_delete;
201
202         handle = start_transaction(inode);
203         if (IS_ERR(handle)) {
204                 /* If we're going to skip the normal cleanup, we still
205                  * need to make sure that the in-core orphan linked list
206                  * is properly cleaned up. */
207                 ext3_orphan_del(NULL, inode);
208                 goto no_delete;
209         }
210
211         if (IS_SYNC(inode))
212                 handle->h_sync = 1;
213         inode->i_size = 0;
214         if (inode->i_blocks)
215                 ext3_truncate(inode);
216         /*
217          * Kill off the orphan record which ext3_truncate created.
218          * AKPM: I think this can be inside the above `if'.
219          * Note that ext3_orphan_del() has to be able to cope with the
220          * deletion of a non-existent orphan - this is because we don't
221          * know if ext3_truncate() actually created an orphan record.
222          * (Well, we could do this if we need to, but heck - it works)
223          */
224         ext3_orphan_del(handle, inode);
225         EXT3_I(inode)->i_dtime  = get_seconds();
226
227         /* 
228          * One subtle ordering requirement: if anything has gone wrong
229          * (transaction abort, IO errors, whatever), then we can still
230          * do these next steps (the fs will already have been marked as
231          * having errors), but we can't free the inode if the mark_dirty
232          * fails.  
233          */
234         if (ext3_mark_inode_dirty(handle, inode))
235                 /* If that failed, just do the required in-core inode clear. */
236                 clear_inode(inode);
237         else
238                 ext3_free_inode(handle, inode);
239         ext3_journal_stop(handle);
240         return;
241 no_delete:
242         clear_inode(inode);     /* We must guarantee clearing of inode... */
243 }
244
void ext3_discard_prealloc (struct inode * inode)
{
#ifdef EXT3_PREALLOCATE
        struct ext3_inode_info *ei = EXT3_I(inode);
        unsigned short nr_blocks;
        unsigned long first_block;

        /* Writer: ->i_prealloc* */
        if (!ei->i_prealloc_count)
                return;

        /* Snapshot the window, reset it, then hand the blocks back. */
        nr_blocks = ei->i_prealloc_count;
        first_block = ei->i_prealloc_block;
        ei->i_prealloc_count = 0;
        ei->i_prealloc_block = 0;
        /* Writer: end */

        ext3_free_blocks (inode, first_block, nr_blocks);
#endif
}
260
261 static int ext3_alloc_block (handle_t *handle,
262                         struct inode * inode, unsigned long goal, int *err)
263 {
264         unsigned long result;
265
266 #ifdef EXT3_PREALLOCATE
267 #ifdef EXT3FS_DEBUG
268         static unsigned long alloc_hits, alloc_attempts;
269 #endif
270         struct ext3_inode_info *ei = EXT3_I(inode);
271         /* Writer: ->i_prealloc* */
272         if (ei->i_prealloc_count &&
273             (goal == ei->i_prealloc_block ||
274              goal + 1 == ei->i_prealloc_block))
275         {
276                 result = ei->i_prealloc_block++;
277                 ei->i_prealloc_count--;
278                 /* Writer: end */
279                 ext3_debug ("preallocation hit (%lu/%lu).\n",
280                             ++alloc_hits, ++alloc_attempts);
281         } else {
282                 ext3_discard_prealloc (inode);
283                 ext3_debug ("preallocation miss (%lu/%lu).\n",
284                             alloc_hits, ++alloc_attempts);
285                 if (S_ISREG(inode->i_mode))
286                         result = ext3_new_block (inode, goal, 
287                                  &ei->i_prealloc_count,
288                                  &ei->i_prealloc_block, err);
289                 else
290                         result = ext3_new_block(inode, goal, NULL, NULL, err);
291                 /*
292                  * AKPM: this is somewhat sticky.  I'm not surprised it was
293                  * disabled in 2.2's ext3.  Need to integrate b_committed_data
294                  * guarding with preallocation, if indeed preallocation is
295                  * effective.
296                  */
297         }
298 #else
299         result = ext3_new_block(handle, inode, goal, NULL, NULL, err);
300 #endif
301         return result;
302 }
303
304
305 typedef struct {                /* one link in a chain of block pointers */
306         u32     *p;             /* address of the slot the block number was read from */
307         u32     key;            /* value found in *p when the link was recorded */
308         struct buffer_head *bh; /* buffer hosting the slot; NULL when the slot lives in the inode */
309 } Indirect;
310
311 static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v)
312 {
313         p->key = *(p->p = v);
314         p->bh = bh;
315 }
316
317 static inline int verify_chain(Indirect *from, Indirect *to)
318 {
319         while (from <= to && from->key == *from->p)
320                 from++;
321         return (from > to);
322 }
323
324 /**
325  *      ext3_block_to_path - parse the block number into array of offsets
326  *      @inode: inode in question (we are only interested in its superblock)
327  *      @i_block: block number to be parsed
328  *      @offsets: array to store the offsets in
329  *      @boundary: set this non-zero if the referred-to block is likely to be
330  *             followed (on disk) by an indirect block.
331  *
332  *      To store the locations of file's data ext3 uses a data structure common
333  *      for UNIX filesystems - tree of pointers anchored in the inode, with
334  *      data blocks at leaves and indirect blocks in intermediate nodes.
335  *      This function translates the block number into path in that tree -
336  *      return value is the path length and @offsets[n] is the offset of
337  *      pointer to (n+1)th node in the nth one. If @i_block is out of range
338  *      (negative or too large) warning is printed and zero returned.
339  *
340  *      Note: function doesn't find node addresses, so no IO is needed. All
341  *      we need to know is the capacity of indirect blocks (taken from the
342  *      inode->i_sb).
343  */
344
345 /*
346  * Portability note: the last comparison (check that we fit into triple
347  * indirect block) is spelled differently, because otherwise on an
348  * architecture with 32-bit longs and 8Kb pages we might get into trouble
349  * if our filesystem had 8Kb blocks. We might use long long, but that would
350  * kill us on x86. Oh, well, at least the sign propagation does not matter -
351  * i_block would have to be negative in the very beginning, so we would not
352  * get there at all.
353  */
354
355 static int ext3_block_to_path(struct inode *inode,
356                         long i_block, int offsets[4], int *boundary)
357 {
358         int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
359         int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
360         const long direct_blocks = EXT3_NDIR_BLOCKS,
361                 indirect_blocks = ptrs,
362                 double_blocks = (1 << (ptrs_bits * 2));
363         int n = 0;
364         int final = 0;
365
366         if (i_block < 0) {
367                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
368         } else if (i_block < direct_blocks) {
369                 offsets[n++] = i_block;
370                 final = direct_blocks;
371         } else if ( (i_block -= direct_blocks) < indirect_blocks) {
372                 offsets[n++] = EXT3_IND_BLOCK;
373                 offsets[n++] = i_block;
374                 final = ptrs;
375         } else if ((i_block -= indirect_blocks) < double_blocks) {
376                 offsets[n++] = EXT3_DIND_BLOCK;
377                 offsets[n++] = i_block >> ptrs_bits;
378                 offsets[n++] = i_block & (ptrs - 1);
379                 final = ptrs;
380         } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
381                 offsets[n++] = EXT3_TIND_BLOCK;
382                 offsets[n++] = i_block >> (ptrs_bits * 2);
383                 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
384                 offsets[n++] = i_block & (ptrs - 1);
385                 final = ptrs;
386         } else {
387                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
388         }
389         if (boundary)
390                 *boundary = (i_block & (ptrs - 1)) == (final - 1);
391         return n;
392 }
393
394 /**
395  *      ext3_get_branch - read the chain of indirect blocks leading to data
396  *      @inode: inode in question
397  *      @depth: depth of the chain (1 - direct pointer, etc.)
398  *      @offsets: offsets of pointers in inode/indirect blocks
399  *      @chain: place to store the result
400  *      @err: here we store the error value
401  *
402  *      Function fills the array of triples <key, p, bh> and returns %NULL
403  *      if everything went OK or the pointer to the last filled triple
404  *      (incomplete one) otherwise. Upon the return chain[i].key contains
405  *      the number of (i+1)-th block in the chain (as it is stored in memory,
406  *      i.e. little-endian 32-bit), chain[i].p contains the address of that
407  *      number (it points into struct inode for i==0 and into the bh->b_data
408  *      for i>0) and chain[i].bh points to the buffer_head of i-th indirect
409  *      block for i>0 and NULL for i==0. In other words, it holds the block
410  *      numbers of the chain, addresses they were taken from (and where we can
411  *      verify that chain did not change) and buffer_heads hosting these
412  *      numbers.
413  *
414  *      Function stops when it stumbles upon zero pointer (absent block)
415  *              (pointer to last triple returned, *@err == 0)
416  *      or when it gets an IO error reading an indirect block
417  *              (ditto, *@err == -EIO)
418  *      or when it notices that chain had been changed while it was reading
419  *              (ditto, *@err == -EAGAIN)
420  *      or when it reads all @depth-1 indirect blocks successfully and finds
421  *      the whole chain, all way to the data (returns %NULL, *err == 0).
422  */
423 static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
424                                  Indirect chain[4], int *err)
425 {
426         struct super_block *sb = inode->i_sb;
427         Indirect *p = chain;
428         struct buffer_head *bh;
429
430         *err = 0;
431         /* i_data is not going away, no lock needed */
432         add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
433         if (!p->key)
434                 goto no_block;
435         while (--depth) {
436                 bh = sb_bread(sb, le32_to_cpu(p->key));
437                 if (!bh)
438                         goto failure;
439                 /* Reader: pointers */
440                 if (!verify_chain(chain, p))
441                         goto changed;
442                 add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
443                 /* Reader: end */
444                 if (!p->key)
445                         goto no_block;
446         }
447         return NULL;
448
449 changed:
450         brelse(bh);
451         *err = -EAGAIN;
452         goto no_block;
453 failure:
454         *err = -EIO;
455 no_block:
456         return p;
457 }
458
459 /**
460  *      ext3_find_near - find a place for allocation with sufficient locality
461  *      @inode: owner
462  *      @ind: descriptor of indirect block.
463  *
464  *      This function returns the preferred place for block allocation.
465  *      It is used when heuristic for sequential allocation fails.
466  *      Rules are:
467  *        + if there is a block to the left of our position - allocate near it.
468  *        + if pointer will live in indirect block - allocate near that block.
469  *        + if pointer will live in inode - allocate in the same
470  *          cylinder group. 
471  *
472  * In the latter case we colour the starting block by the callers PID to
473  * prevent it from clashing with concurrent allocations for a different inode
474  * in the same block group.   The PID is used here so that functionally related
475  * files will be close-by on-disk.
476  *
477  *      Caller must make sure that @ind is valid and will stay that way.
478  */
479
480 static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
481 {
482         struct ext3_inode_info *ei = EXT3_I(inode);
483         u32 *start = ind->bh ? (u32*) ind->bh->b_data : ei->i_data;
484         u32 *p;
485         unsigned long bg_start;
486         unsigned long colour;
487
488         /* Try to find previous block */
489         for (p = ind->p - 1; p >= start; p--)
490                 if (*p)
491                         return le32_to_cpu(*p);
492
493         /* No such thing, so let's try location of indirect block */
494         if (ind->bh)
495                 return ind->bh->b_blocknr;
496
497         /*
498          * It is going to be refered from inode itself? OK, just put it into
499          * the same cylinder group then.
500          */
501         bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
502                 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
503         colour = (current->pid % 16) *
504                         (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
505         return bg_start + colour;
506 }
507
508 /**
509  *      ext3_find_goal - find a preferred place for allocation.
510  *      @inode: owner
511  *      @block:  block we want
512  *      @chain:  chain of indirect blocks
513  *      @partial: pointer to the last triple within a chain
514  *      @goal:  place to store the result.
515  *
516  *      Normally this function finds the preferred place for block allocation,
517  *      stores it in *@goal and returns zero. If the branch had been changed
518  *      under us we return -EAGAIN.
519  */
520
521 static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
522                           Indirect *partial, unsigned long *goal)
523 {
524         struct ext3_inode_info *ei = EXT3_I(inode);
525         /* Writer: ->i_next_alloc* */
526         if (block == ei->i_next_alloc_block + 1) {
527                 ei->i_next_alloc_block++;
528                 ei->i_next_alloc_goal++;
529         }
530         /* Writer: end */
531         /* Reader: pointers, ->i_next_alloc* */
532         if (verify_chain(chain, partial)) {
533                 /*
534                  * try the heuristic for sequential allocation,
535                  * failing that at least try to get decent locality.
536                  */
537                 if (block == ei->i_next_alloc_block)
538                         *goal = ei->i_next_alloc_goal;
539                 if (!*goal)
540                         *goal = ext3_find_near(inode, partial);
541                 return 0;
542         }
543         /* Reader: end */
544         return -EAGAIN;
545 }
546
547 /**
548  *      ext3_alloc_branch - allocate and set up a chain of blocks.
549  *      @inode: owner
550  *      @num: depth of the chain (number of blocks to allocate)
551  *      @offsets: offsets (in the blocks) to store the pointers to next.
552  *      @branch: place to store the chain in.
553  *
554  *      This function allocates @num blocks, zeroes out all but the last one,
555  *      links them into chain and (if we are synchronous) writes them to disk.
556  *      In other words, it prepares a branch that can be spliced onto the
557  *      inode. It stores the information about that chain in the branch[], in
558  *      the same format as ext3_get_branch() would do. We are calling it after
559  *      we had read the existing part of chain and partial points to the last
560  *      triple of that (one with zero ->key). Upon the exit we have the same
561  *      picture as after the successful ext3_get_block(), except that in one
562  *      place chain is disconnected - *branch->p is still zero (we did not
563  *      set the last link), but branch->key contains the number that should
564  *      be placed into *branch->p to fill that gap.
565  *
566  *      If allocation fails we free all blocks we've allocated (and forget
567  *      their buffer_heads) and return the error value from the failed
568  *      ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
569  *      as described above and return 0.
570  */
571
572 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
573                              int num,
574                              unsigned long goal,
575                              int *offsets,
576                              Indirect *branch)
577 {
578         int blocksize = inode->i_sb->s_blocksize;
579         int n = 0, keys = 0;
580         int err = 0;
581         int i;
582         int parent = ext3_alloc_block(handle, inode, goal, &err);
583
584         branch[0].key = cpu_to_le32(parent);
585         if (parent) {
586                 for (n = 1; n < num; n++) {
587                         struct buffer_head *bh;
588                         /* Allocate the next block */
589                         int nr = ext3_alloc_block(handle, inode, parent, &err);
590                         if (!nr)
591                                 break;
592                         branch[n].key = cpu_to_le32(nr);
593                         keys = n+1;
594
595                         /*
596                          * Get buffer_head for parent block, zero it out
597                          * and set the pointer to new one, then send
598                          * parent to disk.  
599                          */
600                         bh = sb_getblk(inode->i_sb, parent);
601                         branch[n].bh = bh;
602                         lock_buffer(bh);
603                         BUFFER_TRACE(bh, "call get_create_access");
604                         err = ext3_journal_get_create_access(handle, bh);
605                         if (err) {
606                                 unlock_buffer(bh);
607                                 brelse(bh);
608                                 break;
609                         }
610
611                         memset(bh->b_data, 0, blocksize);
612                         branch[n].p = (u32*) bh->b_data + offsets[n];
613                         *branch[n].p = branch[n].key;
614                         BUFFER_TRACE(bh, "marking uptodate");
615                         set_buffer_uptodate(bh);
616                         unlock_buffer(bh);
617
618                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
619                         err = ext3_journal_dirty_metadata(handle, bh);
620                         if (err)
621                                 break;
622
623                         parent = nr;
624                 }
625         }
626         if (n == num)
627                 return 0;
628
629         /* Allocation failed, free what we already allocated */
630         for (i = 1; i < keys; i++) {
631                 BUFFER_TRACE(branch[i].bh, "call journal_forget");
632                 ext3_journal_forget(handle, branch[i].bh);
633         }
634         for (i = 0; i < keys; i++)
635                 ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
636         return err;
637 }
638
639 /**
640  *      ext3_splice_branch - splice the allocated branch onto inode.
641  *      @inode: owner
642  *      @block: (logical) number of block we are adding
643  *      @chain: chain of indirect blocks (with a missing link - see
644  *              ext3_alloc_branch)
645  *      @where: location of missing link
646  *      @num:   number of blocks we are adding
647  *
648  *      This function verifies that chain (up to the missing link) had not
649  *      changed, fills the missing link and does all housekeeping needed in
650  *      inode (->i_blocks, etc.). In case of success we end up with the full
651  *      chain to new block and return 0. Otherwise (== chain had been changed)
652  *      we free the new blocks (forgetting their buffer_heads, indeed) and
653  *      return -EAGAIN.
654  */
655
656 static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
657                               Indirect chain[4], Indirect *where, int num)
658 {
659         int i;
660         int err = 0;
661         struct ext3_inode_info *ei = EXT3_I(inode);
662
663         /*
664          * If we're splicing into a [td]indirect block (as opposed to the
665          * inode) then we need to get write access to the [td]indirect block
666          * before the splice.
667          */
668         if (where->bh) {
669                 BUFFER_TRACE(where->bh, "get_write_access");
670                 err = ext3_journal_get_write_access(handle, where->bh);
671                 if (err)
672                         goto err_out;
673         }
674         /* Verify that place we are splicing to is still there and vacant */
675
676         /* Writer: pointers, ->i_next_alloc* */
677         if (!verify_chain(chain, where-1) || *where->p)
678                 /* Writer: end */
679                 goto changed;
680
681         /* That's it */
682
683         *where->p = where->key;
684         ei->i_next_alloc_block = block;
685         ei->i_next_alloc_goal = le32_to_cpu(where[num-1].key);
686         /* Writer: end */
687
688         /* We are done with atomic stuff, now do the rest of housekeeping */
689
690         inode->i_ctime = CURRENT_TIME;
691         ext3_mark_inode_dirty(handle, inode);
692
693         /* had we spliced it onto indirect block? */
694         if (where->bh) {
695                 /*
696                  * akpm: If we spliced it onto an indirect block, we haven't
697                  * altered the inode.  Note however that if it is being spliced
698                  * onto an indirect block at the very end of the file (the
699                  * file is growing) then we *will* alter the inode to reflect
700                  * the new i_size.  But that is not done here - it is done in
701                  * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
702                  */
703                 jbd_debug(5, "splicing indirect only\n");
704                 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
705                 err = ext3_journal_dirty_metadata(handle, where->bh);
706                 if (err) 
707                         goto err_out;
708         } else {
709                 /*
710                  * OK, we spliced it into the inode itself on a direct block.
711                  * Inode was dirtied above.
712                  */
713                 jbd_debug(5, "splicing direct\n");
714         }
715         return err;
716
717 changed:
718         /*
719          * AKPM: if where[i].bh isn't part of the current updating
720          * transaction then we explode nastily.  Test this code path.
721          */
722         jbd_debug(1, "the chain changed: try again\n");
723         err = -EAGAIN;
724
725 err_out:
726         for (i = 1; i < num; i++) {
727                 BUFFER_TRACE(where[i].bh, "call journal_forget");
728                 ext3_journal_forget(handle, where[i].bh);
729         }
730         /* For the normal collision cleanup case, we free up the blocks.
731          * On genuine filesystem errors we don't even think about doing
732          * that. */
733         if (err == -EAGAIN)
734                 for (i = 0; i < num; i++)
735                         ext3_free_blocks(handle, inode, 
736                                          le32_to_cpu(where[i].key), 1);
737         return err;
738 }
739
740 /*
741  * Allocation strategy is simple: if we have to allocate something, we will
742  * have to go the whole way to leaf. So let's do it before attaching anything
743  * to tree, set linkage between the newborn blocks, write them if sync is
744  * required, recheck the path, free and repeat if check fails, otherwise
745  * set the last missing link (that will protect us from any truncate-generated
746  * removals - all blocks on the path are immune now) and possibly force the
747  * write on the parent block.
748  * That has a nice additional property: no special recovery from the failed
749  * allocations is needed - we simply release blocks and do not touch anything
750  * reachable from inode.
751  *
752  * akpm: `handle' can be NULL if create == 0.
753  *
754  * The BKL may not be held on entry here.  Be sure to take it early.
755  */
756
757 static int
758 ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
759                 struct buffer_head *bh_result, int create, int extend_disksize)
760 {
761         int err = -EIO;
762         int offsets[4];
763         Indirect chain[4];
764         Indirect *partial;
765         unsigned long goal;
766         int left;
767         int boundary = 0;
768         int depth = ext3_block_to_path(inode, iblock, offsets, &boundary);
769         struct ext3_inode_info *ei = EXT3_I(inode);
770
771         J_ASSERT(handle != NULL || create == 0);
772
773         if (depth == 0)
774                 goto out;
775
776 reread:
777         partial = ext3_get_branch(inode, depth, offsets, chain, &err);
778
779         /* Simplest case - block found, no allocation needed */
780         if (!partial) {
781                 clear_buffer_new(bh_result);
782 got_it:
783                 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
784                 if (boundary)
785                         set_buffer_boundary(bh_result);
786                 /* Clean up and exit */
787                 partial = chain+depth-1; /* the whole chain */
788                 goto cleanup;
789         }
790
791         /* Next simple case - plain lookup or failed read of indirect block */
792         if (!create || err == -EIO) {
793 cleanup:
794                 while (partial > chain) {
795                         BUFFER_TRACE(partial->bh, "call brelse");
796                         brelse(partial->bh);
797                         partial--;
798                 }
799                 BUFFER_TRACE(bh_result, "returned");
800 out:
801                 return err;
802         }
803
804         /*
805          * Indirect block might be removed by truncate while we were
806          * reading it. Handling of that case (forget what we've got and
807          * reread) is taken out of the main path.
808          */
809         if (err == -EAGAIN)
810                 goto changed;
811
812         goal = 0;
813         down(&ei->truncate_sem);
814         if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0) {
815                 up(&ei->truncate_sem);
816                 goto changed;
817         }
818
819         left = (chain + depth) - partial;
820
821         /*
822          * Block out ext3_truncate while we alter the tree
823          */
824         err = ext3_alloc_branch(handle, inode, left, goal,
825                                         offsets+(partial-chain), partial);
826
827         /* The ext3_splice_branch call will free and forget any buffers
828          * on the new chain if there is a failure, but that risks using
829          * up transaction credits, especially for bitmaps where the
830          * credits cannot be returned.  Can we handle this somehow?  We
831          * may need to return -EAGAIN upwards in the worst case.  --sct */
832         if (!err)
833                 err = ext3_splice_branch(handle, inode, iblock, chain,
834                                          partial, left);
835         /* i_disksize growing is protected by truncate_sem
836          * don't forget to protect it if you're about to implement
837          * concurrent ext3_get_block() -bzzz */
838         if (!err && extend_disksize && inode->i_size > ei->i_disksize)
839                 ei->i_disksize = inode->i_size;
840         up(&ei->truncate_sem);
841         if (err == -EAGAIN)
842                 goto changed;
843         if (err)
844                 goto cleanup;
845
846         set_buffer_new(bh_result);
847         goto got_it;
848
849 changed:
850         while (partial > chain) {
851                 jbd_debug(1, "buffer chain changed, retrying\n");
852                 BUFFER_TRACE(partial->bh, "brelsing");
853                 brelse(partial->bh);
854                 partial--;
855         }
856         goto reread;
857 }
858
859 static int ext3_get_block(struct inode *inode, sector_t iblock,
860                         struct buffer_head *bh_result, int create)
861 {
862         handle_t *handle = NULL;
863         int ret;
864
865         if (create) {
866                 handle = ext3_journal_current_handle();
867                 J_ASSERT(handle != 0);
868         }
869         ret = ext3_get_block_handle(handle, inode, iblock,
870                                 bh_result, create, 1);
871         return ret;
872 }
873
874 #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
875
876 static int
877 ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
878                 unsigned long max_blocks, struct buffer_head *bh_result,
879                 int create)
880 {
881         handle_t *handle = journal_current_handle();
882         int ret = 0;
883
884         if (!handle)
885                 goto get_block;         /* A read */
886
887         if (handle->h_transaction->t_state == T_LOCKED) {
888                 /*
889                  * Huge direct-io writes can hold off commits for long
890                  * periods of time.  Let this commit run.
891                  */
892                 ext3_journal_stop(handle);
893                 handle = ext3_journal_start(inode, DIO_CREDITS);
894                 if (IS_ERR(handle))
895                         ret = PTR_ERR(handle);
896                 goto get_block;
897         }
898
899         if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
900                 /*
901                  * Getting low on buffer credits...
902                  */
903                 ret = ext3_journal_extend(handle, DIO_CREDITS);
904                 if (ret > 0) {
905                         /*
906                          * Couldn't extend the transaction.  Start a new one.
907                          */
908                         ret = ext3_journal_restart(handle, DIO_CREDITS);
909                 }
910         }
911
912 get_block:
913         if (ret == 0)
914                 ret = ext3_get_block_handle(handle, inode, iblock,
915                                         bh_result, create, 0);
916         bh_result->b_size = (1 << inode->i_blkbits);
917         return ret;
918 }
919
920 /*
921  * `handle' can be NULL if create is zero
922  */
923 struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
924                                 long block, int create, int * errp)
925 {
926         struct buffer_head dummy;
927         int fatal = 0, err;
928
929         J_ASSERT(handle != NULL || create == 0);
930
931         dummy.b_state = 0;
932         dummy.b_blocknr = -1000;
933         buffer_trace_init(&dummy.b_history);
934         *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
935         if (!*errp && buffer_mapped(&dummy)) {
936                 struct buffer_head *bh;
937                 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
938                 if (buffer_new(&dummy)) {
939                         J_ASSERT(create != 0);
940                         J_ASSERT(handle != 0);
941
942                         /* Now that we do not always journal data, we
943                            should keep in mind whether this should
944                            always journal the new buffer as metadata.
945                            For now, regular file writes use
946                            ext3_get_block instead, so it's not a
947                            problem. */
948                         lock_buffer(bh);
949                         BUFFER_TRACE(bh, "call get_create_access");
950                         fatal = ext3_journal_get_create_access(handle, bh);
951                         if (!fatal && !buffer_uptodate(bh)) {
952                                 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
953                                 set_buffer_uptodate(bh);
954                         }
955                         unlock_buffer(bh);
956                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
957                         err = ext3_journal_dirty_metadata(handle, bh);
958                         if (!fatal)
959                                 fatal = err;
960                 } else {
961                         BUFFER_TRACE(bh, "not a new buffer");
962                 }
963                 if (fatal) {
964                         *errp = fatal;
965                         brelse(bh);
966                         bh = NULL;
967                 }
968                 return bh;
969         }
970         return NULL;
971 }
972
973 struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
974                                int block, int create, int *err)
975 {
976         struct buffer_head * bh;
977         int prev_blocks;
978
979         prev_blocks = inode->i_blocks;
980
981         bh = ext3_getblk (handle, inode, block, create, err);
982         if (!bh)
983                 return bh;
984 #ifdef EXT3_PREALLOCATE
985         /*
986          * If the inode has grown, and this is a directory, then use a few
987          * more of the preallocated blocks to keep directory fragmentation
988          * down.  The preallocated blocks are guaranteed to be contiguous.
989          */
990         if (create &&
991             S_ISDIR(inode->i_mode) &&
992             inode->i_blocks > prev_blocks &&
993             EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
994                                     EXT3_FEATURE_COMPAT_DIR_PREALLOC)) {
995                 int i;
996                 struct buffer_head *tmp_bh;
997
998                 for (i = 1;
999                      EXT3_I(inode)->i_prealloc_count &&
1000                      i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks;
1001                      i++) {
1002                         /*
1003                          * ext3_getblk will zero out the contents of the
1004                          * directory for us
1005                          */
1006                         tmp_bh = ext3_getblk(handle, inode,
1007                                                 block+i, create, err);
1008                         if (!tmp_bh) {
1009                                 brelse (bh);
1010                                 return 0;
1011                         }
1012                         brelse (tmp_bh);
1013                 }
1014         }
1015 #endif
1016         if (buffer_uptodate(bh))
1017                 return bh;
1018         ll_rw_block (READ, 1, &bh);
1019         wait_on_buffer (bh);
1020         if (buffer_uptodate(bh))
1021                 return bh;
1022         brelse (bh);
1023         *err = -EIO;
1024         return NULL;
1025 }
1026
1027 static int walk_page_buffers(   handle_t *handle,
1028                                 struct buffer_head *head,
1029                                 unsigned from,
1030                                 unsigned to,
1031                                 int *partial,
1032                                 int (*fn)(      handle_t *handle,
1033                                                 struct buffer_head *bh))
1034 {
1035         struct buffer_head *bh;
1036         unsigned block_start, block_end;
1037         unsigned blocksize = head->b_size;
1038         int err, ret = 0;
1039         struct buffer_head *next;
1040
1041         for (   bh = head, block_start = 0;
1042                 ret == 0 && (bh != head || !block_start);
1043                 block_start = block_end, bh = next)
1044         {
1045                 next = bh->b_this_page;
1046                 block_end = block_start + blocksize;
1047                 if (block_end <= from || block_start >= to) {
1048                         if (partial && !buffer_uptodate(bh))
1049                                 *partial = 1;
1050                         continue;
1051                 }
1052                 err = (*fn)(handle, bh);
1053                 if (!ret)
1054                         ret = err;
1055         }
1056         return ret;
1057 }
1058
1059 /*
1060  * To preserve ordering, it is essential that the hole instantiation and
1061  * the data write be encapsulated in a single transaction.  We cannot
1062  * close off a transaction and start a new one between the ext3_get_block()
1063  * and the commit_write().  So doing the journal_start at the start of
1064  * prepare_write() is the right place.
1065  *
1066  * Also, this function can nest inside ext3_writepage() ->
1067  * block_write_full_page(). In that case, we *know* that ext3_writepage()
1068  * has generated enough buffer credits to do the whole page.  So we won't
1069  * block on the journal in that case, which is good, because the caller may
1070  * be PF_MEMALLOC.
1071  *
1072  * By accident, ext3 can be reentered when a transaction is open via
1073  * quota file writes.  If we were to commit the transaction while thus
1074  * reentered, there can be a deadlock - we would be holding a quota
1075  * lock, and the commit would never complete if another thread had a
1076  * transaction open and was blocking on the quota lock - a ranking
1077  * violation.
1078  *
1079  * So what we do is to rely on the fact that journal_stop/journal_start
1080  * will _not_ run commit under these circumstances because handle->h_ref
1081  * is elevated.  We'll still have enough credits for the tiny quotafile
1082  * write.  
1083  */
1084
1085 static int do_journal_get_write_access(handle_t *handle, 
1086                                        struct buffer_head *bh)
1087 {
1088         if (!buffer_mapped(bh) || buffer_freed(bh))
1089                 return 0;
1090         return ext3_journal_get_write_access(handle, bh);
1091 }
1092
1093 static int ext3_prepare_write(struct file *file, struct page *page,
1094                               unsigned from, unsigned to)
1095 {
1096         struct inode *inode = page->mapping->host;
1097         int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
1098         handle_t *handle;
1099         int retries = 0;
1100
1101 retry:
1102         handle = ext3_journal_start(inode, needed_blocks);
1103         if (IS_ERR(handle)) {
1104                 ret = PTR_ERR(handle);
1105                 goto out;
1106         }
1107         ret = block_prepare_write(page, from, to, ext3_get_block);
1108         if (ret)
1109                 goto prepare_write_failed;
1110
1111         if (ext3_should_journal_data(inode)) {
1112                 ret = walk_page_buffers(handle, page_buffers(page),
1113                                 from, to, NULL, do_journal_get_write_access);
1114         }
1115 prepare_write_failed:
1116         if (ret)
1117                 ext3_journal_stop(handle);
1118         if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1119                 goto retry;
1120 out:
1121         return ret;
1122 }
1123
1124 static int
1125 ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1126 {
1127         int err = journal_dirty_data(handle, bh);
1128         if (err)
1129                 ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1130                                                 bh, handle,err);
1131         return err;
1132 }
1133
1134 /* For commit_write() in data=journal mode */
1135 static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1136 {
1137         if (!buffer_mapped(bh) || buffer_freed(bh))
1138                 return 0;
1139         set_buffer_uptodate(bh);
1140         return ext3_journal_dirty_metadata(handle, bh);
1141 }
1142
1143 /*
1144  * We need to pick up the new inode size which generic_commit_write gave us
1145  * `file' can be NULL - eg, when called from page_symlink().
1146  *
1147  * ext3 never places buffers on inode->i_mapping->private_list.  metadata
1148  * buffers are managed internally.
1149  */
1150
1151 static int ext3_ordered_commit_write(struct file *file, struct page *page,
1152                              unsigned from, unsigned to)
1153 {
1154         handle_t *handle = ext3_journal_current_handle();
1155         struct inode *inode = page->mapping->host;
1156         int ret = 0, ret2;
1157
1158         ret = walk_page_buffers(handle, page_buffers(page),
1159                 from, to, NULL, ext3_journal_dirty_data);
1160
1161         if (ret == 0) {
1162                 /*
1163                  * generic_commit_write() will run mark_inode_dirty() if i_size
1164                  * changes.  So let's piggyback the i_disksize mark_inode_dirty
1165                  * into that.
1166                  */
1167                 loff_t new_i_size;
1168
1169                 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1170                 if (new_i_size > EXT3_I(inode)->i_disksize)
1171                         EXT3_I(inode)->i_disksize = new_i_size;
1172                 ret = generic_commit_write(file, page, from, to);
1173         }
1174         ret2 = ext3_journal_stop(handle);
1175         if (!ret)
1176                 ret = ret2;
1177         return ret;
1178 }
1179
1180 static int ext3_writeback_commit_write(struct file *file, struct page *page,
1181                              unsigned from, unsigned to)
1182 {
1183         handle_t *handle = ext3_journal_current_handle();
1184         struct inode *inode = page->mapping->host;
1185         int ret = 0, ret2;
1186         loff_t new_i_size;
1187
1188         new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1189         if (new_i_size > EXT3_I(inode)->i_disksize)
1190                 EXT3_I(inode)->i_disksize = new_i_size;
1191         ret = generic_commit_write(file, page, from, to);
1192         ret2 = ext3_journal_stop(handle);
1193         if (!ret)
1194                 ret = ret2;
1195         return ret;
1196 }
1197
1198 static int ext3_journalled_commit_write(struct file *file,
1199                         struct page *page, unsigned from, unsigned to)
1200 {
1201         handle_t *handle = ext3_journal_current_handle();
1202         struct inode *inode = page->mapping->host;
1203         int ret = 0, ret2;
1204         int partial = 0;
1205         loff_t pos;
1206
1207         /*
1208          * Here we duplicate the generic_commit_write() functionality
1209          */
1210         pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1211
1212         ret = walk_page_buffers(handle, page_buffers(page), from,
1213                                 to, &partial, commit_write_fn);
1214         if (!partial)
1215                 SetPageUptodate(page);
1216         if (pos > inode->i_size)
1217                 i_size_write(inode, pos);
1218         EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1219         if (inode->i_size > EXT3_I(inode)->i_disksize) {
1220                 EXT3_I(inode)->i_disksize = inode->i_size;
1221                 ret2 = ext3_mark_inode_dirty(handle, inode);
1222                 if (!ret) 
1223                         ret = ret2;
1224         }
1225         ret2 = ext3_journal_stop(handle);
1226         if (!ret)
1227                 ret = ret2;
1228         return ret;
1229 }
1230
1231 /* 
1232  * bmap() is special.  It gets used by applications such as lilo and by
1233  * the swapper to find the on-disk block of a specific piece of data.
1234  *
1235  * Naturally, this is dangerous if the block concerned is still in the
1236  * journal.  If somebody makes a swapfile on an ext3 data-journaling
1237  * filesystem and enables swap, then they may get a nasty shock when the
1238  * data getting swapped to that swapfile suddenly gets overwritten by
1239  * the original zero's written out previously to the journal and
1240  * awaiting writeback in the kernel's buffer cache. 
1241  *
1242  * So, if we see any bmap calls here on a modified, data-journaled file,
1243  * take extra steps to flush any blocks which might be in the cache. 
1244  */
1245 static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1246 {
1247         struct inode *inode = mapping->host;
1248         journal_t *journal;
1249         int err;
1250
1251         if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
1252                 /* 
1253                  * This is a REALLY heavyweight approach, but the use of
1254                  * bmap on dirty files is expected to be extremely rare:
1255                  * only if we run lilo or swapon on a freshly made file
1256                  * do we expect this to happen. 
1257                  *
1258                  * (bmap requires CAP_SYS_RAWIO so this does not
1259                  * represent an unprivileged user DOS attack --- we'd be
1260                  * in trouble if mortal users could trigger this path at
1261                  * will.) 
1262                  *
1263                  * NB. EXT3_STATE_JDATA is not set on files other than
1264                  * regular files.  If somebody wants to bmap a directory
1265                  * or symlink and gets confused because the buffer
1266                  * hasn't yet been flushed to disk, they deserve
1267                  * everything they get.
1268                  */
1269
1270                 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
1271                 journal = EXT3_JOURNAL(inode);
1272                 journal_lock_updates(journal);
1273                 err = journal_flush(journal);
1274                 journal_unlock_updates(journal);
1275
1276                 if (err)
1277                         return 0;
1278         }
1279
1280         return generic_block_bmap(mapping,block,ext3_get_block);
1281 }
1282
1283 static int bget_one(handle_t *handle, struct buffer_head *bh)
1284 {
1285         get_bh(bh);
1286         return 0;
1287 }
1288
1289 static int bput_one(handle_t *handle, struct buffer_head *bh)
1290 {
1291         put_bh(bh);
1292         return 0;
1293 }
1294
1295 static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1296 {
1297         if (buffer_mapped(bh))
1298                 return ext3_journal_dirty_data(handle, bh);
1299         return 0;
1300 }
1301
1302 /*
1303  * Note that we always start a transaction even if we're not journalling
1304  * data.  This is to preserve ordering: any hole instantiation within
1305  * __block_write_full_page -> ext3_get_block() should be journalled
1306  * along with the data so we don't crash and then get metadata which
1307  * refers to old data.
1308  *
1309  * In all journalling modes block_write_full_page() will start the I/O.
1310  *
1311  * Problem:
1312  *
1313  *      ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1314  *              ext3_writepage()
1315  *
1316  * Similar for:
1317  *
1318  *      ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1319  *
1320  * Same applies to ext3_get_block().  We will deadlock on various things like
1321  * lock_journal and i_truncate_sem.
1322  *
1323  * Setting PF_MEMALLOC here doesn't work - too many internal memory
1324  * allocations fail.
1325  *
1326  * 16May01: If we're reentered then journal_current_handle() will be
1327  *          non-zero. We simply *return*.
1328  *
1329  * 1 July 2001: @@@ FIXME:
1330  *   In journalled data mode, a data buffer may be metadata against the
1331  *   current transaction.  But the same file is part of a shared mapping
1332  *   and someone does a writepage() on it.
1333  *
1334  *   We will move the buffer onto the async_data list, but *after* it has
1335  *   been dirtied. So there's a small window where we have dirty data on
1336  *   BJ_Metadata.
1337  *
1338  *   Note that this only applies to the last partial page in the file.  The
1339  *   bit which block_write_full_page() uses prepare/commit for.  (That's
1340  *   broken code anyway: it's wrong for msync()).
1341  *
1342  *   It's a rare case: affects the final partial page, for journalled data
1343  *   where the file is subject to both write() and writepage() in the same
1344  *   transaction.  To fix it we'll need a custom block_write_full_page().
1345  *   We'll probably need that anyway for journalling writepage() output.
1346  *
1347  * We don't honour synchronous mounts for writepage().  That would be
1348  * disastrous.  Any write() or metadata operation will sync the fs for
1349  * us.
1350  *
1351  * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1352  * we don't need to open a transaction here.
1353  */
1354 static int ext3_ordered_writepage(struct page *page,
1355                         struct writeback_control *wbc)
1356 {
1357         struct inode *inode = page->mapping->host;
1358         struct buffer_head *page_bufs;
1359         handle_t *handle = NULL;
1360         int ret = 0;
1361         int err;
1362
1363         J_ASSERT(PageLocked(page));
1364
1365         /*
1366          * We give up here if we're reentered, because it might be for a
1367          * different filesystem.
1368          */
1369         if (ext3_journal_current_handle())
1370                 goto out_fail;
1371
1372         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1373
1374         if (IS_ERR(handle)) {
1375                 ret = PTR_ERR(handle);
1376                 goto out_fail;
1377         }
1378
1379         if (!page_has_buffers(page)) {
1380                 create_empty_buffers(page, inode->i_sb->s_blocksize,
1381                                 (1 << BH_Dirty)|(1 << BH_Uptodate));
1382         }
1383         page_bufs = page_buffers(page);
1384         walk_page_buffers(handle, page_bufs, 0,
1385                         PAGE_CACHE_SIZE, NULL, bget_one);
1386
1387         ret = block_write_full_page(page, ext3_get_block, wbc);
1388
1389         /*
1390          * The page can become unlocked at any point now, and
1391          * truncate can then come in and change things.  So we
1392          * can't touch *page from now on.  But *page_bufs is
1393          * safe due to elevated refcount.
1394          */
1395
1396         /*
1397          * And attach them to the current transaction.  But only if 
1398          * block_write_full_page() succeeded.  Otherwise they are unmapped,
1399          * and generally junk.
1400          */
1401         if (ret == 0) {
1402                 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1403                                         NULL, journal_dirty_data_fn);
1404                 if (!ret)
1405                         ret = err;
1406         }
1407         walk_page_buffers(handle, page_bufs, 0,
1408                         PAGE_CACHE_SIZE, NULL, bput_one);
1409         err = ext3_journal_stop(handle);
1410         if (!ret)
1411                 ret = err;
1412         return ret;
1413
1414 out_fail:
1415         redirty_page_for_writepage(wbc, page);
1416         unlock_page(page);
1417         return ret;
1418 }
1419
1420 static int ext3_writeback_writepage(struct page *page,
1421                                 struct writeback_control *wbc)
1422 {
1423         struct inode *inode = page->mapping->host;
1424         handle_t *handle = NULL;
1425         int ret = 0;
1426         int err;
1427
1428         if (ext3_journal_current_handle())
1429                 goto out_fail;
1430
1431         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1432         if (IS_ERR(handle)) {
1433                 ret = PTR_ERR(handle);
1434                 goto out_fail;
1435         }
1436
1437         ret = block_write_full_page(page, ext3_get_block, wbc);
1438         err = ext3_journal_stop(handle);
1439         if (!ret)
1440                 ret = err;
1441         return ret;
1442
1443 out_fail:
1444         redirty_page_for_writepage(wbc, page);
1445         unlock_page(page);
1446         return ret;
1447 }
1448
1449 static int ext3_journalled_writepage(struct page *page,
1450                                 struct writeback_control *wbc)
1451 {
1452         struct inode *inode = page->mapping->host;
1453         handle_t *handle = NULL;
1454         int ret = 0;
1455         int err;
1456
1457         if (ext3_journal_current_handle())
1458                 goto no_write;
1459
1460         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1461         if (IS_ERR(handle)) {
1462                 ret = PTR_ERR(handle);
1463                 goto no_write;
1464         }
1465
1466         if (!page_has_buffers(page) || PageChecked(page)) {
1467                 /*
1468                  * It's mmapped pagecache.  Add buffers and journal it.  There
1469                  * doesn't seem much point in redirtying the page here.
1470                  */
1471                 ClearPageChecked(page);
1472                 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1473                                         ext3_get_block);
1474                 if (ret != 0)
1475                         goto out_unlock;
1476                 ret = walk_page_buffers(handle, page_buffers(page), 0,
1477                         PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1478
1479                 err = walk_page_buffers(handle, page_buffers(page), 0,
1480                                 PAGE_CACHE_SIZE, NULL, commit_write_fn);
1481                 if (ret == 0)
1482                         ret = err;
1483                 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1484                 unlock_page(page);
1485         } else {
1486                 /*
1487                  * It may be a page full of checkpoint-mode buffers.  We don't
1488                  * really know unless we go poke around in the buffer_heads.
1489                  * But block_write_full_page will do the right thing.
1490                  */
1491                 ret = block_write_full_page(page, ext3_get_block, wbc);
1492         }
1493         err = ext3_journal_stop(handle);
1494         if (!ret)
1495                 ret = err;
1496 out:
1497         return ret;
1498
1499 no_write:
1500         redirty_page_for_writepage(wbc, page);
1501 out_unlock:
1502         unlock_page(page);
1503         goto out;
1504 }
1505
1506 static int ext3_readpage(struct file *file, struct page *page)
1507 {
1508         return mpage_readpage(page, ext3_get_block);
1509 }
1510
1511 static int
1512 ext3_readpages(struct file *file, struct address_space *mapping,
1513                 struct list_head *pages, unsigned nr_pages)
1514 {
1515         return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1516 }
1517
1518 static int ext3_invalidatepage(struct page *page, unsigned long offset)
1519 {
1520         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1521
1522         /*
1523          * If it's a full truncate we just forget about the pending dirtying
1524          */
1525         if (offset == 0)
1526                 ClearPageChecked(page);
1527
1528         return journal_invalidatepage(journal, page, offset);
1529 }
1530
1531 static int ext3_releasepage(struct page *page, int wait)
1532 {
1533         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1534
1535         WARN_ON(PageChecked(page));
1536         return journal_try_to_free_buffers(journal, page, wait);
1537 }
1538
1539 /*
1540  * If the O_DIRECT write will extend the file then add this inode to the
1541  * orphan list.  So recovery will truncate it back to the original size
1542  * if the machine crashes during the write.
1543  *
1544  * If the O_DIRECT write is intantiating holes inside i_size and the machine
1545  * crashes then stale disk data _may_ be exposed inside the file.
1546  */
1547 static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1548                         const struct iovec *iov, loff_t offset,
1549                         unsigned long nr_segs)
1550 {
1551         struct file *file = iocb->ki_filp;
1552         struct inode *inode = file->f_mapping->host;
1553         struct ext3_inode_info *ei = EXT3_I(inode);
1554         handle_t *handle = NULL;
1555         ssize_t ret;
1556         int orphan = 0;
1557         size_t count = iov_length(iov, nr_segs);
1558
1559         if (rw == WRITE) {
1560                 loff_t final_size = offset + count;
1561
1562                 handle = ext3_journal_start(inode, DIO_CREDITS);
1563                 if (IS_ERR(handle)) {
1564                         ret = PTR_ERR(handle);
1565                         goto out;
1566                 }
1567                 if (final_size > inode->i_size) {
1568                         ret = ext3_orphan_add(handle, inode);
1569                         if (ret)
1570                                 goto out_stop;
1571                         orphan = 1;
1572                         ei->i_disksize = inode->i_size;
1573                 }
1574         }
1575
1576         ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 
1577                                  offset, nr_segs,
1578                                  ext3_direct_io_get_blocks, NULL);
1579
1580 out_stop:
1581         if (handle) {
1582                 int err;
1583
1584                 if (orphan) 
1585                         ext3_orphan_del(handle, inode);
1586                 if (orphan && ret > 0) {
1587                         loff_t end = offset + ret;
1588                         if (end > inode->i_size) {
1589                                 ei->i_disksize = end;
1590                                 i_size_write(inode, end);
1591                                 err = ext3_mark_inode_dirty(handle, inode);
1592                                 if (!ret) 
1593                                         ret = err;
1594                         }
1595                 }
1596                 err = ext3_journal_stop(handle);
1597                 if (ret == 0)
1598                         ret = err;
1599         }
1600 out:
1601         return ret;
1602 }
1603
/*
 * ->set_page_dirty for data=journal mappings.
 *
 * A page may be dirtied at any time, independently of ext3's journalling
 * (filemap_sync_pte(), try_to_unmap_one(), ...).  This hook runs under VFS
 * locks and the page need not be locked, so very little can be done here.
 *
 * Dirtying the page while leaving its buffers clean is not an option, since
 * the buffers' dirty state is "definitive"; marking the buffers dirty or
 * jbddirty directly would make the journalling code explode.
 *
 * Instead the page is flagged "pending dirty" (PageChecked), and the next
 * writepage call propagates that state into the buffers.
 */
static int ext3_journalled_set_page_dirty(struct page *page)
{
	int ret;

	SetPageChecked(page);
	ret = __set_page_dirty_nobuffers(page);

	return ret;
}
1622
1623 static struct address_space_operations ext3_ordered_aops = {
1624         .readpage       = ext3_readpage,
1625         .readpages      = ext3_readpages,
1626         .writepage      = ext3_ordered_writepage,
1627         .sync_page      = block_sync_page,
1628         .prepare_write  = ext3_prepare_write,
1629         .commit_write   = ext3_ordered_commit_write,
1630         .bmap           = ext3_bmap,
1631         .invalidatepage = ext3_invalidatepage,
1632         .releasepage    = ext3_releasepage,
1633         .direct_IO      = ext3_direct_IO,
1634 };
1635
1636 static struct address_space_operations ext3_writeback_aops = {
1637         .readpage       = ext3_readpage,
1638         .readpages      = ext3_readpages,
1639         .writepage      = ext3_writeback_writepage,
1640         .sync_page      = block_sync_page,
1641         .prepare_write  = ext3_prepare_write,
1642         .commit_write   = ext3_writeback_commit_write,
1643         .bmap           = ext3_bmap,
1644         .invalidatepage = ext3_invalidatepage,
1645         .releasepage    = ext3_releasepage,
1646         .direct_IO      = ext3_direct_IO,
1647 };
1648
1649 static struct address_space_operations ext3_journalled_aops = {
1650         .readpage       = ext3_readpage,
1651         .readpages      = ext3_readpages,
1652         .writepage      = ext3_journalled_writepage,
1653         .sync_page      = block_sync_page,
1654         .prepare_write  = ext3_prepare_write,
1655         .commit_write   = ext3_journalled_commit_write,
1656         .set_page_dirty = ext3_journalled_set_page_dirty,
1657         .bmap           = ext3_bmap,
1658         .invalidatepage = ext3_invalidatepage,
1659         .releasepage    = ext3_releasepage,
1660 };
1661
1662 void ext3_set_aops(struct inode *inode)
1663 {
1664         if (ext3_should_order_data(inode))
1665                 inode->i_mapping->a_ops = &ext3_ordered_aops;
1666         else if (ext3_should_writeback_data(inode))
1667                 inode->i_mapping->a_ops = &ext3_writeback_aops;
1668         else
1669                 inode->i_mapping->a_ops = &ext3_journalled_aops;
1670 }
1671
1672 /*
1673  * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1674  * up to the end of the block which corresponds to `from'.
1675  * This required during truncate. We need to physically zero the tail end
1676  * of that block so it doesn't yield old data if the file is later grown.
1677  */
1678 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1679                 struct address_space *mapping, loff_t from)
1680 {
1681         unsigned long index = from >> PAGE_CACHE_SHIFT;
1682         unsigned offset = from & (PAGE_CACHE_SIZE-1);
1683         unsigned blocksize, iblock, length, pos;
1684         struct inode *inode = mapping->host;
1685         struct buffer_head *bh;
1686         int err;
1687         void *kaddr;
1688
1689         blocksize = inode->i_sb->s_blocksize;
1690         length = blocksize - (offset & (blocksize - 1));
1691         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1692
1693         if (!page_has_buffers(page))
1694                 create_empty_buffers(page, blocksize, 0);
1695
1696         /* Find the buffer that contains "offset" */
1697         bh = page_buffers(page);
1698         pos = blocksize;
1699         while (offset >= pos) {
1700                 bh = bh->b_this_page;
1701                 iblock++;
1702                 pos += blocksize;
1703         }
1704
1705         err = 0;
1706         if (buffer_freed(bh)) {
1707                 BUFFER_TRACE(bh, "freed: skip");
1708                 goto unlock;
1709         }
1710
1711         if (!buffer_mapped(bh)) {
1712                 BUFFER_TRACE(bh, "unmapped");
1713                 ext3_get_block(inode, iblock, bh, 0);
1714                 /* unmapped? It's a hole - nothing to do */
1715                 if (!buffer_mapped(bh)) {
1716                         BUFFER_TRACE(bh, "still unmapped");
1717                         goto unlock;
1718                 }
1719         }
1720
1721         /* Ok, it's mapped. Make sure it's up-to-date */
1722         if (PageUptodate(page))
1723                 set_buffer_uptodate(bh);
1724
1725         if (!buffer_uptodate(bh)) {
1726                 err = -EIO;
1727                 ll_rw_block(READ, 1, &bh);
1728                 wait_on_buffer(bh);
1729                 /* Uhhuh. Read error. Complain and punt. */
1730                 if (!buffer_uptodate(bh))
1731                         goto unlock;
1732         }
1733
1734         if (ext3_should_journal_data(inode)) {
1735                 BUFFER_TRACE(bh, "get write access");
1736                 err = ext3_journal_get_write_access(handle, bh);
1737                 if (err)
1738                         goto unlock;
1739         }
1740
1741         kaddr = kmap_atomic(page, KM_USER0);
1742         memset(kaddr + offset, 0, length);
1743         flush_dcache_page(page);
1744         kunmap_atomic(kaddr, KM_USER0);
1745
1746         BUFFER_TRACE(bh, "zeroed end of block");
1747
1748         err = 0;
1749         if (ext3_should_journal_data(inode)) {
1750                 err = ext3_journal_dirty_metadata(handle, bh);
1751         } else {
1752                 if (ext3_should_order_data(inode))
1753                         err = ext3_journal_dirty_data(handle, bh);
1754                 mark_buffer_dirty(bh);
1755         }
1756
1757 unlock:
1758         unlock_page(page);
1759         page_cache_release(page);
1760         return err;
1761 }
1762
/*
 * Return 1 if every 32-bit word in [p, q) is zero, 0 otherwise.  An empty
 * range counts as all zeroes.
 *
 * This is a candidate for a library function (first-non-zero-word search
 * or memcmp against a zero page, whichever suits the architecture).
 */
static inline int all_zeroes(u32 *p, u32 *q)
{
	u32 *cur;

	for (cur = p; cur < q; cur++) {
		if (*cur != 0)
			return 0;
	}
	return 1;
}
1775
1776 /**
1777  *      ext3_find_shared - find the indirect blocks for partial truncation.
1778  *      @inode:   inode in question
1779  *      @depth:   depth of the affected branch
1780  *      @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1781  *      @chain:   place to store the pointers to partial indirect blocks
1782  *      @top:     place to the (detached) top of branch
1783  *
1784  *      This is a helper function used by ext3_truncate().
1785  *
1786  *      When we do truncate() we may have to clean the ends of several
1787  *      indirect blocks but leave the blocks themselves alive. Block is
1788  *      partially truncated if some data below the new i_size is refered
1789  *      from it (and it is on the path to the first completely truncated
1790  *      data block, indeed).  We have to free the top of that path along
1791  *      with everything to the right of the path. Since no allocation
1792  *      past the truncation point is possible until ext3_truncate()
1793  *      finishes, we may safely do the latter, but top of branch may
1794  *      require special attention - pageout below the truncation point
1795  *      might try to populate it.
1796  *
1797  *      We atomically detach the top of branch from the tree, store the
1798  *      block number of its root in *@top, pointers to buffer_heads of
1799  *      partially truncated blocks - in @chain[].bh and pointers to
1800  *      their last elements that should not be removed - in
1801  *      @chain[].p. Return value is the pointer to last filled element
1802  *      of @chain.
1803  *
1804  *      The work left to caller to do the actual freeing of subtrees:
1805  *              a) free the subtree starting from *@top
1806  *              b) free the subtrees whose roots are stored in
1807  *                      (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1808  *              c) free the subtrees growing from the inode past the @chain[0].
1809  *                      (no partially truncated stuff there).  */
1810
1811 static Indirect *ext3_find_shared(struct inode *inode,
1812                                 int depth,
1813                                 int offsets[4],
1814                                 Indirect chain[4],
1815                                 u32 *top)
1816 {
1817         Indirect *partial, *p;
1818         int k, err;
1819
1820         *top = 0;
1821         /* Make k index the deepest non-null offest + 1 */
1822         for (k = depth; k > 1 && !offsets[k-1]; k--)
1823                 ;
1824         partial = ext3_get_branch(inode, k, offsets, chain, &err);
1825         /* Writer: pointers */
1826         if (!partial)
1827                 partial = chain + k-1;
1828         /*
1829          * If the branch acquired continuation since we've looked at it -
1830          * fine, it should all survive and (new) top doesn't belong to us.
1831          */
1832         if (!partial->key && *partial->p)
1833                 /* Writer: end */
1834                 goto no_top;
1835         for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--)
1836                 ;
1837         /*
1838          * OK, we've found the last block that must survive. The rest of our
1839          * branch should be detached before unlocking. However, if that rest
1840          * of branch is all ours and does not grow immediately from the inode
1841          * it's easier to cheat and just decrement partial->p.
1842          */
1843         if (p == chain + k - 1 && p > chain) {
1844                 p->p--;
1845         } else {
1846                 *top = *p->p;
1847                 /* Nope, don't do this in ext3.  Must leave the tree intact */
1848 #if 0
1849                 *p->p = 0;
1850 #endif
1851         }
1852         /* Writer: end */
1853
1854         while(partial > p)
1855         {
1856                 brelse(partial->bh);
1857                 partial--;
1858         }
1859 no_top:
1860         return partial;
1861 }
1862
1863 /*
1864  * Zero a number of block pointers in either an inode or an indirect block.
1865  * If we restart the transaction we must again get write access to the
1866  * indirect block for further modification.
1867  *
1868  * We release `count' blocks on disk, but (last - first) may be greater
1869  * than `count' because there can be holes in there.
1870  */
1871 static void
1872 ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
1873                 unsigned long block_to_free, unsigned long count,
1874                 u32 *first, u32 *last)
1875 {
1876         u32 *p;
1877         if (try_to_extend_transaction(handle, inode)) {
1878                 if (bh) {
1879                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1880                         ext3_journal_dirty_metadata(handle, bh);
1881                 }
1882                 ext3_mark_inode_dirty(handle, inode);
1883                 ext3_journal_test_restart(handle, inode);
1884                 if (bh) {
1885                         BUFFER_TRACE(bh, "retaking write access");
1886                         ext3_journal_get_write_access(handle, bh);
1887                 }
1888         }
1889
1890         /*
1891          * Any buffers which are on the journal will be in memory. We find
1892          * them on the hash table so journal_revoke() will run journal_forget()
1893          * on them.  We've already detached each block from the file, so
1894          * bforget() in journal_forget() should be safe.
1895          *
1896          * AKPM: turn on bforget in journal_forget()!!!
1897          */
1898         for (p = first; p < last; p++) {
1899                 u32 nr = le32_to_cpu(*p);
1900                 if (nr) {
1901                         struct buffer_head *bh;
1902
1903                         *p = 0;
1904                         bh = sb_find_get_block(inode->i_sb, nr);
1905                         ext3_forget(handle, 0, inode, bh, nr);
1906                 }
1907         }
1908
1909         ext3_free_blocks(handle, inode, block_to_free, count);
1910 }
1911
1912 /**
1913  * ext3_free_data - free a list of data blocks
1914  * @handle:     handle for this transaction
1915  * @inode:      inode we are dealing with
1916  * @this_bh:    indirect buffer_head which contains *@first and *@last
1917  * @first:      array of block numbers
1918  * @last:       points immediately past the end of array
1919  *
1920  * We are freeing all blocks refered from that array (numbers are stored as
1921  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1922  *
1923  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
1924  * blocks are contiguous then releasing them at one time will only affect one
1925  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1926  * actually use a lot of journal space.
1927  *
1928  * @this_bh will be %NULL if @first and @last point into the inode's direct
1929  * block pointers.
1930  */
1931 static void ext3_free_data(handle_t *handle, struct inode *inode,
1932                            struct buffer_head *this_bh, u32 *first, u32 *last)
1933 {
1934         unsigned long block_to_free = 0;    /* Starting block # of a run */
1935         unsigned long count = 0;            /* Number of blocks in the run */ 
1936         u32 *block_to_free_p = NULL;        /* Pointer into inode/ind
1937                                                corresponding to
1938                                                block_to_free */
1939         unsigned long nr;                   /* Current block # */
1940         u32 *p;                             /* Pointer into inode/ind
1941                                                for current block */
1942         int err;
1943
1944         if (this_bh) {                          /* For indirect block */
1945                 BUFFER_TRACE(this_bh, "get_write_access");
1946                 err = ext3_journal_get_write_access(handle, this_bh);
1947                 /* Important: if we can't update the indirect pointers
1948                  * to the blocks, we can't free them. */
1949                 if (err)
1950                         return;
1951         }
1952
1953         for (p = first; p < last; p++) {
1954                 nr = le32_to_cpu(*p);
1955                 if (nr) {
1956                         /* accumulate blocks to free if they're contiguous */
1957                         if (count == 0) {
1958                                 block_to_free = nr;
1959                                 block_to_free_p = p;
1960                                 count = 1;
1961                         } else if (nr == block_to_free + count) {
1962                                 count++;
1963                         } else {
1964                                 ext3_clear_blocks(handle, inode, this_bh, 
1965                                                   block_to_free,
1966                                                   count, block_to_free_p, p);
1967                                 block_to_free = nr;
1968                                 block_to_free_p = p;
1969                                 count = 1;
1970                         }
1971                 }
1972         }
1973
1974         if (count > 0)
1975                 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
1976                                   count, block_to_free_p, p);
1977
1978         if (this_bh) {
1979                 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
1980                 ext3_journal_dirty_metadata(handle, this_bh);
1981         }
1982 }
1983
1984 /**
1985  *      ext3_free_branches - free an array of branches
1986  *      @handle: JBD handle for this transaction
1987  *      @inode: inode we are dealing with
1988  *      @parent_bh: the buffer_head which contains *@first and *@last
1989  *      @first: array of block numbers
1990  *      @last:  pointer immediately past the end of array
1991  *      @depth: depth of the branches to free
1992  *
1993  *      We are freeing all blocks refered from these branches (numbers are
1994  *      stored as little-endian 32-bit) and updating @inode->i_blocks
1995  *      appropriately.
1996  */
1997 static void ext3_free_branches(handle_t *handle, struct inode *inode,
1998                                struct buffer_head *parent_bh,
1999                                u32 *first, u32 *last, int depth)
2000 {
2001         unsigned long nr;
2002         u32 *p;
2003
2004         if (is_handle_aborted(handle))
2005                 return;
2006
2007         if (depth--) {
2008                 struct buffer_head *bh;
2009                 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2010                 p = last;
2011                 while (--p >= first) {
2012                         nr = le32_to_cpu(*p);
2013                         if (!nr)
2014                                 continue;               /* A hole */
2015
2016                         /* Go read the buffer for the next level down */
2017                         bh = sb_bread(inode->i_sb, nr);
2018
2019                         /*
2020                          * A read failure? Report error and clear slot
2021                          * (should be rare).
2022                          */
2023                         if (!bh) {
2024                                 ext3_error(inode->i_sb, "ext3_free_branches",
2025                                            "Read failure, inode=%ld, block=%ld",
2026                                            inode->i_ino, nr);
2027                                 continue;
2028                         }
2029
2030                         /* This zaps the entire block.  Bottom up. */
2031                         BUFFER_TRACE(bh, "free child branches");
2032                         ext3_free_branches(handle, inode, bh, (u32*)bh->b_data,
2033                                            (u32*)bh->b_data + addr_per_block,
2034                                            depth);
2035
2036                         /*
2037                          * We've probably journalled the indirect block several
2038                          * times during the truncate.  But it's no longer
2039                          * needed and we now drop it from the transaction via
2040                          * journal_revoke().
2041                          *
2042                          * That's easy if it's exclusively part of this
2043                          * transaction.  But if it's part of the committing
2044                          * transaction then journal_forget() will simply
2045                          * brelse() it.  That means that if the underlying
2046                          * block is reallocated in ext3_get_block(),
2047                          * unmap_underlying_metadata() will find this block
2048                          * and will try to get rid of it.  damn, damn.
2049                          *
2050                          * If this block has already been committed to the
2051                          * journal, a revoke record will be written.  And
2052                          * revoke records must be emitted *before* clearing
2053                          * this block's bit in the bitmaps.
2054                          */
2055                         ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2056
2057                         /*
2058                          * Everything below this this pointer has been
2059                          * released.  Now let this top-of-subtree go.
2060                          *
2061                          * We want the freeing of this indirect block to be
2062                          * atomic in the journal with the updating of the
2063                          * bitmap block which owns it.  So make some room in
2064                          * the journal.
2065                          *
2066                          * We zero the parent pointer *after* freeing its
2067                          * pointee in the bitmaps, so if extend_transaction()
2068                          * for some reason fails to put the bitmap changes and
2069                          * the release into the same transaction, recovery
2070                          * will merely complain about releasing a free block,
2071                          * rather than leaking blocks.
2072                          */
2073                         if (is_handle_aborted(handle))
2074                                 return;
2075                         if (try_to_extend_transaction(handle, inode)) {
2076                                 ext3_mark_inode_dirty(handle, inode);
2077                                 ext3_journal_test_restart(handle, inode);
2078                         }
2079
2080                         ext3_free_blocks(handle, inode, nr, 1);
2081
2082                         if (parent_bh) {
2083                                 /*
2084                                  * The block which we have just freed is
2085                                  * pointed to by an indirect block: journal it
2086                                  */
2087                                 BUFFER_TRACE(parent_bh, "get_write_access");
2088                                 if (!ext3_journal_get_write_access(handle,
2089                                                                    parent_bh)){
2090                                         *p = 0;
2091                                         BUFFER_TRACE(parent_bh,
2092                                         "call ext3_journal_dirty_metadata");
2093                                         ext3_journal_dirty_metadata(handle, 
2094                                                                     parent_bh);
2095                                 }
2096                         }
2097                 }
2098         } else {
2099                 /* We have reached the bottom of the tree. */
2100                 BUFFER_TRACE(parent_bh, "free data blocks");
2101                 ext3_free_data(handle, inode, parent_bh, first, last);
2102         }
2103 }
2104
2105 /*
2106  * ext3_truncate()
2107  *
2108  * We block out ext3_get_block() block instantiations across the entire
2109  * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2110  * simultaneously on behalf of the same inode.
2111  *
2112  * As we work through the truncate and commmit bits of it to the journal there
2113  * is one core, guiding principle: the file's tree must always be consistent on
2114  * disk.  We must be able to restart the truncate after a crash.
2115  *
2116  * The file's tree may be transiently inconsistent in memory (although it
2117  * probably isn't), but whenever we close off and commit a journal transaction,
2118  * the contents of (the filesystem + the journal) must be consistent and
2119  * restartable.  It's pretty simple, really: bottom up, right to left (although
2120  * left-to-right works OK too).
2121  *
2122  * Note that at recovery time, journal replay occurs *before* the restart of
2123  * truncate against the orphan inode list.
2124  *
2125  * The committed inode has the new, desired i_size (which is the same as
2126  * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
2127  * that this inode's truncate did not complete and it will again call
2128  * ext3_truncate() to have another go.  So there will be instantiated blocks
2129  * to the right of the truncation point in a crashed ext3 filesystem.  But
2130  * that's fine - as long as they are linked from the inode, the post-crash
2131  * ext3_truncate() run will find them and release them.
2132  */
2133
2134 void ext3_truncate(struct inode * inode)
2135 {
2136         handle_t *handle;
2137         struct ext3_inode_info *ei = EXT3_I(inode);
2138         u32 *i_data = ei->i_data;
2139         int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2140         struct address_space *mapping = inode->i_mapping;
2141         int offsets[4];
2142         Indirect chain[4];
2143         Indirect *partial;
2144         int nr = 0;
2145         int n;
2146         long last_block;
2147         unsigned blocksize = inode->i_sb->s_blocksize;
2148         struct page *page;
2149
2150         if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2151             S_ISLNK(inode->i_mode)))
2152                 return;
2153         if (ext3_inode_is_fast_symlink(inode))
2154                 return;
2155         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2156                 return;
2157
2158         ext3_discard_prealloc(inode);
2159
2160         /*
2161          * We have to lock the EOF page here, because lock_page() nests
2162          * outside journal_start().
2163          */
2164         if ((inode->i_size & (blocksize - 1)) == 0) {
2165                 /* Block boundary? Nothing to do */
2166                 page = NULL;
2167         } else {
2168                 page = grab_cache_page(mapping,
2169                                 inode->i_size >> PAGE_CACHE_SHIFT);
2170                 if (!page)
2171                         return;
2172         }
2173
2174         handle = start_transaction(inode);
2175         if (IS_ERR(handle)) {
2176                 if (page) {
2177                         clear_highpage(page);
2178                         flush_dcache_page(page);
2179                         unlock_page(page);
2180                         page_cache_release(page);
2181                 }
2182                 return;         /* AKPM: return what? */
2183         }
2184
2185         last_block = (inode->i_size + blocksize-1)
2186                                         >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2187
2188         if (page)
2189                 ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2190
2191         n = ext3_block_to_path(inode, last_block, offsets, NULL);
2192         if (n == 0)
2193                 goto out_stop;  /* error */
2194
2195         /*
2196          * OK.  This truncate is going to happen.  We add the inode to the
2197          * orphan list, so that if this truncate spans multiple transactions,
2198          * and we crash, we will resume the truncate when the filesystem
2199          * recovers.  It also marks the inode dirty, to catch the new size.
2200          *
2201          * Implication: the file must always be in a sane, consistent
2202          * truncatable state while each transaction commits.
2203          */
2204         if (ext3_orphan_add(handle, inode))
2205                 goto out_stop;
2206
2207         /*
2208          * The orphan list entry will now protect us from any crash which
2209          * occurs before the truncate completes, so it is now safe to propagate
2210          * the new, shorter inode size (held for now in i_size) into the
2211          * on-disk inode. We do this via i_disksize, which is the value which
2212          * ext3 *really* writes onto the disk inode.
2213          */
2214         ei->i_disksize = inode->i_size;
2215
2216         /*
2217          * From here we block out all ext3_get_block() callers who want to
2218          * modify the block allocation tree.
2219          */
2220         down(&ei->truncate_sem);
2221
2222         if (n == 1) {           /* direct blocks */
2223                 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2224                                i_data + EXT3_NDIR_BLOCKS);
2225                 goto do_indirects;
2226         }
2227
2228         partial = ext3_find_shared(inode, n, offsets, chain, &nr);
2229         /* Kill the top of shared branch (not detached) */
2230         if (nr) {
2231                 if (partial == chain) {
2232                         /* Shared branch grows from the inode */
2233                         ext3_free_branches(handle, inode, NULL,
2234                                            &nr, &nr+1, (chain+n-1) - partial);
2235                         *partial->p = 0;
2236                         /*
2237                          * We mark the inode dirty prior to restart,
2238                          * and prior to stop.  No need for it here.
2239                          */
2240                 } else {
2241                         /* Shared branch grows from an indirect block */
2242                         BUFFER_TRACE(partial->bh, "get_write_access");
2243                         ext3_free_branches(handle, inode, partial->bh,
2244                                         partial->p,
2245                                         partial->p+1, (chain+n-1) - partial);
2246                 }
2247         }
2248         /* Clear the ends of indirect blocks on the shared branch */
2249         while (partial > chain) {
2250                 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2251                                    (u32*)partial->bh->b_data + addr_per_block,
2252                                    (chain+n-1) - partial);
2253                 BUFFER_TRACE(partial->bh, "call brelse");
2254                 brelse (partial->bh);
2255                 partial--;
2256         }
2257 do_indirects:
2258         /* Kill the remaining (whole) subtrees */
2259         switch (offsets[0]) {
2260                 default:
2261                         nr = i_data[EXT3_IND_BLOCK];
2262                         if (nr) {
2263                                 ext3_free_branches(handle, inode, NULL,
2264                                                    &nr, &nr+1, 1);
2265                                 i_data[EXT3_IND_BLOCK] = 0;
2266                         }
2267                 case EXT3_IND_BLOCK:
2268                         nr = i_data[EXT3_DIND_BLOCK];
2269                         if (nr) {
2270                                 ext3_free_branches(handle, inode, NULL,
2271                                                    &nr, &nr+1, 2);
2272                                 i_data[EXT3_DIND_BLOCK] = 0;
2273                         }
2274                 case EXT3_DIND_BLOCK:
2275                         nr = i_data[EXT3_TIND_BLOCK];
2276                         if (nr) {
2277                                 ext3_free_branches(handle, inode, NULL,
2278                                                    &nr, &nr+1, 3);
2279                                 i_data[EXT3_TIND_BLOCK] = 0;
2280                         }
2281                 case EXT3_TIND_BLOCK:
2282                         ;
2283         }
2284         up(&ei->truncate_sem);
2285         inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2286         ext3_mark_inode_dirty(handle, inode);
2287
2288         /* In a multi-transaction truncate, we only make the final
2289          * transaction synchronous */
2290         if (IS_SYNC(inode))
2291                 handle->h_sync = 1;
2292 out_stop:
2293         /*
2294          * If this was a simple ftruncate(), and the file will remain alive
2295          * then we need to clear up the orphan record which we created above.
2296          * However, if this was a real unlink then we were called by
2297          * ext3_delete_inode(), and we allow that function to clean up the
2298          * orphan info for us.
2299          */
2300         if (inode->i_nlink)
2301                 ext3_orphan_del(handle, inode);
2302
2303         ext3_journal_stop(handle);
2304 }
2305
2306 static unsigned long ext3_get_inode_block(struct super_block *sb,
2307                 unsigned long ino, struct ext3_iloc *iloc)
2308 {
2309         unsigned long desc, group_desc, block_group;
2310         unsigned long offset, block;
2311         struct buffer_head *bh;
2312         struct ext3_group_desc * gdp;
2313
2314         if ((ino != EXT3_ROOT_INO &&
2315                 ino != EXT3_JOURNAL_INO &&
2316                 ino < EXT3_FIRST_INO(sb)) ||
2317                 ino > le32_to_cpu(
2318                         EXT3_SB(sb)->s_es->s_inodes_count)) {
2319                 ext3_error (sb, "ext3_get_inode_block",
2320                             "bad inode number: %lu", ino);
2321                 return 0;
2322         }
2323         block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2324         if (block_group >= EXT3_SB(sb)->s_groups_count) {
2325                 ext3_error (sb, "ext3_get_inode_block",
2326                             "group >= groups count");
2327                 return 0;
2328         }
2329         group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
2330         desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
2331         bh = EXT3_SB(sb)->s_group_desc[group_desc];
2332         if (!bh) {
2333                 ext3_error (sb, "ext3_get_inode_block",
2334                             "Descriptor not loaded");
2335                 return 0;
2336         }
2337
2338         gdp = (struct ext3_group_desc *) bh->b_data;
2339         /*
2340          * Figure out the offset within the block group inode table
2341          */
2342         offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2343                 EXT3_INODE_SIZE(sb);
2344         block = le32_to_cpu(gdp[desc].bg_inode_table) +
2345                 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2346
2347         iloc->block_group = block_group;
2348         iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2349         return block;
2350 }
2351
2352 /* 
2353  * ext3_get_inode_loc returns with an extra refcount against the inode's
2354  * underlying buffer_head on success.  If `in_mem' is false then we're purely
2355  * trying to determine the inode's location on-disk and no read need be
2356  * performed.
2357  */
2358 static int ext3_get_inode_loc(struct inode *inode,
2359                                 struct ext3_iloc *iloc, int in_mem)
2360 {
2361         unsigned long block;
2362         struct buffer_head *bh;
2363
2364         block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2365         if (!block)
2366                 return -EIO;
2367
2368         bh = sb_getblk(inode->i_sb, block);
2369         if (!bh) {
2370                 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2371                                 "unable to read inode block - "
2372                                 "inode=%lu, block=%lu", inode->i_ino, block);
2373                 return -EIO;
2374         }
2375         if (!buffer_uptodate(bh)) {
2376                 lock_buffer(bh);
2377                 if (buffer_uptodate(bh)) {
2378                         /* someone brought it uptodate while we waited */
2379                         unlock_buffer(bh);
2380                         goto has_buffer;
2381                 }
2382
2383                 /* we can't skip I/O if inode is on a disk only */
2384                 if (in_mem) {
2385                         struct buffer_head *bitmap_bh;
2386                         struct ext3_group_desc *desc;
2387                         int inodes_per_buffer;
2388                         int inode_offset, i;
2389                         int block_group;
2390                         int start;
2391
2392                         /*
2393                          * If this is the only valid inode in the block we
2394                          * need not read the block.
2395                          */
2396                         block_group = (inode->i_ino - 1) /
2397                                         EXT3_INODES_PER_GROUP(inode->i_sb);
2398                         inodes_per_buffer = bh->b_size /
2399                                 EXT3_INODE_SIZE(inode->i_sb);
2400                         inode_offset = ((inode->i_ino - 1) %
2401                                         EXT3_INODES_PER_GROUP(inode->i_sb));
2402                         start = inode_offset & ~(inodes_per_buffer - 1);
2403
2404                         /* Is the inode bitmap in cache? */
2405                         desc = ext3_get_group_desc(inode->i_sb,
2406                                                 block_group, NULL);
2407                         if (!desc)
2408                                 goto make_io;
2409
2410                         bitmap_bh = sb_getblk(inode->i_sb,
2411                                         le32_to_cpu(desc->bg_inode_bitmap));
2412                         if (!bitmap_bh)
2413                                 goto make_io;
2414
2415                         /*
2416                          * If the inode bitmap isn't in cache then the
2417                          * optimisation may end up performing two reads instead
2418                          * of one, so skip it.
2419                          */
2420                         if (!buffer_uptodate(bitmap_bh)) {
2421                                 brelse(bitmap_bh);
2422                                 goto make_io;
2423                         }
2424                         for (i = start; i < start + inodes_per_buffer; i++) {
2425                                 if (i == inode_offset)
2426                                         continue;
2427                                 if (ext3_test_bit(i, bitmap_bh->b_data))
2428                                         break;
2429                         }
2430                         brelse(bitmap_bh);
2431                         if (i == start + inodes_per_buffer) {
2432                                 /* all other inodes are free, so skip I/O */
2433                                 memset(bh->b_data, 0, bh->b_size);
2434                                 set_buffer_uptodate(bh);
2435                                 unlock_buffer(bh);
2436                                 goto has_buffer;
2437                         }
2438                 }
2439
2440 make_io:
2441                 /*
2442                  * There are another valid inodes in the buffer so we must
2443                  * read the block from disk
2444                  */
2445                 get_bh(bh);
2446                 bh->b_end_io = end_buffer_read_sync;
2447                 submit_bh(READ, bh);
2448                 wait_on_buffer(bh);
2449                 if (!buffer_uptodate(bh)) {
2450                         ext3_error(inode->i_sb, "ext3_get_inode_loc",
2451                                         "unable to read inode block - "
2452                                         "inode=%lu, block=%lu",
2453                                         inode->i_ino, block);
2454                         brelse(bh);
2455                         return -EIO;
2456                 }
2457         }
2458 has_buffer:
2459         iloc->bh = bh;
2460         return 0;
2461 }
2462
2463 void ext3_set_inode_flags(struct inode *inode)
2464 {
2465         unsigned int flags = EXT3_I(inode)->i_flags;
2466
2467         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2468         if (flags & EXT3_SYNC_FL)
2469                 inode->i_flags |= S_SYNC;
2470         if (flags & EXT3_APPEND_FL)
2471                 inode->i_flags |= S_APPEND;
2472         if (flags & EXT3_IMMUTABLE_FL)
2473                 inode->i_flags |= S_IMMUTABLE;
2474         if (flags & EXT3_NOATIME_FL)
2475                 inode->i_flags |= S_NOATIME;
2476         if (flags & EXT3_DIRSYNC_FL)
2477                 inode->i_flags |= S_DIRSYNC;
2478 }
2479
2480 void ext3_read_inode(struct inode * inode)
2481 {
2482         struct ext3_iloc iloc;
2483         struct ext3_inode *raw_inode;
2484         struct ext3_inode_info *ei = EXT3_I(inode);
2485         struct buffer_head *bh;
2486         int block;
2487
2488 #ifdef CONFIG_EXT3_FS_POSIX_ACL
2489         ei->i_acl = EXT3_ACL_NOT_CACHED;
2490         ei->i_default_acl = EXT3_ACL_NOT_CACHED;
2491 #endif
2492         if (ext3_get_inode_loc(inode, &iloc, 0))
2493                 goto bad_inode;
2494         bh = iloc.bh;
2495         raw_inode = ext3_raw_inode(&iloc);
2496         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2497         inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2498         inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2499         if(!(test_opt (inode->i_sb, NO_UID32))) {
2500                 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2501                 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2502         }
2503         inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2504         inode->i_size = le32_to_cpu(raw_inode->i_size);
2505         inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2506         inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2507         inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2508         inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2509
2510         ei->i_state = 0;
2511         ei->i_next_alloc_block = 0;
2512         ei->i_next_alloc_goal = 0;
2513         ei->i_dir_start_lookup = 0;
2514         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2515         /* We now have enough fields to check if the inode was active or not.
2516          * This is needed because nfsd might try to access dead inodes
2517          * the test is that same one that e2fsck uses
2518          * NeilBrown 1999oct15
2519          */
2520         if (inode->i_nlink == 0) {
2521                 if (inode->i_mode == 0 ||
2522                     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2523                         /* this inode is deleted */
2524                         brelse (bh);
2525                         goto bad_inode;
2526                 }
2527                 /* The only unlinked inodes we let through here have
2528                  * valid i_mode and are being read by the orphan
2529                  * recovery code: that's fine, we're about to complete
2530                  * the process of deleting those. */
2531         }
2532         inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
2533                                          * (for stat), not the fs block
2534                                          * size */  
2535         inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2536         ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2537 #ifdef EXT3_FRAGMENTS
2538         ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2539         ei->i_frag_no = raw_inode->i_frag;
2540         ei->i_frag_size = raw_inode->i_fsize;
2541 #endif
2542         ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2543         if (!S_ISREG(inode->i_mode)) {
2544                 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2545         } else {
2546                 inode->i_size |=
2547                         ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2548         }
2549         ei->i_disksize = inode->i_size;
2550         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2551 #ifdef EXT3_PREALLOCATE
2552         ei->i_prealloc_count = 0;
2553 #endif
2554         ei->i_block_group = iloc.block_group;
2555
2556         /*
2557          * NOTE! The in-memory inode i_data array is in little-endian order
2558          * even on big-endian machines: we do NOT byteswap the block numbers!
2559          */
2560         for (block = 0; block < EXT3_N_BLOCKS; block++)
2561                 ei->i_data[block] = raw_inode->i_block[block];
2562         INIT_LIST_HEAD(&ei->i_orphan);
2563
2564         if (S_ISREG(inode->i_mode)) {
2565                 inode->i_op = &ext3_file_inode_operations;
2566                 inode->i_fop = &ext3_file_operations;
2567                 ext3_set_aops(inode);
2568         } else if (S_ISDIR(inode->i_mode)) {
2569                 inode->i_op = &ext3_dir_inode_operations;
2570                 inode->i_fop = &ext3_dir_operations;
2571         } else if (S_ISLNK(inode->i_mode)) {
2572                 if (ext3_inode_is_fast_symlink(inode))
2573                         inode->i_op = &ext3_fast_symlink_inode_operations;
2574                 else {
2575                         inode->i_op = &ext3_symlink_inode_operations;
2576                         ext3_set_aops(inode);
2577                 }
2578         } else {
2579                 inode->i_op = &ext3_special_inode_operations;
2580                 if (raw_inode->i_block[0])
2581                         init_special_inode(inode, inode->i_mode,
2582                            old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2583                 else 
2584                         init_special_inode(inode, inode->i_mode,
2585                            new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2586         }
2587         brelse (iloc.bh);
2588         ext3_set_inode_flags(inode);
2589         return;
2590
2591 bad_inode:
2592         make_bad_inode(inode);
2593         return;
2594 }
2595
2596 /*
2597  * Post the struct inode info into an on-disk inode location in the
2598  * buffer-cache.  This gobbles the caller's reference to the
2599  * buffer_head in the inode location struct.
2600  *
2601  * The caller must have write access to iloc->bh.
2602  */
2603 static int ext3_do_update_inode(handle_t *handle, 
2604                                 struct inode *inode, 
2605                                 struct ext3_iloc *iloc)
2606 {
2607         struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
2608         struct ext3_inode_info *ei = EXT3_I(inode);
2609         struct buffer_head *bh = iloc->bh;
2610         int err = 0, rc, block;
2611
2612         /* For fields not not tracking in the in-memory inode,
2613          * initialise them to zero for new inodes. */
2614         if (ei->i_state & EXT3_STATE_NEW)
2615                 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2616
2617         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2618         if(!(test_opt(inode->i_sb, NO_UID32))) {
2619                 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
2620                 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
2621 /*
2622  * Fix up interoperability with old kernels. Otherwise, old inodes get
2623  * re-used with the upper 16 bits of the uid/gid intact
2624  */
2625                 if(!ei->i_dtime) {
2626                         raw_inode->i_uid_high =
2627                                 cpu_to_le16(high_16_bits(inode->i_uid));
2628                         raw_inode->i_gid_high =
2629                                 cpu_to_le16(high_16_bits(inode->i_gid));
2630                 } else {
2631                         raw_inode->i_uid_high = 0;
2632                         raw_inode->i_gid_high = 0;
2633                 }
2634         } else {
2635                 raw_inode->i_uid_low =
2636                         cpu_to_le16(fs_high2lowuid(inode->i_uid));
2637                 raw_inode->i_gid_low =
2638                         cpu_to_le16(fs_high2lowgid(inode->i_gid));
2639                 raw_inode->i_uid_high = 0;
2640                 raw_inode->i_gid_high = 0;
2641         }
2642         raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2643         raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2644         raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2645         raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2646         raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2647         raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2648         raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2649         raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2650 #ifdef EXT3_FRAGMENTS
2651         raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2652         raw_inode->i_frag = ei->i_frag_no;
2653         raw_inode->i_fsize = ei->i_frag_size;
2654 #endif
2655         raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
2656         if (!S_ISREG(inode->i_mode)) {
2657                 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2658         } else {
2659                 raw_inode->i_size_high =
2660                         cpu_to_le32(ei->i_disksize >> 32);
2661                 if (ei->i_disksize > 0x7fffffffULL) {
2662                         struct super_block *sb = inode->i_sb;
2663                         if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
2664                                         EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
2665                             EXT3_SB(sb)->s_es->s_rev_level ==
2666                                         cpu_to_le32(EXT3_GOOD_OLD_REV)) {
2667                                /* If this is the first large file
2668                                 * created, add a flag to the superblock.
2669                                 */
2670                                 err = ext3_journal_get_write_access(handle,
2671                                                 EXT3_SB(sb)->s_sbh);
2672                                 if (err)
2673                                         goto out_brelse;
2674                                 ext3_update_dynamic_rev(sb);
2675                                 EXT3_SET_RO_COMPAT_FEATURE(sb,
2676                                         EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2677                                 sb->s_dirt = 1;
2678                                 handle->h_sync = 1;
2679                                 err = ext3_journal_dirty_metadata(handle,
2680                                                 EXT3_SB(sb)->s_sbh);
2681                         }
2682                 }
2683         }
2684         raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2685         if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
2686                 if (old_valid_dev(inode->i_rdev)) {
2687                         raw_inode->i_block[0] =
2688                                 cpu_to_le32(old_encode_dev(inode->i_rdev));
2689                         raw_inode->i_block[1] = 0;
2690                 } else {
2691                         raw_inode->i_block[0] = 0;
2692                         raw_inode->i_block[1] =
2693                                 cpu_to_le32(new_encode_dev(inode->i_rdev));
2694                         raw_inode->i_block[2] = 0;
2695                 }
2696         } else for (block = 0; block < EXT3_N_BLOCKS; block++)
2697                 raw_inode->i_block[block] = ei->i_data[block];
2698
2699         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2700         rc = ext3_journal_dirty_metadata(handle, bh);
2701         if (!err)
2702                 err = rc;
2703         ei->i_state &= ~EXT3_STATE_NEW;
2704
2705 out_brelse:
2706         brelse (bh);
2707         ext3_std_error(inode->i_sb, err);
2708         return err;
2709 }
2710
2711 /*
2712  * ext3_write_inode()
2713  *
2714  * We are called from a few places:
2715  *
2716  * - Within generic_file_write() for O_SYNC files.
2717  *   Here, there will be no transaction running. We wait for any running
2718  *   trasnaction to commit.
2719  *
2720  * - Within sys_sync(), kupdate and such.
2721  *   We wait on commit, if tol to.
2722  *
2723  * - Within prune_icache() (PF_MEMALLOC == true)
2724  *   Here we simply return.  We can't afford to block kswapd on the
2725  *   journal commit.
2726  *
2727  * In all cases it is actually safe for us to return without doing anything,
2728  * because the inode has been copied into a raw inode buffer in
2729  * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
2730  * knfsd.
2731  *
2732  * Note that we are absolutely dependent upon all inode dirtiers doing the
2733  * right thing: they *must* call mark_inode_dirty() after dirtying info in
2734  * which we are interested.
2735  *
2736  * It would be a bug for them to not do this.  The code:
2737  *
2738  *      mark_inode_dirty(inode)
2739  *      stuff();
2740  *      inode->i_size = expr;
2741  *
2742  * is in error because a kswapd-driven write_inode() could occur while
2743  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
2744  * will no longer be on the superblock's dirty inode list.
2745  */
2746 void ext3_write_inode(struct inode *inode, int wait)
2747 {
2748         if (current->flags & PF_MEMALLOC)
2749                 return;
2750
2751         if (ext3_journal_current_handle()) {
2752                 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2753                 dump_stack();
2754                 return;
2755         }
2756
2757         if (!wait)
2758                 return;
2759
2760         ext3_force_commit(inode->i_sb);
2761 }
2762
2763 /*
2764  * ext3_setattr()
2765  *
2766  * Called from notify_change.
2767  *
2768  * We want to trap VFS attempts to truncate the file as soon as
2769  * possible.  In particular, we want to make sure that when the VFS
2770  * shrinks i_size, we put the inode on the orphan list and modify
2771  * i_disksize immediately, so that during the subsequent flushing of
2772  * dirty pages and freeing of disk blocks, we can guarantee that any
2773  * commit will leave the blocks being flushed in an unused state on
2774  * disk.  (On recovery, the inode will get truncated and the blocks will
2775  * be freed, so we have a strong guarantee that no future commit will
2776  * leave these blocks visible to the user.)  
2777  *
2778  * Called with inode->sem down.
2779  */
2780 int ext3_setattr(struct dentry *dentry, struct iattr *attr)
2781 {
2782         struct inode *inode = dentry->d_inode;
2783         int error, rc = 0;
2784         const unsigned int ia_valid = attr->ia_valid;
2785
2786         error = inode_change_ok(inode, attr);
2787         if (error)
2788                 return error;
2789
2790         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2791                 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2792                 handle_t *handle;
2793
2794                 /* (user+group)*(old+new) structure, inode write (sb,
2795                  * inode block, ? - but truncate inode update has it) */
2796                 handle = ext3_journal_start(inode, 4*EXT3_QUOTA_INIT_BLOCKS+3);
2797                 if (IS_ERR(handle)) {
2798                         error = PTR_ERR(handle);
2799                         goto err_out;
2800                 }
2801                 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2802                 if (error) {
2803                         ext3_journal_stop(handle);
2804                         return error;
2805                 }
2806                 /* Update corresponding info in inode so that everything is in
2807                  * one transaction */
2808                 if (attr->ia_valid & ATTR_UID)
2809                         inode->i_uid = attr->ia_uid;
2810                 if (attr->ia_valid & ATTR_GID)
2811                         inode->i_gid = attr->ia_gid;
2812                 error = ext3_mark_inode_dirty(handle, inode);
2813                 ext3_journal_stop(handle);
2814         }
2815
2816         if (S_ISREG(inode->i_mode) &&
2817             attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2818                 handle_t *handle;
2819
2820                 handle = ext3_journal_start(inode, 3);
2821                 if (IS_ERR(handle)) {
2822                         error = PTR_ERR(handle);
2823                         goto err_out;
2824                 }
2825
2826                 error = ext3_orphan_add(handle, inode);
2827                 EXT3_I(inode)->i_disksize = attr->ia_size;
2828                 rc = ext3_mark_inode_dirty(handle, inode);
2829                 if (!error)
2830                         error = rc;
2831                 ext3_journal_stop(handle);
2832         }
2833
2834         rc = inode_setattr(inode, attr);
2835
2836         /* If inode_setattr's call to ext3_truncate failed to get a
2837          * transaction handle at all, we need to clean up the in-core
2838          * orphan list manually. */
2839         if (inode->i_nlink)
2840                 ext3_orphan_del(NULL, inode);
2841
2842         if (!rc && (ia_valid & ATTR_MODE))
2843                 rc = ext3_acl_chmod(inode);
2844
2845 err_out:
2846         ext3_std_error(inode->i_sb, error);
2847         if (!error)
2848                 error = rc;
2849         return error;
2850 }
2851
2852
2853 /*
2854  * akpm: how many blocks doth make a writepage()?
2855  *
2856  * With N blocks per page, it may be:
2857  * N data blocks
2858  * 2 indirect block
2859  * 2 dindirect
2860  * 1 tindirect
2861  * N+5 bitmap blocks (from the above)
2862  * N+5 group descriptor summary blocks
2863  * 1 inode block
2864  * 1 superblock.
2865  * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files
2866  *
2867  * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
2868  *
2869  * With ordered or writeback data it's the same, less the N data blocks.
2870  *
2871  * If the inode's direct blocks can hold an integral number of pages then a
2872  * page cannot straddle two indirect blocks, and we can only touch one indirect
2873  * and dindirect block, and the "5" above becomes "3".
2874  *
2875  * This still overestimates under most circumstances.  If we were to pass the
2876  * start and end offsets in here as well we could do block_to_path() on each
2877  * block and work out the exact number of indirects which are touched.  Pah.
2878  */
2879
2880 int ext3_writepage_trans_blocks(struct inode *inode)
2881 {
2882         int bpp = ext3_journal_blocks_per_page(inode);
2883         int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
2884         int ret;
2885
2886         if (ext3_should_journal_data(inode))
2887                 ret = 3 * (bpp + indirects) + 2;
2888         else
2889                 ret = 2 * (bpp + indirects) + 2;
2890
2891 #ifdef CONFIG_QUOTA
2892         /* We know that structure was already allocated during DQUOT_INIT so
2893          * we will be updating only the data blocks + inodes */
2894         ret += 2*EXT3_QUOTA_TRANS_BLOCKS;
2895 #endif
2896
2897         return ret;
2898 }
2899
2900 /*
2901  * The caller must have previously called ext3_reserve_inode_write().
2902  * Give this, we know that the caller already has write access to iloc->bh.
2903  */
2904 int ext3_mark_iloc_dirty(handle_t *handle,
2905                 struct inode *inode, struct ext3_iloc *iloc)
2906 {
2907         int err = 0;
2908
2909         /* the do_update_inode consumes one bh->b_count */
2910         get_bh(iloc->bh);
2911
2912         /* ext3_do_update_inode() does journal_dirty_metadata */
2913         err = ext3_do_update_inode(handle, inode, iloc);
2914         put_bh(iloc->bh);
2915         return err;
2916 }
2917
2918 /* 
2919  * On success, We end up with an outstanding reference count against
2920  * iloc->bh.  This _must_ be cleaned up later. 
2921  */
2922
2923 int
2924 ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 
2925                          struct ext3_iloc *iloc)
2926 {
2927         int err = 0;
2928         if (handle) {
2929                 err = ext3_get_inode_loc(inode, iloc, 1);
2930                 if (!err) {
2931                         BUFFER_TRACE(iloc->bh, "get_write_access");
2932                         err = ext3_journal_get_write_access(handle, iloc->bh);
2933                         if (err) {
2934                                 brelse(iloc->bh);
2935                                 iloc->bh = NULL;
2936                         }
2937                 }
2938         }
2939         ext3_std_error(inode->i_sb, err);
2940         return err;
2941 }
2942
2943 /*
2944  * akpm: What we do here is to mark the in-core inode as clean
2945  * with respect to inode dirtiness (it may still be data-dirty).
2946  * This means that the in-core inode may be reaped by prune_icache
2947  * without having to perform any I/O.  This is a very good thing,
2948  * because *any* task may call prune_icache - even ones which
2949  * have a transaction open against a different journal.
2950  *
2951  * Is this cheating?  Not really.  Sure, we haven't written the
2952  * inode out, but prune_icache isn't a user-visible syncing function.
2953  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
2954  * we start and wait on commits.
2955  *
2956  * Is this efficient/effective?  Well, we're being nice to the system
2957  * by cleaning up our inodes proactively so they can be reaped
2958  * without I/O.  But we are potentially leaving up to five seconds'
2959  * worth of inodes floating about which prune_icache wants us to
2960  * write out.  One way to fix that would be to get prune_icache()
2961  * to do a write_super() to free up some memory.  It has the desired
2962  * effect.
2963  */
2964 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
2965 {
2966         struct ext3_iloc iloc;
2967         int err;
2968
2969         err = ext3_reserve_inode_write(handle, inode, &iloc);
2970         if (!err)
2971                 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2972         return err;
2973 }
2974
2975 /*
2976  * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
2977  *
2978  * We're really interested in the case where a file is being extended.
2979  * i_size has been changed by generic_commit_write() and we thus need
2980  * to include the updated inode in the current transaction.
2981  *
2982  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
2983  * are allocated to the file.
2984  *
2985  * If the inode is marked synchronous, we don't honour that here - doing
2986  * so would cause a commit on atime updates, which we don't bother doing.
2987  * We handle synchronous inodes at the highest possible level.
2988  */
2989 void ext3_dirty_inode(struct inode *inode)
2990 {
2991         handle_t *current_handle = ext3_journal_current_handle();
2992         handle_t *handle;
2993
2994         handle = ext3_journal_start(inode, 2);
2995         if (IS_ERR(handle))
2996                 goto out;
2997         if (current_handle &&
2998                 current_handle->h_transaction != handle->h_transaction) {
2999                 /* This task has a transaction open against a different fs */
3000                 printk(KERN_EMERG "%s: transactions do not match!\n",
3001                        __FUNCTION__);
3002         } else {
3003                 jbd_debug(5, "marking dirty.  outer handle=%p\n",
3004                                 current_handle);
3005                 ext3_mark_inode_dirty(handle, inode);
3006         }
3007         ext3_journal_stop(handle);
3008 out:
3009         return;
3010 }
3011
#ifdef AKPM
/*
 * Bind the buffer_head backing an inode into this transaction so that
 * it cannot be flushed to disk early.
 *
 * In contrast to ext3_reserve_inode_write, no bh reference is left
 * behind and no iloc structure is handed back, so a caller that later
 * wants to mark the inode dirty has to repeat the iloc lookup itself.
 *
 * A NULL handle makes this a no-op that returns 0.  Otherwise the
 * return value is the first error from ext3_get_inode_loc(),
 * journal_get_write_access() or ext3_journal_dirty_metadata(), or 0.
 */
static inline int
ext3_pin_inode(handle_t *handle, struct inode *inode)
{
	struct ext3_iloc iloc;
	int ret = 0;

	if (!handle)
		goto report;

	ret = ext3_get_inode_loc(inode, &iloc, 1);
	if (ret)
		goto report;

	BUFFER_TRACE(iloc.bh, "get_write_access");
	ret = journal_get_write_access(handle, iloc.bh);
	if (!ret)
		ret = ext3_journal_dirty_metadata(handle, iloc.bh);

	/* Drop the reference on iloc.bh whether or not the above worked. */
	brelse(iloc.bh);

report:
	/* Every path funnels through ext3_std_error(), including ret == 0. */
	ext3_std_error(inode->i_sb, ret);
	return ret;
}
#endif
3041
3042 int ext3_change_inode_journal_flag(struct inode *inode, int val)
3043 {
3044         journal_t *journal;
3045         handle_t *handle;
3046         int err;
3047
3048         /*
3049          * We have to be very careful here: changing a data block's
3050          * journaling status dynamically is dangerous.  If we write a
3051          * data block to the journal, change the status and then delete
3052          * that block, we risk forgetting to revoke the old log record
3053          * from the journal and so a subsequent replay can corrupt data.
3054          * So, first we make sure that the journal is empty and that
3055          * nobody is changing anything.
3056          */
3057
3058         journal = EXT3_JOURNAL(inode);
3059         if (is_journal_aborted(journal) || IS_RDONLY(inode))
3060                 return -EROFS;
3061
3062         journal_lock_updates(journal);
3063         journal_flush(journal);
3064
3065         /*
3066          * OK, there are no updates running now, and all cached data is
3067          * synced to disk.  We are now in a completely consistent state
3068          * which doesn't have anything in the journal, and we know that
3069          * no filesystem updates are running, so it is safe to modify
3070          * the inode's in-core data-journaling state flag now.
3071          */
3072
3073         if (val)
3074                 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3075         else
3076                 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
3077         ext3_set_aops(inode);
3078
3079         journal_unlock_updates(journal);
3080
3081         /* Finally we can mark the inode as dirty. */
3082
3083         handle = ext3_journal_start(inode, 1);
3084         if (IS_ERR(handle))
3085                 return PTR_ERR(handle);
3086
3087         err = ext3_mark_inode_dirty(handle, inode);
3088         handle->h_sync = 1;
3089         ext3_journal_stop(handle);
3090         ext3_std_error(inode->i_sb, err);
3091
3092         return err;
3093 }