VServer 1.9.2 (patch-2.6.8.1-vs1.9.2.diff)
[linux-2.6.git] / fs / ext3 / inode.c
1 /*
2  *  linux/fs/ext3/inode.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card (card@masi.ibp.fr)
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  *  from
10  *
11  *  linux/fs/minix/inode.c
12  *
13  *  Copyright (C) 1991, 1992  Linus Torvalds
14  *
15  *  Goal-directed block allocation by Stephen Tweedie
16  *      (sct@redhat.com), 1993, 1998
17  *  Big-endian to little-endian byte-swapping/bitmaps by
18  *        David S. Miller (davem@caip.rutgers.edu), 1995
19  *  64-bit file support on 64-bit platforms by Jakub Jelinek
20  *      (jj@sunsite.ms.mff.cuni.cz)
21  *
22  *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23  */
24
25 #include <linux/module.h>
26 #include <linux/fs.h>
27 #include <linux/time.h>
28 #include <linux/ext3_jbd.h>
29 #include <linux/jbd.h>
30 #include <linux/smp_lock.h>
31 #include <linux/highuid.h>
32 #include <linux/pagemap.h>
33 #include <linux/quotaops.h>
34 #include <linux/string.h>
35 #include <linux/buffer_head.h>
36 #include <linux/writeback.h>
37 #include <linux/mpage.h>
38 #include <linux/uio.h>
39 #include <linux/vserver/xid.h>
40 #include "xattr.h"
41 #include "acl.h"
42
43 /*
44  * Test whether an inode is a fast symlink.
45  */
46 static inline int ext3_inode_is_fast_symlink(struct inode *inode)
47 {
48         int ea_blocks = EXT3_I(inode)->i_file_acl ?
49                 (inode->i_sb->s_blocksize >> 9) : 0;
50
51         return (S_ISLNK(inode->i_mode) &&
52                 inode->i_blocks - ea_blocks == 0);
53 }
54
55 /* The ext3 forget function must perform a revoke if we are freeing data
56  * which has been journaled.  Metadata (eg. indirect blocks) must be
57  * revoked in all cases. 
58  *
59  * "bh" may be NULL: a metadata block may have been freed from memory
60  * but there may still be a record of it in the journal, and that record
61  * still needs to be revoked.
62  */
63
64 int ext3_forget(handle_t *handle, int is_metadata,
65                        struct inode *inode, struct buffer_head *bh,
66                        int blocknr)
67 {
68         int err;
69
70         BUFFER_TRACE(bh, "enter");
71
72         jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
73                   "data mode %lx\n",
74                   bh, is_metadata, inode->i_mode,
75                   test_opt(inode->i_sb, DATA_FLAGS));
76
77         /* Never use the revoke function if we are doing full data
78          * journaling: there is no need to, and a V1 superblock won't
79          * support it.  Otherwise, only skip the revoke on un-journaled
80          * data blocks. */
81
82         if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
83             (!is_metadata && !ext3_should_journal_data(inode))) {
84                 if (bh) {
85                         BUFFER_TRACE(bh, "call journal_forget");
86                         ext3_journal_forget(handle, bh);
87                 }
88                 return 0;
89         }
90
91         /*
92          * data!=journal && (is_metadata || should_journal_data(inode))
93          */
94         BUFFER_TRACE(bh, "call ext3_journal_revoke");
95         err = ext3_journal_revoke(handle, blocknr, bh);
96         if (err)
97                 ext3_abort(inode->i_sb, __FUNCTION__,
98                            "error %d when attempting revoke", err);
99         BUFFER_TRACE(bh, "exit");
100         return err;
101 }
102
103 /*
104  * Work out how many blocks we need to progress with the next chunk of a
105  * truncate transaction.
106  */
107
108 static unsigned long blocks_for_truncate(struct inode *inode) 
109 {
110         unsigned long needed;
111
112         needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
113
114         /* Give ourselves just enough room to cope with inodes in which
115          * i_blocks is corrupt: we've seen disk corruptions in the past
116          * which resulted in random data in an inode which looked enough
117          * like a regular file for ext3 to try to delete it.  Things
118          * will go a bit crazy if that happens, but at least we should
119          * try not to panic the whole kernel. */
120         if (needed < 2)
121                 needed = 2;
122
123         /* But we need to bound the transaction so we don't overflow the
124          * journal. */
125         if (needed > EXT3_MAX_TRANS_DATA) 
126                 needed = EXT3_MAX_TRANS_DATA;
127
128         return EXT3_DATA_TRANS_BLOCKS + needed;
129 }
130
131 /* 
132  * Truncate transactions can be complex and absolutely huge.  So we need to
133  * be able to restart the transaction at a convenient checkpoint to make
134  * sure we don't overflow the journal.
135  *
136  * start_transaction gets us a new handle for a truncate transaction,
137  * and extend_transaction tries to extend the existing one a bit.  If
138  * extend fails, we need to propagate the failure up and restart the
139  * transaction in the top-level truncate loop. --sct 
140  */
141
142 static handle_t *start_transaction(struct inode *inode) 
143 {
144         handle_t *result;
145
146         result = ext3_journal_start(inode, blocks_for_truncate(inode));
147         if (!IS_ERR(result))
148                 return result;
149
150         ext3_std_error(inode->i_sb, PTR_ERR(result));
151         return result;
152 }
153
154 /*
155  * Try to extend this transaction for the purposes of truncation.
156  *
157  * Returns 0 if we managed to create more room.  If we can't create more
158  * room, and the transaction must be restarted we return 1.
159  */
160 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
161 {
162         if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
163                 return 0;
164         if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
165                 return 0;
166         return 1;
167 }
168
169 /*
170  * Restart the transaction associated with *handle.  This does a commit,
171  * so before we call here everything must be consistently dirtied against
172  * this transaction.
173  */
174 static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
175 {
176         jbd_debug(2, "restarting handle %p\n", handle);
177         return ext3_journal_restart(handle, blocks_for_truncate(inode));
178 }
179
180 /*
181  * Called at each iput()
182  *
183  * The inode may be "bad" if ext3_read_inode() saw an error from
184  * ext3_get_inode(), so we need to check that to avoid freeing random disk
185  * blocks.
186  */
void ext3_put_inode(struct inode *inode)
{
	/* Leave "bad" inodes alone so we never free random disk blocks. */
	if (is_bad_inode(inode))
		return;

	ext3_discard_prealloc(inode);
}
192
193 static void ext3_truncate_nocheck (struct inode *inode);
194
195 /*
196  * Called at the last iput() if i_nlink is zero.
197  */
198 void ext3_delete_inode (struct inode * inode)
199 {
200         handle_t *handle;
201
202         if (is_bad_inode(inode))
203                 goto no_delete;
204
205         handle = start_transaction(inode);
206         if (IS_ERR(handle)) {
207                 /* If we're going to skip the normal cleanup, we still
208                  * need to make sure that the in-core orphan linked list
209                  * is properly cleaned up. */
210                 ext3_orphan_del(NULL, inode);
211                 goto no_delete;
212         }
213
214         if (IS_SYNC(inode))
215                 handle->h_sync = 1;
216         inode->i_size = 0;
217         if (inode->i_blocks)
218                 ext3_truncate_nocheck(inode);
219         /*
220          * Kill off the orphan record which ext3_truncate created.
221          * AKPM: I think this can be inside the above `if'.
222          * Note that ext3_orphan_del() has to be able to cope with the
223          * deletion of a non-existent orphan - this is because we don't
224          * know if ext3_truncate() actually created an orphan record.
225          * (Well, we could do this if we need to, but heck - it works)
226          */
227         ext3_orphan_del(handle, inode);
228         EXT3_I(inode)->i_dtime  = get_seconds();
229
230         /* 
231          * One subtle ordering requirement: if anything has gone wrong
232          * (transaction abort, IO errors, whatever), then we can still
233          * do these next steps (the fs will already have been marked as
234          * having errors), but we can't free the inode if the mark_dirty
235          * fails.  
236          */
237         if (ext3_mark_inode_dirty(handle, inode))
238                 /* If that failed, just do the required in-core inode clear. */
239                 clear_inode(inode);
240         else
241                 ext3_free_inode(handle, inode);
242         ext3_journal_stop(handle);
243         return;
244 no_delete:
245         clear_inode(inode);     /* We must guarantee clearing of inode... */
246 }
247
/*
 * Give back any blocks still preallocated for @inode.  The whole body is
 * compiled only when EXT3_PREALLOCATE is defined; otherwise this is a
 * no-op.
 */
void ext3_discard_prealloc (struct inode * inode)
{
#ifdef EXT3_PREALLOCATE
	struct ext3_inode_info *ei = EXT3_I(inode);
	unsigned short count;
	unsigned long first;

	/* Writer: ->i_prealloc* */
	if (!ei->i_prealloc_count)
		return;

	/* Snapshot the range, then reset the fields before freeing it. */
	count = ei->i_prealloc_count;
	first = ei->i_prealloc_block;
	ei->i_prealloc_count = 0;
	ei->i_prealloc_block = 0;
	/* Writer: end */

	/*
	 * NOTE(review): other ext3_free_blocks() callers in this file pass
	 * a handle as the first argument; this call does not - confirm
	 * before ever enabling EXT3_PREALLOCATE.
	 */
	ext3_free_blocks (inode, first, count);
#endif
}
263
264 static int ext3_alloc_block (handle_t *handle,
265                         struct inode * inode, unsigned long goal, int *err)
266 {
267         unsigned long result;
268
269 #ifdef EXT3_PREALLOCATE
270 #ifdef EXT3FS_DEBUG
271         static unsigned long alloc_hits, alloc_attempts;
272 #endif
273         struct ext3_inode_info *ei = EXT3_I(inode);
274         /* Writer: ->i_prealloc* */
275         if (ei->i_prealloc_count &&
276             (goal == ei->i_prealloc_block ||
277              goal + 1 == ei->i_prealloc_block))
278         {
279                 result = ei->i_prealloc_block++;
280                 ei->i_prealloc_count--;
281                 /* Writer: end */
282                 ext3_debug ("preallocation hit (%lu/%lu).\n",
283                             ++alloc_hits, ++alloc_attempts);
284         } else {
285                 ext3_discard_prealloc (inode);
286                 ext3_debug ("preallocation miss (%lu/%lu).\n",
287                             alloc_hits, ++alloc_attempts);
288                 if (S_ISREG(inode->i_mode))
289                         result = ext3_new_block (inode, goal, 
290                                  &ei->i_prealloc_count,
291                                  &ei->i_prealloc_block, err);
292                 else
293                         result = ext3_new_block(inode, goal, NULL, NULL, err);
294                 /*
295                  * AKPM: this is somewhat sticky.  I'm not surprised it was
296                  * disabled in 2.2's ext3.  Need to integrate b_committed_data
297                  * guarding with preallocation, if indeed preallocation is
298                  * effective.
299                  */
300         }
301 #else
302         result = ext3_new_block(handle, inode, goal, NULL, NULL, err);
303 #endif
304         return result;
305 }
306
307
308 typedef struct {
309         u32     *p;
310         u32     key;
311         struct buffer_head *bh;
312 } Indirect;
313
314 static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v)
315 {
316         p->key = *(p->p = v);
317         p->bh = bh;
318 }
319
320 static inline int verify_chain(Indirect *from, Indirect *to)
321 {
322         while (from <= to && from->key == *from->p)
323                 from++;
324         return (from > to);
325 }
326
327 /**
328  *      ext3_block_to_path - parse the block number into array of offsets
329  *      @inode: inode in question (we are only interested in its superblock)
330  *      @i_block: block number to be parsed
331  *      @offsets: array to store the offsets in
332  *      @boundary: set this non-zero if the referred-to block is likely to be
333  *             followed (on disk) by an indirect block.
334  *
335  *      To store the locations of file's data ext3 uses a data structure common
336  *      for UNIX filesystems - tree of pointers anchored in the inode, with
337  *      data blocks at leaves and indirect blocks in intermediate nodes.
338  *      This function translates the block number into path in that tree -
339  *      return value is the path length and @offsets[n] is the offset of
340  *      pointer to (n+1)th node in the nth one. If @i_block is out of range
341  *      (negative or too large) warning is printed and zero returned.
342  *
343  *      Note: function doesn't find node addresses, so no IO is needed. All
344  *      we need to know is the capacity of indirect blocks (taken from the
345  *      inode->i_sb).
346  */
347
348 /*
349  * Portability note: the last comparison (check that we fit into triple
350  * indirect block) is spelled differently, because otherwise on an
351  * architecture with 32-bit longs and 8Kb pages we might get into trouble
352  * if our filesystem had 8Kb blocks. We might use long long, but that would
353  * kill us on x86. Oh, well, at least the sign propagation does not matter -
354  * i_block would have to be negative in the very beginning, so we would not
355  * get there at all.
356  */
357
358 static int ext3_block_to_path(struct inode *inode,
359                         long i_block, int offsets[4], int *boundary)
360 {
361         int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
362         int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
363         const long direct_blocks = EXT3_NDIR_BLOCKS,
364                 indirect_blocks = ptrs,
365                 double_blocks = (1 << (ptrs_bits * 2));
366         int n = 0;
367         int final = 0;
368
369         if (i_block < 0) {
370                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
371         } else if (i_block < direct_blocks) {
372                 offsets[n++] = i_block;
373                 final = direct_blocks;
374         } else if ( (i_block -= direct_blocks) < indirect_blocks) {
375                 offsets[n++] = EXT3_IND_BLOCK;
376                 offsets[n++] = i_block;
377                 final = ptrs;
378         } else if ((i_block -= indirect_blocks) < double_blocks) {
379                 offsets[n++] = EXT3_DIND_BLOCK;
380                 offsets[n++] = i_block >> ptrs_bits;
381                 offsets[n++] = i_block & (ptrs - 1);
382                 final = ptrs;
383         } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
384                 offsets[n++] = EXT3_TIND_BLOCK;
385                 offsets[n++] = i_block >> (ptrs_bits * 2);
386                 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
387                 offsets[n++] = i_block & (ptrs - 1);
388                 final = ptrs;
389         } else {
390                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
391         }
392         if (boundary)
393                 *boundary = (i_block & (ptrs - 1)) == (final - 1);
394         return n;
395 }
396
397 /**
398  *      ext3_get_branch - read the chain of indirect blocks leading to data
399  *      @inode: inode in question
400  *      @depth: depth of the chain (1 - direct pointer, etc.)
401  *      @offsets: offsets of pointers in inode/indirect blocks
402  *      @chain: place to store the result
403  *      @err: here we store the error value
404  *
405  *      Function fills the array of triples <key, p, bh> and returns %NULL
406  *      if everything went OK or the pointer to the last filled triple
407  *      (incomplete one) otherwise. Upon the return chain[i].key contains
408  *      the number of (i+1)-th block in the chain (as it is stored in memory,
409  *      i.e. little-endian 32-bit), chain[i].p contains the address of that
410  *      number (it points into struct inode for i==0 and into the bh->b_data
411  *      for i>0) and chain[i].bh points to the buffer_head of i-th indirect
412  *      block for i>0 and NULL for i==0. In other words, it holds the block
413  *      numbers of the chain, addresses they were taken from (and where we can
414  *      verify that chain did not change) and buffer_heads hosting these
415  *      numbers.
416  *
417  *      Function stops when it stumbles upon zero pointer (absent block)
418  *              (pointer to last triple returned, *@err == 0)
419  *      or when it gets an IO error reading an indirect block
420  *              (ditto, *@err == -EIO)
421  *      or when it notices that chain had been changed while it was reading
422  *              (ditto, *@err == -EAGAIN)
423  *      or when it reads all @depth-1 indirect blocks successfully and finds
424  *      the whole chain, all way to the data (returns %NULL, *err == 0).
425  */
426 static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
427                                  Indirect chain[4], int *err)
428 {
429         struct super_block *sb = inode->i_sb;
430         Indirect *p = chain;
431         struct buffer_head *bh;
432
433         *err = 0;
434         /* i_data is not going away, no lock needed */
435         add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
436         if (!p->key)
437                 goto no_block;
438         while (--depth) {
439                 bh = sb_bread(sb, le32_to_cpu(p->key));
440                 if (!bh)
441                         goto failure;
442                 /* Reader: pointers */
443                 if (!verify_chain(chain, p))
444                         goto changed;
445                 add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
446                 /* Reader: end */
447                 if (!p->key)
448                         goto no_block;
449         }
450         return NULL;
451
452 changed:
453         brelse(bh);
454         *err = -EAGAIN;
455         goto no_block;
456 failure:
457         *err = -EIO;
458 no_block:
459         return p;
460 }
461
462 /**
463  *      ext3_find_near - find a place for allocation with sufficient locality
464  *      @inode: owner
465  *      @ind: descriptor of indirect block.
466  *
467  *      This function returns the preferred place for block allocation.
468  *      It is used when heuristic for sequential allocation fails.
469  *      Rules are:
470  *        + if there is a block to the left of our position - allocate near it.
471  *        + if pointer will live in indirect block - allocate near that block.
472  *        + if pointer will live in inode - allocate in the same
473  *          cylinder group. 
474  *
475  * In the latter case we colour the starting block by the callers PID to
476  * prevent it from clashing with concurrent allocations for a different inode
477  * in the same block group.   The PID is used here so that functionally related
478  * files will be close-by on-disk.
479  *
480  *      Caller must make sure that @ind is valid and will stay that way.
481  */
482
483 static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
484 {
485         struct ext3_inode_info *ei = EXT3_I(inode);
486         u32 *start = ind->bh ? (u32*) ind->bh->b_data : ei->i_data;
487         u32 *p;
488         unsigned long bg_start;
489         unsigned long colour;
490
491         /* Try to find previous block */
492         for (p = ind->p - 1; p >= start; p--)
493                 if (*p)
494                         return le32_to_cpu(*p);
495
496         /* No such thing, so let's try location of indirect block */
497         if (ind->bh)
498                 return ind->bh->b_blocknr;
499
500         /*
501          * It is going to be refered from inode itself? OK, just put it into
502          * the same cylinder group then.
503          */
504         bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
505                 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
506         colour = (current->pid % 16) *
507                         (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
508         return bg_start + colour;
509 }
510
511 /**
512  *      ext3_find_goal - find a preferred place for allocation.
513  *      @inode: owner
514  *      @block:  block we want
515  *      @chain:  chain of indirect blocks
516  *      @partial: pointer to the last triple within a chain
517  *      @goal:  place to store the result.
518  *
519  *      Normally this function finds the preferred place for block allocation,
520  *      stores it in *@goal and returns zero. If the branch had been changed
521  *      under us we return -EAGAIN.
522  */
523
524 static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
525                           Indirect *partial, unsigned long *goal)
526 {
527         struct ext3_inode_info *ei = EXT3_I(inode);
528         /* Writer: ->i_next_alloc* */
529         if (block == ei->i_next_alloc_block + 1) {
530                 ei->i_next_alloc_block++;
531                 ei->i_next_alloc_goal++;
532         }
533         /* Writer: end */
534         /* Reader: pointers, ->i_next_alloc* */
535         if (verify_chain(chain, partial)) {
536                 /*
537                  * try the heuristic for sequential allocation,
538                  * failing that at least try to get decent locality.
539                  */
540                 if (block == ei->i_next_alloc_block)
541                         *goal = ei->i_next_alloc_goal;
542                 if (!*goal)
543                         *goal = ext3_find_near(inode, partial);
544                 return 0;
545         }
546         /* Reader: end */
547         return -EAGAIN;
548 }
549
550 /**
551  *      ext3_alloc_branch - allocate and set up a chain of blocks.
552  *      @inode: owner
553  *      @num: depth of the chain (number of blocks to allocate)
554  *      @offsets: offsets (in the blocks) to store the pointers to next.
555  *      @branch: place to store the chain in.
556  *
557  *      This function allocates @num blocks, zeroes out all but the last one,
558  *      links them into chain and (if we are synchronous) writes them to disk.
559  *      In other words, it prepares a branch that can be spliced onto the
560  *      inode. It stores the information about that chain in the branch[], in
561  *      the same format as ext3_get_branch() would do. We are calling it after
562  *      we had read the existing part of chain and partial points to the last
563  *      triple of that (one with zero ->key). Upon the exit we have the same
564  *      picture as after the successful ext3_get_block(), except that in one
565  *      place chain is disconnected - *branch->p is still zero (we did not
566  *      set the last link), but branch->key contains the number that should
567  *      be placed into *branch->p to fill that gap.
568  *
569  *      If allocation fails we free all blocks we've allocated (and forget
570  *      their buffer_heads) and return the error value from the failed
571  *      ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
572  *      as described above and return 0.
573  */
574
575 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
576                              int num,
577                              unsigned long goal,
578                              int *offsets,
579                              Indirect *branch)
580 {
581         int blocksize = inode->i_sb->s_blocksize;
582         int n = 0, keys = 0;
583         int err = 0;
584         int i;
585         int parent = ext3_alloc_block(handle, inode, goal, &err);
586
587         branch[0].key = cpu_to_le32(parent);
588         if (parent) {
589                 for (n = 1; n < num; n++) {
590                         struct buffer_head *bh;
591                         /* Allocate the next block */
592                         int nr = ext3_alloc_block(handle, inode, parent, &err);
593                         if (!nr)
594                                 break;
595                         branch[n].key = cpu_to_le32(nr);
596                         keys = n+1;
597
598                         /*
599                          * Get buffer_head for parent block, zero it out
600                          * and set the pointer to new one, then send
601                          * parent to disk.  
602                          */
603                         bh = sb_getblk(inode->i_sb, parent);
604                         branch[n].bh = bh;
605                         lock_buffer(bh);
606                         BUFFER_TRACE(bh, "call get_create_access");
607                         err = ext3_journal_get_create_access(handle, bh);
608                         if (err) {
609                                 unlock_buffer(bh);
610                                 brelse(bh);
611                                 break;
612                         }
613
614                         memset(bh->b_data, 0, blocksize);
615                         branch[n].p = (u32*) bh->b_data + offsets[n];
616                         *branch[n].p = branch[n].key;
617                         BUFFER_TRACE(bh, "marking uptodate");
618                         set_buffer_uptodate(bh);
619                         unlock_buffer(bh);
620
621                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
622                         err = ext3_journal_dirty_metadata(handle, bh);
623                         if (err)
624                                 break;
625
626                         parent = nr;
627                 }
628         }
629         if (n == num)
630                 return 0;
631
632         /* Allocation failed, free what we already allocated */
633         for (i = 1; i < keys; i++) {
634                 BUFFER_TRACE(branch[i].bh, "call journal_forget");
635                 ext3_journal_forget(handle, branch[i].bh);
636         }
637         for (i = 0; i < keys; i++)
638                 ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
639         return err;
640 }
641
642 /**
643  *      ext3_splice_branch - splice the allocated branch onto inode.
644  *      @inode: owner
645  *      @block: (logical) number of block we are adding
646  *      @chain: chain of indirect blocks (with a missing link - see
647  *              ext3_alloc_branch)
648  *      @where: location of missing link
649  *      @num:   number of blocks we are adding
650  *
651  *      This function verifies that chain (up to the missing link) had not
652  *      changed, fills the missing link and does all housekeeping needed in
653  *      inode (->i_blocks, etc.). In case of success we end up with the full
654  *      chain to new block and return 0. Otherwise (== chain had been changed)
655  *      we free the new blocks (forgetting their buffer_heads, indeed) and
656  *      return -EAGAIN.
657  */
658
659 static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
660                               Indirect chain[4], Indirect *where, int num)
661 {
662         int i;
663         int err = 0;
664         struct ext3_inode_info *ei = EXT3_I(inode);
665
666         /*
667          * If we're splicing into a [td]indirect block (as opposed to the
668          * inode) then we need to get write access to the [td]indirect block
669          * before the splice.
670          */
671         if (where->bh) {
672                 BUFFER_TRACE(where->bh, "get_write_access");
673                 err = ext3_journal_get_write_access(handle, where->bh);
674                 if (err)
675                         goto err_out;
676         }
677         /* Verify that place we are splicing to is still there and vacant */
678
679         /* Writer: pointers, ->i_next_alloc* */
680         if (!verify_chain(chain, where-1) || *where->p)
681                 /* Writer: end */
682                 goto changed;
683
684         /* That's it */
685
686         *where->p = where->key;
687         ei->i_next_alloc_block = block;
688         ei->i_next_alloc_goal = le32_to_cpu(where[num-1].key);
689         /* Writer: end */
690
691         /* We are done with atomic stuff, now do the rest of housekeeping */
692
693         inode->i_ctime = CURRENT_TIME;
694         ext3_mark_inode_dirty(handle, inode);
695
696         /* had we spliced it onto indirect block? */
697         if (where->bh) {
698                 /*
699                  * akpm: If we spliced it onto an indirect block, we haven't
700                  * altered the inode.  Note however that if it is being spliced
701                  * onto an indirect block at the very end of the file (the
702                  * file is growing) then we *will* alter the inode to reflect
703                  * the new i_size.  But that is not done here - it is done in
704                  * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
705                  */
706                 jbd_debug(5, "splicing indirect only\n");
707                 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
708                 err = ext3_journal_dirty_metadata(handle, where->bh);
709                 if (err) 
710                         goto err_out;
711         } else {
712                 /*
713                  * OK, we spliced it into the inode itself on a direct block.
714                  * Inode was dirtied above.
715                  */
716                 jbd_debug(5, "splicing direct\n");
717         }
718         return err;
719
720 changed:
721         /*
722          * AKPM: if where[i].bh isn't part of the current updating
723          * transaction then we explode nastily.  Test this code path.
724          */
725         jbd_debug(1, "the chain changed: try again\n");
726         err = -EAGAIN;
727
728 err_out:
729         for (i = 1; i < num; i++) {
730                 BUFFER_TRACE(where[i].bh, "call journal_forget");
731                 ext3_journal_forget(handle, where[i].bh);
732         }
733         /* For the normal collision cleanup case, we free up the blocks.
734          * On genuine filesystem errors we don't even think about doing
735          * that. */
736         if (err == -EAGAIN)
737                 for (i = 0; i < num; i++)
738                         ext3_free_blocks(handle, inode, 
739                                          le32_to_cpu(where[i].key), 1);
740         return err;
741 }
742
/*
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf. So let's do it before attaching anything
 * to tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from inode.
 *
 * akpm: `handle' can be NULL if create == 0.
 *
 * The BKL may not be held on entry here.  Be sure to take it early.
 *
 * Parameters:
 *   handle          - journal handle; may be NULL only when create == 0
 *                     (asserted below).
 *   inode           - file whose block is being looked up / allocated.
 *   iblock          - block number handed to ext3_block_to_path().
 *   bh_result       - buffer head that is mapped (map_bh) on success; it is
 *                     flagged "new" when the block was allocated here and
 *                     "boundary" when ext3_block_to_path() set `boundary'.
 *   create          - zero means plain lookup: nothing is allocated.
 *   extend_disksize - when non-zero and the allocation succeeded, i_disksize
 *                     is raised to i_size if it is smaller.
 *
 * Returns the value left in `err': -EIO when ext3_block_to_path() reports a
 * depth of 0, otherwise whatever ext3_get_branch(), ext3_alloc_branch() or
 * ext3_splice_branch() last stored there.  -EAGAIN is never returned from
 * the allocation path; it causes a retry from `reread'.
 */

static int
ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
                struct buffer_head *bh_result, int create, int extend_disksize)
{
        int err = -EIO;
        int offsets[4];
        Indirect chain[4];
        Indirect *partial;
        unsigned long goal;
        int left;
        int boundary = 0;
        int depth = ext3_block_to_path(inode, iblock, offsets, &boundary);
        struct ext3_inode_info *ei = EXT3_I(inode);

        J_ASSERT(handle != NULL || create == 0);

        /* No path for this block: bail out with the initial -EIO. */
        if (depth == 0)
                goto out;

reread:
        partial = ext3_get_branch(inode, depth, offsets, chain, &err);

        /* Simplest case - block found, no allocation needed */
        if (!partial) {
                clear_buffer_new(bh_result);
got_it:
                /*
                 * Also entered from the allocation path below (via
                 * "goto got_it") once the new branch has been spliced in.
                 */
                map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
                if (boundary)
                        set_buffer_boundary(bh_result);
                /* Clean up and exit */
                partial = chain+depth-1; /* the whole chain */
                goto cleanup;
        }

        /* Next simple case - plain lookup or failed read of indirect block */
        if (!create || err == -EIO) {
cleanup:
                /*
                 * Release the buffers of chain[1..partial]; chain[0] is
                 * never brelse'd here.
                 */
                while (partial > chain) {
                        BUFFER_TRACE(partial->bh, "call brelse");
                        brelse(partial->bh);
                        partial--;
                }
                BUFFER_TRACE(bh_result, "returned");
out:
                return err;
        }

        /*
         * Indirect block might be removed by truncate while we were
         * reading it. Handling of that case (forget what we've got and
         * reread) is taken out of the main path.
         */
        if (err == -EAGAIN)
                goto changed;

        goal = 0;
        /*
         * truncate_sem is held from here until after the splice and the
         * i_disksize update; every exit below drops it first.
         */
        down(&ei->truncate_sem);
        if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0) {
                up(&ei->truncate_sem);
                goto changed;
        }

        /* Number of chain entries from `partial' to the end of the path. */
        left = (chain + depth) - partial;

        /*
         * Block out ext3_truncate while we alter the tree
         * (the semaphore taken above is what does the blocking).
         */
        err = ext3_alloc_branch(handle, inode, left, goal,
                                        offsets+(partial-chain), partial);

        /* The ext3_splice_branch call will free and forget any buffers
         * on the new chain if there is a failure, but that risks using
         * up transaction credits, especially for bitmaps where the
         * credits cannot be returned.  Can we handle this somehow?  We
         * may need to return -EAGAIN upwards in the worst case.  --sct */
        if (!err)
                err = ext3_splice_branch(handle, inode, iblock, chain,
                                         partial, left);
        /* i_disksize growing is protected by truncate_sem
         * don't forget to protect it if you're about to implement
         * concurrent ext3_get_block() -bzzz */
        if (!err && extend_disksize && inode->i_size > ei->i_disksize)
                ei->i_disksize = inode->i_size;
        up(&ei->truncate_sem);
        /* -EAGAIN: drop what we hold and retry the lookup from scratch. */
        if (err == -EAGAIN)
                goto changed;
        if (err)
                goto cleanup;

        set_buffer_new(bh_result);
        goto got_it;

changed:
        /* Release the partially read chain, then look the path up again. */
        while (partial > chain) {
                jbd_debug(1, "buffer chain changed, retrying\n");
                BUFFER_TRACE(partial->bh, "brelsing");
                brelse(partial->bh);
                partial--;
        }
        goto reread;
}
861
862 static int ext3_get_block(struct inode *inode, sector_t iblock,
863                         struct buffer_head *bh_result, int create)
864 {
865         handle_t *handle = NULL;
866         int ret;
867
868         if (create) {
869                 handle = ext3_journal_current_handle();
870                 J_ASSERT(handle != 0);
871         }
872         ret = ext3_get_block_handle(handle, inode, iblock,
873                                 bh_result, create, 1);
874         return ret;
875 }
876
877 #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
878
879 static int
880 ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
881                 unsigned long max_blocks, struct buffer_head *bh_result,
882                 int create)
883 {
884         handle_t *handle = journal_current_handle();
885         int ret = 0;
886
887         if (!handle)
888                 goto get_block;         /* A read */
889
890         if (handle->h_transaction->t_state == T_LOCKED) {
891                 /*
892                  * Huge direct-io writes can hold off commits for long
893                  * periods of time.  Let this commit run.
894                  */
895                 ext3_journal_stop(handle);
896                 handle = ext3_journal_start(inode, DIO_CREDITS);
897                 if (IS_ERR(handle))
898                         ret = PTR_ERR(handle);
899                 goto get_block;
900         }
901
902         if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
903                 /*
904                  * Getting low on buffer credits...
905                  */
906                 ret = ext3_journal_extend(handle, DIO_CREDITS);
907                 if (ret > 0) {
908                         /*
909                          * Couldn't extend the transaction.  Start a new one.
910                          */
911                         ret = ext3_journal_restart(handle, DIO_CREDITS);
912                 }
913         }
914
915 get_block:
916         if (ret == 0)
917                 ret = ext3_get_block_handle(handle, inode, iblock,
918                                         bh_result, create, 0);
919         bh_result->b_size = (1 << inode->i_blkbits);
920         return ret;
921 }
922
923 /*
924  * `handle' can be NULL if create is zero
925  */
926 struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
927                                 long block, int create, int * errp)
928 {
929         struct buffer_head dummy;
930         int fatal = 0, err;
931
932         J_ASSERT(handle != NULL || create == 0);
933
934         dummy.b_state = 0;
935         dummy.b_blocknr = -1000;
936         buffer_trace_init(&dummy.b_history);
937         *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
938         if (!*errp && buffer_mapped(&dummy)) {
939                 struct buffer_head *bh;
940                 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
941                 if (buffer_new(&dummy)) {
942                         J_ASSERT(create != 0);
943                         J_ASSERT(handle != 0);
944
945                         /* Now that we do not always journal data, we
946                            should keep in mind whether this should
947                            always journal the new buffer as metadata.
948                            For now, regular file writes use
949                            ext3_get_block instead, so it's not a
950                            problem. */
951                         lock_buffer(bh);
952                         BUFFER_TRACE(bh, "call get_create_access");
953                         fatal = ext3_journal_get_create_access(handle, bh);
954                         if (!fatal && !buffer_uptodate(bh)) {
955                                 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
956                                 set_buffer_uptodate(bh);
957                         }
958                         unlock_buffer(bh);
959                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
960                         err = ext3_journal_dirty_metadata(handle, bh);
961                         if (!fatal)
962                                 fatal = err;
963                 } else {
964                         BUFFER_TRACE(bh, "not a new buffer");
965                 }
966                 if (fatal) {
967                         *errp = fatal;
968                         brelse(bh);
969                         bh = NULL;
970                 }
971                 return bh;
972         }
973         return NULL;
974 }
975
976 struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
977                                int block, int create, int *err)
978 {
979         struct buffer_head * bh;
980         int prev_blocks;
981
982         prev_blocks = inode->i_blocks;
983
984         bh = ext3_getblk (handle, inode, block, create, err);
985         if (!bh)
986                 return bh;
987 #ifdef EXT3_PREALLOCATE
988         /*
989          * If the inode has grown, and this is a directory, then use a few
990          * more of the preallocated blocks to keep directory fragmentation
991          * down.  The preallocated blocks are guaranteed to be contiguous.
992          */
993         if (create &&
994             S_ISDIR(inode->i_mode) &&
995             inode->i_blocks > prev_blocks &&
996             EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
997                                     EXT3_FEATURE_COMPAT_DIR_PREALLOC)) {
998                 int i;
999                 struct buffer_head *tmp_bh;
1000
1001                 for (i = 1;
1002                      EXT3_I(inode)->i_prealloc_count &&
1003                      i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks;
1004                      i++) {
1005                         /*
1006                          * ext3_getblk will zero out the contents of the
1007                          * directory for us
1008                          */
1009                         tmp_bh = ext3_getblk(handle, inode,
1010                                                 block+i, create, err);
1011                         if (!tmp_bh) {
1012                                 brelse (bh);
1013                                 return 0;
1014                         }
1015                         brelse (tmp_bh);
1016                 }
1017         }
1018 #endif
1019         if (buffer_uptodate(bh))
1020                 return bh;
1021         ll_rw_block (READ, 1, &bh);
1022         wait_on_buffer (bh);
1023         if (buffer_uptodate(bh))
1024                 return bh;
1025         brelse (bh);
1026         *err = -EIO;
1027         return NULL;
1028 }
1029
1030 static int walk_page_buffers(   handle_t *handle,
1031                                 struct buffer_head *head,
1032                                 unsigned from,
1033                                 unsigned to,
1034                                 int *partial,
1035                                 int (*fn)(      handle_t *handle,
1036                                                 struct buffer_head *bh))
1037 {
1038         struct buffer_head *bh;
1039         unsigned block_start, block_end;
1040         unsigned blocksize = head->b_size;
1041         int err, ret = 0;
1042         struct buffer_head *next;
1043
1044         for (   bh = head, block_start = 0;
1045                 ret == 0 && (bh != head || !block_start);
1046                 block_start = block_end, bh = next)
1047         {
1048                 next = bh->b_this_page;
1049                 block_end = block_start + blocksize;
1050                 if (block_end <= from || block_start >= to) {
1051                         if (partial && !buffer_uptodate(bh))
1052                                 *partial = 1;
1053                         continue;
1054                 }
1055                 err = (*fn)(handle, bh);
1056                 if (!ret)
1057                         ret = err;
1058         }
1059         return ret;
1060 }
1061
1062 /*
1063  * To preserve ordering, it is essential that the hole instantiation and
1064  * the data write be encapsulated in a single transaction.  We cannot
1065  * close off a transaction and start a new one between the ext3_get_block()
1066  * and the commit_write().  So doing the journal_start at the start of
1067  * prepare_write() is the right place.
1068  *
1069  * Also, this function can nest inside ext3_writepage() ->
1070  * block_write_full_page(). In that case, we *know* that ext3_writepage()
1071  * has generated enough buffer credits to do the whole page.  So we won't
1072  * block on the journal in that case, which is good, because the caller may
1073  * be PF_MEMALLOC.
1074  *
1075  * By accident, ext3 can be reentered when a transaction is open via
1076  * quota file writes.  If we were to commit the transaction while thus
1077  * reentered, there can be a deadlock - we would be holding a quota
1078  * lock, and the commit would never complete if another thread had a
1079  * transaction open and was blocking on the quota lock - a ranking
1080  * violation.
1081  *
1082  * So what we do is to rely on the fact that journal_stop/journal_start
1083  * will _not_ run commit under these circumstances because handle->h_ref
1084  * is elevated.  We'll still have enough credits for the tiny quotafile
1085  * write.  
1086  */
1087
1088 static int do_journal_get_write_access(handle_t *handle, 
1089                                        struct buffer_head *bh)
1090 {
1091         if (!buffer_mapped(bh) || buffer_freed(bh))
1092                 return 0;
1093         return ext3_journal_get_write_access(handle, bh);
1094 }
1095
1096 static int ext3_prepare_write(struct file *file, struct page *page,
1097                               unsigned from, unsigned to)
1098 {
1099         struct inode *inode = page->mapping->host;
1100         int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
1101         handle_t *handle;
1102         int retries = 0;
1103
1104 retry:
1105         handle = ext3_journal_start(inode, needed_blocks);
1106         if (IS_ERR(handle)) {
1107                 ret = PTR_ERR(handle);
1108                 goto out;
1109         }
1110         ret = block_prepare_write(page, from, to, ext3_get_block);
1111         if (ret)
1112                 goto prepare_write_failed;
1113
1114         if (ext3_should_journal_data(inode)) {
1115                 ret = walk_page_buffers(handle, page_buffers(page),
1116                                 from, to, NULL, do_journal_get_write_access);
1117         }
1118 prepare_write_failed:
1119         if (ret)
1120                 ext3_journal_stop(handle);
1121         if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1122                 goto retry;
1123 out:
1124         return ret;
1125 }
1126
1127 static int
1128 ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1129 {
1130         int err = journal_dirty_data(handle, bh);
1131         if (err)
1132                 ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1133                                                 bh, handle,err);
1134         return err;
1135 }
1136
1137 /* For commit_write() in data=journal mode */
1138 static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1139 {
1140         if (!buffer_mapped(bh) || buffer_freed(bh))
1141                 return 0;
1142         set_buffer_uptodate(bh);
1143         return ext3_journal_dirty_metadata(handle, bh);
1144 }
1145
1146 /*
1147  * We need to pick up the new inode size which generic_commit_write gave us
1148  * `file' can be NULL - eg, when called from page_symlink().
1149  *
1150  * ext3 never places buffers on inode->i_mapping->private_list.  metadata
1151  * buffers are managed internally.
1152  */
1153
1154 static int ext3_ordered_commit_write(struct file *file, struct page *page,
1155                              unsigned from, unsigned to)
1156 {
1157         handle_t *handle = ext3_journal_current_handle();
1158         struct inode *inode = page->mapping->host;
1159         int ret = 0, ret2;
1160
1161         ret = walk_page_buffers(handle, page_buffers(page),
1162                 from, to, NULL, ext3_journal_dirty_data);
1163
1164         if (ret == 0) {
1165                 /*
1166                  * generic_commit_write() will run mark_inode_dirty() if i_size
1167                  * changes.  So let's piggyback the i_disksize mark_inode_dirty
1168                  * into that.
1169                  */
1170                 loff_t new_i_size;
1171
1172                 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1173                 if (new_i_size > EXT3_I(inode)->i_disksize)
1174                         EXT3_I(inode)->i_disksize = new_i_size;
1175                 ret = generic_commit_write(file, page, from, to);
1176         }
1177         ret2 = ext3_journal_stop(handle);
1178         if (!ret)
1179                 ret = ret2;
1180         return ret;
1181 }
1182
1183 static int ext3_writeback_commit_write(struct file *file, struct page *page,
1184                              unsigned from, unsigned to)
1185 {
1186         handle_t *handle = ext3_journal_current_handle();
1187         struct inode *inode = page->mapping->host;
1188         int ret = 0, ret2;
1189         loff_t new_i_size;
1190
1191         new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1192         if (new_i_size > EXT3_I(inode)->i_disksize)
1193                 EXT3_I(inode)->i_disksize = new_i_size;
1194         ret = generic_commit_write(file, page, from, to);
1195         ret2 = ext3_journal_stop(handle);
1196         if (!ret)
1197                 ret = ret2;
1198         return ret;
1199 }
1200
1201 static int ext3_journalled_commit_write(struct file *file,
1202                         struct page *page, unsigned from, unsigned to)
1203 {
1204         handle_t *handle = ext3_journal_current_handle();
1205         struct inode *inode = page->mapping->host;
1206         int ret = 0, ret2;
1207         int partial = 0;
1208         loff_t pos;
1209
1210         /*
1211          * Here we duplicate the generic_commit_write() functionality
1212          */
1213         pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1214
1215         ret = walk_page_buffers(handle, page_buffers(page), from,
1216                                 to, &partial, commit_write_fn);
1217         if (!partial)
1218                 SetPageUptodate(page);
1219         if (pos > inode->i_size)
1220                 i_size_write(inode, pos);
1221         EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1222         if (inode->i_size > EXT3_I(inode)->i_disksize) {
1223                 EXT3_I(inode)->i_disksize = inode->i_size;
1224                 ret2 = ext3_mark_inode_dirty(handle, inode);
1225                 if (!ret) 
1226                         ret = ret2;
1227         }
1228         ret2 = ext3_journal_stop(handle);
1229         if (!ret)
1230                 ret = ret2;
1231         return ret;
1232 }
1233
1234 /* 
1235  * bmap() is special.  It gets used by applications such as lilo and by
1236  * the swapper to find the on-disk block of a specific piece of data.
1237  *
1238  * Naturally, this is dangerous if the block concerned is still in the
1239  * journal.  If somebody makes a swapfile on an ext3 data-journaling
1240  * filesystem and enables swap, then they may get a nasty shock when the
1241  * data getting swapped to that swapfile suddenly gets overwritten by
1242  * the original zero's written out previously to the journal and
1243  * awaiting writeback in the kernel's buffer cache. 
1244  *
1245  * So, if we see any bmap calls here on a modified, data-journaled file,
1246  * take extra steps to flush any blocks which might be in the cache. 
1247  */
1248 static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1249 {
1250         struct inode *inode = mapping->host;
1251         journal_t *journal;
1252         int err;
1253
1254         if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
1255                 /* 
1256                  * This is a REALLY heavyweight approach, but the use of
1257                  * bmap on dirty files is expected to be extremely rare:
1258                  * only if we run lilo or swapon on a freshly made file
1259                  * do we expect this to happen. 
1260                  *
1261                  * (bmap requires CAP_SYS_RAWIO so this does not
1262                  * represent an unprivileged user DOS attack --- we'd be
1263                  * in trouble if mortal users could trigger this path at
1264                  * will.) 
1265                  *
1266                  * NB. EXT3_STATE_JDATA is not set on files other than
1267                  * regular files.  If somebody wants to bmap a directory
1268                  * or symlink and gets confused because the buffer
1269                  * hasn't yet been flushed to disk, they deserve
1270                  * everything they get.
1271                  */
1272
1273                 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
1274                 journal = EXT3_JOURNAL(inode);
1275                 journal_lock_updates(journal);
1276                 err = journal_flush(journal);
1277                 journal_unlock_updates(journal);
1278
1279                 if (err)
1280                         return 0;
1281         }
1282
1283         return generic_block_bmap(mapping,block,ext3_get_block);
1284 }
1285
1286 static int bget_one(handle_t *handle, struct buffer_head *bh)
1287 {
1288         get_bh(bh);
1289         return 0;
1290 }
1291
1292 static int bput_one(handle_t *handle, struct buffer_head *bh)
1293 {
1294         put_bh(bh);
1295         return 0;
1296 }
1297
1298 static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1299 {
1300         if (buffer_mapped(bh))
1301                 return ext3_journal_dirty_data(handle, bh);
1302         return 0;
1303 }
1304
1305 /*
1306  * Note that we always start a transaction even if we're not journalling
1307  * data.  This is to preserve ordering: any hole instantiation within
1308  * __block_write_full_page -> ext3_get_block() should be journalled
1309  * along with the data so we don't crash and then get metadata which
1310  * refers to old data.
1311  *
1312  * In all journalling modes block_write_full_page() will start the I/O.
1313  *
1314  * Problem:
1315  *
1316  *      ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1317  *              ext3_writepage()
1318  *
1319  * Similar for:
1320  *
1321  *      ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1322  *
1323  * Same applies to ext3_get_block().  We will deadlock on various things like
1324  * lock_journal and i_truncate_sem.
1325  *
1326  * Setting PF_MEMALLOC here doesn't work - too many internal memory
1327  * allocations fail.
1328  *
1329  * 16May01: If we're reentered then journal_current_handle() will be
1330  *          non-zero. We simply *return*.
1331  *
1332  * 1 July 2001: @@@ FIXME:
1333  *   In journalled data mode, a data buffer may be metadata against the
1334  *   current transaction.  But the same file is part of a shared mapping
1335  *   and someone does a writepage() on it.
1336  *
1337  *   We will move the buffer onto the async_data list, but *after* it has
1338  *   been dirtied. So there's a small window where we have dirty data on
1339  *   BJ_Metadata.
1340  *
1341  *   Note that this only applies to the last partial page in the file.  The
1342  *   bit which block_write_full_page() uses prepare/commit for.  (That's
1343  *   broken code anyway: it's wrong for msync()).
1344  *
1345  *   It's a rare case: affects the final partial page, for journalled data
1346  *   where the file is subject to bith write() and writepage() in the same
1347  *   transction.  To fix it we'll need a custom block_write_full_page().
1348  *   We'll probably need that anyway for journalling writepage() output.
1349  *
1350  * We don't honour synchronous mounts for writepage().  That would be
1351  * disastrous.  Any write() or metadata operation will sync the fs for
1352  * us.
1353  *
1354  * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1355  * we don't need to open a transaction here.
1356  */
1357 static int ext3_ordered_writepage(struct page *page,
1358                         struct writeback_control *wbc)
1359 {
1360         struct inode *inode = page->mapping->host;
1361         struct buffer_head *page_bufs;
1362         handle_t *handle = NULL;
1363         int ret = 0;
1364         int err;
1365
1366         J_ASSERT(PageLocked(page));
1367
1368         /*
1369          * We give up here if we're reentered, because it might be for a
1370          * different filesystem.
1371          */
1372         if (ext3_journal_current_handle())
1373                 goto out_fail;
1374
1375         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1376
1377         if (IS_ERR(handle)) {
1378                 ret = PTR_ERR(handle);
1379                 goto out_fail;
1380         }
1381
1382         if (!page_has_buffers(page)) {
1383                 create_empty_buffers(page, inode->i_sb->s_blocksize,
1384                                 (1 << BH_Dirty)|(1 << BH_Uptodate));
1385         }
1386         page_bufs = page_buffers(page);
1387         walk_page_buffers(handle, page_bufs, 0,
1388                         PAGE_CACHE_SIZE, NULL, bget_one);
1389
1390         ret = block_write_full_page(page, ext3_get_block, wbc);
1391
1392         /*
1393          * The page can become unlocked at any point now, and
1394          * truncate can then come in and change things.  So we
1395          * can't touch *page from now on.  But *page_bufs is
1396          * safe due to elevated refcount.
1397          */
1398
1399         /*
1400          * And attach them to the current transaction.  But only if 
1401          * block_write_full_page() succeeded.  Otherwise they are unmapped,
1402          * and generally junk.
1403          */
1404         if (ret == 0) {
1405                 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1406                                         NULL, journal_dirty_data_fn);
1407                 if (!ret)
1408                         ret = err;
1409         }
1410         walk_page_buffers(handle, page_bufs, 0,
1411                         PAGE_CACHE_SIZE, NULL, bput_one);
1412         err = ext3_journal_stop(handle);
1413         if (!ret)
1414                 ret = err;
1415         return ret;
1416
1417 out_fail:
1418         redirty_page_for_writepage(wbc, page);
1419         unlock_page(page);
1420         return ret;
1421 }
1422
1423 static int ext3_writeback_writepage(struct page *page,
1424                                 struct writeback_control *wbc)
1425 {
1426         struct inode *inode = page->mapping->host;
1427         handle_t *handle = NULL;
1428         int ret = 0;
1429         int err;
1430
1431         if (ext3_journal_current_handle())
1432                 goto out_fail;
1433
1434         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1435         if (IS_ERR(handle)) {
1436                 ret = PTR_ERR(handle);
1437                 goto out_fail;
1438         }
1439
1440         ret = block_write_full_page(page, ext3_get_block, wbc);
1441         err = ext3_journal_stop(handle);
1442         if (!ret)
1443                 ret = err;
1444         return ret;
1445
1446 out_fail:
1447         redirty_page_for_writepage(wbc, page);
1448         unlock_page(page);
1449         return ret;
1450 }
1451
1452 static int ext3_journalled_writepage(struct page *page,
1453                                 struct writeback_control *wbc)
1454 {
1455         struct inode *inode = page->mapping->host;
1456         handle_t *handle = NULL;
1457         int ret = 0;
1458         int err;
1459
1460         if (ext3_journal_current_handle())
1461                 goto no_write;
1462
1463         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1464         if (IS_ERR(handle)) {
1465                 ret = PTR_ERR(handle);
1466                 goto no_write;
1467         }
1468
1469         if (!page_has_buffers(page) || PageChecked(page)) {
1470                 /*
1471                  * It's mmapped pagecache.  Add buffers and journal it.  There
1472                  * doesn't seem much point in redirtying the page here.
1473                  */
1474                 ClearPageChecked(page);
1475                 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1476                                         ext3_get_block);
1477                 if (ret != 0)
1478                         goto out_unlock;
1479                 ret = walk_page_buffers(handle, page_buffers(page), 0,
1480                         PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1481
1482                 err = walk_page_buffers(handle, page_buffers(page), 0,
1483                                 PAGE_CACHE_SIZE, NULL, commit_write_fn);
1484                 if (ret == 0)
1485                         ret = err;
1486                 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1487                 unlock_page(page);
1488         } else {
1489                 /*
1490                  * It may be a page full of checkpoint-mode buffers.  We don't
1491                  * really know unless we go poke around in the buffer_heads.
1492                  * But block_write_full_page will do the right thing.
1493                  */
1494                 ret = block_write_full_page(page, ext3_get_block, wbc);
1495         }
1496         err = ext3_journal_stop(handle);
1497         if (!ret)
1498                 ret = err;
1499 out:
1500         return ret;
1501
1502 no_write:
1503         redirty_page_for_writepage(wbc, page);
1504 out_unlock:
1505         unlock_page(page);
1506         goto out;
1507 }
1508
1509 static int ext3_readpage(struct file *file, struct page *page)
1510 {
1511         return mpage_readpage(page, ext3_get_block);
1512 }
1513
1514 static int
1515 ext3_readpages(struct file *file, struct address_space *mapping,
1516                 struct list_head *pages, unsigned nr_pages)
1517 {
1518         return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1519 }
1520
1521 static int ext3_invalidatepage(struct page *page, unsigned long offset)
1522 {
1523         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1524
1525         /*
1526          * If it's a full truncate we just forget about the pending dirtying
1527          */
1528         if (offset == 0)
1529                 ClearPageChecked(page);
1530
1531         return journal_invalidatepage(journal, page, offset);
1532 }
1533
1534 static int ext3_releasepage(struct page *page, int wait)
1535 {
1536         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1537
1538         WARN_ON(PageChecked(page));
1539         return journal_try_to_free_buffers(journal, page, wait);
1540 }
1541
1542 /*
1543  * If the O_DIRECT write will extend the file then add this inode to the
1544  * orphan list.  So recovery will truncate it back to the original size
1545  * if the machine crashes during the write.
1546  *
1547  * If the O_DIRECT write is intantiating holes inside i_size and the machine
1548  * crashes then stale disk data _may_ be exposed inside the file.
1549  */
1550 static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1551                         const struct iovec *iov, loff_t offset,
1552                         unsigned long nr_segs)
1553 {
1554         struct file *file = iocb->ki_filp;
1555         struct inode *inode = file->f_mapping->host;
1556         struct ext3_inode_info *ei = EXT3_I(inode);
1557         handle_t *handle = NULL;
1558         ssize_t ret;
1559         int orphan = 0;
1560         size_t count = iov_length(iov, nr_segs);
1561
1562         if (rw == WRITE) {
1563                 loff_t final_size = offset + count;
1564
1565                 handle = ext3_journal_start(inode, DIO_CREDITS);
1566                 if (IS_ERR(handle)) {
1567                         ret = PTR_ERR(handle);
1568                         goto out;
1569                 }
1570                 if (final_size > inode->i_size) {
1571                         ret = ext3_orphan_add(handle, inode);
1572                         if (ret)
1573                                 goto out_stop;
1574                         orphan = 1;
1575                         ei->i_disksize = inode->i_size;
1576                 }
1577         }
1578
1579         ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 
1580                                  offset, nr_segs,
1581                                  ext3_direct_io_get_blocks, NULL);
1582
1583 out_stop:
1584         if (handle) {
1585                 int err;
1586
1587                 if (orphan) 
1588                         ext3_orphan_del(handle, inode);
1589                 if (orphan && ret > 0) {
1590                         loff_t end = offset + ret;
1591                         if (end > inode->i_size) {
1592                                 ei->i_disksize = end;
1593                                 i_size_write(inode, end);
1594                                 err = ext3_mark_inode_dirty(handle, inode);
1595                                 if (!ret) 
1596                                         ret = err;
1597                         }
1598                 }
1599                 err = ext3_journal_stop(handle);
1600                 if (ret == 0)
1601                         ret = err;
1602         }
1603 out:
1604         return ret;
1605 }
1606
/*
 * ->set_page_dirty for data-journalled files.
 *
 * A page may be dirtied at any moment, independently of what the journal
 * is doing (filemap_sync_pte(), try_to_unmap_one(), ...).  This hook runs
 * under VFS locks and the page is not necessarily locked, so very little
 * can be done here.
 *
 * Dirtying the page while leaving its buffers clean is not an option: the
 * buffers' dirty state is "definitive".  Marking the buffers dirty or
 * jbddirty is not an option either, because the journalling code would
 * explode.
 *
 * Instead the page is flagged "pending dirty" (PageChecked); the next
 * writepage call propagates that into the buffers appropriately.
 */
static int ext3_journalled_set_page_dirty(struct page *page)
{
        int ret;

        SetPageChecked(page);
        ret = __set_page_dirty_nobuffers(page);

        return ret;
}
1625
1626 static struct address_space_operations ext3_ordered_aops = {
1627         .readpage       = ext3_readpage,
1628         .readpages      = ext3_readpages,
1629         .writepage      = ext3_ordered_writepage,
1630         .sync_page      = block_sync_page,
1631         .prepare_write  = ext3_prepare_write,
1632         .commit_write   = ext3_ordered_commit_write,
1633         .bmap           = ext3_bmap,
1634         .invalidatepage = ext3_invalidatepage,
1635         .releasepage    = ext3_releasepage,
1636         .direct_IO      = ext3_direct_IO,
1637 };
1638
1639 static struct address_space_operations ext3_writeback_aops = {
1640         .readpage       = ext3_readpage,
1641         .readpages      = ext3_readpages,
1642         .writepage      = ext3_writeback_writepage,
1643         .sync_page      = block_sync_page,
1644         .prepare_write  = ext3_prepare_write,
1645         .commit_write   = ext3_writeback_commit_write,
1646         .bmap           = ext3_bmap,
1647         .invalidatepage = ext3_invalidatepage,
1648         .releasepage    = ext3_releasepage,
1649         .direct_IO      = ext3_direct_IO,
1650 };
1651
1652 static struct address_space_operations ext3_journalled_aops = {
1653         .readpage       = ext3_readpage,
1654         .readpages      = ext3_readpages,
1655         .writepage      = ext3_journalled_writepage,
1656         .sync_page      = block_sync_page,
1657         .prepare_write  = ext3_prepare_write,
1658         .commit_write   = ext3_journalled_commit_write,
1659         .set_page_dirty = ext3_journalled_set_page_dirty,
1660         .bmap           = ext3_bmap,
1661         .invalidatepage = ext3_invalidatepage,
1662         .releasepage    = ext3_releasepage,
1663 };
1664
1665 void ext3_set_aops(struct inode *inode)
1666 {
1667         if (ext3_should_order_data(inode))
1668                 inode->i_mapping->a_ops = &ext3_ordered_aops;
1669         else if (ext3_should_writeback_data(inode))
1670                 inode->i_mapping->a_ops = &ext3_writeback_aops;
1671         else
1672                 inode->i_mapping->a_ops = &ext3_journalled_aops;
1673 }
1674
1675 /*
1676  * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1677  * up to the end of the block which corresponds to `from'.
1678  * This required during truncate. We need to physically zero the tail end
1679  * of that block so it doesn't yield old data if the file is later grown.
1680  */
1681 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1682                 struct address_space *mapping, loff_t from)
1683 {
1684         unsigned long index = from >> PAGE_CACHE_SHIFT;
1685         unsigned offset = from & (PAGE_CACHE_SIZE-1);
1686         unsigned blocksize, iblock, length, pos;
1687         struct inode *inode = mapping->host;
1688         struct buffer_head *bh;
1689         int err;
1690         void *kaddr;
1691
1692         blocksize = inode->i_sb->s_blocksize;
1693         length = blocksize - (offset & (blocksize - 1));
1694         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1695
1696         if (!page_has_buffers(page))
1697                 create_empty_buffers(page, blocksize, 0);
1698
1699         /* Find the buffer that contains "offset" */
1700         bh = page_buffers(page);
1701         pos = blocksize;
1702         while (offset >= pos) {
1703                 bh = bh->b_this_page;
1704                 iblock++;
1705                 pos += blocksize;
1706         }
1707
1708         err = 0;
1709         if (buffer_freed(bh)) {
1710                 BUFFER_TRACE(bh, "freed: skip");
1711                 goto unlock;
1712         }
1713
1714         if (!buffer_mapped(bh)) {
1715                 BUFFER_TRACE(bh, "unmapped");
1716                 ext3_get_block(inode, iblock, bh, 0);
1717                 /* unmapped? It's a hole - nothing to do */
1718                 if (!buffer_mapped(bh)) {
1719                         BUFFER_TRACE(bh, "still unmapped");
1720                         goto unlock;
1721                 }
1722         }
1723
1724         /* Ok, it's mapped. Make sure it's up-to-date */
1725         if (PageUptodate(page))
1726                 set_buffer_uptodate(bh);
1727
1728         if (!buffer_uptodate(bh)) {
1729                 err = -EIO;
1730                 ll_rw_block(READ, 1, &bh);
1731                 wait_on_buffer(bh);
1732                 /* Uhhuh. Read error. Complain and punt. */
1733                 if (!buffer_uptodate(bh))
1734                         goto unlock;
1735         }
1736
1737         if (ext3_should_journal_data(inode)) {
1738                 BUFFER_TRACE(bh, "get write access");
1739                 err = ext3_journal_get_write_access(handle, bh);
1740                 if (err)
1741                         goto unlock;
1742         }
1743
1744         kaddr = kmap_atomic(page, KM_USER0);
1745         memset(kaddr + offset, 0, length);
1746         flush_dcache_page(page);
1747         kunmap_atomic(kaddr, KM_USER0);
1748
1749         BUFFER_TRACE(bh, "zeroed end of block");
1750
1751         err = 0;
1752         if (ext3_should_journal_data(inode)) {
1753                 err = ext3_journal_dirty_metadata(handle, bh);
1754         } else {
1755                 if (ext3_should_order_data(inode))
1756                         err = ext3_journal_dirty_data(handle, bh);
1757                 mark_buffer_dirty(bh);
1758         }
1759
1760 unlock:
1761         unlock_page(page);
1762         page_cache_release(page);
1763         return err;
1764 }
1765
/*
 * Return 1 if every 32-bit word in the half-open range [p, q) is zero
 * (including the empty range), 0 as soon as a non-zero word is found.
 *
 * Probably this should be a library function: search for the first
 * non-zero word or memcmp against a zero page, whichever suits the
 * architecture better.
 */
static inline int all_zeroes(u32 *p, u32 *q)
{
        u32 *cur;

        for (cur = p; cur < q; cur++) {
                if (*cur != 0)
                        return 0;
        }
        return 1;
}
1778
1779 /**
1780  *      ext3_find_shared - find the indirect blocks for partial truncation.
1781  *      @inode:   inode in question
1782  *      @depth:   depth of the affected branch
1783  *      @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1784  *      @chain:   place to store the pointers to partial indirect blocks
1785  *      @top:     place to the (detached) top of branch
1786  *
1787  *      This is a helper function used by ext3_truncate().
1788  *
1789  *      When we do truncate() we may have to clean the ends of several
1790  *      indirect blocks but leave the blocks themselves alive. Block is
1791  *      partially truncated if some data below the new i_size is refered
1792  *      from it (and it is on the path to the first completely truncated
1793  *      data block, indeed).  We have to free the top of that path along
1794  *      with everything to the right of the path. Since no allocation
1795  *      past the truncation point is possible until ext3_truncate()
1796  *      finishes, we may safely do the latter, but top of branch may
1797  *      require special attention - pageout below the truncation point
1798  *      might try to populate it.
1799  *
1800  *      We atomically detach the top of branch from the tree, store the
1801  *      block number of its root in *@top, pointers to buffer_heads of
1802  *      partially truncated blocks - in @chain[].bh and pointers to
1803  *      their last elements that should not be removed - in
1804  *      @chain[].p. Return value is the pointer to last filled element
1805  *      of @chain.
1806  *
1807  *      The work left to caller to do the actual freeing of subtrees:
1808  *              a) free the subtree starting from *@top
1809  *              b) free the subtrees whose roots are stored in
1810  *                      (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1811  *              c) free the subtrees growing from the inode past the @chain[0].
1812  *                      (no partially truncated stuff there).  */
1813
1814 static Indirect *ext3_find_shared(struct inode *inode,
1815                                 int depth,
1816                                 int offsets[4],
1817                                 Indirect chain[4],
1818                                 u32 *top)
1819 {
1820         Indirect *partial, *p;
1821         int k, err;
1822
1823         *top = 0;
1824         /* Make k index the deepest non-null offest + 1 */
1825         for (k = depth; k > 1 && !offsets[k-1]; k--)
1826                 ;
1827         partial = ext3_get_branch(inode, k, offsets, chain, &err);
1828         /* Writer: pointers */
1829         if (!partial)
1830                 partial = chain + k-1;
1831         /*
1832          * If the branch acquired continuation since we've looked at it -
1833          * fine, it should all survive and (new) top doesn't belong to us.
1834          */
1835         if (!partial->key && *partial->p)
1836                 /* Writer: end */
1837                 goto no_top;
1838         for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--)
1839                 ;
1840         /*
1841          * OK, we've found the last block that must survive. The rest of our
1842          * branch should be detached before unlocking. However, if that rest
1843          * of branch is all ours and does not grow immediately from the inode
1844          * it's easier to cheat and just decrement partial->p.
1845          */
1846         if (p == chain + k - 1 && p > chain) {
1847                 p->p--;
1848         } else {
1849                 *top = *p->p;
1850                 /* Nope, don't do this in ext3.  Must leave the tree intact */
1851 #if 0
1852                 *p->p = 0;
1853 #endif
1854         }
1855         /* Writer: end */
1856
1857         while(partial > p)
1858         {
1859                 brelse(partial->bh);
1860                 partial--;
1861         }
1862 no_top:
1863         return partial;
1864 }
1865
1866 /*
1867  * Zero a number of block pointers in either an inode or an indirect block.
1868  * If we restart the transaction we must again get write access to the
1869  * indirect block for further modification.
1870  *
1871  * We release `count' blocks on disk, but (last - first) may be greater
1872  * than `count' because there can be holes in there.
1873  */
1874 static void
1875 ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
1876                 unsigned long block_to_free, unsigned long count,
1877                 u32 *first, u32 *last)
1878 {
1879         u32 *p;
1880         if (try_to_extend_transaction(handle, inode)) {
1881                 if (bh) {
1882                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1883                         ext3_journal_dirty_metadata(handle, bh);
1884                 }
1885                 ext3_mark_inode_dirty(handle, inode);
1886                 ext3_journal_test_restart(handle, inode);
1887                 if (bh) {
1888                         BUFFER_TRACE(bh, "retaking write access");
1889                         ext3_journal_get_write_access(handle, bh);
1890                 }
1891         }
1892
1893         /*
1894          * Any buffers which are on the journal will be in memory. We find
1895          * them on the hash table so journal_revoke() will run journal_forget()
1896          * on them.  We've already detached each block from the file, so
1897          * bforget() in journal_forget() should be safe.
1898          *
1899          * AKPM: turn on bforget in journal_forget()!!!
1900          */
1901         for (p = first; p < last; p++) {
1902                 u32 nr = le32_to_cpu(*p);
1903                 if (nr) {
1904                         struct buffer_head *bh;
1905
1906                         *p = 0;
1907                         bh = sb_find_get_block(inode->i_sb, nr);
1908                         ext3_forget(handle, 0, inode, bh, nr);
1909                 }
1910         }
1911
1912         ext3_free_blocks(handle, inode, block_to_free, count);
1913 }
1914
1915 /**
1916  * ext3_free_data - free a list of data blocks
1917  * @handle:     handle for this transaction
1918  * @inode:      inode we are dealing with
1919  * @this_bh:    indirect buffer_head which contains *@first and *@last
1920  * @first:      array of block numbers
1921  * @last:       points immediately past the end of array
1922  *
1923  * We are freeing all blocks refered from that array (numbers are stored as
1924  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1925  *
1926  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
1927  * blocks are contiguous then releasing them at one time will only affect one
1928  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1929  * actually use a lot of journal space.
1930  *
1931  * @this_bh will be %NULL if @first and @last point into the inode's direct
1932  * block pointers.
1933  */
1934 static void ext3_free_data(handle_t *handle, struct inode *inode,
1935                            struct buffer_head *this_bh, u32 *first, u32 *last)
1936 {
1937         unsigned long block_to_free = 0;    /* Starting block # of a run */
1938         unsigned long count = 0;            /* Number of blocks in the run */ 
1939         u32 *block_to_free_p = NULL;        /* Pointer into inode/ind
1940                                                corresponding to
1941                                                block_to_free */
1942         unsigned long nr;                   /* Current block # */
1943         u32 *p;                             /* Pointer into inode/ind
1944                                                for current block */
1945         int err;
1946
1947         if (this_bh) {                          /* For indirect block */
1948                 BUFFER_TRACE(this_bh, "get_write_access");
1949                 err = ext3_journal_get_write_access(handle, this_bh);
1950                 /* Important: if we can't update the indirect pointers
1951                  * to the blocks, we can't free them. */
1952                 if (err)
1953                         return;
1954         }
1955
1956         for (p = first; p < last; p++) {
1957                 nr = le32_to_cpu(*p);
1958                 if (nr) {
1959                         /* accumulate blocks to free if they're contiguous */
1960                         if (count == 0) {
1961                                 block_to_free = nr;
1962                                 block_to_free_p = p;
1963                                 count = 1;
1964                         } else if (nr == block_to_free + count) {
1965                                 count++;
1966                         } else {
1967                                 ext3_clear_blocks(handle, inode, this_bh, 
1968                                                   block_to_free,
1969                                                   count, block_to_free_p, p);
1970                                 block_to_free = nr;
1971                                 block_to_free_p = p;
1972                                 count = 1;
1973                         }
1974                 }
1975         }
1976
1977         if (count > 0)
1978                 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
1979                                   count, block_to_free_p, p);
1980
1981         if (this_bh) {
1982                 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
1983                 ext3_journal_dirty_metadata(handle, this_bh);
1984         }
1985 }
1986
1987 /**
1988  *      ext3_free_branches - free an array of branches
1989  *      @handle: JBD handle for this transaction
1990  *      @inode: inode we are dealing with
1991  *      @parent_bh: the buffer_head which contains *@first and *@last
1992  *      @first: array of block numbers
1993  *      @last:  pointer immediately past the end of array
1994  *      @depth: depth of the branches to free
1995  *
1996  *      We are freeing all blocks refered from these branches (numbers are
1997  *      stored as little-endian 32-bit) and updating @inode->i_blocks
1998  *      appropriately.
1999  */
2000 static void ext3_free_branches(handle_t *handle, struct inode *inode,
2001                                struct buffer_head *parent_bh,
2002                                u32 *first, u32 *last, int depth)
2003 {
2004         unsigned long nr;
2005         u32 *p;
2006
2007         if (is_handle_aborted(handle))
2008                 return;
2009
2010         if (depth--) {
2011                 struct buffer_head *bh;
2012                 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2013                 p = last;
2014                 while (--p >= first) {
2015                         nr = le32_to_cpu(*p);
2016                         if (!nr)
2017                                 continue;               /* A hole */
2018
2019                         /* Go read the buffer for the next level down */
2020                         bh = sb_bread(inode->i_sb, nr);
2021
2022                         /*
2023                          * A read failure? Report error and clear slot
2024                          * (should be rare).
2025                          */
2026                         if (!bh) {
2027                                 ext3_error(inode->i_sb, "ext3_free_branches",
2028                                            "Read failure, inode=%ld, block=%ld",
2029                                            inode->i_ino, nr);
2030                                 continue;
2031                         }
2032
2033                         /* This zaps the entire block.  Bottom up. */
2034                         BUFFER_TRACE(bh, "free child branches");
2035                         ext3_free_branches(handle, inode, bh, (u32*)bh->b_data,
2036                                            (u32*)bh->b_data + addr_per_block,
2037                                            depth);
2038
2039                         /*
2040                          * We've probably journalled the indirect block several
2041                          * times during the truncate.  But it's no longer
2042                          * needed and we now drop it from the transaction via
2043                          * journal_revoke().
2044                          *
2045                          * That's easy if it's exclusively part of this
2046                          * transaction.  But if it's part of the committing
2047                          * transaction then journal_forget() will simply
2048                          * brelse() it.  That means that if the underlying
2049                          * block is reallocated in ext3_get_block(),
2050                          * unmap_underlying_metadata() will find this block
2051                          * and will try to get rid of it.  damn, damn.
2052                          *
2053                          * If this block has already been committed to the
2054                          * journal, a revoke record will be written.  And
2055                          * revoke records must be emitted *before* clearing
2056                          * this block's bit in the bitmaps.
2057                          */
2058                         ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2059
2060                         /*
2061                          * Everything below this this pointer has been
2062                          * released.  Now let this top-of-subtree go.
2063                          *
2064                          * We want the freeing of this indirect block to be
2065                          * atomic in the journal with the updating of the
2066                          * bitmap block which owns it.  So make some room in
2067                          * the journal.
2068                          *
2069                          * We zero the parent pointer *after* freeing its
2070                          * pointee in the bitmaps, so if extend_transaction()
2071                          * for some reason fails to put the bitmap changes and
2072                          * the release into the same transaction, recovery
2073                          * will merely complain about releasing a free block,
2074                          * rather than leaking blocks.
2075                          */
2076                         if (is_handle_aborted(handle))
2077                                 return;
2078                         if (try_to_extend_transaction(handle, inode)) {
2079                                 ext3_mark_inode_dirty(handle, inode);
2080                                 ext3_journal_test_restart(handle, inode);
2081                         }
2082
2083                         ext3_free_blocks(handle, inode, nr, 1);
2084
2085                         if (parent_bh) {
2086                                 /*
2087                                  * The block which we have just freed is
2088                                  * pointed to by an indirect block: journal it
2089                                  */
2090                                 BUFFER_TRACE(parent_bh, "get_write_access");
2091                                 if (!ext3_journal_get_write_access(handle,
2092                                                                    parent_bh)){
2093                                         *p = 0;
2094                                         BUFFER_TRACE(parent_bh,
2095                                         "call ext3_journal_dirty_metadata");
2096                                         ext3_journal_dirty_metadata(handle, 
2097                                                                     parent_bh);
2098                                 }
2099                         }
2100                 }
2101         } else {
2102                 /* We have reached the bottom of the tree. */
2103                 BUFFER_TRACE(parent_bh, "free data blocks");
2104                 ext3_free_data(handle, inode, parent_bh, first, last);
2105         }
2106 }
2107
2108 /*
2109  * ext3_truncate()
2110  *
2111  * We block out ext3_get_block() block instantiations across the entire
2112  * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2113  * simultaneously on behalf of the same inode.
2114  *
2115  * As we work through the truncate and commmit bits of it to the journal there
2116  * is one core, guiding principle: the file's tree must always be consistent on
2117  * disk.  We must be able to restart the truncate after a crash.
2118  *
2119  * The file's tree may be transiently inconsistent in memory (although it
2120  * probably isn't), but whenever we close off and commit a journal transaction,
2121  * the contents of (the filesystem + the journal) must be consistent and
2122  * restartable.  It's pretty simple, really: bottom up, right to left (although
2123  * left-to-right works OK too).
2124  *
2125  * Note that at recovery time, journal replay occurs *before* the restart of
2126  * truncate against the orphan inode list.
2127  *
2128  * The committed inode has the new, desired i_size (which is the same as
2129  * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
2130  * that this inode's truncate did not complete and it will again call
2131  * ext3_truncate() to have another go.  So there will be instantiated blocks
2132  * to the right of the truncation point in a crashed ext3 filesystem.  But
2133  * that's fine - as long as they are linked from the inode, the post-crash
2134  * ext3_truncate() run will find them and release them.
2135  */
2136
2137 void ext3_truncate_nocheck(struct inode * inode)
2138 {
2139         handle_t *handle;
2140         struct ext3_inode_info *ei = EXT3_I(inode);
2141         u32 *i_data = ei->i_data;
2142         int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2143         struct address_space *mapping = inode->i_mapping;
2144         int offsets[4];
2145         Indirect chain[4];
2146         Indirect *partial;
2147         int nr = 0;
2148         int n;
2149         long last_block;
2150         unsigned blocksize = inode->i_sb->s_blocksize;
2151         struct page *page;
2152
2153         if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2154             S_ISLNK(inode->i_mode)))
2155                 return;
2156         if (ext3_inode_is_fast_symlink(inode))
2157                 return;
2158
2159         ext3_discard_prealloc(inode);
2160
2161         /*
2162          * We have to lock the EOF page here, because lock_page() nests
2163          * outside journal_start().
2164          */
2165         if ((inode->i_size & (blocksize - 1)) == 0) {
2166                 /* Block boundary? Nothing to do */
2167                 page = NULL;
2168         } else {
2169                 page = grab_cache_page(mapping,
2170                                 inode->i_size >> PAGE_CACHE_SHIFT);
2171                 if (!page)
2172                         return;
2173         }
2174
2175         handle = start_transaction(inode);
2176         if (IS_ERR(handle)) {
2177                 if (page) {
2178                         clear_highpage(page);
2179                         flush_dcache_page(page);
2180                         unlock_page(page);
2181                         page_cache_release(page);
2182                 }
2183                 return;         /* AKPM: return what? */
2184         }
2185
2186         last_block = (inode->i_size + blocksize-1)
2187                                         >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2188
2189         if (page)
2190                 ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2191
2192         n = ext3_block_to_path(inode, last_block, offsets, NULL);
2193         if (n == 0)
2194                 goto out_stop;  /* error */
2195
2196         /*
2197          * OK.  This truncate is going to happen.  We add the inode to the
2198          * orphan list, so that if this truncate spans multiple transactions,
2199          * and we crash, we will resume the truncate when the filesystem
2200          * recovers.  It also marks the inode dirty, to catch the new size.
2201          *
2202          * Implication: the file must always be in a sane, consistent
2203          * truncatable state while each transaction commits.
2204          */
2205         if (ext3_orphan_add(handle, inode))
2206                 goto out_stop;
2207
2208         /*
2209          * The orphan list entry will now protect us from any crash which
2210          * occurs before the truncate completes, so it is now safe to propagate
2211          * the new, shorter inode size (held for now in i_size) into the
2212          * on-disk inode. We do this via i_disksize, which is the value which
2213          * ext3 *really* writes onto the disk inode.
2214          */
2215         ei->i_disksize = inode->i_size;
2216
2217         /*
2218          * From here we block out all ext3_get_block() callers who want to
2219          * modify the block allocation tree.
2220          */
2221         down(&ei->truncate_sem);
2222
2223         if (n == 1) {           /* direct blocks */
2224                 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2225                                i_data + EXT3_NDIR_BLOCKS);
2226                 goto do_indirects;
2227         }
2228
2229         partial = ext3_find_shared(inode, n, offsets, chain, &nr);
2230         /* Kill the top of shared branch (not detached) */
2231         if (nr) {
2232                 if (partial == chain) {
2233                         /* Shared branch grows from the inode */
2234                         ext3_free_branches(handle, inode, NULL,
2235                                            &nr, &nr+1, (chain+n-1) - partial);
2236                         *partial->p = 0;
2237                         /*
2238                          * We mark the inode dirty prior to restart,
2239                          * and prior to stop.  No need for it here.
2240                          */
2241                 } else {
2242                         /* Shared branch grows from an indirect block */
2243                         BUFFER_TRACE(partial->bh, "get_write_access");
2244                         ext3_free_branches(handle, inode, partial->bh,
2245                                         partial->p,
2246                                         partial->p+1, (chain+n-1) - partial);
2247                 }
2248         }
2249         /* Clear the ends of indirect blocks on the shared branch */
2250         while (partial > chain) {
2251                 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2252                                    (u32*)partial->bh->b_data + addr_per_block,
2253                                    (chain+n-1) - partial);
2254                 BUFFER_TRACE(partial->bh, "call brelse");
2255                 brelse (partial->bh);
2256                 partial--;
2257         }
2258 do_indirects:
2259         /* Kill the remaining (whole) subtrees */
2260         switch (offsets[0]) {
2261                 default:
2262                         nr = i_data[EXT3_IND_BLOCK];
2263                         if (nr) {
2264                                 ext3_free_branches(handle, inode, NULL,
2265                                                    &nr, &nr+1, 1);
2266                                 i_data[EXT3_IND_BLOCK] = 0;
2267                         }
2268                 case EXT3_IND_BLOCK:
2269                         nr = i_data[EXT3_DIND_BLOCK];
2270                         if (nr) {
2271                                 ext3_free_branches(handle, inode, NULL,
2272                                                    &nr, &nr+1, 2);
2273                                 i_data[EXT3_DIND_BLOCK] = 0;
2274                         }
2275                 case EXT3_DIND_BLOCK:
2276                         nr = i_data[EXT3_TIND_BLOCK];
2277                         if (nr) {
2278                                 ext3_free_branches(handle, inode, NULL,
2279                                                    &nr, &nr+1, 3);
2280                                 i_data[EXT3_TIND_BLOCK] = 0;
2281                         }
2282                 case EXT3_TIND_BLOCK:
2283                         ;
2284         }
2285         up(&ei->truncate_sem);
2286         inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2287         ext3_mark_inode_dirty(handle, inode);
2288
2289         /* In a multi-transaction truncate, we only make the final
2290          * transaction synchronous */
2291         if (IS_SYNC(inode))
2292                 handle->h_sync = 1;
2293 out_stop:
2294         /*
2295          * If this was a simple ftruncate(), and the file will remain alive
2296          * then we need to clear up the orphan record which we created above.
2297          * However, if this was a real unlink then we were called by
2298          * ext3_delete_inode(), and we allow that function to clean up the
2299          * orphan info for us.
2300          */
2301         if (inode->i_nlink)
2302                 ext3_orphan_del(handle, inode);
2303
2304         ext3_journal_stop(handle);
2305 }
2306
2307 static unsigned long ext3_get_inode_block(struct super_block *sb,
2308                 unsigned long ino, struct ext3_iloc *iloc)
2309 {
2310         unsigned long desc, group_desc, block_group;
2311         unsigned long offset, block;
2312         struct buffer_head *bh;
2313         struct ext3_group_desc * gdp;
2314
2315         if ((ino != EXT3_ROOT_INO &&
2316                 ino != EXT3_JOURNAL_INO &&
2317                 ino < EXT3_FIRST_INO(sb)) ||
2318                 ino > le32_to_cpu(
2319                         EXT3_SB(sb)->s_es->s_inodes_count)) {
2320                 ext3_error (sb, "ext3_get_inode_block",
2321                             "bad inode number: %lu", ino);
2322                 return 0;
2323         }
2324         block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2325         if (block_group >= EXT3_SB(sb)->s_groups_count) {
2326                 ext3_error (sb, "ext3_get_inode_block",
2327                             "group >= groups count");
2328                 return 0;
2329         }
2330         group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
2331         desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
2332         bh = EXT3_SB(sb)->s_group_desc[group_desc];
2333         if (!bh) {
2334                 ext3_error (sb, "ext3_get_inode_block",
2335                             "Descriptor not loaded");
2336                 return 0;
2337         }
2338
2339         gdp = (struct ext3_group_desc *) bh->b_data;
2340         /*
2341          * Figure out the offset within the block group inode table
2342          */
2343         offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2344                 EXT3_INODE_SIZE(sb);
2345         block = le32_to_cpu(gdp[desc].bg_inode_table) +
2346                 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2347
2348         iloc->block_group = block_group;
2349         iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2350         return block;
2351 }
2352
/*
 * ext3_get_inode_loc returns with an extra refcount against the inode's
 * underlying buffer_head on success (stored in iloc->bh), and returns 0.
 * On failure it returns -EIO and holds no reference.
 *
 * If the buffer is not already uptodate it normally has to be read from
 * disk.  When `in_mem' is non-zero that read may be skipped: if the inode
 * bitmap is already cached and shows that every other inode sharing this
 * block is free, the buffer is simply zero-filled and marked uptodate.
 * When `in_mem' is zero the block is always read.
 */
static int ext3_get_inode_loc(struct inode *inode,
                                struct ext3_iloc *iloc, int in_mem)
{
        unsigned long block;
        struct buffer_head *bh;

        /* Also fills in iloc->block_group and iloc->offset. */
        block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
        if (!block)
                return -EIO;

        bh = sb_getblk(inode->i_sb, block);
        if (!bh) {
                ext3_error (inode->i_sb, "ext3_get_inode_loc",
                                "unable to read inode block - "
                                "inode=%lu, block=%lu", inode->i_ino, block);
                return -EIO;
        }
        if (!buffer_uptodate(bh)) {
                lock_buffer(bh);
                if (buffer_uptodate(bh)) {
                        /* someone brought it uptodate while we waited */
                        unlock_buffer(bh);
                        goto has_buffer;
                }

                /* we can't skip I/O if inode is on a disk only */
                if (in_mem) {
                        struct buffer_head *bitmap_bh;
                        struct ext3_group_desc *desc;
                        int inodes_per_buffer;
                        int inode_offset, i;
                        int block_group;
                        int start;

                        /*
                         * If this is the only valid inode in the block we
                         * need not read the block.
                         */
                        block_group = (inode->i_ino - 1) /
                                        EXT3_INODES_PER_GROUP(inode->i_sb);
                        inodes_per_buffer = bh->b_size /
                                EXT3_INODE_SIZE(inode->i_sb);
                        inode_offset = ((inode->i_ino - 1) %
                                        EXT3_INODES_PER_GROUP(inode->i_sb));
                        /*
                         * Group-relative index of the first inode sharing
                         * this buffer.  The mask only rounds down correctly
                         * when inodes_per_buffer is a power of two.
                         */
                        start = inode_offset & ~(inodes_per_buffer - 1);

                        /* Is the inode bitmap in cache? */
                        desc = ext3_get_group_desc(inode->i_sb,
                                                block_group, NULL);
                        if (!desc)
                                goto make_io;

                        bitmap_bh = sb_getblk(inode->i_sb,
                                        le32_to_cpu(desc->bg_inode_bitmap));
                        if (!bitmap_bh)
                                goto make_io;

                        /*
                         * If the inode bitmap isn't in cache then the
                         * optimisation may end up performing two reads instead
                         * of one, so skip it.
                         */
                        if (!buffer_uptodate(bitmap_bh)) {
                                brelse(bitmap_bh);
                                goto make_io;
                        }
                        /*
                         * Scan the bitmap bits of every other inode in this
                         * buffer; stop at the first one that is in use.
                         */
                        for (i = start; i < start + inodes_per_buffer; i++) {
                                if (i == inode_offset)
                                        continue;
                                if (ext3_test_bit(i, bitmap_bh->b_data))
                                        break;
                        }
                        brelse(bitmap_bh);
                        if (i == start + inodes_per_buffer) {
                                /* all other inodes are free, so skip I/O */
                                memset(bh->b_data, 0, bh->b_size);
                                set_buffer_uptodate(bh);
                                unlock_buffer(bh);
                                goto has_buffer;
                        }
                }

make_io:
                /*
                 * There are other valid inodes in the buffer (or we could
                 * not prove otherwise), so we must read the block from disk.
                 * The buffer is still locked from lock_buffer() above.
                 */
                /*
                 * Extra reference for the read in flight.
                 * NOTE(review): presumably dropped by end_buffer_read_sync
                 * on completion - confirm.
                 */
                get_bh(bh);
                bh->b_end_io = end_buffer_read_sync;
                submit_bh(READ, bh);
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh)) {
                        ext3_error(inode->i_sb, "ext3_get_inode_loc",
                                        "unable to read inode block - "
                                        "inode=%lu, block=%lu",
                                        inode->i_ino, block);
                        brelse(bh);
                        return -EIO;
                }
        }
has_buffer:
        /* Hand the sb_getblk() reference over to the caller. */
        iloc->bh = bh;
        return 0;
}
2463
/*
 * Truncate entry point that refuses to touch append-only or immutable
 * inodes; everything else is handed to ext3_truncate_nocheck().
 */
void ext3_truncate(struct inode * inode)
{
        if (!IS_APPEND(inode) && !IS_IMMUTABLE(inode))
                ext3_truncate_nocheck(inode);
}
2470
2471 void ext3_set_inode_flags(struct inode *inode)
2472 {
2473         unsigned int flags = EXT3_I(inode)->i_flags;
2474
2475         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2476         if (flags & EXT3_SYNC_FL)
2477                 inode->i_flags |= S_SYNC;
2478         if (flags & EXT3_APPEND_FL)
2479                 inode->i_flags |= S_APPEND;
2480         if (flags & EXT3_IMMUTABLE_FL)
2481                 inode->i_flags |= S_IMMUTABLE;
2482         if (flags & EXT3_IUNLINK_FL)
2483                 inode->i_flags |= S_IUNLINK;
2484         if (flags & EXT3_BARRIER_FL)
2485                 inode->i_flags |= S_BARRIER;
2486         if (flags & EXT3_NOATIME_FL)
2487                 inode->i_flags |= S_NOATIME;
2488         if (flags & EXT3_DIRSYNC_FL)
2489                 inode->i_flags |= S_DIRSYNC;
2490 }
2491
/*
 * Populate the in-core inode (and its ext3_inode_info) from the on-disk
 * inode identified by inode->i_ino.
 *
 * If the on-disk inode cannot be located or read, or if it turns out to
 * be a deleted inode, the inode is marked bad with make_bad_inode().
 * The buffer reference obtained from ext3_get_inode_loc() is released
 * before returning on every path that obtained it.
 */
void ext3_read_inode(struct inode * inode)
{
        struct ext3_iloc iloc;
        struct ext3_inode *raw_inode;
        struct ext3_inode_info *ei = EXT3_I(inode);
        struct buffer_head *bh;
        int block;
        uid_t uid;
        gid_t gid;

#ifdef CONFIG_EXT3_FS_POSIX_ACL
        ei->i_acl = EXT3_ACL_NOT_CACHED;
        ei->i_default_acl = EXT3_ACL_NOT_CACHED;
#endif
        /* in_mem == 0: the block really has to come from disk. */
        if (ext3_get_inode_loc(inode, &iloc, 0))
                goto bad_inode;
        bh = iloc.bh;
        raw_inode = ext3_raw_inode(&iloc);
        inode->i_mode = le16_to_cpu(raw_inode->i_mode);
        /*
         * Assemble uid/gid from the low 16 bits, plus the high 16 bits
         * unless the NO_UID32 mount option is set.
         */
        uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
        gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
        if(!(test_opt (inode->i_sb, NO_UID32))) {
                uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
                gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
        }
        /*
         * Derive the in-core uid, gid and xid from the on-disk values.
         * NOTE(review): the INOXID_* / XID_TAG helpers are defined outside
         * this file (presumably linux/vserver/xid.h) - see there for the
         * exact mapping.
         */
        inode->i_uid = INOXID_UID(XID_TAG(inode), uid, gid);
        inode->i_gid = INOXID_GID(XID_TAG(inode), uid, gid);
        inode->i_xid = INOXID_XID(XID_TAG(inode), uid, gid,
                le16_to_cpu(raw_inode->i_raw_xid));

        inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
        /* Low 32 bits only; regular files get the high half added below. */
        inode->i_size = le32_to_cpu(raw_inode->i_size);
        /* Only whole seconds are read here; nanoseconds are set to 0. */
        inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
        inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
        inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
        inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;

        ei->i_state = 0;
        ei->i_next_alloc_block = 0;
        ei->i_next_alloc_goal = 0;
        ei->i_dir_start_lookup = 0;
        ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
        /* We now have enough fields to check if the inode was active or not.
         * This is needed because nfsd might try to access dead inodes
         * the test is that same one that e2fsck uses
         * NeilBrown 1999oct15
         */
        if (inode->i_nlink == 0) {
                if (inode->i_mode == 0 ||
                    !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
                        /* this inode is deleted */
                        brelse (bh);
                        goto bad_inode;
                }
                /* The only unlinked inodes we let through here have
                 * valid i_mode and are being read by the orphan
                 * recovery code: that's fine, we're about to complete
                 * the process of deleting those. */
        }
        inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
                                         * (for stat), not the fs block
                                         * size */  
        inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
        ei->i_flags = le32_to_cpu(raw_inode->i_flags);
#ifdef EXT3_FRAGMENTS
        ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
        ei->i_frag_no = raw_inode->i_frag;
        ei->i_frag_size = raw_inode->i_fsize;
#endif
        ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
        /*
         * The same on-disk slot is read as i_dir_acl for non-regular
         * files and as the high 32 bits of the size for regular files.
         */
        if (!S_ISREG(inode->i_mode)) {
                ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
        } else {
                inode->i_size |=
                        ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
        }
        ei->i_disksize = inode->i_size;
        inode->i_generation = le32_to_cpu(raw_inode->i_generation);
#ifdef EXT3_PREALLOCATE
        ei->i_prealloc_count = 0;
#endif
        ei->i_block_group = iloc.block_group;

        /*
         * NOTE! The in-memory inode i_data array is in little-endian order
         * even on big-endian machines: we do NOT byteswap the block numbers!
         */
        for (block = 0; block < EXT3_N_BLOCKS; block++)
                ei->i_data[block] = raw_inode->i_block[block];
        INIT_LIST_HEAD(&ei->i_orphan);

        /* Select the operation tables according to the file type. */
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &ext3_file_inode_operations;
                inode->i_fop = &ext3_file_operations;
                ext3_set_aops(inode);
        } else if (S_ISDIR(inode->i_mode)) {
                inode->i_op = &ext3_dir_inode_operations;
                inode->i_fop = &ext3_dir_operations;
        } else if (S_ISLNK(inode->i_mode)) {
                /*
                 * The fast-symlink test relies on i_mode, i_blocks and
                 * i_file_acl, all of which were filled in above.
                 */
                if (ext3_inode_is_fast_symlink(inode))
                        inode->i_op = &ext3_fast_symlink_inode_operations;
                else {
                        inode->i_op = &ext3_symlink_inode_operations;
                        ext3_set_aops(inode);
                }
        } else {
                inode->i_op = &ext3_special_inode_operations;
                /*
                 * Device number: the old encoding lives in i_block[0]; if
                 * that slot is zero the new encoding in i_block[1] is used.
                 */
                if (raw_inode->i_block[0])
                        init_special_inode(inode, inode->i_mode,
                           old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
                else 
                        init_special_inode(inode, inode->i_mode,
                           new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
        }
        /* raw_inode points into this buffer and must not be used after this. */
        brelse (iloc.bh);
        ext3_set_inode_flags(inode);
        return;

bad_inode:
        make_bad_inode(inode);
        return;
}
2614
/*
 * Post the struct inode info into an on-disk inode location in the
 * buffer-cache.  This gobbles the caller's reference to the
 * buffer_head in the inode location struct.
 *
 * The caller must have write access to iloc->bh.
 *
 * Returns 0 on success or the first error reported by the journal
 * calls; a non-zero result is also passed to ext3_std_error().
 */
static int ext3_do_update_inode(handle_t *handle, 
                                struct inode *inode, 
                                struct ext3_iloc *iloc)
{
        struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
        struct ext3_inode_info *ei = EXT3_I(inode);
        struct buffer_head *bh = iloc->bh;
        /*
         * On-disk uid/gid values derived from the in-core uid/gid and xid.
         * NOTE(review): the XIDINO_* / XID_TAG helpers are defined outside
         * this file - see there for the exact mapping.
         */
        uid_t uid = XIDINO_UID(XID_TAG(inode), inode->i_uid, inode->i_xid);
        gid_t gid = XIDINO_GID(XID_TAG(inode), inode->i_gid, inode->i_xid);
        int err = 0, rc, block;

        /* For fields not tracked in the in-memory inode,
         * initialise them to zero for new inodes. */
        if (ei->i_state & EXT3_STATE_NEW)
                memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);

        raw_inode->i_mode = cpu_to_le16(inode->i_mode);
        if(!(test_opt(inode->i_sb, NO_UID32))) {
                raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid));
                raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid));
/*
 * Fix up interoperability with old kernels. Otherwise, old inodes get
 * re-used with the upper 16 bits of the uid/gid intact
 */
                if(!ei->i_dtime) {
                        raw_inode->i_uid_high =
                                cpu_to_le16(high_16_bits(uid));
                        raw_inode->i_gid_high =
                                cpu_to_le16(high_16_bits(gid));
                } else {
                        raw_inode->i_uid_high = 0;
                        raw_inode->i_gid_high = 0;
                }
        } else {
                /* NO_UID32: only 16-bit ids are stored. */
                raw_inode->i_uid_low =
                        cpu_to_le16(fs_high2lowuid(uid));
                raw_inode->i_gid_low =
                        cpu_to_le16(fs_high2lowgid(gid));
                raw_inode->i_uid_high = 0;
                raw_inode->i_gid_high = 0;
        }
#ifdef CONFIG_INOXID_GID32
        raw_inode->i_raw_xid = cpu_to_le16(inode->i_xid);
#endif
        raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
        /* The size written to disk is i_disksize, not inode->i_size. */
        raw_inode->i_size = cpu_to_le32(ei->i_disksize);
        raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
        raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
        raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
        raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
        raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
        raw_inode->i_flags = cpu_to_le32(ei->i_flags);
#ifdef EXT3_FRAGMENTS
        raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
        raw_inode->i_frag = ei->i_frag_no;
        raw_inode->i_fsize = ei->i_frag_size;
#endif
        raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
        if (!S_ISREG(inode->i_mode)) {
                raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
        } else {
                raw_inode->i_size_high =
                        cpu_to_le32(ei->i_disksize >> 32);
                if (ei->i_disksize > 0x7fffffffULL) {
                        struct super_block *sb = inode->i_sb;
                        if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
                                        EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
                            EXT3_SB(sb)->s_es->s_rev_level ==
                                        cpu_to_le32(EXT3_GOOD_OLD_REV)) {
                               /* If this is the first large file
                                * created, add a flag to the superblock.
                                */
                                err = ext3_journal_get_write_access(handle,
                                                EXT3_SB(sb)->s_sbh);
                                if (err)
                                        goto out_brelse;
                                ext3_update_dynamic_rev(sb);
                                EXT3_SET_RO_COMPAT_FEATURE(sb,
                                        EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
                                sb->s_dirt = 1;
                                handle->h_sync = 1;
                                /*
                                 * A failure here is kept in err and reported
                                 * at the end; the inode update still proceeds.
                                 */
                                err = ext3_journal_dirty_metadata(handle,
                                                EXT3_SB(sb)->s_sbh);
                        }
                }
        }
        raw_inode->i_generation = cpu_to_le32(inode->i_generation);
        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
                /*
                 * Device nodes store the device number in i_block[]: the
                 * old encoding in slot 0 when it fits, otherwise the new
                 * encoding in slot 1 with slot 0 left zero.
                 */
                if (old_valid_dev(inode->i_rdev)) {
                        raw_inode->i_block[0] =
                                cpu_to_le32(old_encode_dev(inode->i_rdev));
                        raw_inode->i_block[1] = 0;
                } else {
                        raw_inode->i_block[0] = 0;
                        raw_inode->i_block[1] =
                                cpu_to_le32(new_encode_dev(inode->i_rdev));
                        raw_inode->i_block[2] = 0;
                }
        } else for (block = 0; block < EXT3_N_BLOCKS; block++)
                raw_inode->i_block[block] = ei->i_data[block];

        BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
        rc = ext3_journal_dirty_metadata(handle, bh);
        /* Keep the earliest error. */
        if (!err)
                err = rc;
        ei->i_state &= ~EXT3_STATE_NEW;

out_brelse:
        /* This is where the caller's reference to the buffer is consumed. */
        brelse (bh);
        ext3_std_error(inode->i_sb, err);
        return err;
}
2734
2735 /*
2736  * ext3_write_inode()
2737  *
2738  * We are called from a few places:
2739  *
2740  * - Within generic_file_write() for O_SYNC files.
2741  *   Here, there will be no transaction running. We wait for any running
2742  *   trasnaction to commit.
2743  *
2744  * - Within sys_sync(), kupdate and such.
2745  *   We wait on commit, if tol to.
2746  *
2747  * - Within prune_icache() (PF_MEMALLOC == true)
2748  *   Here we simply return.  We can't afford to block kswapd on the
2749  *   journal commit.
2750  *
2751  * In all cases it is actually safe for us to return without doing anything,
2752  * because the inode has been copied into a raw inode buffer in
2753  * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
2754  * knfsd.
2755  *
2756  * Note that we are absolutely dependent upon all inode dirtiers doing the
2757  * right thing: they *must* call mark_inode_dirty() after dirtying info in
2758  * which we are interested.
2759  *
2760  * It would be a bug for them to not do this.  The code:
2761  *
2762  *      mark_inode_dirty(inode)
2763  *      stuff();
2764  *      inode->i_size = expr;
2765  *
2766  * is in error because a kswapd-driven write_inode() could occur while
2767  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
2768  * will no longer be on the superblock's dirty inode list.
2769  */
2770 void ext3_write_inode(struct inode *inode, int wait)
2771 {
2772         if (current->flags & PF_MEMALLOC)
2773                 return;
2774
2775         if (ext3_journal_current_handle()) {
2776                 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2777                 dump_stack();
2778                 return;
2779         }
2780
2781         if (!wait)
2782                 return;
2783
2784         ext3_force_commit(inode->i_sb);
2785 }
2786
2787 int ext3_setattr_flags(struct inode *inode, unsigned int flags)
2788 {
2789         unsigned int oldflags, newflags;
2790         int err = 0;
2791
2792         oldflags = EXT3_I(inode)->i_flags;
2793         newflags = oldflags &
2794                 ~(EXT3_IMMUTABLE_FL | EXT3_IUNLINK_FL | EXT3_BARRIER_FL);       
2795         if (flags & ATTR_FLAG_IMMUTABLE)
2796                 newflags |= EXT3_IMMUTABLE_FL;
2797         if (flags & ATTR_FLAG_IUNLINK)
2798                 newflags |= EXT3_IUNLINK_FL;
2799         if (flags & ATTR_FLAG_BARRIER)
2800                 newflags |= EXT3_BARRIER_FL;
2801
2802         if (oldflags ^ newflags) {
2803                 handle_t *handle;
2804                 struct ext3_iloc iloc;
2805
2806                 handle = ext3_journal_start(inode, 1);
2807                 if (IS_ERR(handle))
2808                         return PTR_ERR(handle);
2809                 if (IS_SYNC(inode))
2810                         handle->h_sync = 1;
2811                 err = ext3_reserve_inode_write(handle, inode, &iloc);
2812                 if (err)
2813                         goto flags_err;
2814                 
2815                 EXT3_I(inode)->i_flags = newflags;
2816                 inode->i_ctime = CURRENT_TIME;
2817
2818                 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2819         flags_err:
2820                 ext3_journal_stop(handle);
2821         }
2822         return err;
2823 }
2824
/*
 * ext3_setattr()
 *
 * Called from notify_change.
 *
 * We want to trap VFS attempts to truncate the file as soon as
 * possible.  In particular, we want to make sure that when the VFS
 * shrinks i_size, we put the inode on the orphan list and modify
 * i_disksize immediately, so that during the subsequent flushing of
 * dirty pages and freeing of disk blocks, we can guarantee that any
 * commit will leave the blocks being flushed in an unused state on
 * disk.  (On recovery, the inode will get truncated and the blocks will
 * be freed, so we have a strong guarantee that no future commit will
 * leave these blocks visible to the user.)  
 *
 * Called with inode->sem down.
 *
 * Returns 0 on success.  `error' collects failures from the ext3-specific
 * steps and takes precedence; `rc' carries the result of the later
 * generic steps and is returned only when `error' is 0.
 */
int ext3_setattr(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = dentry->d_inode;
        int error, rc = 0;
        const unsigned int ia_valid = attr->ia_valid;

        error = inode_change_ok(inode, attr);
        if (error)
                return error;

        /* Ownership (uid, gid or xid) is actually changing. */
        if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
                (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid) ||
                (ia_valid & ATTR_XID && attr->ia_xid != inode->i_xid)) {
                handle_t *handle;

                /* (user+group)*(old+new) structure, inode write (sb,
                 * inode block, ? - but truncate inode update has it) */
                handle = ext3_journal_start(inode, 4*EXT3_QUOTA_INIT_BLOCKS+3);
                if (IS_ERR(handle)) {
                        error = PTR_ERR(handle);
                        goto err_out;
                }
                error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
                if (error) {
                        /* Returned directly, bypassing ext3_std_error(). */
                        ext3_journal_stop(handle);
                        return error;
                }
                /* Update corresponding info in inode so that everything is in
                 * one transaction */
                if (attr->ia_valid & ATTR_UID)
                        inode->i_uid = attr->ia_uid;
                if (attr->ia_valid & ATTR_GID)
                        inode->i_gid = attr->ia_gid;
                /* The xid is only stored when the superblock has MS_TAGXID. */
                if ((attr->ia_valid & ATTR_XID)
                        && inode->i_sb
                        && (inode->i_sb->s_flags & MS_TAGXID))
                        inode->i_xid = attr->ia_xid;
                error = ext3_mark_inode_dirty(handle, inode);
                ext3_journal_stop(handle);
        }

        /* Shrinking a regular file: see the function header comment. */
        if (S_ISREG(inode->i_mode) &&
            attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
                handle_t *handle;

                handle = ext3_journal_start(inode, 3);
                if (IS_ERR(handle)) {
                        error = PTR_ERR(handle);
                        goto err_out;
                }

                /*
                 * NOTE(review): this assignment replaces any error left in
                 * `error' by the ownership update above.
                 */
                error = ext3_orphan_add(handle, inode);
                EXT3_I(inode)->i_disksize = attr->ia_size;
                rc = ext3_mark_inode_dirty(handle, inode);
                if (!error)
                        error = rc;
                ext3_journal_stop(handle);
        }

        if (ia_valid & ATTR_ATTR_FLAG) {
                rc = ext3_setattr_flags(inode, attr->ia_attr_flags);
                if (!error)
                        error = rc;
        }

        /* Runs even if an earlier step recorded an error. */
        rc = inode_setattr(inode, attr);

        /* If inode_setattr's call to ext3_truncate failed to get a
         * transaction handle at all, we need to clean up the in-core
         * orphan list manually. */
        if (inode->i_nlink)
                ext3_orphan_del(NULL, inode);

        if (!rc && (ia_valid & ATTR_MODE))
                rc = ext3_acl_chmod(inode);

err_out:
        ext3_std_error(inode->i_sb, error);
        if (!error)
                error = rc;
        return error;
}
2924
2925
2926 /*
2927  * akpm: how many blocks doth make a writepage()?
2928  *
2929  * With N blocks per page, it may be:
2930  * N data blocks
2931  * 2 indirect block
2932  * 2 dindirect
2933  * 1 tindirect
2934  * N+5 bitmap blocks (from the above)
2935  * N+5 group descriptor summary blocks
2936  * 1 inode block
2937  * 1 superblock.
2938  * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files
2939  *
2940  * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
2941  *
2942  * With ordered or writeback data it's the same, less the N data blocks.
2943  *
2944  * If the inode's direct blocks can hold an integral number of pages then a
2945  * page cannot straddle two indirect blocks, and we can only touch one indirect
2946  * and dindirect block, and the "5" above becomes "3".
2947  *
2948  * This still overestimates under most circumstances.  If we were to pass the
2949  * start and end offsets in here as well we could do block_to_path() on each
2950  * block and work out the exact number of indirects which are touched.  Pah.
2951  */
2952
2953 int ext3_writepage_trans_blocks(struct inode *inode)
2954 {
2955         int bpp = ext3_journal_blocks_per_page(inode);
2956         int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
2957         int ret;
2958
2959         if (ext3_should_journal_data(inode))
2960                 ret = 3 * (bpp + indirects) + 2;
2961         else
2962                 ret = 2 * (bpp + indirects) + 2;
2963
2964 #ifdef CONFIG_QUOTA
2965         /* We know that structure was already allocated during DQUOT_INIT so
2966          * we will be updating only the data blocks + inodes */
2967         ret += 2*EXT3_QUOTA_TRANS_BLOCKS;
2968 #endif
2969
2970         return ret;
2971 }
2972
2973 /*
2974  * The caller must have previously called ext3_reserve_inode_write().
2975  * Give this, we know that the caller already has write access to iloc->bh.
2976  */
2977 int ext3_mark_iloc_dirty(handle_t *handle,
2978                 struct inode *inode, struct ext3_iloc *iloc)
2979 {
2980         int err = 0;
2981
2982         /* the do_update_inode consumes one bh->b_count */
2983         get_bh(iloc->bh);
2984
2985         /* ext3_do_update_inode() does journal_dirty_metadata */
2986         err = ext3_do_update_inode(handle, inode, iloc);
2987         put_bh(iloc->bh);
2988         return err;
2989 }
2990
2991 /* 
2992  * On success, We end up with an outstanding reference count against
2993  * iloc->bh.  This _must_ be cleaned up later. 
2994  */
2995
2996 int
2997 ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 
2998                          struct ext3_iloc *iloc)
2999 {
3000         int err = 0;
3001         if (handle) {
3002                 err = ext3_get_inode_loc(inode, iloc, 1);
3003                 if (!err) {
3004                         BUFFER_TRACE(iloc->bh, "get_write_access");
3005                         err = ext3_journal_get_write_access(handle, iloc->bh);
3006                         if (err) {
3007                                 brelse(iloc->bh);
3008                                 iloc->bh = NULL;
3009                         }
3010                 }
3011         }
3012         ext3_std_error(inode->i_sb, err);
3013         return err;
3014 }
3015
3016 /*
3017  * akpm: What we do here is to mark the in-core inode as clean
3018  * with respect to inode dirtiness (it may still be data-dirty).
3019  * This means that the in-core inode may be reaped by prune_icache
3020  * without having to perform any I/O.  This is a very good thing,
3021  * because *any* task may call prune_icache - even ones which
3022  * have a transaction open against a different journal.
3023  *
3024  * Is this cheating?  Not really.  Sure, we haven't written the
3025  * inode out, but prune_icache isn't a user-visible syncing function.
3026  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
3027  * we start and wait on commits.
3028  *
3029  * Is this efficient/effective?  Well, we're being nice to the system
3030  * by cleaning up our inodes proactively so they can be reaped
3031  * without I/O.  But we are potentially leaving up to five seconds'
3032  * worth of inodes floating about which prune_icache wants us to
3033  * write out.  One way to fix that would be to get prune_icache()
3034  * to do a write_super() to free up some memory.  It has the desired
3035  * effect.
3036  */
3037 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3038 {
3039         struct ext3_iloc iloc;
3040         int err;
3041
3042         err = ext3_reserve_inode_write(handle, inode, &iloc);
3043         if (!err)
3044                 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
3045         return err;
3046 }
3047
3048 /*
3049  * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
3050  *
3051  * We're really interested in the case where a file is being extended.
3052  * i_size has been changed by generic_commit_write() and we thus need
3053  * to include the updated inode in the current transaction.
3054  *
3055  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
3056  * are allocated to the file.
3057  *
3058  * If the inode is marked synchronous, we don't honour that here - doing
3059  * so would cause a commit on atime updates, which we don't bother doing.
3060  * We handle synchronous inodes at the highest possible level.
3061  */
3062 void ext3_dirty_inode(struct inode *inode)
3063 {
3064         handle_t *current_handle = ext3_journal_current_handle();
3065         handle_t *handle;
3066
3067         handle = ext3_journal_start(inode, 2);
3068         if (IS_ERR(handle))
3069                 goto out;
3070         if (current_handle &&
3071                 current_handle->h_transaction != handle->h_transaction) {
3072                 /* This task has a transaction open against a different fs */
3073                 printk(KERN_EMERG "%s: transactions do not match!\n",
3074                        __FUNCTION__);
3075         } else {
3076                 jbd_debug(5, "marking dirty.  outer handle=%p\n",
3077                                 current_handle);
3078                 ext3_mark_inode_dirty(handle, inode);
3079         }
3080         ext3_journal_stop(handle);
3081 out:
3082         return;
3083 }
3084
3085 #ifdef AKPM
3086 /* 
3087  * Bind an inode's backing buffer_head into this transaction, to prevent
3088  * it from being flushed to disk early.  Unlike
3089  * ext3_reserve_inode_write, this leaves behind no bh reference and
3090  * returns no iloc structure, so the caller needs to repeat the iloc
3091  * lookup to mark the inode dirty later.
3092  */
3093 static inline int
3094 ext3_pin_inode(handle_t *handle, struct inode *inode)
3095 {
3096         struct ext3_iloc iloc;
3097
3098         int err = 0;
3099         if (handle) {
3100                 err = ext3_get_inode_loc(inode, &iloc, 1);
3101                 if (!err) {
3102                         BUFFER_TRACE(iloc.bh, "get_write_access");
3103                         err = journal_get_write_access(handle, iloc.bh);
3104                         if (!err)
3105                                 err = ext3_journal_dirty_metadata(handle, 
3106                                                                   iloc.bh);
3107                         brelse(iloc.bh);
3108                 }
3109         }
3110         ext3_std_error(inode->i_sb, err);
3111         return err;
3112 }
3113 #endif
3114
3115 int ext3_change_inode_journal_flag(struct inode *inode, int val)
3116 {
3117         journal_t *journal;
3118         handle_t *handle;
3119         int err;
3120
3121         /*
3122          * We have to be very careful here: changing a data block's
3123          * journaling status dynamically is dangerous.  If we write a
3124          * data block to the journal, change the status and then delete
3125          * that block, we risk forgetting to revoke the old log record
3126          * from the journal and so a subsequent replay can corrupt data.
3127          * So, first we make sure that the journal is empty and that
3128          * nobody is changing anything.
3129          */
3130
3131         journal = EXT3_JOURNAL(inode);
3132         if (is_journal_aborted(journal) || IS_RDONLY(inode))
3133                 return -EROFS;
3134
3135         journal_lock_updates(journal);
3136         journal_flush(journal);
3137
3138         /*
3139          * OK, there are no updates running now, and all cached data is
3140          * synced to disk.  We are now in a completely consistent state
3141          * which doesn't have anything in the journal, and we know that
3142          * no filesystem updates are running, so it is safe to modify
3143          * the inode's in-core data-journaling state flag now.
3144          */
3145
3146         if (val)
3147                 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3148         else
3149                 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
3150         ext3_set_aops(inode);
3151
3152         journal_unlock_updates(journal);
3153
3154         /* Finally we can mark the inode as dirty. */
3155
3156         handle = ext3_journal_start(inode, 1);
3157         if (IS_ERR(handle))
3158                 return PTR_ERR(handle);
3159
3160         err = ext3_mark_inode_dirty(handle, inode);
3161         handle->h_sync = 1;
3162         ext3_journal_stop(handle);
3163         ext3_std_error(inode->i_sb, err);
3164
3165         return err;
3166 }