Fedora Core 2 - 1.492
[linux-2.6.git] / fs / ext3 / inode.c
1 /*
2  *  linux/fs/ext3/inode.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card (card@masi.ibp.fr)
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  *  from
10  *
11  *  linux/fs/minix/inode.c
12  *
13  *  Copyright (C) 1991, 1992  Linus Torvalds
14  *
15  *  Goal-directed block allocation by Stephen Tweedie
16  *      (sct@redhat.com), 1993, 1998
17  *  Big-endian to little-endian byte-swapping/bitmaps by
18  *        David S. Miller (davem@caip.rutgers.edu), 1995
19  *  64-bit file support on 64-bit platforms by Jakub Jelinek
20  *      (jj@sunsite.ms.mff.cuni.cz)
21  *
22  *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23  */
24
25 #include <linux/module.h>
26 #include <linux/fs.h>
27 #include <linux/time.h>
28 #include <linux/ext3_jbd.h>
29 #include <linux/jbd.h>
30 #include <linux/smp_lock.h>
31 #include <linux/highuid.h>
32 #include <linux/pagemap.h>
33 #include <linux/quotaops.h>
34 #include <linux/string.h>
35 #include <linux/buffer_head.h>
36 #include <linux/writeback.h>
37 #include <linux/mpage.h>
38 #include <linux/uio.h>
39 #include "xattr.h"
40 #include "acl.h"
41
42 /*
43  * Test whether an inode is a fast symlink.
44  */
45 static inline int ext3_inode_is_fast_symlink(struct inode *inode)
46 {
47         int ea_blocks = EXT3_I(inode)->i_file_acl ?
48                 (inode->i_sb->s_blocksize >> 9) : 0;
49
50         return (S_ISLNK(inode->i_mode) &&
51                 inode->i_blocks - ea_blocks == 0);
52 }
53
/* The ext3 forget function must perform a revoke if we are freeing data
 * which has been journaled.  Metadata (eg. indirect blocks) must be
 * revoked in all cases.
 *
 * "bh" may be NULL: a metadata block may have been freed from memory
 * but there may still be a record of it in the journal, and that record
 * still needs to be revoked.
 *
 * @handle:      transaction this forget/revoke is charged to
 * @is_metadata: non-zero if the block holds fs metadata (e.g. an
 *               indirect block) rather than file data
 * @inode:       owning inode; only used to read the data-journaling
 *               mount options
 * @bh:          buffer for the block, or NULL (see above)
 * @blocknr:     on-disk block number, needed for the revoke record
 *
 * Returns 0 when no revoke was needed, otherwise the result of
 * ext3_journal_revoke().
 */

int ext3_forget(handle_t *handle, int is_metadata,
		       struct inode *inode, struct buffer_head *bh,
		       int blocknr)
{
	int err;

	might_sleep();

	BUFFER_TRACE(bh, "enter");

	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
		  "data mode %lx\n",
		  bh, is_metadata, inode->i_mode,
		  test_opt(inode->i_sb, DATA_FLAGS));

	/* Never use the revoke function if we are doing full data
	 * journaling: there is no need to, and a V1 superblock won't
	 * support it.  Otherwise, only skip the revoke on un-journaled
	 * data blocks. */

	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
	    (!is_metadata && !ext3_should_journal_data(inode))) {
		if (bh) {
			BUFFER_TRACE(bh, "call journal_forget");
			ext3_journal_forget(handle, bh);
		}
		return 0;
	}

	/*
	 * data!=journal && (is_metadata || should_journal_data(inode))
	 */
	BUFFER_TRACE(bh, "call ext3_journal_revoke");
	err = ext3_journal_revoke(handle, blocknr, bh);
	if (err)
		/* A failed revoke means the journal can no longer be
		 * trusted for this block: abort the whole filesystem. */
		ext3_abort(inode->i_sb, __FUNCTION__,
			   "error %d when attempting revoke", err);
	BUFFER_TRACE(bh, "exit");
	return err;
}
103
104 /*
105  * Work out how many blocks we need to progress with the next chunk of a
106  * truncate transaction.
107  */
108
109 static unsigned long blocks_for_truncate(struct inode *inode) 
110 {
111         unsigned long needed;
112
113         needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
114
115         /* Give ourselves just enough room to cope with inodes in which
116          * i_blocks is corrupt: we've seen disk corruptions in the past
117          * which resulted in random data in an inode which looked enough
118          * like a regular file for ext3 to try to delete it.  Things
119          * will go a bit crazy if that happens, but at least we should
120          * try not to panic the whole kernel. */
121         if (needed < 2)
122                 needed = 2;
123
124         /* But we need to bound the transaction so we don't overflow the
125          * journal. */
126         if (needed > EXT3_MAX_TRANS_DATA) 
127                 needed = EXT3_MAX_TRANS_DATA;
128
129         return EXT3_DATA_TRANS_BLOCKS + needed;
130 }
131
132 /* 
133  * Truncate transactions can be complex and absolutely huge.  So we need to
134  * be able to restart the transaction at a conventient checkpoint to make
135  * sure we don't overflow the journal.
136  *
137  * start_transaction gets us a new handle for a truncate transaction,
138  * and extend_transaction tries to extend the existing one a bit.  If
139  * extend fails, we need to propagate the failure up and restart the
140  * transaction in the top-level truncate loop. --sct 
141  */
142
143 static handle_t *start_transaction(struct inode *inode) 
144 {
145         handle_t *result;
146
147         result = ext3_journal_start(inode, blocks_for_truncate(inode));
148         if (!IS_ERR(result))
149                 return result;
150
151         ext3_std_error(inode->i_sb, PTR_ERR(result));
152         return result;
153 }
154
155 /*
156  * Try to extend this transaction for the purposes of truncation.
157  *
158  * Returns 0 if we managed to create more room.  If we can't create more
159  * room, and the transaction must be restarted we return 1.
160  */
161 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
162 {
163         if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
164                 return 0;
165         if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
166                 return 0;
167         return 1;
168 }
169
/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 *
 * Returns the result of ext3_journal_restart() (0 on success).
 */
static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
{
	jbd_debug(2, "restarting handle %p\n", handle);
	/* Re-reserve enough credits for the next chunk of the truncate */
	return ext3_journal_restart(handle, blocks_for_truncate(inode));
}
180
/*
 * Called at each iput()
 *
 * The inode may be "bad" if ext3_read_inode() saw an error from
 * ext3_get_inode(), so we need to check that to avoid freeing random disk
 * blocks.
 */
void ext3_put_inode(struct inode *inode)
{
	/* Release any preallocation window (a no-op unless the build
	 * defines EXT3_PREALLOCATE - see ext3_discard_prealloc()). */
	if (!is_bad_inode(inode))
		ext3_discard_prealloc(inode);
}
193
/*
 * Called at the last iput() if i_nlink is zero.
 *
 * Truncates the inode's data, removes it from the on-disk orphan list
 * and frees the on-disk inode, all under one (restartable) transaction.
 * Must also cope with a dead journal: in that case we can only do the
 * in-core cleanup.
 */
void ext3_delete_inode (struct inode * inode)
{
	handle_t *handle;

	if (is_bad_inode(inode))
		goto no_delete;

	handle = start_transaction(inode);
	if (IS_ERR(handle)) {
		/* If we're going to skip the normal cleanup, we still
		 * need to make sure that the in-core orphan linked list
		 * is properly cleaned up. */
		ext3_orphan_del(NULL, inode);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		handle->h_sync = 1;
	inode->i_size = 0;
	/* Only bother truncating if the file actually holds blocks */
	if (inode->i_blocks)
		ext3_truncate(inode);
	/*
	 * Kill off the orphan record which ext3_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext3_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext3_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext3_orphan_del(handle, inode);
	/* Record the deletion time in the on-disk inode */
	EXT3_I(inode)->i_dtime	= get_seconds();

	/* 
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.  
	 */
	if (ext3_mark_inode_dirty(handle, inode))
		/* If that failed, just do the required in-core inode clear. */
		clear_inode(inode);
	else
		ext3_free_inode(handle, inode);
	ext3_journal_stop(handle);
	return;
no_delete:
	clear_inode(inode);	/* We must guarantee clearing of inode... */
}
246
/*
 * Tear down the inode's preallocation window and return the unused
 * blocks to the filesystem.  Compiled out unless EXT3_PREALLOCATE
 * is defined.
 */
void ext3_discard_prealloc (struct inode * inode)
{
#ifdef EXT3_PREALLOCATE
	struct ext3_inode_info *ei = EXT3_I(inode);

	/* Writer: ->i_prealloc* */
	if (ei->i_prealloc_count != 0) {
		unsigned short nr_blocks = ei->i_prealloc_count;
		unsigned long first_block = ei->i_prealloc_block;

		/* Clear the window before freeing the blocks */
		ei->i_prealloc_count = 0;
		ei->i_prealloc_block = 0;
		/* Writer: end */
		ext3_free_blocks (inode, first_block, nr_blocks);
	}
#endif
}
262
263 static int ext3_alloc_block (handle_t *handle,
264                         struct inode * inode, unsigned long goal, int *err)
265 {
266         unsigned long result;
267
268 #ifdef EXT3_PREALLOCATE
269 #ifdef EXT3FS_DEBUG
270         static unsigned long alloc_hits, alloc_attempts;
271 #endif
272         struct ext3_inode_info *ei = EXT3_I(inode);
273         /* Writer: ->i_prealloc* */
274         if (ei->i_prealloc_count &&
275             (goal == ei->i_prealloc_block ||
276              goal + 1 == ei->i_prealloc_block))
277         {
278                 result = ei->i_prealloc_block++;
279                 ei->i_prealloc_count--;
280                 /* Writer: end */
281                 ext3_debug ("preallocation hit (%lu/%lu).\n",
282                             ++alloc_hits, ++alloc_attempts);
283         } else {
284                 ext3_discard_prealloc (inode);
285                 ext3_debug ("preallocation miss (%lu/%lu).\n",
286                             alloc_hits, ++alloc_attempts);
287                 if (S_ISREG(inode->i_mode))
288                         result = ext3_new_block (inode, goal, 
289                                  &ei->i_prealloc_count,
290                                  &ei->i_prealloc_block, err);
291                 else
292                         result = ext3_new_block(inode, goal, NULL, NULL, err);
293                 /*
294                  * AKPM: this is somewhat sticky.  I'm not surprised it was
295                  * disabled in 2.2's ext3.  Need to integrate b_committed_data
296                  * guarding with preallocation, if indeed preallocation is
297                  * effective.
298                  */
299         }
300 #else
301         result = ext3_new_block(handle, inode, goal, NULL, NULL, err);
302 #endif
303         return result;
304 }
305
306
/*
 * One link in the chain of indirect blocks leading from the inode to a
 * data block - filled in by ext3_get_branch().
 */
typedef struct {
	u32	*p;		/* address the block number was read from:
				 * points into the inode's i_data for the
				 * first link, into bh->b_data otherwise */
	u32	key;		/* the block number itself, kept in on-disk
				 * (little-endian) byte order */
	struct buffer_head *bh;	/* buffer hosting *p; NULL when *p lives
				 * in the inode */
} Indirect;
312
313 static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v)
314 {
315         p->key = *(p->p = v);
316         p->bh = bh;
317 }
318
319 static inline int verify_chain(Indirect *from, Indirect *to)
320 {
321         while (from <= to && from->key == *from->p)
322                 from++;
323         return (from > to);
324 }
325
/**
 *	ext3_block_to_path - parse the block number into array of offsets
 *	@inode: inode in question (we are only interested in its superblock)
 *	@i_block: block number to be parsed
 *	@offsets: array to store the offsets in
 *	@boundary: set this non-zero if the referred-to block is likely to be
 *	       followed (on disk) by an indirect block.
 *
 *	To store the locations of file's data ext3 uses a data structure common
 *	for UNIX filesystems - tree of pointers anchored in the inode, with
 *	data blocks at leaves and indirect blocks in intermediate nodes.
 *	This function translates the block number into path in that tree -
 *	return value is the path length and @offsets[n] is the offset of
 *	pointer to (n+1)th node in the nth one. If @block is out of range
 *	(negative or too large) warning is printed and zero returned.
 *
 *	Note: function doesn't find node addresses, so no IO is needed. All
 *	we need to know is the capacity of indirect blocks (taken from the
 *	inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks. We might use long long, but that would
 * kill us on x86. Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */

static int ext3_block_to_path(struct inode *inode,
			long i_block, int offsets[4], int *boundary)
{
	int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
	int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
	const long direct_blocks = EXT3_NDIR_BLOCKS,
		indirect_blocks = ptrs,
		double_blocks = (1 << (ptrs_bits * 2));
	int n = 0;
	int final = 0;

	/* NB: each branch below shifts i_block down by the range the
	 * previous tier covered (the `i_block -=' in the conditions),
	 * so the order of the tests is load-bearing. */
	if (i_block < 0) {
		ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
	} else if (i_block < direct_blocks) {
		/* direct pointer held in the inode itself */
		offsets[n++] = i_block;
		final = direct_blocks;
	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
		/* single indirect */
		offsets[n++] = EXT3_IND_BLOCK;
		offsets[n++] = i_block;
		final = ptrs;
	} else if ((i_block -= indirect_blocks) < double_blocks) {
		/* double indirect: high bits select the first level */
		offsets[n++] = EXT3_DIND_BLOCK;
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
		/* triple indirect */
		offsets[n++] = EXT3_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else {
		ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
	}
	if (boundary)
		/* Last slot at the deepest level: the next logical block
		 * will need a fresh indirect block on disk. */
		*boundary = (i_block & (ptrs - 1)) == (final - 1);
	return n;
}
395
/**
 *	ext3_get_branch - read the chain of indirect blocks leading to data
 *	@inode: inode in question
 *	@depth: depth of the chain (1 - direct pointer, etc.)
 *	@offsets: offsets of pointers in inode/indirect blocks
 *	@chain: place to store the result
 *	@err: here we store the error value
 *
 *	Function fills the array of triples <key, p, bh> and returns %NULL
 *	if everything went OK or the pointer to the last filled triple
 *	(incomplete one) otherwise. Upon the return chain[i].key contains
 *	the number of (i+1)-th block in the chain (as it is stored in memory,
 *	i.e. little-endian 32-bit), chain[i].p contains the address of that
 *	number (it points into struct inode for i==0 and into the bh->b_data
 *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 *	block for i>0 and NULL for i==0. In other words, it holds the block
 *	numbers of the chain, addresses they were taken from (and where we can
 *	verify that chain did not change) and buffer_heads hosting these
 *	numbers.
 *
 *	Function stops when it stumbles upon zero pointer (absent block)
 *		(pointer to last triple returned, *@err == 0)
 *	or when it gets an IO error reading an indirect block
 *		(ditto, *@err == -EIO)
 *	or when it notices that chain had been changed while it was reading
 *		(ditto, *@err == -EAGAIN)
 *	or when it reads all @depth-1 indirect blocks successfully and finds
 *	the whole chain, all way to the data (returns %NULL, *err == 0).
 */
static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
				 Indirect chain[4], int *err)
{
	struct super_block *sb = inode->i_sb;
	Indirect *p = chain;
	struct buffer_head *bh;

	*err = 0;
	/* i_data is not going away, no lock needed */
	add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
	if (!p->key)
		goto no_block;
	while (--depth) {
		/* sb_bread() may sleep, so a concurrent truncate can
		 * modify the part of the chain we have already read */
		bh = sb_bread(sb, le32_to_cpu(p->key));
		if (!bh)
			goto failure;
		/* Reader: pointers */
		/* Re-validate everything read so far before trusting the
		 * freshly-read indirect block */
		if (!verify_chain(chain, p))
			goto changed;
		add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
		/* Reader: end */
		if (!p->key)
			goto no_block;
	}
	return NULL;

changed:
	brelse(bh);
	*err = -EAGAIN;
	goto no_block;
failure:
	*err = -EIO;
no_block:
	/* p points at the last triple we managed to fill in */
	return p;
}
460
461 /**
462  *      ext3_find_near - find a place for allocation with sufficient locality
463  *      @inode: owner
464  *      @ind: descriptor of indirect block.
465  *
466  *      This function returns the prefered place for block allocation.
467  *      It is used when heuristic for sequential allocation fails.
468  *      Rules are:
469  *        + if there is a block to the left of our position - allocate near it.
470  *        + if pointer will live in indirect block - allocate near that block.
471  *        + if pointer will live in inode - allocate in the same
472  *          cylinder group. 
473  *
474  * In the latter case we colour the starting block by the callers PID to
475  * prevent it from clashing with concurrent allocations for a different inode
476  * in the same block group.   The PID is used here so that functionally related
477  * files will be close-by on-disk.
478  *
479  *      Caller must make sure that @ind is valid and will stay that way.
480  */
481
482 static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
483 {
484         struct ext3_inode_info *ei = EXT3_I(inode);
485         u32 *start = ind->bh ? (u32*) ind->bh->b_data : ei->i_data;
486         u32 *p;
487         unsigned long bg_start;
488         unsigned long colour;
489
490         /* Try to find previous block */
491         for (p = ind->p - 1; p >= start; p--)
492                 if (*p)
493                         return le32_to_cpu(*p);
494
495         /* No such thing, so let's try location of indirect block */
496         if (ind->bh)
497                 return ind->bh->b_blocknr;
498
499         /*
500          * It is going to be refered from inode itself? OK, just put it into
501          * the same cylinder group then.
502          */
503         bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
504                 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
505         colour = (current->pid % 16) *
506                         (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
507         return bg_start + colour;
508 }
509
/**
 *	ext3_find_goal - find a prefered place for allocation.
 *	@inode: owner
 *	@block:  block we want
 *	@chain:  chain of indirect blocks
 *	@partial: pointer to the last triple within a chain
 *	@goal:  place to store the result.
 *
 *	Normally this function find the prefered place for block allocation,
 *	stores it in *@goal and returns zero. If the branch had been changed
 *	under us we return -EAGAIN.
 */

static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
			  Indirect *partial, unsigned long *goal)
{
	struct ext3_inode_info *ei = EXT3_I(inode);
	/* Writer: ->i_next_alloc* */
	/* Advance the sequential-allocation heuristic when the caller is
	 * asking for the block right after the last one we tracked */
	if (block == ei->i_next_alloc_block + 1) {
		ei->i_next_alloc_block++;
		ei->i_next_alloc_goal++;
	}
	/* Writer: end */
	/* Reader: pointers, ->i_next_alloc* */
	if (verify_chain(chain, partial)) {
		/*
		 * try the heuristic for sequential allocation,
		 * failing that at least try to get decent locality.
		 */
		if (block == ei->i_next_alloc_block)
			*goal = ei->i_next_alloc_goal;
		if (!*goal)
			*goal = ext3_find_near(inode, partial);
		return 0;
	}
	/* Reader: end */
	/* The chain changed under us (truncate race): caller must retry */
	return -EAGAIN;
}
548
549 /**
550  *      ext3_alloc_branch - allocate and set up a chain of blocks.
551  *      @inode: owner
552  *      @num: depth of the chain (number of blocks to allocate)
553  *      @offsets: offsets (in the blocks) to store the pointers to next.
554  *      @branch: place to store the chain in.
555  *
556  *      This function allocates @num blocks, zeroes out all but the last one,
557  *      links them into chain and (if we are synchronous) writes them to disk.
558  *      In other words, it prepares a branch that can be spliced onto the
559  *      inode. It stores the information about that chain in the branch[], in
560  *      the same format as ext3_get_branch() would do. We are calling it after
561  *      we had read the existing part of chain and partial points to the last
562  *      triple of that (one with zero ->key). Upon the exit we have the same
563  *      picture as after the successful ext3_get_block(), excpet that in one
564  *      place chain is disconnected - *branch->p is still zero (we did not
565  *      set the last link), but branch->key contains the number that should
566  *      be placed into *branch->p to fill that gap.
567  *
568  *      If allocation fails we free all blocks we've allocated (and forget
569  *      their buffer_heads) and return the error value the from failed
570  *      ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
571  *      as described above and return 0.
572  */
573
574 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
575                              int num,
576                              unsigned long goal,
577                              int *offsets,
578                              Indirect *branch)
579 {
580         int blocksize = inode->i_sb->s_blocksize;
581         int n = 0, keys = 0;
582         int err = 0;
583         int i;
584         int parent = ext3_alloc_block(handle, inode, goal, &err);
585
586         branch[0].key = cpu_to_le32(parent);
587         if (parent) {
588                 for (n = 1; n < num; n++) {
589                         struct buffer_head *bh;
590                         /* Allocate the next block */
591                         int nr = ext3_alloc_block(handle, inode, parent, &err);
592                         if (!nr)
593                                 break;
594                         branch[n].key = cpu_to_le32(nr);
595                         keys = n+1;
596
597                         /*
598                          * Get buffer_head for parent block, zero it out
599                          * and set the pointer to new one, then send
600                          * parent to disk.  
601                          */
602                         bh = sb_getblk(inode->i_sb, parent);
603                         branch[n].bh = bh;
604                         lock_buffer(bh);
605                         BUFFER_TRACE(bh, "call get_create_access");
606                         err = ext3_journal_get_create_access(handle, bh);
607                         if (err) {
608                                 unlock_buffer(bh);
609                                 brelse(bh);
610                                 break;
611                         }
612
613                         memset(bh->b_data, 0, blocksize);
614                         branch[n].p = (u32*) bh->b_data + offsets[n];
615                         *branch[n].p = branch[n].key;
616                         BUFFER_TRACE(bh, "marking uptodate");
617                         set_buffer_uptodate(bh);
618                         unlock_buffer(bh);
619
620                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
621                         err = ext3_journal_dirty_metadata(handle, bh);
622                         if (err)
623                                 break;
624
625                         parent = nr;
626                 }
627         }
628         if (n == num)
629                 return 0;
630
631         /* Allocation failed, free what we already allocated */
632         for (i = 1; i < keys; i++) {
633                 BUFFER_TRACE(branch[i].bh, "call journal_forget");
634                 ext3_journal_forget(handle, branch[i].bh);
635         }
636         for (i = 0; i < keys; i++)
637                 ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
638         return err;
639 }
640
/**
 *	ext3_splice_branch - splice the allocated branch onto inode.
 *	@inode: owner
 *	@block: (logical) number of block we are adding
 *	@chain: chain of indirect blocks (with a missing link - see
 *		ext3_alloc_branch)
 *	@where: location of missing link
 *	@num:   number of blocks we are adding
 *
 *	This function verifies that chain (up to the missing link) had not
 *	changed, fills the missing link and does all housekeeping needed in
 *	inode (->i_blocks, etc.). In case of success we end up with the full
 *	chain to new block and return 0. Otherwise (== chain had been changed)
 *	we free the new blocks (forgetting their buffer_heads, indeed) and
 *	return -EAGAIN.
 */

static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
			      Indirect chain[4], Indirect *where, int num)
{
	int i;
	int err = 0;
	struct ext3_inode_info *ei = EXT3_I(inode);

	/*
	 * If we're splicing into a [td]indirect block (as opposed to the
	 * inode) then we need to get write access to the [td]indirect block
	 * before the splice.
	 */
	if (where->bh) {
		BUFFER_TRACE(where->bh, "get_write_access");
		err = ext3_journal_get_write_access(handle, where->bh);
		if (err)
			goto err_out;
	}
	/* Verify that place we are splicing to is still there and vacant */

	/* Writer: pointers, ->i_next_alloc* */
	if (!verify_chain(chain, where-1) || *where->p)
		/* Writer: end */
		goto changed;

	/* That's it */

	/* Filling *where->p makes the whole new branch reachable (and
	 * thus protected from truncate) in one store */
	*where->p = where->key;
	ei->i_next_alloc_block = block;
	ei->i_next_alloc_goal = le32_to_cpu(where[num-1].key);
	/* Writer: end */

	/* We are done with atomic stuff, now do the rest of housekeeping */

	inode->i_ctime = CURRENT_TIME;
	ext3_mark_inode_dirty(handle, inode);

	/* had we spliced it onto indirect block? */
	if (where->bh) {
		/*
		 * akpm: If we spliced it onto an indirect block, we haven't
		 * altered the inode.  Note however that if it is being spliced
		 * onto an indirect block at the very end of the file (the
		 * file is growing) then we *will* alter the inode to reflect
		 * the new i_size.  But that is not done here - it is done in
		 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
		 */
		jbd_debug(5, "splicing indirect only\n");
		BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
		err = ext3_journal_dirty_metadata(handle, where->bh);
		if (err) 
			goto err_out;
	} else {
		/*
		 * OK, we spliced it into the inode itself on a direct block.
		 * Inode was dirtied above.
		 */
		jbd_debug(5, "splicing direct\n");
	}
	return err;

changed:
	/*
	 * AKPM: if where[i].bh isn't part of the current updating
	 * transaction then we explode nastily.  Test this code path.
	 */
	jbd_debug(1, "the chain changed: try again\n");
	err = -EAGAIN;

err_out:
	for (i = 1; i < num; i++) {
		BUFFER_TRACE(where[i].bh, "call journal_forget");
		ext3_journal_forget(handle, where[i].bh);
	}
	/* For the normal collision cleanup case, we free up the blocks.
	 * On genuine filesystem errors we don't even think about doing
	 * that. */
	if (err == -EAGAIN)
		for (i = 0; i < num; i++)
			ext3_free_blocks(handle, inode, 
					 le32_to_cpu(where[i].key), 1);
	return err;
}
741
742 /*
743  * Allocation strategy is simple: if we have to allocate something, we will
744  * have to go the whole way to leaf. So let's do it before attaching anything
745  * to tree, set linkage between the newborn blocks, write them if sync is
746  * required, recheck the path, free and repeat if check fails, otherwise
747  * set the last missing link (that will protect us from any truncate-generated
748  * removals - all blocks on the path are immune now) and possibly force the
749  * write on the parent block.
750  * That has a nice additional property: no special recovery from the failed
751  * allocations is needed - we simply release blocks and do not touch anything
752  * reachable from inode.
753  *
754  * akpm: `handle' can be NULL if create == 0.
755  *
756  * The BKL may not be held on entry here.  Be sure to take it early.
757  */
758
static int
ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
		struct buffer_head *bh_result, int create, int extend_disksize)
{
	int err = -EIO;
	int offsets[4];		/* indices along the indirect-block path */
	Indirect chain[4];	/* the blocks making up that path */
	Indirect *partial;	/* first missing link in the chain, if any */
	unsigned long goal;
	int left;		/* number of blocks still to be allocated */
	int boundary = 0;
	int depth = ext3_block_to_path(inode, iblock, offsets, &boundary);
	struct ext3_inode_info *ei = EXT3_I(inode);

	/* Allocation (create != 0) requires an open journal handle. */
	J_ASSERT(handle != NULL || create == 0);

	if (depth == 0)
		goto out;

reread:
	partial = ext3_get_branch(inode, depth, offsets, chain, &err);

	/* Simplest case - block found, no allocation needed */
	if (!partial) {
		clear_buffer_new(bh_result);
got_it:
		map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
		if (boundary)
			set_buffer_boundary(bh_result);
		/* Clean up and exit */
		partial = chain+depth-1; /* the whole chain */
		goto cleanup;
	}

	/* Next simple case - plain lookup or failed read of indirect block */
	if (!create || err == -EIO) {
cleanup:
		/* Release the indirect-block buffers we picked up along
		 * the partial chain (chain[0] itself has no bh). */
		while (partial > chain) {
			BUFFER_TRACE(partial->bh, "call brelse");
			brelse(partial->bh);
			partial--;
		}
		BUFFER_TRACE(bh_result, "returned");
out:
		return err;
	}

	/*
	 * Indirect block might be removed by truncate while we were
	 * reading it. Handling of that case (forget what we've got and
	 * reread) is taken out of the main path.
	 */
	if (err == -EAGAIN)
		goto changed;

	goal = 0;
	/* truncate_sem blocks out ext3_truncate while we alter the tree */
	down(&ei->truncate_sem);
	if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0) {
		up(&ei->truncate_sem);
		goto changed;
	}

	left = (chain + depth) - partial;

	/*
	 * Block out ext3_truncate while we alter the tree
	 */
	err = ext3_alloc_branch(handle, inode, left, goal,
					offsets+(partial-chain), partial);

	/* The ext3_splice_branch call will free and forget any buffers
	 * on the new chain if there is a failure, but that risks using
	 * up transaction credits, especially for bitmaps where the
	 * credits cannot be returned.  Can we handle this somehow?  We
	 * may need to return -EAGAIN upwards in the worst case.  --sct */
	if (!err)
		err = ext3_splice_branch(handle, inode, iblock, chain,
					 partial, left);
	/* i_disksize growing is protected by truncate_sem
	 * don't forget to protect it if you're about to implement
	 * concurrent ext3_get_block() -bzzz */
	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
		ei->i_disksize = inode->i_size;
	up(&ei->truncate_sem);
	if (err == -EAGAIN)
		goto changed;
	if (err)
		goto cleanup;

	/* Freshly allocated: tell the caller so it can zero the block. */
	set_buffer_new(bh_result);
	goto got_it;

changed:
	/* The tree changed under us (collision with truncate): drop our
	 * buffers and walk the branch again from the top. */
	while (partial > chain) {
		jbd_debug(1, "buffer chain changed, retrying\n");
		BUFFER_TRACE(partial->bh, "brelsing");
		brelse(partial->bh);
		partial--;
	}
	goto reread;
}
860
861 static int ext3_get_block(struct inode *inode, sector_t iblock,
862                         struct buffer_head *bh_result, int create)
863 {
864         handle_t *handle = NULL;
865         int ret;
866
867         if (create) {
868                 handle = ext3_journal_current_handle();
869                 J_ASSERT(handle != 0);
870         }
871         ret = ext3_get_block_handle(handle, inode, iblock,
872                                 bh_result, create, 1);
873         return ret;
874 }
875
876 #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
877
/*
 * Block-mapping callback handed to blockdev_direct_IO() by
 * ext3_direct_IO().  For writes it keeps the current transaction
 * healthy (restarting it when locked for commit or low on credits)
 * before mapping one filesystem block.
 */
static int
ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
		unsigned long max_blocks, struct buffer_head *bh_result,
		int create)
{
	handle_t *handle = journal_current_handle();
	int ret = 0;

	if (!handle)
		goto get_block;		/* A read */

	if (handle->h_transaction->t_state == T_LOCKED) {
		/*
		 * Huge direct-io writes can hold off commits for long
		 * periods of time.  Let this commit run.
		 */
		ext3_journal_stop(handle);
		handle = ext3_journal_start(inode, DIO_CREDITS);
		if (IS_ERR(handle))
			ret = PTR_ERR(handle);
		goto get_block;
	}

	if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
		/*
		 * Getting low on buffer credits...
		 */
		ret = ext3_journal_extend(handle, DIO_CREDITS);
		if (ret > 0) {
			/*
			 * Couldn't extend the transaction.  Start a new one.
			 */
			ret = ext3_journal_restart(handle, DIO_CREDITS);
		}
	}

get_block:
	if (ret == 0)
		ret = ext3_get_block_handle(handle, inode, iblock,
					bh_result, create, 0);
	/* We map a single filesystem block at a time. */
	bh_result->b_size = (1 << inode->i_blkbits);
	return ret;
}
921
922 /*
923  * `handle' can be NULL if create is zero
924  */
925 struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
926                                 long block, int create, int * errp)
927 {
928         struct buffer_head dummy;
929         int fatal = 0, err;
930
931         J_ASSERT(handle != NULL || create == 0);
932
933         dummy.b_state = 0;
934         dummy.b_blocknr = -1000;
935         buffer_trace_init(&dummy.b_history);
936         *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
937         if (!*errp && buffer_mapped(&dummy)) {
938                 struct buffer_head *bh;
939                 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
940                 if (buffer_new(&dummy)) {
941                         J_ASSERT(create != 0);
942                         J_ASSERT(handle != 0);
943
944                         /* Now that we do not always journal data, we
945                            should keep in mind whether this should
946                            always journal the new buffer as metadata.
947                            For now, regular file writes use
948                            ext3_get_block instead, so it's not a
949                            problem. */
950                         lock_buffer(bh);
951                         BUFFER_TRACE(bh, "call get_create_access");
952                         fatal = ext3_journal_get_create_access(handle, bh);
953                         if (!fatal && !buffer_uptodate(bh)) {
954                                 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
955                                 set_buffer_uptodate(bh);
956                         }
957                         unlock_buffer(bh);
958                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
959                         err = ext3_journal_dirty_metadata(handle, bh);
960                         if (!fatal)
961                                 fatal = err;
962                 } else {
963                         BUFFER_TRACE(bh, "not a new buffer");
964                 }
965                 if (fatal) {
966                         *errp = fatal;
967                         brelse(bh);
968                         bh = NULL;
969                 }
970                 return bh;
971         }
972         return NULL;
973 }
974
/*
 * Return an uptodate buffer for logical block `block' of `inode',
 * allocating it first when `create' is set.  Returns NULL with *err
 * set on failure.  The caller must brelse() the result.
 */
struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
			       int block, int create, int *err)
{
	struct buffer_head * bh;
	int prev_blocks;

	/* Remember i_blocks so we can tell whether ext3_getblk grew
	 * the file (used by the prealloc path below). */
	prev_blocks = inode->i_blocks;

	bh = ext3_getblk (handle, inode, block, create, err);
	if (!bh)
		return bh;
#ifdef EXT3_PREALLOCATE
	/*
	 * If the inode has grown, and this is a directory, then use a few
	 * more of the preallocated blocks to keep directory fragmentation
	 * down.  The preallocated blocks are guaranteed to be contiguous.
	 */
	if (create &&
	    S_ISDIR(inode->i_mode) &&
	    inode->i_blocks > prev_blocks &&
	    EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
				    EXT3_FEATURE_COMPAT_DIR_PREALLOC)) {
		int i;
		struct buffer_head *tmp_bh;

		for (i = 1;
		     EXT3_I(inode)->i_prealloc_count &&
		     i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks;
		     i++) {
			/*
			 * ext3_getblk will zero out the contents of the
			 * directory for us
			 */
			tmp_bh = ext3_getblk(handle, inode,
						block+i, create, err);
			if (!tmp_bh) {
				brelse (bh);
				return 0;
			}
			brelse (tmp_bh);
		}
	}
#endif
	if (buffer_uptodate(bh))
		return bh;
	/* Not cached: issue a synchronous read and wait for completion. */
	ll_rw_block (READ, 1, &bh);
	wait_on_buffer (bh);
	if (buffer_uptodate(bh))
		return bh;
	/* Read failed: drop our reference and report an I/O error. */
	brelse (bh);
	*err = -EIO;
	return NULL;
}
1028
1029 static int walk_page_buffers(   handle_t *handle,
1030                                 struct buffer_head *head,
1031                                 unsigned from,
1032                                 unsigned to,
1033                                 int *partial,
1034                                 int (*fn)(      handle_t *handle,
1035                                                 struct buffer_head *bh))
1036 {
1037         struct buffer_head *bh;
1038         unsigned block_start, block_end;
1039         unsigned blocksize = head->b_size;
1040         int err, ret = 0;
1041         struct buffer_head *next;
1042
1043         for (   bh = head, block_start = 0;
1044                 ret == 0 && (bh != head || !block_start);
1045                 block_start = block_end, bh = next)
1046         {
1047                 next = bh->b_this_page;
1048                 block_end = block_start + blocksize;
1049                 if (block_end <= from || block_start >= to) {
1050                         if (partial && !buffer_uptodate(bh))
1051                                 *partial = 1;
1052                         continue;
1053                 }
1054                 err = (*fn)(handle, bh);
1055                 if (!ret)
1056                         ret = err;
1057         }
1058         return ret;
1059 }
1060
1061 /*
1062  * To preserve ordering, it is essential that the hole instantiation and
1063  * the data write be encapsulated in a single transaction.  We cannot
1064  * close off a transaction and start a new one between the ext3_get_block()
1065  * and the commit_write().  So doing the journal_start at the start of
1066  * prepare_write() is the right place.
1067  *
1068  * Also, this function can nest inside ext3_writepage() ->
1069  * block_write_full_page(). In that case, we *know* that ext3_writepage()
1070  * has generated enough buffer credits to do the whole page.  So we won't
1071  * block on the journal in that case, which is good, because the caller may
1072  * be PF_MEMALLOC.
1073  *
1074  * By accident, ext3 can be reentered when a transaction is open via
1075  * quota file writes.  If we were to commit the transaction while thus
1076  * reentered, there can be a deadlock - we would be holding a quota
1077  * lock, and the commit would never complete if another thread had a
1078  * transaction open and was blocking on the quota lock - a ranking
1079  * violation.
1080  *
1081  * So what we do is to rely on the fact that journal_stop/journal_start
1082  * will _not_ run commit under these circumstances because handle->h_ref
1083  * is elevated.  We'll still have enough credits for the tiny quotafile
1084  * write.  
1085  */
1086
1087 static int do_journal_get_write_access(handle_t *handle, 
1088                                        struct buffer_head *bh)
1089 {
1090         if (!buffer_mapped(bh) || buffer_freed(bh))
1091                 return 0;
1092         return ext3_journal_get_write_access(handle, bh);
1093 }
1094
1095 static int ext3_prepare_write(struct file *file, struct page *page,
1096                               unsigned from, unsigned to)
1097 {
1098         struct inode *inode = page->mapping->host;
1099         int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
1100         handle_t *handle;
1101         int retries = 0;
1102
1103 retry:
1104         handle = ext3_journal_start(inode, needed_blocks);
1105         if (IS_ERR(handle)) {
1106                 ret = PTR_ERR(handle);
1107                 goto out;
1108         }
1109         ret = block_prepare_write(page, from, to, ext3_get_block);
1110         if (ret)
1111                 goto prepare_write_failed;
1112
1113         if (ext3_should_journal_data(inode)) {
1114                 ret = walk_page_buffers(handle, page_buffers(page),
1115                                 from, to, NULL, do_journal_get_write_access);
1116         }
1117 prepare_write_failed:
1118         if (ret)
1119                 ext3_journal_stop(handle);
1120         if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1121                 goto retry;
1122 out:
1123         return ret;
1124 }
1125
1126 static int
1127 ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1128 {
1129         int err = journal_dirty_data(handle, bh);
1130         if (err)
1131                 ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1132                                                 bh, handle,err);
1133         return err;
1134 }
1135
1136 /* For commit_write() in data=journal mode */
1137 static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1138 {
1139         if (!buffer_mapped(bh) || buffer_freed(bh))
1140                 return 0;
1141         set_buffer_uptodate(bh);
1142         return ext3_journal_dirty_metadata(handle, bh);
1143 }
1144
1145 /*
1146  * We need to pick up the new inode size which generic_commit_write gave us
1147  * `file' can be NULL - eg, when called from page_symlink().
1148  *
1149  * ext3 never places buffers on inode->i_mapping->private_list.  metadata
1150  * buffers are managed internally.
1151  */
1152
/*
 * commit_write for data=ordered mode: file the data buffers on the
 * transaction's ordered list, then let generic_commit_write() update
 * i_size, and finally close the handle opened by ext3_prepare_write().
 */
static int ext3_ordered_commit_write(struct file *file, struct page *page,
			     unsigned from, unsigned to)
{
	handle_t *handle = ext3_journal_current_handle();
	struct inode *inode = page->mapping->host;
	int ret = 0, ret2;

	ret = walk_page_buffers(handle, page_buffers(page),
		from, to, NULL, ext3_journal_dirty_data);

	if (ret == 0) {
		/*
		 * generic_commit_write() will run mark_inode_dirty() if i_size
		 * changes.  So let's piggyback the i_disksize mark_inode_dirty
		 * into that.
		 */
		loff_t new_i_size;

		new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
		if (new_i_size > EXT3_I(inode)->i_disksize)
			EXT3_I(inode)->i_disksize = new_i_size;
		ret = generic_commit_write(file, page, from, to);
	}
	/* Stop the handle even on error; keep the first error code. */
	ret2 = ext3_journal_stop(handle);
	if (!ret)
		ret = ret2;
	return ret;
}
1181
1182 static int ext3_writeback_commit_write(struct file *file, struct page *page,
1183                              unsigned from, unsigned to)
1184 {
1185         handle_t *handle = ext3_journal_current_handle();
1186         struct inode *inode = page->mapping->host;
1187         int ret = 0, ret2;
1188         loff_t new_i_size;
1189
1190         new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1191         if (new_i_size > EXT3_I(inode)->i_disksize)
1192                 EXT3_I(inode)->i_disksize = new_i_size;
1193         ret = generic_commit_write(file, page, from, to);
1194         ret2 = ext3_journal_stop(handle);
1195         if (!ret)
1196                 ret = ret2;
1197         return ret;
1198 }
1199
/*
 * commit_write for data=journal mode: journal the data buffers as
 * metadata, duplicating generic_commit_write()'s i_size handling since
 * we cannot call it directly here.
 */
static int ext3_journalled_commit_write(struct file *file,
			struct page *page, unsigned from, unsigned to)
{
	handle_t *handle = ext3_journal_current_handle();
	struct inode *inode = page->mapping->host;
	int ret = 0, ret2;
	int partial = 0;
	loff_t pos;

	/*
	 * Here we duplicate the generic_commit_write() functionality
	 */
	pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;

	/* `partial' is set when some buffer outside [from, to) is not
	 * uptodate, in which case the page cannot be marked uptodate. */
	ret = walk_page_buffers(handle, page_buffers(page), from,
				to, &partial, commit_write_fn);
	if (!partial)
		SetPageUptodate(page);
	if (pos > inode->i_size)
		i_size_write(inode, pos);
	/* Flag journalled data: ext3_bmap() must flush the journal
	 * before on-disk block numbers can be trusted. */
	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
	if (inode->i_size > EXT3_I(inode)->i_disksize) {
		EXT3_I(inode)->i_disksize = inode->i_size;
		ret2 = ext3_mark_inode_dirty(handle, inode);
		if (!ret)
			ret = ret2;
	}
	ret2 = ext3_journal_stop(handle);
	if (!ret)
		ret = ret2;
	return ret;
}
1232
1233 /* 
1234  * bmap() is special.  It gets used by applications such as lilo and by
1235  * the swapper to find the on-disk block of a specific piece of data.
1236  *
1237  * Naturally, this is dangerous if the block concerned is still in the
1238  * journal.  If somebody makes a swapfile on an ext3 data-journaling
1239  * filesystem and enables swap, then they may get a nasty shock when the
1240  * data getting swapped to that swapfile suddenly gets overwritten by
1241  * the original zero's written out previously to the journal and
1242  * awaiting writeback in the kernel's buffer cache. 
1243  *
1244  * So, if we see any bmap calls here on a modified, data-journaled file,
1245  * take extra steps to flush any blocks which might be in the cache. 
1246  */
static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	journal_t *journal;
	int err;

	if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
		/* 
		 * This is a REALLY heavyweight approach, but the use of
		 * bmap on dirty files is expected to be extremely rare:
		 * only if we run lilo or swapon on a freshly made file
		 * do we expect this to happen. 
		 *
		 * (bmap requires CAP_SYS_RAWIO so this does not
		 * represent an unprivileged user DOS attack --- we'd be
		 * in trouble if mortal users could trigger this path at
		 * will.) 
		 *
		 * NB. EXT3_STATE_JDATA is not set on files other than
		 * regular files.  If somebody wants to bmap a directory
		 * or symlink and gets confused because the buffer
		 * hasn't yet been flushed to disk, they deserve
		 * everything they get.
		 */

		/* Clear the flag first, then force every journalled
		 * buffer out to its final on-disk location. */
		EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
		journal = EXT3_JOURNAL(inode);
		journal_lock_updates(journal);
		err = journal_flush(journal);
		journal_unlock_updates(journal);

		/* 0 is the conventional bmap "no block" answer. */
		if (err)
			return 0;
	}

	return generic_block_bmap(mapping,block,ext3_get_block);
}
1284
/* walk_page_buffers() callback: take an extra reference on the buffer
 * (the handle argument is unused). */
static int bget_one(handle_t *handle, struct buffer_head *bh)
{
	get_bh(bh);
	return 0;
}
1290
/* walk_page_buffers() callback: drop the reference taken by bget_one()
 * (the handle argument is unused). */
static int bput_one(handle_t *handle, struct buffer_head *bh)
{
	put_bh(bh);
	return 0;
}
1296
1297 static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1298 {
1299         if (buffer_mapped(bh))
1300                 return ext3_journal_dirty_data(handle, bh);
1301         return 0;
1302 }
1303
1304 /*
1305  * Note that we always start a transaction even if we're not journalling
1306  * data.  This is to preserve ordering: any hole instantiation within
1307  * __block_write_full_page -> ext3_get_block() should be journalled
1308  * along with the data so we don't crash and then get metadata which
1309  * refers to old data.
1310  *
1311  * In all journalling modes block_write_full_page() will start the I/O.
1312  *
1313  * Problem:
1314  *
1315  *      ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1316  *              ext3_writepage()
1317  *
1318  * Similar for:
1319  *
1320  *      ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1321  *
1322  * Same applies to ext3_get_block().  We will deadlock on various things like
1323  * lock_journal and i_truncate_sem.
1324  *
1325  * Setting PF_MEMALLOC here doesn't work - too many internal memory
1326  * allocations fail.
1327  *
1328  * 16May01: If we're reentered then journal_current_handle() will be
1329  *          non-zero. We simply *return*.
1330  *
1331  * 1 July 2001: @@@ FIXME:
1332  *   In journalled data mode, a data buffer may be metadata against the
1333  *   current transaction.  But the same file is part of a shared mapping
1334  *   and someone does a writepage() on it.
1335  *
1336  *   We will move the buffer onto the async_data list, but *after* it has
1337  *   been dirtied. So there's a small window where we have dirty data on
1338  *   BJ_Metadata.
1339  *
1340  *   Note that this only applies to the last partial page in the file.  The
1341  *   bit which block_write_full_page() uses prepare/commit for.  (That's
1342  *   broken code anyway: it's wrong for msync()).
1343  *
1344  *   It's a rare case: affects the final partial page, for journalled data
 *   where the file is subject to both write() and writepage() in the same
 *   transaction.  To fix it we'll need a custom block_write_full_page().
1347  *   We'll probably need that anyway for journalling writepage() output.
1348  *
1349  * We don't honour synchronous mounts for writepage().  That would be
1350  * disastrous.  Any write() or metadata operation will sync the fs for
1351  * us.
1352  *
1353  * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1354  * we don't need to open a transaction here.
1355  */
/*
 * writepage for data=ordered mode: write the page out, then attach its
 * buffers to the running transaction as ordered data.
 */
static int ext3_ordered_writepage(struct page *page,
			struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct buffer_head *page_bufs;
	handle_t *handle = NULL;
	int ret = 0;
	int err;

	J_ASSERT(PageLocked(page));

	/*
	 * We give up here if we're reentered, because it might be for a
	 * different filesystem.
	 */
	if (ext3_journal_current_handle())
		goto out_fail;

	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));

	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out_fail;
	}

	if (!page_has_buffers(page)) {
		create_empty_buffers(page, inode->i_sb->s_blocksize,
				(1 << BH_Dirty)|(1 << BH_Uptodate));
	}
	page_bufs = page_buffers(page);
	/* Pin the buffers so they stay valid after the page unlocks. */
	walk_page_buffers(handle, page_bufs, 0,
			PAGE_CACHE_SIZE, NULL, bget_one);

	ret = block_write_full_page(page, ext3_get_block, wbc);

	/*
	 * The page can become unlocked at any point now, and
	 * truncate can then come in and change things.  So we
	 * can't touch *page from now on.  But *page_bufs is
	 * safe due to elevated refcount.
	 */

	/*
	 * And attach them to the current transaction.  But only if 
	 * block_write_full_page() succeeded.  Otherwise they are unmapped,
	 * and generally junk.
	 */
	if (ret == 0) {
		err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
					NULL, journal_dirty_data_fn);
		if (!ret)
			ret = err;
	}
	/* Drop the references taken by bget_one above. */
	walk_page_buffers(handle, page_bufs, 0,
			PAGE_CACHE_SIZE, NULL, bput_one);
	err = ext3_journal_stop(handle);
	if (!ret)
		ret = err;
	return ret;

out_fail:
	/* Couldn't write now: keep the page dirty for a later pass. */
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return ret;
}
1421
1422 static int ext3_writeback_writepage(struct page *page,
1423                                 struct writeback_control *wbc)
1424 {
1425         struct inode *inode = page->mapping->host;
1426         handle_t *handle = NULL;
1427         int ret = 0;
1428         int err;
1429
1430         if (ext3_journal_current_handle())
1431                 goto out_fail;
1432
1433         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1434         if (IS_ERR(handle)) {
1435                 ret = PTR_ERR(handle);
1436                 goto out_fail;
1437         }
1438
1439         ret = block_write_full_page(page, ext3_get_block, wbc);
1440         err = ext3_journal_stop(handle);
1441         if (!ret)
1442                 ret = err;
1443         return ret;
1444
1445 out_fail:
1446         redirty_page_for_writepage(wbc, page);
1447         unlock_page(page);
1448         return ret;
1449 }
1450
/*
 * writepage for data=journal mode.  A page dirtied via mmap (or with no
 * buffers yet) must have its buffers journalled here; pages that went
 * through prepare/commit can use the normal block_write_full_page path.
 */
static int ext3_journalled_writepage(struct page *page,
				struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	handle_t *handle = NULL;
	int ret = 0;
	int err;

	/* Reentered with a transaction open: just redirty and bail. */
	if (ext3_journal_current_handle())
		goto no_write;

	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto no_write;
	}

	if (!page_has_buffers(page) || PageChecked(page)) {
		/*
		 * It's mmapped pagecache.  Add buffers and journal it.  There
		 * doesn't seem much point in redirtying the page here.
		 */
		ClearPageChecked(page);
		ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
					ext3_get_block);
		if (ret != 0)
			goto out_unlock;
		ret = walk_page_buffers(handle, page_buffers(page), 0,
			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);

		err = walk_page_buffers(handle, page_buffers(page), 0,
				PAGE_CACHE_SIZE, NULL, commit_write_fn);
		if (ret == 0)
			ret = err;
		EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
		unlock_page(page);
	} else {
		/*
		 * It may be a page full of checkpoint-mode buffers.  We don't
		 * really know unless we go poke around in the buffer_heads.
		 * But block_write_full_page will do the right thing.
		 */
		ret = block_write_full_page(page, ext3_get_block, wbc);
	}
	err = ext3_journal_stop(handle);
	if (!ret)
		ret = err;
out:
	return ret;

no_write:
	redirty_page_for_writepage(wbc, page);
out_unlock:
	unlock_page(page);
	goto out;
}
1507
/* ->readpage: map blocks with ext3_get_block via the generic mpage path. */
static int ext3_readpage(struct file *file, struct page *page)
{
	return mpage_readpage(page, ext3_get_block);
}
1512
/* ->readpages: readahead path, same block mapping as ext3_readpage(). */
static int
ext3_readpages(struct file *file, struct address_space *mapping,
		struct list_head *pages, unsigned nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
}
1519
1520 static int ext3_invalidatepage(struct page *page, unsigned long offset)
1521 {
1522         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1523
1524         /*
1525          * If it's a full truncate we just forget about the pending dirtying
1526          */
1527         if (offset == 0)
1528                 ClearPageChecked(page);
1529
1530         return journal_invalidatepage(journal, page, offset);
1531 }
1532
1533 static int ext3_releasepage(struct page *page, int wait)
1534 {
1535         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1536
1537         WARN_ON(PageChecked(page));
1538         return journal_try_to_free_buffers(journal, page, wait);
1539 }
1540
1541 /*
1542  * If the O_DIRECT write will extend the file then add this inode to the
1543  * orphan list.  So recovery will truncate it back to the original size
1544  * if the machine crashes during the write.
1545  *
 * If the O_DIRECT write is instantiating holes inside i_size and the machine
1547  * crashes then stale disk data _may_ be exposed inside the file.
1548  */
static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
			const struct iovec *iov, loff_t offset,
			unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct ext3_inode_info *ei = EXT3_I(inode);
	handle_t *handle = NULL;
	ssize_t ret;
	int orphan = 0;		/* inode placed on the orphan list? */
	size_t count = iov_length(iov, nr_segs);

	if (rw == WRITE) {
		loff_t final_size = offset + count;

		handle = ext3_journal_start(inode, DIO_CREDITS);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}
		if (final_size > inode->i_size) {
			/* Extending write: orphan-list the inode so a
			 * crash mid-write truncates back to i_size. */
			ret = ext3_orphan_add(handle, inode);
			if (ret)
				goto out_stop;
			orphan = 1;
			ei->i_disksize = inode->i_size;
		}
	}

	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 
				 offset, nr_segs,
				 ext3_direct_io_get_blocks, NULL);

out_stop:
	if (handle) {
		int err;

		if (orphan) 
			ext3_orphan_del(handle, inode);
		if (orphan && ret > 0) {
			loff_t end = offset + ret;
			if (end > inode->i_size) {
				/* The write extended the file: record
				 * the new size on disk and in core. */
				ei->i_disksize = end;
				i_size_write(inode, end);
				err = ext3_mark_inode_dirty(handle, inode);
				/* NOTE(review): ret > 0 in this branch,
				 * so `if (!ret)` can never fire - was
				 * `if (err)` intended?  Confirm. */
				if (!ret) 
					ret = err;
			}
		}
		err = ext3_journal_stop(handle);
		if (ret == 0)
			ret = err;
	}
out:
	return ret;
}
1605
1606 /*
1607  * Pages can be marked dirty completely asynchronously from ext3's journalling
1608  * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
1609  * much here because ->set_page_dirty is called under VFS locks.  The page is
1610  * not necessarily locked.
1611  *
1612  * We cannot just dirty the page and leave attached buffers clean, because the
1613  * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
1614  * or jbddirty because all the journalling code will explode.
1615  *
1616  * So what we do is to mark the page "pending dirty" and next time writepage
1617  * is called, propagate that into the buffers appropriately.
1618  */
static int ext3_journalled_set_page_dirty(struct page *page)
{
        /* Record "pending dirty"; writepage will dirty the buffers later */
        SetPageChecked(page);
        return __set_page_dirty_nobuffers(page);
}
1624
/* Address-space operations for data=ordered mode */
static struct address_space_operations ext3_ordered_aops = {
        .readpage       = ext3_readpage,
        .readpages      = ext3_readpages,
        .writepage      = ext3_ordered_writepage,
        .sync_page      = block_sync_page,
        .prepare_write  = ext3_prepare_write,
        .commit_write   = ext3_ordered_commit_write,
        .bmap           = ext3_bmap,
        .invalidatepage = ext3_invalidatepage,
        .releasepage    = ext3_releasepage,
        .direct_IO      = ext3_direct_IO,
};
1637
/* Address-space operations for data=writeback mode */
static struct address_space_operations ext3_writeback_aops = {
        .readpage       = ext3_readpage,
        .readpages      = ext3_readpages,
        .writepage      = ext3_writeback_writepage,
        .sync_page      = block_sync_page,
        .prepare_write  = ext3_prepare_write,
        .commit_write   = ext3_writeback_commit_write,
        .bmap           = ext3_bmap,
        .invalidatepage = ext3_invalidatepage,
        .releasepage    = ext3_releasepage,
        .direct_IO      = ext3_direct_IO,
};
1650
/*
 * Address-space operations for data=journal mode.  Note: uses the
 * set_page_dirty hook (see ext3_journalled_set_page_dirty), and there is
 * no .direct_IO — O_DIRECT is presumably unsupported in journalled mode.
 */
static struct address_space_operations ext3_journalled_aops = {
        .readpage       = ext3_readpage,
        .readpages      = ext3_readpages,
        .writepage      = ext3_journalled_writepage,
        .sync_page      = block_sync_page,
        .prepare_write  = ext3_prepare_write,
        .commit_write   = ext3_journalled_commit_write,
        .set_page_dirty = ext3_journalled_set_page_dirty,
        .bmap           = ext3_bmap,
        .invalidatepage = ext3_invalidatepage,
        .releasepage    = ext3_releasepage,
};
1663
1664 void ext3_set_aops(struct inode *inode)
1665 {
1666         if (ext3_should_order_data(inode))
1667                 inode->i_mapping->a_ops = &ext3_ordered_aops;
1668         else if (ext3_should_writeback_data(inode))
1669                 inode->i_mapping->a_ops = &ext3_writeback_aops;
1670         else
1671                 inode->i_mapping->a_ops = &ext3_journalled_aops;
1672 }
1673
1674 /*
1675  * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1676  * up to the end of the block which corresponds to `from'.
 * This is required during truncate. We need to physically zero the tail end
1678  * of that block so it doesn't yield old data if the file is later grown.
1679  */
static int ext3_block_truncate_page(handle_t *handle, struct page *page,
                struct address_space *mapping, loff_t from)
{
        unsigned long index = from >> PAGE_CACHE_SHIFT;
        unsigned offset = from & (PAGE_CACHE_SIZE-1);
        unsigned blocksize, iblock, length, pos;
        struct inode *inode = mapping->host;
        struct buffer_head *bh;
        int err;
        void *kaddr;

        blocksize = inode->i_sb->s_blocksize;
        /* Bytes to zero: from `offset' to the end of its block */
        length = blocksize - (offset & (blocksize - 1));
        /* First block of this page */
        iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);

        if (!page_has_buffers(page))
                create_empty_buffers(page, blocksize, 0);

        /* Find the buffer that contains "offset" */
        bh = page_buffers(page);
        pos = blocksize;
        while (offset >= pos) {
                bh = bh->b_this_page;
                iblock++;
                pos += blocksize;
        }

        err = 0;
        if (buffer_freed(bh)) {
                /* A freed buffer needs no zeroing */
                BUFFER_TRACE(bh, "freed: skip");
                goto unlock;
        }

        if (!buffer_mapped(bh)) {
                BUFFER_TRACE(bh, "unmapped");
                /* Read-only lookup (create == 0): do not allocate */
                ext3_get_block(inode, iblock, bh, 0);
                /* unmapped? It's a hole - nothing to do */
                if (!buffer_mapped(bh)) {
                        BUFFER_TRACE(bh, "still unmapped");
                        goto unlock;
                }
        }

        /* Ok, it's mapped. Make sure it's up-to-date */
        if (PageUptodate(page))
                set_buffer_uptodate(bh);

        if (!buffer_uptodate(bh)) {
                /* Pre-set the error; cleared only by a successful read */
                err = -EIO;
                ll_rw_block(READ, 1, &bh);
                wait_on_buffer(bh);
                /* Uhhuh. Read error. Complain and punt. */
                if (!buffer_uptodate(bh))
                        goto unlock;
        }

        /* data=journal: the data buffer itself goes through the journal */
        if (ext3_should_journal_data(inode)) {
                BUFFER_TRACE(bh, "get write access");
                err = ext3_journal_get_write_access(handle, bh);
                if (err)
                        goto unlock;
        }

        kaddr = kmap_atomic(page, KM_USER0);
        memset(kaddr + offset, 0, length);
        flush_dcache_page(page);
        kunmap_atomic(kaddr, KM_USER0);

        BUFFER_TRACE(bh, "zeroed end of block");

        err = 0;
        if (ext3_should_journal_data(inode)) {
                err = ext3_journal_dirty_metadata(handle, bh);
        } else {
                /* data=ordered: register the data buffer with the journal
                 * before dirtying it */
                if (ext3_should_order_data(inode))
                        err = ext3_journal_dirty_data(handle, bh);
                mark_buffer_dirty(bh);
        }

unlock:
        /* Caller handed us a locked, referenced page; drop both here */
        unlock_page(page);
        page_cache_release(page);
        return err;
}
1764
1765 /*
1766  * Probably it should be a library function... search for first non-zero word
1767  * or memcmp with zero_page, whatever is better for particular architecture.
1768  * Linus?
1769  */
1770 static inline int all_zeroes(u32 *p, u32 *q)
1771 {
1772         while (p < q)
1773                 if (*p++)
1774                         return 0;
1775         return 1;
1776 }
1777
1778 /**
1779  *      ext3_find_shared - find the indirect blocks for partial truncation.
1780  *      @inode:   inode in question
1781  *      @depth:   depth of the affected branch
1782  *      @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1783  *      @chain:   place to store the pointers to partial indirect blocks
1784  *      @top:     place to the (detached) top of branch
1785  *
1786  *      This is a helper function used by ext3_truncate().
1787  *
1788  *      When we do truncate() we may have to clean the ends of several
1789  *      indirect blocks but leave the blocks themselves alive. Block is
 *      partially truncated if some data below the new i_size is referred
1791  *      from it (and it is on the path to the first completely truncated
1792  *      data block, indeed).  We have to free the top of that path along
1793  *      with everything to the right of the path. Since no allocation
1794  *      past the truncation point is possible until ext3_truncate()
1795  *      finishes, we may safely do the latter, but top of branch may
1796  *      require special attention - pageout below the truncation point
1797  *      might try to populate it.
1798  *
1799  *      We atomically detach the top of branch from the tree, store the
1800  *      block number of its root in *@top, pointers to buffer_heads of
1801  *      partially truncated blocks - in @chain[].bh and pointers to
1802  *      their last elements that should not be removed - in
1803  *      @chain[].p. Return value is the pointer to last filled element
1804  *      of @chain.
1805  *
1806  *      The work left to caller to do the actual freeing of subtrees:
1807  *              a) free the subtree starting from *@top
1808  *              b) free the subtrees whose roots are stored in
1809  *                      (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1810  *              c) free the subtrees growing from the inode past the @chain[0].
1811  *                      (no partially truncated stuff there).  */
1812
static Indirect *ext3_find_shared(struct inode *inode,
                                int depth,
                                int offsets[4],
                                Indirect chain[4],
                                u32 *top)
{
        Indirect *partial, *p;
        int k, err;

        *top = 0;
        /* Make k index the deepest non-null offset + 1 */
        for (k = depth; k > 1 && !offsets[k-1]; k--)
                ;
        partial = ext3_get_branch(inode, k, offsets, chain, &err);
        /* Writer: pointers */
        if (!partial)
                partial = chain + k-1;
        /*
         * If the branch acquired continuation since we've looked at it -
         * fine, it should all survive and (new) top doesn't belong to us.
         */
        if (!partial->key && *partial->p)
                /* Writer: end */
                goto no_top;
        /* Walk back up past levels whose tail (p->p onwards) is all zero */
        for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--)
                ;
        /*
         * OK, we've found the last block that must survive. The rest of our
         * branch should be detached before unlocking. However, if that rest
         * of branch is all ours and does not grow immediately from the inode
         * it's easier to cheat and just decrement partial->p.
         */
        if (p == chain + k - 1 && p > chain) {
                p->p--;
        } else {
                *top = *p->p;
                /* Nope, don't do this in ext3.  Must leave the tree intact */
#if 0
                *p->p = 0;
#endif
        }
        /* Writer: end */

        /* Drop buffer references for the levels below the surviving block */
        while(partial > p)
        {
                brelse(partial->bh);
                partial--;
        }
no_top:
        return partial;
}
1864
1865 /*
1866  * Zero a number of block pointers in either an inode or an indirect block.
1867  * If we restart the transaction we must again get write access to the
1868  * indirect block for further modification.
1869  *
1870  * We release `count' blocks on disk, but (last - first) may be greater
1871  * than `count' because there can be holes in there.
1872  */
static void
ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
                unsigned long block_to_free, unsigned long count,
                u32 *first, u32 *last)
{
        u32 *p;
        /*
         * If the handle is short on journal credits, flush what we have
         * and restart the transaction; we must then retake write access
         * to the indirect block (see comment above the function).
         */
        if (try_to_extend_transaction(handle, inode)) {
                if (bh) {
                        BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
                        ext3_journal_dirty_metadata(handle, bh);
                }
                ext3_mark_inode_dirty(handle, inode);
                ext3_journal_test_restart(handle, inode);
                if (bh) {
                        BUFFER_TRACE(bh, "retaking write access");
                        ext3_journal_get_write_access(handle, bh);
                }
        }

        /*
         * Any buffers which are on the journal will be in memory. We find
         * them on the hash table so journal_revoke() will run journal_forget()
         * on them.  We've already detached each block from the file, so
         * bforget() in journal_forget() should be safe.
         *
         * AKPM: turn on bforget in journal_forget()!!!
         */
        for (p = first; p < last; p++) {
                u32 nr = le32_to_cpu(*p);
                if (nr) {
                        struct buffer_head *bh;

                        /* Zero the pointer first, then forget the block */
                        *p = 0;
                        bh = sb_find_get_block(inode->i_sb, nr);
                        ext3_forget(handle, 0, inode, bh, nr);
                }
        }

        ext3_free_blocks(handle, inode, block_to_free, count);
}
1913
1914 /**
1915  * ext3_free_data - free a list of data blocks
1916  * @handle:     handle for this transaction
1917  * @inode:      inode we are dealing with
1918  * @this_bh:    indirect buffer_head which contains *@first and *@last
1919  * @first:      array of block numbers
1920  * @last:       points immediately past the end of array
1921  *
 * We are freeing all blocks referred from that array (numbers are stored as
1923  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1924  *
1925  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
1926  * blocks are contiguous then releasing them at one time will only affect one
1927  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1928  * actually use a lot of journal space.
1929  *
1930  * @this_bh will be %NULL if @first and @last point into the inode's direct
1931  * block pointers.
1932  */
static void ext3_free_data(handle_t *handle, struct inode *inode,
                           struct buffer_head *this_bh, u32 *first, u32 *last)
{
        unsigned long block_to_free = 0;    /* Starting block # of a run */
        unsigned long count = 0;            /* Number of blocks in the run */
        u32 *block_to_free_p = NULL;        /* Pointer into inode/ind
                                               corresponding to
                                               block_to_free */
        unsigned long nr;                   /* Current block # */
        u32 *p;                             /* Pointer into inode/ind
                                               for current block */
        int err;

        if (this_bh) {                          /* For indirect block */
                BUFFER_TRACE(this_bh, "get_write_access");
                err = ext3_journal_get_write_access(handle, this_bh);
                /* Important: if we can't update the indirect pointers
                 * to the blocks, we can't free them. */
                if (err)
                        return;
        }

        for (p = first; p < last; p++) {
                nr = le32_to_cpu(*p);
                if (nr) {
                        /* accumulate blocks to free if they're contiguous */
                        if (count == 0) {
                                /* First block of a new run */
                                block_to_free = nr;
                                block_to_free_p = p;
                                count = 1;
                        } else if (nr == block_to_free + count) {
                                /* Extends the current contiguous run */
                                count++;
                        } else {
                                /* Run broken: release it, start a new one */
                                ext3_clear_blocks(handle, inode, this_bh, 
                                                  block_to_free,
                                                  count, block_to_free_p, p);
                                block_to_free = nr;
                                block_to_free_p = p;
                                count = 1;
                        }
                }
        }

        /* Release the final accumulated run, if any */
        if (count > 0)
                ext3_clear_blocks(handle, inode, this_bh, block_to_free,
                                  count, block_to_free_p, p);

        if (this_bh) {
                BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
                ext3_journal_dirty_metadata(handle, this_bh);
        }
}
1985
1986 /**
1987  *      ext3_free_branches - free an array of branches
1988  *      @handle: JBD handle for this transaction
1989  *      @inode: inode we are dealing with
1990  *      @parent_bh: the buffer_head which contains *@first and *@last
1991  *      @first: array of block numbers
1992  *      @last:  pointer immediately past the end of array
1993  *      @depth: depth of the branches to free
1994  *
 *      We are freeing all blocks referred from these branches (numbers are
1996  *      stored as little-endian 32-bit) and updating @inode->i_blocks
1997  *      appropriately.
1998  */
static void ext3_free_branches(handle_t *handle, struct inode *inode,
                               struct buffer_head *parent_bh,
                               u32 *first, u32 *last, int depth)
{
        unsigned long nr;
        u32 *p;

        if (is_handle_aborted(handle))
                return;

        if (depth--) {
                /* Indirect level: recurse into each child, right to left */
                struct buffer_head *bh;
                int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
                p = last;
                while (--p >= first) {
                        nr = le32_to_cpu(*p);
                        if (!nr)
                                continue;               /* A hole */

                        /* Go read the buffer for the next level down */
                        bh = sb_bread(inode->i_sb, nr);

                        /*
                         * A read failure? Report error and clear slot
                         * (should be rare).
                         */
                        if (!bh) {
                                ext3_error(inode->i_sb, "ext3_free_branches",
                                           "Read failure, inode=%ld, block=%ld",
                                           inode->i_ino, nr);
                                continue;
                        }

                        /* This zaps the entire block.  Bottom up. */
                        BUFFER_TRACE(bh, "free child branches");
                        ext3_free_branches(handle, inode, bh, (u32*)bh->b_data,
                                           (u32*)bh->b_data + addr_per_block,
                                           depth);

                        /*
                         * We've probably journalled the indirect block several
                         * times during the truncate.  But it's no longer
                         * needed and we now drop it from the transaction via
                         * journal_revoke().
                         *
                         * That's easy if it's exclusively part of this
                         * transaction.  But if it's part of the committing
                         * transaction then journal_forget() will simply
                         * brelse() it.  That means that if the underlying
                         * block is reallocated in ext3_get_block(),
                         * unmap_underlying_metadata() will find this block
                         * and will try to get rid of it.  damn, damn.
                         *
                         * If this block has already been committed to the
                         * journal, a revoke record will be written.  And
                         * revoke records must be emitted *before* clearing
                         * this block's bit in the bitmaps.
                         */
                        ext3_forget(handle, 1, inode, bh, bh->b_blocknr);

                        /*
                         * Everything below this pointer has been
                         * released.  Now let this top-of-subtree go.
                         *
                         * We want the freeing of this indirect block to be
                         * atomic in the journal with the updating of the
                         * bitmap block which owns it.  So make some room in
                         * the journal.
                         *
                         * We zero the parent pointer *after* freeing its
                         * pointee in the bitmaps, so if extend_transaction()
                         * for some reason fails to put the bitmap changes and
                         * the release into the same transaction, recovery
                         * will merely complain about releasing a free block,
                         * rather than leaking blocks.
                         */
                        if (is_handle_aborted(handle))
                                return;
                        if (try_to_extend_transaction(handle, inode)) {
                                ext3_mark_inode_dirty(handle, inode);
                                ext3_journal_test_restart(handle, inode);
                        }

                        ext3_free_blocks(handle, inode, nr, 1);

                        if (parent_bh) {
                                /*
                                 * The block which we have just freed is
                                 * pointed to by an indirect block: journal it
                                 */
                                BUFFER_TRACE(parent_bh, "get_write_access");
                                if (!ext3_journal_get_write_access(handle,
                                                                   parent_bh)){
                                        *p = 0;
                                        BUFFER_TRACE(parent_bh,
                                        "call ext3_journal_dirty_metadata");
                                        ext3_journal_dirty_metadata(handle, 
                                                                    parent_bh);
                                }
                        }
                }
        } else {
                /* We have reached the bottom of the tree. */
                BUFFER_TRACE(parent_bh, "free data blocks");
                ext3_free_data(handle, inode, parent_bh, first, last);
        }
}
2106
2107 /*
2108  * ext3_truncate()
2109  *
2110  * We block out ext3_get_block() block instantiations across the entire
2111  * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2112  * simultaneously on behalf of the same inode.
2113  *
 * As we work through the truncate and commit bits of it to the journal there
2115  * is one core, guiding principle: the file's tree must always be consistent on
2116  * disk.  We must be able to restart the truncate after a crash.
2117  *
2118  * The file's tree may be transiently inconsistent in memory (although it
2119  * probably isn't), but whenever we close off and commit a journal transaction,
2120  * the contents of (the filesystem + the journal) must be consistent and
2121  * restartable.  It's pretty simple, really: bottom up, right to left (although
2122  * left-to-right works OK too).
2123  *
2124  * Note that at recovery time, journal replay occurs *before* the restart of
2125  * truncate against the orphan inode list.
2126  *
2127  * The committed inode has the new, desired i_size (which is the same as
2128  * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
2129  * that this inode's truncate did not complete and it will again call
2130  * ext3_truncate() to have another go.  So there will be instantiated blocks
2131  * to the right of the truncation point in a crashed ext3 filesystem.  But
2132  * that's fine - as long as they are linked from the inode, the post-crash
2133  * ext3_truncate() run will find them and release them.
2134  */
2135
void ext3_truncate(struct inode * inode)
{
        handle_t *handle;
        struct ext3_inode_info *ei = EXT3_I(inode);
        u32 *i_data = ei->i_data;
        int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
        struct address_space *mapping = inode->i_mapping;
        int offsets[4];
        Indirect chain[4];
        Indirect *partial;
        int nr = 0;
        int n;
        long last_block;
        unsigned blocksize = inode->i_sb->s_blocksize;
        struct page *page;

        /* Only regular files, directories and symlinks have block trees */
        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
            S_ISLNK(inode->i_mode)))
                return;
        /* Fast symlinks keep their target inside the inode: nothing to free */
        if (ext3_inode_is_fast_symlink(inode))
                return;
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return;

        ext3_discard_prealloc(inode);

        /*
         * We have to lock the EOF page here, because lock_page() nests
         * outside journal_start().
         */
        if ((inode->i_size & (blocksize - 1)) == 0) {
                /* Block boundary? Nothing to do */
                page = NULL;
        } else {
                page = grab_cache_page(mapping,
                                inode->i_size >> PAGE_CACHE_SHIFT);
                if (!page)
                        return;
        }

        handle = start_transaction(inode);
        if (IS_ERR(handle)) {
                /* Couldn't start a transaction: clear and release the page */
                if (page) {
                        clear_highpage(page);
                        flush_dcache_page(page);
                        unlock_page(page);
                        page_cache_release(page);
                }
                return;         /* AKPM: return what? */
        }

        /* First block past the (rounded-up) new EOF */
        last_block = (inode->i_size + blocksize-1)
                                        >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);

        if (page)
                ext3_block_truncate_page(handle, page, mapping, inode->i_size);

        n = ext3_block_to_path(inode, last_block, offsets, NULL);
        if (n == 0)
                goto out_stop;  /* error */

        /*
         * OK.  This truncate is going to happen.  We add the inode to the
         * orphan list, so that if this truncate spans multiple transactions,
         * and we crash, we will resume the truncate when the filesystem
         * recovers.  It also marks the inode dirty, to catch the new size.
         *
         * Implication: the file must always be in a sane, consistent
         * truncatable state while each transaction commits.
         */
        if (ext3_orphan_add(handle, inode))
                goto out_stop;

        /*
         * The orphan list entry will now protect us from any crash which
         * occurs before the truncate completes, so it is now safe to propagate
         * the new, shorter inode size (held for now in i_size) into the
         * on-disk inode. We do this via i_disksize, which is the value which
         * ext3 *really* writes onto the disk inode.
         */
        ei->i_disksize = inode->i_size;

        /*
         * From here we block out all ext3_get_block() callers who want to
         * modify the block allocation tree.
         */
        down(&ei->truncate_sem);

        if (n == 1) {           /* direct blocks */
                ext3_free_data(handle, inode, NULL, i_data+offsets[0],
                               i_data + EXT3_NDIR_BLOCKS);
                goto do_indirects;
        }

        partial = ext3_find_shared(inode, n, offsets, chain, &nr);
        /* Kill the top of shared branch (not detached) */
        if (nr) {
                if (partial == chain) {
                        /* Shared branch grows from the inode */
                        ext3_free_branches(handle, inode, NULL,
                                           &nr, &nr+1, (chain+n-1) - partial);
                        *partial->p = 0;
                        /*
                         * We mark the inode dirty prior to restart,
                         * and prior to stop.  No need for it here.
                         */
                } else {
                        /* Shared branch grows from an indirect block */
                        BUFFER_TRACE(partial->bh, "get_write_access");
                        ext3_free_branches(handle, inode, partial->bh,
                                        partial->p,
                                        partial->p+1, (chain+n-1) - partial);
                }
        }
        /* Clear the ends of indirect blocks on the shared branch */
        while (partial > chain) {
                ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
                                   (u32*)partial->bh->b_data + addr_per_block,
                                   (chain+n-1) - partial);
                BUFFER_TRACE(partial->bh, "call brelse");
                brelse (partial->bh);
                partial--;
        }
do_indirects:
        /* Kill the remaining (whole) subtrees.  The cases deliberately
         * cascade: starting at the deepest level still in use, free every
         * whole indirect tree above it. */
        switch (offsets[0]) {
                default:
                        nr = i_data[EXT3_IND_BLOCK];
                        if (nr) {
                                ext3_free_branches(handle, inode, NULL,
                                                   &nr, &nr+1, 1);
                                i_data[EXT3_IND_BLOCK] = 0;
                        }
                        /* fall through */
                case EXT3_IND_BLOCK:
                        nr = i_data[EXT3_DIND_BLOCK];
                        if (nr) {
                                ext3_free_branches(handle, inode, NULL,
                                                   &nr, &nr+1, 2);
                                i_data[EXT3_DIND_BLOCK] = 0;
                        }
                        /* fall through */
                case EXT3_DIND_BLOCK:
                        nr = i_data[EXT3_TIND_BLOCK];
                        if (nr) {
                                ext3_free_branches(handle, inode, NULL,
                                                   &nr, &nr+1, 3);
                                i_data[EXT3_TIND_BLOCK] = 0;
                        }
                        /* fall through */
                case EXT3_TIND_BLOCK:
                        ;
        }
        up(&ei->truncate_sem);
        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        ext3_mark_inode_dirty(handle, inode);

        /* In a multi-transaction truncate, we only make the final
         * transaction synchronous */
        if (IS_SYNC(inode))
                handle->h_sync = 1;
out_stop:
        /*
         * If this was a simple ftruncate(), and the file will remain alive
         * then we need to clear up the orphan record which we created above.
         * However, if this was a real unlink then we were called by
         * ext3_delete_inode(), and we allow that function to clean up the
         * orphan info for us.
         */
        if (inode->i_nlink)
                ext3_orphan_del(handle, inode);

        ext3_journal_stop(handle);
}
2307
2308 static unsigned long ext3_get_inode_block(struct super_block *sb,
2309                 unsigned long ino, struct ext3_iloc *iloc)
2310 {
2311         unsigned long desc, group_desc, block_group;
2312         unsigned long offset, block;
2313         struct buffer_head *bh;
2314         struct ext3_group_desc * gdp;
2315
2316         if ((ino != EXT3_ROOT_INO &&
2317                 ino != EXT3_JOURNAL_INO &&
2318                 ino < EXT3_FIRST_INO(sb)) ||
2319                 ino > le32_to_cpu(
2320                         EXT3_SB(sb)->s_es->s_inodes_count)) {
2321                 ext3_error (sb, "ext3_get_inode_block",
2322                             "bad inode number: %lu", ino);
2323                 return 0;
2324         }
2325         block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2326         if (block_group >= EXT3_SB(sb)->s_groups_count) {
2327                 ext3_error (sb, "ext3_get_inode_block",
2328                             "group >= groups count");
2329                 return 0;
2330         }
2331         group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
2332         desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
2333         bh = EXT3_SB(sb)->s_group_desc[group_desc];
2334         if (!bh) {
2335                 ext3_error (sb, "ext3_get_inode_block",
2336                             "Descriptor not loaded");
2337                 return 0;
2338         }
2339
2340         gdp = (struct ext3_group_desc *) bh->b_data;
2341         /*
2342          * Figure out the offset within the block group inode table
2343          */
2344         offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2345                 EXT3_INODE_SIZE(sb);
2346         block = le32_to_cpu(gdp[desc].bg_inode_table) +
2347                 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2348
2349         iloc->block_group = block_group;
2350         iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2351         return block;
2352 }
2353
2354 /* 
2355  * ext3_get_inode_loc returns with an extra refcount against the inode's
2356  * underlying buffer_head on success.  If `in_mem' is false then we're purely
2357  * trying to determine the inode's location on-disk and no read need be
2358  * performed.
2359  */
static int ext3_get_inode_loc(struct inode *inode,
				struct ext3_iloc *iloc, int in_mem)
{
	unsigned long block;
	struct buffer_head *bh;

	/* Map the inode number to its on-disk block; fills iloc's
	 * block_group and offset fields as a side effect. */
	block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
	if (!block)
		return -EIO;

	bh = sb_getblk(inode->i_sb, block);
	if (!bh) {
		ext3_error (inode->i_sb, "ext3_get_inode_loc",
				"unable to read inode block - "
				"inode=%lu, block=%lu", inode->i_ino, block);
		return -EIO;
	}
	if (!buffer_uptodate(bh)) {
		lock_buffer(bh);
		if (buffer_uptodate(bh)) {
			/* someone brought it uptodate while we waited */
			unlock_buffer(bh);
			goto has_buffer;
		}

		/* we can't skip I/O if inode is on a disk only */
		if (in_mem) {
			struct buffer_head *bitmap_bh;
			struct ext3_group_desc *desc;
			int inodes_per_buffer;
			int inode_offset, i;
			int block_group;
			int start;

			/*
			 * If this is the only valid inode in the block we
			 * need not read the block.
			 */
			block_group = (inode->i_ino - 1) /
					EXT3_INODES_PER_GROUP(inode->i_sb);
			inodes_per_buffer = bh->b_size /
				EXT3_INODE_SIZE(inode->i_sb);
			inode_offset = ((inode->i_ino - 1) %
					EXT3_INODES_PER_GROUP(inode->i_sb));
			/* first inode index sharing this buffer */
			start = inode_offset & ~(inodes_per_buffer - 1);

			/* Is the inode bitmap in cache? */
			desc = ext3_get_group_desc(inode->i_sb,
						block_group, NULL);
			if (!desc)
				goto make_io;

			bitmap_bh = sb_getblk(inode->i_sb,
					le32_to_cpu(desc->bg_inode_bitmap));
			if (!bitmap_bh)
				goto make_io;

			/*
			 * If the inode bitmap isn't in cache then the
			 * optimisation may end up performing two reads instead
			 * of one, so skip it.
			 */
			if (!buffer_uptodate(bitmap_bh)) {
				brelse(bitmap_bh);
				goto make_io;
			}
			/* Any other in-use inode in this buffer forces a
			 * real read; our own slot is ignored. */
			for (i = start; i < start + inodes_per_buffer; i++) {
				if (i == inode_offset)
					continue;
				if (ext3_test_bit(i, bitmap_bh->b_data))
					break;
			}
			brelse(bitmap_bh);
			if (i == start + inodes_per_buffer) {
				/* all other inodes are free, so skip I/O */
				memset(bh->b_data, 0, bh->b_size);
				set_buffer_uptodate(bh);
				unlock_buffer(bh);
				goto has_buffer;
			}
		}

make_io:
		/*
		 * There are other valid inodes in the buffer, so we must
		 * read the block from disk
		 */
		get_bh(bh);
		bh->b_end_io = end_buffer_read_sync;
		submit_bh(READ, bh);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			ext3_error(inode->i_sb, "ext3_get_inode_loc",
					"unable to read inode block - "
					"inode=%lu, block=%lu",
					inode->i_ino, block);
			brelse(bh);
			return -EIO;
		}
	}
has_buffer:
	/* caller inherits our reference on bh */
	iloc->bh = bh;
	return 0;
}
2464
2465 void ext3_set_inode_flags(struct inode *inode)
2466 {
2467         unsigned int flags = EXT3_I(inode)->i_flags;
2468
2469         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2470         if (flags & EXT3_SYNC_FL)
2471                 inode->i_flags |= S_SYNC;
2472         if (flags & EXT3_APPEND_FL)
2473                 inode->i_flags |= S_APPEND;
2474         if (flags & EXT3_IMMUTABLE_FL)
2475                 inode->i_flags |= S_IMMUTABLE;
2476         if (flags & EXT3_NOATIME_FL)
2477                 inode->i_flags |= S_NOATIME;
2478         if (flags & EXT3_DIRSYNC_FL)
2479                 inode->i_flags |= S_DIRSYNC;
2480 }
2481
/*
 * Read the on-disk raw inode and populate the in-core VFS inode and
 * ext3-private info.  On any failure the inode is marked bad.
 */
void ext3_read_inode(struct inode * inode)
{
	struct ext3_iloc iloc;
	struct ext3_inode *raw_inode;
	struct ext3_inode_info *ei = EXT3_I(inode);
	struct buffer_head *bh;
	int block;

#ifdef CONFIG_EXT3_FS_POSIX_ACL
	/* ACLs are loaded lazily; start with the "not cached" sentinel */
	ei->i_acl = EXT3_ACL_NOT_CACHED;
	ei->i_default_acl = EXT3_ACL_NOT_CACHED;
#endif
	/* in_mem=0: we must actually read the inode block from disk */
	if (ext3_get_inode_loc(inode, &iloc, 0))
		goto bad_inode;
	bh = iloc.bh;
	raw_inode = ext3_raw_inode(&iloc);
	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
	if(!(test_opt (inode->i_sb, NO_UID32))) {
		/* 32-bit ids: combine the low and high 16-bit halves */
		inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
		inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
	}
	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
	inode->i_size = le32_to_cpu(raw_inode->i_size);
	inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
	inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
	inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
	/* on-disk timestamps have only one-second granularity */
	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;

	ei->i_state = 0;
	ei->i_next_alloc_block = 0;
	ei->i_next_alloc_goal = 0;
	ei->i_dir_start_lookup = 0;
	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
	/* We now have enough fields to check if the inode was active or not.
	 * This is needed because nfsd might try to access dead inodes
	 * the test is that same one that e2fsck uses
	 * NeilBrown 1999oct15
	 */
	if (inode->i_nlink == 0) {
		if (inode->i_mode == 0 ||
		    !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
			/* this inode is deleted */
			brelse (bh);
			goto bad_inode;
		}
		/* The only unlinked inodes we let through here have
		 * valid i_mode and are being read by the orphan
		 * recovery code: that's fine, we're about to complete
		 * the process of deleting those. */
	}
	inode->i_blksize = PAGE_SIZE;	/* This is the optimal IO size
					 * (for stat), not the fs block
					 * size */  
	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
#ifdef EXT3_FRAGMENTS
	ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
	ei->i_frag_no = raw_inode->i_frag;
	ei->i_frag_size = raw_inode->i_fsize;
#endif
	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
	if (!S_ISREG(inode->i_mode)) {
		/* for non-regular files this field is the directory ACL */
		ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
	} else {
		/* for regular files it holds the high 32 bits of i_size */
		inode->i_size |=
			((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
	}
	ei->i_disksize = inode->i_size;
	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
#ifdef EXT3_PREALLOCATE
	ei->i_prealloc_count = 0;
#endif
	ei->i_block_group = iloc.block_group;

	/*
	 * NOTE! The in-memory inode i_data array is in little-endian order
	 * even on big-endian machines: we do NOT byteswap the block numbers!
	 */
	for (block = 0; block < EXT3_N_BLOCKS; block++)
		ei->i_data[block] = raw_inode->i_block[block];
	INIT_LIST_HEAD(&ei->i_orphan);

	/* Wire up the operation vectors appropriate to the file type */
	if (S_ISREG(inode->i_mode)) {
		inode->i_op = &ext3_file_inode_operations;
		inode->i_fop = &ext3_file_operations;
		ext3_set_aops(inode);
	} else if (S_ISDIR(inode->i_mode)) {
		inode->i_op = &ext3_dir_inode_operations;
		inode->i_fop = &ext3_dir_operations;
	} else if (S_ISLNK(inode->i_mode)) {
		/* fast symlinks keep the target inside i_data itself */
		if (ext3_inode_is_fast_symlink(inode))
			inode->i_op = &ext3_fast_symlink_inode_operations;
		else {
			inode->i_op = &ext3_symlink_inode_operations;
			ext3_set_aops(inode);
		}
	} else {
		/* device/fifo/socket: dev number may be in the old format
		 * (i_block[0]) or the new format (i_block[1]) */
		inode->i_op = &ext3_special_inode_operations;
		if (raw_inode->i_block[0])
			init_special_inode(inode, inode->i_mode,
			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
		else 
			init_special_inode(inode, inode->i_mode,
			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
	}
	brelse (iloc.bh);
	ext3_set_inode_flags(inode);
	return;

bad_inode:
	make_bad_inode(inode);
	return;
}
2597
2598 /*
2599  * Post the struct inode info into an on-disk inode location in the
2600  * buffer-cache.  This gobbles the caller's reference to the
2601  * buffer_head in the inode location struct.
2602  *
2603  * The caller must have write access to iloc->bh.
2604  */
static int ext3_do_update_inode(handle_t *handle, 
				struct inode *inode, 
				struct ext3_iloc *iloc)
{
	struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
	struct ext3_inode_info *ei = EXT3_I(inode);
	struct buffer_head *bh = iloc->bh;
	int err = 0, rc, block;

	/* For fields not tracked in the in-memory inode,
	 * initialise them to zero for new inodes. */
	if (ei->i_state & EXT3_STATE_NEW)
		memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);

	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
	if(!(test_opt(inode->i_sb, NO_UID32))) {
		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
/*
 * Fix up interoperability with old kernels. Otherwise, old inodes get
 * re-used with the upper 16 bits of the uid/gid intact
 */
		if(!ei->i_dtime) {
			raw_inode->i_uid_high =
				cpu_to_le16(high_16_bits(inode->i_uid));
			raw_inode->i_gid_high =
				cpu_to_le16(high_16_bits(inode->i_gid));
		} else {
			raw_inode->i_uid_high = 0;
			raw_inode->i_gid_high = 0;
		}
	} else {
		/* NO_UID32 mount option: squash ids into 16 bits */
		raw_inode->i_uid_low =
			cpu_to_le16(fs_high2lowuid(inode->i_uid));
		raw_inode->i_gid_low =
			cpu_to_le16(fs_high2lowgid(inode->i_gid));
		raw_inode->i_uid_high = 0;
		raw_inode->i_gid_high = 0;
	}
	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
	/* write i_disksize, not i_size: only blocks actually allocated
	 * on disk may be exposed after a crash */
	raw_inode->i_size = cpu_to_le32(ei->i_disksize);
	raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
	raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
	raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
	raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
#ifdef EXT3_FRAGMENTS
	raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
	raw_inode->i_frag = ei->i_frag_no;
	raw_inode->i_fsize = ei->i_frag_size;
#endif
	raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
	if (!S_ISREG(inode->i_mode)) {
		raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
	} else {
		/* regular file: i_dir_acl slot holds the size high bits */
		raw_inode->i_size_high =
			cpu_to_le32(ei->i_disksize >> 32);
		if (ei->i_disksize > 0x7fffffffULL) {
			struct super_block *sb = inode->i_sb;
			if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
					EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
			    EXT3_SB(sb)->s_es->s_rev_level ==
					cpu_to_le32(EXT3_GOOD_OLD_REV)) {
			       /* If this is the first large file
				* created, add a flag to the superblock.
				*/
				err = ext3_journal_get_write_access(handle,
						EXT3_SB(sb)->s_sbh);
				if (err)
					goto out_brelse;
				ext3_update_dynamic_rev(sb);
				EXT3_SET_RO_COMPAT_FEATURE(sb,
					EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
				sb->s_dirt = 1;
				/* superblock feature change must hit disk
				 * before the large file does */
				handle->h_sync = 1;
				err = ext3_journal_dirty_metadata(handle,
						EXT3_SB(sb)->s_sbh);
			}
		}
	}
	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
		/* old-format dev numbers go in i_block[0], new-format
		 * ones in i_block[1]; the unused slot is zeroed */
		if (old_valid_dev(inode->i_rdev)) {
			raw_inode->i_block[0] =
				cpu_to_le32(old_encode_dev(inode->i_rdev));
			raw_inode->i_block[1] = 0;
		} else {
			raw_inode->i_block[0] = 0;
			raw_inode->i_block[1] =
				cpu_to_le32(new_encode_dev(inode->i_rdev));
			raw_inode->i_block[2] = 0;
		}
	} else for (block = 0; block < EXT3_N_BLOCKS; block++)
		raw_inode->i_block[block] = ei->i_data[block];

	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
	rc = ext3_journal_dirty_metadata(handle, bh);
	if (!err)
		err = rc;
	ei->i_state &= ~EXT3_STATE_NEW;

out_brelse:
	/* we consume the caller's reference to iloc->bh */
	brelse (bh);
	ext3_std_error(inode->i_sb, err);
	return err;
}
2712
2713 /*
2714  * ext3_write_inode()
2715  *
2716  * We are called from a few places:
2717  *
2718  * - Within generic_file_write() for O_SYNC files.
2719  *   Here, there will be no transaction running. We wait for any running
 *   transaction to commit.
2721  *
2722  * - Within sys_sync(), kupdate and such.
 *   We wait on commit, if told to.
2724  *
2725  * - Within prune_icache() (PF_MEMALLOC == true)
2726  *   Here we simply return.  We can't afford to block kswapd on the
2727  *   journal commit.
2728  *
2729  * In all cases it is actually safe for us to return without doing anything,
2730  * because the inode has been copied into a raw inode buffer in
2731  * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
2732  * knfsd.
2733  *
2734  * Note that we are absolutely dependent upon all inode dirtiers doing the
2735  * right thing: they *must* call mark_inode_dirty() after dirtying info in
2736  * which we are interested.
2737  *
2738  * It would be a bug for them to not do this.  The code:
2739  *
2740  *      mark_inode_dirty(inode)
2741  *      stuff();
2742  *      inode->i_size = expr;
2743  *
2744  * is in error because a kswapd-driven write_inode() could occur while
2745  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
2746  * will no longer be on the superblock's dirty inode list.
2747  */
2748 void ext3_write_inode(struct inode *inode, int wait)
2749 {
2750         if (current->flags & PF_MEMALLOC)
2751                 return;
2752
2753         if (ext3_journal_current_handle()) {
2754                 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2755                 dump_stack();
2756                 return;
2757         }
2758
2759         if (!wait)
2760                 return;
2761
2762         ext3_force_commit(inode->i_sb);
2763 }
2764
2765 /*
2766  * ext3_setattr()
2767  *
2768  * Called from notify_change.
2769  *
2770  * We want to trap VFS attempts to truncate the file as soon as
2771  * possible.  In particular, we want to make sure that when the VFS
2772  * shrinks i_size, we put the inode on the orphan list and modify
2773  * i_disksize immediately, so that during the subsequent flushing of
2774  * dirty pages and freeing of disk blocks, we can guarantee that any
2775  * commit will leave the blocks being flushed in an unused state on
2776  * disk.  (On recovery, the inode will get truncated and the blocks will
2777  * be freed, so we have a strong guarantee that no future commit will
2778  * leave these blocks visible to the user.)  
2779  *
2780  * Called with inode->sem down.
2781  */
int ext3_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error, rc = 0;
	const unsigned int ia_valid = attr->ia_valid;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
		(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
		handle_t *handle;

		/* (user+group)*(old+new) structure, inode write (sb,
		 * inode block, ? - but truncate inode update has it) */
		handle = ext3_journal_start(inode, 4*EXT3_QUOTA_INIT_BLOCKS+3);
		if (IS_ERR(handle)) {
			error = PTR_ERR(handle);
			goto err_out;
		}
		/* move the quota charges to the new owner */
		error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
		if (error) {
			ext3_journal_stop(handle);
			return error;
		}
		/* Update corresponding info in inode so that everything is in
		 * one transaction */
		if (attr->ia_valid & ATTR_UID)
			inode->i_uid = attr->ia_uid;
		if (attr->ia_valid & ATTR_GID)
			inode->i_gid = attr->ia_gid;
		error = ext3_mark_inode_dirty(handle, inode);
		ext3_journal_stop(handle);
	}

	if (S_ISREG(inode->i_mode) &&
	    attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
		handle_t *handle;

		handle = ext3_journal_start(inode, 3);
		if (IS_ERR(handle)) {
			error = PTR_ERR(handle);
			goto err_out;
		}

		/* Put the inode on the orphan list and shrink i_disksize
		 * now, so a crash during the coming truncate cannot expose
		 * stale blocks (recovery will finish the truncate). */
		error = ext3_orphan_add(handle, inode);
		EXT3_I(inode)->i_disksize = attr->ia_size;
		rc = ext3_mark_inode_dirty(handle, inode);
		if (!error)
			error = rc;
		ext3_journal_stop(handle);
	}

	/* inode_setattr() performs the actual truncate via ext3_truncate */
	rc = inode_setattr(inode, attr);

	/* If inode_setattr's call to ext3_truncate failed to get a
	 * transaction handle at all, we need to clean up the in-core
	 * orphan list manually. */
	if (inode->i_nlink)
		ext3_orphan_del(NULL, inode);

	if (!rc && (ia_valid & ATTR_MODE))
		rc = ext3_acl_chmod(inode);

err_out:
	ext3_std_error(inode->i_sb, error);
	if (!error)
		error = rc;
	return error;
}
2853
2854
2855 /*
2856  * akpm: how many blocks doth make a writepage()?
2857  *
2858  * With N blocks per page, it may be:
2859  * N data blocks
2860  * 2 indirect block
2861  * 2 dindirect
2862  * 1 tindirect
2863  * N+5 bitmap blocks (from the above)
2864  * N+5 group descriptor summary blocks
2865  * 1 inode block
2866  * 1 superblock.
2867  * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files
2868  *
2869  * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
2870  *
2871  * With ordered or writeback data it's the same, less the N data blocks.
2872  *
2873  * If the inode's direct blocks can hold an integral number of pages then a
2874  * page cannot straddle two indirect blocks, and we can only touch one indirect
2875  * and dindirect block, and the "5" above becomes "3".
2876  *
2877  * This still overestimates under most circumstances.  If we were to pass the
2878  * start and end offsets in here as well we could do block_to_path() on each
2879  * block and work out the exact number of indirects which are touched.  Pah.
2880  */
2881
2882 int ext3_writepage_trans_blocks(struct inode *inode)
2883 {
2884         int bpp = ext3_journal_blocks_per_page(inode);
2885         int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
2886         int ret;
2887
2888         if (ext3_should_journal_data(inode))
2889                 ret = 3 * (bpp + indirects) + 2;
2890         else
2891                 ret = 2 * (bpp + indirects) + 2;
2892
2893 #ifdef CONFIG_QUOTA
2894         /* We know that structure was already allocated during DQUOT_INIT so
2895          * we will be updating only the data blocks + inodes */
2896         ret += 2*EXT3_QUOTA_TRANS_BLOCKS;
2897 #endif
2898
2899         return ret;
2900 }
2901
2902 /*
2903  * The caller must have previously called ext3_reserve_inode_write().
 * Given this, we know that the caller already has write access to iloc->bh.
2905  */
2906 int ext3_mark_iloc_dirty(handle_t *handle,
2907                 struct inode *inode, struct ext3_iloc *iloc)
2908 {
2909         int err = 0;
2910
2911         /* the do_update_inode consumes one bh->b_count */
2912         get_bh(iloc->bh);
2913
2914         /* ext3_do_update_inode() does journal_dirty_metadata */
2915         err = ext3_do_update_inode(handle, inode, iloc);
2916         put_bh(iloc->bh);
2917         return err;
2918 }
2919
2920 /* 
 * On success, we end up with an outstanding reference count against
2922  * iloc->bh.  This _must_ be cleaned up later. 
2923  */
2924
2925 int
2926 ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 
2927                          struct ext3_iloc *iloc)
2928 {
2929         int err = 0;
2930         if (handle) {
2931                 err = ext3_get_inode_loc(inode, iloc, 1);
2932                 if (!err) {
2933                         BUFFER_TRACE(iloc->bh, "get_write_access");
2934                         err = ext3_journal_get_write_access(handle, iloc->bh);
2935                         if (err) {
2936                                 brelse(iloc->bh);
2937                                 iloc->bh = NULL;
2938                         }
2939                 }
2940         }
2941         ext3_std_error(inode->i_sb, err);
2942         return err;
2943 }
2944
2945 /*
2946  * akpm: What we do here is to mark the in-core inode as clean
2947  * with respect to inode dirtiness (it may still be data-dirty).
2948  * This means that the in-core inode may be reaped by prune_icache
2949  * without having to perform any I/O.  This is a very good thing,
2950  * because *any* task may call prune_icache - even ones which
2951  * have a transaction open against a different journal.
2952  *
2953  * Is this cheating?  Not really.  Sure, we haven't written the
2954  * inode out, but prune_icache isn't a user-visible syncing function.
2955  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
2956  * we start and wait on commits.
2957  *
2958  * Is this efficient/effective?  Well, we're being nice to the system
2959  * by cleaning up our inodes proactively so they can be reaped
2960  * without I/O.  But we are potentially leaving up to five seconds'
2961  * worth of inodes floating about which prune_icache wants us to
2962  * write out.  One way to fix that would be to get prune_icache()
2963  * to do a write_super() to free up some memory.  It has the desired
2964  * effect.
2965  */
2966 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
2967 {
2968         struct ext3_iloc iloc;
2969         int err;
2970
2971         might_sleep();
2972         err = ext3_reserve_inode_write(handle, inode, &iloc);
2973         if (!err)
2974                 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2975         return err;
2976 }
2977
2978 /*
2979  * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
2980  *
2981  * We're really interested in the case where a file is being extended.
2982  * i_size has been changed by generic_commit_write() and we thus need
2983  * to include the updated inode in the current transaction.
2984  *
2985  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
2986  * are allocated to the file.
2987  *
2988  * If the inode is marked synchronous, we don't honour that here - doing
2989  * so would cause a commit on atime updates, which we don't bother doing.
2990  * We handle synchronous inodes at the highest possible level.
2991  */
2992 void ext3_dirty_inode(struct inode *inode)
2993 {
2994         handle_t *current_handle = ext3_journal_current_handle();
2995         handle_t *handle;
2996
2997         handle = ext3_journal_start(inode, 2);
2998         if (IS_ERR(handle))
2999                 goto out;
3000         if (current_handle &&
3001                 current_handle->h_transaction != handle->h_transaction) {
3002                 /* This task has a transaction open against a different fs */
3003                 printk(KERN_EMERG "%s: transactions do not match!\n",
3004                        __FUNCTION__);
3005         } else {
3006                 jbd_debug(5, "marking dirty.  outer handle=%p\n",
3007                                 current_handle);
3008                 ext3_mark_inode_dirty(handle, inode);
3009         }
3010         ext3_journal_stop(handle);
3011 out:
3012         return;
3013 }
3014
#ifdef AKPM
/* 
 * Bind an inode's backing buffer_head into this transaction, to prevent
 * it from being flushed to disk early.  Unlike
 * ext3_reserve_inode_write, this leaves behind no bh reference and
 * returns no iloc structure, so the caller needs to repeat the iloc
 * lookup to mark the inode dirty later.
 *
 * NOTE: this whole function is compiled out unless AKPM is defined —
 * it is effectively dead code kept for reference.
 *
 * Returns 0 on success or a negative error; errors are also reported
 * via ext3_std_error().  A NULL handle is a silent no-op (returns 0).
 */
static inline int
ext3_pin_inode(handle_t *handle, struct inode *inode)
{
        struct ext3_iloc iloc;

        int err = 0;
        if (handle) {
                /* in_mem=1: look up the inode's buffer location */
                err = ext3_get_inode_loc(inode, &iloc, 1);
                if (!err) {
                        BUFFER_TRACE(iloc.bh, "get_write_access");
                        err = journal_get_write_access(handle, iloc.bh);
                        if (!err)
                                err = ext3_journal_dirty_metadata(handle, 
                                                                  iloc.bh);
                        /* drop the reference taken by ext3_get_inode_loc */
                        brelse(iloc.bh);
                }
        }
        ext3_std_error(inode->i_sb, err);
        return err;
}
#endif
3044
3045 int ext3_change_inode_journal_flag(struct inode *inode, int val)
3046 {
3047         journal_t *journal;
3048         handle_t *handle;
3049         int err;
3050
3051         /*
3052          * We have to be very careful here: changing a data block's
3053          * journaling status dynamically is dangerous.  If we write a
3054          * data block to the journal, change the status and then delete
3055          * that block, we risk forgetting to revoke the old log record
3056          * from the journal and so a subsequent replay can corrupt data.
3057          * So, first we make sure that the journal is empty and that
3058          * nobody is changing anything.
3059          */
3060
3061         journal = EXT3_JOURNAL(inode);
3062         if (is_journal_aborted(journal) || IS_RDONLY(inode))
3063                 return -EROFS;
3064
3065         journal_lock_updates(journal);
3066         journal_flush(journal);
3067
3068         /*
3069          * OK, there are no updates running now, and all cached data is
3070          * synced to disk.  We are now in a completely consistent state
3071          * which doesn't have anything in the journal, and we know that
3072          * no filesystem updates are running, so it is safe to modify
3073          * the inode's in-core data-journaling state flag now.
3074          */
3075
3076         if (val)
3077                 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3078         else
3079                 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
3080         ext3_set_aops(inode);
3081
3082         journal_unlock_updates(journal);
3083
3084         /* Finally we can mark the inode as dirty. */
3085
3086         handle = ext3_journal_start(inode, 1);
3087         if (IS_ERR(handle))
3088                 return PTR_ERR(handle);
3089
3090         err = ext3_mark_inode_dirty(handle, inode);
3091         handle->h_sync = 1;
3092         ext3_journal_stop(handle);
3093         ext3_std_error(inode->i_sb, err);
3094
3095         return err;
3096 }