vserver 1.9.5.x5
[linux-2.6.git] / fs / ext3 / inode.c
1 /*
2  *  linux/fs/ext3/inode.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card (card@masi.ibp.fr)
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  *  from
10  *
11  *  linux/fs/minix/inode.c
12  *
13  *  Copyright (C) 1991, 1992  Linus Torvalds
14  *
15  *  Goal-directed block allocation by Stephen Tweedie
16  *      (sct@redhat.com), 1993, 1998
17  *  Big-endian to little-endian byte-swapping/bitmaps by
18  *        David S. Miller (davem@caip.rutgers.edu), 1995
19  *  64-bit file support on 64-bit platforms by Jakub Jelinek
20  *      (jj@sunsite.ms.mff.cuni.cz)
21  *
22  *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23  */
24
25 #include <linux/module.h>
26 #include <linux/fs.h>
27 #include <linux/time.h>
28 #include <linux/ext3_jbd.h>
29 #include <linux/jbd.h>
30 #include <linux/smp_lock.h>
31 #include <linux/highuid.h>
32 #include <linux/pagemap.h>
33 #include <linux/quotaops.h>
34 #include <linux/string.h>
35 #include <linux/buffer_head.h>
36 #include <linux/writeback.h>
37 #include <linux/mpage.h>
38 #include <linux/uio.h>
39 #include <linux/vserver/xid.h>
40 #include "xattr.h"
41 #include "acl.h"
42
43 static int ext3_writepage_trans_blocks(struct inode *inode);
44
45 /*
46  * Test whether an inode is a fast symlink.
47  */
48 static inline int ext3_inode_is_fast_symlink(struct inode *inode)
49 {
50         int ea_blocks = EXT3_I(inode)->i_file_acl ?
51                 (inode->i_sb->s_blocksize >> 9) : 0;
52
53         return (S_ISLNK(inode->i_mode) &&
54                 inode->i_blocks - ea_blocks == 0);
55 }
56
57 /* The ext3 forget function must perform a revoke if we are freeing data
58  * which has been journaled.  Metadata (eg. indirect blocks) must be
59  * revoked in all cases. 
60  *
61  * "bh" may be NULL: a metadata block may have been freed from memory
62  * but there may still be a record of it in the journal, and that record
63  * still needs to be revoked.
64  */
65
66 int ext3_forget(handle_t *handle, int is_metadata,
67                        struct inode *inode, struct buffer_head *bh,
68                        int blocknr)
69 {
70         int err;
71
72         might_sleep();
73
74         BUFFER_TRACE(bh, "enter");
75
76         jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
77                   "data mode %lx\n",
78                   bh, is_metadata, inode->i_mode,
79                   test_opt(inode->i_sb, DATA_FLAGS));
80
81         /* Never use the revoke function if we are doing full data
82          * journaling: there is no need to, and a V1 superblock won't
83          * support it.  Otherwise, only skip the revoke on un-journaled
84          * data blocks. */
85
86         if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
87             (!is_metadata && !ext3_should_journal_data(inode))) {
88                 if (bh) {
89                         BUFFER_TRACE(bh, "call journal_forget");
90                         return ext3_journal_forget(handle, bh);
91                 }
92                 return 0;
93         }
94
95         /*
96          * data!=journal && (is_metadata || should_journal_data(inode))
97          */
98         BUFFER_TRACE(bh, "call ext3_journal_revoke");
99         err = ext3_journal_revoke(handle, blocknr, bh);
100         if (err)
101                 ext3_abort(inode->i_sb, __FUNCTION__,
102                            "error %d when attempting revoke", err);
103         BUFFER_TRACE(bh, "exit");
104         return err;
105 }
106
107 /*
108  * Work out how many blocks we need to progress with the next chunk of a
109  * truncate transaction.
110  */
111
112 static unsigned long blocks_for_truncate(struct inode *inode) 
113 {
114         unsigned long needed;
115
116         needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
117
118         /* Give ourselves just enough room to cope with inodes in which
119          * i_blocks is corrupt: we've seen disk corruptions in the past
120          * which resulted in random data in an inode which looked enough
121          * like a regular file for ext3 to try to delete it.  Things
122          * will go a bit crazy if that happens, but at least we should
123          * try not to panic the whole kernel. */
124         if (needed < 2)
125                 needed = 2;
126
127         /* But we need to bound the transaction so we don't overflow the
128          * journal. */
129         if (needed > EXT3_MAX_TRANS_DATA) 
130                 needed = EXT3_MAX_TRANS_DATA;
131
132         return EXT3_DATA_TRANS_BLOCKS + needed;
133 }
134
135 /* 
136  * Truncate transactions can be complex and absolutely huge.  So we need to
137  * be able to restart the transaction at a conventient checkpoint to make
138  * sure we don't overflow the journal.
139  *
140  * start_transaction gets us a new handle for a truncate transaction,
141  * and extend_transaction tries to extend the existing one a bit.  If
142  * extend fails, we need to propagate the failure up and restart the
143  * transaction in the top-level truncate loop. --sct 
144  */
145
146 static handle_t *start_transaction(struct inode *inode) 
147 {
148         handle_t *result;
149
150         result = ext3_journal_start(inode, blocks_for_truncate(inode));
151         if (!IS_ERR(result))
152                 return result;
153
154         ext3_std_error(inode->i_sb, PTR_ERR(result));
155         return result;
156 }
157
158 /*
159  * Try to extend this transaction for the purposes of truncation.
160  *
161  * Returns 0 if we managed to create more room.  If we can't create more
162  * room, and the transaction must be restarted we return 1.
163  */
164 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
165 {
166         if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
167                 return 0;
168         if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
169                 return 0;
170         return 1;
171 }
172
173 /*
174  * Restart the transaction associated with *handle.  This does a commit,
175  * so before we call here everything must be consistently dirtied against
176  * this transaction.
177  */
178 static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
179 {
180         jbd_debug(2, "restarting handle %p\n", handle);
181         return ext3_journal_restart(handle, blocks_for_truncate(inode));
182 }
183
184 static void ext3_truncate_nocheck (struct inode *inode);
185
186 /*
187  * Called at the last iput() if i_nlink is zero.
188  */
189 void ext3_delete_inode (struct inode * inode)
190 {
191         handle_t *handle;
192
193         if (is_bad_inode(inode))
194                 goto no_delete;
195
196         handle = start_transaction(inode);
197         if (IS_ERR(handle)) {
198                 /* If we're going to skip the normal cleanup, we still
199                  * need to make sure that the in-core orphan linked list
200                  * is properly cleaned up. */
201                 ext3_orphan_del(NULL, inode);
202                 goto no_delete;
203         }
204
205         if (IS_SYNC(inode))
206                 handle->h_sync = 1;
207         inode->i_size = 0;
208         if (inode->i_blocks)
209                 ext3_truncate_nocheck(inode);
210         /*
211          * Kill off the orphan record which ext3_truncate created.
212          * AKPM: I think this can be inside the above `if'.
213          * Note that ext3_orphan_del() has to be able to cope with the
214          * deletion of a non-existent orphan - this is because we don't
215          * know if ext3_truncate() actually created an orphan record.
216          * (Well, we could do this if we need to, but heck - it works)
217          */
218         ext3_orphan_del(handle, inode);
219         EXT3_I(inode)->i_dtime  = get_seconds();
220
221         /* 
222          * One subtle ordering requirement: if anything has gone wrong
223          * (transaction abort, IO errors, whatever), then we can still
224          * do these next steps (the fs will already have been marked as
225          * having errors), but we can't free the inode if the mark_dirty
226          * fails.  
227          */
228         if (ext3_mark_inode_dirty(handle, inode))
229                 /* If that failed, just do the required in-core inode clear. */
230                 clear_inode(inode);
231         else
232                 ext3_free_inode(handle, inode);
233         ext3_journal_stop(handle);
234         return;
235 no_delete:
236         clear_inode(inode);     /* We must guarantee clearing of inode... */
237 }
238
239 static int ext3_alloc_block (handle_t *handle,
240                         struct inode * inode, unsigned long goal, int *err)
241 {
242         unsigned long result;
243
244         result = ext3_new_block(handle, inode, goal, err);
245         return result;
246 }
247
248
249 typedef struct {
250         __le32  *p;
251         __le32  key;
252         struct buffer_head *bh;
253 } Indirect;
254
255 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
256 {
257         p->key = *(p->p = v);
258         p->bh = bh;
259 }
260
261 static inline int verify_chain(Indirect *from, Indirect *to)
262 {
263         while (from <= to && from->key == *from->p)
264                 from++;
265         return (from > to);
266 }
267
268 /**
269  *      ext3_block_to_path - parse the block number into array of offsets
270  *      @inode: inode in question (we are only interested in its superblock)
271  *      @i_block: block number to be parsed
272  *      @offsets: array to store the offsets in
273  *      @boundary: set this non-zero if the referred-to block is likely to be
274  *             followed (on disk) by an indirect block.
275  *
276  *      To store the locations of file's data ext3 uses a data structure common
277  *      for UNIX filesystems - tree of pointers anchored in the inode, with
278  *      data blocks at leaves and indirect blocks in intermediate nodes.
279  *      This function translates the block number into path in that tree -
280  *      return value is the path length and @offsets[n] is the offset of
281  *      pointer to (n+1)th node in the nth one. If @block is out of range
282  *      (negative or too large) warning is printed and zero returned.
283  *
284  *      Note: function doesn't find node addresses, so no IO is needed. All
285  *      we need to know is the capacity of indirect blocks (taken from the
286  *      inode->i_sb).
287  */
288
289 /*
290  * Portability note: the last comparison (check that we fit into triple
291  * indirect block) is spelled differently, because otherwise on an
292  * architecture with 32-bit longs and 8Kb pages we might get into trouble
293  * if our filesystem had 8Kb blocks. We might use long long, but that would
294  * kill us on x86. Oh, well, at least the sign propagation does not matter -
295  * i_block would have to be negative in the very beginning, so we would not
296  * get there at all.
297  */
298
299 static int ext3_block_to_path(struct inode *inode,
300                         long i_block, int offsets[4], int *boundary)
301 {
302         int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
303         int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
304         const long direct_blocks = EXT3_NDIR_BLOCKS,
305                 indirect_blocks = ptrs,
306                 double_blocks = (1 << (ptrs_bits * 2));
307         int n = 0;
308         int final = 0;
309
310         if (i_block < 0) {
311                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
312         } else if (i_block < direct_blocks) {
313                 offsets[n++] = i_block;
314                 final = direct_blocks;
315         } else if ( (i_block -= direct_blocks) < indirect_blocks) {
316                 offsets[n++] = EXT3_IND_BLOCK;
317                 offsets[n++] = i_block;
318                 final = ptrs;
319         } else if ((i_block -= indirect_blocks) < double_blocks) {
320                 offsets[n++] = EXT3_DIND_BLOCK;
321                 offsets[n++] = i_block >> ptrs_bits;
322                 offsets[n++] = i_block & (ptrs - 1);
323                 final = ptrs;
324         } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
325                 offsets[n++] = EXT3_TIND_BLOCK;
326                 offsets[n++] = i_block >> (ptrs_bits * 2);
327                 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
328                 offsets[n++] = i_block & (ptrs - 1);
329                 final = ptrs;
330         } else {
331                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
332         }
333         if (boundary)
334                 *boundary = (i_block & (ptrs - 1)) == (final - 1);
335         return n;
336 }
337
338 /**
339  *      ext3_get_branch - read the chain of indirect blocks leading to data
340  *      @inode: inode in question
341  *      @depth: depth of the chain (1 - direct pointer, etc.)
342  *      @offsets: offsets of pointers in inode/indirect blocks
343  *      @chain: place to store the result
344  *      @err: here we store the error value
345  *
346  *      Function fills the array of triples <key, p, bh> and returns %NULL
347  *      if everything went OK or the pointer to the last filled triple
348  *      (incomplete one) otherwise. Upon the return chain[i].key contains
349  *      the number of (i+1)-th block in the chain (as it is stored in memory,
350  *      i.e. little-endian 32-bit), chain[i].p contains the address of that
351  *      number (it points into struct inode for i==0 and into the bh->b_data
352  *      for i>0) and chain[i].bh points to the buffer_head of i-th indirect
353  *      block for i>0 and NULL for i==0. In other words, it holds the block
354  *      numbers of the chain, addresses they were taken from (and where we can
355  *      verify that chain did not change) and buffer_heads hosting these
356  *      numbers.
357  *
358  *      Function stops when it stumbles upon zero pointer (absent block)
359  *              (pointer to last triple returned, *@err == 0)
360  *      or when it gets an IO error reading an indirect block
361  *              (ditto, *@err == -EIO)
362  *      or when it notices that chain had been changed while it was reading
363  *              (ditto, *@err == -EAGAIN)
364  *      or when it reads all @depth-1 indirect blocks successfully and finds
365  *      the whole chain, all way to the data (returns %NULL, *err == 0).
366  */
367 static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
368                                  Indirect chain[4], int *err)
369 {
370         struct super_block *sb = inode->i_sb;
371         Indirect *p = chain;
372         struct buffer_head *bh;
373
374         *err = 0;
375         /* i_data is not going away, no lock needed */
376         add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
377         if (!p->key)
378                 goto no_block;
379         while (--depth) {
380                 bh = sb_bread(sb, le32_to_cpu(p->key));
381                 if (!bh)
382                         goto failure;
383                 /* Reader: pointers */
384                 if (!verify_chain(chain, p))
385                         goto changed;
386                 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
387                 /* Reader: end */
388                 if (!p->key)
389                         goto no_block;
390         }
391         return NULL;
392
393 changed:
394         brelse(bh);
395         *err = -EAGAIN;
396         goto no_block;
397 failure:
398         *err = -EIO;
399 no_block:
400         return p;
401 }
402
403 /**
404  *      ext3_find_near - find a place for allocation with sufficient locality
405  *      @inode: owner
406  *      @ind: descriptor of indirect block.
407  *
408  *      This function returns the prefered place for block allocation.
409  *      It is used when heuristic for sequential allocation fails.
410  *      Rules are:
411  *        + if there is a block to the left of our position - allocate near it.
412  *        + if pointer will live in indirect block - allocate near that block.
413  *        + if pointer will live in inode - allocate in the same
414  *          cylinder group. 
415  *
416  * In the latter case we colour the starting block by the callers PID to
417  * prevent it from clashing with concurrent allocations for a different inode
418  * in the same block group.   The PID is used here so that functionally related
419  * files will be close-by on-disk.
420  *
421  *      Caller must make sure that @ind is valid and will stay that way.
422  */
423
424 static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
425 {
426         struct ext3_inode_info *ei = EXT3_I(inode);
427         __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
428         __le32 *p;
429         unsigned long bg_start;
430         unsigned long colour;
431
432         /* Try to find previous block */
433         for (p = ind->p - 1; p >= start; p--)
434                 if (*p)
435                         return le32_to_cpu(*p);
436
437         /* No such thing, so let's try location of indirect block */
438         if (ind->bh)
439                 return ind->bh->b_blocknr;
440
441         /*
442          * It is going to be refered from inode itself? OK, just put it into
443          * the same cylinder group then.
444          */
445         bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
446                 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
447         colour = (current->pid % 16) *
448                         (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
449         return bg_start + colour;
450 }
451
452 /**
453  *      ext3_find_goal - find a prefered place for allocation.
454  *      @inode: owner
455  *      @block:  block we want
456  *      @chain:  chain of indirect blocks
457  *      @partial: pointer to the last triple within a chain
458  *      @goal:  place to store the result.
459  *
460  *      Normally this function find the prefered place for block allocation,
461  *      stores it in *@goal and returns zero. If the branch had been changed
462  *      under us we return -EAGAIN.
463  */
464
465 static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
466                           Indirect *partial, unsigned long *goal)
467 {
468         struct ext3_inode_info *ei = EXT3_I(inode);
469         /* Writer: ->i_next_alloc* */
470         if ((block == ei->i_next_alloc_block + 1)&& ei->i_next_alloc_goal) {
471                 ei->i_next_alloc_block++;
472                 ei->i_next_alloc_goal++;
473         }
474         /* Writer: end */
475         /* Reader: pointers, ->i_next_alloc* */
476         if (verify_chain(chain, partial)) {
477                 /*
478                  * try the heuristic for sequential allocation,
479                  * failing that at least try to get decent locality.
480                  */
481                 if (block == ei->i_next_alloc_block)
482                         *goal = ei->i_next_alloc_goal;
483                 if (!*goal)
484                         *goal = ext3_find_near(inode, partial);
485                 return 0;
486         }
487         /* Reader: end */
488         return -EAGAIN;
489 }
490
491 /**
492  *      ext3_alloc_branch - allocate and set up a chain of blocks.
493  *      @inode: owner
494  *      @num: depth of the chain (number of blocks to allocate)
495  *      @offsets: offsets (in the blocks) to store the pointers to next.
496  *      @branch: place to store the chain in.
497  *
498  *      This function allocates @num blocks, zeroes out all but the last one,
499  *      links them into chain and (if we are synchronous) writes them to disk.
500  *      In other words, it prepares a branch that can be spliced onto the
501  *      inode. It stores the information about that chain in the branch[], in
502  *      the same format as ext3_get_branch() would do. We are calling it after
503  *      we had read the existing part of chain and partial points to the last
504  *      triple of that (one with zero ->key). Upon the exit we have the same
505  *      picture as after the successful ext3_get_block(), excpet that in one
506  *      place chain is disconnected - *branch->p is still zero (we did not
507  *      set the last link), but branch->key contains the number that should
508  *      be placed into *branch->p to fill that gap.
509  *
510  *      If allocation fails we free all blocks we've allocated (and forget
511  *      their buffer_heads) and return the error value the from failed
512  *      ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
513  *      as described above and return 0.
514  */
515
516 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
517                              int num,
518                              unsigned long goal,
519                              int *offsets,
520                              Indirect *branch)
521 {
522         int blocksize = inode->i_sb->s_blocksize;
523         int n = 0, keys = 0;
524         int err = 0;
525         int i;
526         int parent = ext3_alloc_block(handle, inode, goal, &err);
527
528         branch[0].key = cpu_to_le32(parent);
529         if (parent) {
530                 for (n = 1; n < num; n++) {
531                         struct buffer_head *bh;
532                         /* Allocate the next block */
533                         int nr = ext3_alloc_block(handle, inode, parent, &err);
534                         if (!nr)
535                                 break;
536                         branch[n].key = cpu_to_le32(nr);
537                         keys = n+1;
538
539                         /*
540                          * Get buffer_head for parent block, zero it out
541                          * and set the pointer to new one, then send
542                          * parent to disk.  
543                          */
544                         bh = sb_getblk(inode->i_sb, parent);
545                         branch[n].bh = bh;
546                         lock_buffer(bh);
547                         BUFFER_TRACE(bh, "call get_create_access");
548                         err = ext3_journal_get_create_access(handle, bh);
549                         if (err) {
550                                 unlock_buffer(bh);
551                                 brelse(bh);
552                                 break;
553                         }
554
555                         memset(bh->b_data, 0, blocksize);
556                         branch[n].p = (__le32*) bh->b_data + offsets[n];
557                         *branch[n].p = branch[n].key;
558                         BUFFER_TRACE(bh, "marking uptodate");
559                         set_buffer_uptodate(bh);
560                         unlock_buffer(bh);
561
562                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
563                         err = ext3_journal_dirty_metadata(handle, bh);
564                         if (err)
565                                 break;
566
567                         parent = nr;
568                 }
569         }
570         if (n == num)
571                 return 0;
572
573         /* Allocation failed, free what we already allocated */
574         for (i = 1; i < keys; i++) {
575                 BUFFER_TRACE(branch[i].bh, "call journal_forget");
576                 ext3_journal_forget(handle, branch[i].bh);
577         }
578         for (i = 0; i < keys; i++)
579                 ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
580         return err;
581 }
582
583 /**
584  *      ext3_splice_branch - splice the allocated branch onto inode.
585  *      @inode: owner
586  *      @block: (logical) number of block we are adding
587  *      @chain: chain of indirect blocks (with a missing link - see
588  *              ext3_alloc_branch)
589  *      @where: location of missing link
590  *      @num:   number of blocks we are adding
591  *
592  *      This function verifies that chain (up to the missing link) had not
593  *      changed, fills the missing link and does all housekeeping needed in
594  *      inode (->i_blocks, etc.). In case of success we end up with the full
595  *      chain to new block and return 0. Otherwise (== chain had been changed)
596  *      we free the new blocks (forgetting their buffer_heads, indeed) and
597  *      return -EAGAIN.
598  */
599
600 static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
601                               Indirect chain[4], Indirect *where, int num)
602 {
603         int i;
604         int err = 0;
605         struct ext3_inode_info *ei = EXT3_I(inode);
606
607         /*
608          * If we're splicing into a [td]indirect block (as opposed to the
609          * inode) then we need to get write access to the [td]indirect block
610          * before the splice.
611          */
612         if (where->bh) {
613                 BUFFER_TRACE(where->bh, "get_write_access");
614                 err = ext3_journal_get_write_access(handle, where->bh);
615                 if (err)
616                         goto err_out;
617         }
618         /* Verify that place we are splicing to is still there and vacant */
619
620         /* Writer: pointers, ->i_next_alloc* */
621         if (!verify_chain(chain, where-1) || *where->p)
622                 /* Writer: end */
623                 goto changed;
624
625         /* That's it */
626
627         *where->p = where->key;
628         ei->i_next_alloc_block = block;
629         ei->i_next_alloc_goal = le32_to_cpu(where[num-1].key);
630         /* Writer: end */
631
632         /* We are done with atomic stuff, now do the rest of housekeeping */
633
634         inode->i_ctime = CURRENT_TIME_SEC;
635         ext3_mark_inode_dirty(handle, inode);
636
637         /* had we spliced it onto indirect block? */
638         if (where->bh) {
639                 /*
640                  * akpm: If we spliced it onto an indirect block, we haven't
641                  * altered the inode.  Note however that if it is being spliced
642                  * onto an indirect block at the very end of the file (the
643                  * file is growing) then we *will* alter the inode to reflect
644                  * the new i_size.  But that is not done here - it is done in
645                  * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
646                  */
647                 jbd_debug(5, "splicing indirect only\n");
648                 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
649                 err = ext3_journal_dirty_metadata(handle, where->bh);
650                 if (err) 
651                         goto err_out;
652         } else {
653                 /*
654                  * OK, we spliced it into the inode itself on a direct block.
655                  * Inode was dirtied above.
656                  */
657                 jbd_debug(5, "splicing direct\n");
658         }
659         return err;
660
661 changed:
662         /*
663          * AKPM: if where[i].bh isn't part of the current updating
664          * transaction then we explode nastily.  Test this code path.
665          */
666         jbd_debug(1, "the chain changed: try again\n");
667         err = -EAGAIN;
668
669 err_out:
670         for (i = 1; i < num; i++) {
671                 BUFFER_TRACE(where[i].bh, "call journal_forget");
672                 ext3_journal_forget(handle, where[i].bh);
673         }
674         /* For the normal collision cleanup case, we free up the blocks.
675          * On genuine filesystem errors we don't even think about doing
676          * that. */
677         if (err == -EAGAIN)
678                 for (i = 0; i < num; i++)
679                         ext3_free_blocks(handle, inode, 
680                                          le32_to_cpu(where[i].key), 1);
681         return err;
682 }
683
684 /*
685  * Allocation strategy is simple: if we have to allocate something, we will
686  * have to go the whole way to leaf. So let's do it before attaching anything
687  * to tree, set linkage between the newborn blocks, write them if sync is
688  * required, recheck the path, free and repeat if check fails, otherwise
689  * set the last missing link (that will protect us from any truncate-generated
690  * removals - all blocks on the path are immune now) and possibly force the
691  * write on the parent block.
692  * That has a nice additional property: no special recovery from the failed
693  * allocations is needed - we simply release blocks and do not touch anything
694  * reachable from inode.
695  *
696  * akpm: `handle' can be NULL if create == 0.
697  *
698  * The BKL may not be held on entry here.  Be sure to take it early.
699  */
700
701 static int
702 ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
703                 struct buffer_head *bh_result, int create, int extend_disksize)
704 {
705         int err = -EIO;
706         int offsets[4];
707         Indirect chain[4];
708         Indirect *partial;
709         unsigned long goal;
710         int left;
711         int boundary = 0;
712         int depth = ext3_block_to_path(inode, iblock, offsets, &boundary);
713         struct ext3_inode_info *ei = EXT3_I(inode);
714
715         J_ASSERT(handle != NULL || create == 0);
716
717         if (depth == 0)
718                 goto out;
719
720 reread:
721         partial = ext3_get_branch(inode, depth, offsets, chain, &err);
722
723         /* Simplest case - block found, no allocation needed */
724         if (!partial) {
725                 clear_buffer_new(bh_result);
726 got_it:
727                 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
728                 if (boundary)
729                         set_buffer_boundary(bh_result);
730                 /* Clean up and exit */
731                 partial = chain+depth-1; /* the whole chain */
732                 goto cleanup;
733         }
734
735         /* Next simple case - plain lookup or failed read of indirect block */
736         if (!create || err == -EIO) {
737 cleanup:
738                 while (partial > chain) {
739                         BUFFER_TRACE(partial->bh, "call brelse");
740                         brelse(partial->bh);
741                         partial--;
742                 }
743                 BUFFER_TRACE(bh_result, "returned");
744 out:
745                 return err;
746         }
747
748         /*
749          * Indirect block might be removed by truncate while we were
750          * reading it. Handling of that case (forget what we've got and
751          * reread) is taken out of the main path.
752          */
753         if (err == -EAGAIN)
754                 goto changed;
755
756         goal = 0;
757         down(&ei->truncate_sem);
758         if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0) {
759                 up(&ei->truncate_sem);
760                 goto changed;
761         }
762
763         left = (chain + depth) - partial;
764
765         /*
766          * Block out ext3_truncate while we alter the tree
767          */
768         err = ext3_alloc_branch(handle, inode, left, goal,
769                                         offsets+(partial-chain), partial);
770
771         /* The ext3_splice_branch call will free and forget any buffers
772          * on the new chain if there is a failure, but that risks using
773          * up transaction credits, especially for bitmaps where the
774          * credits cannot be returned.  Can we handle this somehow?  We
775          * may need to return -EAGAIN upwards in the worst case.  --sct */
776         if (!err)
777                 err = ext3_splice_branch(handle, inode, iblock, chain,
778                                          partial, left);
779         /* i_disksize growing is protected by truncate_sem
780          * don't forget to protect it if you're about to implement
781          * concurrent ext3_get_block() -bzzz */
782         if (!err && extend_disksize && inode->i_size > ei->i_disksize)
783                 ei->i_disksize = inode->i_size;
784         up(&ei->truncate_sem);
785         if (err == -EAGAIN)
786                 goto changed;
787         if (err)
788                 goto cleanup;
789
790         set_buffer_new(bh_result);
791         goto got_it;
792
793 changed:
794         while (partial > chain) {
795                 jbd_debug(1, "buffer chain changed, retrying\n");
796                 BUFFER_TRACE(partial->bh, "brelsing");
797                 brelse(partial->bh);
798                 partial--;
799         }
800         goto reread;
801 }
802
803 static int ext3_get_block(struct inode *inode, sector_t iblock,
804                         struct buffer_head *bh_result, int create)
805 {
806         handle_t *handle = NULL;
807         int ret;
808
809         if (create) {
810                 handle = ext3_journal_current_handle();
811                 J_ASSERT(handle != 0);
812         }
813         ret = ext3_get_block_handle(handle, inode, iblock,
814                                 bh_result, create, 1);
815         return ret;
816 }
817
818 #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
819
820 static int
821 ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
822                 unsigned long max_blocks, struct buffer_head *bh_result,
823                 int create)
824 {
825         handle_t *handle = journal_current_handle();
826         int ret = 0;
827
828         if (!handle)
829                 goto get_block;         /* A read */
830
831         if (handle->h_transaction->t_state == T_LOCKED) {
832                 /*
833                  * Huge direct-io writes can hold off commits for long
834                  * periods of time.  Let this commit run.
835                  */
836                 ext3_journal_stop(handle);
837                 handle = ext3_journal_start(inode, DIO_CREDITS);
838                 if (IS_ERR(handle))
839                         ret = PTR_ERR(handle);
840                 goto get_block;
841         }
842
843         if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
844                 /*
845                  * Getting low on buffer credits...
846                  */
847                 ret = ext3_journal_extend(handle, DIO_CREDITS);
848                 if (ret > 0) {
849                         /*
850                          * Couldn't extend the transaction.  Start a new one.
851                          */
852                         ret = ext3_journal_restart(handle, DIO_CREDITS);
853                 }
854         }
855
856 get_block:
857         if (ret == 0)
858                 ret = ext3_get_block_handle(handle, inode, iblock,
859                                         bh_result, create, 0);
860         bh_result->b_size = (1 << inode->i_blkbits);
861         return ret;
862 }
863
864 /*
865  * `handle' can be NULL if create is zero
866  */
867 struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
868                                 long block, int create, int * errp)
869 {
870         struct buffer_head dummy;
871         int fatal = 0, err;
872
873         J_ASSERT(handle != NULL || create == 0);
874
875         dummy.b_state = 0;
876         dummy.b_blocknr = -1000;
877         buffer_trace_init(&dummy.b_history);
878         *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
879         if (!*errp && buffer_mapped(&dummy)) {
880                 struct buffer_head *bh;
881                 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
882                 if (buffer_new(&dummy)) {
883                         J_ASSERT(create != 0);
884                         J_ASSERT(handle != 0);
885
886                         /* Now that we do not always journal data, we
887                            should keep in mind whether this should
888                            always journal the new buffer as metadata.
889                            For now, regular file writes use
890                            ext3_get_block instead, so it's not a
891                            problem. */
892                         lock_buffer(bh);
893                         BUFFER_TRACE(bh, "call get_create_access");
894                         fatal = ext3_journal_get_create_access(handle, bh);
895                         if (!fatal && !buffer_uptodate(bh)) {
896                                 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
897                                 set_buffer_uptodate(bh);
898                         }
899                         unlock_buffer(bh);
900                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
901                         err = ext3_journal_dirty_metadata(handle, bh);
902                         if (!fatal)
903                                 fatal = err;
904                 } else {
905                         BUFFER_TRACE(bh, "not a new buffer");
906                 }
907                 if (fatal) {
908                         *errp = fatal;
909                         brelse(bh);
910                         bh = NULL;
911                 }
912                 return bh;
913         }
914         return NULL;
915 }
916
917 struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
918                                int block, int create, int *err)
919 {
920         struct buffer_head * bh;
921
922         bh = ext3_getblk(handle, inode, block, create, err);
923         if (!bh)
924                 return bh;
925         if (buffer_uptodate(bh))
926                 return bh;
927         ll_rw_block(READ, 1, &bh);
928         wait_on_buffer(bh);
929         if (buffer_uptodate(bh))
930                 return bh;
931         put_bh(bh);
932         *err = -EIO;
933         return NULL;
934 }
935
936 static int walk_page_buffers(   handle_t *handle,
937                                 struct buffer_head *head,
938                                 unsigned from,
939                                 unsigned to,
940                                 int *partial,
941                                 int (*fn)(      handle_t *handle,
942                                                 struct buffer_head *bh))
943 {
944         struct buffer_head *bh;
945         unsigned block_start, block_end;
946         unsigned blocksize = head->b_size;
947         int err, ret = 0;
948         struct buffer_head *next;
949
950         for (   bh = head, block_start = 0;
951                 ret == 0 && (bh != head || !block_start);
952                 block_start = block_end, bh = next)
953         {
954                 next = bh->b_this_page;
955                 block_end = block_start + blocksize;
956                 if (block_end <= from || block_start >= to) {
957                         if (partial && !buffer_uptodate(bh))
958                                 *partial = 1;
959                         continue;
960                 }
961                 err = (*fn)(handle, bh);
962                 if (!ret)
963                         ret = err;
964         }
965         return ret;
966 }
967
968 /*
969  * To preserve ordering, it is essential that the hole instantiation and
970  * the data write be encapsulated in a single transaction.  We cannot
971  * close off a transaction and start a new one between the ext3_get_block()
972  * and the commit_write().  So doing the journal_start at the start of
973  * prepare_write() is the right place.
974  *
975  * Also, this function can nest inside ext3_writepage() ->
976  * block_write_full_page(). In that case, we *know* that ext3_writepage()
977  * has generated enough buffer credits to do the whole page.  So we won't
978  * block on the journal in that case, which is good, because the caller may
979  * be PF_MEMALLOC.
980  *
981  * By accident, ext3 can be reentered when a transaction is open via
982  * quota file writes.  If we were to commit the transaction while thus
983  * reentered, there can be a deadlock - we would be holding a quota
984  * lock, and the commit would never complete if another thread had a
985  * transaction open and was blocking on the quota lock - a ranking
986  * violation.
987  *
988  * So what we do is to rely on the fact that journal_stop/journal_start
989  * will _not_ run commit under these circumstances because handle->h_ref
990  * is elevated.  We'll still have enough credits for the tiny quotafile
991  * write.  
992  */
993
994 static int do_journal_get_write_access(handle_t *handle, 
995                                        struct buffer_head *bh)
996 {
997         if (!buffer_mapped(bh) || buffer_freed(bh))
998                 return 0;
999         return ext3_journal_get_write_access(handle, bh);
1000 }
1001
1002 static int ext3_prepare_write(struct file *file, struct page *page,
1003                               unsigned from, unsigned to)
1004 {
1005         struct inode *inode = page->mapping->host;
1006         int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
1007         handle_t *handle;
1008         int retries = 0;
1009
1010 retry:
1011         handle = ext3_journal_start(inode, needed_blocks);
1012         if (IS_ERR(handle)) {
1013                 ret = PTR_ERR(handle);
1014                 goto out;
1015         }
1016         ret = block_prepare_write(page, from, to, ext3_get_block);
1017         if (ret)
1018                 goto prepare_write_failed;
1019
1020         if (ext3_should_journal_data(inode)) {
1021                 ret = walk_page_buffers(handle, page_buffers(page),
1022                                 from, to, NULL, do_journal_get_write_access);
1023         }
1024 prepare_write_failed:
1025         if (ret)
1026                 ext3_journal_stop(handle);
1027         if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1028                 goto retry;
1029 out:
1030         return ret;
1031 }
1032
1033 int
1034 ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1035 {
1036         int err = journal_dirty_data(handle, bh);
1037         if (err)
1038                 ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1039                                                 bh, handle,err);
1040         return err;
1041 }
1042
1043 /* For commit_write() in data=journal mode */
1044 static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1045 {
1046         if (!buffer_mapped(bh) || buffer_freed(bh))
1047                 return 0;
1048         set_buffer_uptodate(bh);
1049         return ext3_journal_dirty_metadata(handle, bh);
1050 }
1051
1052 /*
1053  * We need to pick up the new inode size which generic_commit_write gave us
1054  * `file' can be NULL - eg, when called from page_symlink().
1055  *
1056  * ext3 never places buffers on inode->i_mapping->private_list.  metadata
1057  * buffers are managed internally.
1058  */
1059
1060 static int ext3_ordered_commit_write(struct file *file, struct page *page,
1061                              unsigned from, unsigned to)
1062 {
1063         handle_t *handle = ext3_journal_current_handle();
1064         struct inode *inode = page->mapping->host;
1065         int ret = 0, ret2;
1066
1067         ret = walk_page_buffers(handle, page_buffers(page),
1068                 from, to, NULL, ext3_journal_dirty_data);
1069
1070         if (ret == 0) {
1071                 /*
1072                  * generic_commit_write() will run mark_inode_dirty() if i_size
1073                  * changes.  So let's piggyback the i_disksize mark_inode_dirty
1074                  * into that.
1075                  */
1076                 loff_t new_i_size;
1077
1078                 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1079                 if (new_i_size > EXT3_I(inode)->i_disksize)
1080                         EXT3_I(inode)->i_disksize = new_i_size;
1081                 ret = generic_commit_write(file, page, from, to);
1082         }
1083         ret2 = ext3_journal_stop(handle);
1084         if (!ret)
1085                 ret = ret2;
1086         return ret;
1087 }
1088
1089 static int ext3_writeback_commit_write(struct file *file, struct page *page,
1090                              unsigned from, unsigned to)
1091 {
1092         handle_t *handle = ext3_journal_current_handle();
1093         struct inode *inode = page->mapping->host;
1094         int ret = 0, ret2;
1095         loff_t new_i_size;
1096
1097         new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1098         if (new_i_size > EXT3_I(inode)->i_disksize)
1099                 EXT3_I(inode)->i_disksize = new_i_size;
1100         ret = generic_commit_write(file, page, from, to);
1101         ret2 = ext3_journal_stop(handle);
1102         if (!ret)
1103                 ret = ret2;
1104         return ret;
1105 }
1106
1107 static int ext3_journalled_commit_write(struct file *file,
1108                         struct page *page, unsigned from, unsigned to)
1109 {
1110         handle_t *handle = ext3_journal_current_handle();
1111         struct inode *inode = page->mapping->host;
1112         int ret = 0, ret2;
1113         int partial = 0;
1114         loff_t pos;
1115
1116         /*
1117          * Here we duplicate the generic_commit_write() functionality
1118          */
1119         pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1120
1121         ret = walk_page_buffers(handle, page_buffers(page), from,
1122                                 to, &partial, commit_write_fn);
1123         if (!partial)
1124                 SetPageUptodate(page);
1125         if (pos > inode->i_size)
1126                 i_size_write(inode, pos);
1127         EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1128         if (inode->i_size > EXT3_I(inode)->i_disksize) {
1129                 EXT3_I(inode)->i_disksize = inode->i_size;
1130                 ret2 = ext3_mark_inode_dirty(handle, inode);
1131                 if (!ret) 
1132                         ret = ret2;
1133         }
1134         ret2 = ext3_journal_stop(handle);
1135         if (!ret)
1136                 ret = ret2;
1137         return ret;
1138 }
1139
1140 /* 
1141  * bmap() is special.  It gets used by applications such as lilo and by
1142  * the swapper to find the on-disk block of a specific piece of data.
1143  *
1144  * Naturally, this is dangerous if the block concerned is still in the
1145  * journal.  If somebody makes a swapfile on an ext3 data-journaling
1146  * filesystem and enables swap, then they may get a nasty shock when the
1147  * data getting swapped to that swapfile suddenly gets overwritten by
1148  * the original zero's written out previously to the journal and
1149  * awaiting writeback in the kernel's buffer cache. 
1150  *
1151  * So, if we see any bmap calls here on a modified, data-journaled file,
1152  * take extra steps to flush any blocks which might be in the cache. 
1153  */
1154 static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1155 {
1156         struct inode *inode = mapping->host;
1157         journal_t *journal;
1158         int err;
1159
1160         if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
1161                 /* 
1162                  * This is a REALLY heavyweight approach, but the use of
1163                  * bmap on dirty files is expected to be extremely rare:
1164                  * only if we run lilo or swapon on a freshly made file
1165                  * do we expect this to happen. 
1166                  *
1167                  * (bmap requires CAP_SYS_RAWIO so this does not
1168                  * represent an unprivileged user DOS attack --- we'd be
1169                  * in trouble if mortal users could trigger this path at
1170                  * will.) 
1171                  *
1172                  * NB. EXT3_STATE_JDATA is not set on files other than
1173                  * regular files.  If somebody wants to bmap a directory
1174                  * or symlink and gets confused because the buffer
1175                  * hasn't yet been flushed to disk, they deserve
1176                  * everything they get.
1177                  */
1178
1179                 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
1180                 journal = EXT3_JOURNAL(inode);
1181                 journal_lock_updates(journal);
1182                 err = journal_flush(journal);
1183                 journal_unlock_updates(journal);
1184
1185                 if (err)
1186                         return 0;
1187         }
1188
1189         return generic_block_bmap(mapping,block,ext3_get_block);
1190 }
1191
1192 static int bget_one(handle_t *handle, struct buffer_head *bh)
1193 {
1194         get_bh(bh);
1195         return 0;
1196 }
1197
1198 static int bput_one(handle_t *handle, struct buffer_head *bh)
1199 {
1200         put_bh(bh);
1201         return 0;
1202 }
1203
1204 static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1205 {
1206         if (buffer_mapped(bh))
1207                 return ext3_journal_dirty_data(handle, bh);
1208         return 0;
1209 }
1210
1211 /*
1212  * Note that we always start a transaction even if we're not journalling
1213  * data.  This is to preserve ordering: any hole instantiation within
1214  * __block_write_full_page -> ext3_get_block() should be journalled
1215  * along with the data so we don't crash and then get metadata which
1216  * refers to old data.
1217  *
1218  * In all journalling modes block_write_full_page() will start the I/O.
1219  *
1220  * Problem:
1221  *
1222  *      ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1223  *              ext3_writepage()
1224  *
1225  * Similar for:
1226  *
1227  *      ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1228  *
1229  * Same applies to ext3_get_block().  We will deadlock on various things like
1230  * lock_journal and i_truncate_sem.
1231  *
1232  * Setting PF_MEMALLOC here doesn't work - too many internal memory
1233  * allocations fail.
1234  *
1235  * 16May01: If we're reentered then journal_current_handle() will be
1236  *          non-zero. We simply *return*.
1237  *
1238  * 1 July 2001: @@@ FIXME:
1239  *   In journalled data mode, a data buffer may be metadata against the
1240  *   current transaction.  But the same file is part of a shared mapping
1241  *   and someone does a writepage() on it.
1242  *
1243  *   We will move the buffer onto the async_data list, but *after* it has
1244  *   been dirtied. So there's a small window where we have dirty data on
1245  *   BJ_Metadata.
1246  *
1247  *   Note that this only applies to the last partial page in the file.  The
1248  *   bit which block_write_full_page() uses prepare/commit for.  (That's
1249  *   broken code anyway: it's wrong for msync()).
1250  *
1251  *   It's a rare case: affects the final partial page, for journalled data
1252  *   where the file is subject to both write() and writepage() in the same
1253  *   transaction.  To fix it we'll need a custom block_write_full_page().
1254  *   We'll probably need that anyway for journalling writepage() output.
1255  *
1256  * We don't honour synchronous mounts for writepage().  That would be
1257  * disastrous.  Any write() or metadata operation will sync the fs for
1258  * us.
1259  *
1260  * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1261  * we don't need to open a transaction here.
1262  */
1263 static int ext3_ordered_writepage(struct page *page,
1264                         struct writeback_control *wbc)
1265 {
1266         struct inode *inode = page->mapping->host;
1267         struct buffer_head *page_bufs;
1268         handle_t *handle = NULL;
1269         int ret = 0;
1270         int err;
1271
1272         J_ASSERT(PageLocked(page));
1273
1274         /*
1275          * We give up here if we're reentered, because it might be for a
1276          * different filesystem.
1277          */
1278         if (ext3_journal_current_handle())
1279                 goto out_fail;
1280
1281         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1282
1283         if (IS_ERR(handle)) {
1284                 ret = PTR_ERR(handle);
1285                 goto out_fail;
1286         }
1287
1288         if (!page_has_buffers(page)) {
1289                 create_empty_buffers(page, inode->i_sb->s_blocksize,
1290                                 (1 << BH_Dirty)|(1 << BH_Uptodate));
1291         }
1292         page_bufs = page_buffers(page);
1293         walk_page_buffers(handle, page_bufs, 0,
1294                         PAGE_CACHE_SIZE, NULL, bget_one);
1295
1296         ret = block_write_full_page(page, ext3_get_block, wbc);
1297
1298         /*
1299          * The page can become unlocked at any point now, and
1300          * truncate can then come in and change things.  So we
1301          * can't touch *page from now on.  But *page_bufs is
1302          * safe due to elevated refcount.
1303          */
1304
1305         /*
1306          * And attach them to the current transaction.  But only if 
1307          * block_write_full_page() succeeded.  Otherwise they are unmapped,
1308          * and generally junk.
1309          */
1310         if (ret == 0) {
1311                 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1312                                         NULL, journal_dirty_data_fn);
1313                 if (!ret)
1314                         ret = err;
1315         }
1316         walk_page_buffers(handle, page_bufs, 0,
1317                         PAGE_CACHE_SIZE, NULL, bput_one);
1318         err = ext3_journal_stop(handle);
1319         if (!ret)
1320                 ret = err;
1321         return ret;
1322
1323 out_fail:
1324         redirty_page_for_writepage(wbc, page);
1325         unlock_page(page);
1326         return ret;
1327 }
1328
1329 static int ext3_writeback_writepage(struct page *page,
1330                                 struct writeback_control *wbc)
1331 {
1332         struct inode *inode = page->mapping->host;
1333         handle_t *handle = NULL;
1334         int ret = 0;
1335         int err;
1336
1337         if (ext3_journal_current_handle())
1338                 goto out_fail;
1339
1340         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1341         if (IS_ERR(handle)) {
1342                 ret = PTR_ERR(handle);
1343                 goto out_fail;
1344         }
1345
1346         ret = block_write_full_page(page, ext3_get_block, wbc);
1347         err = ext3_journal_stop(handle);
1348         if (!ret)
1349                 ret = err;
1350         return ret;
1351
1352 out_fail:
1353         redirty_page_for_writepage(wbc, page);
1354         unlock_page(page);
1355         return ret;
1356 }
1357
1358 static int ext3_journalled_writepage(struct page *page,
1359                                 struct writeback_control *wbc)
1360 {
1361         struct inode *inode = page->mapping->host;
1362         handle_t *handle = NULL;
1363         int ret = 0;
1364         int err;
1365
1366         if (ext3_journal_current_handle())
1367                 goto no_write;
1368
1369         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1370         if (IS_ERR(handle)) {
1371                 ret = PTR_ERR(handle);
1372                 goto no_write;
1373         }
1374
1375         if (!page_has_buffers(page) || PageChecked(page)) {
1376                 /*
1377                  * It's mmapped pagecache.  Add buffers and journal it.  There
1378                  * doesn't seem much point in redirtying the page here.
1379                  */
1380                 ClearPageChecked(page);
1381                 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1382                                         ext3_get_block);
1383                 if (ret != 0)
1384                         goto out_unlock;
1385                 ret = walk_page_buffers(handle, page_buffers(page), 0,
1386                         PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1387
1388                 err = walk_page_buffers(handle, page_buffers(page), 0,
1389                                 PAGE_CACHE_SIZE, NULL, commit_write_fn);
1390                 if (ret == 0)
1391                         ret = err;
1392                 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1393                 unlock_page(page);
1394         } else {
1395                 /*
1396                  * It may be a page full of checkpoint-mode buffers.  We don't
1397                  * really know unless we go poke around in the buffer_heads.
1398                  * But block_write_full_page will do the right thing.
1399                  */
1400                 ret = block_write_full_page(page, ext3_get_block, wbc);
1401         }
1402         err = ext3_journal_stop(handle);
1403         if (!ret)
1404                 ret = err;
1405 out:
1406         return ret;
1407
1408 no_write:
1409         redirty_page_for_writepage(wbc, page);
1410 out_unlock:
1411         unlock_page(page);
1412         goto out;
1413 }
1414
1415 static int ext3_readpage(struct file *file, struct page *page)
1416 {
1417         return mpage_readpage(page, ext3_get_block);
1418 }
1419
1420 static int
1421 ext3_readpages(struct file *file, struct address_space *mapping,
1422                 struct list_head *pages, unsigned nr_pages)
1423 {
1424         return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1425 }
1426
1427 static int ext3_invalidatepage(struct page *page, unsigned long offset)
1428 {
1429         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1430
1431         /*
1432          * If it's a full truncate we just forget about the pending dirtying
1433          */
1434         if (offset == 0)
1435                 ClearPageChecked(page);
1436
1437         return journal_invalidatepage(journal, page, offset);
1438 }
1439
1440 static int ext3_releasepage(struct page *page, int wait)
1441 {
1442         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1443
1444         WARN_ON(PageChecked(page));
1445         return journal_try_to_free_buffers(journal, page, wait);
1446 }
1447
1448 /*
1449  * If the O_DIRECT write will extend the file then add this inode to the
1450  * orphan list.  So recovery will truncate it back to the original size
1451  * if the machine crashes during the write.
1452  *
1453  * If the O_DIRECT write is intantiating holes inside i_size and the machine
1454  * crashes then stale disk data _may_ be exposed inside the file.
1455  */
1456 static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1457                         const struct iovec *iov, loff_t offset,
1458                         unsigned long nr_segs)
1459 {
1460         struct file *file = iocb->ki_filp;
1461         struct inode *inode = file->f_mapping->host;
1462         struct ext3_inode_info *ei = EXT3_I(inode);
1463         handle_t *handle = NULL;
1464         ssize_t ret;
1465         int orphan = 0;
1466         size_t count = iov_length(iov, nr_segs);
1467
1468         if (rw == WRITE) {
1469                 loff_t final_size = offset + count;
1470
1471                 handle = ext3_journal_start(inode, DIO_CREDITS);
1472                 if (IS_ERR(handle)) {
1473                         ret = PTR_ERR(handle);
1474                         goto out;
1475                 }
1476                 if (final_size > inode->i_size) {
1477                         ret = ext3_orphan_add(handle, inode);
1478                         if (ret)
1479                                 goto out_stop;
1480                         orphan = 1;
1481                         ei->i_disksize = inode->i_size;
1482                 }
1483         }
1484
1485         ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 
1486                                  offset, nr_segs,
1487                                  ext3_direct_io_get_blocks, NULL);
1488
1489         /*
1490          * Reacquire the handle: ext3_direct_io_get_block() can restart the
1491          * transaction
1492          */
1493         handle = journal_current_handle();
1494
1495 out_stop:
1496         if (handle) {
1497                 int err;
1498
1499                 if (orphan && inode->i_nlink)
1500                         ext3_orphan_del(handle, inode);
1501                 if (orphan && ret > 0) {
1502                         loff_t end = offset + ret;
1503                         if (end > inode->i_size) {
1504                                 ei->i_disksize = end;
1505                                 i_size_write(inode, end);
1506                                 /*
1507                                  * We're going to return a positive `ret'
1508                                  * here due to non-zero-length I/O, so there's
1509                                  * no way of reporting error returns from
1510                                  * ext3_mark_inode_dirty() to userspace.  So
1511                                  * ignore it.
1512                                  */
1513                                 ext3_mark_inode_dirty(handle, inode);
1514                         }
1515                 }
1516                 err = ext3_journal_stop(handle);
1517                 if (ret == 0)
1518                         ret = err;
1519         }
1520 out:
1521         return ret;
1522 }
1523
/*
 * Pages can be marked dirty completely asynchronously from ext3's journalling
 * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
 * much here because ->set_page_dirty is called under VFS locks.  The page is
 * not necessarily locked.
 *
 * We cannot just dirty the page and leave attached buffers clean, because the
 * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
 * or jbddirty because all the journalling code will explode.
 *
 * So what we do is to mark the page "pending dirty" (PageChecked) and next
 * time writepage is called, propagate that into the buffers appropriately.
 */
static int ext3_journalled_set_page_dirty(struct page *page)
{
	int ret;

	SetPageChecked(page);
	ret = __set_page_dirty_nobuffers(page);
	return ret;
}
1542
1543 static struct address_space_operations ext3_ordered_aops = {
1544         .readpage       = ext3_readpage,
1545         .readpages      = ext3_readpages,
1546         .writepage      = ext3_ordered_writepage,
1547         .sync_page      = block_sync_page,
1548         .prepare_write  = ext3_prepare_write,
1549         .commit_write   = ext3_ordered_commit_write,
1550         .bmap           = ext3_bmap,
1551         .invalidatepage = ext3_invalidatepage,
1552         .releasepage    = ext3_releasepage,
1553         .direct_IO      = ext3_direct_IO,
1554 };
1555
1556 static struct address_space_operations ext3_writeback_aops = {
1557         .readpage       = ext3_readpage,
1558         .readpages      = ext3_readpages,
1559         .writepage      = ext3_writeback_writepage,
1560         .sync_page      = block_sync_page,
1561         .prepare_write  = ext3_prepare_write,
1562         .commit_write   = ext3_writeback_commit_write,
1563         .bmap           = ext3_bmap,
1564         .invalidatepage = ext3_invalidatepage,
1565         .releasepage    = ext3_releasepage,
1566         .direct_IO      = ext3_direct_IO,
1567 };
1568
1569 static struct address_space_operations ext3_journalled_aops = {
1570         .readpage       = ext3_readpage,
1571         .readpages      = ext3_readpages,
1572         .writepage      = ext3_journalled_writepage,
1573         .sync_page      = block_sync_page,
1574         .prepare_write  = ext3_prepare_write,
1575         .commit_write   = ext3_journalled_commit_write,
1576         .set_page_dirty = ext3_journalled_set_page_dirty,
1577         .bmap           = ext3_bmap,
1578         .invalidatepage = ext3_invalidatepage,
1579         .releasepage    = ext3_releasepage,
1580 };
1581
1582 void ext3_set_aops(struct inode *inode)
1583 {
1584         if (ext3_should_order_data(inode))
1585                 inode->i_mapping->a_ops = &ext3_ordered_aops;
1586         else if (ext3_should_writeback_data(inode))
1587                 inode->i_mapping->a_ops = &ext3_writeback_aops;
1588         else
1589                 inode->i_mapping->a_ops = &ext3_journalled_aops;
1590 }
1591
1592 /*
1593  * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1594  * up to the end of the block which corresponds to `from'.
1595  * This required during truncate. We need to physically zero the tail end
1596  * of that block so it doesn't yield old data if the file is later grown.
1597  */
1598 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1599                 struct address_space *mapping, loff_t from)
1600 {
1601         unsigned long index = from >> PAGE_CACHE_SHIFT;
1602         unsigned offset = from & (PAGE_CACHE_SIZE-1);
1603         unsigned blocksize, iblock, length, pos;
1604         struct inode *inode = mapping->host;
1605         struct buffer_head *bh;
1606         int err;
1607         void *kaddr;
1608
1609         blocksize = inode->i_sb->s_blocksize;
1610         length = blocksize - (offset & (blocksize - 1));
1611         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1612
1613         if (!page_has_buffers(page))
1614                 create_empty_buffers(page, blocksize, 0);
1615
1616         /* Find the buffer that contains "offset" */
1617         bh = page_buffers(page);
1618         pos = blocksize;
1619         while (offset >= pos) {
1620                 bh = bh->b_this_page;
1621                 iblock++;
1622                 pos += blocksize;
1623         }
1624
1625         err = 0;
1626         if (buffer_freed(bh)) {
1627                 BUFFER_TRACE(bh, "freed: skip");
1628                 goto unlock;
1629         }
1630
1631         if (!buffer_mapped(bh)) {
1632                 BUFFER_TRACE(bh, "unmapped");
1633                 ext3_get_block(inode, iblock, bh, 0);
1634                 /* unmapped? It's a hole - nothing to do */
1635                 if (!buffer_mapped(bh)) {
1636                         BUFFER_TRACE(bh, "still unmapped");
1637                         goto unlock;
1638                 }
1639         }
1640
1641         /* Ok, it's mapped. Make sure it's up-to-date */
1642         if (PageUptodate(page))
1643                 set_buffer_uptodate(bh);
1644
1645         if (!buffer_uptodate(bh)) {
1646                 err = -EIO;
1647                 ll_rw_block(READ, 1, &bh);
1648                 wait_on_buffer(bh);
1649                 /* Uhhuh. Read error. Complain and punt. */
1650                 if (!buffer_uptodate(bh))
1651                         goto unlock;
1652         }
1653
1654         if (ext3_should_journal_data(inode)) {
1655                 BUFFER_TRACE(bh, "get write access");
1656                 err = ext3_journal_get_write_access(handle, bh);
1657                 if (err)
1658                         goto unlock;
1659         }
1660
1661         kaddr = kmap_atomic(page, KM_USER0);
1662         memset(kaddr + offset, 0, length);
1663         flush_dcache_page(page);
1664         kunmap_atomic(kaddr, KM_USER0);
1665
1666         BUFFER_TRACE(bh, "zeroed end of block");
1667
1668         err = 0;
1669         if (ext3_should_journal_data(inode)) {
1670                 err = ext3_journal_dirty_metadata(handle, bh);
1671         } else {
1672                 if (ext3_should_order_data(inode))
1673                         err = ext3_journal_dirty_data(handle, bh);
1674                 mark_buffer_dirty(bh);
1675         }
1676
1677 unlock:
1678         unlock_page(page);
1679         page_cache_release(page);
1680         return err;
1681 }
1682
1683 /*
1684  * Probably it should be a library function... search for first non-zero word
1685  * or memcmp with zero_page, whatever is better for particular architecture.
1686  * Linus?
1687  */
1688 static inline int all_zeroes(__le32 *p, __le32 *q)
1689 {
1690         while (p < q)
1691                 if (*p++)
1692                         return 0;
1693         return 1;
1694 }
1695
1696 /**
1697  *      ext3_find_shared - find the indirect blocks for partial truncation.
1698  *      @inode:   inode in question
1699  *      @depth:   depth of the affected branch
1700  *      @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1701  *      @chain:   place to store the pointers to partial indirect blocks
1702  *      @top:     place to the (detached) top of branch
1703  *
1704  *      This is a helper function used by ext3_truncate().
1705  *
1706  *      When we do truncate() we may have to clean the ends of several
1707  *      indirect blocks but leave the blocks themselves alive. Block is
1708  *      partially truncated if some data below the new i_size is refered
1709  *      from it (and it is on the path to the first completely truncated
1710  *      data block, indeed).  We have to free the top of that path along
1711  *      with everything to the right of the path. Since no allocation
1712  *      past the truncation point is possible until ext3_truncate()
1713  *      finishes, we may safely do the latter, but top of branch may
1714  *      require special attention - pageout below the truncation point
1715  *      might try to populate it.
1716  *
1717  *      We atomically detach the top of branch from the tree, store the
1718  *      block number of its root in *@top, pointers to buffer_heads of
1719  *      partially truncated blocks - in @chain[].bh and pointers to
1720  *      their last elements that should not be removed - in
1721  *      @chain[].p. Return value is the pointer to last filled element
1722  *      of @chain.
1723  *
1724  *      The work left to caller to do the actual freeing of subtrees:
1725  *              a) free the subtree starting from *@top
1726  *              b) free the subtrees whose roots are stored in
1727  *                      (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1728  *              c) free the subtrees growing from the inode past the @chain[0].
1729  *                      (no partially truncated stuff there).  */
1730
1731 static Indirect *ext3_find_shared(struct inode *inode,
1732                                 int depth,
1733                                 int offsets[4],
1734                                 Indirect chain[4],
1735                                 __le32 *top)
1736 {
1737         Indirect *partial, *p;
1738         int k, err;
1739
1740         *top = 0;
1741         /* Make k index the deepest non-null offest + 1 */
1742         for (k = depth; k > 1 && !offsets[k-1]; k--)
1743                 ;
1744         partial = ext3_get_branch(inode, k, offsets, chain, &err);
1745         /* Writer: pointers */
1746         if (!partial)
1747                 partial = chain + k-1;
1748         /*
1749          * If the branch acquired continuation since we've looked at it -
1750          * fine, it should all survive and (new) top doesn't belong to us.
1751          */
1752         if (!partial->key && *partial->p)
1753                 /* Writer: end */
1754                 goto no_top;
1755         for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
1756                 ;
1757         /*
1758          * OK, we've found the last block that must survive. The rest of our
1759          * branch should be detached before unlocking. However, if that rest
1760          * of branch is all ours and does not grow immediately from the inode
1761          * it's easier to cheat and just decrement partial->p.
1762          */
1763         if (p == chain + k - 1 && p > chain) {
1764                 p->p--;
1765         } else {
1766                 *top = *p->p;
1767                 /* Nope, don't do this in ext3.  Must leave the tree intact */
1768 #if 0
1769                 *p->p = 0;
1770 #endif
1771         }
1772         /* Writer: end */
1773
1774         while(partial > p)
1775         {
1776                 brelse(partial->bh);
1777                 partial--;
1778         }
1779 no_top:
1780         return partial;
1781 }
1782
1783 /*
1784  * Zero a number of block pointers in either an inode or an indirect block.
1785  * If we restart the transaction we must again get write access to the
1786  * indirect block for further modification.
1787  *
1788  * We release `count' blocks on disk, but (last - first) may be greater
1789  * than `count' because there can be holes in there.
1790  */
1791 static void
1792 ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
1793                 unsigned long block_to_free, unsigned long count,
1794                 __le32 *first, __le32 *last)
1795 {
1796         __le32 *p;
1797         if (try_to_extend_transaction(handle, inode)) {
1798                 if (bh) {
1799                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1800                         ext3_journal_dirty_metadata(handle, bh);
1801                 }
1802                 ext3_mark_inode_dirty(handle, inode);
1803                 ext3_journal_test_restart(handle, inode);
1804                 if (bh) {
1805                         BUFFER_TRACE(bh, "retaking write access");
1806                         ext3_journal_get_write_access(handle, bh);
1807                 }
1808         }
1809
1810         /*
1811          * Any buffers which are on the journal will be in memory. We find
1812          * them on the hash table so journal_revoke() will run journal_forget()
1813          * on them.  We've already detached each block from the file, so
1814          * bforget() in journal_forget() should be safe.
1815          *
1816          * AKPM: turn on bforget in journal_forget()!!!
1817          */
1818         for (p = first; p < last; p++) {
1819                 u32 nr = le32_to_cpu(*p);
1820                 if (nr) {
1821                         struct buffer_head *bh;
1822
1823                         *p = 0;
1824                         bh = sb_find_get_block(inode->i_sb, nr);
1825                         ext3_forget(handle, 0, inode, bh, nr);
1826                 }
1827         }
1828
1829         ext3_free_blocks(handle, inode, block_to_free, count);
1830 }
1831
1832 /**
1833  * ext3_free_data - free a list of data blocks
1834  * @handle:     handle for this transaction
1835  * @inode:      inode we are dealing with
1836  * @this_bh:    indirect buffer_head which contains *@first and *@last
1837  * @first:      array of block numbers
1838  * @last:       points immediately past the end of array
1839  *
1840  * We are freeing all blocks refered from that array (numbers are stored as
1841  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1842  *
1843  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
1844  * blocks are contiguous then releasing them at one time will only affect one
1845  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1846  * actually use a lot of journal space.
1847  *
1848  * @this_bh will be %NULL if @first and @last point into the inode's direct
1849  * block pointers.
1850  */
1851 static void ext3_free_data(handle_t *handle, struct inode *inode,
1852                            struct buffer_head *this_bh,
1853                            __le32 *first, __le32 *last)
1854 {
1855         unsigned long block_to_free = 0;    /* Starting block # of a run */
1856         unsigned long count = 0;            /* Number of blocks in the run */ 
1857         __le32 *block_to_free_p = NULL;     /* Pointer into inode/ind
1858                                                corresponding to
1859                                                block_to_free */
1860         unsigned long nr;                   /* Current block # */
1861         __le32 *p;                          /* Pointer into inode/ind
1862                                                for current block */
1863         int err;
1864
1865         if (this_bh) {                          /* For indirect block */
1866                 BUFFER_TRACE(this_bh, "get_write_access");
1867                 err = ext3_journal_get_write_access(handle, this_bh);
1868                 /* Important: if we can't update the indirect pointers
1869                  * to the blocks, we can't free them. */
1870                 if (err)
1871                         return;
1872         }
1873
1874         for (p = first; p < last; p++) {
1875                 nr = le32_to_cpu(*p);
1876                 if (nr) {
1877                         /* accumulate blocks to free if they're contiguous */
1878                         if (count == 0) {
1879                                 block_to_free = nr;
1880                                 block_to_free_p = p;
1881                                 count = 1;
1882                         } else if (nr == block_to_free + count) {
1883                                 count++;
1884                         } else {
1885                                 ext3_clear_blocks(handle, inode, this_bh, 
1886                                                   block_to_free,
1887                                                   count, block_to_free_p, p);
1888                                 block_to_free = nr;
1889                                 block_to_free_p = p;
1890                                 count = 1;
1891                         }
1892                 }
1893         }
1894
1895         if (count > 0)
1896                 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
1897                                   count, block_to_free_p, p);
1898
1899         if (this_bh) {
1900                 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
1901                 ext3_journal_dirty_metadata(handle, this_bh);
1902         }
1903 }
1904
1905 /**
1906  *      ext3_free_branches - free an array of branches
1907  *      @handle: JBD handle for this transaction
1908  *      @inode: inode we are dealing with
1909  *      @parent_bh: the buffer_head which contains *@first and *@last
1910  *      @first: array of block numbers
1911  *      @last:  pointer immediately past the end of array
1912  *      @depth: depth of the branches to free
1913  *
1914  *      We are freeing all blocks refered from these branches (numbers are
1915  *      stored as little-endian 32-bit) and updating @inode->i_blocks
1916  *      appropriately.
1917  */
1918 static void ext3_free_branches(handle_t *handle, struct inode *inode,
1919                                struct buffer_head *parent_bh,
1920                                __le32 *first, __le32 *last, int depth)
1921 {
1922         unsigned long nr;
1923         __le32 *p;
1924
1925         if (is_handle_aborted(handle))
1926                 return;
1927
1928         if (depth--) {
1929                 struct buffer_head *bh;
1930                 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1931                 p = last;
1932                 while (--p >= first) {
1933                         nr = le32_to_cpu(*p);
1934                         if (!nr)
1935                                 continue;               /* A hole */
1936
1937                         /* Go read the buffer for the next level down */
1938                         bh = sb_bread(inode->i_sb, nr);
1939
1940                         /*
1941                          * A read failure? Report error and clear slot
1942                          * (should be rare).
1943                          */
1944                         if (!bh) {
1945                                 ext3_error(inode->i_sb, "ext3_free_branches",
1946                                            "Read failure, inode=%ld, block=%ld",
1947                                            inode->i_ino, nr);
1948                                 continue;
1949                         }
1950
1951                         /* This zaps the entire block.  Bottom up. */
1952                         BUFFER_TRACE(bh, "free child branches");
1953                         ext3_free_branches(handle, inode, bh,
1954                                            (__le32*)bh->b_data,
1955                                            (__le32*)bh->b_data + addr_per_block,
1956                                            depth);
1957
1958                         /*
1959                          * We've probably journalled the indirect block several
1960                          * times during the truncate.  But it's no longer
1961                          * needed and we now drop it from the transaction via
1962                          * journal_revoke().
1963                          *
1964                          * That's easy if it's exclusively part of this
1965                          * transaction.  But if it's part of the committing
1966                          * transaction then journal_forget() will simply
1967                          * brelse() it.  That means that if the underlying
1968                          * block is reallocated in ext3_get_block(),
1969                          * unmap_underlying_metadata() will find this block
1970                          * and will try to get rid of it.  damn, damn.
1971                          *
1972                          * If this block has already been committed to the
1973                          * journal, a revoke record will be written.  And
1974                          * revoke records must be emitted *before* clearing
1975                          * this block's bit in the bitmaps.
1976                          */
1977                         ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
1978
1979                         /*
1980                          * Everything below this this pointer has been
1981                          * released.  Now let this top-of-subtree go.
1982                          *
1983                          * We want the freeing of this indirect block to be
1984                          * atomic in the journal with the updating of the
1985                          * bitmap block which owns it.  So make some room in
1986                          * the journal.
1987                          *
1988                          * We zero the parent pointer *after* freeing its
1989                          * pointee in the bitmaps, so if extend_transaction()
1990                          * for some reason fails to put the bitmap changes and
1991                          * the release into the same transaction, recovery
1992                          * will merely complain about releasing a free block,
1993                          * rather than leaking blocks.
1994                          */
1995                         if (is_handle_aborted(handle))
1996                                 return;
1997                         if (try_to_extend_transaction(handle, inode)) {
1998                                 ext3_mark_inode_dirty(handle, inode);
1999                                 ext3_journal_test_restart(handle, inode);
2000                         }
2001
2002                         ext3_free_blocks(handle, inode, nr, 1);
2003
2004                         if (parent_bh) {
2005                                 /*
2006                                  * The block which we have just freed is
2007                                  * pointed to by an indirect block: journal it
2008                                  */
2009                                 BUFFER_TRACE(parent_bh, "get_write_access");
2010                                 if (!ext3_journal_get_write_access(handle,
2011                                                                    parent_bh)){
2012                                         *p = 0;
2013                                         BUFFER_TRACE(parent_bh,
2014                                         "call ext3_journal_dirty_metadata");
2015                                         ext3_journal_dirty_metadata(handle, 
2016                                                                     parent_bh);
2017                                 }
2018                         }
2019                 }
2020         } else {
2021                 /* We have reached the bottom of the tree. */
2022                 BUFFER_TRACE(parent_bh, "free data blocks");
2023                 ext3_free_data(handle, inode, parent_bh, first, last);
2024         }
2025 }
2026
2027 /*
2028  * ext3_truncate()
2029  *
2030  * We block out ext3_get_block() block instantiations across the entire
2031  * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2032  * simultaneously on behalf of the same inode.
2033  *
2034  * As we work through the truncate and commmit bits of it to the journal there
2035  * is one core, guiding principle: the file's tree must always be consistent on
2036  * disk.  We must be able to restart the truncate after a crash.
2037  *
2038  * The file's tree may be transiently inconsistent in memory (although it
2039  * probably isn't), but whenever we close off and commit a journal transaction,
2040  * the contents of (the filesystem + the journal) must be consistent and
2041  * restartable.  It's pretty simple, really: bottom up, right to left (although
2042  * left-to-right works OK too).
2043  *
2044  * Note that at recovery time, journal replay occurs *before* the restart of
2045  * truncate against the orphan inode list.
2046  *
2047  * The committed inode has the new, desired i_size (which is the same as
2048  * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
2049  * that this inode's truncate did not complete and it will again call
2050  * ext3_truncate() to have another go.  So there will be instantiated blocks
2051  * to the right of the truncation point in a crashed ext3 filesystem.  But
2052  * that's fine - as long as they are linked from the inode, the post-crash
2053  * ext3_truncate() run will find them and release them.
2054  */
2055
2056 void ext3_truncate_nocheck(struct inode * inode)
2057 {
2058         handle_t *handle;
2059         struct ext3_inode_info *ei = EXT3_I(inode);
2060         __le32 *i_data = ei->i_data;
2061         int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2062         struct address_space *mapping = inode->i_mapping;
2063         int offsets[4];
2064         Indirect chain[4];
2065         Indirect *partial;
2066         __le32 nr = 0;
2067         int n;
2068         long last_block;
2069         unsigned blocksize = inode->i_sb->s_blocksize;
2070         struct page *page;
2071
2072         if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2073             S_ISLNK(inode->i_mode)))
2074                 return;
2075         if (ext3_inode_is_fast_symlink(inode))
2076                 return;
2077
2078         ext3_discard_reservation(inode);
2079
2080         /*
2081          * We have to lock the EOF page here, because lock_page() nests
2082          * outside journal_start().
2083          */
2084         if ((inode->i_size & (blocksize - 1)) == 0) {
2085                 /* Block boundary? Nothing to do */
2086                 page = NULL;
2087         } else {
2088                 page = grab_cache_page(mapping,
2089                                 inode->i_size >> PAGE_CACHE_SHIFT);
2090                 if (!page)
2091                         return;
2092         }
2093
2094         handle = start_transaction(inode);
2095         if (IS_ERR(handle)) {
2096                 if (page) {
2097                         clear_highpage(page);
2098                         flush_dcache_page(page);
2099                         unlock_page(page);
2100                         page_cache_release(page);
2101                 }
2102                 return;         /* AKPM: return what? */
2103         }
2104
2105         last_block = (inode->i_size + blocksize-1)
2106                                         >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2107
2108         if (page)
2109                 ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2110
2111         n = ext3_block_to_path(inode, last_block, offsets, NULL);
2112         if (n == 0)
2113                 goto out_stop;  /* error */
2114
2115         /*
2116          * OK.  This truncate is going to happen.  We add the inode to the
2117          * orphan list, so that if this truncate spans multiple transactions,
2118          * and we crash, we will resume the truncate when the filesystem
2119          * recovers.  It also marks the inode dirty, to catch the new size.
2120          *
2121          * Implication: the file must always be in a sane, consistent
2122          * truncatable state while each transaction commits.
2123          */
2124         if (ext3_orphan_add(handle, inode))
2125                 goto out_stop;
2126
2127         /*
2128          * The orphan list entry will now protect us from any crash which
2129          * occurs before the truncate completes, so it is now safe to propagate
2130          * the new, shorter inode size (held for now in i_size) into the
2131          * on-disk inode. We do this via i_disksize, which is the value which
2132          * ext3 *really* writes onto the disk inode.
2133          */
2134         ei->i_disksize = inode->i_size;
2135
2136         /*
2137          * From here we block out all ext3_get_block() callers who want to
2138          * modify the block allocation tree.
2139          */
2140         down(&ei->truncate_sem);
2141
2142         if (n == 1) {           /* direct blocks */
2143                 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2144                                i_data + EXT3_NDIR_BLOCKS);
2145                 goto do_indirects;
2146         }
2147
2148         partial = ext3_find_shared(inode, n, offsets, chain, &nr);
2149         /* Kill the top of shared branch (not detached) */
2150         if (nr) {
2151                 if (partial == chain) {
2152                         /* Shared branch grows from the inode */
2153                         ext3_free_branches(handle, inode, NULL,
2154                                            &nr, &nr+1, (chain+n-1) - partial);
2155                         *partial->p = 0;
2156                         /*
2157                          * We mark the inode dirty prior to restart,
2158                          * and prior to stop.  No need for it here.
2159                          */
2160                 } else {
2161                         /* Shared branch grows from an indirect block */
2162                         BUFFER_TRACE(partial->bh, "get_write_access");
2163                         ext3_free_branches(handle, inode, partial->bh,
2164                                         partial->p,
2165                                         partial->p+1, (chain+n-1) - partial);
2166                 }
2167         }
2168         /* Clear the ends of indirect blocks on the shared branch */
2169         while (partial > chain) {
2170                 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2171                                    (__le32*)partial->bh->b_data+addr_per_block,
2172                                    (chain+n-1) - partial);
2173                 BUFFER_TRACE(partial->bh, "call brelse");
2174                 brelse (partial->bh);
2175                 partial--;
2176         }
2177 do_indirects:
2178         /* Kill the remaining (whole) subtrees */
2179         switch (offsets[0]) {
2180                 default:
2181                         nr = i_data[EXT3_IND_BLOCK];
2182                         if (nr) {
2183                                 ext3_free_branches(handle, inode, NULL,
2184                                                    &nr, &nr+1, 1);
2185                                 i_data[EXT3_IND_BLOCK] = 0;
2186                         }
2187                 case EXT3_IND_BLOCK:
2188                         nr = i_data[EXT3_DIND_BLOCK];
2189                         if (nr) {
2190                                 ext3_free_branches(handle, inode, NULL,
2191                                                    &nr, &nr+1, 2);
2192                                 i_data[EXT3_DIND_BLOCK] = 0;
2193                         }
2194                 case EXT3_DIND_BLOCK:
2195                         nr = i_data[EXT3_TIND_BLOCK];
2196                         if (nr) {
2197                                 ext3_free_branches(handle, inode, NULL,
2198                                                    &nr, &nr+1, 3);
2199                                 i_data[EXT3_TIND_BLOCK] = 0;
2200                         }
2201                 case EXT3_TIND_BLOCK:
2202                         ;
2203         }
2204         up(&ei->truncate_sem);
2205         inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
2206         ext3_mark_inode_dirty(handle, inode);
2207
2208         /* In a multi-transaction truncate, we only make the final
2209          * transaction synchronous */
2210         if (IS_SYNC(inode))
2211                 handle->h_sync = 1;
2212 out_stop:
2213         /*
2214          * If this was a simple ftruncate(), and the file will remain alive
2215          * then we need to clear up the orphan record which we created above.
2216          * However, if this was a real unlink then we were called by
2217          * ext3_delete_inode(), and we allow that function to clean up the
2218          * orphan info for us.
2219          */
2220         if (inode->i_nlink)
2221                 ext3_orphan_del(handle, inode);
2222
2223         ext3_journal_stop(handle);
2224 }
2225
2226 static unsigned long ext3_get_inode_block(struct super_block *sb,
2227                 unsigned long ino, struct ext3_iloc *iloc)
2228 {
2229         unsigned long desc, group_desc, block_group;
2230         unsigned long offset, block;
2231         struct buffer_head *bh;
2232         struct ext3_group_desc * gdp;
2233
2234
2235         if ((ino != EXT3_ROOT_INO &&
2236                 ino != EXT3_JOURNAL_INO &&
2237                 ino != EXT3_RESIZE_INO &&
2238                 ino < EXT3_FIRST_INO(sb)) ||
2239                 ino > le32_to_cpu(
2240                         EXT3_SB(sb)->s_es->s_inodes_count)) {
2241                 ext3_error (sb, "ext3_get_inode_block",
2242                             "bad inode number: %lu", ino);
2243                 return 0;
2244         }
2245         block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2246         if (block_group >= EXT3_SB(sb)->s_groups_count) {
2247                 ext3_error (sb, "ext3_get_inode_block",
2248                             "group >= groups count");
2249                 return 0;
2250         }
2251         smp_rmb();
2252         group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
2253         desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
2254         bh = EXT3_SB(sb)->s_group_desc[group_desc];
2255         if (!bh) {
2256                 ext3_error (sb, "ext3_get_inode_block",
2257                             "Descriptor not loaded");
2258                 return 0;
2259         }
2260
2261         gdp = (struct ext3_group_desc *) bh->b_data;
2262         /*
2263          * Figure out the offset within the block group inode table
2264          */
2265         offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2266                 EXT3_INODE_SIZE(sb);
2267         block = le32_to_cpu(gdp[desc].bg_inode_table) +
2268                 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2269
2270         iloc->block_group = block_group;
2271         iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2272         return block;
2273 }
2274
2275 /*
2276  * ext3_get_inode_loc returns with an extra refcount against the inode's
2277  * underlying buffer_head on success. If 'in_mem' is true, we have all
2278  * data in memory that is needed to recreate the on-disk version of this
2279  * inode.
2280  */
2281 static int __ext3_get_inode_loc(struct inode *inode,
2282                                 struct ext3_iloc *iloc, int in_mem)
2283 {
2284         unsigned long block;
2285         struct buffer_head *bh;
2286
2287         block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2288         if (!block)
2289                 return -EIO;
2290
2291         bh = sb_getblk(inode->i_sb, block);
2292         if (!bh) {
2293                 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2294                                 "unable to read inode block - "
2295                                 "inode=%lu, block=%lu", inode->i_ino, block);
2296                 return -EIO;
2297         }
2298         if (!buffer_uptodate(bh)) {
2299                 lock_buffer(bh);
2300                 if (buffer_uptodate(bh)) {
2301                         /* someone brought it uptodate while we waited */
2302                         unlock_buffer(bh);
2303                         goto has_buffer;
2304                 }
2305
2306                 /*
2307                  * If we have all information of the inode in memory and this
2308                  * is the only valid inode in the block, we need not read the
2309                  * block.
2310                  */
2311                 if (in_mem) {
2312                         struct buffer_head *bitmap_bh;
2313                         struct ext3_group_desc *desc;
2314                         int inodes_per_buffer;
2315                         int inode_offset, i;
2316                         int block_group;
2317                         int start;
2318
2319                         block_group = (inode->i_ino - 1) /
2320                                         EXT3_INODES_PER_GROUP(inode->i_sb);
2321                         inodes_per_buffer = bh->b_size /
2322                                 EXT3_INODE_SIZE(inode->i_sb);
2323                         inode_offset = ((inode->i_ino - 1) %
2324                                         EXT3_INODES_PER_GROUP(inode->i_sb));
2325                         start = inode_offset & ~(inodes_per_buffer - 1);
2326
2327                         /* Is the inode bitmap in cache? */
2328                         desc = ext3_get_group_desc(inode->i_sb,
2329                                                 block_group, NULL);
2330                         if (!desc)
2331                                 goto make_io;
2332
2333                         bitmap_bh = sb_getblk(inode->i_sb,
2334                                         le32_to_cpu(desc->bg_inode_bitmap));
2335                         if (!bitmap_bh)
2336                                 goto make_io;
2337
2338                         /*
2339                          * If the inode bitmap isn't in cache then the
2340                          * optimisation may end up performing two reads instead
2341                          * of one, so skip it.
2342                          */
2343                         if (!buffer_uptodate(bitmap_bh)) {
2344                                 brelse(bitmap_bh);
2345                                 goto make_io;
2346                         }
2347                         for (i = start; i < start + inodes_per_buffer; i++) {
2348                                 if (i == inode_offset)
2349                                         continue;
2350                                 if (ext3_test_bit(i, bitmap_bh->b_data))
2351                                         break;
2352                         }
2353                         brelse(bitmap_bh);
2354                         if (i == start + inodes_per_buffer) {
2355                                 /* all other inodes are free, so skip I/O */
2356                                 memset(bh->b_data, 0, bh->b_size);
2357                                 set_buffer_uptodate(bh);
2358                                 unlock_buffer(bh);
2359                                 goto has_buffer;
2360                         }
2361                 }
2362
2363 make_io:
2364                 /*
2365                  * There are other valid inodes in the buffer, this inode
2366                  * has in-inode xattrs, or we don't have this inode in memory.
2367                  * Read the block from disk.
2368                  */
2369                 get_bh(bh);
2370                 bh->b_end_io = end_buffer_read_sync;
2371                 submit_bh(READ, bh);
2372                 wait_on_buffer(bh);
2373                 if (!buffer_uptodate(bh)) {
2374                         ext3_error(inode->i_sb, "ext3_get_inode_loc",
2375                                         "unable to read inode block - "
2376                                         "inode=%lu, block=%lu",
2377                                         inode->i_ino, block);
2378                         brelse(bh);
2379                         return -EIO;
2380                 }
2381         }
2382 has_buffer:
2383         iloc->bh = bh;
2384         return 0;
2385 }
2386
2387 int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2388 {
2389         /* We have all inode data except xattrs in memory here. */
2390         return __ext3_get_inode_loc(inode, iloc,
2391                 !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
2392 }
2393
/*
 * Truncate 'inode' unless it is append-only or immutable, in which case
 * nothing is done.  The actual work is delegated to
 * ext3_truncate_nocheck().
 */
void ext3_truncate(struct inode * inode)
{
	if (!IS_APPEND(inode) && !IS_IMMUTABLE(inode))
		ext3_truncate_nocheck(inode);
}
2400
2401 void ext3_set_inode_flags(struct inode *inode)
2402 {
2403         unsigned int flags = EXT3_I(inode)->i_flags;
2404
2405         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_IUNLINK|S_BARRIER|S_NOATIME|S_DIRSYNC);
2406         if (flags & EXT3_SYNC_FL)
2407                 inode->i_flags |= S_SYNC;
2408         if (flags & EXT3_APPEND_FL)
2409                 inode->i_flags |= S_APPEND;
2410         if (flags & EXT3_IMMUTABLE_FL)
2411                 inode->i_flags |= S_IMMUTABLE;
2412         if (flags & EXT3_IUNLINK_FL)
2413                 inode->i_flags |= S_IUNLINK;
2414         if (flags & EXT3_BARRIER_FL)
2415                 inode->i_flags |= S_BARRIER;
2416         if (flags & EXT3_NOATIME_FL)
2417                 inode->i_flags |= S_NOATIME;
2418         if (flags & EXT3_DIRSYNC_FL)
2419                 inode->i_flags |= S_DIRSYNC;
2420 }
2421
2422 void ext3_read_inode(struct inode * inode)
2423 {
2424         struct ext3_iloc iloc;
2425         struct ext3_inode *raw_inode;
2426         struct ext3_inode_info *ei = EXT3_I(inode);
2427         struct buffer_head *bh;
2428         int block;
2429         uid_t uid;
2430         gid_t gid;
2431
2432 #ifdef CONFIG_EXT3_FS_POSIX_ACL
2433         ei->i_acl = EXT3_ACL_NOT_CACHED;
2434         ei->i_default_acl = EXT3_ACL_NOT_CACHED;
2435 #endif
2436         ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
2437
2438         if (__ext3_get_inode_loc(inode, &iloc, 0))
2439                 goto bad_inode;
2440         bh = iloc.bh;
2441         raw_inode = ext3_raw_inode(&iloc);
2442         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2443         uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2444         gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2445         if(!(test_opt (inode->i_sb, NO_UID32))) {
2446                 uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2447                 gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2448         }
2449         inode->i_uid = INOXID_UID(XID_TAG(inode), uid, gid);
2450         inode->i_gid = INOXID_GID(XID_TAG(inode), uid, gid);
2451         inode->i_xid = INOXID_XID(XID_TAG(inode), uid, gid,
2452                 le16_to_cpu(raw_inode->i_raw_xid));
2453
2454         inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2455         inode->i_size = le32_to_cpu(raw_inode->i_size);
2456         inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2457         inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2458         inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2459         inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2460
2461         ei->i_state = 0;
2462         ei->i_next_alloc_block = 0;
2463         ei->i_next_alloc_goal = 0;
2464         ei->i_dir_start_lookup = 0;
2465         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2466         /* We now have enough fields to check if the inode was active or not.
2467          * This is needed because nfsd might try to access dead inodes
2468          * the test is that same one that e2fsck uses
2469          * NeilBrown 1999oct15
2470          */
2471         if (inode->i_nlink == 0) {
2472                 if (inode->i_mode == 0 ||
2473                     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2474                         /* this inode is deleted */
2475                         brelse (bh);
2476                         goto bad_inode;
2477                 }
2478                 /* The only unlinked inodes we let through here have
2479                  * valid i_mode and are being read by the orphan
2480                  * recovery code: that's fine, we're about to complete
2481                  * the process of deleting those. */
2482         }
2483         inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
2484                                          * (for stat), not the fs block
2485                                          * size */  
2486         inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2487         ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2488 #ifdef EXT3_FRAGMENTS
2489         ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2490         ei->i_frag_no = raw_inode->i_frag;
2491         ei->i_frag_size = raw_inode->i_fsize;
2492 #endif
2493         ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2494         if (!S_ISREG(inode->i_mode)) {
2495                 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2496         } else {
2497                 inode->i_size |=
2498                         ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2499         }
2500         ei->i_disksize = inode->i_size;
2501         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2502         ei->i_block_group = iloc.block_group;
2503         ei->i_rsv_window.rsv_start = 0;
2504         ei->i_rsv_window.rsv_end= 0;
2505         atomic_set(&ei->i_rsv_window.rsv_goal_size, EXT3_DEFAULT_RESERVE_BLOCKS);
2506         seqlock_init(&ei->i_rsv_window.rsv_seqlock);
2507         /*
2508          * NOTE! The in-memory inode i_data array is in little-endian order
2509          * even on big-endian machines: we do NOT byteswap the block numbers!
2510          */
2511         for (block = 0; block < EXT3_N_BLOCKS; block++)
2512                 ei->i_data[block] = raw_inode->i_block[block];
2513         INIT_LIST_HEAD(&ei->i_orphan);
2514
2515         if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
2516             EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
2517                 /*
2518                  * When mke2fs creates big inodes it does not zero out
2519                  * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
2520                  * so ignore those first few inodes.
2521                  */
2522                 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2523                 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2524                     EXT3_INODE_SIZE(inode->i_sb))
2525                         goto bad_inode;
2526                 if (ei->i_extra_isize == 0) {
2527                         /* The extra space is currently unused. Use it. */
2528                         ei->i_extra_isize = sizeof(struct ext3_inode) -
2529                                             EXT3_GOOD_OLD_INODE_SIZE;
2530                 } else {
2531                         __le32 *magic = (void *)raw_inode +
2532                                         EXT3_GOOD_OLD_INODE_SIZE +
2533                                         ei->i_extra_isize;
2534                         if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2535                                  ei->i_state |= EXT3_STATE_XATTR;
2536                 }
2537         } else
2538                 ei->i_extra_isize = 0;
2539
2540         if (S_ISREG(inode->i_mode)) {
2541                 inode->i_op = &ext3_file_inode_operations;
2542                 inode->i_fop = &ext3_file_operations;
2543                 ext3_set_aops(inode);
2544         } else if (S_ISDIR(inode->i_mode)) {
2545                 inode->i_op = &ext3_dir_inode_operations;
2546                 inode->i_fop = &ext3_dir_operations;
2547         } else if (S_ISLNK(inode->i_mode)) {
2548                 if (ext3_inode_is_fast_symlink(inode))
2549                         inode->i_op = &ext3_fast_symlink_inode_operations;
2550                 else {
2551                         inode->i_op = &ext3_symlink_inode_operations;
2552                         ext3_set_aops(inode);
2553                 }
2554         } else {
2555                 inode->i_op = &ext3_special_inode_operations;
2556                 if (raw_inode->i_block[0])
2557                         init_special_inode(inode, inode->i_mode,
2558                            old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2559                 else 
2560                         init_special_inode(inode, inode->i_mode,
2561                            new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2562         }
2563         brelse (iloc.bh);
2564         ext3_set_inode_flags(inode);
2565         return;
2566
2567 bad_inode:
2568         make_bad_inode(inode);
2569         return;
2570 }
2571
2572 /*
2573  * Post the struct inode info into an on-disk inode location in the
2574  * buffer-cache.  This gobbles the caller's reference to the
2575  * buffer_head in the inode location struct.
2576  *
2577  * The caller must have write access to iloc->bh.
2578  */
2579 static int ext3_do_update_inode(handle_t *handle, 
2580                                 struct inode *inode, 
2581                                 struct ext3_iloc *iloc)
2582 {
2583         struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
2584         struct ext3_inode_info *ei = EXT3_I(inode);
2585         struct buffer_head *bh = iloc->bh;
2586         uid_t uid = XIDINO_UID(XID_TAG(inode), inode->i_uid, inode->i_xid);
2587         gid_t gid = XIDINO_GID(XID_TAG(inode), inode->i_gid, inode->i_xid);
2588         int err = 0, rc, block;
2589
2590         /* For fields not not tracking in the in-memory inode,
2591          * initialise them to zero for new inodes. */
2592         if (ei->i_state & EXT3_STATE_NEW)
2593                 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2594
2595         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2596         if(!(test_opt(inode->i_sb, NO_UID32))) {
2597                 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid));
2598                 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid));
2599 /*
2600  * Fix up interoperability with old kernels. Otherwise, old inodes get
2601  * re-used with the upper 16 bits of the uid/gid intact
2602  */
2603                 if(!ei->i_dtime) {
2604                         raw_inode->i_uid_high =
2605                                 cpu_to_le16(high_16_bits(uid));
2606                         raw_inode->i_gid_high =
2607                                 cpu_to_le16(high_16_bits(gid));
2608                 } else {
2609                         raw_inode->i_uid_high = 0;
2610                         raw_inode->i_gid_high = 0;
2611                 }
2612         } else {
2613                 raw_inode->i_uid_low =
2614                         cpu_to_le16(fs_high2lowuid(uid));
2615                 raw_inode->i_gid_low =
2616                         cpu_to_le16(fs_high2lowgid(gid));
2617                 raw_inode->i_uid_high = 0;
2618                 raw_inode->i_gid_high = 0;
2619         }
2620 #ifdef CONFIG_INOXID_INTERN
2621         raw_inode->i_raw_xid = cpu_to_le16(inode->i_xid);
2622 #endif
2623         raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2624         raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2625         raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2626         raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2627         raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2628         raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2629         raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2630         raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2631 #ifdef EXT3_FRAGMENTS
2632         raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2633         raw_inode->i_frag = ei->i_frag_no;
2634         raw_inode->i_fsize = ei->i_frag_size;
2635 #endif
2636         raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
2637         if (!S_ISREG(inode->i_mode)) {
2638                 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2639         } else {
2640                 raw_inode->i_size_high =
2641                         cpu_to_le32(ei->i_disksize >> 32);
2642                 if (ei->i_disksize > 0x7fffffffULL) {
2643                         struct super_block *sb = inode->i_sb;
2644                         if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
2645                                         EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
2646                             EXT3_SB(sb)->s_es->s_rev_level ==
2647                                         cpu_to_le32(EXT3_GOOD_OLD_REV)) {
2648                                /* If this is the first large file
2649                                 * created, add a flag to the superblock.
2650                                 */
2651                                 err = ext3_journal_get_write_access(handle,
2652                                                 EXT3_SB(sb)->s_sbh);
2653                                 if (err)
2654                                         goto out_brelse;
2655                                 ext3_update_dynamic_rev(sb);
2656                                 EXT3_SET_RO_COMPAT_FEATURE(sb,
2657                                         EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2658                                 sb->s_dirt = 1;
2659                                 handle->h_sync = 1;
2660                                 err = ext3_journal_dirty_metadata(handle,
2661                                                 EXT3_SB(sb)->s_sbh);
2662                         }
2663                 }
2664         }
2665         raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2666         if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
2667                 if (old_valid_dev(inode->i_rdev)) {
2668                         raw_inode->i_block[0] =
2669                                 cpu_to_le32(old_encode_dev(inode->i_rdev));
2670                         raw_inode->i_block[1] = 0;
2671                 } else {
2672                         raw_inode->i_block[0] = 0;
2673                         raw_inode->i_block[1] =
2674                                 cpu_to_le32(new_encode_dev(inode->i_rdev));
2675                         raw_inode->i_block[2] = 0;
2676                 }
2677         } else for (block = 0; block < EXT3_N_BLOCKS; block++)
2678                 raw_inode->i_block[block] = ei->i_data[block];
2679
2680         if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE)
2681                 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2682
2683         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2684         rc = ext3_journal_dirty_metadata(handle, bh);
2685         if (!err)
2686                 err = rc;
2687         ei->i_state &= ~EXT3_STATE_NEW;
2688
2689 out_brelse:
2690         brelse (bh);
2691         ext3_std_error(inode->i_sb, err);
2692         return err;
2693 }
2694
2695 /*
2696  * ext3_write_inode()
2697  *
2698  * We are called from a few places:
2699  *
2700  * - Within generic_file_write() for O_SYNC files.
2701  *   Here, there will be no transaction running. We wait for any running
2702  *   trasnaction to commit.
2703  *
2704  * - Within sys_sync(), kupdate and such.
2705  *   We wait on commit, if tol to.
2706  *
2707  * - Within prune_icache() (PF_MEMALLOC == true)
2708  *   Here we simply return.  We can't afford to block kswapd on the
2709  *   journal commit.
2710  *
2711  * In all cases it is actually safe for us to return without doing anything,
2712  * because the inode has been copied into a raw inode buffer in
2713  * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
2714  * knfsd.
2715  *
2716  * Note that we are absolutely dependent upon all inode dirtiers doing the
2717  * right thing: they *must* call mark_inode_dirty() after dirtying info in
2718  * which we are interested.
2719  *
2720  * It would be a bug for them to not do this.  The code:
2721  *
2722  *      mark_inode_dirty(inode)
2723  *      stuff();
2724  *      inode->i_size = expr;
2725  *
2726  * is in error because a kswapd-driven write_inode() could occur while
2727  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
2728  * will no longer be on the superblock's dirty inode list.
2729  */
2730 int ext3_write_inode(struct inode *inode, int wait)
2731 {
2732         if (current->flags & PF_MEMALLOC)
2733                 return 0;
2734
2735         if (ext3_journal_current_handle()) {
2736                 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2737                 dump_stack();
2738                 return -EIO;
2739         }
2740
2741         if (!wait)
2742                 return 0;
2743
2744         return ext3_force_commit(inode->i_sb);
2745 }
2746
2747 int ext3_setattr_flags(struct inode *inode, unsigned int flags)
2748 {
2749         unsigned int oldflags, newflags;
2750         int err = 0;
2751
2752         oldflags = EXT3_I(inode)->i_flags;
2753         newflags = oldflags &
2754                 ~(EXT3_IMMUTABLE_FL | EXT3_IUNLINK_FL | EXT3_BARRIER_FL);
2755         if (flags & ATTR_FLAG_IMMUTABLE)
2756                 newflags |= EXT3_IMMUTABLE_FL;
2757         if (flags & ATTR_FLAG_IUNLINK)
2758                 newflags |= EXT3_IUNLINK_FL;
2759         if (flags & ATTR_FLAG_BARRIER)
2760                 newflags |= EXT3_BARRIER_FL;
2761
2762         if (oldflags ^ newflags) {
2763                 handle_t *handle;
2764                 struct ext3_iloc iloc;
2765
2766                 handle = ext3_journal_start(inode, 1);
2767                 if (IS_ERR(handle))
2768                         return PTR_ERR(handle);
2769                 if (IS_SYNC(inode))
2770                         handle->h_sync = 1;
2771                 err = ext3_reserve_inode_write(handle, inode, &iloc);
2772                 if (err)
2773                         goto flags_err;
2774
2775                 EXT3_I(inode)->i_flags = newflags;
2776                 inode->i_ctime = CURRENT_TIME;
2777
2778                 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2779         flags_err:
2780                 ext3_journal_stop(handle);
2781         }
2782         return err;
2783 }
2784
2785 /*
2786  * ext3_setattr()
2787  *
2788  * Called from notify_change.
2789  *
2790  * We want to trap VFS attempts to truncate the file as soon as
2791  * possible.  In particular, we want to make sure that when the VFS
2792  * shrinks i_size, we put the inode on the orphan list and modify
2793  * i_disksize immediately, so that during the subsequent flushing of
2794  * dirty pages and freeing of disk blocks, we can guarantee that any
2795  * commit will leave the blocks being flushed in an unused state on
2796  * disk.  (On recovery, the inode will get truncated and the blocks will
2797  * be freed, so we have a strong guarantee that no future commit will
2798  * leave these blocks visible to the user.)  
2799  *
2800  * Called with inode->sem down.
2801  */
2802 int ext3_setattr(struct dentry *dentry, struct iattr *attr)
2803 {
2804         struct inode *inode = dentry->d_inode;
2805         int error, rc = 0;
2806         const unsigned int ia_valid = attr->ia_valid;
2807
2808         error = inode_change_ok(inode, attr);
2809         if (error)
2810                 return error;
2811
2812         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2813                 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid) ||
2814                 (ia_valid & ATTR_XID && attr->ia_xid != inode->i_xid)) {
2815                 handle_t *handle;
2816
2817                 /* (user+group)*(old+new) structure, inode write (sb,
2818                  * inode block, ? - but truncate inode update has it) */
2819                 handle = ext3_journal_start(inode, 4*EXT3_QUOTA_INIT_BLOCKS+3);
2820                 if (IS_ERR(handle)) {
2821                         error = PTR_ERR(handle);
2822                         goto err_out;
2823                 }
2824                 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2825                 if (error) {
2826                         ext3_journal_stop(handle);
2827                         return error;
2828                 }
2829                 /* Update corresponding info in inode so that everything is in
2830                  * one transaction */
2831                 if (attr->ia_valid & ATTR_UID)
2832                         inode->i_uid = attr->ia_uid;
2833                 if (attr->ia_valid & ATTR_GID)
2834                         inode->i_gid = attr->ia_gid;
2835                 if ((attr->ia_valid & ATTR_XID)
2836                         && inode->i_sb
2837                         && (inode->i_sb->s_flags & MS_TAGXID))
2838                         inode->i_xid = attr->ia_xid;
2839                 error = ext3_mark_inode_dirty(handle, inode);
2840                 ext3_journal_stop(handle);
2841         }
2842
2843         if (S_ISREG(inode->i_mode) &&
2844             attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2845                 handle_t *handle;
2846
2847                 handle = ext3_journal_start(inode, 3);
2848                 if (IS_ERR(handle)) {
2849                         error = PTR_ERR(handle);
2850                         goto err_out;
2851                 }
2852
2853                 error = ext3_orphan_add(handle, inode);
2854                 EXT3_I(inode)->i_disksize = attr->ia_size;
2855                 rc = ext3_mark_inode_dirty(handle, inode);
2856                 if (!error)
2857                         error = rc;
2858                 ext3_journal_stop(handle);
2859         }
2860
2861         if (ia_valid & ATTR_ATTR_FLAG) {
2862                 rc = ext3_setattr_flags(inode, attr->ia_attr_flags);
2863                 if (!error)
2864                         error = rc;
2865         }
2866
2867         rc = inode_setattr(inode, attr);
2868
2869         /* If inode_setattr's call to ext3_truncate failed to get a
2870          * transaction handle at all, we need to clean up the in-core
2871          * orphan list manually. */
2872         if (inode->i_nlink)
2873                 ext3_orphan_del(NULL, inode);
2874
2875         if (!rc && (ia_valid & ATTR_MODE))
2876                 rc = ext3_acl_chmod(inode);
2877
2878 err_out:
2879         ext3_std_error(inode->i_sb, error);
2880         if (!error)
2881                 error = rc;
2882         return error;
2883 }
2884
2885
2886 /*
2887  * akpm: how many blocks doth make a writepage()?
2888  *
2889  * With N blocks per page, it may be:
2890  * N data blocks
2891  * 2 indirect block
2892  * 2 dindirect
2893  * 1 tindirect
2894  * N+5 bitmap blocks (from the above)
2895  * N+5 group descriptor summary blocks
2896  * 1 inode block
2897  * 1 superblock.
2898  * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files
2899  *
2900  * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
2901  *
2902  * With ordered or writeback data it's the same, less the N data blocks.
2903  *
2904  * If the inode's direct blocks can hold an integral number of pages then a
2905  * page cannot straddle two indirect blocks, and we can only touch one indirect
2906  * and dindirect block, and the "5" above becomes "3".
2907  *
2908  * This still overestimates under most circumstances.  If we were to pass the
2909  * start and end offsets in here as well we could do block_to_path() on each
2910  * block and work out the exact number of indirects which are touched.  Pah.
2911  */
2912
2913 static int ext3_writepage_trans_blocks(struct inode *inode)
2914 {
2915         int bpp = ext3_journal_blocks_per_page(inode);
2916         int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
2917         int ret;
2918
2919         if (ext3_should_journal_data(inode))
2920                 ret = 3 * (bpp + indirects) + 2;
2921         else
2922                 ret = 2 * (bpp + indirects) + 2;
2923
2924 #ifdef CONFIG_QUOTA
2925         /* We know that structure was already allocated during DQUOT_INIT so
2926          * we will be updating only the data blocks + inodes */
2927         ret += 2*EXT3_QUOTA_TRANS_BLOCKS;
2928 #endif
2929
2930         return ret;
2931 }
2932
2933 /*
2934  * The caller must have previously called ext3_reserve_inode_write().
2935  * Give this, we know that the caller already has write access to iloc->bh.
2936  */
2937 int ext3_mark_iloc_dirty(handle_t *handle,
2938                 struct inode *inode, struct ext3_iloc *iloc)
2939 {
2940         int err = 0;
2941
2942         /* the do_update_inode consumes one bh->b_count */
2943         get_bh(iloc->bh);
2944
2945         /* ext3_do_update_inode() does journal_dirty_metadata */
2946         err = ext3_do_update_inode(handle, inode, iloc);
2947         put_bh(iloc->bh);
2948         return err;
2949 }
2950
2951 /* 
2952  * On success, We end up with an outstanding reference count against
2953  * iloc->bh.  This _must_ be cleaned up later. 
2954  */
2955
2956 int
2957 ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 
2958                          struct ext3_iloc *iloc)
2959 {
2960         int err = 0;
2961         if (handle) {
2962                 err = ext3_get_inode_loc(inode, iloc);
2963                 if (!err) {
2964                         BUFFER_TRACE(iloc->bh, "get_write_access");
2965                         err = ext3_journal_get_write_access(handle, iloc->bh);
2966                         if (err) {
2967                                 brelse(iloc->bh);
2968                                 iloc->bh = NULL;
2969                         }
2970                 }
2971         }
2972         ext3_std_error(inode->i_sb, err);
2973         return err;
2974 }
2975
2976 /*
2977  * akpm: What we do here is to mark the in-core inode as clean
2978  * with respect to inode dirtiness (it may still be data-dirty).
2979  * This means that the in-core inode may be reaped by prune_icache
2980  * without having to perform any I/O.  This is a very good thing,
2981  * because *any* task may call prune_icache - even ones which
2982  * have a transaction open against a different journal.
2983  *
2984  * Is this cheating?  Not really.  Sure, we haven't written the
2985  * inode out, but prune_icache isn't a user-visible syncing function.
2986  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
2987  * we start and wait on commits.
2988  *
2989  * Is this efficient/effective?  Well, we're being nice to the system
2990  * by cleaning up our inodes proactively so they can be reaped
2991  * without I/O.  But we are potentially leaving up to five seconds'
2992  * worth of inodes floating about which prune_icache wants us to
2993  * write out.  One way to fix that would be to get prune_icache()
2994  * to do a write_super() to free up some memory.  It has the desired
2995  * effect.
2996  */
2997 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
2998 {
2999         struct ext3_iloc iloc;
3000         int err;
3001
3002         might_sleep();
3003         err = ext3_reserve_inode_write(handle, inode, &iloc);
3004         if (!err)
3005                 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
3006         return err;
3007 }
3008
3009 /*
3010  * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
3011  *
3012  * We're really interested in the case where a file is being extended.
3013  * i_size has been changed by generic_commit_write() and we thus need
3014  * to include the updated inode in the current transaction.
3015  *
3016  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
3017  * are allocated to the file.
3018  *
3019  * If the inode is marked synchronous, we don't honour that here - doing
3020  * so would cause a commit on atime updates, which we don't bother doing.
3021  * We handle synchronous inodes at the highest possible level.
3022  */
3023 void ext3_dirty_inode(struct inode *inode)
3024 {
3025         handle_t *current_handle = ext3_journal_current_handle();
3026         handle_t *handle;
3027
3028         handle = ext3_journal_start(inode, 2);
3029         if (IS_ERR(handle))
3030                 goto out;
3031         if (current_handle &&
3032                 current_handle->h_transaction != handle->h_transaction) {
3033                 /* This task has a transaction open against a different fs */
3034                 printk(KERN_EMERG "%s: transactions do not match!\n",
3035                        __FUNCTION__);
3036         } else {
3037                 jbd_debug(5, "marking dirty.  outer handle=%p\n",
3038                                 current_handle);
3039                 ext3_mark_inode_dirty(handle, inode);
3040         }
3041         ext3_journal_stop(handle);
3042 out:
3043         return;
3044 }
3045
#ifdef AKPM
/*
 * Bind an inode's backing buffer_head into this transaction, to prevent
 * it from being flushed to disk early.  Unlike
 * ext3_reserve_inode_write, this leaves behind no bh reference and
 * returns no iloc structure, so the caller needs to repeat the iloc
 * lookup to mark the inode dirty later.
 *
 * Returns 0 on success (or when @handle is NULL, in which case nothing
 * is done), otherwise the error from the lookup or from the journal
 * calls.  Any error is also passed to ext3_std_error().
 */
static inline int
ext3_pin_inode(handle_t *handle, struct inode *inode)
{
	struct ext3_iloc iloc;
	int err = 0;

	if (handle) {
		err = ext3_get_inode_loc(inode, &iloc);
		if (!err) {
			BUFFER_TRACE(iloc.bh, "get_write_access");
			/*
			 * Go through the ext3 wrapper, exactly as
			 * ext3_reserve_inode_write() does, rather than
			 * calling the raw JBD entry point.
			 */
			err = ext3_journal_get_write_access(handle, iloc.bh);
			if (!err)
				err = ext3_journal_dirty_metadata(handle,
								  iloc.bh);
			/* No reference is left behind, success or failure. */
			brelse(iloc.bh);
		}
	}
	ext3_std_error(inode->i_sb, err);
	return err;
}
#endif
3075
3076 int ext3_change_inode_journal_flag(struct inode *inode, int val)
3077 {
3078         journal_t *journal;
3079         handle_t *handle;
3080         int err;
3081
3082         /*
3083          * We have to be very careful here: changing a data block's
3084          * journaling status dynamically is dangerous.  If we write a
3085          * data block to the journal, change the status and then delete
3086          * that block, we risk forgetting to revoke the old log record
3087          * from the journal and so a subsequent replay can corrupt data.
3088          * So, first we make sure that the journal is empty and that
3089          * nobody is changing anything.
3090          */
3091
3092         journal = EXT3_JOURNAL(inode);
3093         if (is_journal_aborted(journal) || IS_RDONLY(inode))
3094                 return -EROFS;
3095
3096         journal_lock_updates(journal);
3097         journal_flush(journal);
3098
3099         /*
3100          * OK, there are no updates running now, and all cached data is
3101          * synced to disk.  We are now in a completely consistent state
3102          * which doesn't have anything in the journal, and we know that
3103          * no filesystem updates are running, so it is safe to modify
3104          * the inode's in-core data-journaling state flag now.
3105          */
3106
3107         if (val)
3108                 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3109         else
3110                 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
3111         ext3_set_aops(inode);
3112
3113         journal_unlock_updates(journal);
3114
3115         /* Finally we can mark the inode as dirty. */
3116
3117         handle = ext3_journal_start(inode, 1);
3118         if (IS_ERR(handle))
3119                 return PTR_ERR(handle);
3120
3121         err = ext3_mark_inode_dirty(handle, inode);
3122         handle->h_sync = 1;
3123         ext3_journal_stop(handle);
3124         ext3_std_error(inode->i_sb, err);
3125
3126         return err;
3127 }