fs/ext3/inode.c

   1 /*
   2  *  linux/fs/ext3/inode.c
   3  *
   4  * Copyright (C) 1992, 1993, 1994, 1995
   5  * Remy Card (card@masi.ibp.fr)
   6  * Laboratoire MASI - Institut Blaise Pascal
   7  * Universite Pierre et Marie Curie (Paris VI)
   8  *
   9  *  from
  10  *
  11  *  linux/fs/minix/inode.c
  12  *
  13  *  Copyright (C) 1991, 1992  Linus Torvalds
  14  *
  15  *  Goal-directed block allocation by Stephen Tweedie
  16  *      (sct@redhat.com), 1993, 1998
  17  *  Big-endian to little-endian byte-swapping/bitmaps by
  18  *        David S. Miller (davem@caip.rutgers.edu), 1995
  19  *  64-bit file support on 64-bit platforms by Jakub Jelinek
  20  *      (jj@sunsite.ms.mff.cuni.cz)
  21  *
  22  *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
  23  */
  24
  25 #include <linux/module.h>
  26 #include <linux/fs.h>
  27 #include <linux/time.h>
  28 #include <linux/ext3_jbd.h>
  29 #include <linux/jbd.h>
  30 #include <linux/smp_lock.h>
  31 #include <linux/highuid.h>
  32 #include <linux/pagemap.h>
  33 #include <linux/quotaops.h>
  34 #include <linux/string.h>
  35 #include <linux/buffer_head.h>
  36 #include <linux/writeback.h>
  37 #include <linux/mpage.h>
  38 #include <linux/uio.h>
  39 #include <linux/vserver/xid.h>
  40 #include "xattr.h"
  41 #include "acl.h"
  42
  43 /*
  44  * Test whether an inode is a fast symlink.
  45  */
  46 static inline int ext3_inode_is_fast_symlink(struct inode *inode)
  47 {
  48         int ea_blocks = EXT3_I(inode)->i_file_acl ?
  49                 (inode->i_sb->s_blocksize >> 9) : 0;
  50
  51         return (S_ISLNK(inode->i_mode) &&
  52                 inode->i_blocks - ea_blocks == 0);
  53 }
  54
  55 /* The ext3 forget function must perform a revoke if we are freeing data
  56  * which has been journaled.  Metadata (eg. indirect blocks) must be
  57  * revoked in all cases.
  58  *
  59  * "bh" may be NULL: a metadata block may have been freed from memory
  60  * but there may still be a record of it in the journal, and that record
  61  * still needs to be revoked.
  62  */
  63
  64 int ext3_forget(handle_t *handle, int is_metadata,
  65                        struct inode *inode, struct buffer_head *bh,
  66                        int blocknr)
  67 {
  68         int err;
  69
  70         might_sleep();
  71
  72         BUFFER_TRACE(bh, "enter");
  73
  74         jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
  75                   "data mode %lx\n",
  76                   bh, is_metadata, inode->i_mode,
  77                   test_opt(inode->i_sb, DATA_FLAGS));
  78
  79         /* Never use the revoke function if we are doing full data
  80          * journaling: there is no need to, and a V1 superblock won't
  81          * support it.  Otherwise, only skip the revoke on un-journaled
  82          * data blocks. */
  83
  84         if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
  85             (!is_metadata && !ext3_should_journal_data(inode))) {
  86                 if (bh) {
  87                         BUFFER_TRACE(bh, "call journal_forget");
  88                         return ext3_journal_forget(handle, bh);
  89                 }
  90                 return 0;
  91         }
  92
  93         /*
  94          * data!=journal && (is_metadata || should_journal_data(inode))
  95          */
  96         BUFFER_TRACE(bh, "call ext3_journal_revoke");
  97         err = ext3_journal_revoke(handle, blocknr, bh);
  98         if (err)
  99                 ext3_abort(inode->i_sb, __FUNCTION__,
 100                            "error %d when attempting revoke", err);
 101         BUFFER_TRACE(bh, "exit");
 102         return err;
 103 }
 104
 105 /*
 106  * Work out how many blocks we need to progress with the next chunk of a
 107  * truncate transaction.
 108  */
 109
 110 static unsigned long blocks_for_truncate(struct inode *inode)
 111 {
 112         unsigned long needed;
 113
 114         needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
 115
 116         /* Give ourselves just enough room to cope with inodes in which
 117          * i_blocks is corrupt: we've seen disk corruptions in the past
 118          * which resulted in random data in an inode which looked enough
 119          * like a regular file for ext3 to try to delete it.  Things
 120          * will go a bit crazy if that happens, but at least we should
 121          * try not to panic the whole kernel. */
 122         if (needed < 2)
 123                 needed = 2;
 124
 125         /* But we need to bound the transaction so we don't overflow the
 126          * journal. */
 127         if (needed > EXT3_MAX_TRANS_DATA)
 128                 needed = EXT3_MAX_TRANS_DATA;
 129
 130         return EXT3_DATA_TRANS_BLOCKS + needed;
 131 }
 132
 133 /*
 134  * Truncate transactions can be complex and absolutely huge.  So we need to
 135  * be able to restart the transaction at a conventient checkpoint to make
 136  * sure we don't overflow the journal.
 137  *
 138  * start_transaction gets us a new handle for a truncate transaction,
 139  * and extend_transaction tries to extend the existing one a bit.  If
 140  * extend fails, we need to propagate the failure up and restart the
 141  * transaction in the top-level truncate loop. --sct
 142  */
 143
 144 static handle_t *start_transaction(struct inode *inode)
 145 {
 146         handle_t *result;
 147
 148         result = ext3_journal_start(inode, blocks_for_truncate(inode));
 149         if (!IS_ERR(result))
 150                 return result;
 151
 152         ext3_std_error(inode->i_sb, PTR_ERR(result));
 153         return result;
 154 }
 155
 156 /*
 157  * Try to extend this transaction for the purposes of truncation.
 158  *
 159  * Returns 0 if we managed to create more room.  If we can't create more
 160  * room, and the transaction must be restarted we return 1.
 161  */
 162 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
 163 {
 164         if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
 165                 return 0;
 166         if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
 167                 return 0;
 168         return 1;
 169 }
 170
 171 /*
 172  * Restart the transaction associated with *handle.  This does a commit,
 173  * so before we call here everything must be consistently dirtied against
 174  * this transaction.
 175  */
 176 static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
 177 {
 178         jbd_debug(2, "restarting handle %p\n", handle);
 179         return ext3_journal_restart(handle, blocks_for_truncate(inode));
 180 }
 181
 182 static void ext3_truncate_nocheck (struct inode *inode);
 183
 184 /*
 185  * Called at the last iput() if i_nlink is zero.
 186  */
 187 void ext3_delete_inode (struct inode * inode)
 188 {
 189         handle_t *handle;
 190
 191         if (is_bad_inode(inode))
 192                 goto no_delete;
 193
 194         handle = start_transaction(inode);
 195         if (IS_ERR(handle)) {
 196                 /* If we're going to skip the normal cleanup, we still
 197                  * need to make sure that the in-core orphan linked list
 198                  * is properly cleaned up. */
 199                 ext3_orphan_del(NULL, inode);
 200                 goto no_delete;
 201         }
 202
 203         if (IS_SYNC(inode))
 204                 handle->h_sync = 1;
 205         inode->i_size = 0;
 206         if (inode->i_blocks)
 207                 ext3_truncate_nocheck(inode);
 208         /*
 209          * Kill off the orphan record which ext3_truncate created.
 210          * AKPM: I think this can be inside the above `if'.
 211          * Note that ext3_orphan_del() has to be able to cope with the
 212          * deletion of a non-existent orphan - this is because we don't
 213          * know if ext3_truncate() actually created an orphan record.
 214          * (Well, we could do this if we need to, but heck - it works)
 215          */
 216         ext3_orphan_del(handle, inode);
 217         EXT3_I(inode)->i_dtime  = get_seconds();
 218
 219         /*
 220          * One subtle ordering requirement: if anything has gone wrong
 221          * (transaction abort, IO errors, whatever), then we can still
 222          * do these next steps (the fs will already have been marked as
 223          * having errors), but we can't free the inode if the mark_dirty
 224          * fails.
 225          */
 226         if (ext3_mark_inode_dirty(handle, inode))
 227                 /* If that failed, just do the required in-core inode clear. */
 228                 clear_inode(inode);
 229         else
 230                 ext3_free_inode(handle, inode);
 231         ext3_journal_stop(handle);
 232         return;
 233 no_delete:
 234         clear_inode(inode);     /* We must guarantee clearing of inode... */
 235 }
 236
 237 static int ext3_alloc_block (handle_t *handle,
 238                         struct inode * inode, unsigned long goal, int *err)
 239 {
 240         unsigned long result;
 241
 242         result = ext3_new_block(handle, inode, goal, err);
 243         return result;
 244 }
 245
 246
/*
 * One link in the chain of block pointers leading from the inode to a
 * data block.  Filled in by add_chain() and checked by verify_chain().
 */
typedef struct {
        __le32  *p;             /* address the block number was read from */
        __le32  key;            /* value of *p when it was read (on-disk byte order) */
        struct buffer_head *bh; /* buffer holding *p; NULL when p points into the inode */
} Indirect;
 252
 253 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
 254 {
 255         p->key = *(p->p = v);
 256         p->bh = bh;
 257 }
 258
 259 static inline int verify_chain(Indirect *from, Indirect *to)
 260 {
 261         while (from <= to && from->key == *from->p)
 262                 from++;
 263         return (from > to);
 264 }
 265
 266 /**
 267  *      ext3_block_to_path - parse the block number into array of offsets
 268  *      @inode: inode in question (we are only interested in its superblock)
 269  *      @i_block: block number to be parsed
 270  *      @offsets: array to store the offsets in
 271  *      @boundary: set this non-zero if the referred-to block is likely to be
 272  *             followed (on disk) by an indirect block.
 273  *
 274  *      To store the locations of file's data ext3 uses a data structure common
 275  *      for UNIX filesystems - tree of pointers anchored in the inode, with
 276  *      data blocks at leaves and indirect blocks in intermediate nodes.
 277  *      This function translates the block number into path in that tree -
 278  *      return value is the path length and @offsets[n] is the offset of
 279  *      pointer to (n+1)th node in the nth one. If @block is out of range
 280  *      (negative or too large) warning is printed and zero returned.
 281  *
 282  *      Note: function doesn't find node addresses, so no IO is needed. All
 283  *      we need to know is the capacity of indirect blocks (taken from the
 284  *      inode->i_sb).
 285  */
 286
 287 /*
 288  * Portability note: the last comparison (check that we fit into triple
 289  * indirect block) is spelled differently, because otherwise on an
 290  * architecture with 32-bit longs and 8Kb pages we might get into trouble
 291  * if our filesystem had 8Kb blocks. We might use long long, but that would
 292  * kill us on x86. Oh, well, at least the sign propagation does not matter -
 293  * i_block would have to be negative in the very beginning, so we would not
 294  * get there at all.
 295  */
 296
 297 static int ext3_block_to_path(struct inode *inode,
 298                         long i_block, int offsets[4], int *boundary)
 299 {
 300         int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
 301         int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
 302         const long direct_blocks = EXT3_NDIR_BLOCKS,
 303                 indirect_blocks = ptrs,
 304                 double_blocks = (1 << (ptrs_bits * 2));
 305         int n = 0;
 306         int final = 0;
 307
 308         if (i_block < 0) {
 309                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
 310         } else if (i_block < direct_blocks) {
 311                 offsets[n++] = i_block;
 312                 final = direct_blocks;
 313         } else if ( (i_block -= direct_blocks) < indirect_blocks) {
 314                 offsets[n++] = EXT3_IND_BLOCK;
 315                 offsets[n++] = i_block;
 316                 final = ptrs;
 317         } else if ((i_block -= indirect_blocks) < double_blocks) {
 318                 offsets[n++] = EXT3_DIND_BLOCK;
 319                 offsets[n++] = i_block >> ptrs_bits;
 320                 offsets[n++] = i_block & (ptrs - 1);
 321                 final = ptrs;
 322         } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
 323                 offsets[n++] = EXT3_TIND_BLOCK;
 324                 offsets[n++] = i_block >> (ptrs_bits * 2);
 325                 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
 326                 offsets[n++] = i_block & (ptrs - 1);
 327                 final = ptrs;
 328         } else {
 329                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
 330         }
 331         if (boundary)
 332                 *boundary = (i_block & (ptrs - 1)) == (final - 1);
 333         return n;
 334 }
 335
 336 /**
 337  *      ext3_get_branch - read the chain of indirect blocks leading to data
 338  *      @inode: inode in question
 339  *      @depth: depth of the chain (1 - direct pointer, etc.)
 340  *      @offsets: offsets of pointers in inode/indirect blocks
 341  *      @chain: place to store the result
 342  *      @err: here we store the error value
 343  *
 344  *      Function fills the array of triples <key, p, bh> and returns %NULL
 345  *      if everything went OK or the pointer to the last filled triple
 346  *      (incomplete one) otherwise. Upon the return chain[i].key contains
 347  *      the number of (i+1)-th block in the chain (as it is stored in memory,
 348  *      i.e. little-endian 32-bit), chain[i].p contains the address of that
 349  *      number (it points into struct inode for i==0 and into the bh->b_data
 350  *      for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 351  *      block for i>0 and NULL for i==0. In other words, it holds the block
 352  *      numbers of the chain, addresses they were taken from (and where we can
 353  *      verify that chain did not change) and buffer_heads hosting these
 354  *      numbers.
 355  *
 356  *      Function stops when it stumbles upon zero pointer (absent block)
 357  *              (pointer to last triple returned, *@err == 0)
 358  *      or when it gets an IO error reading an indirect block
 359  *              (ditto, *@err == -EIO)
 360  *      or when it notices that chain had been changed while it was reading
 361  *              (ditto, *@err == -EAGAIN)
 362  *      or when it reads all @depth-1 indirect blocks successfully and finds
 363  *      the whole chain, all way to the data (returns %NULL, *err == 0).
 364  */
 365 static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
 366                                  Indirect chain[4], int *err)
 367 {
 368         struct super_block *sb = inode->i_sb;
 369         Indirect *p = chain;
 370         struct buffer_head *bh;
 371
 372         *err = 0;
 373         /* i_data is not going away, no lock needed */
 374         add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
 375         if (!p->key)
 376                 goto no_block;
 377         while (--depth) {
 378                 bh = sb_bread(sb, le32_to_cpu(p->key));
 379                 if (!bh)
 380                         goto failure;
 381                 /* Reader: pointers */
 382                 if (!verify_chain(chain, p))
 383                         goto changed;
 384                 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
 385                 /* Reader: end */
 386                 if (!p->key)
 387                         goto no_block;
 388         }
 389         return NULL;
 390
 391 changed:
 392         brelse(bh);
 393         *err = -EAGAIN;
 394         goto no_block;
 395 failure:
 396         *err = -EIO;
 397 no_block:
 398         return p;
 399 }
 400
 401 /**
 402  *      ext3_find_near - find a place for allocation with sufficient locality
 403  *      @inode: owner
 404  *      @ind: descriptor of indirect block.
 405  *
 406  *      This function returns the prefered place for block allocation.
 407  *      It is used when heuristic for sequential allocation fails.
 408  *      Rules are:
 409  *        + if there is a block to the left of our position - allocate near it.
 410  *        + if pointer will live in indirect block - allocate near that block.
 411  *        + if pointer will live in inode - allocate in the same
 412  *          cylinder group.
 413  *
 414  * In the latter case we colour the starting block by the callers PID to
 415  * prevent it from clashing with concurrent allocations for a different inode
 416  * in the same block group.   The PID is used here so that functionally related
 417  * files will be close-by on-disk.
 418  *
 419  *      Caller must make sure that @ind is valid and will stay that way.
 420  */
 421
 422 static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
 423 {
 424         struct ext3_inode_info *ei = EXT3_I(inode);
 425         __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
 426         __le32 *p;
 427         unsigned long bg_start;
 428         unsigned long colour;
 429
 430         /* Try to find previous block */
 431         for (p = ind->p - 1; p >= start; p--)
 432                 if (*p)
 433                         return le32_to_cpu(*p);
 434
 435         /* No such thing, so let's try location of indirect block */
 436         if (ind->bh)
 437                 return ind->bh->b_blocknr;
 438
 439         /*
 440          * It is going to be refered from inode itself? OK, just put it into
 441          * the same cylinder group then.
 442          */
 443         bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
 444                 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
 445         colour = (current->pid % 16) *
 446                         (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
 447         return bg_start + colour;
 448 }
 449
 450 /**
 451  *      ext3_find_goal - find a prefered place for allocation.
 452  *      @inode: owner
 453  *      @block:  block we want
 454  *      @chain:  chain of indirect blocks
 455  *      @partial: pointer to the last triple within a chain
 456  *      @goal:  place to store the result.
 457  *
 458  *      Normally this function find the prefered place for block allocation,
 459  *      stores it in *@goal and returns zero. If the branch had been changed
 460  *      under us we return -EAGAIN.
 461  */
 462
 463 static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
 464                           Indirect *partial, unsigned long *goal)
 465 {
 466         struct ext3_inode_info *ei = EXT3_I(inode);
 467         /* Writer: ->i_next_alloc* */
 468         if (block == ei->i_next_alloc_block + 1) {
 469                 ei->i_next_alloc_block++;
 470                 ei->i_next_alloc_goal++;
 471         }
 472         /* Writer: end */
 473         /* Reader: pointers, ->i_next_alloc* */
 474         if (verify_chain(chain, partial)) {
 475                 /*
 476                  * try the heuristic for sequential allocation,
 477                  * failing that at least try to get decent locality.
 478                  */
 479                 if (block == ei->i_next_alloc_block)
 480                         *goal = ei->i_next_alloc_goal;
 481                 if (!*goal)
 482                         *goal = ext3_find_near(inode, partial);
 483                 return 0;
 484         }
 485         /* Reader: end */
 486         return -EAGAIN;
 487 }
 488
 489 /**
 490  *      ext3_alloc_branch - allocate and set up a chain of blocks.
 491  *      @inode: owner
 492  *      @num: depth of the chain (number of blocks to allocate)
 493  *      @offsets: offsets (in the blocks) to store the pointers to next.
 494  *      @branch: place to store the chain in.
 495  *
 496  *      This function allocates @num blocks, zeroes out all but the last one,
 497  *      links them into chain and (if we are synchronous) writes them to disk.
 498  *      In other words, it prepares a branch that can be spliced onto the
 499  *      inode. It stores the information about that chain in the branch[], in
 500  *      the same format as ext3_get_branch() would do. We are calling it after
 501  *      we had read the existing part of chain and partial points to the last
 502  *      triple of that (one with zero ->key). Upon the exit we have the same
 503  *      picture as after the successful ext3_get_block(), excpet that in one
 504  *      place chain is disconnected - *branch->p is still zero (we did not
 505  *      set the last link), but branch->key contains the number that should
 506  *      be placed into *branch->p to fill that gap.
 507  *
 508  *      If allocation fails we free all blocks we've allocated (and forget
 509  *      their buffer_heads) and return the error value the from failed
 510  *      ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
 511  *      as described above and return 0.
 512  */
 513
 514 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
 515                              int num,
 516                              unsigned long goal,
 517                              int *offsets,
 518                              Indirect *branch)
 519 {
 520         int blocksize = inode->i_sb->s_blocksize;
 521         int n = 0, keys = 0;
 522         int err = 0;
 523         int i;
 524         int parent = ext3_alloc_block(handle, inode, goal, &err);
 525
 526         branch[0].key = cpu_to_le32(parent);
 527         if (parent) {
 528                 for (n = 1; n < num; n++) {
 529                         struct buffer_head *bh;
 530                         /* Allocate the next block */
 531                         int nr = ext3_alloc_block(handle, inode, parent, &err);
 532                         if (!nr)
 533                                 break;
 534                         branch[n].key = cpu_to_le32(nr);
 535                         keys = n+1;
 536
 537                         /*
 538                          * Get buffer_head for parent block, zero it out
 539                          * and set the pointer to new one, then send
 540                          * parent to disk.
 541                          */
 542                         bh = sb_getblk(inode->i_sb, parent);
 543                         branch[n].bh = bh;
 544                         lock_buffer(bh);
 545                         BUFFER_TRACE(bh, "call get_create_access");
 546                         err = ext3_journal_get_create_access(handle, bh);
 547                         if (err) {
 548                                 unlock_buffer(bh);
 549                                 brelse(bh);
 550                                 break;
 551                         }
 552
 553                         memset(bh->b_data, 0, blocksize);
 554                         branch[n].p = (__le32*) bh->b_data + offsets[n];
 555                         *branch[n].p = branch[n].key;
 556                         BUFFER_TRACE(bh, "marking uptodate");
 557                         set_buffer_uptodate(bh);
 558                         unlock_buffer(bh);
 559
 560                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
 561                         err = ext3_journal_dirty_metadata(handle, bh);
 562                         if (err)
 563                                 break;
 564
 565                         parent = nr;
 566                 }
 567         }
 568         if (n == num)
 569                 return 0;
 570
 571         /* Allocation failed, free what we already allocated */
 572         for (i = 1; i < keys; i++) {
 573                 BUFFER_TRACE(branch[i].bh, "call journal_forget");
 574                 ext3_journal_forget(handle, branch[i].bh);
 575         }
 576         for (i = 0; i < keys; i++)
 577                 ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
 578         return err;
 579 }
 580
 581 /**
 582  *      ext3_splice_branch - splice the allocated branch onto inode.
 583  *      @inode: owner
 584  *      @block: (logical) number of block we are adding
 585  *      @chain: chain of indirect blocks (with a missing link - see
 586  *              ext3_alloc_branch)
 587  *      @where: location of missing link
 588  *      @num:   number of blocks we are adding
 589  *
 590  *      This function verifies that chain (up to the missing link) had not
 591  *      changed, fills the missing link and does all housekeeping needed in
 592  *      inode (->i_blocks, etc.). In case of success we end up with the full
 593  *      chain to new block and return 0. Otherwise (== chain had been changed)
 594  *      we free the new blocks (forgetting their buffer_heads, indeed) and
 595  *      return -EAGAIN.
 596  */
 597
 598 static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
 599                               Indirect chain[4], Indirect *where, int num)
 600 {
 601         int i;
 602         int err = 0;
 603         struct ext3_inode_info *ei = EXT3_I(inode);
 604
 605         /*
 606          * If we're splicing into a [td]indirect block (as opposed to the
 607          * inode) then we need to get write access to the [td]indirect block
 608          * before the splice.
 609          */
 610         if (where->bh) {
 611                 BUFFER_TRACE(where->bh, "get_write_access");
 612                 err = ext3_journal_get_write_access(handle, where->bh);
 613                 if (err)
 614                         goto err_out;
 615         }
 616         /* Verify that place we are splicing to is still there and vacant */
 617
 618         /* Writer: pointers, ->i_next_alloc* */
 619         if (!verify_chain(chain, where-1) || *where->p)
 620                 /* Writer: end */
 621                 goto changed;
 622
 623         /* That's it */
 624
 625         *where->p = where->key;
 626         ei->i_next_alloc_block = block;
 627         ei->i_next_alloc_goal = le32_to_cpu(where[num-1].key);
 628         /* Writer: end */
 629
 630         /* We are done with atomic stuff, now do the rest of housekeeping */
 631
 632         inode->i_ctime = CURRENT_TIME;
 633         ext3_mark_inode_dirty(handle, inode);
 634
 635         /* had we spliced it onto indirect block? */
 636         if (where->bh) {
 637                 /*
 638                  * akpm: If we spliced it onto an indirect block, we haven't
 639                  * altered the inode.  Note however that if it is being spliced
 640                  * onto an indirect block at the very end of the file (the
 641                  * file is growing) then we *will* alter the inode to reflect
 642                  * the new i_size.  But that is not done here - it is done in
 643                  * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
 644                  */
 645                 jbd_debug(5, "splicing indirect only\n");
 646                 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
 647                 err = ext3_journal_dirty_metadata(handle, where->bh);
 648                 if (err)
 649                         goto err_out;
 650         } else {
 651                 /*
 652                  * OK, we spliced it into the inode itself on a direct block.
 653                  * Inode was dirtied above.
 654                  */
 655                 jbd_debug(5, "splicing direct\n");
 656         }
 657         return err;
 658
 659 changed:
 660         /*
 661          * AKPM: if where[i].bh isn't part of the current updating
 662          * transaction then we explode nastily.  Test this code path.
 663          */
 664         jbd_debug(1, "the chain changed: try again\n");
 665         err = -EAGAIN;
 666
 667 err_out:
 668         for (i = 1; i < num; i++) {
 669                 BUFFER_TRACE(where[i].bh, "call journal_forget");
 670                 ext3_journal_forget(handle, where[i].bh);
 671         }
 672         /* For the normal collision cleanup case, we free up the blocks.
 673          * On genuine filesystem errors we don't even think about doing
 674          * that. */
 675         if (err == -EAGAIN)
 676                 for (i = 0; i < num; i++)
 677                         ext3_free_blocks(handle, inode,
 678                                          le32_to_cpu(where[i].key), 1);
 679         return err;
 680 }
 681
 682 /*
 683  * Allocation strategy is simple: if we have to allocate something, we will
 684  * have to go the whole way to leaf. So let's do it before attaching anything
 685  * to tree, set linkage between the newborn blocks, write them if sync is
 686  * required, recheck the path, free and repeat if check fails, otherwise
 687  * set the last missing link (that will protect us from any truncate-generated
 688  * removals - all blocks on the path are immune now) and possibly force the
 689  * write on the parent block.
 690  * That has a nice additional property: no special recovery from the failed
 691  * allocations is needed - we simply release blocks and do not touch anything
 692  * reachable from inode.
 693  *
 694  * akpm: `handle' can be NULL if create == 0.
 695  *
 696  * The BKL may not be held on entry here.  Be sure to take it early.
 697  */
 698
 699 static int
 700 ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
 701                 struct buffer_head *bh_result, int create, int extend_disksize)
 702 {
 703         int err = -EIO;
 704         int offsets[4];
 705         Indirect chain[4];
 706         Indirect *partial;
 707         unsigned long goal;
 708         int left;
 709         int boundary = 0;
 710         int depth = ext3_block_to_path(inode, iblock, offsets, &boundary);
 711         struct ext3_inode_info *ei = EXT3_I(inode);
 712
 713         J_ASSERT(handle != NULL || create == 0);
 714
 715         if (depth == 0)
 716                 goto out;
 717
 718 reread:
 719         partial = ext3_get_branch(inode, depth, offsets, chain, &err);
 720
 721         /* Simplest case - block found, no allocation needed */
 722         if (!partial) {
 723                 clear_buffer_new(bh_result);
 724 got_it:
 725                 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
 726                 if (boundary)
 727                         set_buffer_boundary(bh_result);
 728                 /* Clean up and exit */
 729                 partial = chain+depth-1; /* the whole chain */
 730                 goto cleanup;
 731         }
 732
 733         /* Next simple case - plain lookup or failed read of indirect block */
 734         if (!create || err == -EIO) {
 735 cleanup:
 736                 while (partial > chain) {
 737                         BUFFER_TRACE(partial->bh, "call brelse");
 738                         brelse(partial->bh);
 739                         partial--;
 740                 }
 741                 BUFFER_TRACE(bh_result, "returned");
 742 out:
 743                 return err;
 744         }
 745
 746         /*
 747          * Indirect block might be removed by truncate while we were
 748          * reading it. Handling of that case (forget what we've got and
 749          * reread) is taken out of the main path.
 750          */
 751         if (err == -EAGAIN)
 752                 goto changed;
 753
 754         goal = 0;
 755         down(&ei->truncate_sem);
 756         if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0) {
 757                 up(&ei->truncate_sem);
 758                 goto changed;
 759         }
 760
 761         left = (chain + depth) - partial;
 762
 763         /*
 764          * Block out ext3_truncate while we alter the tree
 765          */
 766         err = ext3_alloc_branch(handle, inode, left, goal,
 767                                         offsets+(partial-chain), partial);
 768
 769         /* The ext3_splice_branch call will free and forget any buffers
 770          * on the new chain if there is a failure, but that risks using
 771          * up transaction credits, especially for bitmaps where the
 772          * credits cannot be returned.  Can we handle this somehow?  We
 773          * may need to return -EAGAIN upwards in the worst case.  --sct */
 774         if (!err)
 775                 err = ext3_splice_branch(handle, inode, iblock, chain,
 776                                          partial, left);
 777         /* i_disksize growing is protected by truncate_sem
 778          * don't forget to protect it if you're about to implement
 779          * concurrent ext3_get_block() -bzzz */
 780         if (!err && extend_disksize && inode->i_size > ei->i_disksize)
 781                 ei->i_disksize = inode->i_size;
 782         up(&ei->truncate_sem);
 783         if (err == -EAGAIN)
 784                 goto changed;
 785         if (err)
 786                 goto cleanup;
 787
 788         set_buffer_new(bh_result);
 789         goto got_it;
 790
 791 changed:
 792         while (partial > chain) {
 793                 jbd_debug(1, "buffer chain changed, retrying\n");
 794                 BUFFER_TRACE(partial->bh, "brelsing");
 795                 brelse(partial->bh);
 796                 partial--;
 797         }
 798         goto reread;
 799 }
 800
 801 static int ext3_get_block(struct inode *inode, sector_t iblock,
 802                         struct buffer_head *bh_result, int create)
 803 {
 804         handle_t *handle = NULL;
 805         int ret;
 806
 807         if (create) {
 808                 handle = ext3_journal_current_handle();
 809                 J_ASSERT(handle != 0);
 810         }
 811         ret = ext3_get_block_handle(handle, inode, iblock,
 812                                 bh_result, create, 1);
 813         return ret;
 814 }
 815
 816 #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
 817
 818 static int
 819 ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
 820                 unsigned long max_blocks, struct buffer_head *bh_result,
 821                 int create)
 822 {
 823         handle_t *handle = journal_current_handle();
 824         int ret = 0;
 825
 826         if (!handle)
 827                 goto get_block;         /* A read */
 828
 829         if (handle->h_transaction->t_state == T_LOCKED) {
 830                 /*
 831                  * Huge direct-io writes can hold off commits for long
 832                  * periods of time.  Let this commit run.
 833                  */
 834                 ext3_journal_stop(handle);
 835                 handle = ext3_journal_start(inode, DIO_CREDITS);
 836                 if (IS_ERR(handle))
 837                         ret = PTR_ERR(handle);
 838                 goto get_block;
 839         }
 840
 841         if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
 842                 /*
 843                  * Getting low on buffer credits...
 844                  */
 845                 ret = ext3_journal_extend(handle, DIO_CREDITS);
 846                 if (ret > 0) {
 847                         /*
 848                          * Couldn't extend the transaction.  Start a new one.
 849                          */
 850                         ret = ext3_journal_restart(handle, DIO_CREDITS);
 851                 }
 852         }
 853
 854 get_block:
 855         if (ret == 0)
 856                 ret = ext3_get_block_handle(handle, inode, iblock,
 857                                         bh_result, create, 0);
 858         bh_result->b_size = (1 << inode->i_blkbits);
 859         return ret;
 860 }
 861
 862 /*
 863  * `handle' can be NULL if create is zero
 864  */
 865 struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
 866                                 long block, int create, int * errp)
 867 {
 868         struct buffer_head dummy;
 869         int fatal = 0, err;
 870
 871         J_ASSERT(handle != NULL || create == 0);
 872
 873         dummy.b_state = 0;
 874         dummy.b_blocknr = -1000;
 875         buffer_trace_init(&dummy.b_history);
 876         *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
 877         if (!*errp && buffer_mapped(&dummy)) {
 878                 struct buffer_head *bh;
 879                 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
 880                 if (buffer_new(&dummy)) {
 881                         J_ASSERT(create != 0);
 882                         J_ASSERT(handle != 0);
 883
 884                         /* Now that we do not always journal data, we
 885                            should keep in mind whether this should
 886                            always journal the new buffer as metadata.
 887                            For now, regular file writes use
 888                            ext3_get_block instead, so it's not a
 889                            problem. */
 890                         lock_buffer(bh);
 891                         BUFFER_TRACE(bh, "call get_create_access");
 892                         fatal = ext3_journal_get_create_access(handle, bh);
 893                         if (!fatal && !buffer_uptodate(bh)) {
 894                                 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
 895                                 set_buffer_uptodate(bh);
 896                         }
 897                         unlock_buffer(bh);
 898                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
 899                         err = ext3_journal_dirty_metadata(handle, bh);
 900                         if (!fatal)
 901                                 fatal = err;
 902                 } else {
 903                         BUFFER_TRACE(bh, "not a new buffer");
 904                 }
 905                 if (fatal) {
 906                         *errp = fatal;
 907                         brelse(bh);
 908                         bh = NULL;
 909                 }
 910                 return bh;
 911         }
 912         return NULL;
 913 }
 914
 915 struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
 916                                int block, int create, int *err)
 917 {
 918         struct buffer_head * bh;
 919         int prev_blocks;
 920
 921         prev_blocks = inode->i_blocks;
 922
 923         bh = ext3_getblk (handle, inode, block, create, err);
 924         if (!bh)
 925                 return bh;
 926         if (buffer_uptodate(bh))
 927                 return bh;
 928         ll_rw_block (READ, 1, &bh);
 929         wait_on_buffer (bh);
 930         if (buffer_uptodate(bh))
 931                 return bh;
 932         brelse (bh);
 933         *err = -EIO;
 934         return NULL;
 935 }
 936
 937 static int walk_page_buffers(   handle_t *handle,
 938                                 struct buffer_head *head,
 939                                 unsigned from,
 940                                 unsigned to,
 941                                 int *partial,
 942                                 int (*fn)(      handle_t *handle,
 943                                                 struct buffer_head *bh))
 944 {
 945         struct buffer_head *bh;
 946         unsigned block_start, block_end;
 947         unsigned blocksize = head->b_size;
 948         int err, ret = 0;
 949         struct buffer_head *next;
 950
 951         for (   bh = head, block_start = 0;
 952                 ret == 0 && (bh != head || !block_start);
 953                 block_start = block_end, bh = next)
 954         {
 955                 next = bh->b_this_page;
 956                 block_end = block_start + blocksize;
 957                 if (block_end <= from || block_start >= to) {
 958                         if (partial && !buffer_uptodate(bh))
 959                                 *partial = 1;
 960                         continue;
 961                 }
 962                 err = (*fn)(handle, bh);
 963                 if (!ret)
 964                         ret = err;
 965         }
 966         return ret;
 967 }
 968
 969 /*
 970  * To preserve ordering, it is essential that the hole instantiation and
 971  * the data write be encapsulated in a single transaction.  We cannot
 972  * close off a transaction and start a new one between the ext3_get_block()
 973  * and the commit_write().  So doing the journal_start at the start of
 974  * prepare_write() is the right place.
 975  *
 976  * Also, this function can nest inside ext3_writepage() ->
 977  * block_write_full_page(). In that case, we *know* that ext3_writepage()
 978  * has generated enough buffer credits to do the whole page.  So we won't
 979  * block on the journal in that case, which is good, because the caller may
 980  * be PF_MEMALLOC.
 981  *
 982  * By accident, ext3 can be reentered when a transaction is open via
 983  * quota file writes.  If we were to commit the transaction while thus
 984  * reentered, there can be a deadlock - we would be holding a quota
 985  * lock, and the commit would never complete if another thread had a
 986  * transaction open and was blocking on the quota lock - a ranking
 987  * violation.
 988  *
 989  * So what we do is to rely on the fact that journal_stop/journal_start
 990  * will _not_ run commit under these circumstances because handle->h_ref
 991  * is elevated.  We'll still have enough credits for the tiny quotafile
 992  * write.
 993  */
 994
 995 static int do_journal_get_write_access(handle_t *handle,
 996                                        struct buffer_head *bh)
 997 {
 998         if (!buffer_mapped(bh) || buffer_freed(bh))
 999                 return 0;
1000         return ext3_journal_get_write_access(handle, bh);
1001 }
1002
1003 static int ext3_prepare_write(struct file *file, struct page *page,
1004                               unsigned from, unsigned to)
1005 {
1006         struct inode *inode = page->mapping->host;
1007         int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
1008         handle_t *handle;
1009         int retries = 0;
1010
1011 retry:
1012         handle = ext3_journal_start(inode, needed_blocks);
1013         if (IS_ERR(handle)) {
1014                 ret = PTR_ERR(handle);
1015                 goto out;
1016         }
1017         ret = block_prepare_write(page, from, to, ext3_get_block);
1018         if (ret)
1019                 goto prepare_write_failed;
1020
1021         if (ext3_should_journal_data(inode)) {
1022                 ret = walk_page_buffers(handle, page_buffers(page),
1023                                 from, to, NULL, do_journal_get_write_access);
1024         }
1025 prepare_write_failed:
1026         if (ret)
1027                 ext3_journal_stop(handle);
1028         if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1029                 goto retry;
1030 out:
1031         return ret;
1032 }
1033
1034 static int
1035 ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1036 {
1037         int err = journal_dirty_data(handle, bh);
1038         if (err)
1039                 ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1040                                                 bh, handle,err);
1041         return err;
1042 }
1043
1044 /* For commit_write() in data=journal mode */
1045 static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1046 {
1047         if (!buffer_mapped(bh) || buffer_freed(bh))
1048                 return 0;
1049         set_buffer_uptodate(bh);
1050         return ext3_journal_dirty_metadata(handle, bh);
1051 }
1052
1053 /*
1054  * We need to pick up the new inode size which generic_commit_write gave us
1055  * `file' can be NULL - eg, when called from page_symlink().
1056  *
1057  * ext3 never places buffers on inode->i_mapping->private_list.  metadata
1058  * buffers are managed internally.
1059  */
1060
1061 static int ext3_ordered_commit_write(struct file *file, struct page *page,
1062                              unsigned from, unsigned to)
1063 {
1064         handle_t *handle = ext3_journal_current_handle();
1065         struct inode *inode = page->mapping->host;
1066         int ret = 0, ret2;
1067
1068         ret = walk_page_buffers(handle, page_buffers(page),
1069                 from, to, NULL, ext3_journal_dirty_data);
1070
1071         if (ret == 0) {
1072                 /*
1073                  * generic_commit_write() will run mark_inode_dirty() if i_size
1074                  * changes.  So let's piggyback the i_disksize mark_inode_dirty
1075                  * into that.
1076                  */
1077                 loff_t new_i_size;
1078
1079                 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1080                 if (new_i_size > EXT3_I(inode)->i_disksize)
1081                         EXT3_I(inode)->i_disksize = new_i_size;
1082                 ret = generic_commit_write(file, page, from, to);
1083         }
1084         ret2 = ext3_journal_stop(handle);
1085         if (!ret)
1086                 ret = ret2;
1087         return ret;
1088 }
1089
1090 static int ext3_writeback_commit_write(struct file *file, struct page *page,
1091                              unsigned from, unsigned to)
1092 {
1093         handle_t *handle = ext3_journal_current_handle();
1094         struct inode *inode = page->mapping->host;
1095         int ret = 0, ret2;
1096         loff_t new_i_size;
1097
1098         new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1099         if (new_i_size > EXT3_I(inode)->i_disksize)
1100                 EXT3_I(inode)->i_disksize = new_i_size;
1101         ret = generic_commit_write(file, page, from, to);
1102         ret2 = ext3_journal_stop(handle);
1103         if (!ret)
1104                 ret = ret2;
1105         return ret;
1106 }
1107
1108 static int ext3_journalled_commit_write(struct file *file,
1109                         struct page *page, unsigned from, unsigned to)
1110 {
1111         handle_t *handle = ext3_journal_current_handle();
1112         struct inode *inode = page->mapping->host;
1113         int ret = 0, ret2;
1114         int partial = 0;
1115         loff_t pos;
1116
1117         /*
1118          * Here we duplicate the generic_commit_write() functionality
1119          */
1120         pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1121
1122         ret = walk_page_buffers(handle, page_buffers(page), from,
1123                                 to, &partial, commit_write_fn);
1124         if (!partial)
1125                 SetPageUptodate(page);
1126         if (pos > inode->i_size)
1127                 i_size_write(inode, pos);
1128         EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1129         if (inode->i_size > EXT3_I(inode)->i_disksize) {
1130                 EXT3_I(inode)->i_disksize = inode->i_size;
1131                 ret2 = ext3_mark_inode_dirty(handle, inode);
1132                 if (!ret)
1133                         ret = ret2;
1134         }
1135         ret2 = ext3_journal_stop(handle);
1136         if (!ret)
1137                 ret = ret2;
1138         return ret;
1139 }
1140
1141 /*
1142  * bmap() is special.  It gets used by applications such as lilo and by
1143  * the swapper to find the on-disk block of a specific piece of data.
1144  *
1145  * Naturally, this is dangerous if the block concerned is still in the
1146  * journal.  If somebody makes a swapfile on an ext3 data-journaling
1147  * filesystem and enables swap, then they may get a nasty shock when the
1148  * data getting swapped to that swapfile suddenly gets overwritten by
1149  * the original zero's written out previously to the journal and
1150  * awaiting writeback in the kernel's buffer cache.
1151  *
1152  * So, if we see any bmap calls here on a modified, data-journaled file,
1153  * take extra steps to flush any blocks which might be in the cache.
1154  */
1155 static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1156 {
1157         struct inode *inode = mapping->host;
1158         journal_t *journal;
1159         int err;
1160
1161         if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
1162                 /*
1163                  * This is a REALLY heavyweight approach, but the use of
1164                  * bmap on dirty files is expected to be extremely rare:
1165                  * only if we run lilo or swapon on a freshly made file
1166                  * do we expect this to happen.
1167                  *
1168                  * (bmap requires CAP_SYS_RAWIO so this does not
1169                  * represent an unprivileged user DOS attack --- we'd be
1170                  * in trouble if mortal users could trigger this path at
1171                  * will.)
1172                  *
1173                  * NB. EXT3_STATE_JDATA is not set on files other than
1174                  * regular files.  If somebody wants to bmap a directory
1175                  * or symlink and gets confused because the buffer
1176                  * hasn't yet been flushed to disk, they deserve
1177                  * everything they get.
1178                  */
1179
1180                 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
1181                 journal = EXT3_JOURNAL(inode);
1182                 journal_lock_updates(journal);
1183                 err = journal_flush(journal);
1184                 journal_unlock_updates(journal);
1185
1186                 if (err)
1187                         return 0;
1188         }
1189
1190         return generic_block_bmap(mapping,block,ext3_get_block);
1191 }
1192
1193 static int bget_one(handle_t *handle, struct buffer_head *bh)
1194 {
1195         get_bh(bh);
1196         return 0;
1197 }
1198
1199 static int bput_one(handle_t *handle, struct buffer_head *bh)
1200 {
1201         put_bh(bh);
1202         return 0;
1203 }
1204
1205 static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1206 {
1207         if (buffer_mapped(bh))
1208                 return ext3_journal_dirty_data(handle, bh);
1209         return 0;
1210 }
1211
1212 /*
1213  * Note that we always start a transaction even if we're not journalling
1214  * data.  This is to preserve ordering: any hole instantiation within
1215  * __block_write_full_page -> ext3_get_block() should be journalled
1216  * along with the data so we don't crash and then get metadata which
1217  * refers to old data.
1218  *
1219  * In all journalling modes block_write_full_page() will start the I/O.
1220  *
1221  * Problem:
1222  *
1223  *      ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1224  *              ext3_writepage()
1225  *
1226  * Similar for:
1227  *
1228  *      ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1229  *
1230  * Same applies to ext3_get_block().  We will deadlock on various things like
1231  * lock_journal and i_truncate_sem.
1232  *
1233  * Setting PF_MEMALLOC here doesn't work - too many internal memory
1234  * allocations fail.
1235  *
1236  * 16May01: If we're reentered then journal_current_handle() will be
1237  *          non-zero. We simply *return*.
1238  *
1239  * 1 July 2001: @@@ FIXME:
1240  *   In journalled data mode, a data buffer may be metadata against the
1241  *   current transaction.  But the same file is part of a shared mapping
1242  *   and someone does a writepage() on it.
1243  *
1244  *   We will move the buffer onto the async_data list, but *after* it has
1245  *   been dirtied. So there's a small window where we have dirty data on
1246  *   BJ_Metadata.
1247  *
1248  *   Note that this only applies to the last partial page in the file.  The
1249  *   bit which block_write_full_page() uses prepare/commit for.  (That's
1250  *   broken code anyway: it's wrong for msync()).
1251  *
1252  *   It's a rare case: affects the final partial page, for journalled data
1253  *   where the file is subject to both write() and writepage() in the same
1254  *   transaction.  To fix it we'll need a custom block_write_full_page().
1255  *   We'll probably need that anyway for journalling writepage() output.
1256  *
1257  * We don't honour synchronous mounts for writepage().  That would be
1258  * disastrous.  Any write() or metadata operation will sync the fs for
1259  * us.
1260  *
1261  * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1262  * we don't need to open a transaction here.
1263  */
1264 static int ext3_ordered_writepage(struct page *page,
1265                         struct writeback_control *wbc)
1266 {
1267         struct inode *inode = page->mapping->host;
1268         struct buffer_head *page_bufs;
1269         handle_t *handle = NULL;
1270         int ret = 0;
1271         int err;
1272
1273         J_ASSERT(PageLocked(page));
1274
1275         /*
1276          * We give up here if we're reentered, because it might be for a
1277          * different filesystem.
1278          */
1279         if (ext3_journal_current_handle())
1280                 goto out_fail;
1281
1282         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1283
1284         if (IS_ERR(handle)) {
1285                 ret = PTR_ERR(handle);
1286                 goto out_fail;
1287         }
1288
1289         if (!page_has_buffers(page)) {
1290                 create_empty_buffers(page, inode->i_sb->s_blocksize,
1291                                 (1 << BH_Dirty)|(1 << BH_Uptodate));
1292         }
1293         page_bufs = page_buffers(page);
1294         walk_page_buffers(handle, page_bufs, 0,
1295                         PAGE_CACHE_SIZE, NULL, bget_one);
1296
1297         ret = block_write_full_page(page, ext3_get_block, wbc);
1298
1299         /*
1300          * The page can become unlocked at any point now, and
1301          * truncate can then come in and change things.  So we
1302          * can't touch *page from now on.  But *page_bufs is
1303          * safe due to elevated refcount.
1304          */
1305
1306         /*
1307          * And attach them to the current transaction.  But only if
1308          * block_write_full_page() succeeded.  Otherwise they are unmapped,
1309          * and generally junk.
1310          */
1311         if (ret == 0) {
1312                 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1313                                         NULL, journal_dirty_data_fn);
1314                 if (!ret)
1315                         ret = err;
1316         }
1317         walk_page_buffers(handle, page_bufs, 0,
1318                         PAGE_CACHE_SIZE, NULL, bput_one);
1319         err = ext3_journal_stop(handle);
1320         if (!ret)
1321                 ret = err;
1322         return ret;
1323
1324 out_fail:
1325         redirty_page_for_writepage(wbc, page);
1326         unlock_page(page);
1327         return ret;
1328 }
1329
1330 static int ext3_writeback_writepage(struct page *page,
1331                                 struct writeback_control *wbc)
1332 {
1333         struct inode *inode = page->mapping->host;
1334         handle_t *handle = NULL;
1335         int ret = 0;
1336         int err;
1337
1338         if (ext3_journal_current_handle())
1339                 goto out_fail;
1340
1341         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1342         if (IS_ERR(handle)) {
1343                 ret = PTR_ERR(handle);
1344                 goto out_fail;
1345         }
1346
1347         ret = block_write_full_page(page, ext3_get_block, wbc);
1348         err = ext3_journal_stop(handle);
1349         if (!ret)
1350                 ret = err;
1351         return ret;
1352
1353 out_fail:
1354         redirty_page_for_writepage(wbc, page);
1355         unlock_page(page);
1356         return ret;
1357 }
1358
1359 static int ext3_journalled_writepage(struct page *page,
1360                                 struct writeback_control *wbc)
1361 {
1362         struct inode *inode = page->mapping->host;
1363         handle_t *handle = NULL;
1364         int ret = 0;
1365         int err;
1366
1367         if (ext3_journal_current_handle())
1368                 goto no_write;
1369
1370         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1371         if (IS_ERR(handle)) {
1372                 ret = PTR_ERR(handle);
1373                 goto no_write;
1374         }
1375
1376         if (!page_has_buffers(page) || PageChecked(page)) {
1377                 /*
1378                  * It's mmapped pagecache.  Add buffers and journal it.  There
1379                  * doesn't seem much point in redirtying the page here.
1380                  */
1381                 ClearPageChecked(page);
1382                 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1383                                         ext3_get_block);
1384                 if (ret != 0)
1385                         goto out_unlock;
1386                 ret = walk_page_buffers(handle, page_buffers(page), 0,
1387                         PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1388
1389                 err = walk_page_buffers(handle, page_buffers(page), 0,
1390                                 PAGE_CACHE_SIZE, NULL, commit_write_fn);
1391                 if (ret == 0)
1392                         ret = err;
1393                 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1394                 unlock_page(page);
1395         } else {
1396                 /*
1397                  * It may be a page full of checkpoint-mode buffers.  We don't
1398                  * really know unless we go poke around in the buffer_heads.
1399                  * But block_write_full_page will do the right thing.
1400                  */
1401                 ret = block_write_full_page(page, ext3_get_block, wbc);
1402         }
1403         err = ext3_journal_stop(handle);
1404         if (!ret)
1405                 ret = err;
1406 out:
1407         return ret;
1408
1409 no_write:
1410         redirty_page_for_writepage(wbc, page);
1411 out_unlock:
1412         unlock_page(page);
1413         goto out;
1414 }
1415
1416 static int ext3_readpage(struct file *file, struct page *page)
1417 {
1418         return mpage_readpage(page, ext3_get_block);
1419 }
1420
1421 static int
1422 ext3_readpages(struct file *file, struct address_space *mapping,
1423                 struct list_head *pages, unsigned nr_pages)
1424 {
1425         return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1426 }
1427
1428 static int ext3_invalidatepage(struct page *page, unsigned long offset)
1429 {
1430         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1431
1432         /*
1433          * If it's a full truncate we just forget about the pending dirtying
1434          */
1435         if (offset == 0)
1436                 ClearPageChecked(page);
1437
1438         return journal_invalidatepage(journal, page, offset);
1439 }
1440
1441 static int ext3_releasepage(struct page *page, int wait)
1442 {
1443         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1444
1445         WARN_ON(PageChecked(page));
1446         return journal_try_to_free_buffers(journal, page, wait);
1447 }
1448
1449 /*
1450  * If the O_DIRECT write will extend the file then add this inode to the
1451  * orphan list.  So recovery will truncate it back to the original size
1452  * if the machine crashes during the write.
1453  *
1454  * If the O_DIRECT write is intantiating holes inside i_size and the machine
1455  * crashes then stale disk data _may_ be exposed inside the file.
1456  */
1457 static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1458                         const struct iovec *iov, loff_t offset,
1459                         unsigned long nr_segs)
1460 {
1461         struct file *file = iocb->ki_filp;
1462         struct inode *inode = file->f_mapping->host;
1463         struct ext3_inode_info *ei = EXT3_I(inode);
1464         handle_t *handle = NULL;
1465         ssize_t ret;
1466         int orphan = 0;
1467         size_t count = iov_length(iov, nr_segs);
1468
1469         if (rw == WRITE) {
1470                 loff_t final_size = offset + count;
1471
1472                 handle = ext3_journal_start(inode, DIO_CREDITS);
1473                 if (IS_ERR(handle)) {
1474                         ret = PTR_ERR(handle);
1475                         goto out;
1476                 }
1477                 if (final_size > inode->i_size) {
1478                         ret = ext3_orphan_add(handle, inode);
1479                         if (ret)
1480                                 goto out_stop;
1481                         orphan = 1;
1482                         ei->i_disksize = inode->i_size;
1483                 }
1484         }
1485
1486         ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1487                                  offset, nr_segs,
1488                                  ext3_direct_io_get_blocks, NULL);
1489
1490         /*
1491          * Reacquire the handle: ext3_direct_io_get_block() can restart the
1492          * transaction
1493          */
1494         handle = journal_current_handle();
1495
1496 out_stop:
1497         if (handle) {
1498                 int err;
1499
1500                 if (orphan)
1501                         ext3_orphan_del(handle, inode);
1502                 if (orphan && ret > 0) {
1503                         loff_t end = offset + ret;
1504                         if (end > inode->i_size) {
1505                                 ei->i_disksize = end;
1506                                 i_size_write(inode, end);
1507                                 err = ext3_mark_inode_dirty(handle, inode);
1508                                 if (!ret)
1509                                         ret = err;
1510                         }
1511                 }
1512                 err = ext3_journal_stop(handle);
1513                 if (ret == 0)
1514                         ret = err;
1515         }
1516 out:
1517         return ret;
1518 }
1519
/*
 * ->set_page_dirty for data=journal mode.
 *
 * Pages can be marked dirty completely asynchronously from ext3's
 * journalling activity: by filemap_sync_pte(), try_to_unmap_one(), etc.
 * Little can be done at that point, because ->set_page_dirty is called
 * under VFS locks and the page is not necessarily locked.
 *
 * Dirtying the page while leaving its attached buffers clean is not an
 * option, since the buffers' dirty state is "definitive".  Setting the
 * buffers dirty or jbddirty is not an option either, because all the
 * journalling code will explode.
 *
 * So the page is only marked "pending dirty" (PageChecked) here, and the
 * next writepage call propagates that into the buffers appropriately.
 */
static int ext3_journalled_set_page_dirty(struct page *page)
{
        int rc;

        SetPageChecked(page);
        rc = __set_page_dirty_nobuffers(page);
        return rc;
}
1538
1539 static struct address_space_operations ext3_ordered_aops = {      /* installed by ext3_set_aops() when ext3_should_order_data(inode) is true */
1540         .readpage       = ext3_readpage,
1541         .readpages      = ext3_readpages,
1542         .writepage      = ext3_ordered_writepage,      /* differs between the three aops tables */
1543         .sync_page      = block_sync_page,
1544         .prepare_write  = ext3_prepare_write,
1545         .commit_write   = ext3_ordered_commit_write,   /* differs between the three aops tables */
1546         .bmap           = ext3_bmap,
1547         .invalidatepage = ext3_invalidatepage,
1548         .releasepage    = ext3_releasepage,
1549         .direct_IO      = ext3_direct_IO,
1550 };
1551
1552 static struct address_space_operations ext3_writeback_aops = {    /* installed by ext3_set_aops() when the inode is not ordered but ext3_should_writeback_data(inode) is true */
1553         .readpage       = ext3_readpage,
1554         .readpages      = ext3_readpages,
1555         .writepage      = ext3_writeback_writepage,    /* differs between the three aops tables */
1556         .sync_page      = block_sync_page,
1557         .prepare_write  = ext3_prepare_write,
1558         .commit_write   = ext3_writeback_commit_write, /* differs between the three aops tables */
1559         .bmap           = ext3_bmap,
1560         .invalidatepage = ext3_invalidatepage,
1561         .releasepage    = ext3_releasepage,
1562         .direct_IO      = ext3_direct_IO,
1563 };
1564
1565 static struct address_space_operations ext3_journalled_aops = {   /* fallback table in ext3_set_aops(): inode is neither ordered nor writeback */
1566         .readpage       = ext3_readpage,
1567         .readpages      = ext3_readpages,
1568         .writepage      = ext3_journalled_writepage,   /* differs between the three aops tables */
1569         .sync_page      = block_sync_page,
1570         .prepare_write  = ext3_prepare_write,
1571         .commit_write   = ext3_journalled_commit_write,        /* differs between the three aops tables */
1572         .set_page_dirty = ext3_journalled_set_page_dirty,      /* only this table sets set_page_dirty */
1573         .bmap           = ext3_bmap,
1574         .invalidatepage = ext3_invalidatepage,
1575         .releasepage    = ext3_releasepage,    /* note: unlike the other two tables, no .direct_IO entry follows */
1576 };
1577
1578 void ext3_set_aops(struct inode *inode)    /* point inode->i_mapping->a_ops at the table matching the inode's data mode */
1579 {
1580         if (ext3_should_order_data(inode))         /* tested first, so it wins if both predicates were true */
1581                 inode->i_mapping->a_ops = &ext3_ordered_aops;
1582         else if (ext3_should_writeback_data(inode))
1583                 inode->i_mapping->a_ops = &ext3_writeback_aops;
1584         else       /* everything else gets the journalled table */
1585                 inode->i_mapping->a_ops = &ext3_journalled_aops;
1586 }
1587
1588 /*
1589  * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1590  * up to the end of the block which corresponds to `from'.
1591  * This is required during truncate. We need to physically zero the tail end
1592  * of that block so it doesn't yield old data if the file is later grown.
1593  */
1594 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1595                 struct address_space *mapping, loff_t from)    /* returns 0 or a negative error; always unlocks and releases @page */
1596 {
1597         unsigned long index = from >> PAGE_CACHE_SHIFT;    /* page index containing `from' */
1598         unsigned offset = from & (PAGE_CACHE_SIZE-1);      /* byte offset of `from' inside that page */
1599         unsigned blocksize, iblock, length, pos;
1600         struct inode *inode = mapping->host;
1601         struct buffer_head *bh;
1602         int err;
1603         void *kaddr;
1604
1605         blocksize = inode->i_sb->s_blocksize;
1606         length = blocksize - (offset & (blocksize - 1));   /* bytes from `from' to the end of its block */
1607         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);      /* file block number of the page's first buffer */
1608
1609         if (!page_has_buffers(page))
1610                 create_empty_buffers(page, blocksize, 0);
1611
1612         /* Find the buffer that contains "offset" */
1613         bh = page_buffers(page);
1614         pos = blocksize;   /* page offset just past the buffer currently in bh */
1615         while (offset >= pos) {
1616                 bh = bh->b_this_page;
1617                 iblock++;  /* keep the file block number in step with bh */
1618                 pos += blocksize;
1619         }
1620
1621         err = 0;
1622         if (buffer_freed(bh)) {
1623                 BUFFER_TRACE(bh, "freed: skip");
1624                 goto unlock;       /* returns 0 without zeroing anything */
1625         }
1626
1627         if (!buffer_mapped(bh)) {
1628                 BUFFER_TRACE(bh, "unmapped");
1629                 ext3_get_block(inode, iblock, bh, 0);      /* return value ignored; the outcome is judged by buffer_mapped() below */
1630                 /* unmapped? It's a hole - nothing to do */
1631                 if (!buffer_mapped(bh)) {
1632                         BUFFER_TRACE(bh, "still unmapped");
1633                         goto unlock;       /* err is still 0 here */
1634                 }
1635         }
1636
1637         /* Ok, it's mapped. Make sure it's up-to-date */
1638         if (PageUptodate(page))
1639                 set_buffer_uptodate(bh);
1640
1641         if (!buffer_uptodate(bh)) {
1642                 err = -EIO;        /* preset so the failure path below can simply jump to unlock */
1643                 ll_rw_block(READ, 1, &bh);
1644                 wait_on_buffer(bh);
1645                 /* Uhhuh. Read error. Complain and punt. */
1646                 if (!buffer_uptodate(bh))
1647                         goto unlock;
1648         }
1649
1650         if (ext3_should_journal_data(inode)) {     /* journalled data: write access must be obtained before the memset */
1651                 BUFFER_TRACE(bh, "get write access");
1652                 err = ext3_journal_get_write_access(handle, bh);
1653                 if (err)
1654                         goto unlock;
1655         }
1656
1657         kaddr = kmap_atomic(page, KM_USER0);
1658         memset(kaddr + offset, 0, length); /* zero from `from' to the end of its block */
1659         flush_dcache_page(page);
1660         kunmap_atomic(kaddr, KM_USER0);
1661
1662         BUFFER_TRACE(bh, "zeroed end of block");
1663
1664         err = 0;   /* clears the -EIO preset from the read path above */
1665         if (ext3_should_journal_data(inode)) {
1666                 err = ext3_journal_dirty_metadata(handle, bh);     /* the data buffer is filed through the metadata call in this mode */
1667         } else {
1668                 if (ext3_should_order_data(inode))
1669                         err = ext3_journal_dirty_data(handle, bh);
1670                 mark_buffer_dirty(bh);     /* runs for every inode whose data is not journalled, ordered or not */
1671         }
1672
1673 unlock:    /* every exit path comes through here */
1674         unlock_page(page);
1675         page_cache_release(page);
1676         return err;
1677 }
1678
1679 /*
1680  * Probably it should be a library function... search for first non-zero word
1681  * or memcmp with zero_page, whatever is better for particular architecture.
1682  * Linus?
1683  */
1684 static inline int all_zeroes(__le32 *p, __le32 *q) /* 1 if every 32-bit word in [p, q) is zero, else 0 */
1685 {
1686         while (p < q)      /* an empty range (p >= q) falls straight through to "return 1" */
1687                 if (*p++)
1688                         return 0;  /* first non-zero word ends the scan */
1689         return 1;
1690 }
1691
1692 /**
1693  *      ext3_find_shared - find the indirect blocks for partial truncation.
1694  *      @inode:   inode in question
1695  *      @depth:   depth of the affected branch
1696  *      @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1697  *      @chain:   place to store the pointers to partial indirect blocks
1698  *      @top:     place to store the (detached) top of branch
1699  *
1700  *      This is a helper function used by ext3_truncate().
1701  *
1702  *      When we do truncate() we may have to clean the ends of several
1703  *      indirect blocks but leave the blocks themselves alive. Block is
1704  *      partially truncated if some data below the new i_size is referred
1705  *      from it (and it is on the path to the first completely truncated
1706  *      data block, indeed).  We have to free the top of that path along
1707  *      with everything to the right of the path. Since no allocation
1708  *      past the truncation point is possible until ext3_truncate()
1709  *      finishes, we may safely do the latter, but top of branch may
1710  *      require special attention - pageout below the truncation point
1711  *      might try to populate it.
1712  *
1713  *      We atomically detach the top of branch from the tree, store the
1714  *      block number of its root in *@top, pointers to buffer_heads of
1715  *      partially truncated blocks - in @chain[].bh and pointers to
1716  *      their last elements that should not be removed - in
1717  *      @chain[].p. Return value is the pointer to last filled element
1718  *      of @chain.
1719  *
1720  *      The work left to caller to do the actual freeing of subtrees:
1721  *              a) free the subtree starting from *@top
1722  *              b) free the subtrees whose roots are stored in
1723  *                      (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1724  *              c) free the subtrees growing from the inode past the @chain[0].
1725  *                      (no partially truncated stuff there).  */
1726
1727 static Indirect *ext3_find_shared(struct inode *inode,
1728                                 int depth,
1729                                 int offsets[4],
1730                                 Indirect chain[4],
1731                                 __le32 *top)
1732 {
1733         Indirect *partial, *p;
1734         int k, err;
1735
1736         *top = 0;  /* stays 0 unless a top-of-branch block number is stored below */
1737         /* Make k index the deepest non-null offset + 1 */
1738         for (k = depth; k > 1 && !offsets[k-1]; k--)
1739                 ;
1740         partial = ext3_get_branch(inode, k, offsets, chain, &err); /* err is filled in by the callee but never examined here */
1741         /* Writer: pointers */
1742         if (!partial)
1743                 partial = chain + k-1;     /* NULL result: continue with the deepest element, chain[k-1] */
1744         /*
1745          * If the branch acquired continuation since we've looked at it -
1746          * fine, it should all survive and (new) top doesn't belong to us.
1747          */
1748         if (!partial->key && *partial->p)
1749                 /* Writer: end */
1750                 goto no_top;       /* returns with *top == 0 and no buffers released */
1751         for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)   /* walk up while nothing precedes p->p in its block */
1752                 ;
1753         /*
1754          * OK, we've found the last block that must survive. The rest of our
1755          * branch should be detached before unlocking. However, if that rest
1756          * of branch is all ours and does not grow immediately from the inode
1757          * it's easier to cheat and just decrement partial->p.
1758          */
1759         if (p == chain + k - 1 && p > chain) {
1760                 p->p--;    /* *top stays 0 on this path */
1761         } else {
1762                 *top = *p->p;      /* stored as found (__le32); the pointer itself is left in the tree */
1763                 /* Nope, don't do this in ext3.  Must leave the tree intact */
1764 #if 0
1765                 *p->p = 0;
1766 #endif
1767         }
1768         /* Writer: end */
1769
1770         while(partial > p) /* release the buffers of chain elements deeper than p */
1771         {
1772                 brelse(partial->bh);
1773                 partial--;
1774         }
1775 no_top:
1776         return partial;    /* equals p unless we arrived via the no_top jump */
1777 }
1778
1779 /*
1780  * Zero a number of block pointers in either an inode or an indirect block.
1781  * If we restart the transaction we must again get write access to the
1782  * indirect block for further modification.
1783  *
1784  * We release `count' blocks on disk, but (last - first) may be greater
1785  * than `count' because there can be holes in there.
1786  */
1787 static void
1788 ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
1789                 unsigned long block_to_free, unsigned long count,
1790                 __le32 *first, __le32 *last)   /* bh may be NULL; pointers in [first, last) are zeroed */
1791 {
1792         __le32 *p;
1793         if (try_to_extend_transaction(handle, inode)) {    /* nonzero: file what we have, restart, then retake write access */
1794                 if (bh) {
1795                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1796                         ext3_journal_dirty_metadata(handle, bh);   /* return value ignored */
1797                 }
1798                 ext3_mark_inode_dirty(handle, inode);      /* return value ignored */
1799                 ext3_journal_test_restart(handle, inode);
1800                 if (bh) {
1801                         BUFFER_TRACE(bh, "retaking write access");
1802                         ext3_journal_get_write_access(handle, bh); /* return value ignored */
1803                 }
1804         }
1805
1806         /*
1807          * Any buffers which are on the journal will be in memory. We find
1808          * them on the hash table so journal_revoke() will run journal_forget()
1809          * on them.  We've already detached each block from the file, so
1810          * bforget() in journal_forget() should be safe.
1811          *
1812          * AKPM: turn on bforget in journal_forget()!!!
1813          */
1814         for (p = first; p < last; p++) {
1815                 u32 nr = le32_to_cpu(*p);
1816                 if (nr) {  /* zero entries are holes and are skipped */
1817                         struct buffer_head *bh;    /* NOTE: shadows the bh parameter inside this block */
1818
1819                         *p = 0;    /* clear the pointer before forgetting the block */
1820                         bh = sb_find_get_block(inode->i_sb, nr);
1821                         ext3_forget(handle, 0, inode, bh, nr);     /* second argument is 0 here, 1 in ext3_free_branches() */
1822                 }
1823         }
1824
1825         ext3_free_blocks(handle, inode, block_to_free, count);     /* one call frees the whole run */
1826 }
1827
1828 /**
1829  * ext3_free_data - free a list of data blocks
1830  * @handle:     handle for this transaction
1831  * @inode:      inode we are dealing with
1832  * @this_bh:    indirect buffer_head which contains *@first and *@last
1833  * @first:      array of block numbers
1834  * @last:       points immediately past the end of array
1835  *
1836  * We are freeing all blocks referred from that array (numbers are stored as
1837  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1838  *
1839  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
1840  * blocks are contiguous then releasing them at one time will only affect one
1841  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1842  * actually use a lot of journal space.
1843  *
1844  * @this_bh will be %NULL if @first and @last point into the inode's direct
1845  * block pointers.
1846  */
1847 static void ext3_free_data(handle_t *handle, struct inode *inode,
1848                            struct buffer_head *this_bh,
1849                            __le32 *first, __le32 *last)
1850 {
1851         unsigned long block_to_free = 0;    /* Starting block # of a run */
1852         unsigned long count = 0;            /* Number of blocks in the run */
1853         __le32 *block_to_free_p = NULL;     /* Pointer into inode/ind
1854                                                corresponding to
1855                                                block_to_free */
1856         unsigned long nr;                   /* Current block # */
1857         __le32 *p;                          /* Pointer into inode/ind
1858                                                for current block */
1859         int err;
1860
1861         if (this_bh) {                          /* For indirect block */
1862                 BUFFER_TRACE(this_bh, "get_write_access");
1863                 err = ext3_journal_get_write_access(handle, this_bh);
1864                 /* Important: if we can't update the indirect pointers
1865                  * to the blocks, we can't free them. */
1866                 if (err)
1867                         return;    /* nothing is freed; the void return cannot report the error */
1868         }
1869
1870         for (p = first; p < last; p++) {
1871                 nr = le32_to_cpu(*p);
1872                 if (nr) {  /* zero entries neither extend nor break the current run */
1873                         /* accumulate blocks to free if they're contiguous */
1874                         if (count == 0) {
1875                                 block_to_free = nr;
1876                                 block_to_free_p = p;
1877                                 count = 1;
1878                         } else if (nr == block_to_free + count) {
1879                                 count++;
1880                         } else {   /* run broken: release it, then start a new run at nr */
1881                                 ext3_clear_blocks(handle, inode, this_bh,
1882                                                   block_to_free,
1883                                                   count, block_to_free_p, p);
1884                                 block_to_free = nr;
1885                                 block_to_free_p = p;
1886                                 count = 1;
1887                         }
1888                 }
1889         }
1890
1891         if (count > 0)     /* release the final run; p has reached last here */
1892                 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
1893                                   count, block_to_free_p, p);
1894
1895         if (this_bh) {
1896                 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
1897                 ext3_journal_dirty_metadata(handle, this_bh);      /* return value ignored */
1898         }
1899 }
1900
1901 /**
1902  *      ext3_free_branches - free an array of branches
1903  *      @handle: JBD handle for this transaction
1904  *      @inode: inode we are dealing with
1905  *      @parent_bh: the buffer_head which contains *@first and *@last
1906  *      @first: array of block numbers
1907  *      @last:  pointer immediately past the end of array
1908  *      @depth: depth of the branches to free
1909  *
1910  *      We are freeing all blocks referred from these branches (numbers are
1911  *      stored as little-endian 32-bit) and updating @inode->i_blocks
1912  *      appropriately.
1913  */
1914 static void ext3_free_branches(handle_t *handle, struct inode *inode,
1915                                struct buffer_head *parent_bh,
1916                                __le32 *first, __le32 *last, int depth) /* parent_bh is NULL when the pointers do not live in an indirect block */
1917 {
1918         unsigned long nr;
1919         __le32 *p;
1920
1921         if (is_handle_aborted(handle))
1922                 return;
1923
1924         if (depth--) {     /* depth > 0: [first, last) holds indirect blocks; depth now counts the levels below them */
1925                 struct buffer_head *bh;
1926                 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1927                 p = last;
1928                 while (--p >= first) {     /* walk the pointer array right to left */
1929                         nr = le32_to_cpu(*p);
1930                         if (!nr)
1931                                 continue;               /* A hole */
1932
1933                         /* Go read the buffer for the next level down */
1934                         bh = sb_bread(inode->i_sb, nr);
1935
1936                         /*
1937                          * A read failure? Report error and clear slot
1938                          * (should be rare).
1939                          */
1940                         if (!bh) {
1941                                 ext3_error(inode->i_sb, "ext3_free_branches",
1942                                            "Read failure, inode=%lu, block=%lu",
1943                                            inode->i_ino, nr);      /* both values are unsigned long, hence %lu */
1944                                 continue;  /* NOTE: despite the comment above, *p is left untouched on this path */
1945                         }
1946
1947                         /* This zaps the entire block.  Bottom up. */
1948                         BUFFER_TRACE(bh, "free child branches");
1949                         ext3_free_branches(handle, inode, bh,
1950                                            (__le32*)bh->b_data,
1951                                            (__le32*)bh->b_data + addr_per_block,
1952                                            depth);
1953
1954                         /*
1955                          * We've probably journalled the indirect block several
1956                          * times during the truncate.  But it's no longer
1957                          * needed and we now drop it from the transaction via
1958                          * journal_revoke().
1959                          *
1960                          * That's easy if it's exclusively part of this
1961                          * transaction.  But if it's part of the committing
1962                          * transaction then journal_forget() will simply
1963                          * brelse() it.  That means that if the underlying
1964                          * block is reallocated in ext3_get_block(),
1965                          * unmap_underlying_metadata() will find this block
1966                          * and will try to get rid of it.  damn, damn.
1967                          *
1968                          * If this block has already been committed to the
1969                          * journal, a revoke record will be written.  And
1970                          * revoke records must be emitted *before* clearing
1971                          * this block's bit in the bitmaps.
1972                          */
1973                         ext3_forget(handle, 1, inode, bh, bh->b_blocknr);  /* bh is not used again after this call */
1974
1975                         /*
1976                          * Everything below this pointer has been
1977                          * released.  Now let this top-of-subtree go.
1978                          *
1979                          * We want the freeing of this indirect block to be
1980                          * atomic in the journal with the updating of the
1981                          * bitmap block which owns it.  So make some room in
1982                          * the journal.
1983                          *
1984                          * We zero the parent pointer *after* freeing its
1985                          * pointee in the bitmaps, so if extend_transaction()
1986                          * for some reason fails to put the bitmap changes and
1987                          * the release into the same transaction, recovery
1988                          * will merely complain about releasing a free block,
1989                          * rather than leaking blocks.
1990                          */
1991                         if (is_handle_aborted(handle))
1992                                 return;    /* stop before touching the bitmaps */
1993                         if (try_to_extend_transaction(handle, inode)) {
1994                                 ext3_mark_inode_dirty(handle, inode);
1995                                 ext3_journal_test_restart(handle, inode);
1996                         }
1997
1998                         ext3_free_blocks(handle, inode, nr, 1);    /* frees the indirect block itself */
1999
2000                         if (parent_bh) {
2001                                 /*
2002                                  * The block which we have just freed is
2003                                  * pointed to by an indirect block: journal it
2004                                  */
2005                                 BUFFER_TRACE(parent_bh, "get_write_access");
2006                                 if (!ext3_journal_get_write_access(handle,
2007                                                                    parent_bh)){
2008                                         *p = 0;    /* cleared only when write access was granted */
2009                                         BUFFER_TRACE(parent_bh,
2010                                         "call ext3_journal_dirty_metadata");
2011                                         ext3_journal_dirty_metadata(handle,
2012                                                                     parent_bh);
2013                                 }
2014                         }  /* with parent_bh == NULL the slot is not cleared here */
2015                 }
2016         } else {
2017                 /* We have reached the bottom of the tree. */
2018                 BUFFER_TRACE(parent_bh, "free data blocks");
2019                 ext3_free_data(handle, inode, parent_bh, first, last);
2020         }
2021 }
2022
2023 /*
2024  * ext3_truncate()
2025  *
2026  * We block out ext3_get_block() block instantiations across the entire
2027  * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2028  * simultaneously on behalf of the same inode.
2029  *
2030  * As we work through the truncate and commit bits of it to the journal there
2031  * is one core, guiding principle: the file's tree must always be consistent on
2032  * disk.  We must be able to restart the truncate after a crash.
2033  *
2034  * The file's tree may be transiently inconsistent in memory (although it
2035  * probably isn't), but whenever we close off and commit a journal transaction,
2036  * the contents of (the filesystem + the journal) must be consistent and
2037  * restartable.  It's pretty simple, really: bottom up, right to left (although
2038  * left-to-right works OK too).
2039  *
2040  * Note that at recovery time, journal replay occurs *before* the restart of
2041  * truncate against the orphan inode list.
2042  *
2043  * The committed inode has the new, desired i_size (which is the same as
2044  * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
2045  * that this inode's truncate did not complete and it will again call
2046  * ext3_truncate() to have another go.  So there will be instantiated blocks
2047  * to the right of the truncation point in a crashed ext3 filesystem.  But
2048  * that's fine - as long as they are linked from the inode, the post-crash
2049  * ext3_truncate() run will find them and release them.
2050  */
2051
2052 void ext3_truncate_nocheck(struct inode * inode)   /* frees the blocks past inode->i_size; errors are not reported (void) */
2053 {
2054         handle_t *handle;
2055         struct ext3_inode_info *ei = EXT3_I(inode);
2056         __le32 *i_data = ei->i_data;
2057         int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2058         struct address_space *mapping = inode->i_mapping;
2059         int offsets[4];
2060         Indirect chain[4];
2061         Indirect *partial;
2062         __le32 nr = 0;
2063         int n;
2064         long last_block;
2065         unsigned blocksize = inode->i_sb->s_blocksize;
2066         struct page *page;
2067
2068         if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2069             S_ISLNK(inode->i_mode)))
2070                 return;    /* only regular files, directories and symlinks are handled */
2071         if (ext3_inode_is_fast_symlink(inode))
2072                 return;
2073
2074         ext3_discard_reservation(inode);
2075
2076         /*
2077          * We have to lock the EOF page here, because lock_page() nests
2078          * outside journal_start().
2079          */
2080         if ((inode->i_size & (blocksize - 1)) == 0) {
2081                 /* Block boundary? Nothing to do */
2082                 page = NULL;
2083         } else {
2084                 page = grab_cache_page(mapping,
2085                                 inode->i_size >> PAGE_CACHE_SHIFT);
2086                 if (!page)
2087                         return;    /* no EOF page: give up before any transaction is started */
2088         }
2089
2090         handle = start_transaction(inode);
2091         if (IS_ERR(handle)) {
2092                 if (page) {
2093                         clear_highpage(page);      /* NOTE(review): presumably zeroes the whole page, not just the tail past i_size - confirm */
2094                         flush_dcache_page(page);
2095                         unlock_page(page);
2096                         page_cache_release(page);
2097                 }
2098                 return;         /* AKPM: return what? */
2099         }
2100
2101         last_block = (inode->i_size + blocksize-1)
2102                                         >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);      /* i_size rounded up to whole blocks */
2103
2104         if (page)
2105                 ext3_block_truncate_page(handle, page, mapping, inode->i_size);    /* return value ignored; the callee unlocks and releases page */
2106
2107         n = ext3_block_to_path(inode, last_block, offsets, NULL);
2108         if (n == 0)
2109                 goto out_stop;  /* error */        /* NOTE: out_stop still calls ext3_orphan_del() when i_nlink != 0, though no orphan was added yet */
2110
2111         /*
2112          * OK.  This truncate is going to happen.  We add the inode to the
2113          * orphan list, so that if this truncate spans multiple transactions,
2114          * and we crash, we will resume the truncate when the filesystem
2115          * recovers.  It also marks the inode dirty, to catch the new size.
2116          *
2117          * Implication: the file must always be in a sane, consistent
2118          * truncatable state while each transaction commits.
2119          */
2120         if (ext3_orphan_add(handle, inode))
2121                 goto out_stop;
2122
2123         /*
2124          * The orphan list entry will now protect us from any crash which
2125          * occurs before the truncate completes, so it is now safe to propagate
2126          * the new, shorter inode size (held for now in i_size) into the
2127          * on-disk inode. We do this via i_disksize, which is the value which
2128          * ext3 *really* writes onto the disk inode.
2129          */
2130         ei->i_disksize = inode->i_size;
2131
2132         /*
2133          * From here we block out all ext3_get_block() callers who want to
2134          * modify the block allocation tree.
2135          */
2136         down(&ei->truncate_sem);   /* released by the up() after the switch below */
2137
2138         if (n == 1) {           /* direct blocks */
2139                 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2140                                i_data + EXT3_NDIR_BLOCKS);
2141                 goto do_indirects;
2142         }
2143
2144         partial = ext3_find_shared(inode, n, offsets, chain, &nr); /* nr receives *top: 0 means there is no top subtree to free */
2145         /* Kill the top of shared branch (not detached) */
2146         if (nr) {
2147                 if (partial == chain) {
2148                         /* Shared branch grows from the inode */
2149                         ext3_free_branches(handle, inode, NULL,
2150                                            &nr, &nr+1, (chain+n-1) - partial);     /* one-element array formed from the local copy nr */
2151                         *partial->p = 0;   /* the real pointer is cleared here, since the callee only saw the copy */
2152                         /*
2153                          * We mark the inode dirty prior to restart,
2154                          * and prior to stop.  No need for it here.
2155                          */
2156                 } else {
2157                         /* Shared branch grows from an indirect block */
2158                         BUFFER_TRACE(partial->bh, "get_write_access");
2159                         ext3_free_branches(handle, inode, partial->bh,
2160                                         partial->p,
2161                                         partial->p+1, (chain+n-1) - partial);
2162                 }
2163         }
2164         /* Clear the ends of indirect blocks on the shared branch */
2165         while (partial > chain) {
2166                 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2167                                    (__le32*)partial->bh->b_data+addr_per_block,
2168                                    (chain+n-1) - partial); /* frees everything to the right of partial->p in this block */
2169                 BUFFER_TRACE(partial->bh, "call brelse");
2170                 brelse (partial->bh);
2171                 partial--;
2172         }
2173 do_indirects:
2174         /* Kill the remaining (whole) subtrees */
2175         switch (offsets[0]) {      /* every case deliberately falls through into the next */
2176                 default:   /* offsets[0] is none of the three indirect slots */
2177                         nr = i_data[EXT3_IND_BLOCK];
2178                         if (nr) {
2179                                 ext3_free_branches(handle, inode, NULL,
2180                                                    &nr, &nr+1, 1);
2181                                 i_data[EXT3_IND_BLOCK] = 0;
2182                         }  /* fall through */
2183                 case EXT3_IND_BLOCK:
2184                         nr = i_data[EXT3_DIND_BLOCK];
2185                         if (nr) {
2186                                 ext3_free_branches(handle, inode, NULL,
2187                                                    &nr, &nr+1, 2);
2188                                 i_data[EXT3_DIND_BLOCK] = 0;
2189                         }  /* fall through */
2190                 case EXT3_DIND_BLOCK:
2191                         nr = i_data[EXT3_TIND_BLOCK];
2192                         if (nr) {
2193                                 ext3_free_branches(handle, inode, NULL,
2194                                                    &nr, &nr+1, 3);
2195                                 i_data[EXT3_TIND_BLOCK] = 0;
2196                         }  /* fall through */
2197                 case EXT3_TIND_BLOCK:
2198                         ;       /* nothing whole is left to free */
2199         }
2200         up(&ei->truncate_sem);
2201         inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2202         ext3_mark_inode_dirty(handle, inode);      /* return value ignored */
2203
2204         /* In a multi-transaction truncate, we only make the final
2205          * transaction synchronous */
2206         if (IS_SYNC(inode))
2207                 handle->h_sync = 1;
2208 out_stop:
2209         /*
2210          * If this was a simple ftruncate(), and the file will remain alive
2211          * then we need to clear up the orphan record which we created above.
2212          * However, if this was a real unlink then we were called by
2213          * ext3_delete_inode(), and we allow that function to clean up the
2214          * orphan info for us.
2215          */
2216         if (inode->i_nlink)
2217                 ext3_orphan_del(handle, inode);
2218
2219         ext3_journal_stop(handle); /* return value ignored */
2220 }
2221
2222 static unsigned long ext3_get_inode_block(struct super_block *sb,
2223                 unsigned long ino, struct ext3_iloc *iloc)     /* returns the block holding inode `ino', or 0 on error; fills iloc->block_group and iloc->offset */
2224 {
2225         unsigned long desc, group_desc, block_group;
2226         unsigned long offset, block;
2227         struct buffer_head *bh;
2228         struct ext3_group_desc * gdp;
2229
2230
2231         if ((ino != EXT3_ROOT_INO &&       /* reject numbers below EXT3_FIRST_INO(sb), except these three, and any above s_inodes_count */
2232                 ino != EXT3_JOURNAL_INO &&
2233                 ino != EXT3_RESIZE_INO &&
2234                 ino < EXT3_FIRST_INO(sb)) ||
2235                 ino > le32_to_cpu(
2236                         EXT3_SB(sb)->s_es->s_inodes_count)) {
2237                 ext3_error (sb, "ext3_get_inode_block",
2238                             "bad inode number: %lu", ino);
2239                 return 0;  /* 0 is the error value; ext3_get_inode_loc() turns it into -EIO */
2240         }
2241         block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);       /* the arithmetic treats inode numbers as 1-based */
2242         if (block_group >= EXT3_SB(sb)->s_groups_count) {
2243                 ext3_error (sb, "ext3_get_inode_block",
2244                             "group >= groups count");
2245                 return 0;
2246         }
2247         smp_rmb(); /* NOTE(review): the matching write barrier is not visible here - presumably orders the s_groups_count read against s_group_desc[]; confirm */
2248         group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);  /* which descriptor block */
2249         desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);        /* index of the descriptor inside that block */
2250         bh = EXT3_SB(sb)->s_group_desc[group_desc];
2251         if (!bh) {
2252                 ext3_error (sb, "ext3_get_inode_block",
2253                             "Descriptor not loaded");
2254                 return 0;
2255         }
2256
2257         gdp = (struct ext3_group_desc *) bh->b_data;
2258         /*
2259          * Figure out the offset within the block group inode table
2260          */
2261         offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2262                 EXT3_INODE_SIZE(sb);       /* byte offset from the start of the group's inode table */
2263         block = le32_to_cpu(gdp[desc].bg_inode_table) +
2264                 (offset >> EXT3_BLOCK_SIZE_BITS(sb));      /* table start plus whole blocks skipped */
2265
2266         iloc->block_group = block_group;
2267         iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1); /* byte offset of the inode inside `block' */
2268         return block;
2269 }
2270
2271 /*
2272  * ext3_get_inode_loc returns with an extra refcount against the inode's
2273  * underlying buffer_head on success.  If `in_mem' is false then we're purely
2274  * trying to determine the inode's location on-disk and no read need be
2275  * performed.
2276  */
2277 static int ext3_get_inode_loc(struct inode *inode,
2278                                 struct ext3_iloc *iloc, int in_mem)
2279 {
2280         unsigned long block;
2281         struct buffer_head *bh;
2282
2283         block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2284         if (!block)
2285                 return -EIO;
2286
2287         bh = sb_getblk(inode->i_sb, block);
2288         if (!bh) {
2289                 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2290                                 "unable to read inode block - "
2291                                 "inode=%lu, block=%lu", inode->i_ino, block);
2292                 return -EIO;
2293         }
2294         if (!buffer_uptodate(bh)) {
2295                 lock_buffer(bh);
2296                 if (buffer_uptodate(bh)) {
2297                         /* someone brought it uptodate while we waited */
2298                         unlock_buffer(bh);
2299                         goto has_buffer;
2300                 }
2301
2302                 /* we can't skip I/O if inode is on a disk only */
2303                 if (in_mem) {
2304                         struct buffer_head *bitmap_bh;
2305                         struct ext3_group_desc *desc;
2306                         int inodes_per_buffer;
2307                         int inode_offset, i;
2308                         int block_group;
2309                         int start;
2310
2311                         /*
2312                          * If this is the only valid inode in the block we
2313                          * need not read the block.
2314                          */
2315                         block_group = (inode->i_ino - 1) /
2316                                         EXT3_INODES_PER_GROUP(inode->i_sb);
2317                         inodes_per_buffer = bh->b_size /
2318                                 EXT3_INODE_SIZE(inode->i_sb);
2319                         inode_offset = ((inode->i_ino - 1) %
2320                                         EXT3_INODES_PER_GROUP(inode->i_sb));
2321                         start = inode_offset & ~(inodes_per_buffer - 1);
2322
2323                         /* Is the inode bitmap in cache? */
2324                         desc = ext3_get_group_desc(inode->i_sb,
2325                                                 block_group, NULL);
2326                         if (!desc)
2327                                 goto make_io;
2328
2329                         bitmap_bh = sb_getblk(inode->i_sb,
2330                                         le32_to_cpu(desc->bg_inode_bitmap));
2331                         if (!bitmap_bh)
2332                                 goto make_io;
2333
2334                         /*
2335                          * If the inode bitmap isn't in cache then the
2336                          * optimisation may end up performing two reads instead
2337                          * of one, so skip it.
2338                          */
2339                         if (!buffer_uptodate(bitmap_bh)) {
2340                                 brelse(bitmap_bh);
2341                                 goto make_io;
2342                         }
2343                         for (i = start; i < start + inodes_per_buffer; i++) {
2344                                 if (i == inode_offset)
2345                                         continue;
2346                                 if (ext3_test_bit(i, bitmap_bh->b_data))
2347                                         break;
2348                         }
2349                         brelse(bitmap_bh);
2350                         if (i == start + inodes_per_buffer) {
2351                                 /* all other inodes are free, so skip I/O */
2352                                 memset(bh->b_data, 0, bh->b_size);
2353                                 set_buffer_uptodate(bh);
2354                                 unlock_buffer(bh);
2355                                 goto has_buffer;
2356                         }
2357                 }
2358
2359 make_io:
2360                 /*
2361                  * There are another valid inodes in the buffer so we must
2362                  * read the block from disk
2363                  */
2364                 get_bh(bh);
2365                 bh->b_end_io = end_buffer_read_sync;
2366                 submit_bh(READ, bh);
2367                 wait_on_buffer(bh);
2368                 if (!buffer_uptodate(bh)) {
2369                         ext3_error(inode->i_sb, "ext3_get_inode_loc",
2370                                         "unable to read inode block - "
2371                                         "inode=%lu, block=%lu",
2372                                         inode->i_ino, block);
2373                         brelse(bh);
2374                         return -EIO;
2375                 }
2376         }
2377 has_buffer:
2378         iloc->bh = bh;
2379         return 0;
2380 }
2381
/*
 * Truncate entry point that refuses to act on append-only or immutable
 * inodes; everything else is handed to ext3_truncate_nocheck().
 */
void ext3_truncate(struct inode * inode)
{
        if (!IS_APPEND(inode) && !IS_IMMUTABLE(inode))
                ext3_truncate_nocheck(inode);
}
2388
2389 void ext3_set_inode_flags(struct inode *inode)
2390 {
2391         unsigned int flags = EXT3_I(inode)->i_flags;
2392
2393         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_IUNLINK|S_BARRIER|S_NOATIME|S_DIRSYNC);
2394         if (flags & EXT3_SYNC_FL)
2395                 inode->i_flags |= S_SYNC;
2396         if (flags & EXT3_APPEND_FL)
2397                 inode->i_flags |= S_APPEND;
2398         if (flags & EXT3_IMMUTABLE_FL)
2399                 inode->i_flags |= S_IMMUTABLE;
2400         if (flags & EXT3_IUNLINK_FL)
2401                 inode->i_flags |= S_IUNLINK;
2402         if (flags & EXT3_BARRIER_FL)
2403                 inode->i_flags |= S_BARRIER;
2404         if (flags & EXT3_NOATIME_FL)
2405                 inode->i_flags |= S_NOATIME;
2406         if (flags & EXT3_DIRSYNC_FL)
2407                 inode->i_flags |= S_DIRSYNC;
2408 }
2409
2410 void ext3_read_inode(struct inode * inode)
2411 {
2412         struct ext3_iloc iloc;
2413         struct ext3_inode *raw_inode;
2414         struct ext3_inode_info *ei = EXT3_I(inode);
2415         struct buffer_head *bh;
2416         int block;
2417         uid_t uid;
2418         gid_t gid;
2419
2420 #ifdef CONFIG_EXT3_FS_POSIX_ACL
2421         ei->i_acl = EXT3_ACL_NOT_CACHED;
2422         ei->i_default_acl = EXT3_ACL_NOT_CACHED;
2423 #endif
2424         ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
2425
2426         if (ext3_get_inode_loc(inode, &iloc, 0))
2427                 goto bad_inode;
2428         bh = iloc.bh;
2429         raw_inode = ext3_raw_inode(&iloc);
2430         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2431         uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2432         gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2433         if(!(test_opt (inode->i_sb, NO_UID32))) {
2434                 uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2435                 gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2436         }
2437         inode->i_uid = INOXID_UID(XID_TAG(inode), uid, gid);
2438         inode->i_gid = INOXID_GID(XID_TAG(inode), uid, gid);
2439         inode->i_xid = INOXID_XID(XID_TAG(inode), uid, gid,
2440                 le16_to_cpu(raw_inode->i_raw_xid));
2441
2442         inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2443         inode->i_size = le32_to_cpu(raw_inode->i_size);
2444         inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2445         inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2446         inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2447         inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2448
2449         ei->i_state = 0;
2450         ei->i_next_alloc_block = 0;
2451         ei->i_next_alloc_goal = 0;
2452         ei->i_dir_start_lookup = 0;
2453         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2454         /* We now have enough fields to check if the inode was active or not.
2455          * This is needed because nfsd might try to access dead inodes
2456          * the test is that same one that e2fsck uses
2457          * NeilBrown 1999oct15
2458          */
2459         if (inode->i_nlink == 0) {
2460                 if (inode->i_mode == 0 ||
2461                     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2462                         /* this inode is deleted */
2463                         brelse (bh);
2464                         goto bad_inode;
2465                 }
2466                 /* The only unlinked inodes we let through here have
2467                  * valid i_mode and are being read by the orphan
2468                  * recovery code: that's fine, we're about to complete
2469                  * the process of deleting those. */
2470         }
2471         inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
2472                                          * (for stat), not the fs block
2473                                          * size */
2474         inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2475         ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2476 #ifdef EXT3_FRAGMENTS
2477         ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2478         ei->i_frag_no = raw_inode->i_frag;
2479         ei->i_frag_size = raw_inode->i_fsize;
2480 #endif
2481         ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2482         if (!S_ISREG(inode->i_mode)) {
2483                 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2484         } else {
2485                 inode->i_size |=
2486                         ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2487         }
2488         ei->i_disksize = inode->i_size;
2489         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2490         ei->i_block_group = iloc.block_group;
2491         ei->i_rsv_window.rsv_start = 0;
2492         ei->i_rsv_window.rsv_end= 0;
2493         atomic_set(&ei->i_rsv_window.rsv_goal_size, EXT3_DEFAULT_RESERVE_BLOCKS);
2494         seqlock_init(&ei->i_rsv_window.rsv_seqlock);
2495         /*
2496          * NOTE! The in-memory inode i_data array is in little-endian order
2497          * even on big-endian machines: we do NOT byteswap the block numbers!
2498          */
2499         for (block = 0; block < EXT3_N_BLOCKS; block++)
2500                 ei->i_data[block] = raw_inode->i_block[block];
2501         INIT_LIST_HEAD(&ei->i_orphan);
2502
2503         if (S_ISREG(inode->i_mode)) {
2504                 inode->i_op = &ext3_file_inode_operations;
2505                 inode->i_fop = &ext3_file_operations;
2506                 ext3_set_aops(inode);
2507         } else if (S_ISDIR(inode->i_mode)) {
2508                 inode->i_op = &ext3_dir_inode_operations;
2509                 inode->i_fop = &ext3_dir_operations;
2510         } else if (S_ISLNK(inode->i_mode)) {
2511                 if (ext3_inode_is_fast_symlink(inode))
2512                         inode->i_op = &ext3_fast_symlink_inode_operations;
2513                 else {
2514                         inode->i_op = &ext3_symlink_inode_operations;
2515                         ext3_set_aops(inode);
2516                 }
2517         } else {
2518                 inode->i_op = &ext3_special_inode_operations;
2519                 if (raw_inode->i_block[0])
2520                         init_special_inode(inode, inode->i_mode,
2521                            old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2522                 else
2523                         init_special_inode(inode, inode->i_mode,
2524                            new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2525         }
2526         brelse (iloc.bh);
2527         ext3_set_inode_flags(inode);
2528         return;
2529
2530 bad_inode:
2531         make_bad_inode(inode);
2532         return;
2533 }
2534
2535 /*
2536  * Post the struct inode info into an on-disk inode location in the
2537  * buffer-cache.  This gobbles the caller's reference to the
2538  * buffer_head in the inode location struct.
2539  *
2540  * The caller must have write access to iloc->bh.
2541  */
2542 static int ext3_do_update_inode(handle_t *handle,
2543                                 struct inode *inode,
2544                                 struct ext3_iloc *iloc)
2545 {
2546         struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
2547         struct ext3_inode_info *ei = EXT3_I(inode);
2548         struct buffer_head *bh = iloc->bh;
2549         uid_t uid = XIDINO_UID(XID_TAG(inode), inode->i_uid, inode->i_xid);
2550         gid_t gid = XIDINO_GID(XID_TAG(inode), inode->i_gid, inode->i_xid);
2551         int err = 0, rc, block;
2552
2553         /* For fields not not tracking in the in-memory inode,
2554          * initialise them to zero for new inodes. */
2555         if (ei->i_state & EXT3_STATE_NEW)
2556                 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2557
2558         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2559         if(!(test_opt(inode->i_sb, NO_UID32))) {
2560                 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid));
2561                 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid));
2562 /*
2563  * Fix up interoperability with old kernels. Otherwise, old inodes get
2564  * re-used with the upper 16 bits of the uid/gid intact
2565  */
2566                 if(!ei->i_dtime) {
2567                         raw_inode->i_uid_high =
2568                                 cpu_to_le16(high_16_bits(uid));
2569                         raw_inode->i_gid_high =
2570                                 cpu_to_le16(high_16_bits(gid));
2571                 } else {
2572                         raw_inode->i_uid_high = 0;
2573                         raw_inode->i_gid_high = 0;
2574                 }
2575         } else {
2576                 raw_inode->i_uid_low =
2577                         cpu_to_le16(fs_high2lowuid(uid));
2578                 raw_inode->i_gid_low =
2579                         cpu_to_le16(fs_high2lowgid(gid));
2580                 raw_inode->i_uid_high = 0;
2581                 raw_inode->i_gid_high = 0;
2582         }
2583 #ifdef CONFIG_INOXID_GID32
2584         raw_inode->i_raw_xid = cpu_to_le16(inode->i_xid);
2585 #endif
2586         raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2587         raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2588         raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2589         raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2590         raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2591         raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2592         raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2593         raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2594 #ifdef EXT3_FRAGMENTS
2595         raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2596         raw_inode->i_frag = ei->i_frag_no;
2597         raw_inode->i_fsize = ei->i_frag_size;
2598 #endif
2599         raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
2600         if (!S_ISREG(inode->i_mode)) {
2601                 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2602         } else {
2603                 raw_inode->i_size_high =
2604                         cpu_to_le32(ei->i_disksize >> 32);
2605                 if (ei->i_disksize > 0x7fffffffULL) {
2606                         struct super_block *sb = inode->i_sb;
2607                         if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
2608                                         EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
2609                             EXT3_SB(sb)->s_es->s_rev_level ==
2610                                         cpu_to_le32(EXT3_GOOD_OLD_REV)) {
2611                                /* If this is the first large file
2612                                 * created, add a flag to the superblock.
2613                                 */
2614                                 err = ext3_journal_get_write_access(handle,
2615                                                 EXT3_SB(sb)->s_sbh);
2616                                 if (err)
2617                                         goto out_brelse;
2618                                 ext3_update_dynamic_rev(sb);
2619                                 EXT3_SET_RO_COMPAT_FEATURE(sb,
2620                                         EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2621                                 sb->s_dirt = 1;
2622                                 handle->h_sync = 1;
2623                                 err = ext3_journal_dirty_metadata(handle,
2624                                                 EXT3_SB(sb)->s_sbh);
2625                         }
2626                 }
2627         }
2628         raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2629         if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
2630                 if (old_valid_dev(inode->i_rdev)) {
2631                         raw_inode->i_block[0] =
2632                                 cpu_to_le32(old_encode_dev(inode->i_rdev));
2633                         raw_inode->i_block[1] = 0;
2634                 } else {
2635                         raw_inode->i_block[0] = 0;
2636                         raw_inode->i_block[1] =
2637                                 cpu_to_le32(new_encode_dev(inode->i_rdev));
2638                         raw_inode->i_block[2] = 0;
2639                 }
2640         } else for (block = 0; block < EXT3_N_BLOCKS; block++)
2641                 raw_inode->i_block[block] = ei->i_data[block];
2642
2643         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2644         rc = ext3_journal_dirty_metadata(handle, bh);
2645         if (!err)
2646                 err = rc;
2647         ei->i_state &= ~EXT3_STATE_NEW;
2648
2649 out_brelse:
2650         brelse (bh);
2651         ext3_std_error(inode->i_sb, err);
2652         return err;
2653 }
2654
2655 /*
2656  * ext3_write_inode()
2657  *
2658  * We are called from a few places:
2659  *
2660  * - Within generic_file_write() for O_SYNC files.
2661  *   Here, there will be no transaction running. We wait for any running
2662  *   trasnaction to commit.
2663  *
2664  * - Within sys_sync(), kupdate and such.
2665  *   We wait on commit, if tol to.
2666  *
2667  * - Within prune_icache() (PF_MEMALLOC == true)
2668  *   Here we simply return.  We can't afford to block kswapd on the
2669  *   journal commit.
2670  *
2671  * In all cases it is actually safe for us to return without doing anything,
2672  * because the inode has been copied into a raw inode buffer in
2673  * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
2674  * knfsd.
2675  *
2676  * Note that we are absolutely dependent upon all inode dirtiers doing the
2677  * right thing: they *must* call mark_inode_dirty() after dirtying info in
2678  * which we are interested.
2679  *
2680  * It would be a bug for them to not do this.  The code:
2681  *
2682  *      mark_inode_dirty(inode)
2683  *      stuff();
2684  *      inode->i_size = expr;
2685  *
2686  * is in error because a kswapd-driven write_inode() could occur while
2687  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
2688  * will no longer be on the superblock's dirty inode list.
2689  */
2690 int ext3_write_inode(struct inode *inode, int wait)
2691 {
2692         if (current->flags & PF_MEMALLOC)
2693                 return 0;
2694
2695         if (ext3_journal_current_handle()) {
2696                 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2697                 dump_stack();
2698                 return -EIO;
2699         }
2700
2701         if (!wait)
2702                 return 0;
2703
2704         return ext3_force_commit(inode->i_sb);
2705 }
2706
2707 int ext3_setattr_flags(struct inode *inode, unsigned int flags)
2708 {
2709         unsigned int oldflags, newflags;
2710         int err = 0;
2711
2712         oldflags = EXT3_I(inode)->i_flags;
2713         newflags = oldflags &
2714                 ~(EXT3_IMMUTABLE_FL | EXT3_IUNLINK_FL | EXT3_BARRIER_FL);
2715         if (flags & ATTR_FLAG_IMMUTABLE)
2716                 newflags |= EXT3_IMMUTABLE_FL;
2717         if (flags & ATTR_FLAG_IUNLINK)
2718                 newflags |= EXT3_IUNLINK_FL;
2719         if (flags & ATTR_FLAG_BARRIER)
2720                 newflags |= EXT3_BARRIER_FL;
2721
2722         if (oldflags ^ newflags) {
2723                 handle_t *handle;
2724                 struct ext3_iloc iloc;
2725
2726                 handle = ext3_journal_start(inode, 1);
2727                 if (IS_ERR(handle))
2728                         return PTR_ERR(handle);
2729                 if (IS_SYNC(inode))
2730                         handle->h_sync = 1;
2731                 err = ext3_reserve_inode_write(handle, inode, &iloc);
2732                 if (err)
2733                         goto flags_err;
2734
2735                 EXT3_I(inode)->i_flags = newflags;
2736                 inode->i_ctime = CURRENT_TIME;
2737
2738                 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2739         flags_err:
2740                 ext3_journal_stop(handle);
2741         }
2742         return err;
2743 }
2744
2745 /*
2746  * ext3_setattr()
2747  *
2748  * Called from notify_change.
2749  *
2750  * We want to trap VFS attempts to truncate the file as soon as
2751  * possible.  In particular, we want to make sure that when the VFS
2752  * shrinks i_size, we put the inode on the orphan list and modify
2753  * i_disksize immediately, so that during the subsequent flushing of
2754  * dirty pages and freeing of disk blocks, we can guarantee that any
2755  * commit will leave the blocks being flushed in an unused state on
2756  * disk.  (On recovery, the inode will get truncated and the blocks will
2757  * be freed, so we have a strong guarantee that no future commit will
2758  * leave these blocks visible to the user.)
2759  *
2760  * Called with inode->sem down.
2761  */
2762 int ext3_setattr(struct dentry *dentry, struct iattr *attr)
2763 {
2764         struct inode *inode = dentry->d_inode;
2765         int error, rc = 0;
2766         const unsigned int ia_valid = attr->ia_valid;
2767
2768         error = inode_change_ok(inode, attr);
2769         if (error)
2770                 return error;
2771
2772         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2773                 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid) ||
2774                 (ia_valid & ATTR_XID && attr->ia_xid != inode->i_xid)) {
2775                 handle_t *handle;
2776
2777                 /* (user+group)*(old+new) structure, inode write (sb,
2778                  * inode block, ? - but truncate inode update has it) */
2779                 handle = ext3_journal_start(inode, 4*EXT3_QUOTA_INIT_BLOCKS+3);
2780                 if (IS_ERR(handle)) {
2781                         error = PTR_ERR(handle);
2782                         goto err_out;
2783                 }
2784                 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2785                 if (error) {
2786                         ext3_journal_stop(handle);
2787                         return error;
2788                 }
2789                 /* Update corresponding info in inode so that everything is in
2790                  * one transaction */
2791                 if (attr->ia_valid & ATTR_UID)
2792                         inode->i_uid = attr->ia_uid;
2793                 if (attr->ia_valid & ATTR_GID)
2794                         inode->i_gid = attr->ia_gid;
2795                 if ((attr->ia_valid & ATTR_XID)
2796                         && inode->i_sb
2797                         && (inode->i_sb->s_flags & MS_TAGXID))
2798                         inode->i_xid = attr->ia_xid;
2799                 error = ext3_mark_inode_dirty(handle, inode);
2800                 ext3_journal_stop(handle);
2801         }
2802
2803         if (S_ISREG(inode->i_mode) &&
2804             attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2805                 handle_t *handle;
2806
2807                 handle = ext3_journal_start(inode, 3);
2808                 if (IS_ERR(handle)) {
2809                         error = PTR_ERR(handle);
2810                         goto err_out;
2811                 }
2812
2813                 error = ext3_orphan_add(handle, inode);
2814                 EXT3_I(inode)->i_disksize = attr->ia_size;
2815                 rc = ext3_mark_inode_dirty(handle, inode);
2816                 if (!error)
2817                         error = rc;
2818                 ext3_journal_stop(handle);
2819         }
2820
2821         if (ia_valid & ATTR_ATTR_FLAG) {
2822                 rc = ext3_setattr_flags(inode, attr->ia_attr_flags);
2823                 if (!error)
2824                         error = rc;
2825         }
2826
2827         rc = inode_setattr(inode, attr);
2828
2829         /* If inode_setattr's call to ext3_truncate failed to get a
2830          * transaction handle at all, we need to clean up the in-core
2831          * orphan list manually. */
2832         if (inode->i_nlink)
2833                 ext3_orphan_del(NULL, inode);
2834
2835         if (!rc && (ia_valid & ATTR_MODE))
2836                 rc = ext3_acl_chmod(inode);
2837
2838 err_out:
2839         ext3_std_error(inode->i_sb, error);
2840         if (!error)
2841                 error = rc;
2842         return error;
2843 }
2844
2845
2846 /*
2847  * akpm: how many blocks doth make a writepage()?
2848  *
2849  * With N blocks per page, it may be:
2850  * N data blocks
2851  * 2 indirect block
2852  * 2 dindirect
2853  * 1 tindirect
2854  * N+5 bitmap blocks (from the above)
2855  * N+5 group descriptor summary blocks
2856  * 1 inode block
2857  * 1 superblock.
2858  * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files
2859  *
2860  * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
2861  *
2862  * With ordered or writeback data it's the same, less the N data blocks.
2863  *
2864  * If the inode's direct blocks can hold an integral number of pages then a
2865  * page cannot straddle two indirect blocks, and we can only touch one indirect
2866  * and dindirect block, and the "5" above becomes "3".
2867  *
2868  * This still overestimates under most circumstances.  If we were to pass the
2869  * start and end offsets in here as well we could do block_to_path() on each
2870  * block and work out the exact number of indirects which are touched.  Pah.
2871  */
2872
2873 int ext3_writepage_trans_blocks(struct inode *inode)
2874 {
2875         int bpp = ext3_journal_blocks_per_page(inode);
2876         int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
2877         int ret;
2878
2879         if (ext3_should_journal_data(inode))
2880                 ret = 3 * (bpp + indirects) + 2;
2881         else
2882                 ret = 2 * (bpp + indirects) + 2;
2883
2884 #ifdef CONFIG_QUOTA
2885         /* We know that structure was already allocated during DQUOT_INIT so
2886          * we will be updating only the data blocks + inodes */
2887         ret += 2*EXT3_QUOTA_TRANS_BLOCKS;
2888 #endif
2889
2890         return ret;
2891 }
2892
2893 /*
2894  * The caller must have previously called ext3_reserve_inode_write().
2895  * Give this, we know that the caller already has write access to iloc->bh.
2896  */
2897 int ext3_mark_iloc_dirty(handle_t *handle,
2898                 struct inode *inode, struct ext3_iloc *iloc)
2899 {
2900         int err = 0;
2901
2902         /* the do_update_inode consumes one bh->b_count */
2903         get_bh(iloc->bh);
2904
2905         /* ext3_do_update_inode() does journal_dirty_metadata */
2906         err = ext3_do_update_inode(handle, inode, iloc);
2907         put_bh(iloc->bh);
2908         return err;
2909 }
2910
2911 /*
2912  * On success, We end up with an outstanding reference count against
2913  * iloc->bh.  This _must_ be cleaned up later.
2914  */
2915
2916 int
2917 ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
2918                          struct ext3_iloc *iloc)
2919 {
2920         int err = 0;
2921         if (handle) {
2922                 err = ext3_get_inode_loc(inode, iloc, 1);
2923                 if (!err) {
2924                         BUFFER_TRACE(iloc->bh, "get_write_access");
2925                         err = ext3_journal_get_write_access(handle, iloc->bh);
2926                         if (err) {
2927                                 brelse(iloc->bh);
2928                                 iloc->bh = NULL;
2929                         }
2930                 }
2931         }
2932         ext3_std_error(inode->i_sb, err);
2933         return err;
2934 }
2935
2936 /*
2937  * akpm: What we do here is to mark the in-core inode as clean
2938  * with respect to inode dirtiness (it may still be data-dirty).
2939  * This means that the in-core inode may be reaped by prune_icache
2940  * without having to perform any I/O.  This is a very good thing,
2941  * because *any* task may call prune_icache - even ones which
2942  * have a transaction open against a different journal.
2943  *
2944  * Is this cheating?  Not really.  Sure, we haven't written the
2945  * inode out, but prune_icache isn't a user-visible syncing function.
2946  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
2947  * we start and wait on commits.
2948  *
2949  * Is this efficient/effective?  Well, we're being nice to the system
2950  * by cleaning up our inodes proactively so they can be reaped
2951  * without I/O.  But we are potentially leaving up to five seconds'
2952  * worth of inodes floating about which prune_icache wants us to
2953  * write out.  One way to fix that would be to get prune_icache()
2954  * to do a write_super() to free up some memory.  It has the desired
2955  * effect.
2956  */
2957 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
2958 {
2959         struct ext3_iloc iloc;
2960         int err;
2961
2962         might_sleep();
2963         err = ext3_reserve_inode_write(handle, inode, &iloc);
2964         if (!err)
2965                 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2966         return err;
2967 }
2968
2969 /*
2970  * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
2971  *
2972  * We're really interested in the case where a file is being extended.
2973  * i_size has been changed by generic_commit_write() and we thus need
2974  * to include the updated inode in the current transaction.
2975  *
2976  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
2977  * are allocated to the file.
2978  *
2979  * If the inode is marked synchronous, we don't honour that here - doing
2980  * so would cause a commit on atime updates, which we don't bother doing.
2981  * We handle synchronous inodes at the highest possible level.
2982  */
2983 void ext3_dirty_inode(struct inode *inode)
2984 {
2985         handle_t *current_handle = ext3_journal_current_handle();
2986         handle_t *handle;
2987
2988         handle = ext3_journal_start(inode, 2);
2989         if (IS_ERR(handle))
2990                 goto out;
2991         if (current_handle &&
2992                 current_handle->h_transaction != handle->h_transaction) {
2993                 /* This task has a transaction open against a different fs */
2994                 printk(KERN_EMERG "%s: transactions do not match!\n",
2995                        __FUNCTION__);
2996         } else {
2997                 jbd_debug(5, "marking dirty.  outer handle=%p\n",
2998                                 current_handle);
2999                 ext3_mark_inode_dirty(handle, inode);
3000         }
3001         ext3_journal_stop(handle);
3002 out:
3003         return;
3004 }
3005
3006 #ifdef AKPM
3007 /*
3008  * Bind an inode's backing buffer_head into this transaction, to prevent
3009  * it from being flushed to disk early.  Unlike
3010  * ext3_reserve_inode_write, this leaves behind no bh reference and
3011  * returns no iloc structure, so the caller needs to repeat the iloc
3012  * lookup to mark the inode dirty later.
3013  */
3014 static inline int
3015 ext3_pin_inode(handle_t *handle, struct inode *inode)
3016 {
3017         struct ext3_iloc iloc;
3018
3019         int err = 0;
3020         if (handle) {
3021                 err = ext3_get_inode_loc(inode, &iloc, 1);
3022                 if (!err) {
3023                         BUFFER_TRACE(iloc.bh, "get_write_access");
3024                         err = journal_get_write_access(handle, iloc.bh);
3025                         if (!err)
3026                                 err = ext3_journal_dirty_metadata(handle,
3027                                                                   iloc.bh);
3028                         brelse(iloc.bh);
3029                 }
3030         }
3031         ext3_std_error(inode->i_sb, err);
3032         return err;
3033 }
3034 #endif
3035
3036 int ext3_change_inode_journal_flag(struct inode *inode, int val)
3037 {
3038         journal_t *journal;
3039         handle_t *handle;
3040         int err;
3041
3042         /*
3043          * We have to be very careful here: changing a data block's
3044          * journaling status dynamically is dangerous.  If we write a
3045          * data block to the journal, change the status and then delete
3046          * that block, we risk forgetting to revoke the old log record
3047          * from the journal and so a subsequent replay can corrupt data.
3048          * So, first we make sure that the journal is empty and that
3049          * nobody is changing anything.
3050          */
3051
3052         journal = EXT3_JOURNAL(inode);
3053         if (is_journal_aborted(journal) || IS_RDONLY(inode))
3054                 return -EROFS;
3055
3056         journal_lock_updates(journal);
3057         journal_flush(journal);
3058
3059         /*
3060          * OK, there are no updates running now, and all cached data is
3061          * synced to disk.  We are now in a completely consistent state
3062          * which doesn't have anything in the journal, and we know that
3063          * no filesystem updates are running, so it is safe to modify
3064          * the inode's in-core data-journaling state flag now.
3065          */
3066
3067         if (val)
3068                 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3069         else
3070                 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
3071         ext3_set_aops(inode);
3072
3073         journal_unlock_updates(journal);
3074
3075         /* Finally we can mark the inode as dirty. */
3076
3077         handle = ext3_journal_start(inode, 1);
3078         if (IS_ERR(handle))
3079                 return PTR_ERR(handle);
3080
3081         err = ext3_mark_inode_dirty(handle, inode);
3082         handle->h_sync = 1;
3083         ext3_journal_stop(handle);
3084         ext3_std_error(inode->i_sb, err);
3085
3086         return err;
3087 }