fs/reiserfs/inode.c

   1 /*
   2  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
   3  */
   4
   5 #include <linux/config.h>
   6 #include <linux/time.h>
   7 #include <linux/fs.h>
   8 #include <linux/reiserfs_fs.h>
   9 #include <linux/reiserfs_acl.h>
  10 #include <linux/reiserfs_xattr.h>
  11 #include <linux/smp_lock.h>
  12 #include <linux/pagemap.h>
  13 #include <linux/highmem.h>
  14 #include <asm/uaccess.h>
  15 #include <asm/unaligned.h>
  16 #include <linux/buffer_head.h>
  17 #include <linux/mpage.h>
  18 #include <linux/writeback.h>
  19 #include <linux/quotaops.h>
  20 #include <linux/vserver/xid.h>
  21
  22 extern int reiserfs_default_io_size; /* default io size defined in super.c */
  23
  24 static int reiserfs_commit_write(struct file *f, struct page *page,
  25                                  unsigned from, unsigned to);
  26 static int reiserfs_prepare_write(struct file *f, struct page *page,
  27                                   unsigned from, unsigned to);
  28
  29 void reiserfs_delete_inode (struct inode * inode)
  30 {
  31     /* We need blocks for transaction + (user+group) quota update (possibly delete) */
  32     int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS;
  33     struct reiserfs_transaction_handle th ;
  34
  35     reiserfs_write_lock(inode->i_sb);
  36
  37     /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
  38     if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
  39         down (&inode->i_sem);
  40
  41         reiserfs_delete_xattrs (inode);
  42
  43         if (journal_begin(&th, inode->i_sb, jbegin_count)) {
  44             up (&inode->i_sem);
  45             goto out;
  46         }
  47         reiserfs_update_inode_transaction(inode) ;
  48
  49         if (reiserfs_delete_object (&th, inode)) {
  50             up (&inode->i_sem);
  51             goto out;
  52         }
  53
  54         /* Do quota update inside a transaction for journaled quotas. We must do that
  55          * after delete_object so that quota updates go into the same transaction as
  56          * stat data deletion */
  57         DQUOT_FREE_INODE(inode);
  58
  59         if (journal_end(&th, inode->i_sb, jbegin_count)) {
  60             up (&inode->i_sem);
  61             goto out;
  62         }
  63
  64         up (&inode->i_sem);
  65
  66         /* all items of file are deleted, so we can remove "save" link */
  67         remove_save_link (inode, 0/* not truncate */); /* we can't do anything
  68                                                         * about an error here */
  69     } else {
  70         /* no object items are in the tree */
  71         ;
  72     }
  73 out:
  74     clear_inode (inode); /* note this must go after the journal_end to prevent deadlock */
  75     inode->i_blocks = 0;
  76     reiserfs_write_unlock(inode->i_sb);
  77 }
  78
  79 static void _make_cpu_key (struct cpu_key * key, int version, __u32 dirid, __u32 objectid,
  80                loff_t offset, int type, int length )
  81 {
  82     key->version = version;
  83
  84     key->on_disk_key.k_dir_id = dirid;
  85     key->on_disk_key.k_objectid = objectid;
  86     set_cpu_key_k_offset (key, offset);
  87     set_cpu_key_k_type (key, type);
  88     key->key_length = length;
  89 }
  90
  91
  92 /* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set
  93    offset and type of key */
  94 void make_cpu_key (struct cpu_key * key, struct inode * inode, loff_t offset,
  95               int type, int length )
  96 {
  97   _make_cpu_key (key, get_inode_item_key_version (inode), le32_to_cpu (INODE_PKEY (inode)->k_dir_id),
  98                  le32_to_cpu (INODE_PKEY (inode)->k_objectid),
  99                  offset, type, length);
 100 }
 101
 102
 103 //
 104 // when key is 0, do not set version and short key
 105 //
 106 inline void make_le_item_head (struct item_head * ih, const struct cpu_key * key,
 107                                int version,
 108                                loff_t offset, int type, int length,
 109                                int entry_count/*or ih_free_space*/)
 110 {
 111     if (key) {
 112         ih->ih_key.k_dir_id = cpu_to_le32 (key->on_disk_key.k_dir_id);
 113         ih->ih_key.k_objectid = cpu_to_le32 (key->on_disk_key.k_objectid);
 114     }
 115     put_ih_version( ih, version );
 116     set_le_ih_k_offset (ih, offset);
 117     set_le_ih_k_type (ih, type);
 118     put_ih_item_len( ih, length );
 119     /*    set_ih_free_space (ih, 0);*/
 120     // for directory items it is entry count, for directs and stat
 121     // datas - 0xffff, for indirects - 0
 122     put_ih_entry_count( ih, entry_count );
 123 }
 124
 125 //
 126 // FIXME: we might cache recently accessed indirect item
 127
 128 // Ugh.  Not too eager for that....
 129 //  I cut the code until such time as I see a convincing argument (benchmark).
 130 // I don't want a bloated inode struct..., and I don't like code complexity....
 131
 132 /* cutting the code is fine, since it really isn't in use yet and is easy
 133 ** to add back in.  But, Vladimir has a really good idea here.  Think
 134 ** about what happens for reading a file.  For each page,
 135 ** The VFS layer calls reiserfs_readpage, who searches the tree to find
 136 ** an indirect item.  This indirect item has X number of pointers, where
 137 ** X is a big number if we've done the block allocation right.  But,
 138 ** we only use one or two of these pointers during each call to readpage,
 139 ** needlessly researching again later on.
 140 **
 141 ** The size of the cache could be dynamic based on the size of the file.
 142 **
 143 ** I'd also like to see us cache the location the stat data item, since
 144 ** we are needlessly researching for that frequently.
 145 **
 146 ** --chris
 147 */
 148
 149 /* If this page has a file tail in it, and
 150 ** it was read in by get_block_create_0, the page data is valid,
 151 ** but tail is still sitting in a direct item, and we can't write to
 152 ** it.  So, look through this page, and check all the mapped buffers
 153 ** to make sure they have valid block numbers.  Any that don't need
 154 ** to be unmapped, so that block_prepare_write will correctly call
 155 ** reiserfs_get_block to convert the tail into an unformatted node
 156 */
 157 static inline void fix_tail_page_for_writing(struct page *page) {
 158     struct buffer_head *head, *next, *bh ;
 159
 160     if (page && page_has_buffers(page)) {
 161         head = page_buffers(page) ;
 162         bh = head ;
 163         do {
 164             next = bh->b_this_page ;
 165             if (buffer_mapped(bh) && bh->b_blocknr == 0) {
 166                 reiserfs_unmap_buffer(bh) ;
 167             }
 168             bh = next ;
 169         } while (bh != head) ;
 170     }
 171 }
 172
 173 /* reiserfs_get_block does not need to allocate a block only if it has been
 174    done already or non-hole position has been found in the indirect item */
 175 static inline int allocation_needed (int retval, b_blocknr_t allocated,
 176                                      struct item_head * ih,
 177                                      __le32 * item, int pos_in_item)
 178 {
 179   if (allocated)
 180          return 0;
 181   if (retval == POSITION_FOUND && is_indirect_le_ih (ih) &&
 182       get_block_num(item, pos_in_item))
 183          return 0;
 184   return 1;
 185 }
 186
 187 static inline int indirect_item_found (int retval, struct item_head * ih)
 188 {
 189   return (retval == POSITION_FOUND) && is_indirect_le_ih (ih);
 190 }
 191
 192
 193 static inline void set_block_dev_mapped (struct buffer_head * bh,
 194                                          b_blocknr_t block, struct inode * inode)
 195 {
 196         map_bh(bh, inode->i_sb, block);
 197 }
 198
 199
 200 //
 201 // files which were created in the earlier version can not be longer,
 202 // than 2 gb
 203 //
 204 static int file_capable (struct inode * inode, long block)
 205 {
 206     if (get_inode_item_key_version (inode) != KEY_FORMAT_3_5 || // it is new file.
 207         block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb
 208         return 1;
 209
 210     return 0;
 211 }
 212
 213 /*static*/ int restart_transaction(struct reiserfs_transaction_handle *th,
 214                                 struct inode *inode, struct path *path) {
 215   struct super_block *s = th->t_super ;
 216   int len = th->t_blocks_allocated ;
 217   int err;
 218
 219   BUG_ON (!th->t_trans_id);
 220   BUG_ON (!th->t_refcount);
 221
 222   /* we cannot restart while nested */
 223   if (th->t_refcount > 1) {
 224       return 0  ;
 225   }
 226   pathrelse(path) ;
 227   reiserfs_update_sd(th, inode) ;
 228   err = journal_end(th, s, len) ;
 229   if (!err) {
 230       err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6) ;
 231       if (!err)
 232         reiserfs_update_inode_transaction(inode) ;
 233   }
 234   return err;
 235 }
 236
 237 // it is called by get_block when create == 0. Returns block number
 238 // for 'block'-th logical block of file. When it hits direct item it
 239 // returns 0 (being called from bmap) or read direct item into piece
 240 // of page (bh_result)
 241
 242 // Please improve the english/clarity in the comment above, as it is
 243 // hard to understand.
 244
 245 static int _get_block_create_0 (struct inode * inode, long block,
 246                                  struct buffer_head * bh_result,
 247                                  int args)
 248 {
 249     INITIALIZE_PATH (path);
 250     struct cpu_key key;
 251     struct buffer_head * bh;
 252     struct item_head * ih, tmp_ih;
 253     int fs_gen ;
 254     int blocknr;
 255     char * p = NULL;
 256     int chars;
 257     int ret ;
 258     int done = 0 ;
 259     unsigned long offset ;
 260
 261     // prepare the key to look for the 'block'-th block of file
 262     make_cpu_key (&key, inode,
 263                   (loff_t)block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 3);
 264
 265 research:
 266     if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND) {
 267         pathrelse (&path);
 268         if (p)
 269             kunmap(bh_result->b_page) ;
 270         // We do not return -ENOENT if there is a hole but page is uptodate, because it means
 271         // That there is some MMAPED data associated with it that is yet to be written to disk.
 272         if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) {
 273             return -ENOENT ;
 274         }
 275         return 0 ;
 276     }
 277
 278     //
 279     bh = get_last_bh (&path);
 280     ih = get_ih (&path);
 281     if (is_indirect_le_ih (ih)) {
 282         __le32 * ind_item = (__le32 *)B_I_PITEM (bh, ih);
 283
 284         /* FIXME: here we could cache indirect item or part of it in
 285            the inode to avoid search_by_key in case of subsequent
 286            access to file */
 287         blocknr = get_block_num(ind_item, path.pos_in_item) ;
 288         ret = 0 ;
 289         if (blocknr) {
 290             map_bh(bh_result, inode->i_sb, blocknr);
 291             if (path.pos_in_item == ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
 292                 set_buffer_boundary(bh_result);
 293             }
 294         } else
 295             // We do not return -ENOENT if there is a hole but page is uptodate, because it means
 296             // That there is some MMAPED data associated with it that is yet to  be written to disk.
 297             if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) {
 298             ret = -ENOENT ;
 299             }
 300
 301         pathrelse (&path);
 302         if (p)
 303             kunmap(bh_result->b_page) ;
 304         return ret ;
 305     }
 306
 307     // requested data are in direct item(s)
 308     if (!(args & GET_BLOCK_READ_DIRECT)) {
 309         // we are called by bmap. FIXME: we can not map block of file
 310         // when it is stored in direct item(s)
 311         pathrelse (&path);
 312         if (p)
 313             kunmap(bh_result->b_page) ;
 314         return -ENOENT;
 315     }
 316
 317     /* if we've got a direct item, and the buffer or page was uptodate,
 318     ** we don't want to pull data off disk again.  skip to the
 319     ** end, where we map the buffer and return
 320     */
 321     if (buffer_uptodate(bh_result)) {
 322         goto finished ;
 323     } else
 324         /*
 325         ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
 326         ** pages without any buffers.  If the page is up to date, we don't want
 327         ** read old data off disk.  Set the up to date bit on the buffer instead
 328         ** and jump to the end
 329         */
 330             if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
 331                 set_buffer_uptodate(bh_result);
 332                 goto finished ;
 333     }
 334
 335     // read file tail into part of page
 336     offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1) ;
 337     fs_gen = get_generation(inode->i_sb) ;
 338     copy_item_head (&tmp_ih, ih);
 339
 340     /* we only want to kmap if we are reading the tail into the page.
 341     ** this is not the common case, so we don't kmap until we are
 342     ** sure we need to.  But, this means the item might move if
 343     ** kmap schedules
 344     */
 345     if (!p) {
 346         p = (char *)kmap(bh_result->b_page) ;
 347         if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
 348             goto research;
 349         }
 350     }
 351     p += offset ;
 352     memset (p, 0, inode->i_sb->s_blocksize);
 353     do {
 354         if (!is_direct_le_ih (ih)) {
 355             BUG ();
 356         }
 357         /* make sure we don't read more bytes than actually exist in
 358         ** the file.  This can happen in odd cases where i_size isn't
 359         ** correct, and when direct item padding results in a few
 360         ** extra bytes at the end of the direct item
 361         */
 362         if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
 363             break ;
 364         if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
 365             chars = inode->i_size - (le_ih_k_offset(ih) - 1) - path.pos_in_item;
 366             done = 1 ;
 367         } else {
 368             chars = ih_item_len(ih) - path.pos_in_item;
 369         }
 370         memcpy (p, B_I_PITEM (bh, ih) + path.pos_in_item, chars);
 371
 372         if (done)
 373             break ;
 374
 375         p += chars;
 376
 377         if (PATH_LAST_POSITION (&path) != (B_NR_ITEMS (bh) - 1))
 378             // we done, if read direct item is not the last item of
 379             // node FIXME: we could try to check right delimiting key
 380             // to see whether direct item continues in the right
 381             // neighbor or rely on i_size
 382             break;
 383
 384         // update key to look for the next piece
 385         set_cpu_key_k_offset (&key, cpu_key_k_offset (&key) + chars);
 386         if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND)
 387             // we read something from tail, even if now we got IO_ERROR
 388             break;
 389         bh = get_last_bh (&path);
 390         ih = get_ih (&path);
 391     } while (1);
 392
 393     flush_dcache_page(bh_result->b_page) ;
 394     kunmap(bh_result->b_page) ;
 395
 396 finished:
 397     pathrelse (&path);
 398     /* this buffer has valid data, but isn't valid for io.  mapping it to
 399      * block #0 tells the rest of reiserfs it just has a tail in it
 400      */
 401     map_bh(bh_result, inode->i_sb, 0);
 402     set_buffer_uptodate (bh_result);
 403     return 0;
 404 }
 405
 406
 407 // this is called to create file map. So, _get_block_create_0 will not
 408 // read direct item
 409 static int reiserfs_bmap (struct inode * inode, sector_t block,
 410                           struct buffer_head * bh_result, int create)
 411 {
 412     if (!file_capable (inode, block))
 413         return -EFBIG;
 414
 415     reiserfs_write_lock(inode->i_sb);
 416     /* do not read the direct item */
 417     _get_block_create_0 (inode, block, bh_result, 0) ;
 418     reiserfs_write_unlock(inode->i_sb);
 419     return 0;
 420 }
 421
 422 /* special version of get_block that is only used by grab_tail_page right
 423 ** now.  It is sent to block_prepare_write, and when you try to get a
 424 ** block past the end of the file (or a block from a hole) it returns
 425 ** -ENOENT instead of a valid buffer.  block_prepare_write expects to
 426 ** be able to do i/o on the buffers returned, unless an error value
 427 ** is also returned.
 428 **
 429 ** So, this allows block_prepare_write to be used for reading a single block
 430 ** in a page.  Where it does not produce a valid page for holes, or past the
 431 ** end of the file.  This turns out to be exactly what we need for reading
 432 ** tails for conversion.
 433 **
 434 ** The point of the wrapper is forcing a certain value for create, even
 435 ** though the VFS layer is calling this function with create==1.  If you
 436 ** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
 437 ** don't use this function.
 438 */
 439 static int reiserfs_get_block_create_0 (struct inode * inode, sector_t block,
 440                         struct buffer_head * bh_result, int create) {
 441     return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE) ;
 442 }
 443
 444 /* This is special helper for reiserfs_get_block in case we are executing
 445    direct_IO request. */
 446 static int reiserfs_get_blocks_direct_io(struct inode *inode,
 447                                          sector_t iblock,
 448                                          unsigned long max_blocks,
 449                                          struct buffer_head *bh_result,
 450                                          int create)
 451 {
 452     int ret ;
 453
 454     bh_result->b_page = NULL;
 455
 456     /* We set the b_size before reiserfs_get_block call since it is
 457        referenced in convert_tail_for_hole() that may be called from
 458        reiserfs_get_block() */
 459     bh_result->b_size = (1 << inode->i_blkbits);
 460
 461     ret = reiserfs_get_block(inode, iblock, bh_result,
 462                              create | GET_BLOCK_NO_DANGLE) ;
 463     if (ret)
 464         goto out;
 465
 466     /* don't allow direct io onto tail pages */
 467     if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
 468         /* make sure future calls to the direct io funcs for this offset
 469         ** in the file fail by unmapping the buffer
 470         */
 471         clear_buffer_mapped(bh_result);
 472         ret = -EINVAL ;
 473     }
 474     /* Possible unpacked tail. Flush the data before pages have
 475        disappeared */
 476     if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
 477         int err;
 478         lock_kernel();
 479         err = reiserfs_commit_for_inode(inode);
 480         REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
 481         unlock_kernel();
 482         if (err < 0)
 483             ret = err;
 484     }
 485 out:
 486     return ret ;
 487 }
 488
 489
 490 /*
 491 ** helper function for when reiserfs_get_block is called for a hole
 492 ** but the file tail is still in a direct item
 493 ** bh_result is the buffer head for the hole
 494 ** tail_offset is the offset of the start of the tail in the file
 495 **
 496 ** This calls prepare_write, which will start a new transaction
 497 ** you should not be in a transaction, or have any paths held when you
 498 ** call this.
 499 */
 500 static int convert_tail_for_hole(struct inode *inode,
 501                                  struct buffer_head *bh_result,
 502                                  loff_t tail_offset) {
 503     unsigned long index ;
 504     unsigned long tail_end ;
 505     unsigned long tail_start ;
 506     struct page * tail_page ;
 507     struct page * hole_page = bh_result->b_page ;
 508     int retval = 0 ;
 509
 510     if ((tail_offset & (bh_result->b_size - 1)) != 1)
 511         return -EIO ;
 512
 513     /* always try to read until the end of the block */
 514     tail_start = tail_offset & (PAGE_CACHE_SIZE - 1) ;
 515     tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ;
 516
 517     index = tail_offset >> PAGE_CACHE_SHIFT ;
 518     /* hole_page can be zero in case of direct_io, we are sure
 519        that we cannot get here if we write with O_DIRECT into
 520        tail page */
 521     if (!hole_page || index != hole_page->index) {
 522         tail_page = grab_cache_page(inode->i_mapping, index) ;
 523         retval = -ENOMEM;
 524         if (!tail_page) {
 525             goto out ;
 526         }
 527     } else {
 528         tail_page = hole_page ;
 529     }
 530
 531     /* we don't have to make sure the conversion did not happen while
 532     ** we were locking the page because anyone that could convert
 533     ** must first take i_sem.
 534     **
 535     ** We must fix the tail page for writing because it might have buffers
 536     ** that are mapped, but have a block number of 0.  This indicates tail
 537     ** data that has been read directly into the page, and block_prepare_write
 538     ** won't trigger a get_block in this case.
 539     */
 540     fix_tail_page_for_writing(tail_page) ;
 541     retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
 542     if (retval)
 543         goto unlock ;
 544
 545     /* tail conversion might change the data in the page */
 546     flush_dcache_page(tail_page) ;
 547
 548     retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end) ;
 549
 550 unlock:
 551     if (tail_page != hole_page) {
 552         unlock_page(tail_page) ;
 553         page_cache_release(tail_page) ;
 554     }
 555 out:
 556     return retval ;
 557 }
 558
 559 static inline int _allocate_block(struct reiserfs_transaction_handle *th,
 560                            long block,
 561                            struct inode *inode,
 562                            b_blocknr_t *allocated_block_nr,
 563                            struct path * path,
 564                            int flags) {
 565     BUG_ON (!th->t_trans_id);
 566
 567 #ifdef REISERFS_PREALLOCATE
 568     if (!(flags & GET_BLOCK_NO_ISEM)) {
 569         return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, path, block);
 570     }
 571 #endif
 572     return reiserfs_new_unf_blocknrs (th, inode, allocated_block_nr, path, block);
 573 }
 574
 575 int reiserfs_get_block (struct inode * inode, sector_t block,
 576                         struct buffer_head * bh_result, int create)
 577 {
 578     int repeat, retval = 0;
 579     b_blocknr_t allocated_block_nr = 0;// b_blocknr_t is (unsigned) 32 bit int
 580     INITIALIZE_PATH(path);
 581     int pos_in_item;
 582     struct cpu_key key;
 583     struct buffer_head * bh, * unbh = NULL;
 584     struct item_head * ih, tmp_ih;
 585     __le32 * item;
 586     int done;
 587     int fs_gen;
 588     struct reiserfs_transaction_handle *th = NULL;
 589     /* space reserved in transaction batch:
 590         . 3 balancings in direct->indirect conversion
 591         . 1 block involved into reiserfs_update_sd()
 592        XXX in practically impossible worst case direct2indirect()
 593        can incur (much) more than 3 balancings.
 594        quota update for user, group */
 595     int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS;
 596     int version;
 597     int dangle = 1;
 598     loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ;
 599
 600                                 /* bad.... */
 601     reiserfs_write_lock(inode->i_sb);
 602     version = get_inode_item_key_version (inode);
 603
 604     if (block < 0) {
 605         reiserfs_write_unlock(inode->i_sb);
 606         return -EIO;
 607     }
 608
 609     if (!file_capable (inode, block)) {
 610         reiserfs_write_unlock(inode->i_sb);
 611         return -EFBIG;
 612     }
 613
 614     /* if !create, we aren't changing the FS, so we don't need to
 615     ** log anything, so we don't need to start a transaction
 616     */
 617     if (!(create & GET_BLOCK_CREATE)) {
 618         int ret ;
 619         /* find number of block-th logical block of the file */
 620         ret = _get_block_create_0 (inode, block, bh_result,
 621                                    create | GET_BLOCK_READ_DIRECT) ;
 622         reiserfs_write_unlock(inode->i_sb);
 623         return ret;
 624     }
 625     /*
 626      * if we're already in a transaction, make sure to close
 627      * any new transactions we start in this func
 628      */
 629     if ((create & GET_BLOCK_NO_DANGLE) ||
 630         reiserfs_transaction_running(inode->i_sb))
 631         dangle = 0;
 632
 633     /* If file is of such a size, that it might have a tail and tails are enabled
 634     ** we should mark it as possibly needing tail packing on close
 635     */
 636     if ( (have_large_tails (inode->i_sb) && inode->i_size < i_block_size (inode)*4) ||
 637          (have_small_tails (inode->i_sb) && inode->i_size < i_block_size(inode)) )
 638         REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
 639
 640     /* set the key of the first byte in the 'block'-th block of file */
 641     make_cpu_key (&key, inode, new_offset,
 642                   TYPE_ANY, 3/*key length*/);
 643     if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
 644 start_trans:
 645         th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
 646         if (!th) {
 647             retval = -ENOMEM;
 648             goto failure;
 649         }
 650         reiserfs_update_inode_transaction(inode) ;
 651     }
 652  research:
 653
 654     retval = search_for_position_by_key (inode->i_sb, &key, &path);
 655     if (retval == IO_ERROR) {
 656         retval = -EIO;
 657         goto failure;
 658     }
 659
 660     bh = get_last_bh (&path);
 661     ih = get_ih (&path);
 662     item = get_item (&path);
 663     pos_in_item = path.pos_in_item;
 664
 665     fs_gen = get_generation (inode->i_sb);
 666     copy_item_head (&tmp_ih, ih);
 667
 668     if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) {
 669         /* we have to allocate block for the unformatted node */
 670         if (!th) {
 671             pathrelse(&path) ;
 672             goto start_trans;
 673         }
 674
 675         repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create);
 676
 677         if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
 678             /* restart the transaction to give the journal a chance to free
 679             ** some blocks.  releases the path, so we have to go back to
 680             ** research if we succeed on the second try
 681             */
 682             SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
 683             retval = restart_transaction(th, inode, &path) ;
 684             if (retval)
 685                 goto failure;
 686             repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create);
 687
 688             if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
 689                 goto research ;
 690             }
 691             if (repeat == QUOTA_EXCEEDED)
 692                 retval = -EDQUOT;
 693             else
 694                 retval = -ENOSPC;
 695             goto failure;
 696         }
 697
 698         if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
 699             goto research;
 700         }
 701     }
 702
 703     if (indirect_item_found (retval, ih)) {
 704         b_blocknr_t unfm_ptr;
 705         /* 'block'-th block is in the file already (there is
 706            corresponding cell in some indirect item). But it may be
 707            zero unformatted node pointer (hole) */
 708         unfm_ptr = get_block_num (item, pos_in_item);
 709         if (unfm_ptr == 0) {
 710             /* use allocated block to plug the hole */
 711             reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
 712             if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
 713                 reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
 714                 goto research;
 715             }
 716             set_buffer_new(bh_result);
 717             if (buffer_dirty(bh_result) && reiserfs_data_ordered(inode->i_sb))
 718                 reiserfs_add_ordered_list(inode, bh_result);
 719             put_block_num(item, pos_in_item, allocated_block_nr) ;
 720             unfm_ptr = allocated_block_nr;
 721             journal_mark_dirty (th, inode->i_sb, bh);
 722             reiserfs_update_sd(th, inode) ;
 723         }
 724         set_block_dev_mapped(bh_result, unfm_ptr, inode);
 725         pathrelse (&path);
 726         retval = 0;
 727         if (!dangle && th)
 728             retval = reiserfs_end_persistent_transaction(th);
 729
 730         reiserfs_write_unlock(inode->i_sb);
 731
 732         /* the item was found, so new blocks were not added to the file
 733         ** there is no need to make sure the inode is updated with this
 734         ** transaction
 735         */
 736         return retval;
 737     }
 738
 739     if (!th) {
 740         pathrelse(&path) ;
 741         goto start_trans;
 742     }
 743
 744     /* desired position is not found or is in the direct item. We have
 745        to append file with holes up to 'block'-th block converting
 746        direct items to indirect one if necessary */
 747     done = 0;
 748     do {
 749         if (is_statdata_le_ih (ih)) {
 750             __le32 unp = 0;
 751             struct cpu_key tmp_key;
 752
 753             /* indirect item has to be inserted */
 754             make_le_item_head (&tmp_ih, &key, version, 1, TYPE_INDIRECT,
 755                                UNFM_P_SIZE, 0/* free_space */);
 756
 757             if (cpu_key_k_offset (&key) == 1) {
 758                 /* we are going to add 'block'-th block to the file. Use
 759                    allocated block for that */
 760                 unp = cpu_to_le32 (allocated_block_nr);
 761                 set_block_dev_mapped (bh_result, allocated_block_nr, inode);
 762                 set_buffer_new(bh_result);
 763                 done = 1;
 764             }
 765             tmp_key = key; // ;)
 766             set_cpu_key_k_offset (&tmp_key, 1);
 767             PATH_LAST_POSITION(&path) ++;
 768
 769             retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, inode, (char *)&unp);
 770             if (retval) {
 771                 reiserfs_free_block (th, inode, allocated_block_nr, 1);
 772                 goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
 773             }
 774             //mark_tail_converted (inode);
 775         } else if (is_direct_le_ih (ih)) {
 776             /* direct item has to be converted */
 777             loff_t tail_offset;
 778
 779             tail_offset = ((le_ih_k_offset (ih) - 1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
 780             if (tail_offset == cpu_key_k_offset (&key)) {
 781                 /* direct item we just found fits into block we have
 782                    to map. Convert it into unformatted node: use
 783                    bh_result for the conversion */
 784                 set_block_dev_mapped (bh_result, allocated_block_nr, inode);
 785                 unbh = bh_result;
 786                 done = 1;
 787             } else {
 788                 /* we have to padd file tail stored in direct item(s)
 789                    up to block size and convert it to unformatted
 790                    node. FIXME: this should also get into page cache */
 791
 792                 pathrelse(&path) ;
 793                 /*
 794                  * ugly, but we can only end the transaction if
 795                  * we aren't nested
 796                  */
 797                 BUG_ON (!th->t_refcount);
 798                 if (th->t_refcount == 1) {
 799                     retval = reiserfs_end_persistent_transaction(th);
 800                     th = NULL;
 801                     if (retval)
 802                         goto failure;
 803                 }
 804
 805                 retval = convert_tail_for_hole(inode, bh_result, tail_offset) ;
 806                 if (retval) {
 807                     if ( retval != -ENOSPC )
 808                         reiserfs_warning (inode->i_sb, "clm-6004: convert tail failed inode %lu, error %d", inode->i_ino, retval) ;
 809                     if (allocated_block_nr) {
 810                         /* the bitmap, the super, and the stat data == 3 */
 811                         if (!th)
 812                             th = reiserfs_persistent_transaction(inode->i_sb,3);
 813                         if (th)
 814                             reiserfs_free_block (th,inode,allocated_block_nr,1);
 815                     }
 816                     goto failure ;
 817                 }
 818                 goto research ;
 819             }
 820             retval = direct2indirect (th, inode, &path, unbh, tail_offset);
 821             if (retval) {
 822                 reiserfs_unmap_buffer(unbh);
 823                 reiserfs_free_block (th, inode, allocated_block_nr, 1);
 824                 goto failure;
 825             }
 826             /* it is important the set_buffer_uptodate is done after
 827             ** the direct2indirect.  The buffer might contain valid
 828             ** data newer than the data on disk (read by readpage, changed,
 829             ** and then sent here by writepage).  direct2indirect needs
 830             ** to know if unbh was already up to date, so it can decide
 831             ** if the data in unbh needs to be replaced with data from
 832             ** the disk
 833             */
 834             set_buffer_uptodate (unbh);
 835
 836             /* unbh->b_page == NULL in case of DIRECT_IO request, this means
 837                buffer will disappear shortly, so it should not be added to
 838              */
 839             if ( unbh->b_page ) {
 840                 /* we've converted the tail, so we must
 841                 ** flush unbh before the transaction commits
 842                 */
 843                 reiserfs_add_tail_list(inode, unbh) ;
 844
 845                 /* mark it dirty now to prevent commit_write from adding
 846                 ** this buffer to the inode's dirty buffer list
 847                 */
 848                 /*
 849                  * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
 850                  * It's still atomic, but it sets the page dirty too,
 851                  * which makes it eligible for writeback at any time by the
 852                  * VM (which was also the case with __mark_buffer_dirty())
 853                  */
 854                 mark_buffer_dirty(unbh) ;
 855             }
 856         } else {
 857             /* append indirect item with holes if needed, when appending
 858                pointer to 'block'-th block use block, which is already
 859                allocated */
 860             struct cpu_key tmp_key;
 861             unp_t unf_single=0; // We use this in case we need to allocate only
 862                                 // one block which is a fastpath
 863             unp_t *un;
 864             __u64 max_to_insert=MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE;
 865             __u64 blocks_needed;
 866
 867             RFALSE( pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
 868                     "vs-804: invalid position for append");
 869             /* indirect item has to be appended, set up key of that position */
 870             make_cpu_key (&tmp_key, inode,
 871                           le_key_k_offset (version, &(ih->ih_key)) + op_bytes_number (ih, inode->i_sb->s_blocksize),
 872                           //pos_in_item * inode->i_sb->s_blocksize,
 873                           TYPE_INDIRECT, 3);// key type is unimportant
 874
 875             blocks_needed = 1 + ((cpu_key_k_offset (&key) - cpu_key_k_offset (&tmp_key)) >> inode->i_sb->s_blocksize_bits);
 876             RFALSE( blocks_needed < 0, "green-805: invalid offset");
 877
 878             if ( blocks_needed == 1 ) {
 879                 un = &unf_single;
 880             } else {
 881                 un=kmalloc( min(blocks_needed,max_to_insert)*UNFM_P_SIZE,
 882                             GFP_ATOMIC); // We need to avoid scheduling.
 883                 if ( !un) {
 884                     un = &unf_single;
 885                     blocks_needed = 1;
 886                     max_to_insert = 0;
 887                 } else
 888                     memset(un, 0, UNFM_P_SIZE * min(blocks_needed,max_to_insert));
 889             }
 890             if ( blocks_needed <= max_to_insert) {
 891                 /* we are going to add target block to the file. Use allocated
 892                    block for that */
 893                 un[blocks_needed-1] = cpu_to_le32 (allocated_block_nr);
 894                 set_block_dev_mapped (bh_result, allocated_block_nr, inode);
 895                 set_buffer_new(bh_result);
 896                 done = 1;
 897             } else {
 898                 /* paste hole to the indirect item */
 899                 /* If kmalloc failed, max_to_insert becomes zero and it means we
 900                    only have space for one block */
 901                 blocks_needed=max_to_insert?max_to_insert:1;
 902             }
 903             retval = reiserfs_paste_into_item (th, &path, &tmp_key, inode, (char *)un, UNFM_P_SIZE * blocks_needed);
 904
 905             if (blocks_needed != 1)
 906                 kfree(un);
 907
 908             if (retval) {
 909                 reiserfs_free_block (th, inode, allocated_block_nr, 1);
 910                 goto failure;
 911             }
 912             if (!done) {
 913                 /* We need to mark new file size in case this function will be
 914                    interrupted/aborted later on. And we may do this only for
 915                    holes. */
 916                 inode->i_size += inode->i_sb->s_blocksize * blocks_needed;
 917             }
 918         }
 919
 920         if (done == 1)
 921             break;
 922
 923         /* this loop could log more blocks than we had originally asked
 924         ** for.  So, we have to allow the transaction to end if it is
 925         ** too big or too full.  Update the inode so things are
 926         ** consistent if we crash before the function returns
 927         **
 928         ** release the path so that anybody waiting on the path before
 929         ** ending their transaction will be able to continue.
 930         */
 931         if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
 932           retval = restart_transaction(th, inode, &path) ;
 933           if (retval)
 934             goto failure;
 935         }
 936         /* inserting indirect pointers for a hole can take a
 937         ** long time.  reschedule if needed
 938         */
 939         cond_resched();
 940
 941         retval = search_for_position_by_key (inode->i_sb, &key, &path);
 942         if (retval == IO_ERROR) {
 943             retval = -EIO;
 944             goto failure;
 945         }
 946         if (retval == POSITION_FOUND) {
 947             reiserfs_warning (inode->i_sb, "vs-825: reiserfs_get_block: "
 948                               "%K should not be found", &key);
 949             retval = -EEXIST;
 950             if (allocated_block_nr)
 951                 reiserfs_free_block (th, inode, allocated_block_nr, 1);
 952             pathrelse(&path) ;
 953             goto failure;
 954         }
 955         bh = get_last_bh (&path);
 956         ih = get_ih (&path);
 957         item = get_item (&path);
 958         pos_in_item = path.pos_in_item;
 959     } while (1);
 960
 961
 962     retval = 0;
 963
 964  failure:
 965     if (th && (!dangle || (retval && !th->t_trans_id))) {
 966         int err;
 967         if (th->t_trans_id)
 968             reiserfs_update_sd(th, inode);
 969         err = reiserfs_end_persistent_transaction(th);
 970         if (err)
 971             retval = err;
 972     }
 973
 974     reiserfs_write_unlock(inode->i_sb);
 975     reiserfs_check_path(&path) ;
 976     return retval;
 977 }
 978
 979 static int
 980 reiserfs_readpages(struct file *file, struct address_space *mapping,
 981                 struct list_head *pages, unsigned nr_pages)
 982 {
 983     return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
 984 }
 985
 986 /* Compute real number of used bytes by file
 987  * Following three functions can go away when we'll have enough space in stat item
 988  */
 989 static int real_space_diff(struct inode *inode, int sd_size)
 990 {
 991     int bytes;
 992     loff_t blocksize = inode->i_sb->s_blocksize ;
 993
 994     if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
 995         return sd_size ;
 996
 997     /* End of file is also in full block with indirect reference, so round
 998     ** up to the next block.
 999     **
1000     ** there is just no way to know if the tail is actually packed
1001     ** on the file, so we have to assume it isn't.  When we pack the
1002     ** tail, we add 4 bytes to pretend there really is an unformatted
1003     ** node pointer
1004     */
1005     bytes = ((inode->i_size + (blocksize-1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + sd_size;
1006     return bytes ;
1007 }
1008
1009 static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
1010                                         int sd_size)
1011 {
1012     if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1013         return inode->i_size + (loff_t)(real_space_diff(inode, sd_size)) ;
1014     }
1015     return ((loff_t)real_space_diff(inode, sd_size)) + (((loff_t)blocks) << 9);
1016 }
1017
1018 /* Compute number of blocks used by file in ReiserFS counting */
1019 static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
1020 {
1021     loff_t bytes = inode_get_bytes(inode) ;
1022     loff_t real_space = real_space_diff(inode, sd_size) ;
1023
1024     /* keeps fsck and non-quota versions of reiserfs happy */
1025     if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1026         bytes += (loff_t)511 ;
1027     }
1028
1029     /* files from before the quota patch might i_blocks such that
1030     ** bytes < real_space.  Deal with that here to prevent it from
1031     ** going negative.
1032     */
1033     if (bytes < real_space)
1034         return 0 ;
1035     return (bytes - real_space) >> 9;
1036 }
1037
1038 //
1039 // BAD: new directories have stat data of new type and all other items
1040 // of old type. Version stored in the inode says about body items, so
1041 // in update_stat_data we can not rely on inode, but have to check
1042 // item version directly
1043 //
1044
1045 // called by read_locked_inode
1046 static void init_inode (struct inode * inode, struct path * path)
1047 {
1048     struct buffer_head * bh;
1049     struct item_head * ih;
1050     __u32 rdev;
1051     uid_t uid;
1052     gid_t gid;
1053     //int version = ITEM_VERSION_1;
1054
1055     bh = PATH_PLAST_BUFFER (path);
1056     ih = PATH_PITEM_HEAD (path);
1057
1058
1059     copy_key (INODE_PKEY (inode), &(ih->ih_key));
1060     inode->i_blksize = reiserfs_default_io_size;
1061
1062     INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list ));
1063     REISERFS_I(inode)->i_flags = 0;
1064     REISERFS_I(inode)->i_prealloc_block = 0;
1065     REISERFS_I(inode)->i_prealloc_count = 0;
1066     REISERFS_I(inode)->i_trans_id = 0;
1067     REISERFS_I(inode)->i_jl = NULL;
1068     REISERFS_I(inode)->i_acl_access = NULL;
1069     REISERFS_I(inode)->i_acl_default = NULL;
1070     init_rwsem (&REISERFS_I(inode)->xattr_sem);
1071
1072     if (stat_data_v1 (ih)) {
1073         struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih);
1074         unsigned long blocks;
1075
1076         uid = sd_v1_uid(sd);
1077         gid = sd_v1_gid(sd);
1078
1079         set_inode_item_key_version (inode, KEY_FORMAT_3_5);
1080         set_inode_sd_version (inode, STAT_DATA_V1);
1081         inode->i_mode  = sd_v1_mode(sd);
1082         inode->i_nlink = sd_v1_nlink(sd);
1083         inode->i_size  = sd_v1_size(sd);
1084         inode->i_atime.tv_sec = sd_v1_atime(sd);
1085         inode->i_mtime.tv_sec = sd_v1_mtime(sd);
1086         inode->i_ctime.tv_sec = sd_v1_ctime(sd);
1087         inode->i_atime.tv_nsec = 0;
1088         inode->i_ctime.tv_nsec = 0;
1089         inode->i_mtime.tv_nsec = 0;
1090
1091         inode->i_blocks = sd_v1_blocks(sd);
1092         inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id);
1093         blocks = (inode->i_size + 511) >> 9;
1094         blocks = _ROUND_UP (blocks, inode->i_sb->s_blocksize >> 9);
1095         if (inode->i_blocks > blocks) {
1096             // there was a bug in <=3.5.23 when i_blocks could take negative
1097             // values. Starting from 3.5.17 this value could even be stored in
1098             // stat data. For such files we set i_blocks based on file
1099             // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be
1100             // only updated if file's inode will ever change
1101             inode->i_blocks = blocks;
1102         }
1103
1104         rdev = sd_v1_rdev(sd);
1105         REISERFS_I(inode)->i_first_direct_byte = sd_v1_first_direct_byte(sd);
1106         /* an early bug in the quota code can give us an odd number for the
1107         ** block count.  This is incorrect, fix it here.
1108         */
1109         if (inode->i_blocks & 1) {
1110             inode->i_blocks++ ;
1111         }
1112         inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
1113                                                   SD_V1_SIZE));
1114         /* nopack is initially zero for v1 objects. For v2 objects,
1115            nopack is initialised from sd_attrs */
1116         REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
1117     } else {
1118         // new stat data found, but object may have old items
1119         // (directories and symlinks)
1120         struct stat_data * sd = (struct stat_data *)B_I_PITEM (bh, ih);
1121
1122         uid    = sd_v2_uid(sd);
1123         gid    = sd_v2_gid(sd);
1124
1125         inode->i_mode   = sd_v2_mode(sd);
1126         inode->i_nlink  = sd_v2_nlink(sd);
1127         inode->i_size   = sd_v2_size(sd);
1128         inode->i_mtime.tv_sec  = sd_v2_mtime(sd);
1129         inode->i_atime.tv_sec = sd_v2_atime(sd);
1130         inode->i_ctime.tv_sec  = sd_v2_ctime(sd);
1131         inode->i_ctime.tv_nsec = 0;
1132         inode->i_mtime.tv_nsec = 0;
1133         inode->i_atime.tv_nsec = 0;
1134         inode->i_blocks = sd_v2_blocks(sd);
1135         rdev            = sd_v2_rdev(sd);
1136         if( S_ISCHR( inode -> i_mode ) || S_ISBLK( inode -> i_mode ) )
1137             inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id);
1138         else
1139             inode->i_generation = sd_v2_generation(sd);
1140
1141         if (S_ISDIR (inode->i_mode) || S_ISLNK (inode->i_mode))
1142             set_inode_item_key_version (inode, KEY_FORMAT_3_5);
1143         else
1144             set_inode_item_key_version (inode, KEY_FORMAT_3_6);
1145         REISERFS_I(inode)->i_first_direct_byte = 0;
1146         set_inode_sd_version (inode, STAT_DATA_V2);
1147         inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
1148                                                   SD_V2_SIZE));
1149         /* read persistent inode attributes from sd and initalise
1150            generic inode flags from them */
1151         REISERFS_I(inode)->i_attrs = sd_v2_attrs( sd );
1152         sd_attrs_to_i_attrs( sd_v2_attrs( sd ), inode );
1153     }
1154     inode->i_uid = INOXID_UID(XID_TAG(inode), uid, gid);
1155     inode->i_gid = INOXID_GID(XID_TAG(inode), uid, gid);
1156     inode->i_xid = INOXID_XID(XID_TAG(inode), uid, gid, 0);
1157
1158     pathrelse (path);
1159     if (S_ISREG (inode->i_mode)) {
1160         inode->i_op = &reiserfs_file_inode_operations;
1161         inode->i_fop = &reiserfs_file_operations;
1162         inode->i_mapping->a_ops = &reiserfs_address_space_operations ;
1163     } else if (S_ISDIR (inode->i_mode)) {
1164         inode->i_op = &reiserfs_dir_inode_operations;
1165         inode->i_fop = &reiserfs_dir_operations;
1166     } else if (S_ISLNK (inode->i_mode)) {
1167         inode->i_op = &reiserfs_symlink_inode_operations;
1168         inode->i_mapping->a_ops = &reiserfs_address_space_operations;
1169     } else {
1170         inode->i_blocks = 0;
1171         inode->i_op = &reiserfs_special_inode_operations;
1172         init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
1173     }
1174 }
1175
1176
1177 // update new stat data with inode fields
1178 static void inode2sd (void * sd, struct inode * inode, loff_t size)
1179 {
1180     struct stat_data * sd_v2 = (struct stat_data *)sd;
1181     uid_t uid = XIDINO_UID(XID_TAG(inode), inode->i_uid, inode->i_xid);
1182     gid_t gid = XIDINO_GID(XID_TAG(inode), inode->i_gid, inode->i_xid);
1183     __u16 flags;
1184
1185     set_sd_v2_uid(sd_v2, uid );
1186     set_sd_v2_gid(sd_v2, gid );
1187     set_sd_v2_mode(sd_v2, inode->i_mode );
1188     set_sd_v2_nlink(sd_v2, inode->i_nlink );
1189     set_sd_v2_size(sd_v2, size );
1190     set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec );
1191     set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec );
1192     set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec );
1193     set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
1194     if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1195         set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
1196     else
1197         set_sd_v2_generation(sd_v2, inode->i_generation);
1198     flags = REISERFS_I(inode)->i_attrs;
1199     i_attrs_to_sd_attrs( inode, &flags );
1200     set_sd_v2_attrs( sd_v2, flags );
1201 }
1202
1203
1204 // used to copy inode's fields to old stat data
1205 static void inode2sd_v1 (void * sd, struct inode * inode, loff_t size)
1206 {
1207     struct stat_data_v1 * sd_v1 = (struct stat_data_v1 *)sd;
1208
1209     set_sd_v1_mode(sd_v1, inode->i_mode );
1210     set_sd_v1_uid(sd_v1, inode->i_uid );
1211     set_sd_v1_gid(sd_v1, inode->i_gid );
1212     set_sd_v1_nlink(sd_v1, inode->i_nlink );
1213     set_sd_v1_size(sd_v1, size );
1214     set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec );
1215     set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec );
1216     set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec );
1217
1218     if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1219         set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
1220     else
1221         set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
1222
1223     // Sigh. i_first_direct_byte is back
1224     set_sd_v1_first_direct_byte(sd_v1, REISERFS_I(inode)->i_first_direct_byte);
1225 }
1226
1227
1228 /* NOTE, you must prepare the buffer head before sending it here,
1229 ** and then log it after the call
1230 */
1231 static void update_stat_data (struct path * path, struct inode * inode,
1232                               loff_t size)
1233 {
1234     struct buffer_head * bh;
1235     struct item_head * ih;
1236
1237     bh = PATH_PLAST_BUFFER (path);
1238     ih = PATH_PITEM_HEAD (path);
1239
1240     if (!is_statdata_le_ih (ih))
1241         reiserfs_panic (inode->i_sb, "vs-13065: update_stat_data: key %k, found item %h",
1242                         INODE_PKEY (inode), ih);
1243
1244     if (stat_data_v1 (ih)) {
1245         // path points to old stat data
1246         inode2sd_v1 (B_I_PITEM (bh, ih), inode, size);
1247     } else {
1248         inode2sd (B_I_PITEM (bh, ih), inode, size);
1249     }
1250
1251     return;
1252 }
1253
1254
1255 void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th,
1256                               struct inode * inode, loff_t size)
1257 {
1258     struct cpu_key key;
1259     INITIALIZE_PATH(path);
1260     struct buffer_head *bh ;
1261     int fs_gen ;
1262     struct item_head *ih, tmp_ih ;
1263     int retval;
1264
1265     BUG_ON (!th->t_trans_id);
1266
1267     make_cpu_key (&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);//key type is unimportant
1268
1269     for(;;) {
1270         int pos;
1271         /* look for the object's stat data */
1272         retval = search_item (inode->i_sb, &key, &path);
1273         if (retval == IO_ERROR) {
1274             reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: "
1275                               "i/o failure occurred trying to update %K stat data",
1276                               &key);
1277             return;
1278         }
1279         if (retval == ITEM_NOT_FOUND) {
1280             pos = PATH_LAST_POSITION (&path);
1281             pathrelse(&path) ;
1282             if (inode->i_nlink == 0) {
1283                 /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found");*/
1284                 return;
1285             }
1286             reiserfs_warning (inode->i_sb, "vs-13060: reiserfs_update_sd: "
1287                               "stat data of object %k (nlink == %d) not found (pos %d)",
1288                               INODE_PKEY (inode), inode->i_nlink, pos);
1289             reiserfs_check_path(&path) ;
1290             return;
1291         }
1292
1293         /* sigh, prepare_for_journal might schedule.  When it schedules the
1294         ** FS might change.  We have to detect that, and loop back to the
1295         ** search if the stat data item has moved
1296         */
1297         bh = get_last_bh(&path) ;
1298         ih = get_ih(&path) ;
1299         copy_item_head (&tmp_ih, ih);
1300         fs_gen = get_generation (inode->i_sb);
1301         reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
1302         if (fs_changed (fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) {
1303             reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
1304             continue ;  /* Stat_data item has been moved after scheduling. */
1305         }
1306         break;
1307     }
1308     update_stat_data (&path, inode, size);
1309     journal_mark_dirty(th, th->t_super, bh) ;
1310     pathrelse (&path);
1311     return;
1312 }
1313
1314 /* reiserfs_read_locked_inode is called to read the inode off disk, and it
1315 ** does a make_bad_inode when things go wrong.  But, we need to make sure
1316 ** and clear the key in the private portion of the inode, otherwise a
1317 ** corresponding iput might try to delete whatever object the inode last
1318 ** represented.
1319 */
1320 static void reiserfs_make_bad_inode(struct inode *inode) {
1321     memset(INODE_PKEY(inode), 0, KEY_SIZE);
1322     make_bad_inode(inode);
1323 }
1324
1325 //
1326 // initially this function was derived from minix or ext2's analog and
1327 // evolved as the prototype did
1328 //
1329
1330 int reiserfs_init_locked_inode (struct inode * inode, void *p)
1331 {
1332     struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p ;
1333     inode->i_ino = args->objectid;
1334     INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
1335     return 0;
1336 }
1337
1338 /* looks for stat data in the tree, and fills up the fields of in-core
1339    inode stat data fields */
1340 void reiserfs_read_locked_inode (struct inode * inode, struct reiserfs_iget_args *args)
1341 {
1342     INITIALIZE_PATH (path_to_sd);
1343     struct cpu_key key;
1344     unsigned long dirino;
1345     int retval;
1346
1347     dirino = args->dirid ;
1348
1349     /* set version 1, version 2 could be used too, because stat data
1350        key is the same in both versions */
1351     key.version = KEY_FORMAT_3_5;
1352     key.on_disk_key.k_dir_id = dirino;
1353     key.on_disk_key.k_objectid = inode->i_ino;
1354     key.on_disk_key.k_offset = 0;
1355     key.on_disk_key.k_type = 0;
1356
1357     /* look for the object's stat data */
1358     retval = search_item (inode->i_sb, &key, &path_to_sd);
1359     if (retval == IO_ERROR) {
1360         reiserfs_warning (inode->i_sb, "vs-13070: reiserfs_read_locked_inode: "
1361                           "i/o failure occurred trying to find stat data of %K",
1362                           &key);
1363         reiserfs_make_bad_inode(inode) ;
1364         return;
1365     }
1366     if (retval != ITEM_FOUND) {
1367         /* a stale NFS handle can trigger this without it being an error */
1368         pathrelse (&path_to_sd);
1369         reiserfs_make_bad_inode(inode) ;
1370         inode->i_nlink = 0;
1371         return;
1372     }
1373
1374     init_inode (inode, &path_to_sd);
1375
1376     /* It is possible that knfsd is trying to access inode of a file
1377        that is being removed from the disk by some other thread. As we
1378        update sd on unlink all that is required is to check for nlink
1379        here. This bug was first found by Sizif when debugging
1380        SquidNG/Butterfly, forgotten, and found again after Philippe
1381        Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
1382
1383        More logical fix would require changes in fs/inode.c:iput() to
1384        remove inode from hash-table _after_ fs cleaned disk stuff up and
1385        in iget() to return NULL if I_FREEING inode is found in
1386        hash-table. */
1387     /* Currently there is one place where it's ok to meet inode with
1388        nlink==0: processing of open-unlinked and half-truncated files
1389        during mount (fs/reiserfs/super.c:finish_unfinished()). */
1390     if( ( inode -> i_nlink == 0 ) &&
1391         ! REISERFS_SB(inode -> i_sb) -> s_is_unlinked_ok ) {
1392             reiserfs_warning (inode->i_sb,
1393                               "vs-13075: reiserfs_read_locked_inode: "
1394                               "dead inode read from disk %K. "
1395                               "This is likely to be race with knfsd. Ignore",
1396                               &key );
1397             reiserfs_make_bad_inode( inode );
1398     }
1399
1400     reiserfs_check_path(&path_to_sd) ; /* init inode should be relsing */
1401
1402 }
1403
1404 /**
1405  * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
1406  *
1407  * @inode:    inode from hash table to check
1408  * @opaque:   "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
1409  *
1410  * This function is called by iget5_locked() to distinguish reiserfs inodes
1411  * having the same inode numbers. Such inodes can only exist due to some
1412  * error condition. One of them should be bad. Inodes with identical
1413  * inode numbers (objectids) are distinguished by parent directory ids.
1414  *
1415  */
1416 int reiserfs_find_actor( struct inode *inode, void *opaque )
1417 {
1418     struct reiserfs_iget_args *args;
1419
1420     args = opaque;
1421     /* args is already in CPU order */
1422     return (inode->i_ino == args->objectid) &&
1423         (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
1424 }
1425
1426 struct inode * reiserfs_iget (struct super_block * s, const struct cpu_key * key)
1427 {
1428     struct inode * inode;
1429     struct reiserfs_iget_args args ;
1430
1431     args.objectid = key->on_disk_key.k_objectid ;
1432     args.dirid = key->on_disk_key.k_dir_id ;
1433     inode = iget5_locked (s, key->on_disk_key.k_objectid,
1434                    reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args));
1435     if (!inode)
1436         return ERR_PTR(-ENOMEM) ;
1437
1438     if (inode->i_state & I_NEW) {
1439         reiserfs_read_locked_inode(inode, &args);
1440         unlock_new_inode(inode);
1441     }
1442
1443     if (comp_short_keys (INODE_PKEY (inode), key) || is_bad_inode (inode)) {
1444         /* either due to i/o error or a stale NFS handle */
1445         iput (inode);
1446         inode = NULL;
1447     }
1448     return inode;
1449 }
1450
1451 struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp)
1452 {
1453     __u32 *data = vobjp;
1454     struct cpu_key key ;
1455     struct dentry *result;
1456     struct inode *inode;
1457
1458     key.on_disk_key.k_objectid = data[0] ;
1459     key.on_disk_key.k_dir_id = data[1] ;
1460     reiserfs_write_lock(sb);
1461     inode = reiserfs_iget(sb, &key) ;
1462     if (inode && !IS_ERR(inode) && data[2] != 0 &&
1463         data[2] != inode->i_generation) {
1464             iput(inode) ;
1465             inode = NULL ;
1466     }
1467     reiserfs_write_unlock(sb);
1468     if (!inode)
1469             inode = ERR_PTR(-ESTALE);
1470     if (IS_ERR(inode))
1471             return ERR_PTR(PTR_ERR(inode));
1472     result = d_alloc_anon(inode);
1473     if (!result) {
1474             iput(inode);
1475             return ERR_PTR(-ENOMEM);
1476     }
1477     return result;
1478 }
1479
1480 struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 *data,
1481                                      int len, int fhtype,
1482                                   int (*acceptable)(void *contect, struct dentry *de),
1483                                   void *context) {
1484     __u32 obj[3], parent[3];
1485
1486     /* fhtype happens to reflect the number of u32s encoded.
1487      * due to a bug in earlier code, fhtype might indicate there
1488      * are more u32s then actually fitted.
1489      * so if fhtype seems to be more than len, reduce fhtype.
1490      * Valid types are:
1491      *   2 - objectid + dir_id - legacy support
1492      *   3 - objectid + dir_id + generation
1493      *   4 - objectid + dir_id + objectid and dirid of parent - legacy
1494      *   5 - objectid + dir_id + generation + objectid and dirid of parent
1495      *   6 - as above plus generation of directory
1496      * 6 does not fit in NFSv2 handles
1497      */
1498     if (fhtype > len) {
1499             if (fhtype != 6 || len != 5)
1500                     reiserfs_warning (sb, "nfsd/reiserfs, fhtype=%d, len=%d - odd",
1501                            fhtype, len);
1502             fhtype = 5;
1503     }
1504
1505     obj[0] = data[0];
1506     obj[1] = data[1];
1507     if (fhtype == 3 || fhtype >= 5)
1508             obj[2] = data[2];
1509     else    obj[2] = 0; /* generation number */
1510
1511     if (fhtype >= 4) {
1512             parent[0] = data[fhtype>=5?3:2] ;
1513             parent[1] = data[fhtype>=5?4:3] ;
1514             if (fhtype == 6)
1515                     parent[2] = data[5];
1516             else    parent[2] = 0;
1517     }
1518     return sb->s_export_op->find_exported_dentry(sb, obj, fhtype < 4 ? NULL : parent,
1519                                acceptable, context);
1520 }
1521 /* Fill data[] with a file handle for dentry.  *lenp is the room in __u32s on entry and the count stored on return; returns that count (3, 5 or 6), or 255 if fewer than 3 words fit. */
1522 int reiserfs_encode_fh(struct dentry *dentry, __u32 *data, int *lenp, int need_parent) {
1523     struct inode *inode = dentry->d_inode ;
1524     int maxlen = *lenp;
1525     /* the object's own three words are mandatory; *lenp is left unchanged on this failure */
1526     if (maxlen < 3)
1527         return 255 ;
1528     /* words 0-2: i_ino, the key's k_dir_id (converted to cpu order) and the generation */
1529     data[0] = inode->i_ino ;
1530     data[1] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ;
1531     data[2] = inode->i_generation ;
1532     *lenp = 3 ;
1533     /* no room for directory info? return what we've stored so far */
1534     if (maxlen < 5 || ! need_parent)
1535         return 3 ;
1536     /* words 3-4 (and 5 if it fits) describe the parent; d_lock is held while d_parent is dereferenced */
1537     spin_lock(&dentry->d_lock);
1538     inode = dentry->d_parent->d_inode ; /* from here on "inode" is the parent directory */
1539     data[3] = inode->i_ino ;
1540     data[4] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ;
1541     *lenp = 5 ;
1542     if (maxlen >= 6) { /* parent generation is optional, stored only when a sixth word is available */
1543             data[5] = inode->i_generation ;
1544             *lenp = 6 ;
1545     }
1546     spin_unlock(&dentry->d_lock);
1547     return *lenp ;
1548 }
1549
1550
1551 /* Writes an inode out on request.  For a synchronous request it calls
1552    reiserfs_update_sd() inside a 1-block transaction ended by journal_end_sync(). */
1553 /* reiserfs inodes are never really dirty, since the dirty inode call
1554 ** always logs them.  This call allows the VFS inode marking routines
1555 ** to properly mark inodes for datasync and such, but only actually
1556 ** does something when called for a synchronous update.  Returns -EROFS on a
1557 ** read-only mount and 0 otherwise (a failed journal_begin is not reported). */
1558 int reiserfs_write_inode (struct inode * inode, int do_sync) {
1559     struct reiserfs_transaction_handle th ;
1560     int jbegin_count = 1 ; /* block count passed to journal_begin / journal_end_sync */
1561
1562     if (inode->i_sb->s_flags & MS_RDONLY)
1563         return -EROFS;
1564     /* memory pressure can sometimes initiate write_inode calls with sync == 1,
1565     ** these cases are just when the system needs ram, not when the
1566     ** inode needs to reach disk for safety, and they can safely be
1567     ** ignored because the altered inode has already been logged.
1568     */
1569     if (do_sync && !(current->flags & PF_MEMALLOC)) {
1570         reiserfs_write_lock(inode->i_sb);
1571         if (!journal_begin(&th, inode->i_sb, jbegin_count)) { /* zero return means the transaction started */
1572             reiserfs_update_sd (&th, inode);
1573             journal_end_sync(&th, inode->i_sb, jbegin_count) ; /* return value is ignored here */
1574         }
1575         reiserfs_write_unlock(inode->i_sb);
1576     }
1577     return 0;
1578 }
1579
1580 /* stat data of new object is inserted already, this inserts the item
1581    containing "." and ".." entries.  Returns reiserfs_insert_item's result, or -EIO / -EEXIST. */
1582 static int reiserfs_new_directory (struct reiserfs_transaction_handle *th,
1583                                    struct inode *inode,
1584                                    struct item_head * ih, struct path * path,
1585                                    struct inode * dir)
1586 {
1587     struct super_block * sb = th->t_super;
1588     char empty_dir [EMPTY_DIR_SIZE]; /* on-stack image of the empty directory item */
1589     char * body = empty_dir;
1590     struct cpu_key key;
1591     int retval;
1592     /* th must be a live handle: t_trans_id == 0 is treated as a bug */
1593     BUG_ON (!th->t_trans_id);
1594     /* search key for the directory item: the ids already stored in ih, at offset DOT_OFFSET */
1595     _make_cpu_key (&key, KEY_FORMAT_3_5, le32_to_cpu (ih->ih_key.k_dir_id),
1596                    le32_to_cpu (ih->ih_key.k_objectid), DOT_OFFSET, TYPE_DIRENTRY, 3/*key length*/);
1597
1598     /* compose item head for new item. Directories consist of items of
1599        old type (ITEM_VERSION_1). Do not set key (second arg is NULL), it
1600        is done by reiserfs_new_inode */
1601     if (old_format_only (sb)) {
1602         make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
1603         /* the body is built from this object's ids and the parent directory's key ids */
1604         make_empty_dir_item_v1 (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid,
1605                                 INODE_PKEY (dir)->k_dir_id,
1606                                 INODE_PKEY (dir)->k_objectid );
1607     } else {
1608         make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
1609         /* same arguments as the v1 branch; only the item size constant and the builder differ */
1610         make_empty_dir_item (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid,
1611                                 INODE_PKEY (dir)->k_dir_id,
1612                                 INODE_PKEY (dir)->k_objectid );
1613     }
1614
1615     /* look for place in the tree for new item */
1616     retval = search_item (sb, &key, path);
1617     if (retval == IO_ERROR) { /* NOTE(review): path is not released on this branch, unlike ITEM_FOUND below - confirm */
1618         reiserfs_warning (sb, "vs-13080: reiserfs_new_directory: "
1619                           "i/o failure occurred creating new directory");
1620         return -EIO;
1621     }
1622     if (retval == ITEM_FOUND) { /* an item with this key must not exist yet */
1623         pathrelse (path);
1624         reiserfs_warning (sb, "vs-13070: reiserfs_new_directory: "
1625                           "object with this key exists (%k)", &(ih->ih_key));
1626         return -EEXIST;
1627     }
1628
1629     /* insert item, that is empty directory item */
1630     return reiserfs_insert_item (th, path, &key, ih, inode, body);
1631 }
1632
1633
1634 /* stat data of object has been inserted, this inserts the item
1635    containing the body of symlink.  Returns reiserfs_insert_item's result, or -EIO / -EEXIST. */
1636 static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th,
1637                                  struct inode *inode,   /* Inode of symlink */
1638                                  struct item_head * ih,
1639                                  struct path * path, const char * symname, int item_len)
1640 {
1641     struct super_block * sb = th->t_super;
1642     struct cpu_key key;
1643     int retval;
1644     /* th must be a live handle: t_trans_id == 0 is treated as a bug */
1645     BUG_ON (!th->t_trans_id);
1646     /* key of the symlink body: the ids already stored in ih, a direct item at offset 1 */
1647     _make_cpu_key (&key, KEY_FORMAT_3_5,
1648                    le32_to_cpu (ih->ih_key.k_dir_id),
1649                    le32_to_cpu (ih->ih_key.k_objectid),
1650                    1, TYPE_DIRECT, 3/*key length*/);
1651     /* reuse ih as the head of the body item; the key argument is NULL, so the key fields already in ih are kept */
1652     make_le_item_head (ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, 0/*free_space*/);
1653
1654     /* look for place in the tree for new item */
1655     retval = search_item (sb, &key, path);
1656     if (retval == IO_ERROR) { /* NOTE(review): path is not released on this branch, unlike ITEM_FOUND below - confirm */
1657         reiserfs_warning (sb, "vs-13080: reiserfs_new_symlink: "
1658                           "i/o failure occurred creating new symlink");
1659         return -EIO;
1660     }
1661     if (retval == ITEM_FOUND) { /* an item with this key must not exist yet */
1662         pathrelse (path);
1663         reiserfs_warning (sb, "vs-13080: reiserfs_new_symlink: "
1664                           "object with this key exists (%k)", &(ih->ih_key));
1665         return -EEXIST;
1666     }
1667
1668     /* insert item, that is body of symlink */
1669     return reiserfs_insert_item (th, path, &key, ih, inode, symname);
1670 }
1671
1672
1673 /* inserts the stat data into the tree, and then calls
1674    reiserfs_new_directory (to insert ".", ".." item if new object is
1675    directory) or reiserfs_new_symlink (to insert symlink body if new
1676    object is symlink) or nothing (if new object is regular file)
1677    Returns 0 on success (inode hashed, stat data updated), otherwise an error code.
1678    NOTE! uid and gid must already be set in the inode.  If we return
1679    non-zero due to an error, we have to drop the quota previously allocated
1680    for the fresh inode.  This can only be done outside a transaction, so
1681    if we return non-zero, we also end the transaction, clear th->t_trans_id and iput(inode).  */
1682 int reiserfs_new_inode (struct reiserfs_transaction_handle *th,
1683                         struct inode * dir, int mode,
1684                         const char * symname,
1685                         /* 0 for regular, EMPTY_DIR_SIZE for dirs,
1686                            strlen (symname) for symlinks)*/
1687                          loff_t i_size, struct dentry *dentry,
1688                          struct inode *inode)
1689 {
1690     struct super_block * sb;
1691     INITIALIZE_PATH (path_to_key);
1692     struct cpu_key key;
1693     struct item_head ih;
1694     struct stat_data sd;
1695     int retval;
1696     int err; /* value returned on the error paths at the bottom */
1697
1698     BUG_ON (!th->t_trans_id);
1699     /* charge the inode to quota first; on failure nothing has to be freed, so skip DQUOT_FREE_INODE */
1700     if (DQUOT_ALLOC_INODE(inode)) {
1701         err = -EDQUOT;
1702         goto out_end_trans;
1703     }
1704     if (!dir || !dir->i_nlink) { /* no parent, or parent with a zero link count */
1705         err = -EPERM;
1706         goto out_bad_inode;
1707     }
1708
1709     sb = dir->i_sb;
1710
1711     /* item head of new item */
1712     ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
1713     ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid (th));
1714     if (!ih.ih_key.k_objectid) { /* an object id of 0 is treated as "none available" */
1715         err = -ENOMEM;
1716         goto out_bad_inode ;
1717     }
1718     if (old_format_only (sb))
1719         /* not a perfect generation count, as object ids can be reused, but
1720         ** this is as good as reiserfs can do right now.
1721         ** note that the private part of inode isn't filled in yet, we have
1722         ** to use the directory.
1723         */
1724         inode->i_generation = le32_to_cpu (INODE_PKEY (dir)->k_objectid);
1725     else
1726 #if defined( USE_INODE_GENERATION_COUNTER )
1727         inode->i_generation = le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
1728 #else
1729         inode->i_generation = ++event;
1730 #endif
1731
1732     /* fill stat data */
1733     inode->i_nlink = (S_ISDIR (mode) ? 2 : 1);
1734
1735     /* uid and gid must already be set by the caller for quota init */
1736
1737     /* symlink cannot be immutable or append only, right? */
1738     if( S_ISLNK( inode -> i_mode ) ) /* NOTE(review): tests inode->i_mode, the rest of the function tests the mode argument - presumably the caller stored mode in the inode already; confirm */
1739             inode -> i_flags &= ~ ( S_IMMUTABLE | S_APPEND );
1740
1741     inode->i_mtime = inode->i_atime = inode->i_ctime =
1742             CURRENT_TIME_SEC;
1743     inode->i_size = i_size;
1744     inode->i_blocks = 0;
1745     inode->i_bytes = 0;
1746     REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
1747       U32_MAX/*NO_BYTES_IN_DIRECT_ITEM*/;
1748     /* reset the reiserfs-private part of the in-core inode */
1749     INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list ));
1750     REISERFS_I(inode)->i_flags = 0;
1751     REISERFS_I(inode)->i_prealloc_block = 0;
1752     REISERFS_I(inode)->i_prealloc_count = 0;
1753     REISERFS_I(inode)->i_trans_id = 0;
1754     REISERFS_I(inode)->i_jl = NULL;
1755     REISERFS_I(inode)->i_attrs =
1756         REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; /* only the attribute bits in the inherit mask come from the parent */
1757     sd_attrs_to_i_attrs( REISERFS_I(inode) -> i_attrs, inode );
1758     REISERFS_I(inode)->i_acl_access = NULL;
1759     REISERFS_I(inode)->i_acl_default = NULL;
1760     init_rwsem (&REISERFS_I(inode)->xattr_sem);
1761     /* item head for the stat data; its size and key format depend on the on-disk format */
1762     if (old_format_only (sb))
1763         make_le_item_head (&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
1764     else
1765         make_le_item_head (&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1766
1767     /* key to search for correct place for new stat data */
1768     _make_cpu_key (&key, KEY_FORMAT_3_6, le32_to_cpu (ih.ih_key.k_dir_id),
1769                    le32_to_cpu (ih.ih_key.k_objectid), SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/);
1770
1771     /* find proper place for inserting of stat data */
1772     retval = search_item (sb, &key, &path_to_key);
1773     if (retval == IO_ERROR) {
1774         err = -EIO;
1775         goto out_bad_inode;
1776     }
1777     if (retval == ITEM_FOUND) { /* stat data with this key already exists */
1778         pathrelse (&path_to_key);
1779         err = -EEXIST;
1780         goto out_bad_inode;
1781     }
1782     if (old_format_only (sb)) {
1783         if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) { /* any bit above the low 16 set */
1784             pathrelse (&path_to_key);
1785             /* i_uid or i_gid is too big to be stored in stat data v3.5 */
1786             err = -EINVAL;
1787             goto out_bad_inode;
1788         }
1789         inode2sd_v1 (&sd, inode, inode->i_size);
1790     } else {
1791         inode2sd (&sd, inode, inode->i_size);
1792     }
1793     // these do not go to on-disk stat data
1794     inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid);
1795     inode->i_blksize = reiserfs_default_io_size;
1796
1797     // store in in-core inode the key of stat data and version all
1798     // object items will have (directory items will have old offset
1799     // format, other new objects will consist of new items)
1800     memcpy (INODE_PKEY (inode), &(ih.ih_key), KEY_SIZE);
1801     if (old_format_only (sb) || S_ISDIR(mode) || S_ISLNK(mode))
1802         set_inode_item_key_version (inode, KEY_FORMAT_3_5);
1803     else
1804         set_inode_item_key_version (inode, KEY_FORMAT_3_6);
1805     if (old_format_only (sb))
1806         set_inode_sd_version (inode, STAT_DATA_V1);
1807     else
1808         set_inode_sd_version (inode, STAT_DATA_V2);
1809
1810     /* insert the stat data into the tree */
1811 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1812     if (REISERFS_I(dir)->new_packing_locality)
1813         th->displace_new_blocks = 1;
1814 #endif
1815     retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, inode, (char *)(&sd));
1816     if (retval) { /* stat data did not go in: still the "nothing inserted" error path */
1817         err = retval;
1818         reiserfs_check_path(&path_to_key) ;
1819         goto out_bad_inode;
1820     }
1821
1822 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1823     if (!th->displace_new_blocks)
1824         REISERFS_I(dir)->new_packing_locality = 0;
1825 #endif
1826     if (S_ISDIR(mode)) {
1827         /* insert item with "." and ".." */
1828         retval = reiserfs_new_directory (th, inode, &ih, &path_to_key, dir);
1829     }
1830     /* for a regular file neither branch runs and retval is still 0 from the stat data insert */
1831     if (S_ISLNK(mode)) {
1832         /* insert body of symlink */
1833         if (!old_format_only (sb))
1834             i_size = ROUND_UP(i_size); /* only the item length is rounded; inode->i_size was set earlier */
1835         retval = reiserfs_new_symlink (th, inode, &ih, &path_to_key, symname, i_size);
1836     }
1837     if (retval) { /* stat data is already in the tree: end the transaction here and take the short exit */
1838         err = retval;
1839         reiserfs_check_path(&path_to_key) ;
1840         journal_end(th, th->t_super, th->t_blocks_allocated);
1841         goto out_inserted_sd;
1842     }
1843
1844     /* XXX CHECK THIS */
1845     if (reiserfs_posixacl (inode->i_sb)) {
1846         retval = reiserfs_inherit_default_acl (dir, dentry, inode);
1847         if (retval) {
1848             err = retval;
1849             reiserfs_check_path(&path_to_key) ;
1850             journal_end(th, th->t_super, th->t_blocks_allocated);
1851             goto out_inserted_sd;
1852         }
1853     } else if (inode->i_sb->s_flags & MS_POSIXACL) {
1854         reiserfs_warning (inode->i_sb, "ACLs aren't enabled in the fs, "
1855                           "but vfs thinks they are!");
1856     } else if (is_reiserfs_priv_object (dir)) { /* only reached when neither ACL check above matched */
1857         reiserfs_mark_inode_private (inode);
1858     }
1859     /* success: hash the inode, then write the stat data again via reiserfs_update_sd */
1860     insert_inode_hash (inode);
1861     reiserfs_update_sd(th, inode);
1862     reiserfs_check_path(&path_to_key) ;
1863
1864     return 0;
1865
1866 /* it looks like you can easily compress these two goto targets into
1867  * one.  Keeping it like this doesn't actually hurt anything, and they
1868  * are place holders for what the quota code actually needs.
1869  */
1870 out_bad_inode:
1871     /* Invalidate the object, nothing was inserted yet */
1872     INODE_PKEY(inode)->k_objectid = 0; /* reiserfs_delete_inode skips objects whose k_objectid is 0 */
1873
1874     /* Quota change must be inside a transaction for journaling */
1875     DQUOT_FREE_INODE(inode);
1876
1877 out_end_trans:
1878     journal_end(th, th->t_super, th->t_blocks_allocated) ;
1879     /* Drop can be outside and it needs more credits so it's better to have it outside */
1880     DQUOT_DROP(inode);
1881     inode->i_flags |= S_NOQUOTA;
1882     make_bad_inode(inode);
1883     /* jumps straight to the label below come from paths that already called journal_end themselves */
1884 out_inserted_sd:
1885     inode->i_nlink = 0; /* presumably so the iput below disposes of the object - confirm */
1886     th->t_trans_id = 0; /* so the caller can't use this handle later */
1887     iput(inode);
1888     return err;
1889 }
1890
1891 /*
1892 ** finds the tail page in the page cache,
1893 ** reads the last block in.
1894 **
1895 ** On success, page_result is set to a locked, pinned page, and bh_result
1896 ** is set to an up to date buffer for the last block in the file.  returns 0.
1897 **
1898 ** tail conversion is not done, so bh_result might not be valid for writing
1899 ** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
1900 ** trying to write the block.
1901 ** on failure, nonzero is returned (-ENOENT when i_size is block aligned,
1902 ** -ENOMEM, -EIO, or block_prepare_write's error); page_result and bh_result are untouched.
1903 */
1904 static int grab_tail_page(struct inode *p_s_inode,
1905                           struct page **page_result,
1906                           struct buffer_head **bh_result) {
1907
1908     /* we want the page with the last byte in the file,
1909     ** not the page that will hold the next byte for appending
1910     */
1911     unsigned long index = (p_s_inode->i_size-1) >> PAGE_CACHE_SHIFT ;
1912     unsigned long pos = 0 ; /* page offset of the buffer currently visited in the walk below */
1913     unsigned long start = 0 ; /* page offset where the block containing i_size begins */
1914     unsigned long blocksize = p_s_inode->i_sb->s_blocksize ;
1915     unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1) ; /* i_size modulo the page size */
1916     struct buffer_head *bh ;
1917     struct buffer_head *head ;
1918     struct page * page ;
1919     int error ;
1920
1921     /* we know that we are only called with inode->i_size > 0.
1922     ** we also know that a file tail can never be as big as a block
1923     ** If i_size % blocksize == 0, our file is currently block aligned
1924     ** and it won't need converting or zeroing after a truncate.
1925     */
1926     if ((offset & (blocksize - 1)) == 0) { /* also true when offset itself is 0 (page-aligned i_size) */
1927         return -ENOENT ;
1928     }
1929     page = grab_cache_page(p_s_inode->i_mapping, index) ;
1930     error = -ENOMEM ;
1931     if (!page) {
1932         goto out ;
1933     }
1934     /* start within the page of the last block in the file */
1935     start = (offset / blocksize) * blocksize ;
1936     /* prepare the byte range [start, offset) of the page, using reiserfs_get_block_create_0 for mapping */
1937     error = block_prepare_write(page, start, offset,
1938                                 reiserfs_get_block_create_0) ;
1939     if (error)
1940         goto unlock ;
1941     /* walk the page's buffer ring until reaching the buffer that covers "start" */
1942     head = page_buffers(page) ;
1943     bh = head;
1944     do {
1945         if (pos >= start) {
1946             break ;
1947         }
1948         bh = bh->b_this_page ;
1949         pos += blocksize ;
1950     } while(bh != head) ;
1951
1952     if (!buffer_uptodate(bh)) {
1953         /* note, this should never happen, prepare_write should
1954         ** be taking care of this for us.  If the buffer isn't up to date,
1955         ** I've screwed up the code to find the buffer, or the code to
1956         ** call prepare_write
1957         */
1958         reiserfs_warning (p_s_inode->i_sb,
1959                           "clm-6000: error reading block %lu on dev %s",
1960                           bh->b_blocknr,
1961                           reiserfs_bdevname (p_s_inode->i_sb)) ;
1962         error = -EIO ;
1963         goto unlock ;
1964     }
1965     *bh_result = bh ;
1966     *page_result = page ; /* the page is handed back without unlocking or releasing it */
1967     /* the success path falls through with error == 0 from block_prepare_write */
1968 out:
1969     return error ;
1970     /* failure after the page was grabbed: unlock it and drop the reference */
1971 unlock:
1972     unlock_page(page) ;
1973     page_cache_release(page) ;
1974     return error ;
1975 }
1976
1977 /*
1978 ** vfs version of truncate file.  Must NOT be called with
1979 ** a transaction already started.
1980 ** returns 0 on success, otherwise the first error hit after the tail page lookup.
1981 ** some code taken from block_truncate_page
1982 */
1983 int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) {
1984     struct reiserfs_transaction_handle th ;
1985     /* we want the offset for the first byte after the end of the file */
1986     unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1) ;
1987     unsigned blocksize = p_s_inode->i_sb->s_blocksize ;
1988     unsigned length ;
1989     struct page *page = NULL ; /* stays NULL unless grab_tail_page succeeds */
1990     int error ;
1991     struct buffer_head *bh = NULL ;
1992
1993     reiserfs_write_lock(p_s_inode->i_sb);
1994     /* a failure to get the tail page is not fatal: the truncate just runs without a page to zero */
1995     if (p_s_inode->i_size > 0) {
1996         if ((error = grab_tail_page(p_s_inode, &page, &bh))) {
1997             // -ENOENT means we truncated past the end of the file,
1998             // and get_block_create_0 could not find a block to read in,
1999             // which is ok.
2000             if (error != -ENOENT)
2001                 reiserfs_warning (p_s_inode->i_sb,
2002                                   "clm-6001: grab_tail_page failed %d",
2003                                   error);
2004             page = NULL ;
2005             bh = NULL ;
2006         }
2007     }
2008
2009     /* so, if page != NULL, we have a buffer head for the offset at
2010     ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
2011     ** then we have an unformatted node.  Otherwise, we have a direct item,
2012     ** and no zeroing is required on disk.  We zero after the truncate,
2013     ** because the truncate might pack the item anyway
2014     ** (it will unmap bh if it packs).
2015     */
2016     /* it is enough to reserve space in transaction for 2 balancings:
2017        one for "save" link adding and another for the first
2018        cut_from_item. 1 is for update_sd */
2019     error = journal_begin (&th, p_s_inode->i_sb,
2020                            JOURNAL_PER_BALANCE_CNT * 2 + 1);
2021     if (error)
2022         goto out;
2023     reiserfs_update_inode_transaction(p_s_inode) ;
2024     if (update_timestamps)
2025             /* we are doing real truncate: if the system crashes before the last
2026                transaction of truncating gets committed - on reboot the file
2027                either appears truncated properly or not truncated at all */
2028         add_save_link (&th, p_s_inode, 1);
2029     error = reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ;
2030     if (error) /* NOTE(review): journal_end is not called on this path here - confirm reiserfs_do_truncate ends the transaction on failure */
2031         goto out;
2032     error = journal_end (&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
2033     if (error)
2034         goto out;
2035     /* the save link added above is removed only after the transaction was ended successfully */
2036     if (update_timestamps) {
2037         error = remove_save_link (p_s_inode, 1/* truncate */);
2038         if (error)
2039             goto out;
2040     }
2041     /* zero the rest of the last block in the page cache, from the new end of file to the block boundary */
2042     if (page) {
2043         length = offset & (blocksize - 1) ;
2044         /* if we are not on a block boundary */
2045         if (length) {
2046             char *kaddr;
2047
2048             length = blocksize - length ; /* now the number of bytes from offset to the end of its block */
2049             kaddr = kmap_atomic(page, KM_USER0) ;
2050             memset(kaddr + offset, 0, length) ;
2051             flush_dcache_page(page) ;
2052             kunmap_atomic(kaddr, KM_USER0) ;
2053             if (buffer_mapped(bh) && bh->b_blocknr != 0) { /* unformatted node (see comment above): zeroes must reach disk */
2054                 mark_buffer_dirty(bh) ;
2055             }
2056         }
2057         unlock_page(page) ;
2058         page_cache_release(page) ;
2059     }
2060
2061     reiserfs_write_unlock(p_s_inode->i_sb);
2062     return 0;
2063 out:
2064     if (page) { /* error exit: give back the page grab_tail_page left locked and pinned */
2065         unlock_page (page);
2066         page_cache_release (page);
2067     }
2068     reiserfs_write_unlock(p_s_inode->i_sb);
2069     return error;
2070 }
2071 /* Maps page buffer bh_result (file block "block") for writing: indirect items give it a disk block, direct items get the page data copied into the item and the buffer mapped to block 0, anything else goes through reiserfs_get_block.  Returns 0 or an error. */
2072 static int map_block_for_writepage(struct inode *inode,
2073                                struct buffer_head *bh_result,
2074                                unsigned long block) {
2075     struct reiserfs_transaction_handle th ;
2076     int fs_gen ;
2077     struct item_head tmp_ih ;
2078     struct item_head *ih ;
2079     struct buffer_head *bh ;
2080     __le32 *item ;
2081     struct cpu_key key ;
2082     INITIALIZE_PATH(path) ;
2083     int pos_in_item ;
2084     int jbegin_count = JOURNAL_PER_BALANCE_CNT ;
2085     loff_t byte_offset = (block << inode->i_sb->s_blocksize_bits) + 1 ; /* key offset of the block's first byte; presumably key offsets are 1-based (note the "- 1" where it is turned into a page offset) */
2086     int retval ;
2087     int use_get_block = 0 ; /* set when the tree search cannot map the block and reiserfs_get_block must */
2088     int bytes_copied = 0 ; /* bytes copied from the page into direct items so far */
2089     int copy_size ;
2090     int trans_running = 0; /* non-zero once journal_begin succeeded, until journal_end at "out" */
2091
2092     /* catch places below that try to log something without starting a trans */
2093     th.t_trans_id = 0;
2094
2095     if (!buffer_uptodate(bh_result)) {
2096         return -EIO;
2097     }
2098     /* the page stays kmapped until the kunmap near the end of the function */
2099     kmap(bh_result->b_page) ;
2100 start_over:
2101     reiserfs_write_lock(inode->i_sb);
2102     make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3) ;
2103
2104 research:
2105     retval = search_for_position_by_key(inode->i_sb, &key, &path) ;
2106     if (retval != POSITION_FOUND) { /* nothing in the tree at this offset: let reiserfs_get_block handle it */
2107         use_get_block = 1;
2108         goto out ;
2109     }
2110
2111     bh = get_last_bh(&path) ;
2112     ih = get_ih(&path) ;
2113     item = get_item(&path) ;
2114     pos_in_item = path.pos_in_item ;
2115
2116     /* we've found an unformatted node */
2117     if (indirect_item_found(retval, ih)) {
2118         if (bytes_copied > 0) { /* an indirect item after tail bytes were already copied is only warned about */
2119             reiserfs_warning (inode->i_sb, "clm-6002: bytes_copied %d",
2120                               bytes_copied) ;
2121         }
2122         if (!get_block_num(item, pos_in_item)) {
2123             /* crap, we are writing to a hole */
2124             use_get_block = 1;
2125             goto out ;
2126         }
2127         set_block_dev_mapped(bh_result, get_block_num(item,pos_in_item),inode);
2128     } else if (is_direct_le_ih(ih)) {
2129         char *p ;
2130         p = page_address(bh_result->b_page) ;
2131         p += (byte_offset -1) & (PAGE_CACHE_SIZE - 1) ; /* start of this block's data inside the page */
2132         copy_size = ih_item_len(ih) - pos_in_item; /* bytes from pos_in_item to the end of the item */
2133         /* remember the fs generation and a copy of the item head so movement of the item can be detected */
2134         fs_gen = get_generation(inode->i_sb) ;
2135         copy_item_head(&tmp_ih, ih) ;
2136
2137         if (!trans_running) {
2138             /* vs-3050 is gone, no need to drop the path */
2139             retval = journal_begin(&th, inode->i_sb, jbegin_count) ;
2140             if (retval)
2141                 goto out;
2142             reiserfs_update_inode_transaction(inode) ;
2143             trans_running = 1;
2144             if (fs_changed(fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) {
2145                 reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; /* NOTE(review): bh has not been prepared yet on this path - confirm the restore is harmless */
2146                 goto research;
2147             }
2148         }
2149
2150         reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
2151         /* re-check after preparing the buffer; if the item moved, undo the prepare and search again */
2152         if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
2153             reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
2154             goto research;
2155         }
2156         /* copy the page data into the direct item and log the tree buffer */
2157         memcpy( B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, copy_size) ;
2158
2159         journal_mark_dirty(&th, inode->i_sb, bh) ;
2160         bytes_copied += copy_size ;
2161         set_block_dev_mapped(bh_result, 0, inode); /* mapped with block number 0 marks a buffer backed by a direct item */
2162
2163         /* are there still bytes left? */
2164         if (bytes_copied < bh_result->b_size &&
2165             (byte_offset + bytes_copied) < inode->i_size) {
2166             set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + copy_size) ; /* advance the key past the bytes just copied */
2167             goto research ;
2168         }
2169     } else {
2170         reiserfs_warning (inode->i_sb,
2171                           "clm-6003: bad item inode %lu, device %s",
2172                           inode->i_ino, reiserfs_bdevname (inode->i_sb)) ;
2173         retval = -EIO ;
2174         goto out ;
2175     }
2176     retval = 0 ;
2177     /* common exit from the tree search: release the path, end any transaction, drop the write lock */
2178 out:
2179     pathrelse(&path) ;
2180     if (trans_running) {
2181         int err = journal_end(&th, inode->i_sb, jbegin_count) ;
2182         if (err) /* a journal_end failure overrides whatever retval held */
2183             retval = err;
2184         trans_running = 0;
2185     }
2186     reiserfs_write_unlock(inode->i_sb);
2187
2188     /* this is where we fill in holes in the file. */
2189     if (use_get_block) {
2190         retval = reiserfs_get_block(inode, block, bh_result,
2191                                     GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM |
2192                                     GET_BLOCK_NO_DANGLE);
2193         if (!retval) {
2194             if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) {
2195                 /* get_block failed to find a mapped unformatted node. */
2196                 use_get_block = 0 ;
2197                 goto start_over ; /* retake the write lock and redo the tree search from the original offset */
2198             }
2199         }
2200     }
2201     kunmap(bh_result->b_page) ;
2202
2203     if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
2204         /* we've copied data from the page into the direct item, so the
2205          * buffer in the page is now clean, mark it to reflect that.
2206          */
2207         lock_buffer(bh_result);
2208         clear_buffer_dirty(bh_result);
2209         unlock_buffer(bh_result);
2210     }
2211     return retval ;
2212 }
2213
2214 /*
2215  * mason@suse.com: updated in 2.5.54 to follow the same general io
2216  * start/recovery path as __block_write_full_page, along with special
2217  * code to handle reiserfs tails.
2218  */
2219 static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) {
2220     struct inode *inode = page->mapping->host ;
2221     unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ;
2222     int error = 0;
2223     unsigned long block ;
2224     struct buffer_head *head, *bh;
2225     int partial = 0 ;
2226     int nr = 0;
2227     int checked = PageChecked(page);
2228     struct reiserfs_transaction_handle th;
2229     struct super_block *s = inode->i_sb;
2230     int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
2231     th.t_trans_id = 0;
2232
2233     /* The page dirty bit is cleared before writepage is called, which
2234      * means we have to tell create_empty_buffers to make dirty buffers
2235      * The page really should be up to date at this point, so tossing
2236      * in the BH_Uptodate is just a sanity check.
2237      */
2238     if (!page_has_buffers(page)) {
2239         create_empty_buffers(page, s->s_blocksize,
2240                             (1 << BH_Dirty) | (1 << BH_Uptodate));
2241     }
2242     head = page_buffers(page) ;
2243
2244     /* last page in the file, zero out any contents past the
2245     ** last byte in the file
2246     */
2247     if (page->index >= end_index) {
2248         char *kaddr;
2249         unsigned last_offset;
2250
2251         last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1) ;
2252         /* no file contents in this page */
2253         if (page->index >= end_index + 1 || !last_offset) {
2254             unlock_page(page);
2255             return 0;
2256         }
2257         kaddr = kmap_atomic(page, KM_USER0);
2258         memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE-last_offset) ;
2259         flush_dcache_page(page) ;
2260         kunmap_atomic(kaddr, KM_USER0) ;
2261     }
2262     bh = head ;
2263     block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits) ;
2264     /* first map all the buffers, logging any direct items we find */
2265     do {
2266         if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) ||
2267            (buffer_mapped(bh) && bh->b_blocknr == 0))) {
2268             /* not mapped yet, or it points to a direct item, search
2269              * the btree for the mapping info, and log any direct
2270              * items found
2271              */
2272             if ((error = map_block_for_writepage(inode, bh, block))) {
2273                 goto fail ;
2274             }
2275         }
2276         bh = bh->b_this_page;
2277         block++;
2278     } while(bh != head) ;
2279
2280     /*
2281      * we start the transaction after map_block_for_writepage,
2282      * because it can create holes in the file (an unbounded operation).
2283      * starting it here, we can make a reliable estimate for how many
2284      * blocks we're going to log
2285      */
2286     if (checked) {
2287         ClearPageChecked(page);
2288         reiserfs_write_lock(s);
2289         error = journal_begin(&th, s, bh_per_page + 1);
2290         if (error) {
2291             reiserfs_write_unlock(s);
2292             goto fail;
2293         }
2294         reiserfs_update_inode_transaction(inode);
2295     }
2296     /* now go through and lock any dirty buffers on the page */
2297     do {
2298         get_bh(bh);
2299         if (!buffer_mapped(bh))
2300             continue;
2301         if (buffer_mapped(bh) && bh->b_blocknr == 0)
2302             continue;
2303
2304         if (checked) {
2305             reiserfs_prepare_for_journal(s, bh, 1);
2306             journal_mark_dirty(&th, s, bh);
2307             continue;
2308         }
2309         /* from this point on, we know the buffer is mapped to a
2310          * real block and not a direct item
2311          */
2312         if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
2313             lock_buffer(bh);
2314         } else {
2315             if (test_set_buffer_locked(bh)) {
2316                 redirty_page_for_writepage(wbc, page);
2317                 continue;
2318             }
2319         }
2320         if (test_clear_buffer_dirty(bh)) {
2321             mark_buffer_async_write(bh);
2322         } else {
2323             unlock_buffer(bh);
2324         }
2325     } while((bh = bh->b_this_page) != head);
2326
2327     if (checked) {
2328         error = journal_end(&th, s, bh_per_page + 1);
2329         reiserfs_write_unlock(s);
2330         if (error)
2331             goto fail;
2332     }
2333     BUG_ON(PageWriteback(page));
2334     set_page_writeback(page);
2335     unlock_page(page);
2336
2337     /*
2338      * since any buffer might be the only dirty buffer on the page,
2339      * the first submit_bh can bring the page out of writeback.
2340      * be careful with the buffers.
2341      */
2342     do {
2343         struct buffer_head *next = bh->b_this_page;
2344         if (buffer_async_write(bh)) {
2345             submit_bh(WRITE, bh);
2346             nr++;
2347         }
2348         put_bh(bh);
2349         bh = next;
2350     } while(bh != head);
2351
2352     error = 0;
2353 done:
2354     if (nr == 0) {
2355         /*
2356          * if this page only had a direct item, it is very possible for
2357          * no io to be required without there being an error.  Or,
2358          * someone else could have locked them and sent them down the
2359          * pipe without locking the page
2360          */
2361         bh = head ;
2362         do {
2363             if (!buffer_uptodate(bh)) {
2364                 partial = 1;
2365                 break;
2366             }
2367             bh = bh->b_this_page;
2368         } while(bh != head);
2369         if (!partial)
2370             SetPageUptodate(page);
2371         end_page_writeback(page);
2372     }
2373     return error;
2374
2375 fail:
2376     /* catches various errors, we need to make sure any valid dirty blocks
2377      * get to the media.  The page is currently locked and not marked for
2378      * writeback
2379      */
2380     ClearPageUptodate(page);
2381     bh = head;
2382     do {
2383         get_bh(bh);
2384         if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
2385             lock_buffer(bh);
2386             mark_buffer_async_write(bh);
2387         } else {
2388             /*
2389              * clear any dirty bits that might have come from getting
2390              * attached to a dirty page
2391              */
2392              clear_buffer_dirty(bh);
2393         }
2394         bh = bh->b_this_page;
2395     } while(bh != head);
2396     SetPageError(page);
2397     BUG_ON(PageWriteback(page));
2398     set_page_writeback(page);
2399     unlock_page(page);
2400     do {
2401         struct buffer_head *next = bh->b_this_page;
2402         if (buffer_async_write(bh)) {
2403             clear_buffer_dirty(bh);
2404             submit_bh(WRITE, bh);
2405             nr++;
2406         }
2407         put_bh(bh);
2408         bh = next;
2409     } while(bh != head);
2410     goto done;
2411 }
2412
2413
2414 static int reiserfs_readpage (struct file *f, struct page * page)
2415 {
2416     return block_read_full_page (page, reiserfs_get_block);
2417 }
2418
2419
2420 static int reiserfs_writepage (struct page * page, struct writeback_control *wbc)
2421 {
2422     struct inode *inode = page->mapping->host ;
2423     reiserfs_wait_on_write_block(inode->i_sb) ;
2424     return reiserfs_write_full_page(page, wbc) ;
2425 }
2426
2427 static int reiserfs_prepare_write(struct file *f, struct page *page,
2428                            unsigned from, unsigned to) {
2429     struct inode *inode = page->mapping->host ;
2430     int ret;
2431     int old_ref = 0;
2432
2433     reiserfs_wait_on_write_block(inode->i_sb) ;
2434     fix_tail_page_for_writing(page) ;
2435     if (reiserfs_transaction_running(inode->i_sb)) {
2436         struct reiserfs_transaction_handle *th;
2437         th = (struct reiserfs_transaction_handle *)current->journal_info;
2438         BUG_ON (!th->t_refcount);
2439         BUG_ON (!th->t_trans_id);
2440         old_ref = th->t_refcount;
2441         th->t_refcount++;
2442     }
2443
2444     ret = block_prepare_write(page, from, to, reiserfs_get_block) ;
2445     if (ret && reiserfs_transaction_running(inode->i_sb)) {
2446         struct reiserfs_transaction_handle *th = current->journal_info;
2447         /* this gets a little ugly.  If reiserfs_get_block returned an
2448          * error and left a transacstion running, we've got to close it,
2449          * and we've got to free handle if it was a persistent transaction.
2450          *
2451          * But, if we had nested into an existing transaction, we need
2452          * to just drop the ref count on the handle.
2453          *
2454          * If old_ref == 0, the transaction is from reiserfs_get_block,
2455          * and it was a persistent trans.  Otherwise, it was nested above.
2456          */
2457         if (th->t_refcount > old_ref) {
2458             if (old_ref)
2459                 th->t_refcount--;
2460             else {
2461                 int err;
2462                 reiserfs_write_lock(inode->i_sb);
2463                 err = reiserfs_end_persistent_transaction(th);
2464                 reiserfs_write_unlock(inode->i_sb);
2465                 if (err)
2466                     ret = err;
2467             }
2468         }
2469     }
2470     return ret;
2471
2472 }
2473
2474
2475 static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) {
2476   return generic_block_bmap(as, block, reiserfs_bmap) ;
2477 }
2478
2479 static int reiserfs_commit_write(struct file *f, struct page *page,
2480                                  unsigned from, unsigned to) {
2481     struct inode *inode = page->mapping->host ;
2482     loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2483     int ret = 0;
2484     int update_sd = 0;
2485     struct reiserfs_transaction_handle *th = NULL;
2486
2487     reiserfs_wait_on_write_block(inode->i_sb) ;
2488     if (reiserfs_transaction_running(inode->i_sb)) {
2489         th = current->journal_info;
2490     }
2491     reiserfs_commit_page(inode, page, from, to);
2492
2493     /* generic_commit_write does this for us, but does not update the
2494     ** transaction tracking stuff when the size changes.  So, we have
2495     ** to do the i_size updates here.
2496     */
2497     if (pos > inode->i_size) {
2498         struct reiserfs_transaction_handle myth ;
2499         reiserfs_write_lock(inode->i_sb);
2500         /* If the file have grown beyond the border where it
2501            can have a tail, unmark it as needing a tail
2502            packing */
2503         if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) ||
2504              (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
2505             REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
2506
2507         ret = journal_begin(&myth, inode->i_sb, 1) ;
2508         if (ret) {
2509             reiserfs_write_unlock(inode->i_sb);
2510             goto journal_error;
2511         }
2512         reiserfs_update_inode_transaction(inode) ;
2513         inode->i_size = pos ;
2514         reiserfs_update_sd(&myth, inode) ;
2515         update_sd = 1;
2516         ret = journal_end(&myth, inode->i_sb, 1) ;
2517         reiserfs_write_unlock(inode->i_sb);
2518         if (ret)
2519             goto journal_error;
2520     }
2521     if (th) {
2522         reiserfs_write_lock(inode->i_sb);
2523         if (!update_sd)
2524             reiserfs_update_sd(th, inode) ;
2525         ret = reiserfs_end_persistent_transaction(th);
2526         reiserfs_write_unlock(inode->i_sb);
2527         if (ret)
2528             goto out;
2529     }
2530
2531     /* we test for O_SYNC here so we can commit the transaction
2532     ** for any packed tails the file might have had
2533     */
2534     if (f && (f->f_flags & O_SYNC)) {
2535         reiserfs_write_lock(inode->i_sb);
2536         ret = reiserfs_commit_for_inode(inode) ;
2537         reiserfs_write_unlock(inode->i_sb);
2538     }
2539 out:
2540     return ret ;
2541
2542 journal_error:
2543     if (th) {
2544         reiserfs_write_lock(inode->i_sb);
2545         if (!update_sd)
2546             reiserfs_update_sd(th, inode) ;
2547         ret = reiserfs_end_persistent_transaction(th);
2548         reiserfs_write_unlock(inode->i_sb);
2549     }
2550
2551     return ret;
2552 }
2553
2554 void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode )
2555 {
2556         if( reiserfs_attrs( inode -> i_sb ) ) {
2557                 if( sd_attrs & REISERFS_SYNC_FL )
2558                         inode -> i_flags |= S_SYNC;
2559                 else
2560                         inode -> i_flags &= ~S_SYNC;
2561                 if( sd_attrs & REISERFS_IMMUTABLE_FL )
2562                         inode -> i_flags |= S_IMMUTABLE;
2563                 else
2564                         inode -> i_flags &= ~S_IMMUTABLE;
2565                 if( sd_attrs & REISERFS_IUNLINK_FL )
2566                         inode -> i_flags |= S_IUNLINK;
2567                 else
2568                         inode -> i_flags &= ~S_IUNLINK;
2569                 if( sd_attrs & REISERFS_BARRIER_FL )
2570                         inode -> i_flags |= S_BARRIER;
2571                 else
2572                         inode -> i_flags &= ~S_BARRIER;
2573                 if( sd_attrs & REISERFS_APPEND_FL )
2574                         inode -> i_flags |= S_APPEND;
2575                 else
2576                         inode -> i_flags &= ~S_APPEND;
2577                 if( sd_attrs & REISERFS_NOATIME_FL )
2578                         inode -> i_flags |= S_NOATIME;
2579                 else
2580                         inode -> i_flags &= ~S_NOATIME;
2581                 if( sd_attrs & REISERFS_NOTAIL_FL )
2582                         REISERFS_I(inode)->i_flags |= i_nopack_mask;
2583                 else
2584                         REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
2585         }
2586 }
2587
2588 void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs )
2589 {
2590         if( reiserfs_attrs( inode -> i_sb ) ) {
2591                 if( inode -> i_flags & S_IMMUTABLE )
2592                         *sd_attrs |= REISERFS_IMMUTABLE_FL;
2593                 else
2594                         *sd_attrs &= ~REISERFS_IMMUTABLE_FL;
2595                 if( inode -> i_flags & S_IUNLINK )
2596                         *sd_attrs |= REISERFS_IUNLINK_FL;
2597                 else
2598                         *sd_attrs &= ~REISERFS_IUNLINK_FL;
2599                 if( inode -> i_flags & S_BARRIER )
2600                         *sd_attrs |= REISERFS_BARRIER_FL;
2601                 else
2602                         *sd_attrs &= ~REISERFS_BARRIER_FL;
2603                 if( inode -> i_flags & S_SYNC )
2604                         *sd_attrs |= REISERFS_SYNC_FL;
2605                 else
2606                         *sd_attrs &= ~REISERFS_SYNC_FL;
2607                 if( inode -> i_flags & S_NOATIME )
2608                         *sd_attrs |= REISERFS_NOATIME_FL;
2609                 else
2610                         *sd_attrs &= ~REISERFS_NOATIME_FL;
2611                 if( REISERFS_I(inode)->i_flags & i_nopack_mask )
2612                         *sd_attrs |= REISERFS_NOTAIL_FL;
2613                 else
2614                         *sd_attrs &= ~REISERFS_NOTAIL_FL;
2615         }
2616 }
2617
2618 /* decide if this buffer needs to stay around for data logging or ordered
2619 ** write purposes
2620 */
2621 static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
2622 {
2623     int ret = 1 ;
2624     struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
2625
2626     spin_lock(&j->j_dirty_buffers_lock) ;
2627     if (!buffer_mapped(bh)) {
2628         goto free_jh;
2629     }
2630     /* the page is locked, and the only places that log a data buffer
2631      * also lock the page.
2632      */
2633     if (reiserfs_file_data_log(inode)) {
2634         /*
2635          * very conservative, leave the buffer pinned if
2636          * anyone might need it.
2637          */
2638         if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
2639             ret = 0 ;
2640         }
2641     } else
2642     if (buffer_dirty(bh) || buffer_locked(bh)) {
2643         struct reiserfs_journal_list *jl;
2644         struct reiserfs_jh *jh = bh->b_private;
2645
2646         /* why is this safe?
2647          * reiserfs_setattr updates i_size in the on disk
2648          * stat data before allowing vmtruncate to be called.
2649          *
2650          * If buffer was put onto the ordered list for this
2651          * transaction, we know for sure either this transaction
2652          * or an older one already has updated i_size on disk,
2653          * and this ordered data won't be referenced in the file
2654          * if we crash.
2655          *
2656          * if the buffer was put onto the ordered list for an older
2657          * transaction, we need to leave it around
2658          */
2659         if (jh && (jl = jh->jl) && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
2660             ret = 0;
2661     }
2662 free_jh:
2663     if (ret && bh->b_private) {
2664         reiserfs_free_jh(bh);
2665     }
2666     spin_unlock(&j->j_dirty_buffers_lock) ;
2667     return ret ;
2668 }
2669
2670 /* clm -- taken from fs/buffer.c:block_invalidate_page */
2671 static int reiserfs_invalidatepage(struct page *page, unsigned long offset)
2672 {
2673     struct buffer_head *head, *bh, *next;
2674     struct inode *inode = page->mapping->host;
2675     unsigned int curr_off = 0;
2676     int ret = 1;
2677
2678     BUG_ON(!PageLocked(page));
2679
2680     if (offset == 0)
2681         ClearPageChecked(page);
2682
2683     if (!page_has_buffers(page))
2684         goto out;
2685
2686     head = page_buffers(page);
2687     bh = head;
2688     do {
2689         unsigned int next_off = curr_off + bh->b_size;
2690         next = bh->b_this_page;
2691
2692         /*
2693          * is this block fully invalidated?
2694          */
2695         if (offset <= curr_off) {
2696             if (invalidatepage_can_drop(inode, bh))
2697                 reiserfs_unmap_buffer(bh);
2698             else
2699                 ret = 0;
2700         }
2701         curr_off = next_off;
2702         bh = next;
2703     } while (bh != head);
2704
2705     /*
2706      * We release buffers only if the entire page is being invalidated.
2707      * The get_block cached value has been unconditionally invalidated,
2708      * so real IO is not possible anymore.
2709      */
2710     if (!offset && ret)
2711         ret = try_to_release_page(page, 0);
2712 out:
2713     return ret;
2714 }
2715
2716 static int reiserfs_set_page_dirty(struct page *page) {
2717     struct inode *inode = page->mapping->host;
2718     if (reiserfs_file_data_log(inode)) {
2719         SetPageChecked(page);
2720         return __set_page_dirty_nobuffers(page);
2721     }
2722     return __set_page_dirty_buffers(page);
2723 }
2724
2725 /*
2726  * Returns 1 if the page's buffers were dropped.  The page is locked.
2727  *
2728  * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
2729  * in the buffers at page_buffers(page).
2730  *
2731  * even in -o notail mode, we can't be sure an old mount without -o notail
2732  * didn't create files with tails.
2733  */
2734 static int reiserfs_releasepage(struct page *page, int unused_gfp_flags)
2735 {
2736     struct inode *inode = page->mapping->host ;
2737     struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
2738     struct buffer_head *head ;
2739     struct buffer_head *bh ;
2740     int ret = 1 ;
2741
2742     WARN_ON(PageChecked(page));
2743     spin_lock(&j->j_dirty_buffers_lock) ;
2744     head = page_buffers(page) ;
2745     bh = head ;
2746     do {
2747         if (bh->b_private) {
2748             if (!buffer_dirty(bh) && !buffer_locked(bh)) {
2749                 reiserfs_free_jh(bh);
2750             } else {
2751                 ret = 0 ;
2752                 break ;
2753             }
2754         }
2755         bh = bh->b_this_page ;
2756     } while (bh != head) ;
2757     if (ret)
2758         ret = try_to_free_buffers(page) ;
2759     spin_unlock(&j->j_dirty_buffers_lock) ;
2760     return ret ;
2761 }
2762
2763 /* We thank Mingming Cao for helping us understand in great detail what
2764    to do in this section of the code. */
2765 static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
2766                 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
2767 {
2768     struct file *file = iocb->ki_filp;
2769     struct inode *inode = file->f_mapping->host;
2770
2771     return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
2772                         offset, nr_segs, reiserfs_get_blocks_direct_io, NULL);
2773 }
2774
2775 int reiserfs_setattr_flags(struct inode *inode, unsigned int flags)
2776 {
2777         unsigned int oldflags, newflags;
2778
2779         oldflags = REISERFS_I(inode)->i_flags;
2780         newflags = oldflags & ~(REISERFS_IMMUTABLE_FL |
2781                 REISERFS_IUNLINK_FL | REISERFS_BARRIER_FL);
2782         if (flags & ATTR_FLAG_IMMUTABLE)
2783                 newflags |= REISERFS_IMMUTABLE_FL;
2784         if (flags & ATTR_FLAG_IUNLINK)
2785                 newflags |= REISERFS_IUNLINK_FL;
2786         if (flags & ATTR_FLAG_BARRIER)
2787                 newflags |= REISERFS_BARRIER_FL;
2788
2789         if (oldflags ^ newflags) {
2790                 REISERFS_I(inode)->i_flags = newflags;
2791                 inode->i_ctime = CURRENT_TIME;
2792         }
2793         return 0;
2794 }
2795
2796 int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) {
2797     struct inode *inode = dentry->d_inode ;
2798     int error ;
2799     unsigned int ia_valid = attr->ia_valid;
2800     reiserfs_write_lock(inode->i_sb);
2801     if (attr->ia_valid & ATTR_SIZE) {
2802         /* version 2 items will be caught by the s_maxbytes check
2803         ** done for us in vmtruncate
2804         */
2805         if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
2806             attr->ia_size > MAX_NON_LFS) {
2807             error = -EFBIG ;
2808             goto out;
2809         }
2810         /* fill in hole pointers in the expanding truncate case. */
2811         if (attr->ia_size > inode->i_size) {
2812             error = generic_cont_expand(inode, attr->ia_size) ;
2813             if (REISERFS_I(inode)->i_prealloc_count > 0) {
2814                 int err;
2815                 struct reiserfs_transaction_handle th ;
2816                 /* we're changing at most 2 bitmaps, inode + super */
2817                 err = journal_begin(&th, inode->i_sb, 4) ;
2818                 if (!err) {
2819                     reiserfs_discard_prealloc (&th, inode);
2820                     err = journal_end(&th, inode->i_sb, 4) ;
2821                 }
2822                 if (err)
2823                     error = err;
2824             }
2825             if (error)
2826                 goto out;
2827         }
2828     }
2829
2830     if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) ||
2831          ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) &&
2832         (get_inode_sd_version (inode) == STAT_DATA_V1)) {
2833                 /* stat data of format v3.5 has 16 bit uid and gid */
2834             error = -EINVAL;
2835             goto out;
2836         }
2837
2838     error = inode_change_ok(inode, attr) ;
2839
2840     if (!error && attr->ia_valid & ATTR_ATTR_FLAG)
2841         reiserfs_setattr_flags(inode, attr->ia_attr_flags);
2842
2843     if (!error) {
2844         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2845             (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid) ||
2846             (ia_valid & ATTR_XID && attr->ia_xid != inode->i_xid)) {
2847                 error = reiserfs_chown_xattrs (inode, attr);
2848
2849                 if (!error) {
2850                     struct reiserfs_transaction_handle th;
2851
2852                     /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
2853                     journal_begin(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2);
2854                     error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2855                     if (error) {
2856                         journal_end(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2);
2857                         goto out;
2858                     }
2859                     /* Update corresponding info in inode so that everything is in
2860                      * one transaction */
2861                     if (attr->ia_valid & ATTR_UID)
2862                         inode->i_uid = attr->ia_uid;
2863                     if (attr->ia_valid & ATTR_GID)
2864                         inode->i_gid = attr->ia_gid;
2865                     if (attr->ia_valid & ATTR_XID)
2866                         inode->i_xid = attr->ia_xid;
2867                     mark_inode_dirty(inode);
2868                     journal_end(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2);
2869                 }
2870         }
2871         if (!error)
2872             error = inode_setattr(inode, attr) ;
2873     }
2874
2875
2876     if (!error && reiserfs_posixacl (inode->i_sb)) {
2877         if (attr->ia_valid & ATTR_MODE)
2878             error = reiserfs_acl_chmod (inode);
2879     }
2880
2881 out:
2882     reiserfs_write_unlock(inode->i_sb);
2883     return error ;
2884 }
2885
2886
2887
2888 struct address_space_operations reiserfs_address_space_operations = {
2889     .writepage = reiserfs_writepage,
2890     .readpage = reiserfs_readpage,
2891     .readpages = reiserfs_readpages,
2892     .releasepage = reiserfs_releasepage,
2893     .invalidatepage = reiserfs_invalidatepage,
2894     .sync_page = block_sync_page,
2895     .prepare_write = reiserfs_prepare_write,
2896     .commit_write = reiserfs_commit_write,
2897     .bmap = reiserfs_aop_bmap,
2898     .direct_IO = reiserfs_direct_IO,
2899     .set_page_dirty = reiserfs_set_page_dirty,
2900 } ;