fs/reiserfs/inode.c

   1 /*
   2  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
   3  */
   4
   5 #include <linux/config.h>
   6 #include <linux/time.h>
   7 #include <linux/fs.h>
   8 #include <linux/reiserfs_fs.h>
   9 #include <linux/reiserfs_acl.h>
  10 #include <linux/reiserfs_xattr.h>
  11 #include <linux/smp_lock.h>
  12 #include <linux/pagemap.h>
  13 #include <linux/highmem.h>
  14 #include <asm/uaccess.h>
  15 #include <asm/unaligned.h>
  16 #include <linux/buffer_head.h>
  17 #include <linux/mpage.h>
  18 #include <linux/writeback.h>
  19 #include <linux/quotaops.h>
  20
   21 extern int reiserfs_default_io_size; /* default io size defined in super.c */
   22
   23 /* args for the create parameter of reiserfs_get_block */
      /* The non-zero values are distinct bits: callers OR them into 'create'
         (e.g. create | GET_BLOCK_NO_DANGLE) and the code tests them with '&'. */
   24 #define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */
   25 #define GET_BLOCK_CREATE 1    /* add anything you need to find block */
   26 #define GET_BLOCK_NO_HOLE 2   /* return -ENOENT for file holes */
   27 #define GET_BLOCK_READ_DIRECT 4  /* read the tail if indirect item not found */
   28 #define GET_BLOCK_NO_ISEM     8 /* i_sem is not held, don't preallocate */
   29 #define GET_BLOCK_NO_DANGLE   16 /* don't leave any transactions running */
   30
      /* Forward declarations so these functions can be called before their
         definitions. */
   31 static int reiserfs_get_block (struct inode * inode, sector_t block,
   32                                struct buffer_head * bh_result, int create);
   33 static int reiserfs_commit_write(struct file *f, struct page *page,
   34                                  unsigned from, unsigned to);
  35
  36 void reiserfs_delete_inode (struct inode * inode)
  37 {
  38     int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2;
  39     struct reiserfs_transaction_handle th ;
  40
  41     reiserfs_write_lock(inode->i_sb);
  42
  43     DQUOT_FREE_INODE(inode);
  44     /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
  45     if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
  46         down (&inode->i_sem);
  47
  48         reiserfs_delete_xattrs (inode);
  49
  50         journal_begin(&th, inode->i_sb, jbegin_count) ;
  51         reiserfs_update_inode_transaction(inode) ;
  52
  53         reiserfs_delete_object (&th, inode);
  54
  55         journal_end(&th, inode->i_sb, jbegin_count) ;
  56
  57         up (&inode->i_sem);
  58
  59         /* all items of file are deleted, so we can remove "save" link */
  60         remove_save_link (inode, 0/* not truncate */);
  61     } else {
  62         /* no object items are in the tree */
  63         ;
  64     }
  65     clear_inode (inode); /* note this must go after the journal_end to prevent deadlock */
  66     inode->i_blocks = 0;
  67     reiserfs_write_unlock(inode->i_sb);
  68 }
  69
  70 static void _make_cpu_key (struct cpu_key * key, int version, __u32 dirid, __u32 objectid,
  71                loff_t offset, int type, int length )
  72 {
  73     key->version = version;
  74
  75     key->on_disk_key.k_dir_id = dirid;
  76     key->on_disk_key.k_objectid = objectid;
  77     set_cpu_key_k_offset (key, offset);
  78     set_cpu_key_k_type (key, type);
  79     key->key_length = length;
  80 }
  81
  82
  83 /* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set
  84    offset and type of key */
  85 void make_cpu_key (struct cpu_key * key, struct inode * inode, loff_t offset,
  86               int type, int length )
  87 {
  88   _make_cpu_key (key, get_inode_item_key_version (inode), le32_to_cpu (INODE_PKEY (inode)->k_dir_id),
  89                  le32_to_cpu (INODE_PKEY (inode)->k_objectid),
  90                  offset, type, length);
  91 }
  92
  93
  94 //
  95 // when key is 0, do not set version and short key
  96 //
  97 inline void make_le_item_head (struct item_head * ih, const struct cpu_key * key,
  98                                int version,
  99                                loff_t offset, int type, int length,
 100                                int entry_count/*or ih_free_space*/)
 101 {
 102     if (key) {
 103         ih->ih_key.k_dir_id = cpu_to_le32 (key->on_disk_key.k_dir_id);
 104         ih->ih_key.k_objectid = cpu_to_le32 (key->on_disk_key.k_objectid);
 105     }
 106     put_ih_version( ih, version );
 107     set_le_ih_k_offset (ih, offset);
 108     set_le_ih_k_type (ih, type);
 109     put_ih_item_len( ih, length );
 110     /*    set_ih_free_space (ih, 0);*/
 111     // for directory items it is entry count, for directs and stat
 112     // datas - 0xffff, for indirects - 0
 113     put_ih_entry_count( ih, entry_count );
 114 }
 115
 116 //
 117 // FIXME: we might cache recently accessed indirect item
 118
 119 // Ugh.  Not too eager for that....
 120 //  I cut the code until such time as I see a convincing argument (benchmark).
 121 // I don't want a bloated inode struct..., and I don't like code complexity....
 122
 123 /* cutting the code is fine, since it really isn't in use yet and is easy
 124 ** to add back in.  But, Vladimir has a really good idea here.  Think
 125 ** about what happens for reading a file.  For each page,
 126 ** The VFS layer calls reiserfs_readpage, who searches the tree to find
 127 ** an indirect item.  This indirect item has X number of pointers, where
 128 ** X is a big number if we've done the block allocation right.  But,
 129 ** we only use one or two of these pointers during each call to readpage,
 130 ** needlessly researching again later on.
 131 **
 132 ** The size of the cache could be dynamic based on the size of the file.
 133 **
  134 ** I'd also like to see us cache the location of the stat data item, since
 135 ** we are needlessly researching for that frequently.
 136 **
 137 ** --chris
 138 */
 139
 140 /* If this page has a file tail in it, and
 141 ** it was read in by get_block_create_0, the page data is valid,
 142 ** but tail is still sitting in a direct item, and we can't write to
 143 ** it.  So, look through this page, and check all the mapped buffers
 144 ** to make sure they have valid block numbers.  Any that don't need
 145 ** to be unmapped, so that block_prepare_write will correctly call
 146 ** reiserfs_get_block to convert the tail into an unformatted node
 147 */
 148 static inline void fix_tail_page_for_writing(struct page *page) {
 149     struct buffer_head *head, *next, *bh ;
 150
 151     if (page && page_has_buffers(page)) {
 152         head = page_buffers(page) ;
 153         bh = head ;
 154         do {
 155             next = bh->b_this_page ;
 156             if (buffer_mapped(bh) && bh->b_blocknr == 0) {
 157                 reiserfs_unmap_buffer(bh) ;
 158             }
 159             bh = next ;
 160         } while (bh != head) ;
 161     }
 162 }
 163
 164 /* reiserfs_get_block does not need to allocate a block only if it has been
 165    done already or non-hole position has been found in the indirect item */
 166 static inline int allocation_needed (int retval, b_blocknr_t allocated,
 167                                      struct item_head * ih,
 168                                      __u32 * item, int pos_in_item)
 169 {
 170   if (allocated)
 171          return 0;
 172   if (retval == POSITION_FOUND && is_indirect_le_ih (ih) &&
 173       get_block_num(item, pos_in_item))
 174          return 0;
 175   return 1;
 176 }
 177
 178 static inline int indirect_item_found (int retval, struct item_head * ih)
 179 {
 180   return (retval == POSITION_FOUND) && is_indirect_le_ih (ih);
 181 }
 182
 183
 184 static inline void set_block_dev_mapped (struct buffer_head * bh,
 185                                          b_blocknr_t block, struct inode * inode)
 186 {
 187         map_bh(bh, inode->i_sb, block);
 188 }
 189
 190
 191 //
 192 // files which were created in the earlier version can not be longer,
 193 // than 2 gb
 194 //
 195 static int file_capable (struct inode * inode, long block)
 196 {
 197     if (get_inode_item_key_version (inode) != KEY_FORMAT_3_5 || // it is new file.
 198         block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb
 199         return 1;
 200
 201     return 0;
 202 }
 203
 204 /*static*/ void restart_transaction(struct reiserfs_transaction_handle *th,
 205                                 struct inode *inode, struct path *path) {
 206   struct super_block *s = th->t_super ;
 207   int len = th->t_blocks_allocated ;
 208
 209   /* we cannot restart while nested */
 210   if (th->t_refcount > 1) {
 211       return  ;
 212   }
 213   pathrelse(path) ;
 214   reiserfs_update_sd(th, inode) ;
 215   journal_end(th, s, len) ;
 216   journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6) ;
 217   reiserfs_update_inode_transaction(inode) ;
 218 }
 219
  220 // Called by reiserfs_get_block when GET_BLOCK_CREATE is not set, and by
  221 // reiserfs_bmap. Looks up the 'block'-th logical block of the file and,
  222 // when an indirect item holds a non-zero pointer for it, maps bh_result
  223 // to that block number.
  224
  225 // When the block is stored in a direct item (a file tail) it returns -ENOENT
  226 // unless GET_BLOCK_READ_DIRECT is set in 'args'; with that flag it copies the tail into bh_result's page and maps the buffer to block 0.
  227
      // For a hole (nothing found, or a zero pointer) it returns 0 without
      // mapping bh_result, or -ENOENT when GET_BLOCK_NO_HOLE is set and the
      // page is not up to date.
  228 static int _get_block_create_0 (struct inode * inode, long block,
  229                                  struct buffer_head * bh_result,
  230                                  int args)
  231 {
  232     INITIALIZE_PATH (path);
  233     struct cpu_key key;
  234     struct buffer_head * bh;
  235     struct item_head * ih, tmp_ih;
  236     int fs_gen ;
  237     int blocknr;
          /* p stays NULL until the page is kmapped further down; the early
             exit paths test it to decide whether a kunmap is owed */
  238     char * p = NULL;
  239     int chars;
  240     int ret ;
  241     int done = 0 ;
  242     unsigned long offset ;
  243
  244     // prepare the key to look for the 'block'-th block of file
  245     make_cpu_key (&key, inode,
  246                   (loff_t)block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 3);
  247
  248 research:
  249     if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND) {
  250         pathrelse (&path);
  251         if (p)
  252             kunmap(bh_result->b_page) ;
  253         // We do not return -ENOENT if there is a hole but page is uptodate, because it means
  254         // That there is some MMAPED data associated with it that is yet to be written to disk.
  255         if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) {
  256             return -ENOENT ;
  257         }
  258         return 0 ;
  259     }
  260
  261     //
  262     bh = get_last_bh (&path);
  263     ih = get_ih (&path);
          /* indirect item: the slot at pos_in_item holds the block number,
             or 0 for a hole */
  264     if (is_indirect_le_ih (ih)) {
  265         __u32 * ind_item = (__u32 *)B_I_PITEM (bh, ih);
  266
  267         /* FIXME: here we could cache indirect item or part of it in
  268            the inode to avoid search_by_key in case of subsequent
  269            access to file */
  270         blocknr = get_block_num(ind_item, path.pos_in_item) ;
  271         ret = 0 ;
  272         if (blocknr) {
  273             map_bh(bh_result, inode->i_sb, blocknr);
                  /* this was the last pointer of the indirect item */
  274             if (path.pos_in_item == ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
  275                 set_buffer_boundary(bh_result);
  276             }
  277         } else
  278             // We do not return -ENOENT if there is a hole but page is uptodate, because it means
  279             // That there is some MMAPED data associated with it that is yet to  be written to disk.
  280             if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) {
  281             ret = -ENOENT ;
  282             }
  283
  284         pathrelse (&path);
  285         if (p)
  286             kunmap(bh_result->b_page) ;
  287         return ret ;
  288     }
  289
  290     // requested data are in direct item(s)
  291     if (!(args & GET_BLOCK_READ_DIRECT)) {
  292         // we are called by bmap. FIXME: we can not map block of file
  293         // when it is stored in direct item(s)
  294         pathrelse (&path);
  295         if (p)
  296             kunmap(bh_result->b_page) ;
  297         return -ENOENT;
  298     }
  299
  300     /* if we've got a direct item, and the buffer or page was uptodate,
  301     ** we don't want to pull data off disk again.  skip to the
  302     ** end, where we map the buffer and return
  303     */
  304     if (buffer_uptodate(bh_result)) {
  305         goto finished ;
  306     } else
  307         /*
  308         ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
  309         ** pages without any buffers.  If the page is up to date, we don't want
  310         ** read old data off disk.  Set the up to date bit on the buffer instead
  311         ** and jump to the end
  312         */
  313             if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
  314                 set_buffer_uptodate(bh_result);
  315                 goto finished ;
  316     }
  317
  318     // read file tail into part of page
          /* key offsets are 1-based: subtract 1, then mask to get the byte
             offset of this block inside its page */
  319     offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1) ;
          /* remember generation and item head so a move of the item while
             kmap sleeps can be detected */
  320     fs_gen = get_generation(inode->i_sb) ;
  321     copy_item_head (&tmp_ih, ih);
  322
  323     /* we only want to kmap if we are reading the tail into the page.
  324     ** this is not the common case, so we don't kmap until we are
  325     ** sure we need to.  But, this means the item might move if
  326     ** kmap schedules
  327     */
  328     if (!p) {
  329         p = (char *)kmap(bh_result->b_page) ;
  330         if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
  331             goto research;
  332         }
  333     }
  334     p += offset ;
          /* zero the whole block first; the tail is then copied over its start */
  335     memset (p, 0, inode->i_sb->s_blocksize);
          /* copy the tail piece by piece, one direct item per iteration */
  336     do {
  337         if (!is_direct_le_ih (ih)) {
  338             BUG ();
  339         }
  340         /* make sure we don't read more bytes than actually exist in
  341         ** the file.  This can happen in odd cases where i_size isn't
  342         ** correct, and when direct item padding results in a few
  343         ** extra bytes at the end of the direct item
  344         */
  345         if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
  346             break ;
              /* item runs past i_size: copy only up to i_size and stop after
                 this piece */
  347         if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
  348             chars = inode->i_size - (le_ih_k_offset(ih) - 1) - path.pos_in_item;
  349             done = 1 ;
  350         } else {
  351             chars = ih_item_len(ih) - path.pos_in_item;
  352         }
  353         memcpy (p, B_I_PITEM (bh, ih) + path.pos_in_item, chars);
  354
  355         if (done)
  356             break ;
  357
  358         p += chars;
  359
  360         if (PATH_LAST_POSITION (&path) != (B_NR_ITEMS (bh) - 1))
  361             // we done, if read direct item is not the last item of
  362             // node FIXME: we could try to check right delimiting key
  363             // to see whether direct item continues in the right
  364             // neighbor or rely on i_size
  365             break;
  366
  367         // update key to look for the next piece
  368         set_cpu_key_k_offset (&key, cpu_key_k_offset (&key) + chars);
  369         if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND)
  370             // we read something from tail, even if now we got IO_ERROR
  371             break;
  372         bh = get_last_bh (&path);
  373         ih = get_ih (&path);
  374     } while (1);
  375
  376     flush_dcache_page(bh_result->b_page) ;
  377     kunmap(bh_result->b_page) ;
  378
          /* also reached directly, without any kmap, when the buffer or the
             page was already up to date */
  379 finished:
  380     pathrelse (&path);
  381     /* this buffer has valid data, but isn't valid for io.  mapping it to
  382      * block #0 tells the rest of reiserfs it just has a tail in it
  383      */
  384     map_bh(bh_result, inode->i_sb, 0);
  385     set_buffer_uptodate (bh_result);
  386     return 0;
  387 }
 388
 389
 390 // this is called to create file map. So, _get_block_create_0 will not
 391 // read direct item
 392 int reiserfs_bmap (struct inode * inode, sector_t block,
 393                    struct buffer_head * bh_result, int create)
 394 {
 395     if (!file_capable (inode, block))
 396         return -EFBIG;
 397
 398     reiserfs_write_lock(inode->i_sb);
 399     /* do not read the direct item */
 400     _get_block_create_0 (inode, block, bh_result, 0) ;
 401     reiserfs_write_unlock(inode->i_sb);
 402     return 0;
 403 }
 404
 405 /* special version of get_block that is only used by grab_tail_page right
 406 ** now.  It is sent to block_prepare_write, and when you try to get a
 407 ** block past the end of the file (or a block from a hole) it returns
 408 ** -ENOENT instead of a valid buffer.  block_prepare_write expects to
 409 ** be able to do i/o on the buffers returned, unless an error value
 410 ** is also returned.
 411 **
 412 ** So, this allows block_prepare_write to be used for reading a single block
 413 ** in a page.  Where it does not produce a valid page for holes, or past the
 414 ** end of the file.  This turns out to be exactly what we need for reading
 415 ** tails for conversion.
 416 **
 417 ** The point of the wrapper is forcing a certain value for create, even
 418 ** though the VFS layer is calling this function with create==1.  If you
 419 ** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
 420 ** don't use this function.
 421 */
 422 static int reiserfs_get_block_create_0 (struct inode * inode, sector_t block,
 423                         struct buffer_head * bh_result, int create) {
 424     return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE) ;
 425 }
 426
 427 /* This is special helper for reiserfs_get_block in case we are executing
 428    direct_IO request. */
 429 static int reiserfs_get_blocks_direct_io(struct inode *inode,
 430                                          sector_t iblock,
 431                                          unsigned long max_blocks,
 432                                          struct buffer_head *bh_result,
 433                                          int create)
 434 {
 435     int ret ;
 436
 437     bh_result->b_page = NULL;
 438
 439     /* We set the b_size before reiserfs_get_block call since it is
 440        referenced in convert_tail_for_hole() that may be called from
 441        reiserfs_get_block() */
 442     bh_result->b_size = (1 << inode->i_blkbits);
 443
 444     ret = reiserfs_get_block(inode, iblock, bh_result,
 445                              create | GET_BLOCK_NO_DANGLE) ;
 446
 447     /* don't allow direct io onto tail pages */
 448     if (ret == 0 && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
 449         /* make sure future calls to the direct io funcs for this offset
 450         ** in the file fail by unmapping the buffer
 451         */
 452         clear_buffer_mapped(bh_result);
 453         ret = -EINVAL ;
 454     }
 455     /* Possible unpacked tail. Flush the data before pages have
 456        disappeared */
 457     if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
 458         lock_kernel();
 459         reiserfs_commit_for_inode(inode);
 460         REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
 461         unlock_kernel();
 462     }
 463     return ret ;
 464 }
 465
 466
 467 /*
 468 ** helper function for when reiserfs_get_block is called for a hole
 469 ** but the file tail is still in a direct item
 470 ** bh_result is the buffer head for the hole
 471 ** tail_offset is the offset of the start of the tail in the file
 472 **
 473 ** This calls prepare_write, which will start a new transaction
 474 ** you should not be in a transaction, or have any paths held when you
 475 ** call this.
 476 */
 477 static int convert_tail_for_hole(struct inode *inode,
 478                                  struct buffer_head *bh_result,
 479                                  loff_t tail_offset) {
 480     unsigned long index ;
 481     unsigned long tail_end ;
 482     unsigned long tail_start ;
 483     struct page * tail_page ;
 484     struct page * hole_page = bh_result->b_page ;
 485     int retval = 0 ;
 486
 487     if ((tail_offset & (bh_result->b_size - 1)) != 1)
 488         return -EIO ;
 489
 490     /* always try to read until the end of the block */
 491     tail_start = tail_offset & (PAGE_CACHE_SIZE - 1) ;
 492     tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ;
 493
 494     index = tail_offset >> PAGE_CACHE_SHIFT ;
 495     /* hole_page can be zero in case of direct_io, we are sure
 496        that we cannot get here if we write with O_DIRECT into
 497        tail page */
 498     if (!hole_page || index != hole_page->index) {
 499         tail_page = grab_cache_page(inode->i_mapping, index) ;
 500         retval = -ENOMEM;
 501         if (!tail_page) {
 502             goto out ;
 503         }
 504     } else {
 505         tail_page = hole_page ;
 506     }
 507
 508     /* we don't have to make sure the conversion did not happen while
 509     ** we were locking the page because anyone that could convert
 510     ** must first take i_sem.
 511     **
 512     ** We must fix the tail page for writing because it might have buffers
 513     ** that are mapped, but have a block number of 0.  This indicates tail
 514     ** data that has been read directly into the page, and block_prepare_write
 515     ** won't trigger a get_block in this case.
 516     */
 517     fix_tail_page_for_writing(tail_page) ;
 518     retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
 519     if (retval)
 520         goto unlock ;
 521
 522     /* tail conversion might change the data in the page */
 523     flush_dcache_page(tail_page) ;
 524
 525     retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end) ;
 526
 527 unlock:
 528     if (tail_page != hole_page) {
 529         unlock_page(tail_page) ;
 530         page_cache_release(tail_page) ;
 531     }
 532 out:
 533     return retval ;
 534 }
 535
 536 static inline int _allocate_block(struct reiserfs_transaction_handle *th,
 537                            long block,
 538                            struct inode *inode,
 539                            b_blocknr_t *allocated_block_nr,
 540                            struct path * path,
 541                            int flags) {
 542
 543 #ifdef REISERFS_PREALLOCATE
 544     if (!(flags & GET_BLOCK_NO_ISEM)) {
 545         return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, path, block);
 546     }
 547 #endif
 548     return reiserfs_new_unf_blocknrs (th, inode, allocated_block_nr, path, block);
 549 }
 550
 551 int reiserfs_get_block (struct inode * inode, sector_t block,
 552                         struct buffer_head * bh_result, int create)
 553 {
 554     int repeat, retval;
 555     b_blocknr_t allocated_block_nr = 0;// b_blocknr_t is (unsigned) 32 bit int
 556     INITIALIZE_PATH(path);
 557     int pos_in_item;
 558     struct cpu_key key;
 559     struct buffer_head * bh, * unbh = NULL;
 560     struct item_head * ih, tmp_ih;
 561     __u32 * item;
 562     int done;
 563     int fs_gen;
 564     struct reiserfs_transaction_handle *th = NULL;
 565     /* space reserved in transaction batch:
 566         . 3 balancings in direct->indirect conversion
 567         . 1 block involved into reiserfs_update_sd()
 568        XXX in practically impossible worst case direct2indirect()
 569        can incur (much) more that 3 balancings. */
 570     int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1;
 571     int version;
 572     int dangle = 1;
 573     loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ;
 574
 575                                 /* bad.... */
 576     reiserfs_write_lock(inode->i_sb);
 577     version = get_inode_item_key_version (inode);
 578
 579     if (block < 0) {
 580         reiserfs_write_unlock(inode->i_sb);
 581         return -EIO;
 582     }
 583
 584     if (!file_capable (inode, block)) {
 585         reiserfs_write_unlock(inode->i_sb);
 586         return -EFBIG;
 587     }
 588
 589     /* if !create, we aren't changing the FS, so we don't need to
 590     ** log anything, so we don't need to start a transaction
 591     */
 592     if (!(create & GET_BLOCK_CREATE)) {
 593         int ret ;
 594         /* find number of block-th logical block of the file */
 595         ret = _get_block_create_0 (inode, block, bh_result,
 596                                    create | GET_BLOCK_READ_DIRECT) ;
 597         reiserfs_write_unlock(inode->i_sb);
 598         return ret;
 599     }
 600     /*
 601      * if we're already in a transaction, make sure to close
 602      * any new transactions we start in this func
 603      */
 604     if ((create & GET_BLOCK_NO_DANGLE) ||
 605         reiserfs_transaction_running(inode->i_sb))
 606         dangle = 0;
 607
 608     /* If file is of such a size, that it might have a tail and tails are enabled
 609     ** we should mark it as possibly needing tail packing on close
 610     */
 611     if ( (have_large_tails (inode->i_sb) && inode->i_size < i_block_size (inode)*4) ||
 612          (have_small_tails (inode->i_sb) && inode->i_size < i_block_size(inode)) )
 613         REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
 614
 615     /* set the key of the first byte in the 'block'-th block of file */
 616     make_cpu_key (&key, inode, new_offset,
 617                   TYPE_ANY, 3/*key length*/);
 618     if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
 619 start_trans:
 620         th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
 621         if (!th) {
 622             retval = -ENOMEM;
 623             goto failure;
 624         }
 625         reiserfs_update_inode_transaction(inode) ;
 626     }
 627  research:
 628
 629     retval = search_for_position_by_key (inode->i_sb, &key, &path);
 630     if (retval == IO_ERROR) {
 631         retval = -EIO;
 632         goto failure;
 633     }
 634
 635     bh = get_last_bh (&path);
 636     ih = get_ih (&path);
 637     item = get_item (&path);
 638     pos_in_item = path.pos_in_item;
 639
 640     fs_gen = get_generation (inode->i_sb);
 641     copy_item_head (&tmp_ih, ih);
 642
 643     if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) {
 644         /* we have to allocate block for the unformatted node */
 645         if (!th) {
 646             pathrelse(&path) ;
 647             goto start_trans;
 648         }
 649
 650         repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create);
 651
 652         if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
 653             /* restart the transaction to give the journal a chance to free
 654             ** some blocks.  releases the path, so we have to go back to
 655             ** research if we succeed on the second try
 656             */
 657             SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
 658             restart_transaction(th, inode, &path) ;
 659             repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create);
 660
 661             if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
 662                 goto research ;
 663             }
 664             if (repeat == QUOTA_EXCEEDED)
 665                 retval = -EDQUOT;
 666             else
 667                 retval = -ENOSPC;
 668             goto failure;
 669         }
 670
 671         if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
 672             goto research;
 673         }
 674     }
 675
 676     if (indirect_item_found (retval, ih)) {
 677         b_blocknr_t unfm_ptr;
 678         /* 'block'-th block is in the file already (there is
 679            corresponding cell in some indirect item). But it may be
 680            zero unformatted node pointer (hole) */
 681         unfm_ptr = get_block_num (item, pos_in_item);
 682         if (unfm_ptr == 0) {
 683             /* use allocated block to plug the hole */
 684             reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
 685             if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
 686                 reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
 687                 goto research;
 688             }
 689             set_buffer_new(bh_result);
 690             if (buffer_dirty(bh_result) && reiserfs_data_ordered(inode->i_sb))
 691                 reiserfs_add_ordered_list(inode, bh_result);
 692             put_block_num(item, pos_in_item, allocated_block_nr) ;
 693             unfm_ptr = allocated_block_nr;
 694             journal_mark_dirty (th, inode->i_sb, bh);
 695             reiserfs_update_sd(th, inode) ;
 696         }
 697         set_block_dev_mapped(bh_result, unfm_ptr, inode);
 698         pathrelse (&path);
 699         if (!dangle && th)
 700             reiserfs_end_persistent_transaction(th);
 701
 702         reiserfs_write_unlock(inode->i_sb);
 703
 704         /* the item was found, so new blocks were not added to the file
 705         ** there is no need to make sure the inode is updated with this
 706         ** transaction
 707         */
 708         return 0;
 709     }
 710
 711     if (!th) {
 712         pathrelse(&path) ;
 713         goto start_trans;
 714     }
 715
 716     /* desired position is not found or is in the direct item. We have
 717        to append file with holes up to 'block'-th block converting
 718        direct items to indirect one if necessary */
 719     done = 0;
 720     do {
 721         if (is_statdata_le_ih (ih)) {
 722             __u32 unp = 0;
 723             struct cpu_key tmp_key;
 724
 725             /* indirect item has to be inserted */
 726             make_le_item_head (&tmp_ih, &key, version, 1, TYPE_INDIRECT,
 727                                UNFM_P_SIZE, 0/* free_space */);
 728
 729             if (cpu_key_k_offset (&key) == 1) {
 730                 /* we are going to add 'block'-th block to the file. Use
 731                    allocated block for that */
 732                 unp = cpu_to_le32 (allocated_block_nr);
 733                 set_block_dev_mapped (bh_result, allocated_block_nr, inode);
 734                 set_buffer_new(bh_result);
 735                 done = 1;
 736             }
 737             tmp_key = key; // ;)
 738             set_cpu_key_k_offset (&tmp_key, 1);
 739             PATH_LAST_POSITION(&path) ++;
 740
 741             retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, inode, (char *)&unp);
 742             if (retval) {
 743                 reiserfs_free_block (th, inode, allocated_block_nr, 1);
 744                 goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
 745             }
 746             //mark_tail_converted (inode);
 747         } else if (is_direct_le_ih (ih)) {
 748             /* direct item has to be converted */
 749             loff_t tail_offset;
 750
 751             tail_offset = ((le_ih_k_offset (ih) - 1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
 752             if (tail_offset == cpu_key_k_offset (&key)) {
 753                 /* direct item we just found fits into block we have
 754                    to map. Convert it into unformatted node: use
 755                    bh_result for the conversion */
 756                 set_block_dev_mapped (bh_result, allocated_block_nr, inode);
 757                 unbh = bh_result;
 758                 done = 1;
 759             } else {
 760                 /* we have to padd file tail stored in direct item(s)
 761                    up to block size and convert it to unformatted
 762                    node. FIXME: this should also get into page cache */
 763
 764                 pathrelse(&path) ;
 765                 /*
 766                  * ugly, but we can only end the transaction if
 767                  * we aren't nested
 768                  */
 769                 if (th->t_refcount == 1) {
 770                     reiserfs_end_persistent_transaction(th);
 771                     th = NULL;
 772                 }
 773
 774                 retval = convert_tail_for_hole(inode, bh_result, tail_offset) ;
 775                 if (retval) {
 776                     if ( retval != -ENOSPC )
 777                         reiserfs_warning (inode->i_sb, "clm-6004: convert tail failed inode %lu, error %d", inode->i_ino, retval) ;
 778                     if (allocated_block_nr) {
 779                         /* the bitmap, the super, and the stat data == 3 */
 780                         if (!th)
 781                             th = reiserfs_persistent_transaction(inode->i_sb,3);
 782                         if (th)
 783                             reiserfs_free_block (th,inode,allocated_block_nr,1);
 784                     }
 785                     goto failure ;
 786                 }
 787                 goto research ;
 788             }
 789             retval = direct2indirect (th, inode, &path, unbh, tail_offset);
 790             if (retval) {
 791                 reiserfs_unmap_buffer(unbh);
 792                 reiserfs_free_block (th, inode, allocated_block_nr, 1);
 793                 goto failure;
 794             }
 795             /* it is important the set_buffer_uptodate is done after
 796             ** the direct2indirect.  The buffer might contain valid
 797             ** data newer than the data on disk (read by readpage, changed,
 798             ** and then sent here by writepage).  direct2indirect needs
 799             ** to know if unbh was already up to date, so it can decide
 800             ** if the data in unbh needs to be replaced with data from
 801             ** the disk
 802             */
 803             set_buffer_uptodate (unbh);
 804
 805             /* unbh->b_page == NULL in case of DIRECT_IO request, this means
 806                buffer will disappear shortly, so it should not be added to
 807              */
 808             if ( unbh->b_page ) {
 809                 /* we've converted the tail, so we must
 810                 ** flush unbh before the transaction commits
 811                 */
 812                 reiserfs_add_tail_list(inode, unbh) ;
 813
 814                 /* mark it dirty now to prevent commit_write from adding
 815                 ** this buffer to the inode's dirty buffer list
 816                 */
 817                 /*
 818                  * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
 819                  * It's still atomic, but it sets the page dirty too,
 820                  * which makes it eligible for writeback at any time by the
 821                  * VM (which was also the case with __mark_buffer_dirty())
 822                  */
 823                 mark_buffer_dirty(unbh) ;
 824             }
 825         } else {
 826             /* append indirect item with holes if needed, when appending
 827                pointer to 'block'-th block use block, which is already
 828                allocated */
 829             struct cpu_key tmp_key;
 830             unp_t unf_single=0; // We use this in case we need to allocate only
 831                                 // one block which is a fastpath
 832             unp_t *un;
 833             __u64 max_to_insert=MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE;
 834             __u64 blocks_needed;
 835
 836             RFALSE( pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
 837                     "vs-804: invalid position for append");
 838             /* indirect item has to be appended, set up key of that position */
 839             make_cpu_key (&tmp_key, inode,
 840                           le_key_k_offset (version, &(ih->ih_key)) + op_bytes_number (ih, inode->i_sb->s_blocksize),
 841                           //pos_in_item * inode->i_sb->s_blocksize,
 842                           TYPE_INDIRECT, 3);// key type is unimportant
 843
 844             blocks_needed = 1 + ((cpu_key_k_offset (&key) - cpu_key_k_offset (&tmp_key)) >> inode->i_sb->s_blocksize_bits);
 845             RFALSE( blocks_needed < 0, "green-805: invalid offset");
 846
 847             if ( blocks_needed == 1 ) {
 848                 un = &unf_single;
 849             } else {
 850                 un=kmalloc( min(blocks_needed,max_to_insert)*UNFM_P_SIZE,
 851                             GFP_ATOMIC); // We need to avoid scheduling.
 852                 if ( !un) {
 853                     un = &unf_single;
 854                     blocks_needed = 1;
 855                     max_to_insert = 0;
 856                 } else
 857                     memset(un, 0, UNFM_P_SIZE * min(blocks_needed,max_to_insert));
 858             }
 859             if ( blocks_needed <= max_to_insert) {
 860                 /* we are going to add target block to the file. Use allocated
 861                    block for that */
 862                 un[blocks_needed-1] = cpu_to_le32 (allocated_block_nr);
 863                 set_block_dev_mapped (bh_result, allocated_block_nr, inode);
 864                 set_buffer_new(bh_result);
 865                 done = 1;
 866             } else {
 867                 /* paste hole to the indirect item */
 868                 /* If kmalloc failed, max_to_insert becomes zero and it means we
 869                    only have space for one block */
 870                 blocks_needed=max_to_insert?max_to_insert:1;
 871             }
 872             retval = reiserfs_paste_into_item (th, &path, &tmp_key, inode, (char *)un, UNFM_P_SIZE * blocks_needed);
 873
 874             if (blocks_needed != 1)
 875                 kfree(un);
 876
 877             if (retval) {
 878                 reiserfs_free_block (th, inode, allocated_block_nr, 1);
 879                 goto failure;
 880             }
 881             if (!done) {
 882                 /* We need to mark new file size in case this function will be
 883                    interrupted/aborted later on. And we may do this only for
 884                    holes. */
 885                 inode->i_size += inode->i_sb->s_blocksize * blocks_needed;
 886             }
 887         }
 888
 889         if (done == 1)
 890             break;
 891
 892         /* this loop could log more blocks than we had originally asked
 893         ** for.  So, we have to allow the transaction to end if it is
 894         ** too big or too full.  Update the inode so things are
 895         ** consistent if we crash before the function returns
 896         **
 897         ** release the path so that anybody waiting on the path before
 898         ** ending their transaction will be able to continue.
 899         */
 900         if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
 901           restart_transaction(th, inode, &path) ;
 902         }
 903         /* inserting indirect pointers for a hole can take a
 904         ** long time.  reschedule if needed
 905         */
 906         cond_resched();
 907
 908         retval = search_for_position_by_key (inode->i_sb, &key, &path);
 909         if (retval == IO_ERROR) {
 910             retval = -EIO;
 911             goto failure;
 912         }
 913         if (retval == POSITION_FOUND) {
 914             reiserfs_warning (inode->i_sb, "vs-825: reiserfs_get_block: "
 915                               "%K should not be found", &key);
 916             retval = -EEXIST;
 917             if (allocated_block_nr)
 918                 reiserfs_free_block (th, inode, allocated_block_nr, 1);
 919             pathrelse(&path) ;
 920             goto failure;
 921         }
 922         bh = get_last_bh (&path);
 923         ih = get_ih (&path);
 924         item = get_item (&path);
 925         pos_in_item = path.pos_in_item;
 926     } while (1);
 927
 928
 929     retval = 0;
 930
 931  failure:
 932     if (th && !dangle) {
 933       reiserfs_update_sd(th, inode) ;
 934       reiserfs_end_persistent_transaction(th);
 935     }
 936     reiserfs_write_unlock(inode->i_sb);
 937     reiserfs_check_path(&path) ;
 938     return retval;
 939 }
 940
 941 static int
 942 reiserfs_readpages(struct file *file, struct address_space *mapping,
 943                 struct list_head *pages, unsigned nr_pages)
 944 {
 945     return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
 946 }
 947
 948 /* Compute real number of used bytes by file
 949  * Following three functions can go away when we'll have enough space in stat item
 950  */
 951 static int real_space_diff(struct inode *inode, int sd_size)
 952 {
 953     int bytes;
 954     loff_t blocksize = inode->i_sb->s_blocksize ;
 955
 956     if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
 957         return sd_size ;
 958
 959     /* End of file is also in full block with indirect reference, so round
 960     ** up to the next block.
 961     **
 962     ** there is just no way to know if the tail is actually packed
 963     ** on the file, so we have to assume it isn't.  When we pack the
 964     ** tail, we add 4 bytes to pretend there really is an unformatted
 965     ** node pointer
 966     */
 967     bytes = ((inode->i_size + (blocksize-1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + sd_size;
 968     return bytes ;
 969 }
 970
 971 static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
 972                                         int sd_size)
 973 {
 974     if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
 975         return inode->i_size + (loff_t)(real_space_diff(inode, sd_size)) ;
 976     }
 977     return ((loff_t)real_space_diff(inode, sd_size)) + (((loff_t)blocks) << 9);
 978 }
 979
 980 /* Compute number of blocks used by file in ReiserFS counting */
 981 static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
 982 {
 983     loff_t bytes = inode_get_bytes(inode) ;
 984     loff_t real_space = real_space_diff(inode, sd_size) ;
 985
 986     /* keeps fsck and non-quota versions of reiserfs happy */
 987     if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
 988         bytes += (loff_t)511 ;
 989     }
 990
 991     /* files from before the quota patch might i_blocks such that
 992     ** bytes < real_space.  Deal with that here to prevent it from
 993     ** going negative.
 994     */
 995     if (bytes < real_space)
 996         return 0 ;
 997     return (bytes - real_space) >> 9;
 998 }
 999
1000 //
1001 // BAD: new directories have stat data of new type and all other items
1002 // of old type. Version stored in the inode says about body items, so
1003 // in update_stat_data we can not rely on inode, but have to check
1004 // item version directly
1005 //
1006
1007 // called by read_locked_inode
1008 static void init_inode (struct inode * inode, struct path * path)
1009 {
1010     struct buffer_head * bh;
1011     struct item_head * ih;
1012     __u32 rdev;
1013     //int version = ITEM_VERSION_1;
1014
1015     bh = PATH_PLAST_BUFFER (path);
1016     ih = PATH_PITEM_HEAD (path);
1017
1018
1019     copy_key (INODE_PKEY (inode), &(ih->ih_key));
1020     inode->i_blksize = reiserfs_default_io_size;
1021
1022     INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list ));
1023     REISERFS_I(inode)->i_flags = 0;
1024     REISERFS_I(inode)->i_prealloc_block = 0;
1025     REISERFS_I(inode)->i_prealloc_count = 0;
1026     REISERFS_I(inode)->i_trans_id = 0;
1027     REISERFS_I(inode)->i_jl = NULL;
1028     REISERFS_I(inode)->i_acl_access = NULL;
1029     REISERFS_I(inode)->i_acl_default = NULL;
1030     init_rwsem (&REISERFS_I(inode)->xattr_sem);
1031
1032     if (stat_data_v1 (ih)) {
1033         struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih);
1034         unsigned long blocks;
1035
1036         set_inode_item_key_version (inode, KEY_FORMAT_3_5);
1037         set_inode_sd_version (inode, STAT_DATA_V1);
1038         inode->i_mode  = sd_v1_mode(sd);
1039         inode->i_nlink = sd_v1_nlink(sd);
1040         inode->i_uid   = sd_v1_uid(sd);
1041         inode->i_gid   = sd_v1_gid(sd);
1042         inode->i_size  = sd_v1_size(sd);
1043         inode->i_atime.tv_sec = sd_v1_atime(sd);
1044         inode->i_mtime.tv_sec = sd_v1_mtime(sd);
1045         inode->i_ctime.tv_sec = sd_v1_ctime(sd);
1046         inode->i_atime.tv_nsec = 0;
1047         inode->i_ctime.tv_nsec = 0;
1048         inode->i_mtime.tv_nsec = 0;
1049
1050         inode->i_blocks = sd_v1_blocks(sd);
1051         inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id);
1052         blocks = (inode->i_size + 511) >> 9;
1053         blocks = _ROUND_UP (blocks, inode->i_sb->s_blocksize >> 9);
1054         if (inode->i_blocks > blocks) {
1055             // there was a bug in <=3.5.23 when i_blocks could take negative
1056             // values. Starting from 3.5.17 this value could even be stored in
1057             // stat data. For such files we set i_blocks based on file
1058             // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be
1059             // only updated if file's inode will ever change
1060             inode->i_blocks = blocks;
1061         }
1062
1063         rdev = sd_v1_rdev(sd);
1064         REISERFS_I(inode)->i_first_direct_byte = sd_v1_first_direct_byte(sd);
1065         /* an early bug in the quota code can give us an odd number for the
1066         ** block count.  This is incorrect, fix it here.
1067         */
1068         if (inode->i_blocks & 1) {
1069             inode->i_blocks++ ;
1070         }
1071         inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
1072                                                   SD_V1_SIZE));
1073         /* nopack is initially zero for v1 objects. For v2 objects,
1074            nopack is initialised from sd_attrs */
1075         REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
1076     } else {
1077         // new stat data found, but object may have old items
1078         // (directories and symlinks)
1079         struct stat_data * sd = (struct stat_data *)B_I_PITEM (bh, ih);
1080
1081         inode->i_mode   = sd_v2_mode(sd);
1082         inode->i_nlink  = sd_v2_nlink(sd);
1083         inode->i_uid    = sd_v2_uid(sd);
1084         inode->i_size   = sd_v2_size(sd);
1085         inode->i_gid    = sd_v2_gid(sd);
1086         inode->i_mtime.tv_sec  = sd_v2_mtime(sd);
1087         inode->i_atime.tv_sec = sd_v2_atime(sd);
1088         inode->i_ctime.tv_sec  = sd_v2_ctime(sd);
1089         inode->i_ctime.tv_nsec = 0;
1090         inode->i_mtime.tv_nsec = 0;
1091         inode->i_atime.tv_nsec = 0;
1092         inode->i_blocks = sd_v2_blocks(sd);
1093         rdev            = sd_v2_rdev(sd);
1094         if( S_ISCHR( inode -> i_mode ) || S_ISBLK( inode -> i_mode ) )
1095             inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id);
1096         else
1097             inode->i_generation = sd_v2_generation(sd);
1098
1099         if (S_ISDIR (inode->i_mode) || S_ISLNK (inode->i_mode))
1100             set_inode_item_key_version (inode, KEY_FORMAT_3_5);
1101         else
1102             set_inode_item_key_version (inode, KEY_FORMAT_3_6);
1103         REISERFS_I(inode)->i_first_direct_byte = 0;
1104         set_inode_sd_version (inode, STAT_DATA_V2);
1105         inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
1106                                                   SD_V2_SIZE));
1107         /* read persistent inode attributes from sd and initalise
1108            generic inode flags from them */
1109         REISERFS_I(inode)->i_attrs = sd_v2_attrs( sd );
1110         sd_attrs_to_i_attrs( sd_v2_attrs( sd ), inode );
1111     }
1112
1113     pathrelse (path);
1114     if (S_ISREG (inode->i_mode)) {
1115         inode->i_op = &reiserfs_file_inode_operations;
1116         inode->i_fop = &reiserfs_file_operations;
1117         inode->i_mapping->a_ops = &reiserfs_address_space_operations ;
1118     } else if (S_ISDIR (inode->i_mode)) {
1119         inode->i_op = &reiserfs_dir_inode_operations;
1120         inode->i_fop = &reiserfs_dir_operations;
1121     } else if (S_ISLNK (inode->i_mode)) {
1122         inode->i_op = &reiserfs_symlink_inode_operations;
1123         inode->i_mapping->a_ops = &reiserfs_address_space_operations;
1124     } else {
1125         inode->i_blocks = 0;
1126         inode->i_op = &reiserfs_special_inode_operations;
1127         init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
1128     }
1129 }
1130
1131
1132 // update new stat data with inode fields
1133 static void inode2sd (void * sd, struct inode * inode, loff_t size)
1134 {
1135     struct stat_data * sd_v2 = (struct stat_data *)sd;
1136     __u16 flags;
1137
1138     set_sd_v2_mode(sd_v2, inode->i_mode );
1139     set_sd_v2_nlink(sd_v2, inode->i_nlink );
1140     set_sd_v2_uid(sd_v2, inode->i_uid );
1141     set_sd_v2_size(sd_v2, size );
1142     set_sd_v2_gid(sd_v2, inode->i_gid );
1143     set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec );
1144     set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec );
1145     set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec );
1146     set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
1147     if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1148         set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
1149     else
1150         set_sd_v2_generation(sd_v2, inode->i_generation);
1151     flags = REISERFS_I(inode)->i_attrs;
1152     i_attrs_to_sd_attrs( inode, &flags );
1153     set_sd_v2_attrs( sd_v2, flags );
1154 }
1155
1156
1157 // used to copy inode's fields to old stat data
1158 static void inode2sd_v1 (void * sd, struct inode * inode, loff_t size)
1159 {
1160     struct stat_data_v1 * sd_v1 = (struct stat_data_v1 *)sd;
1161
1162     set_sd_v1_mode(sd_v1, inode->i_mode );
1163     set_sd_v1_uid(sd_v1, inode->i_uid );
1164     set_sd_v1_gid(sd_v1, inode->i_gid );
1165     set_sd_v1_nlink(sd_v1, inode->i_nlink );
1166     set_sd_v1_size(sd_v1, size );
1167     set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec );
1168     set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec );
1169     set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec );
1170
1171     if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1172         set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
1173     else
1174         set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
1175
1176     // Sigh. i_first_direct_byte is back
1177     set_sd_v1_first_direct_byte(sd_v1, REISERFS_I(inode)->i_first_direct_byte);
1178 }
1179
1180
1181 /* NOTE, you must prepare the buffer head before sending it here,
1182 ** and then log it after the call
1183 */
1184 static void update_stat_data (struct path * path, struct inode * inode,
1185                               loff_t size)
1186 {
1187     struct buffer_head * bh;
1188     struct item_head * ih;
1189
1190     bh = PATH_PLAST_BUFFER (path);
1191     ih = PATH_PITEM_HEAD (path);
1192
1193     if (!is_statdata_le_ih (ih))
1194         reiserfs_panic (inode->i_sb, "vs-13065: update_stat_data: key %k, found item %h",
1195                         INODE_PKEY (inode), ih);
1196
1197     if (stat_data_v1 (ih)) {
1198         // path points to old stat data
1199         inode2sd_v1 (B_I_PITEM (bh, ih), inode, size);
1200     } else {
1201         inode2sd (B_I_PITEM (bh, ih), inode, size);
1202     }
1203
1204     return;
1205 }
1206
1207
1208 void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th,
1209                               struct inode * inode, loff_t size)
1210 {
1211     struct cpu_key key;
1212     INITIALIZE_PATH(path);
1213     struct buffer_head *bh ;
1214     int fs_gen ;
1215     struct item_head *ih, tmp_ih ;
1216     int retval;
1217
1218     make_cpu_key (&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);//key type is unimportant
1219
1220     for(;;) {
1221         int pos;
1222         /* look for the object's stat data */
1223         retval = search_item (inode->i_sb, &key, &path);
1224         if (retval == IO_ERROR) {
1225             reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: "
1226                               "i/o failure occurred trying to update %K stat data",
1227                               &key);
1228             return;
1229         }
1230         if (retval == ITEM_NOT_FOUND) {
1231             pos = PATH_LAST_POSITION (&path);
1232             pathrelse(&path) ;
1233             if (inode->i_nlink == 0) {
1234                 /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found");*/
1235                 return;
1236             }
1237             reiserfs_warning (inode->i_sb, "vs-13060: reiserfs_update_sd: "
1238                               "stat data of object %k (nlink == %d) not found (pos %d)",
1239                               INODE_PKEY (inode), inode->i_nlink, pos);
1240             reiserfs_check_path(&path) ;
1241             return;
1242         }
1243
1244         /* sigh, prepare_for_journal might schedule.  When it schedules the
1245         ** FS might change.  We have to detect that, and loop back to the
1246         ** search if the stat data item has moved
1247         */
1248         bh = get_last_bh(&path) ;
1249         ih = get_ih(&path) ;
1250         copy_item_head (&tmp_ih, ih);
1251         fs_gen = get_generation (inode->i_sb);
1252         reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
1253         if (fs_changed (fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) {
1254             reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
1255             continue ;  /* Stat_data item has been moved after scheduling. */
1256         }
1257         break;
1258     }
1259     update_stat_data (&path, inode, size);
1260     journal_mark_dirty(th, th->t_super, bh) ;
1261     pathrelse (&path);
1262     return;
1263 }
1264
1265 /* reiserfs_read_locked_inode is called to read the inode off disk, and it
1266 ** does a make_bad_inode when things go wrong.  But, we need to make sure
1267 ** and clear the key in the private portion of the inode, otherwise a
1268 ** corresponding iput might try to delete whatever object the inode last
1269 ** represented.
1270 */
1271 static void reiserfs_make_bad_inode(struct inode *inode) {
1272     memset(INODE_PKEY(inode), 0, KEY_SIZE);
1273     make_bad_inode(inode);
1274 }
1275
1276 //
1277 // initially this function was derived from minix or ext2's analog and
1278 // evolved as the prototype did
1279 //
1280
1281 int reiserfs_init_locked_inode (struct inode * inode, void *p)
1282 {
1283     struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p ;
1284     inode->i_ino = args->objectid;
1285     INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
1286     return 0;
1287 }
1288
1289 /* looks for stat data in the tree, and fills up the fields of in-core
1290    inode stat data fields */
1291 void reiserfs_read_locked_inode (struct inode * inode, struct reiserfs_iget_args *args)
1292 {
1293     INITIALIZE_PATH (path_to_sd);
1294     struct cpu_key key;
1295     unsigned long dirino;
1296     int retval;
1297
1298     dirino = args->dirid ;
1299
1300     /* set version 1, version 2 could be used too, because stat data
1301        key is the same in both versions */
1302     key.version = KEY_FORMAT_3_5;
1303     key.on_disk_key.k_dir_id = dirino;
1304     key.on_disk_key.k_objectid = inode->i_ino;
1305     key.on_disk_key.u.k_offset_v1.k_offset = SD_OFFSET;
1306     key.on_disk_key.u.k_offset_v1.k_uniqueness = SD_UNIQUENESS;
1307
1308     /* look for the object's stat data */
1309     retval = search_item (inode->i_sb, &key, &path_to_sd);
1310     if (retval == IO_ERROR) {
1311         reiserfs_warning (inode->i_sb, "vs-13070: reiserfs_read_locked_inode: "
1312                           "i/o failure occurred trying to find stat data of %K",
1313                           &key);
1314         reiserfs_make_bad_inode(inode) ;
1315         return;
1316     }
1317     if (retval != ITEM_FOUND) {
1318         /* a stale NFS handle can trigger this without it being an error */
1319         pathrelse (&path_to_sd);
1320         reiserfs_make_bad_inode(inode) ;
1321         inode->i_nlink = 0;
1322         return;
1323     }
1324
1325     init_inode (inode, &path_to_sd);
1326
1327     /* It is possible that knfsd is trying to access inode of a file
1328        that is being removed from the disk by some other thread. As we
1329        update sd on unlink all that is required is to check for nlink
1330        here. This bug was first found by Sizif when debugging
1331        SquidNG/Butterfly, forgotten, and found again after Philippe
1332        Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
1333
1334        More logical fix would require changes in fs/inode.c:iput() to
1335        remove inode from hash-table _after_ fs cleaned disk stuff up and
1336        in iget() to return NULL if I_FREEING inode is found in
1337        hash-table. */
1338     /* Currently there is one place where it's ok to meet inode with
1339        nlink==0: processing of open-unlinked and half-truncated files
1340        during mount (fs/reiserfs/super.c:finish_unfinished()). */
1341     if( ( inode -> i_nlink == 0 ) &&
1342         ! REISERFS_SB(inode -> i_sb) -> s_is_unlinked_ok ) {
1343             reiserfs_warning (inode->i_sb,
1344                               "vs-13075: reiserfs_read_locked_inode: "
1345                               "dead inode read from disk %K. "
1346                               "This is likely to be race with knfsd. Ignore",
1347                               &key );
1348             reiserfs_make_bad_inode( inode );
1349     }
1350
1351     reiserfs_check_path(&path_to_sd) ; /* init inode should be relsing */
1352
1353 }
1354
1355 /**
1356  * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
1357  *
1358  * @inode:    inode from hash table to check
1359  * @opaque:   "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
1360  *
1361  * This function is called by iget5_locked() to distinguish reiserfs inodes
1362  * having the same inode numbers. Such inodes can only exist due to some
1363  * error condition. One of them should be bad. Inodes with identical
1364  * inode numbers (objectids) are distinguished by parent directory ids.
1365  *
1366  */
1367 int reiserfs_find_actor( struct inode *inode, void *opaque )
1368 {
1369     struct reiserfs_iget_args *args;
1370
1371     args = opaque;
1372     /* args is already in CPU order */
1373     return (inode->i_ino == args->objectid) &&
1374         (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
1375 }
1376
1377 struct inode * reiserfs_iget (struct super_block * s, const struct cpu_key * key)
1378 {
1379     struct inode * inode;
1380     struct reiserfs_iget_args args ;
1381
1382     args.objectid = key->on_disk_key.k_objectid ;
1383     args.dirid = key->on_disk_key.k_dir_id ;
1384     inode = iget5_locked (s, key->on_disk_key.k_objectid,
1385                    reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args));
1386     if (!inode)
1387         return ERR_PTR(-ENOMEM) ;
1388
1389     if (inode->i_state & I_NEW) {
1390         reiserfs_read_locked_inode(inode, &args);
1391         unlock_new_inode(inode);
1392     }
1393
1394     if (comp_short_keys (INODE_PKEY (inode), key) || is_bad_inode (inode)) {
1395         /* either due to i/o error or a stale NFS handle */
1396         iput (inode);
1397         inode = NULL;
1398     }
1399     return inode;
1400 }
1401
1402 struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp)
1403 {
1404     __u32 *data = vobjp;
1405     struct cpu_key key ;
1406     struct dentry *result;
1407     struct inode *inode;
1408
1409     key.on_disk_key.k_objectid = data[0] ;
1410     key.on_disk_key.k_dir_id = data[1] ;
1411     inode = reiserfs_iget(sb, &key) ;
1412     if (inode && !IS_ERR(inode) && data[2] != 0 &&
1413         data[2] != inode->i_generation) {
1414             iput(inode) ;
1415             inode = NULL ;
1416     }
1417     if (!inode)
1418             inode = ERR_PTR(-ESTALE);
1419     if (IS_ERR(inode))
1420             return ERR_PTR(PTR_ERR(inode));
1421     result = d_alloc_anon(inode);
1422     if (!result) {
1423             iput(inode);
1424             return ERR_PTR(-ENOMEM);
1425     }
1426     return result;
1427 }
1428
1429 struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 *data,
1430                                      int len, int fhtype,
1431                                   int (*acceptable)(void *contect, struct dentry *de),
1432                                   void *context) {
1433     __u32 obj[3], parent[3];
1434
1435     /* fhtype happens to reflect the number of u32s encoded.
1436      * due to a bug in earlier code, fhtype might indicate there
1437      * are more u32s then actually fitted.
1438      * so if fhtype seems to be more than len, reduce fhtype.
1439      * Valid types are:
1440      *   2 - objectid + dir_id - legacy support
1441      *   3 - objectid + dir_id + generation
1442      *   4 - objectid + dir_id + objectid and dirid of parent - legacy
1443      *   5 - objectid + dir_id + generation + objectid and dirid of parent
1444      *   6 - as above plus generation of directory
1445      * 6 does not fit in NFSv2 handles
1446      */
1447     if (fhtype > len) {
1448             if (fhtype != 6 || len != 5)
1449                     reiserfs_warning (sb, "nfsd/reiserfs, fhtype=%d, len=%d - odd",
1450                            fhtype, len);
1451             fhtype = 5;
1452     }
1453
1454     obj[0] = data[0];
1455     obj[1] = data[1];
1456     if (fhtype == 3 || fhtype >= 5)
1457             obj[2] = data[2];
1458     else    obj[2] = 0; /* generation number */
1459
1460     if (fhtype >= 4) {
1461             parent[0] = data[fhtype>=5?3:2] ;
1462             parent[1] = data[fhtype>=5?4:3] ;
1463             if (fhtype == 6)
1464                     parent[2] = data[5];
1465             else    parent[2] = 0;
1466     }
1467     return sb->s_export_op->find_exported_dentry(sb, obj, fhtype < 4 ? NULL : parent,
1468                                acceptable, context);
1469 }
1470
1471 int reiserfs_encode_fh(struct dentry *dentry, __u32 *data, int *lenp, int need_parent) {
1472     struct inode *inode = dentry->d_inode ;
1473     int maxlen = *lenp;
1474
1475     if (maxlen < 3)
1476         return 255 ;
1477
1478     data[0] = inode->i_ino ;
1479     data[1] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ;
1480     data[2] = inode->i_generation ;
1481     *lenp = 3 ;
1482     /* no room for directory info? return what we've stored so far */
1483     if (maxlen < 5 || ! need_parent)
1484         return 3 ;
1485
1486     spin_lock(&dentry->d_lock);
1487     inode = dentry->d_parent->d_inode ;
1488     data[3] = inode->i_ino ;
1489     data[4] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ;
1490     *lenp = 5 ;
1491     if (maxlen >= 6) {
1492             data[5] = inode->i_generation ;
1493             *lenp = 6 ;
1494     }
1495     spin_unlock(&dentry->d_lock);
1496     return *lenp ;
1497 }
1498
1499
1500 /* looks for stat data, then copies fields to it, marks the buffer
1501    containing stat data as dirty */
1502 /* reiserfs inodes are never really dirty, since the dirty inode call
1503 ** always logs them.  This call allows the VFS inode marking routines
1504 ** to properly mark inodes for datasync and such, but only actually
1505 ** does something when called for a synchronous update.
1506 */
1507 void reiserfs_write_inode (struct inode * inode, int do_sync) {
1508     struct reiserfs_transaction_handle th ;
1509     int jbegin_count = 1 ;
1510
1511     if (inode->i_sb->s_flags & MS_RDONLY) {
1512         reiserfs_warning (inode->i_sb,
1513                           "clm-6005: writing inode %lu on readonly FS",
1514                           inode->i_ino) ;
1515         return ;
1516     }
1517     /* memory pressure can sometimes initiate write_inode calls with sync == 1,
1518     ** these cases are just when the system needs ram, not when the
1519     ** inode needs to reach disk for safety, and they can safely be
1520     ** ignored because the altered inode has already been logged.
1521     */
1522     if (do_sync && !(current->flags & PF_MEMALLOC)) {
1523         reiserfs_write_lock(inode->i_sb);
1524         journal_begin(&th, inode->i_sb, jbegin_count) ;
1525         reiserfs_update_sd (&th, inode);
1526         journal_end_sync(&th, inode->i_sb, jbegin_count) ;
1527         reiserfs_write_unlock(inode->i_sb);
1528     }
1529 }
1530
/*
 * Update the stat data of @inode within the caller's transaction @th.
 * Always returns 0.
 *
 * FIXME: no need any more. right?
 */
int reiserfs_sync_inode(struct reiserfs_transaction_handle *th, struct inode *inode)
{
    reiserfs_update_sd(th, inode);
    return 0;
}
1539
1540
1541 /* stat data of new object is inserted already, this inserts the item
1542    containing "." and ".." entries */
1543 static int reiserfs_new_directory (struct reiserfs_transaction_handle *th,
1544                                    struct inode *inode,
1545                                    struct item_head * ih, struct path * path,
1546                                    struct inode * dir)
1547 {
1548     struct super_block * sb = th->t_super;
1549     char empty_dir [EMPTY_DIR_SIZE];
1550     char * body = empty_dir;
1551     struct cpu_key key;
1552     int retval;
1553
1554     _make_cpu_key (&key, KEY_FORMAT_3_5, le32_to_cpu (ih->ih_key.k_dir_id),
1555                    le32_to_cpu (ih->ih_key.k_objectid), DOT_OFFSET, TYPE_DIRENTRY, 3/*key length*/);
1556
1557     /* compose item head for new item. Directories consist of items of
1558        old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
1559        is done by reiserfs_new_inode */
1560     if (old_format_only (sb)) {
1561         make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
1562
1563         make_empty_dir_item_v1 (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid,
1564                                 INODE_PKEY (dir)->k_dir_id,
1565                                 INODE_PKEY (dir)->k_objectid );
1566     } else {
1567         make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
1568
1569         make_empty_dir_item (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid,
1570                                 INODE_PKEY (dir)->k_dir_id,
1571                                 INODE_PKEY (dir)->k_objectid );
1572     }
1573
1574     /* look for place in the tree for new item */
1575     retval = search_item (sb, &key, path);
1576     if (retval == IO_ERROR) {
1577         reiserfs_warning (sb, "vs-13080: reiserfs_new_directory: "
1578                           "i/o failure occurred creating new directory");
1579         return -EIO;
1580     }
1581     if (retval == ITEM_FOUND) {
1582         pathrelse (path);
1583         reiserfs_warning (sb, "vs-13070: reiserfs_new_directory: "
1584                           "object with this key exists (%k)", &(ih->ih_key));
1585         return -EEXIST;
1586     }
1587
1588     /* insert item, that is empty directory item */
1589     return reiserfs_insert_item (th, path, &key, ih, inode, body);
1590 }
1591
1592
1593 /* stat data of object has been inserted, this inserts the item
1594    containing the body of symlink */
1595 static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th,
1596                                  struct inode *inode,   /* Inode of symlink */
1597                                  struct item_head * ih,
1598                                  struct path * path, const char * symname, int item_len)
1599 {
1600     struct super_block * sb = th->t_super;
1601     struct cpu_key key;
1602     int retval;
1603
1604     _make_cpu_key (&key, KEY_FORMAT_3_5,
1605                    le32_to_cpu (ih->ih_key.k_dir_id),
1606                    le32_to_cpu (ih->ih_key.k_objectid),
1607                    1, TYPE_DIRECT, 3/*key length*/);
1608
1609     make_le_item_head (ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, 0/*free_space*/);
1610
1611     /* look for place in the tree for new item */
1612     retval = search_item (sb, &key, path);
1613     if (retval == IO_ERROR) {
1614         reiserfs_warning (sb, "vs-13080: reiserfs_new_symlinik: "
1615                           "i/o failure occurred creating new symlink");
1616         return -EIO;
1617     }
1618     if (retval == ITEM_FOUND) {
1619         pathrelse (path);
1620         reiserfs_warning (sb, "vs-13080: reiserfs_new_symlink: "
1621                           "object with this key exists (%k)", &(ih->ih_key));
1622         return -EEXIST;
1623     }
1624
1625     /* insert item, that is body of symlink */
1626     return reiserfs_insert_item (th, path, &key, ih, inode, symname);
1627 }
1628
1629
1630 /* inserts the stat data into the tree, and then calls
1631    reiserfs_new_directory (to insert ".", ".." item if new object is
1632    directory) or reiserfs_new_symlink (to insert symlink body if new
1633    object is symlink) or nothing (if new object is regular file)
1634
1635    NOTE! uid and gid must already be set in the inode.  If we return
1636    non-zero due to an error, we have to drop the quota previously allocated
1637    for the fresh inode.  This can only be done outside a transaction, so
1638    if we return non-zero, we also end the transaction.  */
1639 int reiserfs_new_inode (struct reiserfs_transaction_handle *th,
1640                         struct inode * dir, int mode,
1641                         const char * symname,
1642                         /* 0 for regular, EMTRY_DIR_SIZE for dirs,
1643                            strlen (symname) for symlinks)*/
1644                          loff_t i_size, struct dentry *dentry,
1645                          struct inode *inode)
1646 {
1647     struct super_block * sb;
1648     INITIALIZE_PATH (path_to_key);
1649     struct cpu_key key;
1650     struct item_head ih;
1651     struct stat_data sd;
1652     int retval;
1653     int err;
1654
1655     if (!dir || !dir->i_nlink) {
1656         err = -EPERM;
1657         goto out_bad_inode;
1658     }
1659
1660     sb = dir->i_sb;
1661
1662     /* item head of new item */
1663     ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
1664     ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid (th));
1665     if (!ih.ih_key.k_objectid) {
1666         err = -ENOMEM;
1667         goto out_bad_inode ;
1668     }
1669     if (old_format_only (sb))
1670         /* not a perfect generation count, as object ids can be reused, but
1671         ** this is as good as reiserfs can do right now.
1672         ** note that the private part of inode isn't filled in yet, we have
1673         ** to use the directory.
1674         */
1675         inode->i_generation = le32_to_cpu (INODE_PKEY (dir)->k_objectid);
1676     else
1677 #if defined( USE_INODE_GENERATION_COUNTER )
1678         inode->i_generation = le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
1679 #else
1680         inode->i_generation = ++event;
1681 #endif
1682
1683     /* fill stat data */
1684     inode->i_nlink = (S_ISDIR (mode) ? 2 : 1);
1685
1686     /* uid and gid must already be set by the caller for quota init */
1687
1688     /* symlink cannot be immutable or append only, right? */
1689     if( S_ISLNK( inode -> i_mode ) )
1690             inode -> i_flags &= ~ ( S_IMMUTABLE | S_APPEND );
1691
1692     inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1693     inode->i_size = i_size;
1694     inode->i_blocks = 0;
1695     inode->i_bytes = 0;
1696     REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
1697       U32_MAX/*NO_BYTES_IN_DIRECT_ITEM*/;
1698
1699     INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list ));
1700     REISERFS_I(inode)->i_flags = 0;
1701     REISERFS_I(inode)->i_prealloc_block = 0;
1702     REISERFS_I(inode)->i_prealloc_count = 0;
1703     REISERFS_I(inode)->i_trans_id = 0;
1704     REISERFS_I(inode)->i_jl = NULL;
1705     REISERFS_I(inode)->i_attrs =
1706         REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1707     sd_attrs_to_i_attrs( REISERFS_I(inode) -> i_attrs, inode );
1708     REISERFS_I(inode)->i_acl_access = NULL;
1709     REISERFS_I(inode)->i_acl_default = NULL;
1710     init_rwsem (&REISERFS_I(inode)->xattr_sem);
1711
1712     if (old_format_only (sb))
1713         make_le_item_head (&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
1714     else
1715         make_le_item_head (&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1716
1717     /* key to search for correct place for new stat data */
1718     _make_cpu_key (&key, KEY_FORMAT_3_6, le32_to_cpu (ih.ih_key.k_dir_id),
1719                    le32_to_cpu (ih.ih_key.k_objectid), SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/);
1720
1721     /* find proper place for inserting of stat data */
1722     retval = search_item (sb, &key, &path_to_key);
1723     if (retval == IO_ERROR) {
1724         err = -EIO;
1725         goto out_bad_inode;
1726     }
1727     if (retval == ITEM_FOUND) {
1728         pathrelse (&path_to_key);
1729         err = -EEXIST;
1730         goto out_bad_inode;
1731     }
1732     if (old_format_only (sb)) {
1733         if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
1734             pathrelse (&path_to_key);
1735             /* i_uid or i_gid is too big to be stored in stat data v3.5 */
1736             err = -EINVAL;
1737             goto out_bad_inode;
1738         }
1739         inode2sd_v1 (&sd, inode, inode->i_size);
1740     } else {
1741         inode2sd (&sd, inode, inode->i_size);
1742     }
1743     // these do not go to on-disk stat data
1744     inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid);
1745     inode->i_blksize = reiserfs_default_io_size;
1746
1747     // store in in-core inode the key of stat data and version all
1748     // object items will have (directory items will have old offset
1749     // format, other new objects will consist of new items)
1750     memcpy (INODE_PKEY (inode), &(ih.ih_key), KEY_SIZE);
1751     if (old_format_only (sb) || S_ISDIR(mode) || S_ISLNK(mode))
1752         set_inode_item_key_version (inode, KEY_FORMAT_3_5);
1753     else
1754         set_inode_item_key_version (inode, KEY_FORMAT_3_6);
1755     if (old_format_only (sb))
1756         set_inode_sd_version (inode, STAT_DATA_V1);
1757     else
1758         set_inode_sd_version (inode, STAT_DATA_V2);
1759
1760     /* insert the stat data into the tree */
1761 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1762     if (REISERFS_I(dir)->new_packing_locality)
1763         th->displace_new_blocks = 1;
1764 #endif
1765     retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, inode, (char *)(&sd));
1766     if (retval) {
1767         err = retval;
1768         reiserfs_check_path(&path_to_key) ;
1769         goto out_bad_inode;
1770     }
1771
1772 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1773     if (!th->displace_new_blocks)
1774         REISERFS_I(dir)->new_packing_locality = 0;
1775 #endif
1776     if (S_ISDIR(mode)) {
1777         /* insert item with "." and ".." */
1778         retval = reiserfs_new_directory (th, inode, &ih, &path_to_key, dir);
1779     }
1780
1781     if (S_ISLNK(mode)) {
1782         /* insert body of symlink */
1783         if (!old_format_only (sb))
1784             i_size = ROUND_UP(i_size);
1785         retval = reiserfs_new_symlink (th, inode, &ih, &path_to_key, symname, i_size);
1786     }
1787     if (retval) {
1788         err = retval;
1789         reiserfs_check_path(&path_to_key) ;
1790         journal_end(th, th->t_super, th->t_blocks_allocated);
1791         goto out_inserted_sd;
1792     }
1793
1794     /* XXX CHECK THIS */
1795     if (reiserfs_posixacl (inode->i_sb)) {
1796         retval = reiserfs_inherit_default_acl (dir, dentry, inode);
1797         if (retval) {
1798             err = retval;
1799             reiserfs_check_path(&path_to_key) ;
1800             journal_end(th, th->t_super, th->t_blocks_allocated);
1801             goto out_inserted_sd;
1802         }
1803     } else if (inode->i_sb->s_flags & MS_POSIXACL) {
1804         reiserfs_warning (inode->i_sb, "ACLs aren't enabled in the fs, "
1805                           "but vfs thinks they are!");
1806     }
1807
1808     insert_inode_hash (inode);
1809     reiserfs_update_sd(th, inode);
1810     reiserfs_check_path(&path_to_key) ;
1811
1812     return 0;
1813
1814 /* it looks like you can easily compress these two goto targets into
1815  * one.  Keeping it like this doesn't actually hurt anything, and they
1816  * are place holders for what the quota code actually needs.
1817  */
1818 out_bad_inode:
1819     /* Invalidate the object, nothing was inserted yet */
1820     INODE_PKEY(inode)->k_objectid = 0;
1821
1822     /* dquot_drop must be done outside a transaction */
1823     journal_end(th, th->t_super, th->t_blocks_allocated) ;
1824     DQUOT_FREE_INODE(inode);
1825     DQUOT_DROP(inode);
1826     inode->i_flags |= S_NOQUOTA;
1827     make_bad_inode(inode);
1828
1829 out_inserted_sd:
1830     inode->i_nlink = 0;
1831     th->t_trans_id = 0; /* so the caller can't use this handle later */
1832     iput(inode);
1833     return err;
1834 }
1835
1836 /*
1837 ** finds the tail page in the page cache,
1838 ** reads the last block in.
1839 **
1840 ** On success, page_result is set to a locked, pinned page, and bh_result
1841 ** is set to an up to date buffer for the last block in the file.  returns 0.
1842 **
1843 ** tail conversion is not done, so bh_result might not be valid for writing
1844 ** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
1845 ** trying to write the block.
1846 **
1847 ** on failure, nonzero is returned, page_result and bh_result are untouched.
1848 */
1849 static int grab_tail_page(struct inode *p_s_inode,
1850                           struct page **page_result,
1851                           struct buffer_head **bh_result) {
1852
1853     /* we want the page with the last byte in the file,
1854     ** not the page that will hold the next byte for appending
1855     */
1856     unsigned long index = (p_s_inode->i_size-1) >> PAGE_CACHE_SHIFT ;
1857     unsigned long pos = 0 ;
1858     unsigned long start = 0 ;
1859     unsigned long blocksize = p_s_inode->i_sb->s_blocksize ;
1860     unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1) ;
1861     struct buffer_head *bh ;
1862     struct buffer_head *head ;
1863     struct page * page ;
1864     int error ;
1865
1866     /* we know that we are only called with inode->i_size > 0.
1867     ** we also know that a file tail can never be as big as a block
1868     ** If i_size % blocksize == 0, our file is currently block aligned
1869     ** and it won't need converting or zeroing after a truncate.
1870     */
1871     if ((offset & (blocksize - 1)) == 0) {
1872         return -ENOENT ;
1873     }
1874     page = grab_cache_page(p_s_inode->i_mapping, index) ;
1875     error = -ENOMEM ;
1876     if (!page) {
1877         goto out ;
1878     }
1879     /* start within the page of the last block in the file */
1880     start = (offset / blocksize) * blocksize ;
1881
1882     error = block_prepare_write(page, start, offset,
1883                                 reiserfs_get_block_create_0) ;
1884     if (error)
1885         goto unlock ;
1886
1887     head = page_buffers(page) ;
1888     bh = head;
1889     do {
1890         if (pos >= start) {
1891             break ;
1892         }
1893         bh = bh->b_this_page ;
1894         pos += blocksize ;
1895     } while(bh != head) ;
1896
1897     if (!buffer_uptodate(bh)) {
1898         /* note, this should never happen, prepare_write should
1899         ** be taking care of this for us.  If the buffer isn't up to date,
1900         ** I've screwed up the code to find the buffer, or the code to
1901         ** call prepare_write
1902         */
1903         reiserfs_warning (p_s_inode->i_sb,
1904                           "clm-6000: error reading block %lu on dev %s",
1905                           bh->b_blocknr,
1906                           reiserfs_bdevname (p_s_inode->i_sb)) ;
1907         error = -EIO ;
1908         goto unlock ;
1909     }
1910     *bh_result = bh ;
1911     *page_result = page ;
1912
1913 out:
1914     return error ;
1915
1916 unlock:
1917     unlock_page(page) ;
1918     page_cache_release(page) ;
1919     return error ;
1920 }
1921
1922 /*
1923 ** vfs version of truncate file.  Must NOT be called with
1924 ** a transaction already started.
1925 **
1926 ** some code taken from block_truncate_page
1927 */
1928 void reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) {
1929     struct reiserfs_transaction_handle th ;
1930     /* we want the offset for the first byte after the end of the file */
1931     unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1) ;
1932     unsigned blocksize = p_s_inode->i_sb->s_blocksize ;
1933     unsigned length ;
1934     struct page *page = NULL ;
1935     int error ;
1936     struct buffer_head *bh = NULL ;
1937
1938     reiserfs_write_lock(p_s_inode->i_sb);
1939
1940     if (p_s_inode->i_size > 0) {
1941         if ((error = grab_tail_page(p_s_inode, &page, &bh))) {
1942             // -ENOENT means we truncated past the end of the file,
1943             // and get_block_create_0 could not find a block to read in,
1944             // which is ok.
1945             if (error != -ENOENT)
1946                 reiserfs_warning (p_s_inode->i_sb,
1947                                   "clm-6001: grab_tail_page failed %d",
1948                                   error);
1949             page = NULL ;
1950             bh = NULL ;
1951         }
1952     }
1953
1954     /* so, if page != NULL, we have a buffer head for the offset at
1955     ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
1956     ** then we have an unformatted node.  Otherwise, we have a direct item,
1957     ** and no zeroing is required on disk.  We zero after the truncate,
1958     ** because the truncate might pack the item anyway
1959     ** (it will unmap bh if it packs).
1960     */
1961     /* it is enough to reserve space in transaction for 2 balancings:
1962        one for "save" link adding and another for the first
1963        cut_from_item. 1 is for update_sd */
1964     journal_begin(&th, p_s_inode->i_sb,  JOURNAL_PER_BALANCE_CNT * 2 + 1 ) ;
1965     reiserfs_update_inode_transaction(p_s_inode) ;
1966     if (update_timestamps)
1967             /* we are doing real truncate: if the system crashes before the last
1968                transaction of truncating gets committed - on reboot the file
1969                either appears truncated properly or not truncated at all */
1970         add_save_link (&th, p_s_inode, 1);
1971     reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ;
1972     journal_end(&th, p_s_inode->i_sb,  JOURNAL_PER_BALANCE_CNT * 2 + 1 ) ;
1973
1974     if (update_timestamps)
1975         remove_save_link (p_s_inode, 1/* truncate */);
1976
1977     if (page) {
1978         length = offset & (blocksize - 1) ;
1979         /* if we are not on a block boundary */
1980         if (length) {
1981             char *kaddr;
1982
1983             length = blocksize - length ;
1984             kaddr = kmap_atomic(page, KM_USER0) ;
1985             memset(kaddr + offset, 0, length) ;
1986             flush_dcache_page(page) ;
1987             kunmap_atomic(kaddr, KM_USER0) ;
1988             if (buffer_mapped(bh) && bh->b_blocknr != 0) {
1989                 mark_buffer_dirty(bh) ;
1990             }
1991         }
1992         unlock_page(page) ;
1993         page_cache_release(page) ;
1994     }
1995
1996     reiserfs_write_unlock(p_s_inode->i_sb);
1997 }
1998
1999 static int map_block_for_writepage(struct inode *inode,
2000                                struct buffer_head *bh_result,
2001                                unsigned long block) {
2002     struct reiserfs_transaction_handle th ;
2003     int fs_gen ;
2004     struct item_head tmp_ih ;
2005     struct item_head *ih ;
2006     struct buffer_head *bh ;
2007     __u32 *item ;
2008     struct cpu_key key ;
2009     INITIALIZE_PATH(path) ;
2010     int pos_in_item ;
2011     int jbegin_count = JOURNAL_PER_BALANCE_CNT ;
2012     loff_t byte_offset = (block << inode->i_sb->s_blocksize_bits) + 1 ;
2013     int retval ;
2014     int use_get_block = 0 ;
2015     int bytes_copied = 0 ;
2016     int copy_size ;
2017     int trans_running = 0;
2018
2019     /* catch places below that try to log something without starting a trans */
2020     th.t_trans_id = 0;
2021
2022     if (!buffer_uptodate(bh_result)) {
2023         return -EIO;
2024     }
2025
2026     kmap(bh_result->b_page) ;
2027 start_over:
2028     reiserfs_write_lock(inode->i_sb);
2029     make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3) ;
2030
2031 research:
2032     retval = search_for_position_by_key(inode->i_sb, &key, &path) ;
2033     if (retval != POSITION_FOUND) {
2034         use_get_block = 1;
2035         goto out ;
2036     }
2037
2038     bh = get_last_bh(&path) ;
2039     ih = get_ih(&path) ;
2040     item = get_item(&path) ;
2041     pos_in_item = path.pos_in_item ;
2042
2043     /* we've found an unformatted node */
2044     if (indirect_item_found(retval, ih)) {
2045         if (bytes_copied > 0) {
2046             reiserfs_warning (inode->i_sb, "clm-6002: bytes_copied %d",
2047                               bytes_copied) ;
2048         }
2049         if (!get_block_num(item, pos_in_item)) {
2050             /* crap, we are writing to a hole */
2051             use_get_block = 1;
2052             goto out ;
2053         }
2054         set_block_dev_mapped(bh_result, get_block_num(item,pos_in_item),inode);
2055     } else if (is_direct_le_ih(ih)) {
2056         char *p ;
2057         p = page_address(bh_result->b_page) ;
2058         p += (byte_offset -1) & (PAGE_CACHE_SIZE - 1) ;
2059         copy_size = ih_item_len(ih) - pos_in_item;
2060
2061         fs_gen = get_generation(inode->i_sb) ;
2062         copy_item_head(&tmp_ih, ih) ;
2063
2064         if (!trans_running) {
2065             /* vs-3050 is gone, no need to drop the path */
2066             journal_begin(&th, inode->i_sb, jbegin_count) ;
2067             reiserfs_update_inode_transaction(inode) ;
2068             trans_running = 1;
2069             if (fs_changed(fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) {
2070                 reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
2071                 goto research;
2072             }
2073         }
2074
2075         reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
2076
2077         if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
2078             reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
2079             goto research;
2080         }
2081
2082         memcpy( B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, copy_size) ;
2083
2084         journal_mark_dirty(&th, inode->i_sb, bh) ;
2085         bytes_copied += copy_size ;
2086         set_block_dev_mapped(bh_result, 0, inode);
2087
2088         /* are there still bytes left? */
2089         if (bytes_copied < bh_result->b_size &&
2090             (byte_offset + bytes_copied) < inode->i_size) {
2091             set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + copy_size) ;
2092             goto research ;
2093         }
2094     } else {
2095         reiserfs_warning (inode->i_sb,
2096                           "clm-6003: bad item inode %lu, device %s",
2097                           inode->i_ino, reiserfs_bdevname (inode->i_sb)) ;
2098         retval = -EIO ;
2099         goto out ;
2100     }
2101     retval = 0 ;
2102
2103 out:
2104     pathrelse(&path) ;
2105     if (trans_running) {
2106         journal_end(&th, inode->i_sb, jbegin_count) ;
2107         trans_running = 0;
2108     }
2109     reiserfs_write_unlock(inode->i_sb);
2110
2111     /* this is where we fill in holes in the file. */
2112     if (use_get_block) {
2113         retval = reiserfs_get_block(inode, block, bh_result,
2114                                     GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM |
2115                                     GET_BLOCK_NO_DANGLE);
2116         if (!retval) {
2117             if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) {
2118                 /* get_block failed to find a mapped unformatted node. */
2119                 use_get_block = 0 ;
2120                 goto start_over ;
2121             }
2122         }
2123     }
2124     kunmap(bh_result->b_page) ;
2125
2126     if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
2127         /* we've copied data from the page into the direct item, so the
2128          * buffer in the page is now clean, mark it to reflect that.
2129          */
2130         lock_buffer(bh_result);
2131         clear_buffer_dirty(bh_result);
2132         unlock_buffer(bh_result);
2133     }
2134     return retval ;
2135 }
2136
2137 /*
2138  * mason@suse.com: updated in 2.5.54 to follow the same general io
2139  * start/recovery path as __block_write_full_page, along with special
2140  * code to handle reiserfs tails.
2141  */
2142 static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) {
2143     struct inode *inode = page->mapping->host ;
2144     unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ;
2145     int error = 0;
2146     unsigned long block ;
2147     struct buffer_head *head, *bh;
2148     int partial = 0 ;
2149     int nr = 0;
2150     int checked = PageChecked(page);
2151     struct reiserfs_transaction_handle th;
2152     struct super_block *s = inode->i_sb;
2153     int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
2154     th.t_trans_id = 0;
2155
2156     /* The page dirty bit is cleared before writepage is called, which
2157      * means we have to tell create_empty_buffers to make dirty buffers
2158      * The page really should be up to date at this point, so tossing
2159      * in the BH_Uptodate is just a sanity check.
2160      */
2161     if (!page_has_buffers(page)) {
2162         create_empty_buffers(page, s->s_blocksize,
2163                             (1 << BH_Dirty) | (1 << BH_Uptodate));
2164     }
2165     head = page_buffers(page) ;
2166
2167     /* last page in the file, zero out any contents past the
2168     ** last byte in the file
2169     */
2170     if (page->index >= end_index) {
2171         char *kaddr;
2172         unsigned last_offset;
2173
2174         last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1) ;
2175         /* no file contents in this page */
2176         if (page->index >= end_index + 1 || !last_offset) {
2177             unlock_page(page);
2178             return 0;
2179         }
2180         kaddr = kmap_atomic(page, KM_USER0);
2181         memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE-last_offset) ;
2182         flush_dcache_page(page) ;
2183         kunmap_atomic(kaddr, KM_USER0) ;
2184     }
2185     bh = head ;
2186     block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits) ;
2187     /* first map all the buffers, logging any direct items we find */
2188     do {
2189         if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) ||
2190            (buffer_mapped(bh) && bh->b_blocknr == 0))) {
2191             /* not mapped yet, or it points to a direct item, search
2192              * the btree for the mapping info, and log any direct
2193              * items found
2194              */
2195             if ((error = map_block_for_writepage(inode, bh, block))) {
2196                 goto fail ;
2197             }
2198         }
2199         bh = bh->b_this_page;
2200         block++;
2201     } while(bh != head) ;
2202
2203     /*
2204      * we start the transaction after map_block_for_writepage,
2205      * because it can create holes in the file (an unbounded operation).
2206      * starting it here, we can make a reliable estimate for how many
2207      * blocks we're going to log
2208      */
2209     if (checked) {
2210         ClearPageChecked(page);
2211         reiserfs_write_lock(s);
2212         journal_begin(&th, s, bh_per_page + 1);
2213         reiserfs_update_inode_transaction(inode);
2214     }
2215     /* now go through and lock any dirty buffers on the page */
2216     do {
2217         get_bh(bh);
2218         if (!buffer_mapped(bh))
2219             continue;
2220         if (buffer_mapped(bh) && bh->b_blocknr == 0)
2221             continue;
2222
2223         if (checked) {
2224             reiserfs_prepare_for_journal(s, bh, 1);
2225             journal_mark_dirty(&th, s, bh);
2226             continue;
2227         }
2228         /* from this point on, we know the buffer is mapped to a
2229          * real block and not a direct item
2230          */
2231         if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
2232             lock_buffer(bh);
2233         } else {
2234             if (test_set_buffer_locked(bh)) {
2235                 redirty_page_for_writepage(wbc, page);
2236                 continue;
2237             }
2238         }
2239         if (test_clear_buffer_dirty(bh)) {
2240             mark_buffer_async_write(bh);
2241         } else {
2242             unlock_buffer(bh);
2243         }
2244     } while((bh = bh->b_this_page) != head);
2245
2246     if (checked) {
2247         journal_end(&th, s, bh_per_page + 1);
2248         reiserfs_write_unlock(s);
2249     }
2250     BUG_ON(PageWriteback(page));
2251     set_page_writeback(page);
2252     unlock_page(page);
2253
2254     /*
2255      * since any buffer might be the only dirty buffer on the page,
2256      * the first submit_bh can bring the page out of writeback.
2257      * be careful with the buffers.
2258      */
2259     do {
2260         struct buffer_head *next = bh->b_this_page;
2261         if (buffer_async_write(bh)) {
2262             submit_bh(WRITE, bh);
2263             nr++;
2264         }
2265         put_bh(bh);
2266         bh = next;
2267     } while(bh != head);
2268
2269     error = 0;
2270 done:
2271     if (nr == 0) {
2272         /*
2273          * if this page only had a direct item, it is very possible for
2274          * no io to be required without there being an error.  Or,
2275          * someone else could have locked them and sent them down the
2276          * pipe without locking the page
2277          */
2278         bh = head ;
2279         do {
2280             if (!buffer_uptodate(bh)) {
2281                 partial = 1;
2282                 break;
2283             }
2284             bh = bh->b_this_page;
2285         } while(bh != head);
2286         if (!partial)
2287             SetPageUptodate(page);
2288         end_page_writeback(page);
2289     }
2290     return error;
2291
2292 fail:
2293     /* catches various errors, we need to make sure any valid dirty blocks
2294      * get to the media.  The page is currently locked and not marked for
2295      * writeback
2296      */
2297     ClearPageUptodate(page);
2298     bh = head;
2299     do {
2300         get_bh(bh);
2301         if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
2302             lock_buffer(bh);
2303             mark_buffer_async_write(bh);
2304         } else {
2305             /*
2306              * clear any dirty bits that might have come from getting
2307              * attached to a dirty page
2308              */
2309              clear_buffer_dirty(bh);
2310         }
2311         bh = bh->b_this_page;
2312     } while(bh != head);
2313     SetPageError(page);
2314     BUG_ON(PageWriteback(page));
2315     set_page_writeback(page);
2316     unlock_page(page);
2317     do {
2318         struct buffer_head *next = bh->b_this_page;
2319         if (buffer_async_write(bh)) {
2320             clear_buffer_dirty(bh);
2321             submit_bh(WRITE, bh);
2322             nr++;
2323         }
2324         put_bh(bh);
2325         bh = next;
2326     } while(bh != head);
2327     goto done;
2328 }
2329
2330
2331 static int reiserfs_readpage (struct file *f, struct page * page)
2332 {
2333     return block_read_full_page (page, reiserfs_get_block);
2334 }
2335
2336
2337 static int reiserfs_writepage (struct page * page, struct writeback_control *wbc)
2338 {
2339     struct inode *inode = page->mapping->host ;
2340     reiserfs_wait_on_write_block(inode->i_sb) ;
2341     return reiserfs_write_full_page(page, wbc) ;
2342 }
2343
2344 int reiserfs_prepare_write(struct file *f, struct page *page,
2345                            unsigned from, unsigned to) {
2346     struct inode *inode = page->mapping->host ;
2347     int ret;
2348     int old_ref = 0;
2349
2350     reiserfs_wait_on_write_block(inode->i_sb) ;
2351     fix_tail_page_for_writing(page) ;
2352     if (reiserfs_transaction_running(inode->i_sb)) {
2353         struct reiserfs_transaction_handle *th;
2354         th = (struct reiserfs_transaction_handle *)current->journal_info;
2355         old_ref = th->t_refcount;
2356         th->t_refcount++;
2357     }
2358
2359     ret = block_prepare_write(page, from, to, reiserfs_get_block) ;
2360     if (ret && reiserfs_transaction_running(inode->i_sb)) {
2361         struct reiserfs_transaction_handle *th = current->journal_info;
2362         /* this gets a little ugly.  If reiserfs_get_block returned an
2363          * error and left a transacstion running, we've got to close it,
2364          * and we've got to free handle if it was a persistent transaction.
2365          *
2366          * But, if we had nested into an existing transaction, we need
2367          * to just drop the ref count on the handle.
2368          *
2369          * If old_ref == 0, the transaction is from reiserfs_get_block,
2370          * and it was a persistent trans.  Otherwise, it was nested above.
2371          */
2372         if (th->t_refcount > old_ref) {
2373             if (old_ref)
2374                 th->t_refcount--;
2375             else {
2376                 reiserfs_write_lock(inode->i_sb);
2377                 reiserfs_end_persistent_transaction(th);
2378                 reiserfs_write_unlock(inode->i_sb);
2379             }
2380         }
2381     }
2382     return ret;
2383
2384 }
2385
2386
2387 static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) {
2388   return generic_block_bmap(as, block, reiserfs_bmap) ;
2389 }
2390
2391 static int reiserfs_commit_write(struct file *f, struct page *page,
2392                                  unsigned from, unsigned to) {
2393     struct inode *inode = page->mapping->host ;
2394     loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2395     int ret = 0;
2396     int update_sd = 0;
2397     struct reiserfs_transaction_handle *th = NULL;
2398
2399     reiserfs_wait_on_write_block(inode->i_sb) ;
2400     if (reiserfs_transaction_running(inode->i_sb)) {
2401         th = current->journal_info;
2402     }
2403     reiserfs_commit_page(inode, page, from, to);
2404
2405     /* generic_commit_write does this for us, but does not update the
2406     ** transaction tracking stuff when the size changes.  So, we have
2407     ** to do the i_size updates here.
2408     */
2409     if (pos > inode->i_size) {
2410         struct reiserfs_transaction_handle myth ;
2411         reiserfs_write_lock(inode->i_sb);
2412         /* If the file have grown beyond the border where it
2413            can have a tail, unmark it as needing a tail
2414            packing */
2415         if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) ||
2416              (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
2417             REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
2418
2419         journal_begin(&myth, inode->i_sb, 1) ;
2420         reiserfs_update_inode_transaction(inode) ;
2421         inode->i_size = pos ;
2422         reiserfs_update_sd(&myth, inode) ;
2423         update_sd = 1;
2424         journal_end(&myth, inode->i_sb, 1) ;
2425         reiserfs_write_unlock(inode->i_sb);
2426     }
2427     if (th) {
2428         reiserfs_write_lock(inode->i_sb);
2429         if (!update_sd)
2430             reiserfs_update_sd(th, inode) ;
2431         reiserfs_end_persistent_transaction(th);
2432         reiserfs_write_unlock(inode->i_sb);
2433     }
2434
2435     /* we test for O_SYNC here so we can commit the transaction
2436     ** for any packed tails the file might have had
2437     */
2438     if (f && (f->f_flags & O_SYNC)) {
2439         reiserfs_write_lock(inode->i_sb);
2440         reiserfs_commit_for_inode(inode) ;
2441         reiserfs_write_unlock(inode->i_sb);
2442     }
2443     return ret ;
2444 }
2445
2446 void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode )
2447 {
2448         if( reiserfs_attrs( inode -> i_sb ) ) {
2449                 if( sd_attrs & REISERFS_SYNC_FL )
2450                         inode -> i_flags |= S_SYNC;
2451                 else
2452                         inode -> i_flags &= ~S_SYNC;
2453                 if( sd_attrs & REISERFS_IMMUTABLE_FL )
2454                         inode -> i_flags |= S_IMMUTABLE;
2455                 else
2456                         inode -> i_flags &= ~S_IMMUTABLE;
2457                 if( sd_attrs & REISERFS_APPEND_FL )
2458                         inode -> i_flags |= S_APPEND;
2459                 else
2460                         inode -> i_flags &= ~S_APPEND;
2461                 if( sd_attrs & REISERFS_NOATIME_FL )
2462                         inode -> i_flags |= S_NOATIME;
2463                 else
2464                         inode -> i_flags &= ~S_NOATIME;
2465                 if( sd_attrs & REISERFS_NOTAIL_FL )
2466                         REISERFS_I(inode)->i_flags |= i_nopack_mask;
2467                 else
2468                         REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
2469         }
2470 }
2471
2472 void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs )
2473 {
2474         if( reiserfs_attrs( inode -> i_sb ) ) {
2475                 if( inode -> i_flags & S_IMMUTABLE )
2476                         *sd_attrs |= REISERFS_IMMUTABLE_FL;
2477                 else
2478                         *sd_attrs &= ~REISERFS_IMMUTABLE_FL;
2479                 if( inode -> i_flags & S_SYNC )
2480                         *sd_attrs |= REISERFS_SYNC_FL;
2481                 else
2482                         *sd_attrs &= ~REISERFS_SYNC_FL;
2483                 if( inode -> i_flags & S_NOATIME )
2484                         *sd_attrs |= REISERFS_NOATIME_FL;
2485                 else
2486                         *sd_attrs &= ~REISERFS_NOATIME_FL;
2487                 if( REISERFS_I(inode)->i_flags & i_nopack_mask )
2488                         *sd_attrs |= REISERFS_NOTAIL_FL;
2489                 else
2490                         *sd_attrs &= ~REISERFS_NOTAIL_FL;
2491         }
2492 }
2493
2494 /* decide if this buffer needs to stay around for data logging or ordered
2495 ** write purposes
2496 */
2497 static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
2498 {
2499     int ret = 1 ;
2500     struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
2501
2502     spin_lock(&j->j_dirty_buffers_lock) ;
2503     if (!buffer_mapped(bh)) {
2504         goto free_jh;
2505     }
2506     /* the page is locked, and the only places that log a data buffer
2507      * also lock the page.
2508      */
2509     if (reiserfs_file_data_log(inode)) {
2510         /*
2511          * very conservative, leave the buffer pinned if
2512          * anyone might need it.
2513          */
2514         if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
2515             ret = 0 ;
2516         }
2517     } else
2518     if (buffer_dirty(bh) || buffer_locked(bh)) {
2519         struct reiserfs_journal_list *jl;
2520         struct reiserfs_jh *jh = bh->b_private;
2521
2522         /* why is this safe?
2523          * reiserfs_setattr updates i_size in the on disk
2524          * stat data before allowing vmtruncate to be called.
2525          *
2526          * If buffer was put onto the ordered list for this
2527          * transaction, we know for sure either this transaction
2528          * or an older one already has updated i_size on disk,
2529          * and this ordered data won't be referenced in the file
2530          * if we crash.
2531          *
2532          * if the buffer was put onto the ordered list for an older
2533          * transaction, we need to leave it around
2534          */
2535         if (jh && (jl = jh->jl) && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
2536             ret = 0;
2537     }
2538 free_jh:
2539     if (ret && bh->b_private) {
2540         reiserfs_free_jh(bh);
2541     }
2542     spin_unlock(&j->j_dirty_buffers_lock) ;
2543     return ret ;
2544 }
2545
2546 /* clm -- taken from fs/buffer.c:block_invalidate_page */
2547 static int reiserfs_invalidatepage(struct page *page, unsigned long offset)
2548 {
2549     struct buffer_head *head, *bh, *next;
2550     struct inode *inode = page->mapping->host;
2551     unsigned int curr_off = 0;
2552     int ret = 1;
2553
2554     BUG_ON(!PageLocked(page));
2555
2556     if (offset == 0)
2557         ClearPageChecked(page);
2558
2559     if (!page_has_buffers(page))
2560         goto out;
2561
2562     head = page_buffers(page);
2563     bh = head;
2564     do {
2565         unsigned int next_off = curr_off + bh->b_size;
2566         next = bh->b_this_page;
2567
2568         /*
2569          * is this block fully invalidated?
2570          */
2571         if (offset <= curr_off) {
2572             if (invalidatepage_can_drop(inode, bh))
2573                 reiserfs_unmap_buffer(bh);
2574             else
2575                 ret = 0;
2576         }
2577         curr_off = next_off;
2578         bh = next;
2579     } while (bh != head);
2580
2581     /*
2582      * We release buffers only if the entire page is being invalidated.
2583      * The get_block cached value has been unconditionally invalidated,
2584      * so real IO is not possible anymore.
2585      */
2586     if (!offset && ret)
2587         ret = try_to_release_page(page, 0);
2588 out:
2589     return ret;
2590 }
2591
2592 static int reiserfs_set_page_dirty(struct page *page) {
2593     struct inode *inode = page->mapping->host;
2594     if (reiserfs_file_data_log(inode)) {
2595         SetPageChecked(page);
2596         return __set_page_dirty_nobuffers(page);
2597     }
2598     return __set_page_dirty_buffers(page);
2599 }
2600
2601 /*
2602  * Returns 1 if the page's buffers were dropped.  The page is locked.
2603  *
2604  * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
2605  * in the buffers at page_buffers(page).
2606  *
2607  * even in -o notail mode, we can't be sure an old mount without -o notail
2608  * didn't create files with tails.
2609  */
2610 static int reiserfs_releasepage(struct page *page, int unused_gfp_flags)
2611 {
2612     struct inode *inode = page->mapping->host ;
2613     struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
2614     struct buffer_head *head ;
2615     struct buffer_head *bh ;
2616     int ret = 1 ;
2617
2618     WARN_ON(PageChecked(page));
2619     spin_lock(&j->j_dirty_buffers_lock) ;
2620     head = page_buffers(page) ;
2621     bh = head ;
2622     do {
2623         if (bh->b_private) {
2624             if (!buffer_dirty(bh) && !buffer_locked(bh)) {
2625                 reiserfs_free_jh(bh);
2626             } else {
2627                 ret = 0 ;
2628                 break ;
2629             }
2630         }
2631         bh = bh->b_this_page ;
2632     } while (bh != head) ;
2633     if (ret)
2634         ret = try_to_free_buffers(page) ;
2635     spin_unlock(&j->j_dirty_buffers_lock) ;
2636     return ret ;
2637 }
2638
2639 /* We thank Mingming Cao for helping us understand in great detail what
2640    to do in this section of the code. */
2641 static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
2642                 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
2643 {
2644     struct file *file = iocb->ki_filp;
2645     struct inode *inode = file->f_mapping->host;
2646
2647     return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
2648                         offset, nr_segs, reiserfs_get_blocks_direct_io, NULL);
2649 }
2650
2651 int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) {
2652     struct inode *inode = dentry->d_inode ;
2653     int error ;
2654     unsigned int ia_valid = attr->ia_valid;
2655     reiserfs_write_lock(inode->i_sb);
2656     if (attr->ia_valid & ATTR_SIZE) {
2657         /* version 2 items will be caught by the s_maxbytes check
2658         ** done for us in vmtruncate
2659         */
2660         if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
2661             attr->ia_size > MAX_NON_LFS) {
2662             error = -EFBIG ;
2663             goto out;
2664         }
2665         /* fill in hole pointers in the expanding truncate case. */
2666         if (attr->ia_size > inode->i_size) {
2667             error = generic_cont_expand(inode, attr->ia_size) ;
2668             if (REISERFS_I(inode)->i_prealloc_count > 0) {
2669                 struct reiserfs_transaction_handle th ;
2670                 /* we're changing at most 2 bitmaps, inode + super */
2671                 journal_begin(&th, inode->i_sb, 4) ;
2672                 reiserfs_discard_prealloc (&th, inode);
2673                 journal_end(&th, inode->i_sb, 4) ;
2674             }
2675             if (error)
2676                 goto out;
2677         }
2678     }
2679
2680     if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) ||
2681          ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) &&
2682         (get_inode_sd_version (inode) == STAT_DATA_V1)) {
2683                 /* stat data of format v3.5 has 16 bit uid and gid */
2684             error = -EINVAL;
2685             goto out;
2686         }
2687
2688     error = inode_change_ok(inode, attr) ;
2689     if (!error) {
2690         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2691             (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2692                 error = reiserfs_chown_xattrs (inode, attr);
2693
2694                 if (!error)
2695                     error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2696         }
2697         if (!error)
2698             error = inode_setattr(inode, attr) ;
2699     }
2700
2701
2702     if (!error && reiserfs_posixacl (inode->i_sb)) {
2703         if (attr->ia_valid & ATTR_MODE)
2704             error = reiserfs_acl_chmod (inode);
2705     }
2706
2707 out:
2708     reiserfs_write_unlock(inode->i_sb);
2709     return error ;
2710 }
2711
2712
2713
2714 struct address_space_operations reiserfs_address_space_operations = {
2715     .writepage = reiserfs_writepage,
2716     .readpage = reiserfs_readpage,
2717     .readpages = reiserfs_readpages,
2718     .releasepage = reiserfs_releasepage,
2719     .invalidatepage = reiserfs_invalidatepage,
2720     .sync_page = block_sync_page,
2721     .prepare_write = reiserfs_prepare_write,
2722     .commit_write = reiserfs_commit_write,
2723     .bmap = reiserfs_aop_bmap,
2724     .direct_IO = reiserfs_direct_IO,
2725     .set_page_dirty = reiserfs_set_page_dirty,
2726 } ;