fs/reiserfs/inode.c

   1 /*
   2  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
   3  */
   4
   5 #include <linux/config.h>
   6 #include <linux/time.h>
   7 #include <linux/fs.h>
   8 #include <linux/reiserfs_fs.h>
   9 #include <linux/reiserfs_acl.h>
  10 #include <linux/reiserfs_xattr.h>
  11 #include <linux/smp_lock.h>
  12 #include <linux/pagemap.h>
  13 #include <linux/highmem.h>
  14 #include <asm/uaccess.h>
  15 #include <asm/unaligned.h>
  16 #include <linux/buffer_head.h>
  17 #include <linux/mpage.h>
  18 #include <linux/writeback.h>
  19 #include <linux/quotaops.h>
  20
   21 extern int reiserfs_default_io_size; /* default io size defined in super.c */
  22
  23 /* args for the create parameter of reiserfs_get_block */
  24 #define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */
  25 #define GET_BLOCK_CREATE 1    /* add anything you need to find block */
  26 #define GET_BLOCK_NO_HOLE 2   /* return -ENOENT for file holes */
  27 #define GET_BLOCK_READ_DIRECT 4  /* read the tail if indirect item not found */
  28 #define GET_BLOCK_NO_ISEM     8 /* i_sem is not held, don't preallocate */
  29 #define GET_BLOCK_NO_DANGLE   16 /* don't leave any transactions running */
  30
  31 static int reiserfs_get_block (struct inode * inode, sector_t block,
  32                                struct buffer_head * bh_result, int create);
  33 static int reiserfs_commit_write(struct file *f, struct page *page,
  34                                  unsigned from, unsigned to);
  35
  36 void reiserfs_delete_inode (struct inode * inode)
  37 {
  38     int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2;
  39     struct reiserfs_transaction_handle th ;
  40
  41     reiserfs_write_lock(inode->i_sb);
  42
  43     DQUOT_FREE_INODE(inode);
  44     /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
  45     if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
  46         down (&inode->i_sem);
  47
  48         reiserfs_delete_xattrs (inode);
  49
  50         journal_begin(&th, inode->i_sb, jbegin_count) ;
  51         reiserfs_update_inode_transaction(inode) ;
  52
  53         reiserfs_delete_object (&th, inode);
  54
  55         journal_end(&th, inode->i_sb, jbegin_count) ;
  56
  57         up (&inode->i_sem);
  58
  59         /* all items of file are deleted, so we can remove "save" link */
  60         remove_save_link (inode, 0/* not truncate */);
  61     } else {
  62         /* no object items are in the tree */
  63         ;
  64     }
  65     clear_inode (inode); /* note this must go after the journal_end to prevent deadlock */
  66     inode->i_blocks = 0;
  67     reiserfs_write_unlock(inode->i_sb);
  68 }
  69
  70 static void _make_cpu_key (struct cpu_key * key, int version, __u32 dirid, __u32 objectid,
  71                loff_t offset, int type, int length )
  72 {
  73     key->version = version;
  74
  75     key->on_disk_key.k_dir_id = dirid;
  76     key->on_disk_key.k_objectid = objectid;
  77     set_cpu_key_k_offset (key, offset);
  78     set_cpu_key_k_type (key, type);
  79     key->key_length = length;
  80 }
  81
  82
  83 /* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set
  84    offset and type of key */
  85 void make_cpu_key (struct cpu_key * key, struct inode * inode, loff_t offset,
  86               int type, int length )
  87 {
  88   _make_cpu_key (key, get_inode_item_key_version (inode), le32_to_cpu (INODE_PKEY (inode)->k_dir_id),
  89                  le32_to_cpu (INODE_PKEY (inode)->k_objectid),
  90                  offset, type, length);
  91 }
  92
  93
  94 //
  95 // when key is 0, do not set version and short key
  96 //
  97 inline void make_le_item_head (struct item_head * ih, const struct cpu_key * key,
  98                                int version,
  99                                loff_t offset, int type, int length,
 100                                int entry_count/*or ih_free_space*/)
 101 {
 102     if (key) {
 103         ih->ih_key.k_dir_id = cpu_to_le32 (key->on_disk_key.k_dir_id);
 104         ih->ih_key.k_objectid = cpu_to_le32 (key->on_disk_key.k_objectid);
 105     }
 106     put_ih_version( ih, version );
 107     set_le_ih_k_offset (ih, offset);
 108     set_le_ih_k_type (ih, type);
 109     put_ih_item_len( ih, length );
 110     /*    set_ih_free_space (ih, 0);*/
 111     // for directory items it is entry count, for directs and stat
 112     // datas - 0xffff, for indirects - 0
 113     put_ih_entry_count( ih, entry_count );
 114 }
 115
 116 //
 117 // FIXME: we might cache recently accessed indirect item
 118
 119 // Ugh.  Not too eager for that....
 120 //  I cut the code until such time as I see a convincing argument (benchmark).
 121 // I don't want a bloated inode struct..., and I don't like code complexity....
 122
 123 /* cutting the code is fine, since it really isn't in use yet and is easy
 124 ** to add back in.  But, Vladimir has a really good idea here.  Think
 125 ** about what happens for reading a file.  For each page,
 126 ** The VFS layer calls reiserfs_readpage, who searches the tree to find
 127 ** an indirect item.  This indirect item has X number of pointers, where
 128 ** X is a big number if we've done the block allocation right.  But,
 129 ** we only use one or two of these pointers during each call to readpage,
 130 ** needlessly researching again later on.
 131 **
 132 ** The size of the cache could be dynamic based on the size of the file.
 133 **
 134 ** I'd also like to see us cache the location the stat data item, since
 135 ** we are needlessly researching for that frequently.
 136 **
 137 ** --chris
 138 */
 139
 140 /* If this page has a file tail in it, and
 141 ** it was read in by get_block_create_0, the page data is valid,
 142 ** but tail is still sitting in a direct item, and we can't write to
 143 ** it.  So, look through this page, and check all the mapped buffers
 144 ** to make sure they have valid block numbers.  Any that don't need
 145 ** to be unmapped, so that block_prepare_write will correctly call
 146 ** reiserfs_get_block to convert the tail into an unformatted node
 147 */
 148 static inline void fix_tail_page_for_writing(struct page *page) {
 149     struct buffer_head *head, *next, *bh ;
 150
 151     if (page && page_has_buffers(page)) {
 152         head = page_buffers(page) ;
 153         bh = head ;
 154         do {
 155             next = bh->b_this_page ;
 156             if (buffer_mapped(bh) && bh->b_blocknr == 0) {
 157                 reiserfs_unmap_buffer(bh) ;
 158             }
 159             bh = next ;
 160         } while (bh != head) ;
 161     }
 162 }
 163
 164 /* reiserfs_get_block does not need to allocate a block only if it has been
 165    done already or non-hole position has been found in the indirect item */
 166 static inline int allocation_needed (int retval, b_blocknr_t allocated,
 167                                      struct item_head * ih,
 168                                      __u32 * item, int pos_in_item)
 169 {
 170   if (allocated)
 171          return 0;
 172   if (retval == POSITION_FOUND && is_indirect_le_ih (ih) &&
 173       get_block_num(item, pos_in_item))
 174          return 0;
 175   return 1;
 176 }
 177
 178 static inline int indirect_item_found (int retval, struct item_head * ih)
 179 {
 180   return (retval == POSITION_FOUND) && is_indirect_le_ih (ih);
 181 }
 182
 183
 184 static inline void set_block_dev_mapped (struct buffer_head * bh,
 185                                          b_blocknr_t block, struct inode * inode)
 186 {
 187         map_bh(bh, inode->i_sb, block);
 188 }
 189
 190
 191 //
 192 // files which were created in the earlier version can not be longer,
 193 // than 2 gb
 194 //
 195 static int file_capable (struct inode * inode, long block)
 196 {
 197     if (get_inode_item_key_version (inode) != KEY_FORMAT_3_5 || // it is new file.
 198         block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb
 199         return 1;
 200
 201     return 0;
 202 }
 203
 204 /*static*/ void restart_transaction(struct reiserfs_transaction_handle *th,
 205                                 struct inode *inode, struct path *path) {
 206   struct super_block *s = th->t_super ;
 207   int len = th->t_blocks_allocated ;
 208
 209   /* we cannot restart while nested */
 210   if (th->t_refcount > 1) {
 211       return  ;
 212   }
 213   pathrelse(path) ;
 214   reiserfs_update_sd(th, inode) ;
 215   journal_end(th, s, len) ;
 216   journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6) ;
 217   reiserfs_update_inode_transaction(inode) ;
 218 }
 219
  220 // Called by reiserfs_get_block when no block creation is requested, and
  221 // by reiserfs_bmap. Maps the 'block'-th logical block of the file into
  222 // bh_result. If the block lives in a direct item, the bmap caller gets
  223 // no mapping, while other callers get the tail read into bh_result's page.
 224
 225 // Please improve the english/clarity in the comment above, as it is
 226 // hard to understand.
 227
 228 static int _get_block_create_0 (struct inode * inode, long block,
 229                                  struct buffer_head * bh_result,
 230                                  int args)
 231 {
 232     INITIALIZE_PATH (path);
 233     struct cpu_key key;
 234     struct buffer_head * bh;
 235     struct item_head * ih, tmp_ih;
 236     int fs_gen ;
 237     int blocknr;
 238     char * p = NULL;
 239     int chars;
 240     int ret ;
 241     int done = 0 ;
 242     unsigned long offset ;
 243
 244     // prepare the key to look for the 'block'-th block of file
 245     make_cpu_key (&key, inode,
 246                   (loff_t)block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 3);
 247
 248 research:
 249     if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND) {
 250         pathrelse (&path);
 251         if (p)
 252             kunmap(bh_result->b_page) ;
 253         // We do not return -ENOENT if there is a hole but page is uptodate, because it means
 254         // That there is some MMAPED data associated with it that is yet to be written to disk.
 255         if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) {
 256             return -ENOENT ;
 257         }
 258         return 0 ;
 259     }
 260
 261     //
 262     bh = get_last_bh (&path);
 263     ih = get_ih (&path);
 264     if (is_indirect_le_ih (ih)) {
 265         __u32 * ind_item = (__u32 *)B_I_PITEM (bh, ih);
 266
 267         /* FIXME: here we could cache indirect item or part of it in
 268            the inode to avoid search_by_key in case of subsequent
 269            access to file */
 270         blocknr = get_block_num(ind_item, path.pos_in_item) ;
 271         ret = 0 ;
 272         if (blocknr) {
 273             map_bh(bh_result, inode->i_sb, blocknr);
 274             if (path.pos_in_item == ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
 275                 set_buffer_boundary(bh_result);
 276             }
 277         } else
 278             // We do not return -ENOENT if there is a hole but page is uptodate, because it means
 279             // That there is some MMAPED data associated with it that is yet to  be written to disk.
 280             if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) {
 281             ret = -ENOENT ;
 282             }
 283
 284         pathrelse (&path);
 285         if (p)
 286             kunmap(bh_result->b_page) ;
 287         return ret ;
 288     }
 289
 290     // requested data are in direct item(s)
 291     if (!(args & GET_BLOCK_READ_DIRECT)) {
 292         // we are called by bmap. FIXME: we can not map block of file
 293         // when it is stored in direct item(s)
 294         pathrelse (&path);
 295         if (p)
 296             kunmap(bh_result->b_page) ;
 297         return -ENOENT;
 298     }
 299
 300     /* if we've got a direct item, and the buffer or page was uptodate,
 301     ** we don't want to pull data off disk again.  skip to the
 302     ** end, where we map the buffer and return
 303     */
 304     if (buffer_uptodate(bh_result)) {
 305         goto finished ;
 306     } else
 307         /*
 308         ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
 309         ** pages without any buffers.  If the page is up to date, we don't want
 310         ** read old data off disk.  Set the up to date bit on the buffer instead
 311         ** and jump to the end
 312         */
 313             if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
 314                 set_buffer_uptodate(bh_result);
 315                 goto finished ;
 316     }
 317
 318     // read file tail into part of page
 319     offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1) ;
 320     fs_gen = get_generation(inode->i_sb) ;
 321     copy_item_head (&tmp_ih, ih);
 322
 323     /* we only want to kmap if we are reading the tail into the page.
 324     ** this is not the common case, so we don't kmap until we are
 325     ** sure we need to.  But, this means the item might move if
 326     ** kmap schedules
 327     */
 328     if (!p) {
 329         p = (char *)kmap(bh_result->b_page) ;
 330         if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
 331             goto research;
 332         }
 333     }
 334     p += offset ;
 335     memset (p, 0, inode->i_sb->s_blocksize);
 336     do {
 337         if (!is_direct_le_ih (ih)) {
 338             BUG ();
 339         }
 340         /* make sure we don't read more bytes than actually exist in
 341         ** the file.  This can happen in odd cases where i_size isn't
 342         ** correct, and when direct item padding results in a few
 343         ** extra bytes at the end of the direct item
 344         */
 345         if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
 346             break ;
 347         if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
 348             chars = inode->i_size - (le_ih_k_offset(ih) - 1) - path.pos_in_item;
 349             done = 1 ;
 350         } else {
 351             chars = ih_item_len(ih) - path.pos_in_item;
 352         }
 353         memcpy (p, B_I_PITEM (bh, ih) + path.pos_in_item, chars);
 354
 355         if (done)
 356             break ;
 357
 358         p += chars;
 359
 360         if (PATH_LAST_POSITION (&path) != (B_NR_ITEMS (bh) - 1))
 361             // we done, if read direct item is not the last item of
 362             // node FIXME: we could try to check right delimiting key
 363             // to see whether direct item continues in the right
 364             // neighbor or rely on i_size
 365             break;
 366
 367         // update key to look for the next piece
 368         set_cpu_key_k_offset (&key, cpu_key_k_offset (&key) + chars);
 369         if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND)
 370             // we read something from tail, even if now we got IO_ERROR
 371             break;
 372         bh = get_last_bh (&path);
 373         ih = get_ih (&path);
 374     } while (1);
 375
 376     flush_dcache_page(bh_result->b_page) ;
 377     kunmap(bh_result->b_page) ;
 378
 379 finished:
 380     pathrelse (&path);
 381     /* this buffer has valid data, but isn't valid for io.  mapping it to
 382      * block #0 tells the rest of reiserfs it just has a tail in it
 383      */
 384     map_bh(bh_result, inode->i_sb, 0);
 385     set_buffer_uptodate (bh_result);
 386     return 0;
 387 }
 388
 389
 390 // this is called to create file map. So, _get_block_create_0 will not
 391 // read direct item
 392 int reiserfs_bmap (struct inode * inode, sector_t block,
 393                    struct buffer_head * bh_result, int create)
 394 {
 395     if (!file_capable (inode, block))
 396         return -EFBIG;
 397
 398     reiserfs_write_lock(inode->i_sb);
 399     /* do not read the direct item */
 400     _get_block_create_0 (inode, block, bh_result, 0) ;
 401     reiserfs_write_unlock(inode->i_sb);
 402     return 0;
 403 }
 404
 405 /* special version of get_block that is only used by grab_tail_page right
 406 ** now.  It is sent to block_prepare_write, and when you try to get a
 407 ** block past the end of the file (or a block from a hole) it returns
 408 ** -ENOENT instead of a valid buffer.  block_prepare_write expects to
 409 ** be able to do i/o on the buffers returned, unless an error value
 410 ** is also returned.
 411 **
 412 ** So, this allows block_prepare_write to be used for reading a single block
 413 ** in a page.  Where it does not produce a valid page for holes, or past the
 414 ** end of the file.  This turns out to be exactly what we need for reading
 415 ** tails for conversion.
 416 **
 417 ** The point of the wrapper is forcing a certain value for create, even
 418 ** though the VFS layer is calling this function with create==1.  If you
 419 ** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
 420 ** don't use this function.
 421 */
 422 static int reiserfs_get_block_create_0 (struct inode * inode, sector_t block,
 423                         struct buffer_head * bh_result, int create) {
 424     return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE) ;
 425 }
 426
 427 /* This is special helper for reiserfs_get_block in case we are executing
 428    direct_IO request. */
 429 static int reiserfs_get_blocks_direct_io(struct inode *inode,
 430                                          sector_t iblock,
 431                                          unsigned long max_blocks,
 432                                          struct buffer_head *bh_result,
 433                                          int create)
 434 {
 435     int ret ;
 436
 437     bh_result->b_page = NULL;
 438
 439     /* We set the b_size before reiserfs_get_block call since it is
 440        referenced in convert_tail_for_hole() that may be called from
 441        reiserfs_get_block() */
 442     bh_result->b_size = (1 << inode->i_blkbits);
 443
 444     ret = reiserfs_get_block(inode, iblock, bh_result,
 445                              create | GET_BLOCK_NO_DANGLE) ;
 446
 447     /* don't allow direct io onto tail pages */
 448     if (ret == 0 && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
 449         /* make sure future calls to the direct io funcs for this offset
 450         ** in the file fail by unmapping the buffer
 451         */
 452         clear_buffer_mapped(bh_result);
 453         ret = -EINVAL ;
 454     }
 455     /* Possible unpacked tail. Flush the data before pages have
 456        disappeared */
 457     if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
 458         lock_kernel();
 459         reiserfs_commit_for_inode(inode);
 460         REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
 461         unlock_kernel();
 462     }
 463     return ret ;
 464 }
 465
 466
 467 /*
 468 ** helper function for when reiserfs_get_block is called for a hole
 469 ** but the file tail is still in a direct item
 470 ** bh_result is the buffer head for the hole
 471 ** tail_offset is the offset of the start of the tail in the file
 472 **
 473 ** This calls prepare_write, which will start a new transaction
 474 ** you should not be in a transaction, or have any paths held when you
 475 ** call this.
 476 */
 477 static int convert_tail_for_hole(struct inode *inode,
 478                                  struct buffer_head *bh_result,
 479                                  loff_t tail_offset) {
 480     unsigned long index ;
 481     unsigned long tail_end ;
 482     unsigned long tail_start ;
 483     struct page * tail_page ;
 484     struct page * hole_page = bh_result->b_page ;
 485     int retval = 0 ;
 486
 487     if ((tail_offset & (bh_result->b_size - 1)) != 1)
 488         return -EIO ;
 489
 490     /* always try to read until the end of the block */
 491     tail_start = tail_offset & (PAGE_CACHE_SIZE - 1) ;
 492     tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ;
 493
 494     index = tail_offset >> PAGE_CACHE_SHIFT ;
 495     /* hole_page can be zero in case of direct_io, we are sure
 496        that we cannot get here if we write with O_DIRECT into
 497        tail page */
 498     if (!hole_page || index != hole_page->index) {
 499         tail_page = grab_cache_page(inode->i_mapping, index) ;
 500         retval = -ENOMEM;
 501         if (!tail_page) {
 502             goto out ;
 503         }
 504     } else {
 505         tail_page = hole_page ;
 506     }
 507
 508     /* we don't have to make sure the conversion did not happen while
 509     ** we were locking the page because anyone that could convert
 510     ** must first take i_sem.
 511     **
 512     ** We must fix the tail page for writing because it might have buffers
 513     ** that are mapped, but have a block number of 0.  This indicates tail
 514     ** data that has been read directly into the page, and block_prepare_write
 515     ** won't trigger a get_block in this case.
 516     */
 517     fix_tail_page_for_writing(tail_page) ;
 518     retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
 519     if (retval)
 520         goto unlock ;
 521
 522     /* tail conversion might change the data in the page */
 523     flush_dcache_page(tail_page) ;
 524
 525     retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end) ;
 526
 527 unlock:
 528     if (tail_page != hole_page) {
 529         unlock_page(tail_page) ;
 530         page_cache_release(tail_page) ;
 531     }
 532 out:
 533     return retval ;
 534 }
 535
 536 static inline int _allocate_block(struct reiserfs_transaction_handle *th,
 537                            long block,
 538                            struct inode *inode,
 539                            b_blocknr_t *allocated_block_nr,
 540                            struct path * path,
 541                            int flags) {
 542
 543 #ifdef REISERFS_PREALLOCATE
 544     if (!(flags & GET_BLOCK_NO_ISEM)) {
 545         return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, path, block);
 546     }
 547 #endif
 548     return reiserfs_new_unf_blocknrs (th, inode, allocated_block_nr, path, block);
 549 }
 550
 551 int reiserfs_get_block (struct inode * inode, sector_t block,
 552                         struct buffer_head * bh_result, int create)
 553 {
 554     int repeat, retval;
 555     b_blocknr_t allocated_block_nr = 0;// b_blocknr_t is (unsigned) 32 bit int
 556     INITIALIZE_PATH(path);
 557     int pos_in_item;
 558     struct cpu_key key;
 559     struct buffer_head * bh, * unbh = 0;
 560     struct item_head * ih, tmp_ih;
 561     __u32 * item;
 562     int done;
 563     int fs_gen;
 564     struct reiserfs_transaction_handle *th = NULL;
 565     /* space reserved in transaction batch:
 566         . 3 balancings in direct->indirect conversion
 567         . 1 block involved into reiserfs_update_sd()
 568        XXX in practically impossible worst case direct2indirect()
 569        can incur (much) more that 3 balancings. */
 570     int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1;
 571     int version;
 572     int dangle = 1;
 573     loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ;
 574
 575                                 /* bad.... */
 576     reiserfs_write_lock(inode->i_sb);
 577     version = get_inode_item_key_version (inode);
 578
 579     if (block < 0) {
 580         reiserfs_write_unlock(inode->i_sb);
 581         return -EIO;
 582     }
 583
 584     if (!file_capable (inode, block)) {
 585         reiserfs_write_unlock(inode->i_sb);
 586         return -EFBIG;
 587     }
 588
 589     /* if !create, we aren't changing the FS, so we don't need to
 590     ** log anything, so we don't need to start a transaction
 591     */
 592     if (!(create & GET_BLOCK_CREATE)) {
 593         int ret ;
 594         /* find number of block-th logical block of the file */
 595         ret = _get_block_create_0 (inode, block, bh_result,
 596                                    create | GET_BLOCK_READ_DIRECT) ;
 597         reiserfs_write_unlock(inode->i_sb);
 598         return ret;
 599     }
 600     /*
 601      * if we're already in a transaction, make sure to close
 602      * any new transactions we start in this func
 603      */
 604     if ((create & GET_BLOCK_NO_DANGLE) ||
 605         reiserfs_transaction_running(inode->i_sb))
 606         dangle = 0;
 607
 608     /* If file is of such a size, that it might have a tail and tails are enabled
 609     ** we should mark it as possibly needing tail packing on close
 610     */
 611     if ( (have_large_tails (inode->i_sb) && inode->i_size < i_block_size (inode)*4) ||
 612          (have_small_tails (inode->i_sb) && inode->i_size < i_block_size(inode)) )
 613         REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
 614
 615     /* set the key of the first byte in the 'block'-th block of file */
 616     make_cpu_key (&key, inode, new_offset,
 617                   TYPE_ANY, 3/*key length*/);
 618     if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
 619 start_trans:
 620         th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
 621         if (!th) {
 622             retval = -ENOMEM;
 623             goto failure;
 624         }
 625         reiserfs_update_inode_transaction(inode) ;
 626     }
 627  research:
 628
 629     retval = search_for_position_by_key (inode->i_sb, &key, &path);
 630     if (retval == IO_ERROR) {
 631         retval = -EIO;
 632         goto failure;
 633     }
 634
 635     bh = get_last_bh (&path);
 636     ih = get_ih (&path);
 637     item = get_item (&path);
 638     pos_in_item = path.pos_in_item;
 639
 640     fs_gen = get_generation (inode->i_sb);
 641     copy_item_head (&tmp_ih, ih);
 642
 643     if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) {
 644         /* we have to allocate block for the unformatted node */
 645         if (!th) {
 646             pathrelse(&path) ;
 647             goto start_trans;
 648         }
 649
 650         repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create);
 651
 652         if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
 653             /* restart the transaction to give the journal a chance to free
 654             ** some blocks.  releases the path, so we have to go back to
 655             ** research if we succeed on the second try
 656             */
 657             SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
 658             restart_transaction(th, inode, &path) ;
 659             repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create);
 660
 661             if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
 662                 goto research ;
 663             }
 664             if (repeat == QUOTA_EXCEEDED)
 665                 retval = -EDQUOT;
 666             else
 667                 retval = -ENOSPC;
 668             goto failure;
 669         }
 670
 671         if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
 672             goto research;
 673         }
 674     }
 675
 676     if (indirect_item_found (retval, ih)) {
 677         b_blocknr_t unfm_ptr;
 678         /* 'block'-th block is in the file already (there is
 679            corresponding cell in some indirect item). But it may be
 680            zero unformatted node pointer (hole) */
 681         unfm_ptr = get_block_num (item, pos_in_item);
 682         if (unfm_ptr == 0) {
 683             /* use allocated block to plug the hole */
 684             reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
 685             if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
 686                 reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
 687                 goto research;
 688             }
 689             set_buffer_new(bh_result);
 690             if (buffer_dirty(bh_result) && reiserfs_data_ordered(inode->i_sb))
 691                 reiserfs_add_ordered_list(inode, bh_result);
 692             put_block_num(item, pos_in_item, allocated_block_nr) ;
 693             unfm_ptr = allocated_block_nr;
 694             journal_mark_dirty (th, inode->i_sb, bh);
 695             reiserfs_update_sd(th, inode) ;
 696         }
 697         set_block_dev_mapped(bh_result, unfm_ptr, inode);
 698         pathrelse (&path);
 699         if (!dangle && th)
 700             reiserfs_end_persistent_transaction(th);
 701
 702         reiserfs_write_unlock(inode->i_sb);
 703
 704         /* the item was found, so new blocks were not added to the file
 705         ** there is no need to make sure the inode is updated with this
 706         ** transaction
 707         */
 708         return 0;
 709     }
 710
 711     if (!th) {
 712         pathrelse(&path) ;
 713         goto start_trans;
 714     }
 715
 716     /* desired position is not found or is in the direct item. We have
 717        to append file with holes up to 'block'-th block converting
 718        direct items to indirect one if necessary */
 719     done = 0;
 720     do {
 721         if (is_statdata_le_ih (ih)) {
 722             __u32 unp = 0;
 723             struct cpu_key tmp_key;
 724
 725             /* indirect item has to be inserted */
 726             make_le_item_head (&tmp_ih, &key, version, 1, TYPE_INDIRECT,
 727                                UNFM_P_SIZE, 0/* free_space */);
 728
 729             if (cpu_key_k_offset (&key) == 1) {
 730                 /* we are going to add 'block'-th block to the file. Use
 731                    allocated block for that */
 732                 unp = cpu_to_le32 (allocated_block_nr);
 733                 set_block_dev_mapped (bh_result, allocated_block_nr, inode);
 734                 set_buffer_new(bh_result);
 735                 done = 1;
 736             }
 737             tmp_key = key; // ;)
 738             set_cpu_key_k_offset (&tmp_key, 1);
 739             PATH_LAST_POSITION(&path) ++;
 740
 741             retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, inode, (char *)&unp);
 742             if (retval) {
 743                 reiserfs_free_block (th, inode, allocated_block_nr, 1);
 744                 goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
 745             }
 746             //mark_tail_converted (inode);
 747         } else if (is_direct_le_ih (ih)) {
 748             /* direct item has to be converted */
 749             loff_t tail_offset;
 750
 751             tail_offset = ((le_ih_k_offset (ih) - 1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
 752             if (tail_offset == cpu_key_k_offset (&key)) {
 753                 /* direct item we just found fits into block we have
 754                    to map. Convert it into unformatted node: use
 755                    bh_result for the conversion */
 756                 set_block_dev_mapped (bh_result, allocated_block_nr, inode);
 757                 unbh = bh_result;
 758                 done = 1;
 759             } else {
 760                 /* we have to padd file tail stored in direct item(s)
 761                    up to block size and convert it to unformatted
 762                    node. FIXME: this should also get into page cache */
 763
 764                 pathrelse(&path) ;
 765                 /*
 766                  * ugly, but we can only end the transaction if
 767                  * we aren't nested
 768                  */
 769                 if (th->t_refcount == 1) {
 770                     reiserfs_end_persistent_transaction(th);
 771                     th = NULL;
 772                 }
 773
 774                 retval = convert_tail_for_hole(inode, bh_result, tail_offset) ;
 775                 if (retval) {
 776                     if ( retval != -ENOSPC )
 777                         reiserfs_warning (inode->i_sb, "clm-6004: convert tail failed inode %lu, error %d", inode->i_ino, retval) ;
 778                     if (allocated_block_nr) {
 779                         /* the bitmap, the super, and the stat data == 3 */
 780                         if (!th)
 781                             th = reiserfs_persistent_transaction(inode->i_sb,3);
 782                         if (th)
 783                             reiserfs_free_block (th,inode,allocated_block_nr,1);
 784                     }
 785                     goto failure ;
 786                 }
 787                 goto research ;
 788             }
 789             retval = direct2indirect (th, inode, &path, unbh, tail_offset);
 790             if (retval) {
 791                 reiserfs_unmap_buffer(unbh);
 792                 reiserfs_free_block (th, inode, allocated_block_nr, 1);
 793                 goto failure;
 794             }
 795             /* it is important the set_buffer_uptodate is done after
 796             ** the direct2indirect.  The buffer might contain valid
 797             ** data newer than the data on disk (read by readpage, changed,
 798             ** and then sent here by writepage).  direct2indirect needs
 799             ** to know if unbh was already up to date, so it can decide
 800             ** if the data in unbh needs to be replaced with data from
 801             ** the disk
 802             */
 803             set_buffer_uptodate (unbh);
 804
 805             /* unbh->b_page == NULL in case of DIRECT_IO request, this means
 806                buffer will disappear shortly, so it should not be added to
 807              */
 808             if ( unbh->b_page ) {
 809                 /* we've converted the tail, so we must
 810                 ** flush unbh before the transaction commits
 811                 */
 812                 reiserfs_add_tail_list(inode, unbh) ;
 813
 814                 /* mark it dirty now to prevent commit_write from adding
 815                 ** this buffer to the inode's dirty buffer list
 816                 */
 817                 /*
 818                  * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
 819                  * It's still atomic, but it sets the page dirty too,
 820                  * which makes it eligible for writeback at any time by the
 821                  * VM (which was also the case with __mark_buffer_dirty())
 822                  */
 823                 mark_buffer_dirty(unbh) ;
 824             }
 825         } else {
 826             /* append indirect item with holes if needed, when appending
 827                pointer to 'block'-th block use block, which is already
 828                allocated */
 829             struct cpu_key tmp_key;
 830             unp_t unf_single=0; // We use this in case we need to allocate only
 831                                 // one block which is a fastpath
 832             unp_t *un;
 833             __u64 max_to_insert=MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE;
 834             __u64 blocks_needed;
 835
 836             RFALSE( pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
 837                     "vs-804: invalid position for append");
 838             /* indirect item has to be appended, set up key of that position */
 839             make_cpu_key (&tmp_key, inode,
 840                           le_key_k_offset (version, &(ih->ih_key)) + op_bytes_number (ih, inode->i_sb->s_blocksize),
 841                           //pos_in_item * inode->i_sb->s_blocksize,
 842                           TYPE_INDIRECT, 3);// key type is unimportant
 843
 844             blocks_needed = 1 + ((cpu_key_k_offset (&key) - cpu_key_k_offset (&tmp_key)) >> inode->i_sb->s_blocksize_bits);
 845             RFALSE( blocks_needed < 0, "green-805: invalid offset");
 846
 847             if ( blocks_needed == 1 ) {
 848                 un = &unf_single;
 849             } else {
 850                 un=kmalloc( min(blocks_needed,max_to_insert)*UNFM_P_SIZE,
 851                             GFP_ATOMIC); // We need to avoid scheduling.
 852                 if ( !un) {
 853                     un = &unf_single;
 854                     blocks_needed = 1;
 855                     max_to_insert = 0;
 856                 } else
 857                     memset(un, 0, UNFM_P_SIZE * min(blocks_needed,max_to_insert));
 858             }
 859             if ( blocks_needed <= max_to_insert) {
 860                 /* we are going to add target block to the file. Use allocated
 861                    block for that */
 862                 un[blocks_needed-1] = cpu_to_le32 (allocated_block_nr);
 863                 set_block_dev_mapped (bh_result, allocated_block_nr, inode);
 864                 set_buffer_new(bh_result);
 865                 done = 1;
 866             } else {
 867                 /* paste hole to the indirect item */
 868                 /* If kmalloc failed, max_to_insert becomes zero and it means we
 869                    only have space for one block */
 870                 blocks_needed=max_to_insert?max_to_insert:1;
 871             }
 872             retval = reiserfs_paste_into_item (th, &path, &tmp_key, inode, (char *)un, UNFM_P_SIZE * blocks_needed);
 873
 874             if (blocks_needed != 1)
 875                 kfree(un);
 876
 877             if (retval) {
 878                 reiserfs_free_block (th, inode, allocated_block_nr, 1);
 879                 goto failure;
 880             }
 881             if (!done) {
 882                 /* We need to mark new file size in case this function will be
 883                    interrupted/aborted later on. And we may do this only for
 884                    holes. */
 885                 inode->i_size += inode->i_sb->s_blocksize * blocks_needed;
 886             }
 887         }
 888
 889         if (done == 1)
 890             break;
 891
 892         /* this loop could log more blocks than we had originally asked
 893         ** for.  So, we have to allow the transaction to end if it is
 894         ** too big or too full.  Update the inode so things are
 895         ** consistent if we crash before the function returns
 896         **
 897         ** release the path so that anybody waiting on the path before
 898         ** ending their transaction will be able to continue.
 899         */
 900         if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
 901           restart_transaction(th, inode, &path) ;
 902         }
 903         /* inserting indirect pointers for a hole can take a
 904         ** long time.  reschedule if needed
 905         */
 906         cond_resched();
 907
 908         retval = search_for_position_by_key (inode->i_sb, &key, &path);
 909         if (retval == IO_ERROR) {
 910             retval = -EIO;
 911             goto failure;
 912         }
 913         if (retval == POSITION_FOUND) {
 914             reiserfs_warning (inode->i_sb, "vs-825: reiserfs_get_block: "
 915                               "%K should not be found", &key);
 916             retval = -EEXIST;
 917             if (allocated_block_nr)
 918                 reiserfs_free_block (th, inode, allocated_block_nr, 1);
 919             pathrelse(&path) ;
 920             goto failure;
 921         }
 922         bh = get_last_bh (&path);
 923         ih = get_ih (&path);
 924         item = get_item (&path);
 925         pos_in_item = path.pos_in_item;
 926     } while (1);
 927
 928
 929     retval = 0;
 930
 931  failure:
 932     if (th && !dangle) {
 933       reiserfs_update_sd(th, inode) ;
 934       reiserfs_end_persistent_transaction(th);
 935     }
 936     reiserfs_write_unlock(inode->i_sb);
 937     reiserfs_check_path(&path) ;
 938     return retval;
 939 }
 940
 941 static int
 942 reiserfs_readpages(struct file *file, struct address_space *mapping,
 943                 struct list_head *pages, unsigned nr_pages)
 944 {
 945     return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
 946 }
 947
 948 /* Compute real number of used bytes by file
 949  * Following three functions can go away when we'll have enough space in stat item
 950  */
 951 static int real_space_diff(struct inode *inode, int sd_size)
 952 {
 953     int bytes;
 954     loff_t blocksize = inode->i_sb->s_blocksize ;
 955
 956     if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
 957         return sd_size ;
 958
 959     /* End of file is also in full block with indirect reference, so round
 960     ** up to the next block.
 961     **
 962     ** there is just no way to know if the tail is actually packed
 963     ** on the file, so we have to assume it isn't.  When we pack the
 964     ** tail, we add 4 bytes to pretend there really is an unformatted
 965     ** node pointer
 966     */
 967     bytes = ((inode->i_size + (blocksize-1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + sd_size;
 968     return bytes ;
 969 }
 970
 971 static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
 972                                         int sd_size)
 973 {
 974     if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
 975         return inode->i_size + (loff_t)(real_space_diff(inode, sd_size)) ;
 976     }
 977     return ((loff_t)real_space_diff(inode, sd_size)) + (((loff_t)blocks) << 9);
 978 }
 979
 980 /* Compute number of blocks used by file in ReiserFS counting */
 981 static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
 982 {
 983     loff_t bytes = inode_get_bytes(inode) ;
 984     loff_t real_space = real_space_diff(inode, sd_size) ;
 985
 986     /* keeps fsck and non-quota versions of reiserfs happy */
 987     if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
 988         bytes += (loff_t)511 ;
 989     }
 990
 991     /* files from before the quota patch might i_blocks such that
 992     ** bytes < real_space.  Deal with that here to prevent it from
 993     ** going negative.
 994     */
 995     if (bytes < real_space)
 996         return 0 ;
 997     return (bytes - real_space) >> 9;
 998 }
 999
1000 //
1001 // BAD: new directories have stat data of new type and all other items
1002 // of old type. Version stored in the inode says about body items, so
1003 // in update_stat_data we can not rely on inode, but have to check
1004 // item version directly
1005 //
1006
1007 // called by read_locked_inode
1008 static void init_inode (struct inode * inode, struct path * path)
1009 {
1010     struct buffer_head * bh;
1011     struct item_head * ih;
1012     __u32 rdev;
1013     //int version = ITEM_VERSION_1;
1014
1015     bh = PATH_PLAST_BUFFER (path);
1016     ih = PATH_PITEM_HEAD (path);
1017
1018
1019     copy_key (INODE_PKEY (inode), &(ih->ih_key));
1020     inode->i_blksize = reiserfs_default_io_size;
1021
1022     INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list ));
1023     REISERFS_I(inode)->i_flags = 0;
1024     REISERFS_I(inode)->i_prealloc_block = 0;
1025     REISERFS_I(inode)->i_prealloc_count = 0;
1026     REISERFS_I(inode)->i_trans_id = 0;
1027     REISERFS_I(inode)->i_jl = NULL;
1028     REISERFS_I(inode)->i_acl_access = NULL;
1029     REISERFS_I(inode)->i_acl_default = NULL;
1030     init_rwsem (&REISERFS_I(inode)->xattr_sem);
1031
1032     if (stat_data_v1 (ih)) {
1033         struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih);
1034         unsigned long blocks;
1035
1036         set_inode_item_key_version (inode, KEY_FORMAT_3_5);
1037         set_inode_sd_version (inode, STAT_DATA_V1);
1038         inode->i_mode  = sd_v1_mode(sd);
1039         inode->i_nlink = sd_v1_nlink(sd);
1040         inode->i_uid   = sd_v1_uid(sd);
1041         inode->i_gid   = sd_v1_gid(sd);
1042         inode->i_size  = sd_v1_size(sd);
1043         inode->i_atime.tv_sec = sd_v1_atime(sd);
1044         inode->i_mtime.tv_sec = sd_v1_mtime(sd);
1045         inode->i_ctime.tv_sec = sd_v1_ctime(sd);
1046         inode->i_atime.tv_nsec = 0;
1047         inode->i_ctime.tv_nsec = 0;
1048         inode->i_mtime.tv_nsec = 0;
1049
1050         inode->i_blocks = sd_v1_blocks(sd);
1051         inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id);
1052         blocks = (inode->i_size + 511) >> 9;
1053         blocks = _ROUND_UP (blocks, inode->i_sb->s_blocksize >> 9);
1054         if (inode->i_blocks > blocks) {
1055             // there was a bug in <=3.5.23 when i_blocks could take negative
1056             // values. Starting from 3.5.17 this value could even be stored in
1057             // stat data. For such files we set i_blocks based on file
1058             // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be
1059             // only updated if file's inode will ever change
1060             inode->i_blocks = blocks;
1061         }
1062
1063         rdev = sd_v1_rdev(sd);
1064         REISERFS_I(inode)->i_first_direct_byte = sd_v1_first_direct_byte(sd);
1065         /* an early bug in the quota code can give us an odd number for the
1066         ** block count.  This is incorrect, fix it here.
1067         */
1068         if (inode->i_blocks & 1) {
1069             inode->i_blocks++ ;
1070         }
1071         inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
1072                                                   SD_V1_SIZE));
1073         /* nopack is initially zero for v1 objects. For v2 objects,
1074            nopack is initialised from sd_attrs */
1075         REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
1076     } else {
1077         // new stat data found, but object may have old items
1078         // (directories and symlinks)
1079         struct stat_data * sd = (struct stat_data *)B_I_PITEM (bh, ih);
1080
1081         inode->i_mode   = sd_v2_mode(sd);
1082         inode->i_nlink  = sd_v2_nlink(sd);
1083         inode->i_uid    = sd_v2_uid(sd);
1084         inode->i_size   = sd_v2_size(sd);
1085         inode->i_gid    = sd_v2_gid(sd);
1086         inode->i_mtime.tv_sec  = sd_v2_mtime(sd);
1087         inode->i_atime.tv_sec = sd_v2_atime(sd);
1088         inode->i_ctime.tv_sec  = sd_v2_ctime(sd);
1089         inode->i_ctime.tv_nsec = 0;
1090         inode->i_mtime.tv_nsec = 0;
1091         inode->i_atime.tv_nsec = 0;
1092         inode->i_blocks = sd_v2_blocks(sd);
1093         rdev            = sd_v2_rdev(sd);
1094         if( S_ISCHR( inode -> i_mode ) || S_ISBLK( inode -> i_mode ) )
1095             inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id);
1096         else
1097             inode->i_generation = sd_v2_generation(sd);
1098
1099         if (S_ISDIR (inode->i_mode) || S_ISLNK (inode->i_mode))
1100             set_inode_item_key_version (inode, KEY_FORMAT_3_5);
1101         else
1102             set_inode_item_key_version (inode, KEY_FORMAT_3_6);
1103         REISERFS_I(inode)->i_first_direct_byte = 0;
1104         set_inode_sd_version (inode, STAT_DATA_V2);
1105         inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
1106                                                   SD_V2_SIZE));
1107         /* read persistent inode attributes from sd and initalise
1108            generic inode flags from them */
1109         REISERFS_I(inode)->i_attrs = sd_v2_attrs( sd );
1110         sd_attrs_to_i_attrs( sd_v2_attrs( sd ), inode );
1111     }
1112
1113     pathrelse (path);
1114     if (S_ISREG (inode->i_mode)) {
1115         inode->i_op = &reiserfs_file_inode_operations;
1116         inode->i_fop = &reiserfs_file_operations;
1117         inode->i_mapping->a_ops = &reiserfs_address_space_operations ;
1118     } else if (S_ISDIR (inode->i_mode)) {
1119         inode->i_op = &reiserfs_dir_inode_operations;
1120         inode->i_fop = &reiserfs_dir_operations;
1121     } else if (S_ISLNK (inode->i_mode)) {
1122         inode->i_op = &reiserfs_symlink_inode_operations;
1123         inode->i_mapping->a_ops = &reiserfs_address_space_operations;
1124     } else {
1125         inode->i_blocks = 0;
1126         inode->i_op = &reiserfs_special_inode_operations;
1127         init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
1128     }
1129 }
1130
1131
1132 // update new stat data with inode fields
1133 static void inode2sd (void * sd, struct inode * inode, loff_t size)
1134 {
1135     struct stat_data * sd_v2 = (struct stat_data *)sd;
1136     __u16 flags;
1137
1138     set_sd_v2_mode(sd_v2, inode->i_mode );
1139     set_sd_v2_nlink(sd_v2, inode->i_nlink );
1140     set_sd_v2_uid(sd_v2, inode->i_uid );
1141     set_sd_v2_size(sd_v2, size );
1142     set_sd_v2_gid(sd_v2, inode->i_gid );
1143     set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec );
1144     set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec );
1145     set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec );
1146     set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
1147     if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1148         set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
1149     else
1150         set_sd_v2_generation(sd_v2, inode->i_generation);
1151     flags = REISERFS_I(inode)->i_attrs;
1152     i_attrs_to_sd_attrs( inode, &flags );
1153     set_sd_v2_attrs( sd_v2, flags );
1154 }
1155
1156
1157 // used to copy inode's fields to old stat data
1158 static void inode2sd_v1 (void * sd, struct inode * inode, loff_t size)
1159 {
1160     struct stat_data_v1 * sd_v1 = (struct stat_data_v1 *)sd;
1161
1162     set_sd_v1_mode(sd_v1, inode->i_mode );
1163     set_sd_v1_uid(sd_v1, inode->i_uid );
1164     set_sd_v1_gid(sd_v1, inode->i_gid );
1165     set_sd_v1_nlink(sd_v1, inode->i_nlink );
1166     set_sd_v1_size(sd_v1, size );
1167     set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec );
1168     set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec );
1169     set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec );
1170
1171     if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1172         set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
1173     else
1174         set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
1175
1176     // Sigh. i_first_direct_byte is back
1177     set_sd_v1_first_direct_byte(sd_v1, REISERFS_I(inode)->i_first_direct_byte);
1178 }
1179
1180
1181 /* NOTE, you must prepare the buffer head before sending it here,
1182 ** and then log it after the call
1183 */
1184 static void update_stat_data (struct path * path, struct inode * inode,
1185                               loff_t size)
1186 {
1187     struct buffer_head * bh;
1188     struct item_head * ih;
1189
1190     bh = PATH_PLAST_BUFFER (path);
1191     ih = PATH_PITEM_HEAD (path);
1192
1193     if (!is_statdata_le_ih (ih))
1194         reiserfs_panic (inode->i_sb, "vs-13065: update_stat_data: key %k, found item %h",
1195                         INODE_PKEY (inode), ih);
1196
1197     if (stat_data_v1 (ih)) {
1198         // path points to old stat data
1199         inode2sd_v1 (B_I_PITEM (bh, ih), inode, size);
1200     } else {
1201         inode2sd (B_I_PITEM (bh, ih), inode, size);
1202     }
1203
1204     return;
1205 }
1206
1207
1208 void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th,
1209                               struct inode * inode, loff_t size)
1210 {
1211     struct cpu_key key;
1212     INITIALIZE_PATH(path);
1213     struct buffer_head *bh ;
1214     int fs_gen ;
1215     struct item_head *ih, tmp_ih ;
1216     int retval;
1217
1218     make_cpu_key (&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);//key type is unimportant
1219
1220     for(;;) {
1221         int pos;
1222         /* look for the object's stat data */
1223         retval = search_item (inode->i_sb, &key, &path);
1224         if (retval == IO_ERROR) {
1225             reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: "
1226                               "i/o failure occurred trying to update %K stat data",
1227                               &key);
1228             return;
1229         }
1230         if (retval == ITEM_NOT_FOUND) {
1231             pos = PATH_LAST_POSITION (&path);
1232             pathrelse(&path) ;
1233             if (inode->i_nlink == 0) {
1234                 /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found");*/
1235                 return;
1236             }
1237             reiserfs_warning (inode->i_sb, "vs-13060: reiserfs_update_sd: "
1238                               "stat data of object %k (nlink == %d) not found (pos %d)",
1239                               INODE_PKEY (inode), inode->i_nlink, pos);
1240             reiserfs_check_path(&path) ;
1241             return;
1242         }
1243
1244         /* sigh, prepare_for_journal might schedule.  When it schedules the
1245         ** FS might change.  We have to detect that, and loop back to the
1246         ** search if the stat data item has moved
1247         */
1248         bh = get_last_bh(&path) ;
1249         ih = get_ih(&path) ;
1250         copy_item_head (&tmp_ih, ih);
1251         fs_gen = get_generation (inode->i_sb);
1252         reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
1253         if (fs_changed (fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) {
1254             reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
1255             continue ;  /* Stat_data item has been moved after scheduling. */
1256         }
1257         break;
1258     }
1259     update_stat_data (&path, inode, size);
1260     journal_mark_dirty(th, th->t_super, bh) ;
1261     pathrelse (&path);
1262     return;
1263 }
1264
1265 /* reiserfs_read_locked_inode is called to read the inode off disk, and it
1266 ** does a make_bad_inode when things go wrong.  But, we need to make sure
1267 ** and clear the key in the private portion of the inode, otherwise a
1268 ** corresponding iput might try to delete whatever object the inode last
1269 ** represented.
1270 */
1271 static void reiserfs_make_bad_inode(struct inode *inode) {
1272     memset(INODE_PKEY(inode), 0, KEY_SIZE);
1273     make_bad_inode(inode);
1274 }
1275
1276 //
1277 // initially this function was derived from minix or ext2's analog and
1278 // evolved as the prototype did
1279 //
1280
1281 int reiserfs_init_locked_inode (struct inode * inode, void *p)
1282 {
1283     struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p ;
1284     inode->i_ino = args->objectid;
1285     INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
1286     return 0;
1287 }
1288
1289 /* looks for stat data in the tree, and fills up the fields of in-core
1290    inode stat data fields */
1291 void reiserfs_read_locked_inode (struct inode * inode, struct reiserfs_iget_args *args)
1292 {
1293     INITIALIZE_PATH (path_to_sd);
1294     struct cpu_key key;
1295     unsigned long dirino;
1296     int retval;
1297
1298     dirino = args->dirid ;
1299
1300     /* set version 1, version 2 could be used too, because stat data
1301        key is the same in both versions */
1302     key.version = KEY_FORMAT_3_5;
1303     key.on_disk_key.k_dir_id = dirino;
1304     key.on_disk_key.k_objectid = inode->i_ino;
1305     key.on_disk_key.u.k_offset_v1.k_offset = SD_OFFSET;
1306     key.on_disk_key.u.k_offset_v1.k_uniqueness = SD_UNIQUENESS;
1307
1308     /* look for the object's stat data */
1309     retval = search_item (inode->i_sb, &key, &path_to_sd);
1310     if (retval == IO_ERROR) {
1311         reiserfs_warning (inode->i_sb, "vs-13070: reiserfs_read_locked_inode: "
1312                           "i/o failure occurred trying to find stat data of %K",
1313                           &key);
1314         reiserfs_make_bad_inode(inode) ;
1315         return;
1316     }
1317     if (retval != ITEM_FOUND) {
1318         /* a stale NFS handle can trigger this without it being an error */
1319         pathrelse (&path_to_sd);
1320         reiserfs_make_bad_inode(inode) ;
1321         inode->i_nlink = 0;
1322         return;
1323     }
1324
1325     init_inode (inode, &path_to_sd);
1326
1327     /* It is possible that knfsd is trying to access inode of a file
1328        that is being removed from the disk by some other thread. As we
1329        update sd on unlink all that is required is to check for nlink
1330        here. This bug was first found by Sizif when debugging
1331        SquidNG/Butterfly, forgotten, and found again after Philippe
1332        Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
1333
1334        More logical fix would require changes in fs/inode.c:iput() to
1335        remove inode from hash-table _after_ fs cleaned disk stuff up and
1336        in iget() to return NULL if I_FREEING inode is found in
1337        hash-table. */
1338     /* Currently there is one place where it's ok to meet inode with
1339        nlink==0: processing of open-unlinked and half-truncated files
1340        during mount (fs/reiserfs/super.c:finish_unfinished()). */
1341     if( ( inode -> i_nlink == 0 ) &&
1342         ! REISERFS_SB(inode -> i_sb) -> s_is_unlinked_ok ) {
1343             reiserfs_warning (inode->i_sb,
1344                               "vs-13075: reiserfs_read_locked_inode: "
1345                               "dead inode read from disk %K. "
1346                               "This is likely to be race with knfsd. Ignore",
1347                               &key );
1348             reiserfs_make_bad_inode( inode );
1349     }
1350
1351     reiserfs_check_path(&path_to_sd) ; /* init inode should be relsing */
1352
1353 }
1354
1355 /**
1356  * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
1357  *
1358  * @inode:    inode from hash table to check
1359  * @opaque:   "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
1360  *
1361  * This function is called by iget5_locked() to distinguish reiserfs inodes
1362  * having the same inode numbers. Such inodes can only exist due to some
1363  * error condition. One of them should be bad. Inodes with identical
1364  * inode numbers (objectids) are distinguished by parent directory ids.
1365  *
1366  */
1367 int reiserfs_find_actor( struct inode *inode, void *opaque )
1368 {
1369     struct reiserfs_iget_args *args;
1370
1371     args = opaque;
1372     /* args is already in CPU order */
1373     return (inode->i_ino == args->objectid) &&
1374         (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
1375 }
1376
1377 struct inode * reiserfs_iget (struct super_block * s, const struct cpu_key * key)
1378 {
1379     struct inode * inode;
1380     struct reiserfs_iget_args args ;
1381
1382     args.objectid = key->on_disk_key.k_objectid ;
1383     args.dirid = key->on_disk_key.k_dir_id ;
1384     inode = iget5_locked (s, key->on_disk_key.k_objectid,
1385                    reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args));
1386     if (!inode)
1387         return ERR_PTR(-ENOMEM) ;
1388
1389     if (inode->i_state & I_NEW) {
1390         reiserfs_read_locked_inode(inode, &args);
1391         unlock_new_inode(inode);
1392     }
1393
1394     if (comp_short_keys (INODE_PKEY (inode), key) || is_bad_inode (inode)) {
1395         /* either due to i/o error or a stale NFS handle */
1396         iput (inode);
1397         inode = 0;
1398     }
1399     return inode;
1400 }
1401
1402 struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp)
1403 {
1404     __u32 *data = vobjp;
1405     struct cpu_key key ;
1406     struct dentry *result;
1407     struct inode *inode;
1408
1409     key.on_disk_key.k_objectid = data[0] ;
1410     key.on_disk_key.k_dir_id = data[1] ;
1411     inode = reiserfs_iget(sb, &key) ;
1412     if (inode && !IS_ERR(inode) && data[2] != 0 &&
1413         data[2] != inode->i_generation) {
1414             iput(inode) ;
1415             inode = NULL ;
1416     }
1417     if (!inode)
1418             inode = ERR_PTR(-ESTALE);
1419     if (IS_ERR(inode))
1420             return ERR_PTR(PTR_ERR(inode));
1421     result = d_alloc_anon(inode);
1422     if (!result) {
1423             iput(inode);
1424             return ERR_PTR(-ENOMEM);
1425     }
1426     return result;
1427 }
1428
1429 struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 *data,
1430                                      int len, int fhtype,
1431                                   int (*acceptable)(void *contect, struct dentry *de),
1432                                   void *context) {
1433     __u32 obj[3], parent[3];
1434
1435     /* fhtype happens to reflect the number of u32s encoded.
1436      * due to a bug in earlier code, fhtype might indicate there
1437      * are more u32s then actually fitted.
1438      * so if fhtype seems to be more than len, reduce fhtype.
1439      * Valid types are:
1440      *   2 - objectid + dir_id - legacy support
1441      *   3 - objectid + dir_id + generation
1442      *   4 - objectid + dir_id + objectid and dirid of parent - legacy
1443      *   5 - objectid + dir_id + generation + objectid and dirid of parent
1444      *   6 - as above plus generation of directory
1445      * 6 does not fit in NFSv2 handles
1446      */
1447     if (fhtype > len) {
1448             if (fhtype != 6 || len != 5)
1449                     reiserfs_warning (sb, "nfsd/reiserfs, fhtype=%d, len=%d - odd",
1450                            fhtype, len);
1451             fhtype = 5;
1452     }
1453
1454     obj[0] = data[0];
1455     obj[1] = data[1];
1456     if (fhtype == 3 || fhtype >= 5)
1457             obj[2] = data[2];
1458     else    obj[2] = 0; /* generation number */
1459
1460     if (fhtype >= 4) {
1461             parent[0] = data[fhtype>=5?3:2] ;
1462             parent[1] = data[fhtype>=5?4:3] ;
1463             if (fhtype == 6)
1464                     parent[2] = data[5];
1465             else    parent[2] = 0;
1466     }
1467     return sb->s_export_op->find_exported_dentry(sb, obj, fhtype < 4 ? NULL : parent,
1468                                acceptable, context);
1469 }
1470
1471 int reiserfs_encode_fh(struct dentry *dentry, __u32 *data, int *lenp, int need_parent) {
1472     struct inode *inode = dentry->d_inode ;
1473     int maxlen = *lenp;
1474
1475     if (maxlen < 3)
1476         return 255 ;
1477
1478     data[0] = inode->i_ino ;
1479     data[1] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ;
1480     data[2] = inode->i_generation ;
1481     *lenp = 3 ;
1482     /* no room for directory info? return what we've stored so far */
1483     if (maxlen < 5 || ! need_parent)
1484         return 3 ;
1485
1486     spin_lock(&dentry->d_lock);
1487     inode = dentry->d_parent->d_inode ;
1488     data[3] = inode->i_ino ;
1489     data[4] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ;
1490     *lenp = 5 ;
1491     if (maxlen >= 6) {
1492             data[5] = inode->i_generation ;
1493             *lenp = 6 ;
1494     }
1495     spin_unlock(&dentry->d_lock);
1496     return *lenp ;
1497 }
1498
1499
1500 /* looks for stat data, then copies fields to it, marks the buffer
1501    containing stat data as dirty */
1502 /* reiserfs inodes are never really dirty, since the dirty inode call
1503 ** always logs them.  This call allows the VFS inode marking routines
1504 ** to properly mark inodes for datasync and such, but only actually
1505 ** does something when called for a synchronous update.
1506 */
1507 void reiserfs_write_inode (struct inode * inode, int do_sync) {
1508     struct reiserfs_transaction_handle th ;
1509     int jbegin_count = 1 ;
1510
1511     if (inode->i_sb->s_flags & MS_RDONLY) {
1512         reiserfs_warning (inode->i_sb,
1513                           "clm-6005: writing inode %lu on readonly FS",
1514                           inode->i_ino) ;
1515         return ;
1516     }
1517     /* memory pressure can sometimes initiate write_inode calls with sync == 1,
1518     ** these cases are just when the system needs ram, not when the
1519     ** inode needs to reach disk for safety, and they can safely be
1520     ** ignored because the altered inode has already been logged.
1521     */
1522     if (do_sync && !(current->flags & PF_MEMALLOC)) {
1523         reiserfs_write_lock(inode->i_sb);
1524         journal_begin(&th, inode->i_sb, jbegin_count) ;
1525         reiserfs_update_sd (&th, inode);
1526         journal_end_sync(&th, inode->i_sb, jbegin_count) ;
1527         reiserfs_write_unlock(inode->i_sb);
1528     }
1529 }
1530
/*
 * Update the stat data of @inode inside the already running transaction
 * @th.  Always returns 0.
 *
 * FIXME: no need any more. right?
 */
int reiserfs_sync_inode(struct reiserfs_transaction_handle *th, struct inode *inode)
{
    reiserfs_update_sd(th, inode);
    return 0;
}
1539
1540
1541 /* stat data of new object is inserted already, this inserts the item
1542    containing "." and ".." entries */
1543 static int reiserfs_new_directory (struct reiserfs_transaction_handle *th,
1544                                    struct inode *inode,
1545                                    struct item_head * ih, struct path * path,
1546                                    struct inode * dir)
1547 {
1548     struct super_block * sb = th->t_super;
1549     char empty_dir [EMPTY_DIR_SIZE];
1550     char * body = empty_dir;
1551     struct cpu_key key;
1552     int retval;
1553
1554     _make_cpu_key (&key, KEY_FORMAT_3_5, le32_to_cpu (ih->ih_key.k_dir_id),
1555                    le32_to_cpu (ih->ih_key.k_objectid), DOT_OFFSET, TYPE_DIRENTRY, 3/*key length*/);
1556
1557     /* compose item head for new item. Directories consist of items of
1558        old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
1559        is done by reiserfs_new_inode */
1560     if (old_format_only (sb)) {
1561         make_le_item_head (ih, 0, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
1562
1563         make_empty_dir_item_v1 (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid,
1564                                 INODE_PKEY (dir)->k_dir_id,
1565                                 INODE_PKEY (dir)->k_objectid );
1566     } else {
1567         make_le_item_head (ih, 0, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
1568
1569         make_empty_dir_item (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid,
1570                                 INODE_PKEY (dir)->k_dir_id,
1571                                 INODE_PKEY (dir)->k_objectid );
1572     }
1573
1574     /* look for place in the tree for new item */
1575     retval = search_item (sb, &key, path);
1576     if (retval == IO_ERROR) {
1577         reiserfs_warning (sb, "vs-13080: reiserfs_new_directory: "
1578                           "i/o failure occurred creating new directory");
1579         return -EIO;
1580     }
1581     if (retval == ITEM_FOUND) {
1582         pathrelse (path);
1583         reiserfs_warning (sb, "vs-13070: reiserfs_new_directory: "
1584                           "object with this key exists (%k)", &(ih->ih_key));
1585         return -EEXIST;
1586     }
1587
1588     /* insert item, that is empty directory item */
1589     return reiserfs_insert_item (th, path, &key, ih, inode, body);
1590 }
1591
1592
1593 /* stat data of object has been inserted, this inserts the item
1594    containing the body of symlink */
1595 static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th,
1596                                  struct inode *inode,   /* Inode of symlink */
1597                                  struct item_head * ih,
1598                                  struct path * path, const char * symname, int item_len)
1599 {
1600     struct super_block * sb = th->t_super;
1601     struct cpu_key key;
1602     int retval;
1603
1604     _make_cpu_key (&key, KEY_FORMAT_3_5,
1605                    le32_to_cpu (ih->ih_key.k_dir_id),
1606                    le32_to_cpu (ih->ih_key.k_objectid),
1607                    1, TYPE_DIRECT, 3/*key length*/);
1608
1609     make_le_item_head (ih, 0, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, 0/*free_space*/);
1610
1611     /* look for place in the tree for new item */
1612     retval = search_item (sb, &key, path);
1613     if (retval == IO_ERROR) {
1614         reiserfs_warning (sb, "vs-13080: reiserfs_new_symlinik: "
1615                           "i/o failure occurred creating new symlink");
1616         return -EIO;
1617     }
1618     if (retval == ITEM_FOUND) {
1619         pathrelse (path);
1620         reiserfs_warning (sb, "vs-13080: reiserfs_new_symlink: "
1621                           "object with this key exists (%k)", &(ih->ih_key));
1622         return -EEXIST;
1623     }
1624
1625     /* insert item, that is body of symlink */
1626     return reiserfs_insert_item (th, path, &key, ih, inode, symname);
1627 }
1628
1629
1630 /* inserts the stat data into the tree, and then calls
1631    reiserfs_new_directory (to insert ".", ".." item if new object is
1632    directory) or reiserfs_new_symlink (to insert symlink body if new
1633    object is symlink) or nothing (if new object is regular file)
1634
1635    NOTE! uid and gid must already be set in the inode.  If we return
1636    non-zero due to an error, we have to drop the quota previously allocated
1637    for the fresh inode.  This can only be done outside a transaction, so
1638    if we return non-zero, we also end the transaction.  */
1639 int reiserfs_new_inode (struct reiserfs_transaction_handle *th,
1640                         struct inode * dir, int mode,
1641                         const char * symname,
1642                         /* 0 for regular, EMTRY_DIR_SIZE for dirs,
1643                            strlen (symname) for symlinks)*/
1644                          loff_t i_size, struct dentry *dentry,
1645                          struct inode *inode)
1646 {
1647     struct super_block * sb;
1648     INITIALIZE_PATH (path_to_key);
1649     struct cpu_key key;
1650     struct item_head ih;
1651     struct stat_data sd;
1652     int retval;
1653     int err;
1654
1655     if (!dir || !dir->i_nlink) {
1656         err = -EPERM;
1657         goto out_bad_inode;
1658     }
1659
1660     sb = dir->i_sb;
1661
1662     /* item head of new item */
1663     ih.ih_key.k_dir_id = INODE_PKEY (dir)->k_objectid;
1664     ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid (th));
1665     if (!ih.ih_key.k_objectid) {
1666         err = -ENOMEM;
1667         goto out_bad_inode ;
1668     }
1669     if (old_format_only (sb))
1670         /* not a perfect generation count, as object ids can be reused, but
1671         ** this is as good as reiserfs can do right now.
1672         ** note that the private part of inode isn't filled in yet, we have
1673         ** to use the directory.
1674         */
1675         inode->i_generation = le32_to_cpu (INODE_PKEY (dir)->k_objectid);
1676     else
1677 #if defined( USE_INODE_GENERATION_COUNTER )
1678         inode->i_generation = le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
1679 #else
1680         inode->i_generation = ++event;
1681 #endif
1682
1683     /* fill stat data */
1684     inode->i_nlink = (S_ISDIR (mode) ? 2 : 1);
1685
1686     /* uid and gid must already be set by the caller for quota init */
1687
1688     /* symlink cannot be immutable or append only, right? */
1689     if( S_ISLNK( inode -> i_mode ) )
1690             inode -> i_flags &= ~ ( S_IMMUTABLE | S_APPEND );
1691
1692     inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1693     inode->i_size = i_size;
1694     inode->i_blocks = 0;
1695     inode->i_bytes = 0;
1696     REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
1697       U32_MAX/*NO_BYTES_IN_DIRECT_ITEM*/;
1698
1699     INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list ));
1700     REISERFS_I(inode)->i_flags = 0;
1701     REISERFS_I(inode)->i_prealloc_block = 0;
1702     REISERFS_I(inode)->i_prealloc_count = 0;
1703     REISERFS_I(inode)->i_trans_id = 0;
1704     REISERFS_I(inode)->i_jl = 0;
1705     REISERFS_I(inode)->i_attrs =
1706         REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1707     sd_attrs_to_i_attrs( REISERFS_I(inode) -> i_attrs, inode );
1708     REISERFS_I(inode)->i_acl_access = NULL;
1709     REISERFS_I(inode)->i_acl_default = NULL;
1710     init_rwsem (&REISERFS_I(inode)->xattr_sem);
1711
1712     if (old_format_only (sb))
1713         make_le_item_head (&ih, 0, KEY_FORMAT_3_5, SD_OFFSET, TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
1714     else
1715         make_le_item_head (&ih, 0, KEY_FORMAT_3_6, SD_OFFSET, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1716
1717     /* key to search for correct place for new stat data */
1718     _make_cpu_key (&key, KEY_FORMAT_3_6, le32_to_cpu (ih.ih_key.k_dir_id),
1719                    le32_to_cpu (ih.ih_key.k_objectid), SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/);
1720
1721     /* find proper place for inserting of stat data */
1722     retval = search_item (sb, &key, &path_to_key);
1723     if (retval == IO_ERROR) {
1724         err = -EIO;
1725         goto out_bad_inode;
1726     }
1727     if (retval == ITEM_FOUND) {
1728         pathrelse (&path_to_key);
1729         err = -EEXIST;
1730         goto out_bad_inode;
1731     }
1732
1733     if (old_format_only (sb)) {
1734         if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
1735             pathrelse (&path_to_key);
1736             /* i_uid or i_gid is too big to be stored in stat data v3.5 */
1737             err = -EINVAL;
1738             goto out_bad_inode;
1739         }
1740         inode2sd_v1 (&sd, inode, inode->i_size);
1741     } else {
1742         inode2sd (&sd, inode, inode->i_size);
1743     }
1744     // these do not go to on-disk stat data
1745     inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid);
1746     inode->i_blksize = reiserfs_default_io_size;
1747
1748     // store in in-core inode the key of stat data and version all
1749     // object items will have (directory items will have old offset
1750     // format, other new objects will consist of new items)
1751     memcpy (INODE_PKEY (inode), &(ih.ih_key), KEY_SIZE);
1752     if (old_format_only (sb) || S_ISDIR(mode) || S_ISLNK(mode))
1753         set_inode_item_key_version (inode, KEY_FORMAT_3_5);
1754     else
1755         set_inode_item_key_version (inode, KEY_FORMAT_3_6);
1756     if (old_format_only (sb))
1757         set_inode_sd_version (inode, STAT_DATA_V1);
1758     else
1759         set_inode_sd_version (inode, STAT_DATA_V2);
1760
1761     /* insert the stat data into the tree */
1762 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1763     if (REISERFS_I(dir)->new_packing_locality)
1764         th->displace_new_blocks = 1;
1765 #endif
1766     retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, inode, (char *)(&sd));
1767     if (retval) {
1768         err = retval;
1769         reiserfs_check_path(&path_to_key) ;
1770         goto out_bad_inode;
1771     }
1772
1773 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
1774     if (!th->displace_new_blocks)
1775         REISERFS_I(dir)->new_packing_locality = 0;
1776 #endif
1777     if (S_ISDIR(mode)) {
1778         /* insert item with "." and ".." */
1779         retval = reiserfs_new_directory (th, inode, &ih, &path_to_key, dir);
1780     }
1781
1782     if (S_ISLNK(mode)) {
1783         /* insert body of symlink */
1784         if (!old_format_only (sb))
1785             i_size = ROUND_UP(i_size);
1786         retval = reiserfs_new_symlink (th, inode, &ih, &path_to_key, symname, i_size);
1787     }
1788     if (retval) {
1789         err = retval;
1790         reiserfs_check_path(&path_to_key) ;
1791         journal_end(th, th->t_super, th->t_blocks_allocated);
1792         goto out_inserted_sd;
1793     }
1794
1795     /* XXX CHECK THIS */
1796     if (reiserfs_posixacl (inode->i_sb)) {
1797         retval = reiserfs_inherit_default_acl (dir, dentry, inode);
1798         if (retval) {
1799             err = retval;
1800             reiserfs_check_path(&path_to_key) ;
1801             journal_end(th, th->t_super, th->t_blocks_allocated);
1802             goto out_inserted_sd;
1803         }
1804     } else if (inode->i_sb->s_flags & MS_POSIXACL) {
1805         reiserfs_warning (inode->i_sb, "ACLs aren't enabled in the fs, "
1806                           "but vfs thinks they are!");
1807     }
1808
1809     insert_inode_hash (inode);
1810     reiserfs_update_sd(th, inode);
1811     reiserfs_check_path(&path_to_key) ;
1812
1813     return 0;
1814
1815 /* it looks like you can easily compress these two goto targets into
1816  * one.  Keeping it like this doesn't actually hurt anything, and they
1817  * are place holders for what the quota code actually needs.
1818  */
1819 out_bad_inode:
1820     /* Invalidate the object, nothing was inserted yet */
1821     INODE_PKEY(inode)->k_objectid = 0;
1822
1823     /* dquot_drop must be done outside a transaction */
1824     journal_end(th, th->t_super, th->t_blocks_allocated) ;
1825     DQUOT_FREE_INODE(inode);
1826     DQUOT_DROP(inode);
1827     inode->i_flags |= S_NOQUOTA;
1828     make_bad_inode(inode);
1829
1830 out_inserted_sd:
1831     inode->i_nlink = 0;
1832     th->t_trans_id = 0; /* so the caller can't use this handle later */
1833     iput(inode);
1834     return err;
1835 }
1836
1837 /*
1838 ** finds the tail page in the page cache,
1839 ** reads the last block in.
1840 **
1841 ** On success, page_result is set to a locked, pinned page, and bh_result
1842 ** is set to an up to date buffer for the last block in the file.  returns 0.
1843 **
1844 ** tail conversion is not done, so bh_result might not be valid for writing
1845 ** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
1846 ** trying to write the block.
1847 **
1848 ** on failure, nonzero is returned, page_result and bh_result are untouched.
1849 */
1850 static int grab_tail_page(struct inode *p_s_inode,
1851                           struct page **page_result,
1852                           struct buffer_head **bh_result) {
1853
1854     /* we want the page with the last byte in the file,
1855     ** not the page that will hold the next byte for appending
1856     */
1857     unsigned long index = (p_s_inode->i_size-1) >> PAGE_CACHE_SHIFT ;
1858     unsigned long pos = 0 ;
1859     unsigned long start = 0 ;
1860     unsigned long blocksize = p_s_inode->i_sb->s_blocksize ;
1861     unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1) ;
1862     struct buffer_head *bh ;
1863     struct buffer_head *head ;
1864     struct page * page ;
1865     int error ;
1866
1867     /* we know that we are only called with inode->i_size > 0.
1868     ** we also know that a file tail can never be as big as a block
1869     ** If i_size % blocksize == 0, our file is currently block aligned
1870     ** and it won't need converting or zeroing after a truncate.
1871     */
1872     if ((offset & (blocksize - 1)) == 0) {
1873         return -ENOENT ;
1874     }
1875     page = grab_cache_page(p_s_inode->i_mapping, index) ;
1876     error = -ENOMEM ;
1877     if (!page) {
1878         goto out ;
1879     }
1880     /* start within the page of the last block in the file */
1881     start = (offset / blocksize) * blocksize ;
1882
1883     error = block_prepare_write(page, start, offset,
1884                                 reiserfs_get_block_create_0) ;
1885     if (error)
1886         goto unlock ;
1887
1888     head = page_buffers(page) ;
1889     bh = head;
1890     do {
1891         if (pos >= start) {
1892             break ;
1893         }
1894         bh = bh->b_this_page ;
1895         pos += blocksize ;
1896     } while(bh != head) ;
1897
1898     if (!buffer_uptodate(bh)) {
1899         /* note, this should never happen, prepare_write should
1900         ** be taking care of this for us.  If the buffer isn't up to date,
1901         ** I've screwed up the code to find the buffer, or the code to
1902         ** call prepare_write
1903         */
1904         reiserfs_warning (p_s_inode->i_sb,
1905                           "clm-6000: error reading block %lu on dev %s",
1906                           bh->b_blocknr,
1907                           reiserfs_bdevname (p_s_inode->i_sb)) ;
1908         error = -EIO ;
1909         goto unlock ;
1910     }
1911     *bh_result = bh ;
1912     *page_result = page ;
1913
1914 out:
1915     return error ;
1916
1917 unlock:
1918     unlock_page(page) ;
1919     page_cache_release(page) ;
1920     return error ;
1921 }
1922
1923 /*
1924 ** vfs version of truncate file.  Must NOT be called with
1925 ** a transaction already started.
1926 **
1927 ** some code taken from block_truncate_page
1928 */
1929 void reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) {
1930     struct reiserfs_transaction_handle th ;
1931     /* we want the offset for the first byte after the end of the file */
1932     unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1) ;
1933     unsigned blocksize = p_s_inode->i_sb->s_blocksize ;
1934     unsigned length ;
1935     struct page *page = NULL ;
1936     int error ;
1937     struct buffer_head *bh = NULL ;
1938
1939     reiserfs_write_lock(p_s_inode->i_sb);
1940
1941     if (p_s_inode->i_size > 0) {
1942         if ((error = grab_tail_page(p_s_inode, &page, &bh))) {
1943             // -ENOENT means we truncated past the end of the file,
1944             // and get_block_create_0 could not find a block to read in,
1945             // which is ok.
1946             if (error != -ENOENT)
1947                 reiserfs_warning (p_s_inode->i_sb,
1948                                   "clm-6001: grab_tail_page failed %d",
1949                                   error);
1950             page = NULL ;
1951             bh = NULL ;
1952         }
1953     }
1954
1955     /* so, if page != NULL, we have a buffer head for the offset at
1956     ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
1957     ** then we have an unformatted node.  Otherwise, we have a direct item,
1958     ** and no zeroing is required on disk.  We zero after the truncate,
1959     ** because the truncate might pack the item anyway
1960     ** (it will unmap bh if it packs).
1961     */
1962     /* it is enough to reserve space in transaction for 2 balancings:
1963        one for "save" link adding and another for the first
1964        cut_from_item. 1 is for update_sd */
1965     journal_begin(&th, p_s_inode->i_sb,  JOURNAL_PER_BALANCE_CNT * 2 + 1 ) ;
1966     reiserfs_update_inode_transaction(p_s_inode) ;
1967     if (update_timestamps)
1968             /* we are doing real truncate: if the system crashes before the last
1969                transaction of truncating gets committed - on reboot the file
1970                either appears truncated properly or not truncated at all */
1971         add_save_link (&th, p_s_inode, 1);
1972     reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ;
1973     journal_end(&th, p_s_inode->i_sb,  JOURNAL_PER_BALANCE_CNT * 2 + 1 ) ;
1974
1975     if (update_timestamps)
1976         remove_save_link (p_s_inode, 1/* truncate */);
1977
1978     if (page) {
1979         length = offset & (blocksize - 1) ;
1980         /* if we are not on a block boundary */
1981         if (length) {
1982             char *kaddr;
1983
1984             length = blocksize - length ;
1985             kaddr = kmap_atomic(page, KM_USER0) ;
1986             memset(kaddr + offset, 0, length) ;
1987             flush_dcache_page(page) ;
1988             kunmap_atomic(kaddr, KM_USER0) ;
1989             if (buffer_mapped(bh) && bh->b_blocknr != 0) {
1990                 mark_buffer_dirty(bh) ;
1991             }
1992         }
1993         unlock_page(page) ;
1994         page_cache_release(page) ;
1995     }
1996
1997     reiserfs_write_unlock(p_s_inode->i_sb);
1998 }
1999
2000 static int map_block_for_writepage(struct inode *inode,
2001                                struct buffer_head *bh_result,
2002                                unsigned long block) {
2003     struct reiserfs_transaction_handle th ;
2004     int fs_gen ;
2005     struct item_head tmp_ih ;
2006     struct item_head *ih ;
2007     struct buffer_head *bh ;
2008     __u32 *item ;
2009     struct cpu_key key ;
2010     INITIALIZE_PATH(path) ;
2011     int pos_in_item ;
2012     int jbegin_count = JOURNAL_PER_BALANCE_CNT ;
2013     loff_t byte_offset = (block << inode->i_sb->s_blocksize_bits) + 1 ;
2014     int retval ;
2015     int use_get_block = 0 ;
2016     int bytes_copied = 0 ;
2017     int copy_size ;
2018     int trans_running = 0;
2019
2020     /* catch places below that try to log something without starting a trans */
2021     th.t_trans_id = 0;
2022
2023     if (!buffer_uptodate(bh_result)) {
2024         return -EIO;
2025     }
2026
2027     kmap(bh_result->b_page) ;
2028 start_over:
2029     reiserfs_write_lock(inode->i_sb);
2030     make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3) ;
2031
2032 research:
2033     retval = search_for_position_by_key(inode->i_sb, &key, &path) ;
2034     if (retval != POSITION_FOUND) {
2035         use_get_block = 1;
2036         goto out ;
2037     }
2038
2039     bh = get_last_bh(&path) ;
2040     ih = get_ih(&path) ;
2041     item = get_item(&path) ;
2042     pos_in_item = path.pos_in_item ;
2043
2044     /* we've found an unformatted node */
2045     if (indirect_item_found(retval, ih)) {
2046         if (bytes_copied > 0) {
2047             reiserfs_warning (inode->i_sb, "clm-6002: bytes_copied %d",
2048                               bytes_copied) ;
2049         }
2050         if (!get_block_num(item, pos_in_item)) {
2051             /* crap, we are writing to a hole */
2052             use_get_block = 1;
2053             goto out ;
2054         }
2055         set_block_dev_mapped(bh_result, get_block_num(item,pos_in_item),inode);
2056     } else if (is_direct_le_ih(ih)) {
2057         char *p ;
2058         p = page_address(bh_result->b_page) ;
2059         p += (byte_offset -1) & (PAGE_CACHE_SIZE - 1) ;
2060         copy_size = ih_item_len(ih) - pos_in_item;
2061
2062         fs_gen = get_generation(inode->i_sb) ;
2063         copy_item_head(&tmp_ih, ih) ;
2064
2065         if (!trans_running) {
2066             /* vs-3050 is gone, no need to drop the path */
2067             journal_begin(&th, inode->i_sb, jbegin_count) ;
2068             reiserfs_update_inode_transaction(inode) ;
2069             trans_running = 1;
2070             if (fs_changed(fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) {
2071                 reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
2072                 goto research;
2073             }
2074         }
2075
2076         reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
2077
2078         if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
2079             reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
2080             goto research;
2081         }
2082
2083         memcpy( B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, copy_size) ;
2084
2085         journal_mark_dirty(&th, inode->i_sb, bh) ;
2086         bytes_copied += copy_size ;
2087         set_block_dev_mapped(bh_result, 0, inode);
2088
2089         /* are there still bytes left? */
2090         if (bytes_copied < bh_result->b_size &&
2091             (byte_offset + bytes_copied) < inode->i_size) {
2092             set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + copy_size) ;
2093             goto research ;
2094         }
2095     } else {
2096         reiserfs_warning (inode->i_sb,
2097                           "clm-6003: bad item inode %lu, device %s",
2098                           inode->i_ino, reiserfs_bdevname (inode->i_sb)) ;
2099         retval = -EIO ;
2100         goto out ;
2101     }
2102     retval = 0 ;
2103
2104 out:
2105     pathrelse(&path) ;
2106     if (trans_running) {
2107         journal_end(&th, inode->i_sb, jbegin_count) ;
2108         trans_running = 0;
2109     }
2110     reiserfs_write_unlock(inode->i_sb);
2111
2112     /* this is where we fill in holes in the file. */
2113     if (use_get_block) {
2114         retval = reiserfs_get_block(inode, block, bh_result,
2115                                     GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM |
2116                                     GET_BLOCK_NO_DANGLE);
2117         if (!retval) {
2118             if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) {
2119                 /* get_block failed to find a mapped unformatted node. */
2120                 use_get_block = 0 ;
2121                 goto start_over ;
2122             }
2123         }
2124     }
2125     kunmap(bh_result->b_page) ;
2126
2127     if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
2128         /* we've copied data from the page into the direct item, so the
2129          * buffer in the page is now clean, mark it to reflect that.
2130          */
2131         lock_buffer(bh_result);
2132         clear_buffer_dirty(bh_result);
2133         unlock_buffer(bh_result);
2134     }
2135     return retval ;
2136 }
2137
2138 /*
2139  * mason@suse.com: updated in 2.5.54 to follow the same general io
2140  * start/recovery path as __block_write_full_page, along with special
2141  * code to handle reiserfs tails.
2142  */
2143 static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) {
2144     struct inode *inode = page->mapping->host ;
2145     unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ;
2146     int error = 0;
2147     unsigned long block ;
2148     struct buffer_head *head, *bh;
2149     int partial = 0 ;
2150     int nr = 0;
2151
2152     /* The page dirty bit is cleared before writepage is called, which
2153      * means we have to tell create_empty_buffers to make dirty buffers
2154      * The page really should be up to date at this point, so tossing
2155      * in the BH_Uptodate is just a sanity check.
2156      */
2157     if (!page_has_buffers(page)) {
2158         create_empty_buffers(page, inode->i_sb->s_blocksize,
2159                             (1 << BH_Dirty) | (1 << BH_Uptodate));
2160     }
2161     head = page_buffers(page) ;
2162
2163     /* last page in the file, zero out any contents past the
2164     ** last byte in the file
2165     */
2166     if (page->index >= end_index) {
2167         char *kaddr;
2168         unsigned last_offset;
2169
2170         last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1) ;
2171         /* no file contents in this page */
2172         if (page->index >= end_index + 1 || !last_offset) {
2173             unlock_page(page);
2174             return 0;
2175         }
2176         kaddr = kmap_atomic(page, KM_USER0);
2177         memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE-last_offset) ;
2178         flush_dcache_page(page) ;
2179         kunmap_atomic(kaddr, KM_USER0) ;
2180     }
2181     bh = head ;
2182     block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits) ;
2183     /* first map all the buffers, logging any direct items we find */
2184     do {
2185         if (buffer_dirty(bh) && (!buffer_mapped(bh) ||
2186            (buffer_mapped(bh) && bh->b_blocknr == 0))) {
2187             /* not mapped yet, or it points to a direct item, search
2188              * the btree for the mapping info, and log any direct
2189              * items found
2190              */
2191             if ((error = map_block_for_writepage(inode, bh, block))) {
2192                 goto fail ;
2193             }
2194         }
2195         bh = bh->b_this_page;
2196         block++;
2197     } while(bh != head) ;
2198
2199     /* now go through and lock any dirty buffers on the page */
2200     do {
2201         get_bh(bh);
2202         if (!buffer_mapped(bh))
2203             continue;
2204         if (buffer_mapped(bh) && bh->b_blocknr == 0)
2205             continue;
2206
2207         /* from this point on, we know the buffer is mapped to a
2208          * real block and not a direct item
2209          */
2210         if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
2211             lock_buffer(bh);
2212         } else {
2213             if (test_set_buffer_locked(bh)) {
2214                 redirty_page_for_writepage(wbc, page);
2215                 continue;
2216             }
2217         }
2218         if (test_clear_buffer_dirty(bh)) {
2219             mark_buffer_async_write(bh);
2220         } else {
2221             unlock_buffer(bh);
2222         }
2223     } while((bh = bh->b_this_page) != head);
2224
2225     BUG_ON(PageWriteback(page));
2226     set_page_writeback(page);
2227     unlock_page(page);
2228
2229     /*
2230      * since any buffer might be the only dirty buffer on the page,
2231      * the first submit_bh can bring the page out of writeback.
2232      * be careful with the buffers.
2233      */
2234     do {
2235         struct buffer_head *next = bh->b_this_page;
2236         if (buffer_async_write(bh)) {
2237             submit_bh(WRITE, bh);
2238             nr++;
2239         }
2240         put_bh(bh);
2241         bh = next;
2242     } while(bh != head);
2243
2244     error = 0;
2245 done:
2246     if (nr == 0) {
2247         /*
2248          * if this page only had a direct item, it is very possible for
2249          * no io to be required without there being an error.  Or,
2250          * someone else could have locked them and sent them down the
2251          * pipe without locking the page
2252          */
2253         bh = head ;
2254         do {
2255             if (!buffer_uptodate(bh)) {
2256                 partial = 1;
2257                 break;
2258             }
2259             bh = bh->b_this_page;
2260         } while(bh != head);
2261         if (!partial)
2262             SetPageUptodate(page);
2263         end_page_writeback(page);
2264     }
2265     return error;
2266
2267 fail:
2268     /* catches various errors, we need to make sure any valid dirty blocks
2269      * get to the media.  The page is currently locked and not marked for
2270      * writeback
2271      */
2272     ClearPageUptodate(page);
2273     bh = head;
2274     do {
2275         get_bh(bh);
2276         if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
2277             lock_buffer(bh);
2278             mark_buffer_async_write(bh);
2279         } else {
2280             /*
2281              * clear any dirty bits that might have come from getting
2282              * attached to a dirty page
2283              */
2284              clear_buffer_dirty(bh);
2285         }
2286         bh = bh->b_this_page;
2287     } while(bh != head);
2288     SetPageError(page);
2289     BUG_ON(PageWriteback(page));
2290     set_page_writeback(page);
2291     unlock_page(page);
2292     do {
2293         struct buffer_head *next = bh->b_this_page;
2294         if (buffer_async_write(bh)) {
2295             clear_buffer_dirty(bh);
2296             submit_bh(WRITE, bh);
2297             nr++;
2298         }
2299         put_bh(bh);
2300         bh = next;
2301     } while(bh != head);
2302     goto done;
2303 }
2304
2305
2306 static int reiserfs_readpage (struct file *f, struct page * page)
2307 {
2308     return block_read_full_page (page, reiserfs_get_block);
2309 }
2310
2311
2312 static int reiserfs_writepage (struct page * page, struct writeback_control *wbc)
2313 {
2314     struct inode *inode = page->mapping->host ;
2315     reiserfs_wait_on_write_block(inode->i_sb) ;
2316     return reiserfs_write_full_page(page, wbc) ;
2317 }
2318
2319 int reiserfs_prepare_write(struct file *f, struct page *page,
2320                            unsigned from, unsigned to) {
2321     struct inode *inode = page->mapping->host ;
2322     int ret;
2323     int old_ref = 0;
2324
2325     reiserfs_wait_on_write_block(inode->i_sb) ;
2326     fix_tail_page_for_writing(page) ;
2327     if (reiserfs_transaction_running(inode->i_sb)) {
2328         struct reiserfs_transaction_handle *th;
2329         th = (struct reiserfs_transaction_handle *)current->journal_info;
2330         old_ref = th->t_refcount;
2331         th->t_refcount++;
2332     }
2333
2334     ret = block_prepare_write(page, from, to, reiserfs_get_block) ;
2335     if (ret && reiserfs_transaction_running(inode->i_sb)) {
2336         struct reiserfs_transaction_handle *th = current->journal_info;
2337         /* this gets a little ugly.  If reiserfs_get_block returned an
2338          * error and left a transacstion running, we've got to close it,
2339          * and we've got to free handle if it was a persistent transaction.
2340          *
2341          * But, if we had nested into an existing transaction, we need
2342          * to just drop the ref count on the handle.
2343          *
2344          * If old_ref == 0, the transaction is from reiserfs_get_block,
2345          * and it was a persistent trans.  Otherwise, it was nested above.
2346          */
2347         if (th->t_refcount > old_ref) {
2348             if (old_ref)
2349                 th->t_refcount--;
2350             else {
2351                 reiserfs_write_lock(inode->i_sb);
2352                 reiserfs_end_persistent_transaction(th);
2353                 reiserfs_write_unlock(inode->i_sb);
2354             }
2355         }
2356     }
2357     return ret;
2358
2359 }
2360
2361
2362 static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) {
2363   return generic_block_bmap(as, block, reiserfs_bmap) ;
2364 }
2365
2366 static int reiserfs_commit_write(struct file *f, struct page *page,
2367                                  unsigned from, unsigned to) {
2368     struct inode *inode = page->mapping->host ;
2369     loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2370     int ret = 0;
2371     int update_sd = 0;
2372     struct reiserfs_transaction_handle *th = NULL;
2373
2374     reiserfs_wait_on_write_block(inode->i_sb) ;
2375     if (reiserfs_transaction_running(inode->i_sb)) {
2376         th = current->journal_info;
2377     }
2378     reiserfs_commit_page(inode, page, from, to);
2379
2380     /* generic_commit_write does this for us, but does not update the
2381     ** transaction tracking stuff when the size changes.  So, we have
2382     ** to do the i_size updates here.
2383     */
2384     if (pos > inode->i_size) {
2385         struct reiserfs_transaction_handle myth ;
2386         reiserfs_write_lock(inode->i_sb);
2387         /* If the file have grown beyond the border where it
2388            can have a tail, unmark it as needing a tail
2389            packing */
2390         if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) ||
2391              (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
2392             REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
2393
2394         journal_begin(&myth, inode->i_sb, 1) ;
2395         reiserfs_update_inode_transaction(inode) ;
2396         inode->i_size = pos ;
2397         reiserfs_update_sd(&myth, inode) ;
2398         update_sd = 1;
2399         journal_end(&myth, inode->i_sb, 1) ;
2400         reiserfs_write_unlock(inode->i_sb);
2401     }
2402     if (th) {
2403         reiserfs_write_lock(inode->i_sb);
2404         if (!update_sd)
2405             reiserfs_update_sd(th, inode) ;
2406         reiserfs_end_persistent_transaction(th);
2407         reiserfs_write_unlock(inode->i_sb);
2408     }
2409
2410     /* we test for O_SYNC here so we can commit the transaction
2411     ** for any packed tails the file might have had
2412     */
2413     if (f && (f->f_flags & O_SYNC)) {
2414         reiserfs_write_lock(inode->i_sb);
2415         reiserfs_commit_for_inode(inode) ;
2416         reiserfs_write_unlock(inode->i_sb);
2417     }
2418     return ret ;
2419 }
2420
2421 void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode )
2422 {
2423         if( reiserfs_attrs( inode -> i_sb ) ) {
2424                 if( sd_attrs & REISERFS_SYNC_FL )
2425                         inode -> i_flags |= S_SYNC;
2426                 else
2427                         inode -> i_flags &= ~S_SYNC;
2428                 if( sd_attrs & REISERFS_IMMUTABLE_FL )
2429                         inode -> i_flags |= S_IMMUTABLE;
2430                 else
2431                         inode -> i_flags &= ~S_IMMUTABLE;
2432                 if( sd_attrs & REISERFS_APPEND_FL )
2433                         inode -> i_flags |= S_APPEND;
2434                 else
2435                         inode -> i_flags &= ~S_APPEND;
2436                 if( sd_attrs & REISERFS_NOATIME_FL )
2437                         inode -> i_flags |= S_NOATIME;
2438                 else
2439                         inode -> i_flags &= ~S_NOATIME;
2440                 if( sd_attrs & REISERFS_NOTAIL_FL )
2441                         REISERFS_I(inode)->i_flags |= i_nopack_mask;
2442                 else
2443                         REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
2444         }
2445 }
2446
2447 void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs )
2448 {
2449         if( reiserfs_attrs( inode -> i_sb ) ) {
2450                 if( inode -> i_flags & S_IMMUTABLE )
2451                         *sd_attrs |= REISERFS_IMMUTABLE_FL;
2452                 else
2453                         *sd_attrs &= ~REISERFS_IMMUTABLE_FL;
2454                 if( inode -> i_flags & S_SYNC )
2455                         *sd_attrs |= REISERFS_SYNC_FL;
2456                 else
2457                         *sd_attrs &= ~REISERFS_SYNC_FL;
2458                 if( inode -> i_flags & S_NOATIME )
2459                         *sd_attrs |= REISERFS_NOATIME_FL;
2460                 else
2461                         *sd_attrs &= ~REISERFS_NOATIME_FL;
2462                 if( REISERFS_I(inode)->i_flags & i_nopack_mask )
2463                         *sd_attrs |= REISERFS_NOTAIL_FL;
2464                 else
2465                         *sd_attrs &= ~REISERFS_NOTAIL_FL;
2466         }
2467 }
2468
2469 /* decide if this buffer needs to stay around for data logging or ordered
2470 ** write purposes
2471 */
2472 static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
2473 {
2474     int ret = 1 ;
2475     struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
2476
2477     spin_lock(&j->j_dirty_buffers_lock) ;
2478     if (!buffer_mapped(bh)) {
2479         goto free_jh;
2480     }
2481     /* the page is locked, and the only places that log a data buffer
2482      * also lock the page.
2483      */
2484 #if 0
2485     if (reiserfs_file_data_log(inode)) {
2486         /* very conservative, leave the buffer pinned if anyone might need it.
2487         ** this should be changed to drop the buffer if it is only in the
2488         ** current transaction
2489         */
2490         if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
2491             ret = 0 ;
2492         }
2493     } else
2494 #endif
2495     if (buffer_dirty(bh) || buffer_locked(bh)) {
2496         struct reiserfs_journal_list *jl;
2497         struct reiserfs_jh *jh = bh->b_private;
2498
2499         /* why is this safe?
2500          * reiserfs_setattr updates i_size in the on disk
2501          * stat data before allowing vmtruncate to be called.
2502          *
2503          * If buffer was put onto the ordered list for this
2504          * transaction, we know for sure either this transaction
2505          * or an older one already has updated i_size on disk,
2506          * and this ordered data won't be referenced in the file
2507          * if we crash.
2508          *
2509          * if the buffer was put onto the ordered list for an older
2510          * transaction, we need to leave it around
2511          */
2512         if (jh && (jl = jh->jl) && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
2513             ret = 0;
2514     }
2515 free_jh:
2516     if (ret && bh->b_private) {
2517         reiserfs_free_jh(bh);
2518     }
2519     spin_unlock(&j->j_dirty_buffers_lock) ;
2520     return ret ;
2521 }
2522
2523 /* clm -- taken from fs/buffer.c:block_invalidate_page */
2524 static int reiserfs_invalidatepage(struct page *page, unsigned long offset)
2525 {
2526     struct buffer_head *head, *bh, *next;
2527     struct inode *inode = page->mapping->host;
2528     unsigned int curr_off = 0;
2529     int ret = 1;
2530
2531     BUG_ON(!PageLocked(page));
2532     if (!page_has_buffers(page))
2533         goto out;
2534
2535     head = page_buffers(page);
2536     bh = head;
2537     do {
2538         unsigned int next_off = curr_off + bh->b_size;
2539         next = bh->b_this_page;
2540
2541         /*
2542          * is this block fully invalidated?
2543          */
2544         if (offset <= curr_off) {
2545             if (invalidatepage_can_drop(inode, bh))
2546                 reiserfs_unmap_buffer(bh);
2547             else
2548                 ret = 0;
2549         }
2550         curr_off = next_off;
2551         bh = next;
2552     } while (bh != head);
2553
2554     /*
2555      * We release buffers only if the entire page is being invalidated.
2556      * The get_block cached value has been unconditionally invalidated,
2557      * so real IO is not possible anymore.
2558      */
2559     if (!offset && ret)
2560         ret = try_to_release_page(page, 0);
2561 out:
2562     return ret;
2563 }
2564
2565 /*
2566  * Returns 1 if the page's buffers were dropped.  The page is locked.
2567  *
2568  * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
2569  * in the buffers at page_buffers(page).
2570  *
2571  * even in -o notail mode, we can't be sure an old mount without -o notail
2572  * didn't create files with tails.
2573  */
2574 static int reiserfs_releasepage(struct page *page, int unused_gfp_flags)
2575 {
2576     struct inode *inode = page->mapping->host ;
2577     struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
2578     struct buffer_head *head ;
2579     struct buffer_head *bh ;
2580     int ret = 1 ;
2581
2582     spin_lock(&j->j_dirty_buffers_lock) ;
2583     head = page_buffers(page) ;
2584     bh = head ;
2585     do {
2586         if (bh->b_private) {
2587             if (!buffer_dirty(bh) && !buffer_locked(bh)) {
2588                 reiserfs_free_jh(bh);
2589             } else {
2590                 ret = 0 ;
2591                 break ;
2592             }
2593         }
2594         bh = bh->b_this_page ;
2595     } while (bh != head) ;
2596     if (ret)
2597         ret = try_to_free_buffers(page) ;
2598     spin_unlock(&j->j_dirty_buffers_lock) ;
2599     return ret ;
2600 }
2601
2602 /* We thank Mingming Cao for helping us understand in great detail what
2603    to do in this section of the code. */
2604 static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
2605                 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
2606 {
2607     struct file *file = iocb->ki_filp;
2608     struct inode *inode = file->f_mapping->host;
2609
2610     return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
2611                         offset, nr_segs, reiserfs_get_blocks_direct_io, NULL);
2612 }
2613
2614 int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) {
2615     struct inode *inode = dentry->d_inode ;
2616     int error ;
2617     unsigned int ia_valid = attr->ia_valid;
2618     reiserfs_write_lock(inode->i_sb);
2619     if (attr->ia_valid & ATTR_SIZE) {
2620         /* version 2 items will be caught by the s_maxbytes check
2621         ** done for us in vmtruncate
2622         */
2623         if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
2624             attr->ia_size > MAX_NON_LFS) {
2625             error = -EFBIG ;
2626             goto out;
2627         }
2628         /* fill in hole pointers in the expanding truncate case. */
2629         if (attr->ia_size > inode->i_size) {
2630             error = generic_cont_expand(inode, attr->ia_size) ;
2631             if (REISERFS_I(inode)->i_prealloc_count > 0) {
2632                 struct reiserfs_transaction_handle th ;
2633                 /* we're changing at most 2 bitmaps, inode + super */
2634                 journal_begin(&th, inode->i_sb, 4) ;
2635                 reiserfs_discard_prealloc (&th, inode);
2636                 journal_end(&th, inode->i_sb, 4) ;
2637             }
2638             if (error)
2639                 goto out;
2640         }
2641     }
2642
2643     if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) ||
2644          ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) &&
2645         (get_inode_sd_version (inode) == STAT_DATA_V1)) {
2646                 /* stat data of format v3.5 has 16 bit uid and gid */
2647             error = -EINVAL;
2648             goto out;
2649         }
2650
2651     error = inode_change_ok(inode, attr) ;
2652     if (!error) {
2653         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2654             (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2655                 error = reiserfs_chown_xattrs (inode, attr);
2656
2657                 if (!error)
2658                     error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2659         }
2660         if (!error)
2661             inode_setattr(inode, attr) ;
2662     }
2663
2664
2665     if (!error && reiserfs_posixacl (inode->i_sb)) {
2666         if (attr->ia_valid & ATTR_MODE)
2667             error = reiserfs_acl_chmod (inode);
2668     }
2669
2670 out:
2671     reiserfs_write_unlock(inode->i_sb);
2672     return error ;
2673 }
2674
2675
2676
2677 struct address_space_operations reiserfs_address_space_operations = {
2678     .writepage = reiserfs_writepage,
2679     .readpage = reiserfs_readpage,
2680     .readpages = reiserfs_readpages,
2681     .releasepage = reiserfs_releasepage,
2682     .invalidatepage = reiserfs_invalidatepage,
2683     .sync_page = block_sync_page,
2684     .prepare_write = reiserfs_prepare_write,
2685     .commit_write = reiserfs_commit_write,
2686     .bmap = reiserfs_aop_bmap,
2687     .direct_IO = reiserfs_direct_IO
2688 } ;