fs/reiserfs/file.c

   1 /*
   2  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
   3  */
   4
   5
   6 #include <linux/time.h>
   7 #include <linux/reiserfs_fs.h>
   8 #include <linux/reiserfs_acl.h>
   9 #include <linux/reiserfs_xattr.h>
  10 #include <linux/smp_lock.h>
  11 #include <asm/uaccess.h>
  12 #include <linux/pagemap.h>
  13 #include <linux/swap.h>
  14 #include <linux/writeback.h>
  15 #include <linux/blkdev.h>
  16 #include <linux/buffer_head.h>
  17 #include <linux/quotaops.h>
  18
  19 /*
  20 ** We pack the tails of files on file close, not at the time they are written.
  21 ** This implies an unnecessary copy of the tail and an unnecessary indirect item
  22 ** insertion/balancing, for files that are written in one write.
  23 ** It avoids unnecessary tail packings (balances) for files that are written in
  24 ** multiple writes and are small enough to have tails.
  25 **
  26 ** file_release is called by the VFS layer when the file is closed.  If
  27 ** this is the last open file descriptor, and the file
  28 ** small enough to have a tail, and the tail is currently in an
  29 ** unformatted node, the tail is converted back into a direct item.
  30 **
  31 ** We use reiserfs_truncate_file to pack the tail, since it already has
  32 ** all the conditions coded.
  33 */
  34 static int reiserfs_file_release (struct inode * inode, struct file * filp)
  35 {
  36
  37     struct reiserfs_transaction_handle th ;
  38
  39     if (!S_ISREG (inode->i_mode))
  40         BUG ();
  41
  42     /* fast out for when nothing needs to be done */
  43     if ((atomic_read(&inode->i_count) > 1 ||
  44         !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
  45          !tail_has_to_be_packed(inode))       &&
  46         REISERFS_I(inode)->i_prealloc_count <= 0) {
  47         return 0;
  48     }
  49
  50     reiserfs_write_lock(inode->i_sb);
  51     down (&inode->i_sem);
  52     journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3) ;
  53     reiserfs_update_inode_transaction(inode) ;
  54
  55 #ifdef REISERFS_PREALLOCATE
  56     reiserfs_discard_prealloc (&th, inode);
  57 #endif
  58     journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3) ;
  59
  60     if (atomic_read(&inode->i_count) <= 1 &&
  61         (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
  62         tail_has_to_be_packed (inode)) {
  63         /* if regular file is released by last holder and it has been
  64            appended (we append by unformatted node only) or its direct
  65            item(s) had to be converted, then it may have to be
  66            indirect2direct converted */
  67         reiserfs_truncate_file(inode, 0) ;
  68     }
  69     up (&inode->i_sem);
  70     reiserfs_write_unlock(inode->i_sb);
  71     return 0;
  72 }
  73
  74 static void reiserfs_vfs_truncate_file(struct inode *inode) {
  75     reiserfs_truncate_file(inode, 1) ;
  76 }
  77
  78 /* Sync a reiserfs file. */
  79
  80 /*
  81  * FIXME: sync_mapping_buffers() never has anything to sync.  Can
  82  * be removed...
  83  */
  84
  85 static int reiserfs_sync_file(
  86                               struct file   * p_s_filp,
  87                               struct dentry * p_s_dentry,
  88                               int datasync
  89                               ) {
  90   struct inode * p_s_inode = p_s_dentry->d_inode;
  91   int n_err;
  92
  93   reiserfs_write_lock(p_s_inode->i_sb);
  94
  95   if (!S_ISREG(p_s_inode->i_mode))
  96       BUG ();
  97
  98   n_err = sync_mapping_buffers(p_s_inode->i_mapping) ;
  99   reiserfs_commit_for_inode(p_s_inode) ;
 100   reiserfs_write_unlock(p_s_inode->i_sb);
 101   return ( n_err < 0 ) ? -EIO : 0;
 102 }
 103
 104 /* I really do not want to play with memory shortage right now, so
 105    to simplify the code, we are not going to write more than this much pages at
 106    a time. This still should considerably improve performance compared to 4k
 107    at a time case. This is 32 pages of 4k size. */
 108 #define REISERFS_WRITE_PAGES_AT_A_TIME (128 * 1024) / PAGE_CACHE_SIZE
 109
 110 /* Allocates blocks for a file to fulfil write request.
 111    Maps all unmapped but prepared pages from the list.
 112    Updates metadata with newly allocated blocknumbers as needed */
 113 int reiserfs_allocate_blocks_for_region(
 114                                 struct reiserfs_transaction_handle *th,
 115                                 struct inode *inode, /* Inode we work with */
 116                                 loff_t pos, /* Writing position */
 117                                 int num_pages, /* number of pages write going
 118                                                   to touch */
 119                                 int write_bytes, /* amount of bytes to write */
 120                                 struct page **prepared_pages, /* array of
 121                                                                  prepared pages
 122                                                                */
 123                                 int blocks_to_allocate /* Amount of blocks we
 124                                                           need to allocate to
 125                                                           fit the data into file
 126                                                          */
 127                                 )
 128 {
 129     struct cpu_key key; // cpu key of item that we are going to deal with
 130     struct item_head *ih; // pointer to item head that we are going to deal with
 131     struct buffer_head *bh; // Buffer head that contains items that we are going to deal with
 132     __u32 * item; // pointer to item we are going to deal with
 133     INITIALIZE_PATH(path); // path to item, that we are going to deal with.
 134     b_blocknr_t *allocated_blocks; // Pointer to a place where allocated blocknumbers would be stored.
 135     reiserfs_blocknr_hint_t hint; // hint structure for block allocator.
 136     size_t res; // return value of various functions that we call.
 137     int curr_block; // current block used to keep track of unmapped blocks.
 138     int i; // loop counter
 139     int itempos; // position in item
 140     unsigned int from = (pos & (PAGE_CACHE_SIZE - 1)); // writing position in
 141                                                        // first page
 142     unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1; /* last modified byte offset in last page */
 143     __u64 hole_size ; // amount of blocks for a file hole, if it needed to be created.
 144     int modifying_this_item = 0; // Flag for items traversal code to keep track
 145                                  // of the fact that we already prepared
 146                                  // current block for journal
 147     int will_prealloc = 0;
 148
 149     RFALSE(!blocks_to_allocate, "green-9004: tried to allocate zero blocks?");
 150
 151     /* only preallocate if this is a small write */
 152     if (REISERFS_I(inode)->i_prealloc_count ||
 153        (!(write_bytes & (inode->i_sb->s_blocksize -1)) &&
 154         blocks_to_allocate <
 155         REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
 156         will_prealloc = REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;
 157
 158     allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
 159                                         sizeof(b_blocknr_t), GFP_NOFS);
 160
 161     /* First we compose a key to point at the writing position, we want to do
 162        that outside of any locking region. */
 163     make_cpu_key (&key, inode, pos+1, TYPE_ANY, 3/*key length*/);
 164
 165     /* If we came here, it means we absolutely need to open a transaction,
 166        since we need to allocate some blocks */
 167     reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that.
 168     journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); // Wish I know if this number enough
 169     reiserfs_update_inode_transaction(inode) ;
 170
 171     /* Look for the in-tree position of our write, need path for block allocator */
 172     res = search_for_position_by_key(inode->i_sb, &key, &path);
 173     if ( res == IO_ERROR ) {
 174         res = -EIO;
 175         goto error_exit;
 176     }
 177
 178     /* Allocate blocks */
 179     /* First fill in "hint" structure for block allocator */
 180     hint.th = th; // transaction handle.
 181     hint.path = &path; // Path, so that block allocator can determine packing locality or whatever it needs to determine.
 182     hint.inode = inode; // Inode is needed by block allocator too.
 183     hint.search_start = 0; // We have no hint on where to search free blocks for block allocator.
 184     hint.key = key.on_disk_key; // on disk key of file.
 185     hint.block = inode->i_blocks>>(inode->i_sb->s_blocksize_bits-9); // Number of disk blocks this file occupies already.
 186     hint.formatted_node = 0; // We are allocating blocks for unformatted node.
 187     hint.preallocate = will_prealloc;
 188
 189     /* Call block allocator to allocate blocks */
 190     res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate);
 191     if ( res != CARRY_ON ) {
 192         if ( res == NO_DISK_SPACE ) {
 193             /* We flush the transaction in case of no space. This way some
 194                blocks might become free */
 195             SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
 196             restart_transaction(th, inode, &path);
 197
 198             /* We might have scheduled, so search again */
 199             res = search_for_position_by_key(inode->i_sb, &key, &path);
 200             if ( res == IO_ERROR ) {
 201                 res = -EIO;
 202                 goto error_exit;
 203             }
 204
 205             /* update changed info for hint structure. */
 206             res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate);
 207             if ( res != CARRY_ON ) {
 208                 res = -ENOSPC;
 209                 pathrelse(&path);
 210                 goto error_exit;
 211             }
 212         } else {
 213             res = -ENOSPC;
 214             pathrelse(&path);
 215             goto error_exit;
 216         }
 217     }
 218
 219 #ifdef __BIG_ENDIAN
 220         // Too bad, I have not found any way to convert a given region from
 221         // cpu format to little endian format
 222     {
 223         int i;
 224         for ( i = 0; i < blocks_to_allocate ; i++)
 225             allocated_blocks[i]=cpu_to_le32(allocated_blocks[i]);
 226     }
 227 #endif
 228
 229     /* Blocks allocating well might have scheduled and tree might have changed,
 230        let's search the tree again */
 231     /* find where in the tree our write should go */
 232     res = search_for_position_by_key(inode->i_sb, &key, &path);
 233     if ( res == IO_ERROR ) {
 234         res = -EIO;
 235         goto error_exit_free_blocks;
 236     }
 237
 238     bh = get_last_bh( &path ); // Get a bufferhead for last element in path.
 239     ih = get_ih( &path );      // Get a pointer to last item head in path.
 240     item = get_item( &path );  // Get a pointer to last item in path
 241
 242     /* Let's see what we have found */
 243     if ( res != POSITION_FOUND ) { /* position not found, this means that we
 244                                       might need to append file with holes
 245                                       first */
 246         // Since we are writing past the file's end, we need to find out if
 247         // there is a hole that needs to be inserted before our writing
 248         // position, and how many blocks it is going to cover (we need to
 249         //  populate pointers to file blocks representing the hole with zeros)
 250
 251         {
 252             int item_offset = 1;
 253             /*
 254              * if ih is stat data, its offset is 0 and we don't want to
 255              * add 1 to pos in the hole_size calculation
 256              */
 257             if (is_statdata_le_ih(ih))
 258                 item_offset = 0;
 259             hole_size = (pos + item_offset -
 260                     (le_key_k_offset( get_inode_item_key_version(inode),
 261                     &(ih->ih_key)) +
 262                     op_bytes_number(ih, inode->i_sb->s_blocksize))) >>
 263                     inode->i_sb->s_blocksize_bits;
 264         }
 265
 266         if ( hole_size > 0 ) {
 267             int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE ); // How much data to insert first time.
 268             /* area filled with zeroes, to supply as list of zero blocknumbers
 269                We allocate it outside of loop just in case loop would spin for
 270                several iterations. */
 271             char *zeros = kmalloc(to_paste*UNFM_P_SIZE, GFP_ATOMIC); // We cannot insert more than MAX_ITEM_LEN bytes anyway.
 272             if ( !zeros ) {
 273                 res = -ENOMEM;
 274                 goto error_exit_free_blocks;
 275             }
 276             memset ( zeros, 0, to_paste*UNFM_P_SIZE);
 277             do {
 278                 to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE );
 279                 if ( is_indirect_le_ih(ih) ) {
 280                     /* Ok, there is existing indirect item already. Need to append it */
 281                     /* Calculate position past inserted item */
 282                     make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
 283                     res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)zeros, UNFM_P_SIZE*to_paste);
 284                     if ( res ) {
 285                         kfree(zeros);
 286                         goto error_exit_free_blocks;
 287                     }
 288                 } else if ( is_statdata_le_ih(ih) ) {
 289                     /* No existing item, create it */
 290                     /* item head for new item */
 291                     struct item_head ins_ih;
 292
 293                     /* create a key for our new item */
 294                     make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3);
 295
 296                     /* Create new item head for our new item */
 297                     make_le_item_head (&ins_ih, &key, key.version, 1,
 298                                        TYPE_INDIRECT, to_paste*UNFM_P_SIZE,
 299                                        0 /* free space */);
 300
 301                     /* Find where such item should live in the tree */
 302                     res = search_item (inode->i_sb, &key, &path);
 303                     if ( res != ITEM_NOT_FOUND ) {
 304                         /* item should not exist, otherwise we have error */
 305                         if ( res != -ENOSPC ) {
 306                             reiserfs_warning (inode->i_sb,
 307                                 "green-9008: search_by_key (%K) returned %d",
 308                                               &key, res);
 309                         }
 310                         res = -EIO;
 311                         kfree(zeros);
 312                         goto error_exit_free_blocks;
 313                     }
 314                     res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)zeros);
 315                 } else {
 316                     reiserfs_panic(inode->i_sb, "green-9011: Unexpected key type %K\n", &key);
 317                 }
 318                 if ( res ) {
 319                     kfree(zeros);
 320                     goto error_exit_free_blocks;
 321                 }
 322                 /* Now we want to check if transaction is too full, and if it is
 323                    we restart it. This will also free the path. */
 324                 if (journal_transaction_should_end(th, th->t_blocks_allocated))
 325                     restart_transaction(th, inode, &path);
 326
 327                 /* Well, need to recalculate path and stuff */
 328                 set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + (to_paste << inode->i_blkbits));
 329                 res = search_for_position_by_key(inode->i_sb, &key, &path);
 330                 if ( res == IO_ERROR ) {
 331                     res = -EIO;
 332                     kfree(zeros);
 333                     goto error_exit_free_blocks;
 334                 }
 335                 bh=get_last_bh(&path);
 336                 ih=get_ih(&path);
 337                 item = get_item(&path);
 338                 hole_size -= to_paste;
 339             } while ( hole_size );
 340             kfree(zeros);
 341         }
 342     }
 343
 344     // Go through existing indirect items first
 345     // replace all zeroes with blocknumbers from list
 346     // Note that if no corresponding item was found, by previous search,
 347     // it means there are no existing in-tree representation for file area
 348     // we are going to overwrite, so there is nothing to scan through for holes.
 349     for ( curr_block = 0, itempos = path.pos_in_item ; curr_block < blocks_to_allocate && res == POSITION_FOUND ; ) {
 350 retry:
 351         if ( itempos >= ih_item_len(ih)/UNFM_P_SIZE ) {
 352             /* We run out of data in this indirect item, let's look for another
 353                one. */
 354             /* First if we are already modifying current item, log it */
 355             if ( modifying_this_item ) {
 356                 journal_mark_dirty (th, inode->i_sb, bh);
 357                 modifying_this_item = 0;
 358             }
 359             /* Then set the key to look for a new indirect item (offset of old
 360                item is added to old item length */
 361             set_cpu_key_k_offset( &key, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize));
 362             /* Search ofor position of new key in the tree. */
 363             res = search_for_position_by_key(inode->i_sb, &key, &path);
 364             if ( res == IO_ERROR) {
 365                 res = -EIO;
 366                 goto error_exit_free_blocks;
 367             }
 368             bh=get_last_bh(&path);
 369             ih=get_ih(&path);
 370             item = get_item(&path);
 371             itempos = path.pos_in_item;
 372             continue; // loop to check all kinds of conditions and so on.
 373         }
 374         /* Ok, we have correct position in item now, so let's see if it is
 375            representing file hole (blocknumber is zero) and fill it if needed */
 376         if ( !item[itempos] ) {
 377             /* Ok, a hole. Now we need to check if we already prepared this
 378                block to be journaled */
 379             while ( !modifying_this_item ) { // loop until succeed
 380                 /* Well, this item is not journaled yet, so we must prepare
 381                    it for journal first, before we can change it */
 382                 struct item_head tmp_ih; // We copy item head of found item,
 383                                          // here to detect if fs changed under
 384                                          // us while we were preparing for
 385                                          // journal.
 386                 int fs_gen; // We store fs generation here to find if someone
 387                             // changes fs under our feet
 388
 389                 copy_item_head (&tmp_ih, ih); // Remember itemhead
 390                 fs_gen = get_generation (inode->i_sb); // remember fs generation
 391                 reiserfs_prepare_for_journal(inode->i_sb, bh, 1); // Prepare a buffer within which indirect item is stored for changing.
 392                 if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
 393                     // Sigh, fs was changed under us, we need to look for new
 394                     // location of item we are working with
 395
 396                     /* unmark prepaerd area as journaled and search for it's
 397                        new position */
 398                     reiserfs_restore_prepared_buffer(inode->i_sb, bh);
 399                     res = search_for_position_by_key(inode->i_sb, &key, &path);
 400                     if ( res == IO_ERROR) {
 401                         res = -EIO;
 402                         goto error_exit_free_blocks;
 403                     }
 404                     bh=get_last_bh(&path);
 405                     ih=get_ih(&path);
 406                     item = get_item(&path);
 407                     itempos = path.pos_in_item;
 408                     goto retry;
 409                 }
 410                 modifying_this_item = 1;
 411             }
 412             item[itempos] = allocated_blocks[curr_block]; // Assign new block
 413             curr_block++;
 414         }
 415         itempos++;
 416     }
 417
 418     if ( modifying_this_item ) { // We need to log last-accessed block, if it
 419                                  // was modified, but not logged yet.
 420         journal_mark_dirty (th, inode->i_sb, bh);
 421     }
 422
 423     if ( curr_block < blocks_to_allocate ) {
 424         // Oh, well need to append to indirect item, or to create indirect item
 425         // if there weren't any
 426         if ( is_indirect_le_ih(ih) ) {
 427             // Existing indirect item - append. First calculate key for append
 428             // position. We do not need to recalculate path as it should
 429             // already point to correct place.
 430             make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
 431             res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block));
 432             if ( res ) {
 433                 goto error_exit_free_blocks;
 434             }
 435         } else if (is_statdata_le_ih(ih) ) {
 436             // Last found item was statdata. That means we need to create indirect item.
 437             struct item_head ins_ih; /* itemhead for new item */
 438
 439             /* create a key for our new item */
 440             make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3); // Position one,
 441                                                             // because that's
 442                                                             // where first
 443                                                             // indirect item
 444                                                             // begins
 445             /* Create new item head for our new item */
 446             make_le_item_head (&ins_ih, &key, key.version, 1, TYPE_INDIRECT,
 447                                (blocks_to_allocate-curr_block)*UNFM_P_SIZE,
 448                                0 /* free space */);
 449             /* Find where such item should live in the tree */
 450             res = search_item (inode->i_sb, &key, &path);
 451             if ( res != ITEM_NOT_FOUND ) {
 452                 /* Well, if we have found such item already, or some error
 453                    occured, we need to warn user and return error */
 454                 if ( res != -ENOSPC ) {
 455                     reiserfs_warning (inode->i_sb,
 456                                       "green-9009: search_by_key (%K) "
 457                                       "returned %d", &key, res);
 458                 }
 459                 res = -EIO;
 460                 goto error_exit_free_blocks;
 461             }
 462             /* Insert item into the tree with the data as its body */
 463             res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)(allocated_blocks+curr_block));
 464         } else {
 465             reiserfs_panic(inode->i_sb, "green-9010: unexpected item type for key %K\n",&key);
 466         }
 467     }
 468
 469     // the caller is responsible for closing the transaction
 470     // unless we return an error, they are also responsible for logging
 471     // the inode.
 472     //
 473     pathrelse(&path);
 474     /*
 475      * cleanup prellocation from previous writes
 476      * if this is a partial block write
 477      */
 478     if (write_bytes & (inode->i_sb->s_blocksize -1))
 479         reiserfs_discard_prealloc(th, inode);
 480     reiserfs_write_unlock(inode->i_sb);
 481
 482     // go through all the pages/buffers and map the buffers to newly allocated
 483     // blocks (so that system knows where to write these pages later).
 484     curr_block = 0;
 485     for ( i = 0; i < num_pages ; i++ ) {
 486         struct page *page=prepared_pages[i]; //current page
 487         struct buffer_head *head = page_buffers(page);// first buffer for a page
 488         int block_start, block_end; // in-page offsets for buffers.
 489
 490         if (!page_buffers(page))
 491             reiserfs_panic(inode->i_sb, "green-9005: No buffers for prepared page???");
 492
 493         /* For each buffer in page */
 494         for(bh = head, block_start = 0; bh != head || !block_start;
 495             block_start=block_end, bh = bh->b_this_page) {
 496             if (!bh)
 497                 reiserfs_panic(inode->i_sb, "green-9006: Allocated but absent buffer for a page?");
 498             block_end = block_start+inode->i_sb->s_blocksize;
 499             if (i == 0 && block_end <= from )
 500                 /* if this buffer is before requested data to map, skip it */
 501                 continue;
 502             if (i == num_pages - 1 && block_start >= to)
 503                 /* If this buffer is after requested data to map, abort
 504                    processing of current page */
 505                 break;
 506
 507             if ( !buffer_mapped(bh) ) { // Ok, unmapped buffer, need to map it
 508                 map_bh( bh, inode->i_sb, le32_to_cpu(allocated_blocks[curr_block]));
 509                 curr_block++;
 510                 set_buffer_new(bh);
 511             }
 512         }
 513     }
 514
 515     RFALSE( curr_block > blocks_to_allocate, "green-9007: Used too many blocks? weird");
 516
 517     kfree(allocated_blocks);
 518     return 0;
 519
 520 // Need to deal with transaction here.
 521 error_exit_free_blocks:
 522     pathrelse(&path);
 523     // free blocks
 524     for( i = 0; i < blocks_to_allocate; i++ )
 525         reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]), 1);
 526
 527 error_exit:
 528     reiserfs_update_sd(th, inode); // update any changes we made to blk count
 529     journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1);
 530     reiserfs_write_unlock(inode->i_sb);
 531     kfree(allocated_blocks);
 532
 533     return res;
 534 }
 535
 536 /* Unlock pages prepared by reiserfs_prepare_file_region_for_write */
 537 void reiserfs_unprepare_pages(struct page **prepared_pages, /* list of locked pages */
 538                               int num_pages /* amount of pages */) {
 539     int i; // loop counter
 540
 541     for (i=0; i < num_pages ; i++) {
 542         struct page *page = prepared_pages[i];
 543
 544         try_to_free_buffers(page);
 545         unlock_page(page);
 546         page_cache_release(page);
 547     }
 548 }
 549
 550 /* This function will copy data from userspace to specified pages within
 551    supplied byte range */
 552 int reiserfs_copy_from_user_to_file_region(
 553                                 loff_t pos, /* In-file position */
 554                                 int num_pages, /* Number of pages affected */
 555                                 int write_bytes, /* Amount of bytes to write */
 556                                 struct page **prepared_pages, /* pointer to
 557                                                                  array to
 558                                                                  prepared pages
 559                                                                 */
 560                                 const char __user *buf /* Pointer to user-supplied
 561                                                    data*/
 562                                 )
 563 {
 564     long page_fault=0; // status of copy_from_user.
 565     int i; // loop counter.
 566     int offset; // offset in page
 567
 568     for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) {
 569         int count = min_t(int,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page
 570         struct page *page=prepared_pages[i]; // Current page we process.
 571
 572         fault_in_pages_readable( buf, count);
 573
 574         /* Copy data from userspace to the current page */
 575         kmap(page);
 576         page_fault = __copy_from_user(page_address(page)+offset, buf, count); // Copy the data.
 577         /* Flush processor's dcache for this page */
 578         flush_dcache_page(page);
 579         kunmap(page);
 580         buf+=count;
 581         write_bytes-=count;
 582
 583         if (page_fault)
 584             break; // Was there a fault? abort.
 585     }
 586
 587     return page_fault?-EFAULT:0;
 588 }
 589
 590 /* taken fs/buffer.c:__block_commit_write */
 591 int reiserfs_commit_page(struct inode *inode, struct page *page,
 592                 unsigned from, unsigned to)
 593 {
 594     unsigned block_start, block_end;
 595     int partial = 0;
 596     unsigned blocksize;
 597     struct buffer_head *bh, *head;
 598     unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
 599     int new;
 600     int logit = reiserfs_file_data_log(inode);
 601     struct super_block *s = inode->i_sb;
 602     int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
 603     struct reiserfs_transaction_handle th;
 604     th.t_trans_id = 0;
 605
 606     blocksize = 1 << inode->i_blkbits;
 607
 608     if (logit) {
 609         reiserfs_write_lock(s);
 610         journal_begin(&th, s, bh_per_page + 1);
 611         reiserfs_update_inode_transaction(inode);
 612     }
 613     for(bh = head = page_buffers(page), block_start = 0;
 614         bh != head || !block_start;
 615         block_start=block_end, bh = bh->b_this_page)
 616     {
 617
 618         new = buffer_new(bh);
 619         clear_buffer_new(bh);
 620         block_end = block_start + blocksize;
 621         if (block_end <= from || block_start >= to) {
 622             if (!buffer_uptodate(bh))
 623                     partial = 1;
 624         } else {
 625             set_buffer_uptodate(bh);
 626             if (logit) {
 627                 reiserfs_prepare_for_journal(s, bh, 1);
 628                 journal_mark_dirty(&th, s, bh);
 629             } else if (!buffer_dirty(bh)) {
 630                 mark_buffer_dirty(bh);
 631                 /* do data=ordered on any page past the end
 632                  * of file and any buffer marked BH_New.
 633                  */
 634                 if (reiserfs_data_ordered(inode->i_sb) &&
 635                     (new || page->index >= i_size_index)) {
 636                     reiserfs_add_ordered_list(inode, bh);
 637                 }
 638             }
 639         }
 640     }
 641     if (logit) {
 642         journal_end(&th, s, bh_per_page + 1);
 643         reiserfs_write_unlock(s);
 644     }
 645     /*
 646      * If this is a partial write which happened to make all buffers
 647      * uptodate then we can optimize away a bogus readpage() for
 648      * the next read(). Here we 'discover' whether the page went
 649      * uptodate as a result of this (potentially partial) write.
 650      */
 651     if (!partial)
 652         SetPageUptodate(page);
 653     return 0;
 654 }
 655
 656
 657 /* Submit pages for write. This was separated from actual file copying
 658    because we might want to allocate block numbers in-between.
 659    This function assumes that caller will adjust file size to correct value. */
 660 int reiserfs_submit_file_region_for_write(
 661                                 struct reiserfs_transaction_handle *th,
 662                                 struct inode *inode,
 663                                 loff_t pos, /* Writing position offset */
 664                                 int num_pages, /* Number of pages to write */
 665                                 int write_bytes, /* number of bytes to write */
 666                                 struct page **prepared_pages /* list of pages */
 667                                 )
 668 {
 669     int status; // return status of block_commit_write.
 670     int retval = 0; // Return value we are going to return.
 671     int i; // loop counter
 672     int offset; // Writing offset in page.
 673     int orig_write_bytes = write_bytes;
 674     int sd_update = 0;
 675
 676     for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) {
 677         int count = min_t(int,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page
 678         struct page *page=prepared_pages[i]; // Current page we process.
 679
 680         status = reiserfs_commit_page(inode, page, offset, offset+count);
 681         if ( status )
 682             retval = status; // To not overcomplicate matters We are going to
 683                              // submit all the pages even if there was error.
 684                              // we only remember error status to report it on
 685                              // exit.
 686         write_bytes-=count;
 687     }
 688     /* now that we've gotten all the ordered buffers marked dirty,
 689      * we can safely update i_size and close any running transaction
 690      */
 691     if ( pos + orig_write_bytes > inode->i_size) {
 692         inode->i_size = pos + orig_write_bytes; // Set new size
 693         /* If the file have grown so much that tail packing is no
 694          * longer possible, reset "need to pack" flag */
 695         if ( (have_large_tails (inode->i_sb) &&
 696               inode->i_size > i_block_size (inode)*4) ||
 697              (have_small_tails (inode->i_sb) &&
 698              inode->i_size > i_block_size(inode)) )
 699             REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
 700         else if ( (have_large_tails (inode->i_sb) &&
 701                   inode->i_size < i_block_size (inode)*4) ||
 702                   (have_small_tails (inode->i_sb) &&
 703                   inode->i_size < i_block_size(inode)) )
 704             REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
 705
 706         if (th->t_trans_id) {
 707             reiserfs_write_lock(inode->i_sb);
 708             reiserfs_update_sd(th, inode); // And update on-disk metadata
 709             reiserfs_write_unlock(inode->i_sb);
 710         } else
 711             inode->i_sb->s_op->dirty_inode(inode);
 712
 713         sd_update = 1;
 714     }
 715     if (th->t_trans_id) {
 716         reiserfs_write_lock(inode->i_sb);
 717         if (!sd_update)
 718             reiserfs_update_sd(th, inode);
 719         journal_end(th, th->t_super, th->t_blocks_allocated);
 720         reiserfs_write_unlock(inode->i_sb);
 721     }
 722     th->t_trans_id = 0;
 723
 724     /*
 725      * we have to unlock the pages after updating i_size, otherwise
 726      * we race with writepage
 727      */
 728     for ( i = 0; i < num_pages ; i++) {
 729         struct page *page=prepared_pages[i];
 730         unlock_page(page);
 731         mark_page_accessed(page);
 732         page_cache_release(page);
 733     }
 734     return retval;
 735 }
 736
 737 /* Look if passed writing region is going to touch file's tail
 738    (if it is present). And if it is, convert the tail to unformatted node */
 739 int reiserfs_check_for_tail_and_convert( struct inode *inode, /* inode to deal with */
 740                                          loff_t pos, /* Writing position */
 741                                          int write_bytes /* amount of bytes to write */
 742                                         )
 743 {
 744     INITIALIZE_PATH(path); // needed for search_for_position
 745     struct cpu_key key; // Key that would represent last touched writing byte.
 746     struct item_head *ih; // item header of found block;
 747     int res; // Return value of various functions we call.
 748     int cont_expand_offset; // We will put offset for generic_cont_expand here
 749                             // This can be int just because tails are created
 750                             // only for small files.
 751
 752 /* this embodies a dependency on a particular tail policy */
 753     if ( inode->i_size >= inode->i_sb->s_blocksize*4 ) {
 754         /* such a big files do not have tails, so we won't bother ourselves
 755            to look for tails, simply return */
 756         return 0;
 757     }
 758
 759     reiserfs_write_lock(inode->i_sb);
 760     /* find the item containing the last byte to be written, or if
 761      * writing past the end of the file then the last item of the
 762      * file (and then we check its type). */
 763     make_cpu_key (&key, inode, pos+write_bytes+1, TYPE_ANY, 3/*key length*/);
 764     res = search_for_position_by_key(inode->i_sb, &key, &path);
 765     if ( res == IO_ERROR ) {
 766         reiserfs_write_unlock(inode->i_sb);
 767         return -EIO;
 768     }
 769     ih = get_ih(&path);
 770     res = 0;
 771     if ( is_direct_le_ih(ih) ) {
 772         /* Ok, closest item is file tail (tails are stored in "direct"
 773          * items), so we need to unpack it. */
 774         /* To not overcomplicate matters, we just call generic_cont_expand
 775            which will in turn call other stuff and finally will boil down to
 776             reiserfs_get_block() that would do necessary conversion. */
 777         cont_expand_offset = le_key_k_offset(get_inode_item_key_version(inode), &(ih->ih_key));
 778         pathrelse(&path);
 779         res = generic_cont_expand( inode, cont_expand_offset);
 780     } else
 781         pathrelse(&path);
 782
 783     reiserfs_write_unlock(inode->i_sb);
 784     return res;
 785 }
 786
/* Lock the page cache pages covering a write region and get them ready
   to receive data.

   Starting at the page containing @pos, @num_pages pages of @inode are
   grabbed (and thereby locked) and stored in @prepared_pages; pages
   without buffers get empty ones.  Buffers in the region are then
   mapped to blocks the file already has, where such blocks exist.
   A buffer at the start of the first page or at the end of the last
   page that is overwritten only partially is read in if it is mapped,
   or has its untouched part zeroed if it is not.

   Returns the number of blocks that still have to be allocated to
   cover the new file data, or a negative error:
     -EFAULT  @num_pages is less than 1 (a warning is logged),
     -ENOMEM  a page could not be grabbed,
     -EIO     reading a partially overwritten buffer failed.
   On -ENOMEM and -EIO the pages obtained so far are handed to
   reiserfs_unprepare_pages() before returning. */
int reiserfs_prepare_file_region_for_write(
                                struct inode *inode /* Inode of the file */,
                                loff_t pos, /* position in the file */
                                int num_pages, /* number of pages to
                                                  prepare */
                                int write_bytes, /* Amount of bytes to be
                                                    overwritten from
                                                    @pos */
                                struct page **prepared_pages /* pointer to array
                                                               where to store
                                                               prepared pages */
                                           )
{
    int res=0; // Return values of different functions we call.
    unsigned long index = pos >> PAGE_CACHE_SHIFT; // Offset in file in pages.
    int from = (pos & (PAGE_CACHE_SIZE - 1)); // Writing offset in first page
    int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
                                         /* offset just past the last modified
                                            byte in the last page */
    struct address_space *mapping = inode->i_mapping; // Pages are mapped here.
    int i; // Simple counter
    int blocks = 0; /* Return value (blocks that should be allocated) */
    struct buffer_head *bh, *head; // Current bufferhead and first bufferhead
                                   // of a page.
    unsigned block_start, block_end; // Starting and ending offsets of current
                                     // buffer in the page.
    /* Buffers for which a read was issued because they are only partially
       overwritten.  There are at most 2 of them: one at the beginning of
       the write area and one at the end.  Everything in the middle gets
       overwritten totally. */
    struct buffer_head *wait[2], **wait_bh=wait;

    struct cpu_key key; // cpu key of item that we are going to deal with
    struct item_head *ih = NULL; // pointer to item head that we are going to deal with
    struct buffer_head *itembuf=NULL; // Buffer head that contains items that we are going to deal with
    INITIALIZE_PATH(path); // path to item, that we are going to deal with.
    __u32 * item=NULL; // pointer to item we are going to deal with
    int item_pos=-1; /* Position in indirect item */


    if ( num_pages < 1 ) {
        reiserfs_warning (inode->i_sb,
                          "green-9001: reiserfs_prepare_file_region_for_write "
                          "called with zero number of pages to process");
        return -EFAULT;
    }

    /* We have 2 loops for pages. In first loop we grab and lock the pages, so
       that nobody would touch these until we release the pages. Then
       we'd start to deal with mapping buffers to blocks. */
    for ( i = 0; i < num_pages; i++) {
        prepared_pages[i] = grab_cache_page(mapping, index + i); // locks the page
        if ( !prepared_pages[i]) {
            res = -ENOMEM;
            /* i is the number of pages grabbed so far; the error path
               uses it as the count to clean up. */
            goto failed_page_grabbing;
        }
        if (!page_has_buffers(prepared_pages[i]))
            create_empty_buffers(prepared_pages[i], inode->i_sb->s_blocksize, 0);
    }

    /* Let's count the amount of blocks for the case where all the blocks
       overwritten are new (we will subtract already allocated blocks later) */
    if ( num_pages > 2 )
        /* These are fully overwritten pages, so all the blocks in these
           pages are counted as needing allocation */
        blocks = (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);

    /* count blocks needed for first page (possibly partially written) */
    blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) +
           !!(from & (inode->i_sb->s_blocksize-1)); /* roundup */

    /* Now we account for last page. If last page == first page (we
       overwrite only one page), we subtract all the blocks past the
       last writing position in a page out of already calculated number
       of blocks */
    blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT-inode->i_blkbits)) -
           ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
           /* Note how we do not roundup here since partial blocks still
                   should be allocated */

    /* Now if all the write area lies past the file end, no point in
       mapping blocks, since there is none, so we just zero out remaining
       parts of first and last pages in write area (if needed) */
    if ( (pos & ~((loff_t)PAGE_CACHE_SIZE - 1)) > inode->i_size ) {
        /* NOTE(review): the two memsets below write to the pages through
           a kmap and no flush_dcache_page() follows in this function;
           confirm the pages are flushed elsewhere. */
        if ( from != 0 ) {/* First page needs to be partially zeroed */
            char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
            memset(kaddr, 0, from);
            kunmap_atomic( kaddr, KM_USER0);
        }
        if ( to != PAGE_CACHE_SIZE ) { /* Last page needs to be partially zeroed */
            char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0);
            memset(kaddr+to, 0, PAGE_CACHE_SIZE - to);
            kunmap_atomic( kaddr, KM_USER0);
        }

        /* Since all blocks are new - use already calculated value */
        return blocks;
    }

    /* Well, since we write somewhere into the middle of a file, there is
       possibility we are writing over some already allocated blocks, so
       let's map these blocks and subtract number of such blocks out of blocks
       we need to allocate (calculated above) */
    /* Mask write position to start on blocksize, we do it out of the
       loop for performance reasons.  Note that pos stays rounded down
       for the rest of the function. */
    pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
    /* Set cpu key to the starting position in a file (on left block boundary)*/
    make_cpu_key (&key, inode, 1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)), TYPE_ANY, 3/*key length*/);

    reiserfs_write_lock(inode->i_sb); // We need that for at least search_by_key()
    for ( i = 0; i < num_pages ; i++ ) {

        head = page_buffers(prepared_pages[i]);
        /* For each buffer in the page.  The buffers form a ring through
           b_this_page; "!block_start" lets the loop body run for the
           head buffer on the first pass. */
        for(bh = head, block_start = 0; bh != head || !block_start;
            block_start=block_end, bh = bh->b_this_page) {
                if (!bh)
                    reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
                /* Find where this buffer ends */
                block_end = block_start+inode->i_sb->s_blocksize;
                if (i == 0 && block_end <= from )
                    /* if this buffer is before requested data to map, skip it*/
                    continue;

                if (i == num_pages - 1 && block_start >= to) {
                    /* If this buffer is after requested data to map, abort
                       processing of current page */
                    break;
                }

                if ( buffer_mapped(bh) && bh->b_blocknr !=0 ) {
                    /* This is an optimisation for the case where the buffer
                       is mapped and has a block number assigned. If a
                       significant amount of such buffers is present, we
                       may avoid some amount of search_by_key calls.
                       Probably it would be possible to move parts of this code
                       out of BKL, but I am afraid that would overcomplicate
                       the code without any noticeable benefit.
                    */
                    item_pos++;
                    /* Update the key */
                    set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize);
                    blocks--; // Decrease the amount of blocks that need to be
                              // allocated
                    continue; // Go to the next buffer
                }

                if ( !itembuf || /* if first iteration */
                     item_pos >= ih_item_len(ih)/UNFM_P_SIZE)
                                             { /* or if we progressed past the
                                                  current unformatted_item */
                        /* Try to find next item */
                        res = search_for_position_by_key(inode->i_sb, &key, &path);
                        /* Abort if no more items.  Any result other than
                           POSITION_FOUND ends the scan of this page. */
                        if ( res != POSITION_FOUND ) {
                            /* make sure later loops don't use this item */
                            itembuf = NULL;
                            item = NULL;
                            break;
                        }

                        /* Update information about current indirect item */
                        itembuf = get_last_bh( &path );
                        ih = get_ih( &path );
                        item = get_item( &path );
                        item_pos = path.pos_in_item;

                        RFALSE( !is_indirect_le_ih (ih), "green-9003: indirect item expected");
                }

                /* See if there is some block associated with the file
                   at that position, map the buffer to this block */
                if ( get_block_num(item,item_pos) ) {
                    map_bh(bh, inode->i_sb, get_block_num(item,item_pos));
                    blocks--; // Decrease the amount of blocks that need to be
                              // allocated
                }
                item_pos++;
                /* Update the key */
                set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize);
        }
    }
    pathrelse(&path); // Free the path
    reiserfs_write_unlock(inode->i_sb);

        /* Now zero out unmapped buffers for the first and last pages of
           write area or issue read requests if page is mapped. */
        /* First page, see if it is not uptodate */
        if ( !PageUptodate(prepared_pages[0]) ) {
            head = page_buffers(prepared_pages[0]);

            /* For each buffer in page */
            for(bh = head, block_start = 0; bh != head || !block_start;
                block_start=block_end, bh = bh->b_this_page) {

                if (!bh)
                    reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
                /* Find where this buffer ends */
                block_end = block_start+inode->i_sb->s_blocksize;
                if ( block_end <= from )
                    /* if this buffer is before requested data to map, skip it*/
                    continue;
                if ( block_start < from ) { /* Aha, our partial buffer */
                    if ( buffer_mapped(bh) ) { /* If it is mapped, we need to
                                                  issue READ request for it to
                                                  not lose data */
                        ll_rw_block(READ, 1, &bh);
                        *wait_bh++=bh;
                    } else { /* Not mapped, zero it */
                        /* NOTE(review): no flush_dcache_page() after this
                           kmap'd write either; confirm. */
                        char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
                        memset(kaddr+block_start, 0, from-block_start);
                        kunmap_atomic( kaddr, KM_USER0);
                        set_buffer_uptodate(bh);
                    }
                }
            }
        }

        /* Last page, see if it is not uptodate, or if the last page is past
           the end of the file.  pos was rounded down to a block boundary
           above, so pos+write_bytes here is not the exact end of the
           write. */
        if ( !PageUptodate(prepared_pages[num_pages-1]) ||
            ((pos+write_bytes)>>PAGE_CACHE_SHIFT) > (inode->i_size>>PAGE_CACHE_SHIFT) ) {
            head = page_buffers(prepared_pages[num_pages-1]);

            /* for each buffer in page */
            for(bh = head, block_start = 0; bh != head || !block_start;
                block_start=block_end, bh = bh->b_this_page) {

                if (!bh)
                    reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
                /* Find where this buffer ends */
                block_end = block_start+inode->i_sb->s_blocksize;
                if ( block_start >= to )
                    /* if this buffer is after requested data to map, stop */
                    break;
                if ( block_end > to ) { /* Aha, our partial buffer */
                    if ( buffer_mapped(bh) ) { /* If it is mapped, we need to
                                                  issue READ request for it to
                                                  not lose data */
                        ll_rw_block(READ, 1, &bh);
                        *wait_bh++=bh;
                    } else { /* Not mapped, zero it */
                        /* NOTE(review): as above, no flush_dcache_page()
                           after this kmap'd write; confirm. */
                        char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0);
                        memset(kaddr+to, 0, block_end-to);
                        kunmap_atomic( kaddr, KM_USER0);
                        set_buffer_uptodate(bh);
                    }
                }
            }
        }

    /* Wait for read requests we made to happen, if necessary.  The
       buffers are waited on in reverse order of submission; the first
       one found not uptodate aborts with -EIO. */
    while(wait_bh > wait) {
        wait_on_buffer(*--wait_bh);
        if (!buffer_uptodate(*wait_bh)) {
            res = -EIO;
            goto failed_read;
        }
    }

    return blocks;
failed_page_grabbing:
    /* Only the first i pages were grabbed, so only those are cleaned up. */
    num_pages = i;
failed_read:
    reiserfs_unprepare_pages(prepared_pages, num_pages);
    return res;
}
1065
1066 /* Write @count bytes at position @ppos in a file indicated by @file
1067    from the buffer @buf.
1068
1069    generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
1070    something simple that works.  It is not for serious use by general purpose filesystems, excepting the one that it was
1071    written for (ext2/3).  This is for several reasons:
1072
1073    * It has no understanding of any filesystem specific optimizations.
1074
1075    * It enters the filesystem repeatedly for each page that is written.
1076
   * It depends on the reiserfs_get_block() function which, as implemented by reiserfs, performs a costly
     search_by_key operation for each page it is supplied with. By contrast, reiserfs_file_write() feeds as much
     as possible at a time to reiserfs, which allows for fewer tree traversals.
1080
1081    * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.
1082
1083    * Asking the block allocation code for blocks one at a time is slightly less efficient.
1084
1085    All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
1086    use it, but we were in a hurry to make code freeze, and so it couldn't be revised then.  This new code should make
1087    things right finally.
1088
1089    Future Features: providing search_by_key with hints.
1090
1091 */
1092 ssize_t reiserfs_file_write( struct file *file, /* the file we are going to write into */
1093                              const char __user *buf, /*  pointer to user supplied data
1094 (in userspace) */
1095                              size_t count, /* amount of bytes to write */
1096                              loff_t *ppos /* pointer to position in file that we start writing at. Should be updated to
1097                                            * new current position before returning. */ )
1098 {
1099     size_t already_written = 0; // Number of bytes already written to the file.
1100     loff_t pos; // Current position in the file.
1101     size_t res; // return value of various functions that we call.
1102     struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to.
1103                                 /* To simplify coding at this time, we store
1104                                    locked pages in array for now */
1105     struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
1106     struct reiserfs_transaction_handle th;
1107     th.t_trans_id = 0;
1108
1109     if ( file->f_flags & O_DIRECT) { // Direct IO needs treatment
1110         int result, after_file_end = 0;
1111         if ( (*ppos + count >= inode->i_size) || (file->f_flags & O_APPEND) ) {
1112             /* If we are appending a file, we need to put this savelink in here.
1113                If we will crash while doing direct io, finish_unfinished will
1114                cut the garbage from the file end. */
1115             reiserfs_write_lock(inode->i_sb);
1116             journal_begin(&th, inode->i_sb,  JOURNAL_PER_BALANCE_CNT );
1117             reiserfs_update_inode_transaction(inode);
1118             add_save_link (&th, inode, 1 /* Truncate */);
1119             journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT );
1120             reiserfs_write_unlock(inode->i_sb);
1121             after_file_end = 1;
1122         }
1123         result = generic_file_write(file, buf, count, ppos);
1124
1125         if ( after_file_end ) { /* Now update i_size and remove the savelink */
1126             struct reiserfs_transaction_handle th;
1127             reiserfs_write_lock(inode->i_sb);
1128             journal_begin(&th, inode->i_sb, 1);
1129             reiserfs_update_inode_transaction(inode);
1130             reiserfs_update_sd(&th, inode);
1131             journal_end(&th, inode->i_sb, 1);
1132             remove_save_link (inode, 1/* truncate */);
1133             reiserfs_write_unlock(inode->i_sb);
1134         }
1135
1136         return result;
1137     }
1138
1139     if ( unlikely((ssize_t) count < 0 ))
1140         return -EINVAL;
1141
1142     if (unlikely(!access_ok(VERIFY_READ, buf, count)))
1143         return -EFAULT;
1144
1145     down(&inode->i_sem); // locks the entire file for just us
1146
1147     pos = *ppos;
1148
1149     /* Check if we can write to specified region of file, file
1150        is not overly big and this kind of stuff. Adjust pos and
1151        count, if needed */
1152     res = generic_write_checks(file, &pos, &count, 0);
1153     if (res)
1154         goto out;
1155
1156     if ( count == 0 )
1157         goto out;
1158
1159     res = remove_suid(file->f_dentry);
1160     if (res)
1161         goto out;
1162
1163     inode_update_time(inode, 1); /* Both mtime and ctime */
1164
1165     // Ok, we are done with all the checks.
1166
1167     // Now we should start real work
1168
1169     /* If we are going to write past the file's packed tail or if we are going
1170        to overwrite part of the tail, we need that tail to be converted into
1171        unformatted node */
1172     res = reiserfs_check_for_tail_and_convert( inode, pos, count);
1173     if (res)
1174         goto out;
1175
1176     while ( count > 0) {
1177         /* This is the main loop in which we running until some error occures
1178            or until we write all of the data. */
1179         int num_pages;/* amount of pages we are going to write this iteration */
1180         int write_bytes; /* amount of bytes to write during this iteration */
1181         int blocks_to_allocate; /* how much blocks we need to allocate for
1182                                    this iteration */
1183
1184         /*  (pos & (PAGE_CACHE_SIZE-1)) is an idiom for offset into a page of pos*/
1185         num_pages = !!((pos+count) & (PAGE_CACHE_SIZE - 1)) + /* round up partial
1186                                                           pages */
1187                     ((count + (pos & (PAGE_CACHE_SIZE-1))) >> PAGE_CACHE_SHIFT);
1188                                                 /* convert size to amount of
1189                                                    pages */
1190         reiserfs_write_lock(inode->i_sb);
1191         if ( num_pages > REISERFS_WRITE_PAGES_AT_A_TIME
1192                 || num_pages > reiserfs_can_fit_pages(inode->i_sb) ) {
1193             /* If we were asked to write more data than we want to or if there
1194                is not that much space, then we shorten amount of data to write
1195                for this iteration. */
1196             num_pages = min_t(int, REISERFS_WRITE_PAGES_AT_A_TIME, reiserfs_can_fit_pages(inode->i_sb));
1197             /* Also we should not forget to set size in bytes accordingly */
1198             write_bytes = (num_pages << PAGE_CACHE_SHIFT) -
1199                             (pos & (PAGE_CACHE_SIZE-1));
1200                                          /* If position is not on the
1201                                             start of the page, we need
1202                                             to substract the offset
1203                                             within page */
1204         } else
1205             write_bytes = count;
1206
1207         /* reserve the blocks to be allocated later, so that later on
1208            we still have the space to write the blocks to */
1209         reiserfs_claim_blocks_to_be_allocated(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits));
1210         reiserfs_write_unlock(inode->i_sb);
1211
1212         if ( !num_pages ) { /* If we do not have enough space even for */
1213             res = -ENOSPC;  /* single page, return -ENOSPC */
1214             if ( pos > (inode->i_size & (inode->i_sb->s_blocksize-1)))
1215                 break; // In case we are writing past the file end, break.
1216             // Otherwise we are possibly overwriting the file, so
1217             // let's set write size to be equal or less than blocksize.
1218             // This way we get it correctly for file holes.
1219             // But overwriting files on absolutelly full volumes would not
1220             // be very efficient. Well, people are not supposed to fill
1221             // 100% of disk space anyway.
1222             write_bytes = min_t(int, count, inode->i_sb->s_blocksize - (pos & (inode->i_sb->s_blocksize - 1)));
1223             num_pages = 1;
1224             // No blocks were claimed before, so do it now.
1225             reiserfs_claim_blocks_to_be_allocated(inode->i_sb, 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits));
1226         }
1227
1228         /* Prepare for writing into the region, read in all the
1229            partially overwritten pages, if needed. And lock the pages,
1230            so that nobody else can access these until we are done.
1231            We get number of actual blocks needed as a result.*/
1232         blocks_to_allocate = reiserfs_prepare_file_region_for_write(inode, pos, num_pages, write_bytes, prepared_pages);
1233         if ( blocks_to_allocate < 0 ) {
1234             res = blocks_to_allocate;
1235             reiserfs_release_claimed_blocks(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits));
1236             break;
1237         }
1238
1239         /* First we correct our estimate of how many blocks we need */
1240         reiserfs_release_claimed_blocks(inode->i_sb, (num_pages << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits)) - blocks_to_allocate );
1241
1242         if ( blocks_to_allocate > 0) {/*We only allocate blocks if we need to*/
1243             /* Fill in all the possible holes and append the file if needed */
1244             res = reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate);
1245         }
1246
1247         /* well, we have allocated the blocks, so it is time to free
1248            the reservation we made earlier. */
1249         reiserfs_release_claimed_blocks(inode->i_sb, blocks_to_allocate);
1250         if ( res ) {
1251             reiserfs_unprepare_pages(prepared_pages, num_pages);
1252             break;
1253         }
1254
1255 /* NOTE that allocating blocks and filling blocks can be done in reverse order
1256    and probably we would do that just to get rid of garbage in files after a
1257    crash */
1258
1259         /* Copy data from user-supplied buffer to file's pages */
1260         res = reiserfs_copy_from_user_to_file_region(pos, num_pages, write_bytes, prepared_pages, buf);
1261         if ( res ) {
1262             reiserfs_unprepare_pages(prepared_pages, num_pages);
1263             break;
1264         }
1265
1266         /* Send the pages to disk and unlock them. */
1267         res = reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages,
1268                                                     write_bytes,prepared_pages);
1269         if ( res )
1270             break;
1271
1272         already_written += write_bytes;
1273         buf += write_bytes;
1274         *ppos = pos += write_bytes;
1275         count -= write_bytes;
1276         balance_dirty_pages_ratelimited(inode->i_mapping);
1277     }
1278
1279     /* this is only true on error */
1280     if (th.t_trans_id) {
1281         reiserfs_write_lock(inode->i_sb);
1282         journal_end(&th, th.t_super, th.t_blocks_allocated);
1283         reiserfs_write_unlock(inode->i_sb);
1284     }
1285
1286     if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
1287         res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA);
1288
1289     up(&inode->i_sem);
1290     reiserfs_async_progress_wait(inode->i_sb);
1291     return (already_written != 0)?already_written:res;
1292
1293 out:
1294     up(&inode->i_sem); // unlock the file on exit.
1295     return res;
1296 }
1297
1298 static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user *buf,
1299                                size_t count, loff_t pos)
1300 {
         /*
          * Thin wrapper around the generic asynchronous write path: iocb, buf,
          * count and pos are handed to generic_file_aio_write() unchanged and
          * its return value is returned as-is.  This function is installed as
          * the .aio_write hook of reiserfs_file_operations, whereas the
          * synchronous .write hook there is reiserfs_file_write.
          */
1301     return generic_file_aio_write(iocb, buf, count, pos);
1302 }
1303
1304
1305
1306 struct file_operations reiserfs_file_operations = {
         /*
          * File-level entry points for reiserfs files.  read, mmap, sendfile
          * and aio_read go straight to the generic_file_* helpers; write,
          * ioctl, release and fsync are routed to reiserfs_* routines.
          * reiserfs_file_release() calls BUG() when the inode is not S_ISREG,
          * so this table is only suitable for regular files.
          * NOTE(review): the place where this table is attached to an inode
          * is not in this file section -- confirm at the assignment site.
          */
1307     .read       = generic_file_read,
1308     .write      = reiserfs_file_write,
1309     .ioctl      = reiserfs_ioctl,
1310     .mmap       = generic_file_mmap,
1311     .release    = reiserfs_file_release,   /* tail packing on close, see the comment at the top of this file */
1312     .fsync      = reiserfs_sync_file,
1313     .sendfile   = generic_file_sendfile,
1314     .aio_read   = generic_file_aio_read,
1315     .aio_write  = reiserfs_aio_write,      /* forwards to generic_file_aio_write() */
1316 };
1317
1318
1319 struct  inode_operations reiserfs_file_inode_operations = {
         /*
          * Inode-level entry points that accompany reiserfs_file_operations:
          * truncation, attribute changes, the four extended-attribute calls
          * (set/get/list/remove) and permission checking.  Every hook is
          * routed to a reiserfs_* routine; none of them uses a generic helper.
          */
1320     .truncate   = reiserfs_vfs_truncate_file,
1321     .setattr    = reiserfs_setattr,
         /* extended attribute handlers */
1322     .setxattr   = reiserfs_setxattr,
1323     .getxattr   = reiserfs_getxattr,
1324     .listxattr  = reiserfs_listxattr,
1325     .removexattr = reiserfs_removexattr,
1326     .permission = reiserfs_permission,
1327 };
1328
1329