upgrade to linux 2.6.10-1.12_FC2
1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5  */
6
7 /*
8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9  *
10  * Removed a lot of unnecessary code and simplified things now that
11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12  *
13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15  *
16  * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17  *
18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19  */
20
21 #include <linux/config.h>
22 #include <linux/kernel.h>
23 #include <linux/syscalls.h>
24 #include <linux/fs.h>
25 #include <linux/mm.h>
26 #include <linux/percpu.h>
27 #include <linux/slab.h>
28 #include <linux/smp_lock.h>
29 #include <linux/blkdev.h>
30 #include <linux/file.h>
31 #include <linux/quotaops.h>
32 #include <linux/highmem.h>
33 #include <linux/module.h>
34 #include <linux/writeback.h>
35 #include <linux/hash.h>
36 #include <linux/suspend.h>
37 #include <linux/buffer_head.h>
38 #include <linux/bio.h>
39 #include <linux/notifier.h>
40 #include <linux/cpu.h>
41 #include <linux/bitops.h>
42
43 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
44 static void invalidate_bh_lrus(void);
45
46 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
47
48 inline void
49 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
50 {
51         bh->b_end_io = handler;
52         bh->b_private = private;
53 }
54
55 static int sync_buffer(void *word)
56 {
57         struct block_device *bd;
58         struct buffer_head *bh
59                 = container_of(word, struct buffer_head, b_state);
60
61         smp_mb();
62         bd = bh->b_bdev;
63         if (bd)
64                 blk_run_address_space(bd->bd_inode->i_mapping);
65         io_schedule();
66         return 0;
67 }
68
69 void fastcall __lock_buffer(struct buffer_head *bh)
70 {
71         wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
72                                                         TASK_UNINTERRUPTIBLE);
73 }
74 EXPORT_SYMBOL(__lock_buffer);
75
76 void fastcall unlock_buffer(struct buffer_head *bh)
77 {
78         clear_buffer_locked(bh);
79         smp_mb__after_clear_bit();
80         wake_up_bit(&bh->b_state, BH_Lock);
81 }
82
83 /*
84  * Block until a buffer comes unlocked.  This doesn't stop it
85  * from becoming locked again - you have to lock it yourself
86  * if you want to preserve its state.
87  */
88 void __wait_on_buffer(struct buffer_head * bh)
89 {
90         wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
91 }
92
93 static void
94 __clear_page_buffers(struct page *page)
95 {
96         ClearPagePrivate(page);
97         page->private = 0;
98         page_cache_release(page);
99 }
100
101 static void buffer_io_error(struct buffer_head *bh)
102 {
103         char b[BDEVNAME_SIZE];
104
105         printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
106                         bdevname(bh->b_bdev, b),
107                         (unsigned long long)bh->b_blocknr);
108 }
109
110 /*
111  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
112  * unlock the buffer. This is what ll_rw_block uses too.
113  */
114 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
115 {
116         if (uptodate) {
117                 set_buffer_uptodate(bh);
118         } else {
119                 /* This happens due to failed READA attempts. */
120                 clear_buffer_uptodate(bh);
121         }
122         unlock_buffer(bh);
123         put_bh(bh);
124 }
125
126 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
127 {
128         char b[BDEVNAME_SIZE];
129
130         if (uptodate) {
131                 set_buffer_uptodate(bh);
132         } else {
133                 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
134                         buffer_io_error(bh);
135                         printk(KERN_WARNING "lost page write due to "
136                                         "I/O error on %s\n",
137                                        bdevname(bh->b_bdev, b));
138                 }
139                 set_buffer_write_io_error(bh);
140                 clear_buffer_uptodate(bh);
141         }
142         unlock_buffer(bh);
143         put_bh(bh);
144 }
145
146 /*
147  * Write out and wait upon all the dirty data associated with a block
148  * device via its mapping.  Does not take the superblock lock.
149  */
150 int sync_blockdev(struct block_device *bdev)
151 {
152         int ret = 0;
153
154         if (bdev) {
155                 int err;
156
157                 ret = filemap_fdatawrite(bdev->bd_inode->i_mapping);
158                 err = filemap_fdatawait(bdev->bd_inode->i_mapping);
159                 if (!ret)
160                         ret = err;
161         }
162         return ret;
163 }
164 EXPORT_SYMBOL(sync_blockdev);
165
166 /*
167  * Write out and wait upon all dirty data associated with this
168  * superblock.  Filesystem data as well as the underlying block
169  * device.  Takes the superblock lock.
170  */
171 int fsync_super(struct super_block *sb)
172 {
173         sync_inodes_sb(sb, 0);
174         DQUOT_SYNC(sb);
175         lock_super(sb);
176         if (sb->s_dirt && sb->s_op->write_super)
177                 sb->s_op->write_super(sb);
178         unlock_super(sb);
179         if (sb->s_op->sync_fs)
180                 sb->s_op->sync_fs(sb, 1);
181         sync_blockdev(sb->s_bdev);
182         sync_inodes_sb(sb, 1);
183
184         return sync_blockdev(sb->s_bdev);
185 }
186
187 /*
188  * Write out and wait upon all dirty data associated with this
189  * device.   Filesystem data as well as the underlying block
190  * device.  Takes the superblock lock.
191  */
192 int fsync_bdev(struct block_device *bdev)
193 {
194         struct super_block *sb = get_super(bdev);
195         if (sb) {
196                 int res = fsync_super(sb);
197                 drop_super(sb);
198                 return res;
199         }
200         return sync_blockdev(bdev);
201 }
202
203 /**
204  * freeze_bdev  --  lock a filesystem and force it into a consistent state
205  * @bdev:       blockdevice to lock
206  *
207  * This takes the block device bd_mount_sem to make sure no new mounts
208  * happen on bdev until thaw_bdev() is called.
209  * If a superblock is found on this device, we take the s_umount semaphore
210  * on it to make sure nobody unmounts until the snapshot creation is done.
211  */
212 struct super_block *freeze_bdev(struct block_device *bdev)
213 {
214         struct super_block *sb;
215
216         down(&bdev->bd_mount_sem);
217         sb = get_super(bdev);
218         if (sb && !(sb->s_flags & MS_RDONLY)) {
219                 sb->s_frozen = SB_FREEZE_WRITE;
220                 wmb();
221
222                 sync_inodes_sb(sb, 0);
223                 DQUOT_SYNC(sb);
224
225                 lock_super(sb);
226                 if (sb->s_dirt && sb->s_op->write_super)
227                         sb->s_op->write_super(sb);
228                 unlock_super(sb);
229
230                 if (sb->s_op->sync_fs)
231                         sb->s_op->sync_fs(sb, 1);
232
233                 sync_blockdev(sb->s_bdev);
234                 sync_inodes_sb(sb, 1);
235
236                 sb->s_frozen = SB_FREEZE_TRANS;
237                 wmb();
238
239                 sync_blockdev(sb->s_bdev);
240
241                 if (sb->s_op->write_super_lockfs)
242                         sb->s_op->write_super_lockfs(sb);
243         }
244
245         sync_blockdev(bdev);
246         return sb;      /* thaw_bdev releases s->s_umount and bd_mount_sem */
247 }
248 EXPORT_SYMBOL(freeze_bdev);
249
250 /**
251  * thaw_bdev  -- unlock filesystem
252  * @bdev:       blockdevice to unlock
253  * @sb:         associated superblock
254  *
255  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
256  */
257 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
258 {
259         if (sb) {
260                 BUG_ON(sb->s_bdev != bdev);
261
262                 if (sb->s_op->unlockfs)
263                         sb->s_op->unlockfs(sb);
264                 sb->s_frozen = SB_UNFROZEN;
265                 wmb();
266                 wake_up(&sb->s_wait_unfrozen);
267                 drop_super(sb);
268         }
269
270         up(&bdev->bd_mount_sem);
271 }
272 EXPORT_SYMBOL(thaw_bdev);
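/*
 * Editor's illustrative sketch (not part of the original file): how a
 * snapshot-style driver might bracket its work with freeze_bdev() and
 * thaw_bdev() above.  The function name and the "take the snapshot" step
 * are invented for the example; only the freeze/thaw calls are real.
 */
#include <linux/fs.h>
#include <linux/buffer_head.h>

static void example_snapshot_device(struct block_device *bdev)
{
        struct super_block *sb;

        sb = freeze_bdev(bdev);         /* NULL if no mounted fs was found */

        /* ... copy or snapshot the now-quiescent device here ... */

        thaw_bdev(bdev, sb);            /* thaw_bdev() tolerates sb == NULL */
}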
273
274 /*
275  * sync everything.  Start out by waking pdflush, because that writes back
276  * all queues in parallel.
277  */
278 static void do_sync(unsigned long wait)
279 {
280         wakeup_bdflush(0);
281         sync_inodes(0);         /* All mappings, inodes and their blockdevs */
282         DQUOT_SYNC(NULL);
283         sync_supers();          /* Write the superblocks */
284         sync_filesystems(0);    /* Start syncing the filesystems */
285         sync_filesystems(wait); /* Waitingly sync the filesystems */
286         sync_inodes(wait);      /* Mappings, inodes and blockdevs, again. */
287         if (!wait)
288                 printk("Emergency Sync complete\n");
289         if (unlikely(laptop_mode))
290                 laptop_sync_completion();
291 }
292
293 asmlinkage long sys_sync(void)
294 {
295         do_sync(1);
296         return 0;
297 }
298
299 void emergency_sync(void)
300 {
301         pdflush_operation(do_sync, 0);
302 }
303
304 /*
305  * Generic function to fsync a file.
306  *
307  * filp may be NULL if called via the msync of a vma.
308  */
309  
310 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
311 {
312         struct inode * inode = dentry->d_inode;
313         struct super_block * sb;
314         int ret;
315
316         /* sync the inode to buffers */
317         write_inode_now(inode, 0);
318
319         /* sync the superblock to buffers */
320         sb = inode->i_sb;
321         lock_super(sb);
322         if (sb->s_op->write_super)
323                 sb->s_op->write_super(sb);
324         unlock_super(sb);
325
326         /* .. finally sync the buffers to disk */
327         ret = sync_blockdev(sb->s_bdev);
328         return ret;
329 }
330
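/*
 * Editor's illustrative sketch (not part of the original file): a simple
 * filesystem can point ->fsync at the generic file_fsync() above and use
 * the usual generic helpers for the rest.  "examplefs" is an invented name.
 */
#include <linux/fs.h>

static struct file_operations examplefs_file_ops = {
        .llseek         = generic_file_llseek,
        .read           = generic_file_read,
        .write          = generic_file_write,
        .mmap           = generic_file_mmap,
        .fsync          = file_fsync,
};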
331 asmlinkage long sys_fsync(unsigned int fd)
332 {
333         struct file * file;
334         struct address_space *mapping;
335         int ret, err;
336
337         ret = -EBADF;
338         file = fget(fd);
339         if (!file)
340                 goto out;
341
342         mapping = file->f_mapping;
343
344         ret = -EINVAL;
345         if (!file->f_op || !file->f_op->fsync) {
346                 /* Why?  We can still call filemap_fdatawrite */
347                 goto out_putf;
348         }
349
350         /* We need to protect against concurrent writers.. */
351         down(&mapping->host->i_sem);
352         current->flags |= PF_SYNCWRITE;
353         ret = filemap_fdatawrite(mapping);
354         err = file->f_op->fsync(file, file->f_dentry, 0);
355         if (!ret)
356                 ret = err;
357         err = filemap_fdatawait(mapping);
358         if (!ret)
359                 ret = err;
360         current->flags &= ~PF_SYNCWRITE;
361         up(&mapping->host->i_sem);
362
363 out_putf:
364         fput(file);
365 out:
366         return ret;
367 }
368
369 asmlinkage long sys_fdatasync(unsigned int fd)
370 {
371         struct file * file;
372         struct address_space *mapping;
373         int ret, err;
374
375         ret = -EBADF;
376         file = fget(fd);
377         if (!file)
378                 goto out;
379
380         ret = -EINVAL;
381         if (!file->f_op || !file->f_op->fsync)
382                 goto out_putf;
383
384         mapping = file->f_mapping;
385
386         down(&mapping->host->i_sem);
387         current->flags |= PF_SYNCWRITE;
388         ret = filemap_fdatawrite(mapping);
389         err = file->f_op->fsync(file, file->f_dentry, 1);
390         if (!ret)
391                 ret = err;
392         err = filemap_fdatawait(mapping);
393         if (!ret)
394                 ret = err;
395         current->flags &= ~PF_SYNCWRITE;
396         up(&mapping->host->i_sem);
397
398 out_putf:
399         fput(file);
400 out:
401         return ret;
402 }
403
404 /*
405  * Various filesystems appear to want __find_get_block to be non-blocking.
406  * But it's the page lock which protects the buffers.  To get around this,
407  * we get exclusion from try_to_free_buffers with the blockdev mapping's
408  * private_lock.
409  *
410  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
411  * may be quite high.  This code could TryLock the page, and if that
412  * succeeds, there is no need to take private_lock. (But if
413  * private_lock is contended then so is mapping->tree_lock).
414  */
415 static struct buffer_head *
416 __find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
417 {
418         struct inode *bd_inode = bdev->bd_inode;
419         struct address_space *bd_mapping = bd_inode->i_mapping;
420         struct buffer_head *ret = NULL;
421         pgoff_t index;
422         struct buffer_head *bh;
423         struct buffer_head *head;
424         struct page *page;
425         int all_mapped = 1;
426
427         index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
428         page = find_get_page(bd_mapping, index);
429         if (!page)
430                 goto out;
431
432         spin_lock(&bd_mapping->private_lock);
433         if (!page_has_buffers(page))
434                 goto out_unlock;
435         head = page_buffers(page);
436         bh = head;
437         do {
438                 if (bh->b_blocknr == block) {
439                         ret = bh;
440                         get_bh(bh);
441                         goto out_unlock;
442                 }
443                 if (!buffer_mapped(bh))
444                         all_mapped = 0;
445                 bh = bh->b_this_page;
446         } while (bh != head);
447
448         /* we might be here because some of the buffers on this page are
449          * not mapped.  This is due to various races between
450          * file I/O on the block device and getblk.  It gets dealt with
451          * elsewhere; don't buffer_error if we had some unmapped buffers.
452          */
453         if (all_mapped) {
454                 printk("__find_get_block_slow() failed. "
455                         "block=%llu, b_blocknr=%llu\n",
456                         (unsigned long long)block, (unsigned long long)bh->b_blocknr);
457                 printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
458                 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
459         }
460 out_unlock:
461         spin_unlock(&bd_mapping->private_lock);
462         page_cache_release(page);
463 out:
464         return ret;
465 }
466
467 /* If invalidate_buffers() will trash dirty buffers, it means some kind
468    of fs corruption is going on. Trashing dirty data always implies losing
469    information that was supposed to be just stored on the physical layer
470    by the user.
471
472    Thus invalidate_buffers in general usage is not allowed to trash
473    dirty buffers. For example, ioctl(BLKFLSBUF) expects dirty data to
474    be preserved.  These buffers are simply skipped.
475
476    We also skip buffers which are still in use.  For example, this can
477    happen if a userspace program is reading the block device.
478
479    NOTE: In the case where the user removed a removable-media disk even if
480    there's still dirty data not synced on disk (due to a bug in the device
481    driver or due to a user error), by not destroying the dirty buffers we
482    could generate corruption also on the next media inserted, so a parameter
483    is necessary to handle this case in the safest way possible (trying
484    not to corrupt the newly inserted disk with data belonging to
485    the old, now-corrupted disk). Also, for the ramdisk the natural way
486    to release the ramdisk memory is to destroy its dirty buffers.
487
488    These are two special cases. Normal usage implies that the device driver
489    issues a sync on the device (without waiting for I/O completion) and
490    then an invalidate_buffers call that doesn't trash dirty buffers.
491
492    For handling cache coherency with the blkdev pagecache, the 'update' case
493    has been introduced. It is needed to re-read from disk any pinned
494    buffer. NOTE: re-reading from disk is destructive, so we can do it only
495    when we assume nobody is changing the buffercache under our I/O and when
496    we think the disk contains more recent information than the buffercache.
497    The update == 1 pass marks the buffers we need to update; the update == 2
498    pass does the actual I/O. */
499 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
500 {
501         invalidate_bh_lrus();
502         /*
503          * FIXME: what about destroy_dirty_buffers?
504          * We really want to use invalidate_inode_pages2() for
505          * that, but not until that's cleaned up.
506          */
507         invalidate_inode_pages(bdev->bd_inode->i_mapping);
508 }
509
510 /*
511  * Kick pdflush then try to free up some ZONE_NORMAL memory.
512  */
513 static void free_more_memory(void)
514 {
515         struct zone **zones;
516         pg_data_t *pgdat;
517
518         wakeup_bdflush(1024);
519         yield();
520
521         for_each_pgdat(pgdat) {
522                 zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones;
523                 if (*zones)
524                         try_to_free_pages(zones, GFP_NOFS, 0);
525         }
526 }
527
528 /*
529  * I/O completion handler for block_read_full_page() - pages
530  * which come unlocked at the end of I/O.
531  */
532 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
533 {
534         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
535         unsigned long flags;
536         struct buffer_head *tmp;
537         struct page *page;
538         int page_uptodate = 1;
539
540         BUG_ON(!buffer_async_read(bh));
541
542         page = bh->b_page;
543         if (uptodate) {
544                 set_buffer_uptodate(bh);
545         } else {
546                 clear_buffer_uptodate(bh);
547                 buffer_io_error(bh);
548                 SetPageError(page);
549         }
550
551         /*
552          * Be _very_ careful from here on. Bad things can happen if
553          * two buffer heads end IO at almost the same time and both
554          * decide that the page is now completely done.
555          */
556         spin_lock_irqsave(&page_uptodate_lock, flags);
557         clear_buffer_async_read(bh);
558         unlock_buffer(bh);
559         tmp = bh;
560         do {
561                 if (!buffer_uptodate(tmp))
562                         page_uptodate = 0;
563                 if (buffer_async_read(tmp)) {
564                         BUG_ON(!buffer_locked(tmp));
565                         goto still_busy;
566                 }
567                 tmp = tmp->b_this_page;
568         } while (tmp != bh);
569         spin_unlock_irqrestore(&page_uptodate_lock, flags);
570
571         /*
572          * If none of the buffers had errors and they are all
573          * uptodate then we can set the page uptodate.
574          */
575         if (page_uptodate && !PageError(page))
576                 SetPageUptodate(page);
577         unlock_page(page);
578         return;
579
580 still_busy:
581         spin_unlock_irqrestore(&page_uptodate_lock, flags);
582         return;
583 }
584
585 /*
586  * Completion handler for block_write_full_page() - pages which are unlocked
587  * during I/O, and which have PageWriteback cleared upon I/O completion.
588  */
589 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
590 {
591         char b[BDEVNAME_SIZE];
592         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
593         unsigned long flags;
594         struct buffer_head *tmp;
595         struct page *page;
596
597         BUG_ON(!buffer_async_write(bh));
598
599         page = bh->b_page;
600         if (uptodate) {
601                 set_buffer_uptodate(bh);
602         } else {
603                 if (printk_ratelimit()) {
604                         buffer_io_error(bh);
605                         printk(KERN_WARNING "lost page write due to "
606                                         "I/O error on %s\n",
607                                bdevname(bh->b_bdev, b));
608                 }
609                 set_bit(AS_EIO, &page->mapping->flags);
610                 clear_buffer_uptodate(bh);
611                 SetPageError(page);
612         }
613
614         spin_lock_irqsave(&page_uptodate_lock, flags);
615         clear_buffer_async_write(bh);
616         unlock_buffer(bh);
617         tmp = bh->b_this_page;
618         while (tmp != bh) {
619                 if (buffer_async_write(tmp)) {
620                         BUG_ON(!buffer_locked(tmp));
621                         goto still_busy;
622                 }
623                 tmp = tmp->b_this_page;
624         }
625         spin_unlock_irqrestore(&page_uptodate_lock, flags);
626         end_page_writeback(page);
627         return;
628
629 still_busy:
630         spin_unlock_irqrestore(&page_uptodate_lock, flags);
631         return;
632 }
633
634 /*
635  * If a page's buffers are under async readin (end_buffer_async_read
636  * completion) then there is a possibility that another thread of
637  * control could lock one of the buffers after it has completed
638  * but while some of the other buffers have not completed.  This
639  * locked buffer would confuse end_buffer_async_read() into not unlocking
640  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
641  * that this buffer is not under async I/O.
642  *
643  * The page comes unlocked when it has no locked buffer_async buffers
644  * left.
645  *
646  * PageLocked prevents anyone starting new async I/O reads any of
647  * the buffers.
648  *
649  * PageWriteback is used to prevent simultaneous writeout of the same
650  * page.
651  *
652  * PageLocked prevents anyone from starting writeback of a page which is
653  * under read I/O (PageWriteback is only ever set against a locked page).
654  */
655 static void mark_buffer_async_read(struct buffer_head *bh)
656 {
657         bh->b_end_io = end_buffer_async_read;
658         set_buffer_async_read(bh);
659 }
660
661 void mark_buffer_async_write(struct buffer_head *bh)
662 {
663         bh->b_end_io = end_buffer_async_write;
664         set_buffer_async_write(bh);
665 }
666 EXPORT_SYMBOL(mark_buffer_async_write);
667
668
669 /*
670  * fs/buffer.c contains helper functions for buffer-backed address space's
671  * fsync functions.  A common requirement for buffer-based filesystems is
672  * that certain data from the backing blockdev needs to be written out for
673  * a successful fsync().  For example, ext2 indirect blocks need to be
674  * written back and waited upon before fsync() returns.
675  *
676  * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
677  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
678  * management of a list of dependent buffers at ->i_mapping->private_list.
679  *
680  * Locking is a little subtle: try_to_free_buffers() will remove buffers
681  * from their controlling inode's queue when they are being freed.  But
682  * try_to_free_buffers() will be operating against the *blockdev* mapping
683  * at the time, not against the S_ISREG file which depends on those buffers.
684  * So the locking for private_list is via the private_lock in the address_space
685  * which backs the buffers.  Which is different from the address_space 
686  * against which the buffers are listed.  So for a particular address_space,
687  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
688  * mapping->private_list will always be protected by the backing blockdev's
689  * ->private_lock.
690  *
691  * Which introduces a requirement: all buffers on an address_space's
692  * ->private_list must be from the same address_space: the blockdev's.
693  *
694  * address_spaces which do not place buffers at ->private_list via these
695  * utility functions are free to use private_lock and private_list for
696  * whatever they want.  The only requirement is that list_empty(private_list)
697  * be true at clear_inode() time.
698  *
699  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
700  * filesystems should do that.  invalidate_inode_buffers() should just go
701  * BUG_ON(!list_empty).
702  *
703  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
704  * take an address_space, not an inode.  And it should be called
705  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
706  * queued up.
707  *
708  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
709  * list if it is already on a list.  Because if the buffer is on a list,
710  * it *must* already be on the right one.  If not, the filesystem is being
711  * silly.  This will save a ton of locking.  But first we have to ensure
712  * that buffers are taken *off* the old inode's list when they are freed
713  * (presumably in truncate).  That requires careful auditing of all
714  * filesystems (do it inside bforget()).  It could also be done by bringing
715  * b_inode back.
716  */
717
718 /*
719  * The buffer's backing address_space's private_lock must be held
720  */
721 static inline void __remove_assoc_queue(struct buffer_head *bh)
722 {
723         list_del_init(&bh->b_assoc_buffers);
724 }
725
726 int inode_has_buffers(struct inode *inode)
727 {
728         return !list_empty(&inode->i_data.private_list);
729 }
730
731 /*
732  * osync is designed to support O_SYNC io.  It waits synchronously for
733  * all already-submitted IO to complete, but does not queue any new
734  * writes to the disk.
735  *
736  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
737  * you dirty the buffers, and then use osync_inode_buffers to wait for
738  * completion.  Any other dirty buffers which are not yet queued for
739  * write will not be flushed to disk by the osync.
740  */
741 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
742 {
743         struct buffer_head *bh;
744         struct list_head *p;
745         int err = 0;
746
747         spin_lock(lock);
748 repeat:
749         list_for_each_prev(p, list) {
750                 bh = BH_ENTRY(p);
751                 if (buffer_locked(bh)) {
752                         get_bh(bh);
753                         spin_unlock(lock);
754                         wait_on_buffer(bh);
755                         if (!buffer_uptodate(bh))
756                                 err = -EIO;
757                         brelse(bh);
758                         spin_lock(lock);
759                         goto repeat;
760                 }
761         }
762         spin_unlock(lock);
763         return err;
764 }
765
766 /**
767  * sync_mapping_buffers - write out and wait upon a mapping's "associated"
768  *                        buffers
769  * @buffer_mapping - the mapping which backs the buffers' data
770  * @mapping - the mapping which wants those buffers written
771  *
772  * Starts I/O against the buffers at mapping->private_list, and waits upon
773  * that I/O.
774  *
775  * Basically, this is a convenience function for fsync().  @buffer_mapping is
776  * the blockdev which "owns" the buffers and @mapping is a file or directory
777  * which needs those buffers to be written for a successful fsync().
778  */
779 int sync_mapping_buffers(struct address_space *mapping)
780 {
781         struct address_space *buffer_mapping = mapping->assoc_mapping;
782
783         if (buffer_mapping == NULL || list_empty(&mapping->private_list))
784                 return 0;
785
786         return fsync_buffers_list(&buffer_mapping->private_lock,
787                                         &mapping->private_list);
788 }
789 EXPORT_SYMBOL(sync_mapping_buffers);
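/*
 * Editor's illustrative sketch (not part of the original file): a minimal
 * ->fsync method for a hypothetical buffer-backed filesystem, in the style
 * of ext2's, showing the intended use of sync_mapping_buffers() above.
 * The "examplefs" name is invented; flushing the inode itself with
 * write_inode_now() is one possible choice, not the only one.
 */
#include <linux/fs.h>
#include <linux/buffer_head.h>

static int examplefs_fsync(struct file *file, struct dentry *dentry,
                           int datasync)
{
        struct inode *inode = dentry->d_inode;
        int err;

        /* write and wait upon the dependent blockdev buffers which
         * mark_buffer_dirty_inode() queued on ->private_list */
        err = sync_mapping_buffers(inode->i_mapping);

        if (!(inode->i_state & I_DIRTY))
                return err;
        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
                return err;

        write_inode_now(inode, 1);      /* write the inode itself, waiting */
        return err;
}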
790
791 /*
792  * Called when we've recently written block `bblock', and it is known that
793  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
794  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
795  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
796  */
797 void write_boundary_block(struct block_device *bdev,
798                         sector_t bblock, unsigned blocksize)
799 {
800         struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
801         if (bh) {
802                 if (buffer_dirty(bh))
803                         ll_rw_block(WRITE, 1, &bh);
804                 put_bh(bh);
805         }
806 }
807
808 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
809 {
810         struct address_space *mapping = inode->i_mapping;
811         struct address_space *buffer_mapping = bh->b_page->mapping;
812
813         mark_buffer_dirty(bh);
814         if (!mapping->assoc_mapping) {
815                 mapping->assoc_mapping = buffer_mapping;
816         } else {
817                 if (mapping->assoc_mapping != buffer_mapping)
818                         BUG();
819         }
820         if (list_empty(&bh->b_assoc_buffers)) {
821                 spin_lock(&buffer_mapping->private_lock);
822                 list_move_tail(&bh->b_assoc_buffers,
823                                 &mapping->private_list);
824                 spin_unlock(&buffer_mapping->private_lock);
825         }
826 }
827 EXPORT_SYMBOL(mark_buffer_dirty_inode);
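/*
 * Editor's illustrative sketch (not part of the original file): updating a
 * dependent metadata block (an indirect block, say) and queueing it on the
 * owning inode's ->private_list with mark_buffer_dirty_inode(), so that a
 * later fsync() of the file writes it out via sync_mapping_buffers().
 * The "examplefs" names, the block layout and the __le32 slot format are
 * invented for the example.
 */
#include <linux/fs.h>
#include <linux/buffer_head.h>

static int examplefs_set_indirect(struct inode *inode, sector_t indirect_blk,
                                  unsigned int slot, __le32 value)
{
        struct buffer_head *bh = sb_bread(inode->i_sb, indirect_blk);

        if (!bh)
                return -EIO;
        ((__le32 *)bh->b_data)[slot] = value;   /* caller passes LE value */
        mark_buffer_dirty_inode(bh, inode);     /* list bh for fsync() */
        brelse(bh);
        return 0;
}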
828
829 /*
830  * Add a page to the dirty page list.
831  *
832  * It is a sad fact of life that this function is called from several places
833  * deeply under spinlocking.  It may not sleep.
834  *
835  * If the page has buffers, the uptodate buffers are set dirty, to preserve
836  * dirty-state coherency between the page and the buffers.  If the page does
837  * not have buffers then when they are later attached they will all be set
838  * dirty.
839  *
840  * The buffers are dirtied before the page is dirtied.  There's a small race
841  * window in which a writepage caller may see the page cleanness but not the
842  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
843  * before the buffers, a concurrent writepage caller could clear the page dirty
844  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
845  * page on the dirty page list.
846  *
847  * We use private_lock to lock against try_to_free_buffers while using the
848  * page's buffer list.  Also use this to protect against clean buffers being
849  * added to the page after it was set dirty.
850  *
851  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
852  * address_space though.
853  */
854 int __set_page_dirty_buffers(struct page *page)
855 {
856         struct address_space * const mapping = page->mapping;
857
858         spin_lock(&mapping->private_lock);
859         if (page_has_buffers(page)) {
860                 struct buffer_head *head = page_buffers(page);
861                 struct buffer_head *bh = head;
862
863                 do {
864                         set_buffer_dirty(bh);
865                         bh = bh->b_this_page;
866                 } while (bh != head);
867         }
868         spin_unlock(&mapping->private_lock);
869
870         if (!TestSetPageDirty(page)) {
871                 spin_lock_irq(&mapping->tree_lock);
872                 if (page->mapping) {    /* Race with truncate? */
873                         if (!mapping->backing_dev_info->memory_backed)
874                                 inc_page_state(nr_dirty);
875                         radix_tree_tag_set(&mapping->page_tree,
876                                                 page_index(page),
877                                                 PAGECACHE_TAG_DIRTY);
878                 }
879                 spin_unlock_irq(&mapping->tree_lock);
880                 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
881         }
882         
883         return 0;
884 }
885 EXPORT_SYMBOL(__set_page_dirty_buffers);
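/*
 * Editor's illustrative sketch (not part of the original file): a
 * buffer-backed filesystem may name __set_page_dirty_buffers explicitly as
 * its ->set_page_dirty method; leaving the method NULL gets the same
 * behaviour as the VFS default for mappings with buffers.  The other
 * address_space methods are omitted here, and "examplefs" is invented.
 */
#include <linux/fs.h>
#include <linux/buffer_head.h>

static struct address_space_operations examplefs_aops = {
        .set_page_dirty = __set_page_dirty_buffers,
        /* .readpage, .writepage, .prepare_write, ... in a real filesystem */
};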
886
887 /*
888  * Write out and wait upon a list of buffers.
889  *
890  * We have conflicting pressures: we want to make sure that all
891  * initially dirty buffers get waited on, but that any subsequently
892  * dirtied buffers don't.  After all, we don't want fsync to last
893  * forever if somebody is actively writing to the file.
894  *
895  * Do this in two main stages: first we copy dirty buffers to a
896  * temporary inode list, queueing the writes as we go.  Then we clean
897  * up, waiting for those writes to complete.
898  * 
899  * During this second stage, any subsequent updates to the file may end
900  * up refiling the buffer on the original inode's dirty list again, so
901  * there is a chance we will end up with a buffer queued for write but
902  * not yet completed on that list.  So, as a final cleanup we go through
903  * the osync code to catch these locked, dirty buffers without requeuing
904  * any newly dirty buffers for write.
905  */
906 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
907 {
908         struct buffer_head *bh;
909         struct list_head tmp;
910         int err = 0, err2;
911
912         INIT_LIST_HEAD(&tmp);
913
914         spin_lock(lock);
915         while (!list_empty(list)) {
916                 bh = BH_ENTRY(list->next);
917                 list_del_init(&bh->b_assoc_buffers);
918                 if (buffer_dirty(bh) || buffer_locked(bh)) {
919                         list_add(&bh->b_assoc_buffers, &tmp);
920                         if (buffer_dirty(bh)) {
921                                 get_bh(bh);
922                                 spin_unlock(lock);
923                                 /*
924                                  * Ensure any pending I/O completes so that
925                                  * ll_rw_block() actually writes the current
926                                  * contents - it is a noop if I/O is still in
927                                  * flight on potentially older contents.
928                                  */
929                                 wait_on_buffer(bh);
930                                 ll_rw_block(WRITE, 1, &bh);
931                                 brelse(bh);
932                                 spin_lock(lock);
933                         }
934                 }
935         }
936
937         while (!list_empty(&tmp)) {
938                 bh = BH_ENTRY(tmp.prev);
939                 __remove_assoc_queue(bh);
940                 get_bh(bh);
941                 spin_unlock(lock);
942                 wait_on_buffer(bh);
943                 if (!buffer_uptodate(bh))
944                         err = -EIO;
945                 brelse(bh);
946                 spin_lock(lock);
947         }
948         
949         spin_unlock(lock);
950         err2 = osync_buffers_list(lock, list);
951         if (err)
952                 return err;
953         else
954                 return err2;
955 }
956
957 /*
958  * Invalidate any and all dirty buffers on a given inode.  We are
959  * probably unmounting the fs, but that doesn't mean we have already
960  * done a sync().  Just drop the buffers from the inode list.
961  *
962  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
963  * assumes that all the buffers are against the blockdev.  Not true
964  * for reiserfs.
965  */
966 void invalidate_inode_buffers(struct inode *inode)
967 {
968         if (inode_has_buffers(inode)) {
969                 struct address_space *mapping = &inode->i_data;
970                 struct list_head *list = &mapping->private_list;
971                 struct address_space *buffer_mapping = mapping->assoc_mapping;
972
973                 spin_lock(&buffer_mapping->private_lock);
974                 while (!list_empty(list))
975                         __remove_assoc_queue(BH_ENTRY(list->next));
976                 spin_unlock(&buffer_mapping->private_lock);
977         }
978 }
979
980 /*
981  * Remove any clean buffers from the inode's buffer list.  This is called
982  * when we're trying to free the inode itself.  Those buffers can pin it.
983  *
984  * Returns true if all buffers were removed.
985  */
986 int remove_inode_buffers(struct inode *inode)
987 {
988         int ret = 1;
989
990         if (inode_has_buffers(inode)) {
991                 struct address_space *mapping = &inode->i_data;
992                 struct list_head *list = &mapping->private_list;
993                 struct address_space *buffer_mapping = mapping->assoc_mapping;
994
995                 spin_lock(&buffer_mapping->private_lock);
996                 while (!list_empty(list)) {
997                         struct buffer_head *bh = BH_ENTRY(list->next);
998                         if (buffer_dirty(bh)) {
999                                 ret = 0;
1000                                 break;
1001                         }
1002                         __remove_assoc_queue(bh);
1003                 }
1004                 spin_unlock(&buffer_mapping->private_lock);
1005         }
1006         return ret;
1007 }
1008
1009 /*
1010  * Create the appropriate buffers when given a page for data area and
1011  * the size of each buffer.. Use the bh->b_this_page linked list to
1012  * follow the buffers created.  Return NULL if unable to create more
1013  * buffers.
1014  *
1015  * The retry flag is used to differentiate async IO (paging, swapping)
1016  * which may not fail from ordinary buffer allocations.
1017  */
1018 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
1019                 int retry)
1020 {
1021         struct buffer_head *bh, *head;
1022         long offset;
1023
1024 try_again:
1025         head = NULL;
1026         offset = PAGE_SIZE;
1027         while ((offset -= size) >= 0) {
1028                 bh = alloc_buffer_head(GFP_NOFS);
1029                 if (!bh)
1030                         goto no_grow;
1031
1032                 bh->b_bdev = NULL;
1033                 bh->b_this_page = head;
1034                 bh->b_blocknr = -1;
1035                 head = bh;
1036
1037                 bh->b_state = 0;
1038                 atomic_set(&bh->b_count, 0);
1039                 bh->b_size = size;
1040
1041                 /* Link the buffer to its page */
1042                 set_bh_page(bh, page, offset);
1043
1044                 bh->b_end_io = NULL;
1045         }
1046         return head;
1047 /*
1048  * In case anything failed, we just free everything we got.
1049  */
1050 no_grow:
1051         if (head) {
1052                 do {
1053                         bh = head;
1054                         head = head->b_this_page;
1055                         free_buffer_head(bh);
1056                 } while (head);
1057         }
1058
1059         /*
1060          * Return failure for non-async IO requests.  Async IO requests
1061          * are not allowed to fail, so we have to wait until buffer heads
1062          * become available.  But we don't want tasks sleeping with 
1063          * partially complete buffers, so all were released above.
1064          */
1065         if (!retry)
1066                 return NULL;
1067
1068         /* We're _really_ low on memory. Now we just
1069          * wait for old buffer heads to become free due to
1070          * finishing IO.  Since this is an async request and
1071          * the reserve list is empty, we're sure there are 
1072          * async buffer heads in use.
1073          */
1074         free_more_memory();
1075         goto try_again;
1076 }
1077 EXPORT_SYMBOL_GPL(alloc_page_buffers);
1078
1079 static inline void
1080 link_dev_buffers(struct page *page, struct buffer_head *head)
1081 {
1082         struct buffer_head *bh, *tail;
1083
1084         bh = head;
1085         do {
1086                 tail = bh;
1087                 bh = bh->b_this_page;
1088         } while (bh);
1089         tail->b_this_page = head;
1090         attach_page_buffers(page, head);
1091 }
1092
1093 /*
1094  * Initialise the state of a blockdev page's buffers.
1095  */ 
1096 static void
1097 init_page_buffers(struct page *page, struct block_device *bdev,
1098                         sector_t block, int size)
1099 {
1100         struct buffer_head *head = page_buffers(page);
1101         struct buffer_head *bh = head;
1102         int uptodate = PageUptodate(page);
1103
1104         do {
1105                 if (!buffer_mapped(bh)) {
1106                         init_buffer(bh, NULL, NULL);
1107                         bh->b_bdev = bdev;
1108                         bh->b_blocknr = block;
1109                         if (uptodate)
1110                                 set_buffer_uptodate(bh);
1111                         set_buffer_mapped(bh);
1112                 }
1113                 block++;
1114                 bh = bh->b_this_page;
1115         } while (bh != head);
1116 }
1117
1118 /*
1119  * Create the page-cache page that contains the requested block.
1120  *
1121  * This is used purely for blockdev mappings.
1122  */
1123 static struct page *
1124 grow_dev_page(struct block_device *bdev, sector_t block,
1125                 pgoff_t index, int size)
1126 {
1127         struct inode *inode = bdev->bd_inode;
1128         struct page *page;
1129         struct buffer_head *bh;
1130
1131         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1132         if (!page)
1133                 return NULL;
1134
1135         if (!PageLocked(page))
1136                 BUG();
1137
1138         if (page_has_buffers(page)) {
1139                 bh = page_buffers(page);
1140                 if (bh->b_size == size) {
1141                         init_page_buffers(page, bdev, block, size);
1142                         return page;
1143                 }
1144                 if (!try_to_free_buffers(page))
1145                         goto failed;
1146         }
1147
1148         /*
1149          * Allocate some buffers for this page
1150          */
1151         bh = alloc_page_buffers(page, size, 0);
1152         if (!bh)
1153                 goto failed;
1154
1155         /*
1156          * Link the page to the buffers and initialise them.  Take the
1157          * lock to be atomic wrt __find_get_block(), which does not
1158          * run under the page lock.
1159          */
1160         spin_lock(&inode->i_mapping->private_lock);
1161         link_dev_buffers(page, bh);
1162         init_page_buffers(page, bdev, block, size);
1163         spin_unlock(&inode->i_mapping->private_lock);
1164         return page;
1165
1166 failed:
1167         BUG();
1168         unlock_page(page);
1169         page_cache_release(page);
1170         return NULL;
1171 }
1172
1173 /*
1174  * Create buffers for the specified block device block's page.  If
1175  * that page was dirty, the buffers are set dirty also.
1176  *
1177  * Except that's a bug.  Attaching dirty buffers to a dirty
1178  * blockdev's page can result in filesystem corruption, because
1179  * some of those buffers may be aliases of filesystem data.
1180  * grow_dev_page() will go BUG() if this happens.
1181  */
1182 static inline int
1183 grow_buffers(struct block_device *bdev, sector_t block, int size)
1184 {
1185         struct page *page;
1186         pgoff_t index;
1187         int sizebits;
1188
1189         sizebits = -1;
1190         do {
1191                 sizebits++;
1192         } while ((size << sizebits) < PAGE_SIZE);
1193
1194         index = block >> sizebits;
1195         block = index << sizebits;
1196
1197         /* Create a page with the proper size buffers.. */
1198         page = grow_dev_page(bdev, block, index, size);
1199         if (!page)
1200                 return 0;
1201         unlock_page(page);
1202         page_cache_release(page);
1203         return 1;
1204 }
1205
1206 struct buffer_head *
1207 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1208 {
1209         /* Size must be a multiple of the hardware sector size */
1210         if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1211                         (size < 512 || size > PAGE_SIZE))) {
1212                 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1213                                         size);
1214                 printk(KERN_ERR "hardsect size: %d\n",
1215                                         bdev_hardsect_size(bdev));
1216
1217                 dump_stack();
1218                 return NULL;
1219         }
1220
1221         for (;;) {
1222                 struct buffer_head * bh;
1223
1224                 bh = __find_get_block(bdev, block, size);
1225                 if (bh)
1226                         return bh;
1227
1228                 if (!grow_buffers(bdev, block, size))
1229                         free_more_memory();
1230         }
1231 }
1232
1233 /*
1234  * The relationship between dirty buffers and dirty pages:
1235  *
1236  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1237  * the page is tagged dirty in its radix tree.
1238  *
1239  * At all times, the dirtiness of the buffers represents the dirtiness of
1240  * subsections of the page.  If the page has buffers, the page dirty bit is
1241  * merely a hint about the true dirty state.
1242  *
1243  * When a page is set dirty in its entirety, all its buffers are marked dirty
1244  * (if the page has buffers).
1245  *
1246  * When a buffer is marked dirty, its page is dirtied, but the page's other
1247  * buffers are not.
1248  *
1249  * Also.  When blockdev buffers are explicitly read with bread(), they
1250  * individually become uptodate.  But their backing page remains not
1251  * uptodate - even if all of its buffers are uptodate.  A subsequent
1252  * block_read_full_page() against that page will discover all the uptodate
1253  * buffers, will set the page uptodate and will perform no I/O.
1254  */
1255
1256 /**
1257  * mark_buffer_dirty - mark a buffer_head as needing writeout
1258  *
1259  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1260  * backing page dirty, then tag the page as dirty in its address_space's radix
1261  * tree and then attach the address_space's inode to its superblock's dirty
1262  * inode list.
1263  *
1264  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1265  * mapping->tree_lock and the global inode_lock.
1266  */
1267 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1268 {
1269         if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1270                 __set_page_dirty_nobuffers(bh->b_page);
1271 }
1272
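/*
 * Editor's illustrative sketch (not part of the original file): the common
 * read-modify-write pattern for an existing metadata block.  Calling
 * mark_buffer_dirty() leaves the actual writeout to pdflush or a later
 * sync; a caller wanting the block on disk immediately could follow with
 * ll_rw_block(WRITE, 1, &bh) and wait_on_buffer(bh).  "examplefs" and the
 * patched byte are invented.
 */
#include <linux/fs.h>
#include <linux/buffer_head.h>

static int examplefs_patch_byte(struct super_block *sb, sector_t blocknr,
                                unsigned int offset, u8 value)
{
        struct buffer_head *bh = sb_bread(sb, blocknr);

        if (!bh)
                return -EIO;
        ((u8 *)bh->b_data)[offset] = value;
        mark_buffer_dirty(bh);          /* dirties bh, its page, its inode */
        brelse(bh);
        return 0;
}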
1273 /*
1274  * Decrement a buffer_head's reference count.  If all buffers against a page
1275  * have zero reference count, are clean and unlocked, and if the page is clean
1276  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1277  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1278  * a page but it ends up not being freed, and buffers may later be reattached).
1279  */
1280 void __brelse(struct buffer_head * buf)
1281 {
1282         if (atomic_read(&buf->b_count)) {
1283                 put_bh(buf);
1284                 return;
1285         }
1286         printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1287         WARN_ON(1);
1288 }
1289
1290 /*
1291  * bforget() is like brelse(), except it discards any
1292  * potentially dirty data.
1293  */
1294 void __bforget(struct buffer_head *bh)
1295 {
1296         clear_buffer_dirty(bh);
1297         if (!list_empty(&bh->b_assoc_buffers)) {
1298                 struct address_space *buffer_mapping = bh->b_page->mapping;
1299
1300                 spin_lock(&buffer_mapping->private_lock);
1301                 list_del_init(&bh->b_assoc_buffers);
1302                 spin_unlock(&buffer_mapping->private_lock);
1303         }
1304         __brelse(bh);
1305 }
1306
1307 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1308 {
1309         lock_buffer(bh);
1310         if (buffer_uptodate(bh)) {
1311                 unlock_buffer(bh);
1312                 return bh;
1313         } else {
1314                 get_bh(bh);
1315                 bh->b_end_io = end_buffer_read_sync;
1316                 submit_bh(READ, bh);
1317                 wait_on_buffer(bh);
1318                 if (buffer_uptodate(bh))
1319                         return bh;
1320         }
1321         brelse(bh);
1322         return NULL;
1323 }
1324
1325 /*
1326  * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1327  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1328  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1329  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1330  * CPU's LRUs at the same time.
1331  *
1332  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1333  * sb_find_get_block().
1334  *
1335  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1336  * a local interrupt disable for that.
1337  */
1338
1339 #define BH_LRU_SIZE     8
1340
1341 struct bh_lru {
1342         struct buffer_head *bhs[BH_LRU_SIZE];
1343 };
1344
1345 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1346
1347 #ifdef CONFIG_SMP
1348 #define bh_lru_lock()   local_irq_disable()
1349 #define bh_lru_unlock() local_irq_enable()
1350 #else
1351 #define bh_lru_lock()   preempt_disable()
1352 #define bh_lru_unlock() preempt_enable()
1353 #endif
1354
1355 static inline void check_irqs_on(void)
1356 {
1357 #ifdef irqs_disabled
1358         BUG_ON(irqs_disabled());
1359 #endif
1360 }
1361
1362 /*
1363  * The LRU management algorithm is dopey-but-simple.  Sorry.
1364  */
1365 static void bh_lru_install(struct buffer_head *bh)
1366 {
1367         struct buffer_head *evictee = NULL;
1368         struct bh_lru *lru;
1369
1370         check_irqs_on();
1371         bh_lru_lock();
1372         lru = &__get_cpu_var(bh_lrus);
1373         if (lru->bhs[0] != bh) {
1374                 struct buffer_head *bhs[BH_LRU_SIZE];
1375                 int in;
1376                 int out = 0;
1377
1378                 get_bh(bh);
1379                 bhs[out++] = bh;
1380                 for (in = 0; in < BH_LRU_SIZE; in++) {
1381                         struct buffer_head *bh2 = lru->bhs[in];
1382
1383                         if (bh2 == bh) {
1384                                 __brelse(bh2);
1385                         } else {
1386                                 if (out >= BH_LRU_SIZE) {
1387                                         BUG_ON(evictee != NULL);
1388                                         evictee = bh2;
1389                                 } else {
1390                                         bhs[out++] = bh2;
1391                                 }
1392                         }
1393                 }
1394                 while (out < BH_LRU_SIZE)
1395                         bhs[out++] = NULL;
1396                 memcpy(lru->bhs, bhs, sizeof(bhs));
1397         }
1398         bh_lru_unlock();
1399
1400         if (evictee)
1401                 __brelse(evictee);
1402 }
1403
1404 /*
1405  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1406  */
1407 static inline struct buffer_head *
1408 lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1409 {
1410         struct buffer_head *ret = NULL;
1411         struct bh_lru *lru;
1412         int i;
1413
1414         check_irqs_on();
1415         bh_lru_lock();
1416         lru = &__get_cpu_var(bh_lrus);
1417         for (i = 0; i < BH_LRU_SIZE; i++) {
1418                 struct buffer_head *bh = lru->bhs[i];
1419
1420                 if (bh && bh->b_bdev == bdev &&
1421                                 bh->b_blocknr == block && bh->b_size == size) {
1422                         if (i) {
1423                                 while (i) {
1424                                         lru->bhs[i] = lru->bhs[i - 1];
1425                                         i--;
1426                                 }
1427                                 lru->bhs[0] = bh;
1428                         }
1429                         get_bh(bh);
1430                         ret = bh;
1431                         break;
1432                 }
1433         }
1434         bh_lru_unlock();
1435         return ret;
1436 }
1437
1438 /*
1439  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1440  * it in the LRU and mark it as accessed.  If it is not present then return
1441  * NULL
1442  */
1443 struct buffer_head *
1444 __find_get_block(struct block_device *bdev, sector_t block, int size)
1445 {
1446         struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1447
1448         if (bh == NULL) {
1449                 bh = __find_get_block_slow(bdev, block, size);
1450                 if (bh)
1451                         bh_lru_install(bh);
1452         }
1453         if (bh)
1454                 touch_buffer(bh);
1455         return bh;
1456 }
1457 EXPORT_SYMBOL(__find_get_block);
1458
1459 /*
1460  * __getblk will locate (and, if necessary, create) the buffer_head
1461  * which corresponds to the passed block_device, block and size. The
1462  * returned buffer has its reference count incremented.
1463  *
1464  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1465  * illegal block number, __getblk() will happily return a buffer_head
1466  * which represents the non-existent block.  Very weird.
1467  *
1468  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1469  * attempt is failing.  FIXME, perhaps?
1470  */
1471 struct buffer_head *
1472 __getblk(struct block_device *bdev, sector_t block, int size)
1473 {
1474         struct buffer_head *bh = __find_get_block(bdev, block, size);
1475
1476         might_sleep();
1477         if (bh == NULL)
1478                 bh = __getblk_slow(bdev, block, size);
1479         return bh;
1480 }
1481 EXPORT_SYMBOL(__getblk);
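/*
 * Editor's illustrative sketch (not part of the original file): using
 * sb_getblk() (a wrapper around __getblk() above) for a block whose current
 * on-disk contents do not matter, such as a freshly allocated metadata
 * block about to be written in full.  No NULL check is needed because
 * __getblk() keeps retrying, as the comment above notes.  "examplefs" is
 * an invented name.
 */
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/string.h>

static struct buffer_head *examplefs_new_zeroed_block(struct super_block *sb,
                                                      sector_t blocknr)
{
        struct buffer_head *bh = sb_getblk(sb, blocknr);

        lock_buffer(bh);
        memset(bh->b_data, 0, bh->b_size);
        set_buffer_uptodate(bh);        /* contents are now defined */
        unlock_buffer(bh);
        mark_buffer_dirty(bh);
        return bh;                      /* caller brelse()s when finished */
}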
1482
1483 /*
1484  * Do async read-ahead on a buffer..
1485  */
1486 void __breadahead(struct block_device *bdev, sector_t block, int size)
1487 {
1488         struct buffer_head *bh = __getblk(bdev, block, size);
1489         ll_rw_block(READA, 1, &bh);
1490         brelse(bh);
1491 }
1492 EXPORT_SYMBOL(__breadahead);
1493
1494 /**
1495  *  __bread() - reads a specified block and returns the bh
1496  *  @block: number of block
1497  *  @size: size (in bytes) to read
1498  * 
1499  *  Reads a specified block, and returns buffer head that contains it.
1500  *  It returns NULL if the block was unreadable.
1501  */
1502 struct buffer_head *
1503 __bread(struct block_device *bdev, sector_t block, int size)
1504 {
1505         struct buffer_head *bh = __getblk(bdev, block, size);
1506
1507         if (!buffer_uptodate(bh))
1508                 bh = __bread_slow(bh);
1509         return bh;
1510 }
1511 EXPORT_SYMBOL(__bread);
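/*
 * Editor's illustrative sketch (not part of the original file): the classic
 * mount-time pattern of reading an on-disk superblock with sb_bread() (a
 * thin wrapper around __bread() above) and validating it.  The examplefs
 * structure, magic number and error handling are invented; endianness
 * handling is omitted for brevity.
 */
#include <linux/fs.h>
#include <linux/buffer_head.h>

struct examplefs_super_block {
        u32 s_magic;
        /* ... more on-disk fields ... */
};
#define EXAMPLEFS_MAGIC 0x45584650      /* invented */

static int examplefs_check_super(struct super_block *sb, sector_t blocknr)
{
        struct buffer_head *bh = sb_bread(sb, blocknr);
        struct examplefs_super_block *es;

        if (!bh)
                return -EIO;            /* __bread() returned NULL */
        es = (struct examplefs_super_block *)bh->b_data;
        if (es->s_magic != EXAMPLEFS_MAGIC) {
                brelse(bh);
                return -EINVAL;
        }
        /* a real fs would keep bh pinned in its sb_info until umount */
        brelse(bh);
        return 0;
}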
1512
1513 /*
1514  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1515  * This doesn't race because it runs in each cpu either in irq
1516  * or with preempt disabled.
1517  */
1518 static void invalidate_bh_lru(void *arg)
1519 {
1520         struct bh_lru *b = &get_cpu_var(bh_lrus);
1521         int i;
1522
1523         for (i = 0; i < BH_LRU_SIZE; i++) {
1524                 brelse(b->bhs[i]);
1525                 b->bhs[i] = NULL;
1526         }
1527         put_cpu_var(bh_lrus);
1528 }
1529         
1530 static void invalidate_bh_lrus(void)
1531 {
1532         on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1533 }
1534
1535 void set_bh_page(struct buffer_head *bh,
1536                 struct page *page, unsigned long offset)
1537 {
1538         bh->b_page = page;
1539         if (offset >= PAGE_SIZE)
1540                 BUG();
1541         if (PageHighMem(page))
1542                 /*
1543                  * This catches illegal uses and preserves the offset:
1544                  */
1545                 bh->b_data = (char *)(0 + offset);
1546         else
1547                 bh->b_data = page_address(page) + offset;
1548 }
1549 EXPORT_SYMBOL(set_bh_page);
1550
1551 /*
1552  * Called when truncating a buffer on a page completely.
1553  */
1554 static inline void discard_buffer(struct buffer_head * bh)
1555 {
1556         lock_buffer(bh);
1557         clear_buffer_dirty(bh);
1558         bh->b_bdev = NULL;
1559         clear_buffer_mapped(bh);
1560         clear_buffer_req(bh);
1561         clear_buffer_new(bh);
1562         clear_buffer_delay(bh);
1563         unlock_buffer(bh);
1564 }
1565
1566 /**
1567  * try_to_release_page() - release old fs-specific metadata on a page
1568  *
1569  * @page: the page which the kernel is trying to free
1570  * @gfp_mask: memory allocation flags (and I/O mode)
1571  *
1572  * The address_space is asked to try to release any data held against the
1573  * page (presumably at page->private).  If the release was successful,
1574  * return `1'.  Otherwise return zero.
1575  *
1576  * The @gfp_mask argument specifies whether I/O may be performed to release
1577  * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1578  *
1579  * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1580  */
1581 int try_to_release_page(struct page *page, int gfp_mask)
1582 {
1583         struct address_space * const mapping = page->mapping;
1584
1585         BUG_ON(!PageLocked(page));
1586         if (PageWriteback(page))
1587                 return 0;
1588         
1589         if (mapping && mapping->a_ops->releasepage)
1590                 return mapping->a_ops->releasepage(page, gfp_mask);
1591         return try_to_free_buffers(page);
1592 }
1593 EXPORT_SYMBOL(try_to_release_page);
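
/*
 * Illustrative sketch, kept under #if 0 so it is never compiled: the common
 * shape of a ->releasepage() method that try_to_release_page() dispatches
 * to.  A filesystem with no metadata of its own beyond buffer_heads simply
 * defers to try_to_free_buffers().  The function name is hypothetical.
 */
#if 0
static int examplefs_releasepage(struct page *page, int gfp_mask)
{
	if (PagePrivate(page))
		return try_to_free_buffers(page);
	return 1;	/* nothing is attached, so the page is releasable */
}
#endif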
1594
1595 /**
1596  * block_invalidatepage - invalidate part or all of a buffer-backed page
1597  *
1598  * @page: the page which is affected
1599  * @offset: the index of the truncation point
1600  *
1601  * block_invalidatepage() is called when all or part of the page has become
1602  * invalidated by a truncate operation.
1603  *
1604  * block_invalidatepage() does not have to release all buffers, but it must
1605  * ensure that no dirty buffer is left outside @offset and that no I/O
1606  * is underway against any of the blocks which are outside the truncation
1607  * point, because the caller is about to free (and possibly reuse) those
1608  * blocks on-disk.
1609  */
1610 int block_invalidatepage(struct page *page, unsigned long offset)
1611 {
1612         struct buffer_head *head, *bh, *next;
1613         unsigned int curr_off = 0;
1614         int ret = 1;
1615
1616         BUG_ON(!PageLocked(page));
1617         if (!page_has_buffers(page))
1618                 goto out;
1619
1620         head = page_buffers(page);
1621         bh = head;
1622         do {
1623                 unsigned int next_off = curr_off + bh->b_size;
1624                 next = bh->b_this_page;
1625
1626                 /*
1627                  * is this block fully invalidated?
1628                  */
1629                 if (offset <= curr_off)
1630                         discard_buffer(bh);
1631                 curr_off = next_off;
1632                 bh = next;
1633         } while (bh != head);
1634
1635         /*
1636          * We release buffers only if the entire page is being invalidated.
1637          * The get_block cached value has been unconditionally invalidated,
1638          * so real IO is not possible anymore.
1639          */
1640         if (offset == 0)
1641                 ret = try_to_release_page(page, 0);
1642 out:
1643         return ret;
1644 }
1645 EXPORT_SYMBOL(block_invalidatepage);
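
/*
 * Illustrative sketch, kept under #if 0 so it is never compiled: a
 * filesystem's ->invalidatepage() frequently just forwards to
 * block_invalidatepage() once any fs-private bookkeeping for the truncated
 * range is done.  The function name is hypothetical.
 */
#if 0
static int examplefs_invalidatepage(struct page *page, unsigned long offset)
{
	/* fs-specific cleanup for the range beyond @offset would go here */
	return block_invalidatepage(page, offset);
}
#endif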
1646
1647 /*
1648  * We attach and possibly dirty the buffers atomically wrt
1649  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1650  * is already excluded via the page lock.
1651  */
1652 void create_empty_buffers(struct page *page,
1653                         unsigned long blocksize, unsigned long b_state)
1654 {
1655         struct buffer_head *bh, *head, *tail;
1656
1657         head = alloc_page_buffers(page, blocksize, 1);
1658         bh = head;
1659         do {
1660                 bh->b_state |= b_state;
1661                 tail = bh;
1662                 bh = bh->b_this_page;
1663         } while (bh);
1664         tail->b_this_page = head;
1665
1666         spin_lock(&page->mapping->private_lock);
1667         if (PageUptodate(page) || PageDirty(page)) {
1668                 bh = head;
1669                 do {
1670                         if (PageDirty(page))
1671                                 set_buffer_dirty(bh);
1672                         if (PageUptodate(page))
1673                                 set_buffer_uptodate(bh);
1674                         bh = bh->b_this_page;
1675                 } while (bh != head);
1676         }
1677         attach_page_buffers(page, head);
1678         spin_unlock(&page->mapping->private_lock);
1679 }
1680 EXPORT_SYMBOL(create_empty_buffers);
1681
1682 /*
1683  * We are taking a block for data and we don't want any output from any
1684  * buffer-cache aliases starting from the return of this function and
1685  * until the moment when something explicitly marks the buffer
1686  * dirty (hopefully that will not happen until we free that block ;-)
1687  * We don't even need to mark it not-uptodate - nobody can expect
1688  * anything from a newly allocated buffer anyway.  We used to use
1689  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1690  * don't want to mark the alias unmapped, for example - it would confuse
1691  * anyone who might pick it with bread() afterwards...
1692  *
1693  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1694  * be writeout I/O going on against recently-freed buffers.  We don't
1695  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1696  * only if we really need to.  That happens here.
1697  */
1698 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1699 {
1700         struct buffer_head *old_bh;
1701
1702         might_sleep();
1703
1704         old_bh = __find_get_block_slow(bdev, block, 0);
1705         if (old_bh) {
1706                 clear_buffer_dirty(old_bh);
1707                 wait_on_buffer(old_bh);
1708                 clear_buffer_req(old_bh);
1709                 __brelse(old_bh);
1710         }
1711 }
1712 EXPORT_SYMBOL(unmap_underlying_metadata);
1713
1714 /*
1715  * NOTE! All mapped/uptodate combinations are valid:
1716  *
1717  *      Mapped  Uptodate        Meaning
1718  *
1719  *      No      No              "unknown" - must do get_block()
1720  *      No      Yes             "hole" - zero-filled
1721  *      Yes     No              "allocated" - allocated on disk, not read in
1722  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1723  *
1724  * "Dirty" is valid only with the last case (mapped+uptodate).
1725  */
1726
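/*
 * Illustrative sketch, kept under #if 0 so it is never compiled: how a
 * caller might act on the four mapped/uptodate combinations listed above
 * for a single buffer.  The function name is hypothetical.
 */
#if 0
static int example_classify_buffer(struct buffer_head *bh)
{
	if (buffer_mapped(bh) && buffer_uptodate(bh))
		return 3;	/* "valid": allocated and current in memory */
	if (buffer_mapped(bh))
		return 2;	/* "allocated": on disk but not read in yet */
	if (buffer_uptodate(bh))
		return 1;	/* "hole": treat the contents as zeroes */
	return 0;		/* "unknown": must do get_block() first */
}
#endif
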
1727 /*
1728  * While block_write_full_page is writing back the dirty buffers under
1729  * the page lock, whoever dirtied the buffers may decide to clean them
1730  * again at any time.  We handle that by only looking at the buffer
1731  * state inside lock_buffer().
1732  *
1733  * If block_write_full_page() is called for regular writeback
1734  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1735  * locked buffer.  This can only happen if someone has written the buffer
1736  * directly, with submit_bh().  At the address_space level PageWriteback
1737  * prevents this contention from occurring.
1738  */
1739 static int __block_write_full_page(struct inode *inode, struct page *page,
1740                         get_block_t *get_block, struct writeback_control *wbc)
1741 {
1742         int err;
1743         sector_t block;
1744         sector_t last_block;
1745         struct buffer_head *bh, *head;
1746         int nr_underway = 0;
1747
1748         BUG_ON(!PageLocked(page));
1749
1750         last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1751
1752         if (!page_has_buffers(page)) {
1753                 create_empty_buffers(page, 1 << inode->i_blkbits,
1754                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
1755         }
1756
1757         /*
1758          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1759          * here, and the (potentially unmapped) buffers may become dirty at
1760          * any time.  If a buffer becomes dirty here after we've inspected it
1761          * then we just miss that fact, and the page stays dirty.
1762          *
1763          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1764          * handle that here by just cleaning them.
1765          */
1766
1767         block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1768         head = page_buffers(page);
1769         bh = head;
1770
1771         /*
1772          * Get all the dirty buffers mapped to disk addresses and
1773          * handle any aliases from the underlying blockdev's mapping.
1774          */
1775         do {
1776                 if (block > last_block) {
1777                         /*
1778                          * mapped buffers outside i_size will occur, because
1779                          * this page can be outside i_size when there is a
1780                          * truncate in progress.
1781                          */
1782                         /*
1783                          * The buffer was zeroed by block_write_full_page()
1784                          */
1785                         clear_buffer_dirty(bh);
1786                         set_buffer_uptodate(bh);
1787                 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1788                         err = get_block(inode, block, bh, 1);
1789                         if (err)
1790                                 goto recover;
1791                         if (buffer_new(bh)) {
1792                                 /* blockdev mappings never come here */
1793                                 clear_buffer_new(bh);
1794                                 unmap_underlying_metadata(bh->b_bdev,
1795                                                         bh->b_blocknr);
1796                         }
1797                 }
1798                 bh = bh->b_this_page;
1799                 block++;
1800         } while (bh != head);
1801
1802         do {
1803                 get_bh(bh);
1804                 if (!buffer_mapped(bh))
1805                         continue;
1806                 /*
1807                  * If it's a fully non-blocking write attempt and we cannot
1808                  * lock the buffer then redirty the page.  Note that this can
1809                  * potentially cause a busy-wait loop from pdflush and kswapd
1810                  * activity, but those code paths have their own higher-level
1811                  * throttling.
1812                  */
1813                 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1814                         lock_buffer(bh);
1815                 } else if (test_set_buffer_locked(bh)) {
1816                         redirty_page_for_writepage(wbc, page);
1817                         continue;
1818                 }
1819                 if (test_clear_buffer_dirty(bh)) {
1820                         mark_buffer_async_write(bh);
1821                 } else {
1822                         unlock_buffer(bh);
1823                 }
1824         } while ((bh = bh->b_this_page) != head);
1825
1826         /*
1827          * The page and its buffers are protected by PageWriteback(), so we can
1828          * drop the bh refcounts early.
1829          */
1830         BUG_ON(PageWriteback(page));
1831         set_page_writeback(page);
1832         unlock_page(page);
1833
1834         do {
1835                 struct buffer_head *next = bh->b_this_page;
1836                 if (buffer_async_write(bh)) {
1837                         submit_bh(WRITE, bh);
1838                         nr_underway++;
1839                 }
1840                 put_bh(bh);
1841                 bh = next;
1842         } while (bh != head);
1843
1844         err = 0;
1845 done:
1846         if (nr_underway == 0) {
1847                 /*
1848                  * The page was marked dirty, but the buffers were
1849                  * clean.  Someone wrote them back by hand with
1850                  * ll_rw_block/submit_bh.  A rare case.
1851                  */
1852                 int uptodate = 1;
1853                 do {
1854                         if (!buffer_uptodate(bh)) {
1855                                 uptodate = 0;
1856                                 break;
1857                         }
1858                         bh = bh->b_this_page;
1859                 } while (bh != head);
1860                 if (uptodate)
1861                         SetPageUptodate(page);
1862                 end_page_writeback(page);
1863                 /*
1864                  * The page and buffer_heads can be released at any time from
1865                  * here on.
1866                  */
1867                 wbc->pages_skipped++;   /* We didn't write this page */
1868         }
1869         return err;
1870
1871 recover:
1872         /*
1873          * ENOSPC, or some other error.  We may already have added some
1874          * blocks to the file, so we need to write these out to avoid
1875          * exposing stale data.
1876          * The page is currently locked and not marked for writeback
1877          */
1878         bh = head;
1879         /* Recovery: lock and submit the mapped buffers */
1880         do {
1881                 get_bh(bh);
1882                 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1883                         lock_buffer(bh);
1884                         mark_buffer_async_write(bh);
1885                 } else {
1886                         /*
1887                          * The buffer may have been set dirty during
1888                          * attachment to a dirty page.
1889                          */
1890                         clear_buffer_dirty(bh);
1891                 }
1892         } while ((bh = bh->b_this_page) != head);
1893         SetPageError(page);
1894         BUG_ON(PageWriteback(page));
1895         set_page_writeback(page);
1896         unlock_page(page);
1897         do {
1898                 struct buffer_head *next = bh->b_this_page;
1899                 if (buffer_async_write(bh)) {
1900                         clear_buffer_dirty(bh);
1901                         submit_bh(WRITE, bh);
1902                         nr_underway++;
1903                 }
1904                 put_bh(bh);
1905                 bh = next;
1906         } while (bh != head);
1907         goto done;
1908 }
1909
1910 static int __block_prepare_write(struct inode *inode, struct page *page,
1911                 unsigned from, unsigned to, get_block_t *get_block)
1912 {
1913         unsigned block_start, block_end;
1914         sector_t block;
1915         int err = 0;
1916         unsigned blocksize, bbits;
1917         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1918
1919         BUG_ON(!PageLocked(page));
1920         BUG_ON(from > PAGE_CACHE_SIZE);
1921         BUG_ON(to > PAGE_CACHE_SIZE);
1922         BUG_ON(from > to);
1923
1924         blocksize = 1 << inode->i_blkbits;
1925         if (!page_has_buffers(page))
1926                 create_empty_buffers(page, blocksize, 0);
1927         head = page_buffers(page);
1928
1929         bbits = inode->i_blkbits;
1930         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1931
1932         for(bh = head, block_start = 0; bh != head || !block_start;
1933             block++, block_start=block_end, bh = bh->b_this_page) {
1934                 block_end = block_start + blocksize;
1935                 if (block_end <= from || block_start >= to) {
1936                         if (PageUptodate(page)) {
1937                                 if (!buffer_uptodate(bh))
1938                                         set_buffer_uptodate(bh);
1939                         }
1940                         continue;
1941                 }
1942                 if (buffer_new(bh))
1943                         clear_buffer_new(bh);
1944                 if (!buffer_mapped(bh)) {
1945                         err = get_block(inode, block, bh, 1);
1946                         if (err)
1947                                 goto out;
1948                         if (buffer_new(bh)) {
1949                                 clear_buffer_new(bh);
1950                                 unmap_underlying_metadata(bh->b_bdev,
1951                                                         bh->b_blocknr);
1952                                 if (PageUptodate(page)) {
1953                                         set_buffer_uptodate(bh);
1954                                         continue;
1955                                 }
1956                                 if (block_end > to || block_start < from) {
1957                                         void *kaddr;
1958
1959                                         kaddr = kmap_atomic(page, KM_USER0);
1960                                         if (block_end > to)
1961                                                 memset(kaddr+to, 0,
1962                                                         block_end-to);
1963                                         if (block_start < from)
1964                                                 memset(kaddr+block_start,
1965                                                         0, from-block_start);
1966                                         flush_dcache_page(page);
1967                                         kunmap_atomic(kaddr, KM_USER0);
1968                                 }
1969                                 continue;
1970                         }
1971                 }
1972                 if (PageUptodate(page)) {
1973                         if (!buffer_uptodate(bh))
1974                                 set_buffer_uptodate(bh);
1975                         continue; 
1976                 }
1977                 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1978                      (block_start < from || block_end > to)) {
1979                         ll_rw_block(READ, 1, &bh);
1980                         *wait_bh++=bh;
1981                 }
1982         }
1983         /*
1984          * If we issued read requests - let them complete.
1985          */
1986         while(wait_bh > wait) {
1987                 wait_on_buffer(*--wait_bh);
1988                 if (!buffer_uptodate(*wait_bh))
1989                         return -EIO;
1990         }
1991         return 0;
1992 out:
1993         /*
1994          * Zero out any newly allocated blocks to avoid exposing stale
1995          * data.  If BH_New is set, we know that the block was newly
1996          * allocated in the above loop.
1997          */
1998         bh = head;
1999         block_start = 0;
2000         do {
2001                 block_end = block_start+blocksize;
2002                 if (block_end <= from)
2003                         goto next_bh;
2004                 if (block_start >= to)
2005                         break;
2006                 if (buffer_new(bh)) {
2007                         void *kaddr;
2008
2009                         clear_buffer_new(bh);
2010                         kaddr = kmap_atomic(page, KM_USER0);
2011                         memset(kaddr+block_start, 0, bh->b_size);
2012                         kunmap_atomic(kaddr, KM_USER0);
2013                         set_buffer_uptodate(bh);
2014                         mark_buffer_dirty(bh);
2015                 }
2016 next_bh:
2017                 block_start = block_end;
2018                 bh = bh->b_this_page;
2019         } while (bh != head);
2020         return err;
2021 }
2022
2023 static int __block_commit_write(struct inode *inode, struct page *page,
2024                 unsigned from, unsigned to)
2025 {
2026         unsigned block_start, block_end;
2027         int partial = 0;
2028         unsigned blocksize;
2029         struct buffer_head *bh, *head;
2030
2031         blocksize = 1 << inode->i_blkbits;
2032
2033         for(bh = head = page_buffers(page), block_start = 0;
2034             bh != head || !block_start;
2035             block_start=block_end, bh = bh->b_this_page) {
2036                 block_end = block_start + blocksize;
2037                 if (block_end <= from || block_start >= to) {
2038                         if (!buffer_uptodate(bh))
2039                                 partial = 1;
2040                 } else {
2041                         set_buffer_uptodate(bh);
2042                         mark_buffer_dirty(bh);
2043                 }
2044         }
2045
2046         /*
2047          * If this is a partial write which happened to make all buffers
2048          * uptodate then we can optimize away a bogus readpage() for
2049          * the next read(). Here we 'discover' whether the page went
2050          * uptodate as a result of this (potentially partial) write.
2051          */
2052         if (!partial)
2053                 SetPageUptodate(page);
2054         return 0;
2055 }
2056
2057 /*
2058  * Generic "read page" function for block devices that have the normal
2059  * get_block functionality. This is most of the block device filesystems.
2060  * Reads the page asynchronously --- the unlock_buffer() and
2061  * set/clear_buffer_uptodate() functions propagate buffer state into the
2062  * page struct once IO has completed.
2063  */
2064 int block_read_full_page(struct page *page, get_block_t *get_block)
2065 {
2066         struct inode *inode = page->mapping->host;
2067         sector_t iblock, lblock;
2068         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2069         unsigned int blocksize;
2070         int nr, i;
2071         int fully_mapped = 1;
2072
2073         if (!PageLocked(page))
2074                 PAGE_BUG(page);
2075         blocksize = 1 << inode->i_blkbits;
2076         if (!page_has_buffers(page))
2077                 create_empty_buffers(page, blocksize, 0);
2078         head = page_buffers(page);
2079
2080         iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2081         lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2082         bh = head;
2083         nr = 0;
2084         i = 0;
2085
2086         do {
2087                 if (buffer_uptodate(bh))
2088                         continue;
2089
2090                 if (!buffer_mapped(bh)) {
2091                         fully_mapped = 0;
2092                         if (iblock < lblock) {
2093                                 if (get_block(inode, iblock, bh, 0))
2094                                         SetPageError(page);
2095                         }
2096                         if (!buffer_mapped(bh)) {
2097                                 void *kaddr = kmap_atomic(page, KM_USER0);
2098                                 memset(kaddr + i * blocksize, 0, blocksize);
2099                                 flush_dcache_page(page);
2100                                 kunmap_atomic(kaddr, KM_USER0);
2101                                 set_buffer_uptodate(bh);
2102                                 continue;
2103                         }
2104                         /*
2105                          * get_block() might have updated the buffer
2106                          * synchronously
2107                          */
2108                         if (buffer_uptodate(bh))
2109                                 continue;
2110                 }
2111                 arr[nr++] = bh;
2112         } while (i++, iblock++, (bh = bh->b_this_page) != head);
2113
2114         if (fully_mapped)
2115                 SetPageMappedToDisk(page);
2116
2117         if (!nr) {
2118                 /*
2119                  * All buffers are uptodate - we can set the page uptodate
2120                  * as well. But not if get_block() returned an error.
2121                  */
2122                 if (!PageError(page))
2123                         SetPageUptodate(page);
2124                 unlock_page(page);
2125                 return 0;
2126         }
2127
2128         /* Stage two: lock the buffers */
2129         for (i = 0; i < nr; i++) {
2130                 bh = arr[i];
2131                 lock_buffer(bh);
2132                 mark_buffer_async_read(bh);
2133         }
2134
2135         /*
2136          * Stage 3: start the IO.  Check for uptodateness
2137          * inside the buffer lock in case another process reading
2138          * the underlying blockdev brought it uptodate (the sct fix).
2139          */
2140         for (i = 0; i < nr; i++) {
2141                 bh = arr[i];
2142                 if (buffer_uptodate(bh))
2143                         end_buffer_async_read(bh, 1);
2144                 else
2145                         submit_bh(READ, bh);
2146         }
2147         return 0;
2148 }
2149
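/*
 * Illustrative sketch, kept under #if 0 so it is never compiled: the usual
 * way a filesystem exposes block_read_full_page() as its ->readpage(),
 * passing in its own get_block routine.  Both examplefs_ names are
 * hypothetical.
 */
#if 0
static int examplefs_get_block(struct inode *inode, sector_t iblock,
				struct buffer_head *bh_result, int create);

static int examplefs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, examplefs_get_block);
}
#endif
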
2150 /* utility function for filesystems that need to do work on expanding
2151  * truncates.  Uses prepare/commit_write to allow the filesystem to
2152  * deal with the hole.  
2153  */
2154 int generic_cont_expand(struct inode *inode, loff_t size)
2155 {
2156         struct address_space *mapping = inode->i_mapping;
2157         struct page *page;
2158         unsigned long index, offset, limit;
2159         int err;
2160
2161         err = -EFBIG;
2162         limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2163         if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2164                 send_sig(SIGXFSZ, current, 0);
2165                 goto out;
2166         }
2167         if (size > inode->i_sb->s_maxbytes)
2168                 goto out;
2169
2170         offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
2171
2172         /* ugh.  in prepare/commit_write, if from==to==start of block, we
2173          * skip the prepare.  Make sure we never send an offset for the
2174          * start of a block.
2175          */
2176         if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2177                 offset++;
2178         }
2179         index = size >> PAGE_CACHE_SHIFT;
2180         err = -ENOMEM;
2181         page = grab_cache_page(mapping, index);
2182         if (!page)
2183                 goto out;
2184         err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2185         if (!err) {
2186                 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2187         }
2188         unlock_page(page);
2189         page_cache_release(page);
2190         if (err > 0)
2191                 err = 0;
2192 out:
2193         return err;
2194 }
2195
2196 /*
2197  * For moronic filesystems that do not allow holes in files.
2198  * We may have to extend the file.
2199  */
2200
2201 int cont_prepare_write(struct page *page, unsigned offset,
2202                 unsigned to, get_block_t *get_block, loff_t *bytes)
2203 {
2204         struct address_space *mapping = page->mapping;
2205         struct inode *inode = mapping->host;
2206         struct page *new_page;
2207         pgoff_t pgpos;
2208         long status;
2209         unsigned zerofrom;
2210         unsigned blocksize = 1 << inode->i_blkbits;
2211         void *kaddr;
2212
2213         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2214                 status = -ENOMEM;
2215                 new_page = grab_cache_page(mapping, pgpos);
2216                 if (!new_page)
2217                         goto out;
2218                 /* we might sleep */
2219                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2220                         unlock_page(new_page);
2221                         page_cache_release(new_page);
2222                         continue;
2223                 }
2224                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2225                 if (zerofrom & (blocksize-1)) {
2226                         *bytes |= (blocksize-1);
2227                         (*bytes)++;
2228                 }
2229                 status = __block_prepare_write(inode, new_page, zerofrom,
2230                                                 PAGE_CACHE_SIZE, get_block);
2231                 if (status)
2232                         goto out_unmap;
2233                 kaddr = kmap_atomic(new_page, KM_USER0);
2234                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2235                 flush_dcache_page(new_page);
2236                 kunmap_atomic(kaddr, KM_USER0);
2237                 generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2238                 unlock_page(new_page);
2239                 page_cache_release(new_page);
2240         }
2241
2242         if (page->index < pgpos) {
2243                 /* completely inside the area */
2244                 zerofrom = offset;
2245         } else {
2246                 /* page covers the boundary, find the boundary offset */
2247                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2248
2249                 /* if we will expand the thing, the last block will be filled */
2250                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2251                         *bytes |= (blocksize-1);
2252                         (*bytes)++;
2253                 }
2254
2255                 /* starting below the boundary? Nothing to zero out */
2256                 if (offset <= zerofrom)
2257                         zerofrom = offset;
2258         }
2259         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2260         if (status)
2261                 goto out1;
2262         if (zerofrom < offset) {
2263                 kaddr = kmap_atomic(page, KM_USER0);
2264                 memset(kaddr+zerofrom, 0, offset-zerofrom);
2265                 flush_dcache_page(page);
2266                 kunmap_atomic(kaddr, KM_USER0);
2267                 __block_commit_write(inode, page, zerofrom, offset);
2268         }
2269         return 0;
2270 out1:
2271         ClearPageUptodate(page);
2272         return status;
2273
2274 out_unmap:
2275         ClearPageUptodate(new_page);
2276         unlock_page(new_page);
2277         page_cache_release(new_page);
2278 out:
2279         return status;
2280 }
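
/*
 * Illustrative sketch, kept under #if 0 so it is never compiled: a
 * hole-less filesystem wiring cont_prepare_write() into ->prepare_write(),
 * with @bytes pointing at its private "allocated so far" marker (FAT keeps
 * such a field in its in-core inode).  The contfs_ names and the CONTFS_I()
 * accessor are hypothetical.
 */
#if 0
static int contfs_prepare_write(struct file *file, struct page *page,
				unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;

	return cont_prepare_write(page, from, to, contfs_get_block,
				  &CONTFS_I(inode)->mmu_private);
}
#endif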
2281
2282 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2283                         get_block_t *get_block)
2284 {
2285         struct inode *inode = page->mapping->host;
2286         int err = __block_prepare_write(inode, page, from, to, get_block);
2287         if (err)
2288                 ClearPageUptodate(page);
2289         return err;
2290 }
2291
2292 int block_commit_write(struct page *page, unsigned from, unsigned to)
2293 {
2294         struct inode *inode = page->mapping->host;
2295         __block_commit_write(inode,page,from,to);
2296         return 0;
2297 }
2298
2299 int generic_commit_write(struct file *file, struct page *page,
2300                 unsigned from, unsigned to)
2301 {
2302         struct inode *inode = page->mapping->host;
2303         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2304         __block_commit_write(inode,page,from,to);
2305         /*
2306          * No need to use i_size_read() here, the i_size
2307          * cannot change under us because we hold i_sem.
2308          */
2309         if (pos > inode->i_size) {
2310                 i_size_write(inode, pos);
2311                 mark_inode_dirty(inode);
2312         }
2313         return 0;
2314 }
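
/*
 * Illustrative sketch, kept under #if 0 so it is never compiled: the classic
 * buffer-based write path, where ->prepare_write() wraps
 * block_prepare_write() and ->commit_write() is generic_commit_write()
 * used directly.  The blockfs_ names are hypothetical.
 */
#if 0
static int blockfs_prepare_write(struct file *file, struct page *page,
				 unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, blockfs_get_block);
}

static struct address_space_operations blockfs_aops = {
	.readpage	= blockfs_readpage,
	.sync_page	= block_sync_page,
	.prepare_write	= blockfs_prepare_write,
	.commit_write	= generic_commit_write,
};
#endif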
2315
2316
2317 /*
2318  * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
2319  * immediately, while under the page lock.  So it needs a special end_io
2320  * handler which does not touch the bh after unlocking it.
2321  *
2322  * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
2323  * a race there is benign: unlock_buffer() only uses the bh's address for
2324  * hashing after unlocking the buffer, so it doesn't actually touch the bh
2325  * itself.
2326  */
2327 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2328 {
2329         if (uptodate) {
2330                 set_buffer_uptodate(bh);
2331         } else {
2332                 /* This happens, due to failed READA attempts. */
2333                 clear_buffer_uptodate(bh);
2334         }
2335         unlock_buffer(bh);
2336 }
2337
2338 /*
2339  * On entry, the page is fully not uptodate.
2340  * On exit the page is fully uptodate in the areas outside (from,to)
2341  */
2342 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2343                         get_block_t *get_block)
2344 {
2345         struct inode *inode = page->mapping->host;
2346         const unsigned blkbits = inode->i_blkbits;
2347         const unsigned blocksize = 1 << blkbits;
2348         struct buffer_head map_bh;
2349         struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2350         unsigned block_in_page;
2351         unsigned block_start;
2352         sector_t block_in_file;
2353         char *kaddr;
2354         int nr_reads = 0;
2355         int i;
2356         int ret = 0;
2357         int is_mapped_to_disk = 1;
2358         int dirtied_it = 0;
2359
2360         if (PageMappedToDisk(page))
2361                 return 0;
2362
2363         block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2364         map_bh.b_page = page;
2365
2366         /*
2367          * We loop across all blocks in the page, whether or not they are
2368          * part of the affected region.  This is so we can discover if the
2369          * page is fully mapped-to-disk.
2370          */
2371         for (block_start = 0, block_in_page = 0;
2372                   block_start < PAGE_CACHE_SIZE;
2373                   block_in_page++, block_start += blocksize) {
2374                 unsigned block_end = block_start + blocksize;
2375                 int create;
2376
2377                 map_bh.b_state = 0;
2378                 create = 1;
2379                 if (block_start >= to)
2380                         create = 0;
2381                 ret = get_block(inode, block_in_file + block_in_page,
2382                                         &map_bh, create);
2383                 if (ret)
2384                         goto failed;
2385                 if (!buffer_mapped(&map_bh))
2386                         is_mapped_to_disk = 0;
2387                 if (buffer_new(&map_bh))
2388                         unmap_underlying_metadata(map_bh.b_bdev,
2389                                                         map_bh.b_blocknr);
2390                 if (PageUptodate(page))
2391                         continue;
2392                 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2393                         kaddr = kmap_atomic(page, KM_USER0);
2394                         if (block_start < from) {
2395                                 memset(kaddr+block_start, 0, from-block_start);
2396                                 dirtied_it = 1;
2397                         }
2398                         if (block_end > to) {
2399                                 memset(kaddr + to, 0, block_end - to);
2400                                 dirtied_it = 1;
2401                         }
2402                         flush_dcache_page(page);
2403                         kunmap_atomic(kaddr, KM_USER0);
2404                         continue;
2405                 }
2406                 if (buffer_uptodate(&map_bh))
2407                         continue;       /* reiserfs does this */
2408                 if (block_start < from || block_end > to) {
2409                         struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2410
2411                         if (!bh) {
2412                                 ret = -ENOMEM;
2413                                 goto failed;
2414                         }
2415                         bh->b_state = map_bh.b_state;
2416                         atomic_set(&bh->b_count, 0);
2417                         bh->b_this_page = NULL;
2418                         bh->b_page = page;
2419                         bh->b_blocknr = map_bh.b_blocknr;
2420                         bh->b_size = blocksize;
2421                         bh->b_data = (char *)(long)block_start;
2422                         bh->b_bdev = map_bh.b_bdev;
2423                         bh->b_private = NULL;
2424                         read_bh[nr_reads++] = bh;
2425                 }
2426         }
2427
2428         if (nr_reads) {
2429                 struct buffer_head *bh;
2430
2431                 /*
2432                  * The page is locked, so these buffers are protected from
2433                  * any VM or truncate activity.  Hence we don't need to care
2434                  * for the buffer_head refcounts.
2435                  */
2436                 for (i = 0; i < nr_reads; i++) {
2437                         bh = read_bh[i];
2438                         lock_buffer(bh);
2439                         bh->b_end_io = end_buffer_read_nobh;
2440                         submit_bh(READ, bh);
2441                 }
2442                 for (i = 0; i < nr_reads; i++) {
2443                         bh = read_bh[i];
2444                         wait_on_buffer(bh);
2445                         if (!buffer_uptodate(bh))
2446                                 ret = -EIO;
2447                         free_buffer_head(bh);
2448                         read_bh[i] = NULL;
2449                 }
2450                 if (ret)
2451                         goto failed;
2452         }
2453
2454         if (is_mapped_to_disk)
2455                 SetPageMappedToDisk(page);
2456         SetPageUptodate(page);
2457
2458         /*
2459          * Setting the page dirty here isn't necessary for the prepare_write
2460          * function - commit_write will do that.  But if/when this function is
2461          * used within the pagefault handler to ensure that all mmapped pages
2462          * have backing space in the filesystem, we will need to dirty the page
2463          * if its contents were altered.
2464          */
2465         if (dirtied_it)
2466                 set_page_dirty(page);
2467
2468         return 0;
2469
2470 failed:
2471         for (i = 0; i < nr_reads; i++) {
2472                 if (read_bh[i])
2473                         free_buffer_head(read_bh[i]);
2474         }
2475
2476         /*
2477          * Error recovery is pretty slack.  Clear the page and mark it dirty
2478          * so we'll later zero out any blocks which _were_ allocated.
2479          */
2480         kaddr = kmap_atomic(page, KM_USER0);
2481         memset(kaddr, 0, PAGE_CACHE_SIZE);
2482         kunmap_atomic(kaddr, KM_USER0);
2483         SetPageUptodate(page);
2484         set_page_dirty(page);
2485         return ret;
2486 }
2487 EXPORT_SYMBOL(nobh_prepare_write);
2488
2489 int nobh_commit_write(struct file *file, struct page *page,
2490                 unsigned from, unsigned to)
2491 {
2492         struct inode *inode = page->mapping->host;
2493         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2494
2495         set_page_dirty(page);
2496         if (pos > inode->i_size) {
2497                 i_size_write(inode, pos);
2498                 mark_inode_dirty(inode);
2499         }
2500         return 0;
2501 }
2502 EXPORT_SYMBOL(nobh_commit_write);
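
/*
 * Illustrative sketch, kept under #if 0 so it is never compiled: the nobh
 * variant of the write path.  nobh_commit_write() already has the
 * ->commit_write() signature and can be used directly, while
 * nobh_prepare_write() needs a thin wrapper to supply the filesystem's
 * get_block routine.  The nobhfs_ names are hypothetical.
 */
#if 0
static int nobhfs_prepare_write(struct file *file, struct page *page,
				unsigned from, unsigned to)
{
	return nobh_prepare_write(page, from, to, nobhfs_get_block);
}

static struct address_space_operations nobhfs_aops = {
	.readpage	= nobhfs_readpage,
	.writepage	= nobhfs_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= nobhfs_prepare_write,
	.commit_write	= nobh_commit_write,
};
#endif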
2503
2504 /*
2505  * This function assumes that ->prepare_write() uses nobh_prepare_write().
2506  */
2507 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2508 {
2509         struct inode *inode = mapping->host;
2510         unsigned blocksize = 1 << inode->i_blkbits;
2511         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2512         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2513         unsigned to;
2514         struct page *page;
2515         struct address_space_operations *a_ops = mapping->a_ops;
2516         char *kaddr;
2517         int ret = 0;
2518
2519         if ((offset & (blocksize - 1)) == 0)
2520                 goto out;
2521
2522         ret = -ENOMEM;
2523         page = grab_cache_page(mapping, index);
2524         if (!page)
2525                 goto out;
2526
2527         to = (offset + blocksize) & ~(blocksize - 1);
2528         ret = a_ops->prepare_write(NULL, page, offset, to);
2529         if (ret == 0) {
2530                 kaddr = kmap_atomic(page, KM_USER0);
2531                 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2532                 flush_dcache_page(page);
2533                 kunmap_atomic(kaddr, KM_USER0);
2534                 set_page_dirty(page);
2535         }
2536         unlock_page(page);
2537         page_cache_release(page);
2538 out:
2539         return ret;
2540 }
2541 EXPORT_SYMBOL(nobh_truncate_page);
2542
2543 int block_truncate_page(struct address_space *mapping,
2544                         loff_t from, get_block_t *get_block)
2545 {
2546         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2547         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2548         unsigned blocksize;
2549         pgoff_t iblock;
2550         unsigned length, pos;
2551         struct inode *inode = mapping->host;
2552         struct page *page;
2553         struct buffer_head *bh;
2554         void *kaddr;
2555         int err;
2556
2557         blocksize = 1 << inode->i_blkbits;
2558         length = offset & (blocksize - 1);
2559
2560         /* Block boundary? Nothing to do */
2561         if (!length)
2562                 return 0;
2563
2564         length = blocksize - length;
2565         iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2566         
2567         page = grab_cache_page(mapping, index);
2568         err = -ENOMEM;
2569         if (!page)
2570                 goto out;
2571
2572         if (!page_has_buffers(page))
2573                 create_empty_buffers(page, blocksize, 0);
2574
2575         /* Find the buffer that contains "offset" */
2576         bh = page_buffers(page);
2577         pos = blocksize;
2578         while (offset >= pos) {
2579                 bh = bh->b_this_page;
2580                 iblock++;
2581                 pos += blocksize;
2582         }
2583
2584         err = 0;
2585         if (!buffer_mapped(bh)) {
2586                 err = get_block(inode, iblock, bh, 0);
2587                 if (err)
2588                         goto unlock;
2589                 /* unmapped? It's a hole - nothing to do */
2590                 if (!buffer_mapped(bh))
2591                         goto unlock;
2592         }
2593
2594         /* Ok, it's mapped. Make sure it's up-to-date */
2595         if (PageUptodate(page))
2596                 set_buffer_uptodate(bh);
2597
2598         if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2599                 err = -EIO;
2600                 ll_rw_block(READ, 1, &bh);
2601                 wait_on_buffer(bh);
2602                 /* Uhhuh. Read error. Complain and punt. */
2603                 if (!buffer_uptodate(bh))
2604                         goto unlock;
2605         }
2606
2607         kaddr = kmap_atomic(page, KM_USER0);
2608         memset(kaddr + offset, 0, length);
2609         flush_dcache_page(page);
2610         kunmap_atomic(kaddr, KM_USER0);
2611
2612         mark_buffer_dirty(bh);
2613         err = 0;
2614
2615 unlock:
2616         unlock_page(page);
2617         page_cache_release(page);
2618 out:
2619         return err;
2620 }
2621
2622 /*
2623  * The generic ->writepage function for buffer-backed address_spaces
2624  */
2625 int block_write_full_page(struct page *page, get_block_t *get_block,
2626                         struct writeback_control *wbc)
2627 {
2628         struct inode * const inode = page->mapping->host;
2629         loff_t i_size = i_size_read(inode);
2630         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2631         unsigned offset;
2632         void *kaddr;
2633
2634         /* Is the page fully inside i_size? */
2635         if (page->index < end_index)
2636                 return __block_write_full_page(inode, page, get_block, wbc);
2637
2638         /* Is the page fully outside i_size? (truncate in progress) */
2639         offset = i_size & (PAGE_CACHE_SIZE-1);
2640         if (page->index >= end_index+1 || !offset) {
2641                 /*
2642                  * The page may have dirty, unmapped buffers.  For example,
2643                  * they may have been added in ext3_writepage().  Make them
2644                  * freeable here, so the page does not leak.
2645                  */
2646                 block_invalidatepage(page, 0);
2647                 unlock_page(page);
2648                 return 0; /* don't care */
2649         }
2650
2651         /*
2652          * The page straddles i_size.  It must be zeroed out on each and every
2653  * writepage invocation because it may be mmapped.  "A file is mapped
2654          * in multiples of the page size.  For a file that is not a multiple of
2655          * the  page size, the remaining memory is zeroed when mapped, and
2656          * writes to that region are not written out to the file."
2657          */
2658         kaddr = kmap_atomic(page, KM_USER0);
2659         memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2660         flush_dcache_page(page);
2661         kunmap_atomic(kaddr, KM_USER0);
2662         return __block_write_full_page(inode, page, get_block, wbc);
2663 }
2664
2665 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2666                             get_block_t *get_block)
2667 {
2668         struct buffer_head tmp;
2669         struct inode *inode = mapping->host;
2670         tmp.b_state = 0;
2671         tmp.b_blocknr = 0;
2672         get_block(inode, block, &tmp, 0);
2673         return tmp.b_blocknr;
2674 }
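
/*
 * Illustrative sketch, kept under #if 0 so it is never compiled: exposing
 * block_write_full_page() and generic_block_bmap() as ->writepage() and
 * ->bmap(), again with the filesystem's own get_block routine plugged in.
 * The examplefs_ names are hypothetical.
 */
#if 0
static int examplefs_writepage(struct page *page,
				struct writeback_control *wbc)
{
	return block_write_full_page(page, examplefs_get_block, wbc);
}

static sector_t examplefs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, examplefs_get_block);
}
#endif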
2675
2676 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2677 {
2678         struct buffer_head *bh = bio->bi_private;
2679
2680         if (bio->bi_size)
2681                 return 1;
2682
2683         if (err == -EOPNOTSUPP) {
2684                 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2685                 set_bit(BH_Eopnotsupp, &bh->b_state);
2686         }
2687
2688         bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2689         bio_put(bio);
2690         return 0;
2691 }
2692
2693 int submit_bh(int rw, struct buffer_head * bh)
2694 {
2695         struct bio *bio;
2696         int ret = 0;
2697
2698         BUG_ON(!buffer_locked(bh));
2699         BUG_ON(!buffer_mapped(bh));
2700         BUG_ON(!bh->b_end_io);
2701
2702         if (buffer_ordered(bh) && (rw == WRITE))
2703                 rw = WRITE_BARRIER;
2704
2705         /*
2706          * Only clear out a write error when rewriting; should this
2707          * include WRITE_SYNC as well?
2708          */
2709         if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2710                 clear_buffer_write_io_error(bh);
2711
2712         /*
2713          * from here on down, it's all bio -- do the initial mapping,
2714          * submit_bio -> generic_make_request may further map this bio around
2715          */
2716         bio = bio_alloc(GFP_NOIO, 1);
2717
2718         bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2719         bio->bi_bdev = bh->b_bdev;
2720         bio->bi_io_vec[0].bv_page = bh->b_page;
2721         bio->bi_io_vec[0].bv_len = bh->b_size;
2722         bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2723
2724         bio->bi_vcnt = 1;
2725         bio->bi_idx = 0;
2726         bio->bi_size = bh->b_size;
2727
2728         bio->bi_end_io = end_bio_bh_io_sync;
2729         bio->bi_private = bh;
2730
2731         bio_get(bio);
2732         submit_bio(rw, bio);
2733
2734         if (bio_flagged(bio, BIO_EOPNOTSUPP))
2735                 ret = -EOPNOTSUPP;
2736
2737         bio_put(bio);
2738         return ret;
2739 }
2740
2741 /**
2742  * ll_rw_block: low-level access to block devices (DEPRECATED)
2743  * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2744  * @nr: number of &struct buffer_heads in the array
2745  * @bhs: array of pointers to &struct buffer_head
2746  *
2747  * ll_rw_block() takes an array of pointers to &struct buffer_heads,
2748  * and requests an I/O operation on them, either a %READ or a %WRITE.
2749  * The third %READA option is described in the documentation for
2750  * generic_make_request() which ll_rw_block() calls.
2751  *
2752  * This function drops any buffer that it cannot get a lock on (with the
2753  * BH_Lock state bit), any buffer that appears to be clean when doing a
2754  * write request, and any buffer that appears to be up-to-date when doing a
2755  * read request.  Further it marks as clean any buffers that are processed for
2756  * writing (the buffer cache won't assume that they are actually clean until
2757  * the buffer gets unlocked).
2758  *
2759  * ll_rw_block sets b_end_io to a simple completion handler that marks
2760  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2761  * any waiters.
2762  *
2763  * All of the buffers must be for the same device, and must also be a
2764  * multiple of the current approved size for the device.
2765  */
2766 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2767 {
2768         int i;
2769
2770         for (i = 0; i < nr; i++) {
2771                 struct buffer_head *bh = bhs[i];
2772
2773                 if (test_set_buffer_locked(bh))
2774                         continue;
2775
2776                 get_bh(bh);
2777                 if (rw == WRITE) {
2778                         bh->b_end_io = end_buffer_write_sync;
2779                         if (test_clear_buffer_dirty(bh)) {
2780                                 submit_bh(WRITE, bh);
2781                                 continue;
2782                         }
2783                 } else {
2784                         bh->b_end_io = end_buffer_read_sync;
2785                         if (!buffer_uptodate(bh)) {
2786                                 submit_bh(rw, bh);
2787                                 continue;
2788                         }
2789                 }
2790                 unlock_buffer(bh);
2791                 put_bh(bh);
2792         }
2793 }
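
/*
 * Illustrative sketch, kept under #if 0 so it is never compiled: the usual
 * pattern around ll_rw_block() for reads - start I/O on a batch of buffers,
 * then wait for each one and check that it came back uptodate.  The
 * function name is hypothetical.
 */
#if 0
static int example_read_buffers(struct buffer_head *bhs[], int nr)
{
	int i;

	ll_rw_block(READ, nr, bhs);	/* skips buffers already uptodate */
	for (i = 0; i < nr; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			return -EIO;
	}
	return 0;
}
#endif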
2794
2795 /*
2796  * For a data-integrity writeout, we need to wait upon any in-progress I/O
2797  * and then start new I/O and then wait upon it.  The caller must have a ref on
2798  * the buffer_head.
2799  */
2800 int sync_dirty_buffer(struct buffer_head *bh)
2801 {
2802         int ret = 0;
2803
2804         WARN_ON(atomic_read(&bh->b_count) < 1);
2805         lock_buffer(bh);
2806         if (test_clear_buffer_dirty(bh)) {
2807                 get_bh(bh);
2808                 bh->b_end_io = end_buffer_write_sync;
2809                 ret = submit_bh(WRITE, bh);
2810                 wait_on_buffer(bh);
2811                 if (buffer_eopnotsupp(bh)) {
2812                         clear_buffer_eopnotsupp(bh);
2813                         ret = -EOPNOTSUPP;
2814                 }
2815                 if (!ret && !buffer_uptodate(bh))
2816                         ret = -EIO;
2817         } else {
2818                 unlock_buffer(bh);
2819         }
2820         return ret;
2821 }
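
/*
 * Illustrative sketch, kept under #if 0 so it is never compiled: a
 * data-integrity write of a single buffer, e.g. a superblock - dirty it,
 * then let sync_dirty_buffer() start the I/O and wait for it.  The
 * function name is hypothetical.
 */
#if 0
static int example_sync_one_buffer(struct buffer_head *bh)
{
	/* ... update the on-disk copy at bh->b_data here ... */
	mark_buffer_dirty(bh);
	return sync_dirty_buffer(bh);	/* returns 0 or an I/O error */
}
#endif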
2822
2823 /*
2824  * try_to_free_buffers() checks if all the buffers on this particular page
2825  * are unused, and releases them if so.
2826  *
2827  * Exclusion against try_to_free_buffers may be obtained by either
2828  * locking the page or by holding its mapping's private_lock.
2829  *
2830  * If the page is dirty but all the buffers are clean then we need to
2831  * be sure to mark the page clean as well.  This is because the page
2832  * may be against a block device, and a later reattachment of buffers
2833  * to a dirty page will set *all* buffers dirty, which would corrupt
2834  * filesystem data on the same device.
2835  *
2836  * The same applies to regular filesystem pages: if all the buffers are
2837  * clean then we set the page clean and proceed.  To do that, we require
2838  * total exclusion from __set_page_dirty_buffers().  That is obtained with
2839  * private_lock.
2840  *
2841  * try_to_free_buffers() is non-blocking.
2842  */
2843 static inline int buffer_busy(struct buffer_head *bh)
2844 {
2845         return atomic_read(&bh->b_count) |
2846                 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2847 }
2848
2849 static int
2850 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2851 {
2852         struct buffer_head *head = page_buffers(page);
2853         struct buffer_head *bh;
2854
2855         bh = head;
2856         do {
2857                 if (buffer_write_io_error(bh))
2858                         set_bit(AS_EIO, &page->mapping->flags);
2859                 if (buffer_busy(bh))
2860                         goto failed;
2861                 bh = bh->b_this_page;
2862         } while (bh != head);
2863
2864         do {
2865                 struct buffer_head *next = bh->b_this_page;
2866
2867                 if (!list_empty(&bh->b_assoc_buffers))
2868                         __remove_assoc_queue(bh);
2869                 bh = next;
2870         } while (bh != head);
2871         *buffers_to_free = head;
2872         __clear_page_buffers(page);
2873         return 1;
2874 failed:
2875         return 0;
2876 }
2877
2878 int try_to_free_buffers(struct page *page)
2879 {
2880         struct address_space * const mapping = page->mapping;
2881         struct buffer_head *buffers_to_free = NULL;
2882         int ret = 0;
2883
2884         BUG_ON(!PageLocked(page));
2885         if (PageWriteback(page))
2886                 return 0;
2887
2888         if (mapping == NULL) {          /* can this still happen? */
2889                 ret = drop_buffers(page, &buffers_to_free);
2890                 goto out;
2891         }
2892
2893         spin_lock(&mapping->private_lock);
2894         ret = drop_buffers(page, &buffers_to_free);
2895         if (ret) {
2896                 /*
2897                  * If the filesystem writes its buffers by hand (eg ext3)
2898                  * then we can have clean buffers against a dirty page.  We
2899                  * clean the page here; otherwise later reattachment of buffers
2900                  * could encounter a non-uptodate page, which is unresolvable.
2901                  * This only applies in the rare case where try_to_free_buffers
2902                  * succeeds but the page is not freed.
2903                  */
2904                 clear_page_dirty(page);
2905         }
2906         spin_unlock(&mapping->private_lock);
2907 out:
2908         if (buffers_to_free) {
2909                 struct buffer_head *bh = buffers_to_free;
2910
2911                 do {
2912                         struct buffer_head *next = bh->b_this_page;
2913                         free_buffer_head(bh);
2914                         bh = next;
2915                 } while (bh != buffers_to_free);
2916         }
2917         return ret;
2918 }
2919 EXPORT_SYMBOL(try_to_free_buffers);
2920
2921 int block_sync_page(struct page *page)
2922 {
2923         struct address_space *mapping;
2924
2925         smp_mb();
2926         mapping = page_mapping(page);
2927         if (mapping)
2928                 blk_run_backing_dev(mapping->backing_dev_info, page);
2929         return 0;
2930 }
2931
2932 /*
2933  * There are no bdflush tunables left.  But distributions are
2934  * still running obsolete flush daemons, so we terminate them here.
2935  *
2936  * Use of bdflush() is deprecated and will be removed in a future kernel.
2937  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
2938  */
2939 asmlinkage long sys_bdflush(int func, long data)
2940 {
2941         static int msg_count;
2942
2943         if (!capable(CAP_SYS_ADMIN))
2944                 return -EPERM;
2945
2946         if (msg_count < 5) {
2947                 msg_count++;
2948                 printk(KERN_INFO
2949                         "warning: process `%s' used the obsolete bdflush"
2950                         " system call\n", current->comm);
2951                 printk(KERN_INFO "Fix your initscripts?\n");
2952         }
2953
2954         if (func == 1)
2955                 do_exit(0);
2956         return 0;
2957 }
2958
2959 /*
2960  * Buffer-head allocation
2961  */
2962 static kmem_cache_t *bh_cachep;
2963
2964 /*
2965  * Once the number of bh's in the machine exceeds this level, we start
2966  * stripping them in writeback.
2967  */
2968 static int max_buffer_heads;
2969
2970 int buffer_heads_over_limit;
2971
2972 struct bh_accounting {
2973         int nr;                 /* Number of live bh's */
2974         int ratelimit;          /* Limit cacheline bouncing */
2975 };
2976
2977 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2978
2979 static void recalc_bh_state(void)
2980 {
2981         int i;
2982         int tot = 0;
2983
2984         if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
2985                 return;
2986         __get_cpu_var(bh_accounting).ratelimit = 0;
2987         for_each_cpu(i)
2988                 tot += per_cpu(bh_accounting, i).nr;
2989         buffer_heads_over_limit = (tot > max_buffer_heads);
2990 }
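
/*
 * Illustrative sketch of how the flag is meant to be consumed: the
 * VM's LRU scanning code polls buffer_heads_over_limit and, when it is
 * set, tries to strip buffer_heads from clean pages it walks.  The
 * helper name below is invented for the example.
 */
#if 0	/* example only, not built */
static void example_strip_buffers(struct page *page)
{
	if (!buffer_heads_over_limit)
		return;
	/*
	 * For plain block-backed pages try_to_release_page() ends up
	 * in try_to_free_buffers() above, releasing the buffer_heads.
	 */
	if (page_has_buffers(page) && !TestSetPageLocked(page)) {
		try_to_release_page(page, 0);
		unlock_page(page);
	}
}
#endif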
2991
2992 struct buffer_head *alloc_buffer_head(int gfp_flags)
2993 {
2994         struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
2995         if (ret) {
2996                 preempt_disable();
2997                 __get_cpu_var(bh_accounting).nr++;
2998                 recalc_bh_state();
2999                 preempt_enable();
3000         }
3001         return ret;
3002 }
3003 EXPORT_SYMBOL(alloc_buffer_head);
3004
3005 void free_buffer_head(struct buffer_head *bh)
3006 {
3007         BUG_ON(!list_empty(&bh->b_assoc_buffers));
3008         kmem_cache_free(bh_cachep, bh);
3009         preempt_disable();
3010         __get_cpu_var(bh_accounting).nr--;
3011         recalc_bh_state();
3012         preempt_enable();
3013 }
3014 EXPORT_SYMBOL(free_buffer_head);
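
/*
 * Illustrative sketch (function names invented for the example) of the
 * usual pairing: every alloc_buffer_head() is matched by a
 * free_buffer_head() once the bh is unlinked from its page and from any
 * b_assoc_buffers list, keeping the per-CPU accounting honest.
 * Recycled slab objects keep their old field values, so callers
 * reinitialise the fields they care about, much as alloc_page_buffers()
 * does earlier in this file.
 */
#if 0	/* example only, not built */
static struct buffer_head *example_get_bh(unsigned long size)
{
	struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);

	if (!bh)
		return NULL;
	bh->b_state = 0;
	bh->b_this_page = NULL;
	bh->b_blocknr = -1;
	bh->b_size = size;
	bh->b_bdev = NULL;
	bh->b_end_io = NULL;
	atomic_set(&bh->b_count, 1);
	return bh;
}

static void example_put_bh(struct buffer_head *bh)
{
	/* b_assoc_buffers must already be empty, or the BUG_ON fires. */
	if (atomic_dec_and_test(&bh->b_count))
		free_buffer_head(bh);
}
#endif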
3015
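/*
 * Slab constructor for bh_cachep.  It runs when the slab allocator
 * first sets up a buffer_head object in a fresh slab page, not on every
 * alloc_buffer_head() call, and only when invoked as a real constructor
 * (SLAB_CTOR_VERIFY passes are skipped): the object is zeroed and its
 * b_assoc_buffers list head made usable.
 */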
3016 static void
3017 init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
3018 {
3019         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
3020                             SLAB_CTOR_CONSTRUCTOR) {
3021                 struct buffer_head * bh = (struct buffer_head *)data;
3022
3023                 memset(bh, 0, sizeof(*bh));
3024                 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3025         }
3026 }
3027
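/*
 * CPU hotplug support.  Each CPU caches a few recently used
 * buffer_heads in its per-CPU bh_lrus array (defined earlier in this
 * file); when a CPU is taken offline those cached references would
 * otherwise be leaked, so the notifier drops them here.
 */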
3028 #ifdef CONFIG_HOTPLUG_CPU
3029 static void buffer_exit_cpu(int cpu)
3030 {
3031         int i;
3032         struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3033
3034         for (i = 0; i < BH_LRU_SIZE; i++) {
3035                 brelse(b->bhs[i]);
3036                 b->bhs[i] = NULL;
3037         }
3038 }
3039
3040 static int buffer_cpu_notify(struct notifier_block *self,
3041                               unsigned long action, void *hcpu)
3042 {
3043         if (action == CPU_DEAD)
3044                 buffer_exit_cpu((unsigned long)hcpu);
3045         return NOTIFY_OK;
3046 }
3047 #endif /* CONFIG_HOTPLUG_CPU */
3048
3049 void __init buffer_init(void)
3050 {
3051         int nrpages;
3052
3053         bh_cachep = kmem_cache_create("buffer_head",
3054                         sizeof(struct buffer_head), 0,
3055                         SLAB_PANIC, init_buffer_head, NULL);
3056
3057         /*
3058          * Limit the bh occupancy to 10% of ZONE_NORMAL
3059          */
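        /*
         * Illustrative arithmetic (the figures are assumptions, not
         * measurements): with 4KB pages, ~220,000 free lowmem pages and
         * a buffer_head of roughly 64 bytes, nrpages ends up ~22,000
         * and max_buffer_heads ~22,000 * (4096 / 64), i.e. about 1.4
         * million buffer heads before buffer_heads_over_limit trips.
         */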
3060         nrpages = (nr_free_buffer_pages() * 10) / 100;
3061         max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3062         hotcpu_notifier(buffer_cpu_notify, 0);
3063 }
3064
3065 EXPORT_SYMBOL(__bforget);
3066 EXPORT_SYMBOL(__brelse);
3067 EXPORT_SYMBOL(__wait_on_buffer);
3068 EXPORT_SYMBOL(block_commit_write);
3069 EXPORT_SYMBOL(block_prepare_write);
3070 EXPORT_SYMBOL(block_read_full_page);
3071 EXPORT_SYMBOL(block_sync_page);
3072 EXPORT_SYMBOL(block_truncate_page);
3073 EXPORT_SYMBOL(block_write_full_page);
3074 EXPORT_SYMBOL(cont_prepare_write);
3075 EXPORT_SYMBOL(end_buffer_async_write);
3076 EXPORT_SYMBOL(end_buffer_read_sync);
3077 EXPORT_SYMBOL(end_buffer_write_sync);
3078 EXPORT_SYMBOL(file_fsync);
3079 EXPORT_SYMBOL(fsync_bdev);
3080 EXPORT_SYMBOL(generic_block_bmap);
3081 EXPORT_SYMBOL(generic_commit_write);
3082 EXPORT_SYMBOL(generic_cont_expand);
3083 EXPORT_SYMBOL(init_buffer);
3084 EXPORT_SYMBOL(invalidate_bdev);
3085 EXPORT_SYMBOL(ll_rw_block);
3086 EXPORT_SYMBOL(mark_buffer_dirty);
3087 EXPORT_SYMBOL(submit_bh);
3088 EXPORT_SYMBOL(sync_dirty_buffer);
3089 EXPORT_SYMBOL(unlock_buffer);