4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
21 #include <linux/config.h>
22 #include <linux/kernel.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/smp_lock.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/bio.h>
38 #include <linux/notifier.h>
39 #include <linux/cpu.h>
40 #include <asm/bitops.h>
42 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
43 static void invalidate_bh_lrus(void);
45 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
47 struct bh_wait_queue {
48 struct buffer_head *bh;
52 #define __DEFINE_BH_WAIT(name, b, f) \
53 struct bh_wait_queue name = { \
58 .func = bh_wake_function, \
60 LIST_HEAD_INIT(name.wait.task_list),\
63 #define DEFINE_BH_WAIT(name, bh) __DEFINE_BH_WAIT(name, bh, 0)
64 #define DEFINE_BH_WAIT_EXCLUSIVE(name, bh) \
65 __DEFINE_BH_WAIT(name, bh, WQ_FLAG_EXCLUSIVE)
68 * Hashed waitqueue_heads for wait_on_buffer()
70 #define BH_WAIT_TABLE_ORDER 7
71 static struct bh_wait_queue_head {
72 wait_queue_head_t wqh;
73 } ____cacheline_aligned_in_smp bh_wait_queue_heads[1<<BH_WAIT_TABLE_ORDER];
76 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
78 bh->b_end_io = handler;
79 bh->b_private = private;
83 * Return the address of the waitqueue_head to be used for this
86 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh)
88 return &bh_wait_queue_heads[hash_ptr(bh, BH_WAIT_TABLE_ORDER)].wqh;
90 EXPORT_SYMBOL(bh_waitq_head);
92 void wake_up_buffer(struct buffer_head *bh)
94 wait_queue_head_t *wq = bh_waitq_head(bh);
97 if (waitqueue_active(wq))
98 __wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 1, bh);
100 EXPORT_SYMBOL(wake_up_buffer);
102 static int bh_wake_function(wait_queue_t *wait, unsigned mode,
105 struct buffer_head *bh = key;
106 struct bh_wait_queue *wq;
108 wq = container_of(wait, struct bh_wait_queue, wait);
109 if (wq->bh != bh || buffer_locked(bh))
112 return autoremove_wake_function(wait, mode, sync, key);
115 static void sync_buffer(struct buffer_head *bh)
117 struct block_device *bd;
122 blk_run_address_space(bd->bd_inode->i_mapping);
125 void fastcall __lock_buffer(struct buffer_head *bh)
127 wait_queue_head_t *wqh = bh_waitq_head(bh);
128 DEFINE_BH_WAIT_EXCLUSIVE(wait, bh);
131 prepare_to_wait_exclusive(wqh, &wait.wait,
132 TASK_UNINTERRUPTIBLE);
133 if (buffer_locked(bh)) {
137 } while (test_set_buffer_locked(bh));
138 finish_wait(wqh, &wait.wait);
140 EXPORT_SYMBOL(__lock_buffer);
142 void fastcall unlock_buffer(struct buffer_head *bh)
144 clear_buffer_locked(bh);
145 smp_mb__after_clear_bit();
150 * Block until a buffer comes unlocked. This doesn't stop it
151 * from becoming locked again - you have to lock it yourself
152 * if you want to preserve its state.
154 void __wait_on_buffer(struct buffer_head * bh)
156 wait_queue_head_t *wqh = bh_waitq_head(bh);
157 DEFINE_BH_WAIT(wait, bh);
160 prepare_to_wait(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
161 if (buffer_locked(bh)) {
165 } while (buffer_locked(bh));
166 finish_wait(wqh, &wait.wait);
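/*
 * Illustrative sketch (not part of the original file): a typical caller of
 * the primitives above.  wait_on_buffer() only waits for the lock to drop;
 * a caller that needs the buffer to stay in a known state must take the
 * lock itself.  This mirrors what __bread_slow() does further down; the
 * function name example_read_buffer() is hypothetical.
 */
static int example_read_buffer(struct buffer_head *bh)
{
	lock_buffer(bh);			/* excludes concurrent I/O completion */
	if (!buffer_uptodate(bh)) {
		get_bh(bh);			/* reference dropped by end_io */
		bh->b_end_io = end_buffer_read_sync;
		submit_bh(READ, bh);
		wait_on_buffer(bh);		/* sleeps until end_io unlocks the bh */
	} else
		unlock_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}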
170 __set_page_buffers(struct page *page, struct buffer_head *head)
172 page_cache_get(page);
173 SetPagePrivate(page);
174 page->private = (unsigned long)head;
178 __clear_page_buffers(struct page *page)
180 ClearPagePrivate(page);
182 page_cache_release(page);
185 static void buffer_io_error(struct buffer_head *bh)
187 char b[BDEVNAME_SIZE];
189 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
190 bdevname(bh->b_bdev, b),
191 (unsigned long long)bh->b_blocknr);
195 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
196 * unlock the buffer. This is what ll_rw_block uses too.
198 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
201 set_buffer_uptodate(bh);
203 /* This happens, due to failed READA attempts. */
204 clear_buffer_uptodate(bh);
210 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
212 char b[BDEVNAME_SIZE];
215 set_buffer_uptodate(bh);
217 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
219 printk(KERN_WARNING "lost page write due to "
221 bdevname(bh->b_bdev, b));
223 set_buffer_write_io_error(bh);
224 clear_buffer_uptodate(bh);
231 * Write out and wait upon all the dirty data associated with a block
232 * device via its mapping. Does not take the superblock lock.
234 int sync_blockdev(struct block_device *bdev)
241 ret = filemap_fdatawrite(bdev->bd_inode->i_mapping);
242 err = filemap_fdatawait(bdev->bd_inode->i_mapping);
248 EXPORT_SYMBOL(sync_blockdev);
251 * Write out and wait upon all dirty data associated with this
252 * superblock. Filesystem data as well as the underlying block
253 * device. Takes the superblock lock.
255 int fsync_super(struct super_block *sb)
257 sync_inodes_sb(sb, 0);
260 if (sb->s_dirt && sb->s_op->write_super)
261 sb->s_op->write_super(sb);
263 if (sb->s_op->sync_fs)
264 sb->s_op->sync_fs(sb, 1);
265 sync_blockdev(sb->s_bdev);
266 sync_inodes_sb(sb, 1);
268 return sync_blockdev(sb->s_bdev);
272 * Write out and wait upon all dirty data associated with this
273 * device. Filesystem data as well as the underlying block
274 * device. Takes the superblock lock.
276 int fsync_bdev(struct block_device *bdev)
278 struct super_block *sb = get_super(bdev);
280 int res = fsync_super(sb);
284 return sync_blockdev(bdev);
288 * freeze_bdev -- lock a filesystem and force it into a consistent state
289 * @bdev: blockdevice to lock
291 * This takes the block device bd_mount_sem to make sure no new mounts
292 * happen on bdev until thaw_bdev() is called.
293 * If a superblock is found on this device, we take the s_umount semaphore
294 * on it to make sure nobody unmounts until the snapshot creation is done.
296 struct super_block *freeze_bdev(struct block_device *bdev)
298 struct super_block *sb;
300 down(&bdev->bd_mount_sem);
301 sb = get_super(bdev);
302 if (sb && !(sb->s_flags & MS_RDONLY)) {
303 sb->s_frozen = SB_FREEZE_WRITE;
306 sync_inodes_sb(sb, 0);
310 if (sb->s_dirt && sb->s_op->write_super)
311 sb->s_op->write_super(sb);
314 if (sb->s_op->sync_fs)
315 sb->s_op->sync_fs(sb, 1);
317 sync_blockdev(sb->s_bdev);
318 sync_inodes_sb(sb, 1);
320 sb->s_frozen = SB_FREEZE_TRANS;
323 sync_blockdev(sb->s_bdev);
325 if (sb->s_op->write_super_lockfs)
326 sb->s_op->write_super_lockfs(sb);
330 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
332 EXPORT_SYMBOL(freeze_bdev);
335 * thaw_bdev -- unlock filesystem
336 * @bdev: blockdevice to unlock
337 * @sb: associated superblock
339 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
341 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
344 BUG_ON(sb->s_bdev != bdev);
346 if (sb->s_op->unlockfs)
347 sb->s_op->unlockfs(sb);
348 sb->s_frozen = SB_UNFROZEN;
350 wake_up(&sb->s_wait_unfrozen);
354 up(&bdev->bd_mount_sem);
356 EXPORT_SYMBOL(thaw_bdev);
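/*
 * Illustrative sketch (not part of the original file): how a snapshot
 * driver is expected to pair freeze_bdev() with thaw_bdev().  The function
 * name example_take_snapshot() is hypothetical; the snapshot work itself
 * is elided.
 */
static int example_take_snapshot(struct block_device *bdev)
{
	struct super_block *sb;

	sb = freeze_bdev(bdev);	/* sync and freeze; blocks new mounts */
	/* ... copy or COW the device while it is quiescent ... */
	thaw_bdev(bdev, sb);	/* sb may be NULL if nothing was mounted */
	return 0;
}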
359 * sync everything. Start out by waking pdflush, because that writes back
360 * all queues in parallel.
362 static void do_sync(unsigned long wait)
365 sync_inodes(0); /* All mappings, inodes and their blockdevs */
367 sync_supers(); /* Write the superblocks */
368 sync_filesystems(0); /* Start syncing the filesystems */
369 sync_filesystems(wait); /* Waitingly sync the filesystems */
370 sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */
372 printk("Emergency Sync complete\n");
373 if (unlikely(laptop_mode))
374 laptop_sync_completion();
377 asmlinkage long sys_sync(void)
383 void emergency_sync(void)
385 pdflush_operation(do_sync, 0);
389 * Generic function to fsync a file.
391 * filp may be NULL if called via the msync of a vma.
394 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
396 struct inode * inode = dentry->d_inode;
397 struct super_block * sb;
400 /* sync the inode to buffers */
401 write_inode_now(inode, 0);
403 /* sync the superblock to buffers */
406 if (sb->s_op->write_super)
407 sb->s_op->write_super(sb);
410 /* .. finally sync the buffers to disk */
411 ret = sync_blockdev(sb->s_bdev);
415 asmlinkage long sys_fsync(unsigned int fd)
418 struct address_space *mapping;
426 mapping = file->f_mapping;
429 if (!file->f_op || !file->f_op->fsync) {
430 /* Why? We can still call filemap_fdatawrite */
434 /* We need to protect against concurrent writers.. */
435 down(&mapping->host->i_sem);
436 current->flags |= PF_SYNCWRITE;
437 ret = filemap_fdatawrite(mapping);
438 err = file->f_op->fsync(file, file->f_dentry, 0);
441 err = filemap_fdatawait(mapping);
444 current->flags &= ~PF_SYNCWRITE;
445 up(&mapping->host->i_sem);
453 asmlinkage long sys_fdatasync(unsigned int fd)
456 struct address_space *mapping;
465 if (!file->f_op || !file->f_op->fsync)
468 mapping = file->f_mapping;
470 down(&mapping->host->i_sem);
471 current->flags |= PF_SYNCWRITE;
472 ret = filemap_fdatawrite(mapping);
473 err = file->f_op->fsync(file, file->f_dentry, 1);
476 err = filemap_fdatawait(mapping);
479 current->flags &= ~PF_SYNCWRITE;
480 up(&mapping->host->i_sem);
489 * Various filesystems appear to want __find_get_block to be non-blocking.
490 * But it's the page lock which protects the buffers. To get around this,
491 * we get exclusion from try_to_free_buffers with the blockdev mapping's
494 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
495 * may be quite high. This code could TryLock the page, and if that
496 * succeeds, there is no need to take private_lock. (But if
497 * private_lock is contended then so is mapping->tree_lock).
499 static struct buffer_head *
500 __find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
502 struct inode *bd_inode = bdev->bd_inode;
503 struct address_space *bd_mapping = bd_inode->i_mapping;
504 struct buffer_head *ret = NULL;
506 struct buffer_head *bh;
507 struct buffer_head *head;
511 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
512 page = find_get_page(bd_mapping, index);
516 spin_lock(&bd_mapping->private_lock);
517 if (!page_has_buffers(page))
519 head = page_buffers(page);
522 if (bh->b_blocknr == block) {
527 if (!buffer_mapped(bh))
529 bh = bh->b_this_page;
530 } while (bh != head);
532 /* we might be here because some of the buffers on this page are
533 * not mapped. This is due to various races between
534 * file io on the block device and getblk. It gets dealt with
535 * elsewhere, don't buffer_error if we had some unmapped buffers
538 printk("__find_get_block_slow() failed. "
539 "block=%llu, b_blocknr=%llu\n",
540 (unsigned long long)block, (unsigned long long)bh->b_blocknr);
541 printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
542 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
545 spin_unlock(&bd_mapping->private_lock);
546 page_cache_release(page);
551 /* If invalidate_buffers() will trash dirty buffers, it means some kind
552 of fs corruption is going on. Trashing dirty data always implies losing
553 information that was supposed to be just stored on the physical layer
556 Thus invalidate_buffers in general usage is not allowed to trash
557 dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
558 be preserved. These buffers are simply skipped.
560 We also skip buffers which are still in use. For example this can
561 happen if a userspace program is reading the block device.
563 NOTE: In the case where the user removed a removable-media disk even though
564 there was still dirty data not synced to disk (due to a bug in the device
565 driver or to an error by the user), not destroying the dirty buffers could
566 generate corruption on the next media inserted as well; thus a parameter is
567 necessary to handle this case in the safest way possible (trying
568 not to corrupt the newly inserted disk with the data belonging to
569 the old, now corrupted, disk). Also, for the ramdisk the natural thing
570 to do in order to release the ramdisk memory is to destroy dirty buffers.
572 These are two special cases. Normal usage implies that the device driver
573 issues a sync on the device (without waiting for I/O completion) and
574 then an invalidate_buffers call that doesn't trash dirty buffers.
576 For handling cache coherency with the blkdev pagecache the 'update' case
577 has been introduced. It is needed to re-read from disk any pinned
578 buffer. NOTE: re-reading from disk is destructive so we can do it only
579 when we assume nobody is changing the buffercache under our I/O and when
580 we think the disk contains more recent information than the buffercache.
581 The update == 1 pass marks the buffers we need to update, the update == 2
582 pass does the actual I/O. */
583 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
585 invalidate_bh_lrus();
587 * FIXME: what about destroy_dirty_buffers?
588 * We really want to use invalidate_inode_pages2() for
589 * that, but not until that's cleaned up.
591 invalidate_inode_pages(bdev->bd_inode->i_mapping);
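/*
 * Illustrative sketch (not part of the original file): the "normal usage"
 * described in the comment above - a removable-media driver syncs the
 * device and then drops clean cached data on a media change, without
 * trashing dirty buffers.  example_media_changed() is hypothetical.
 */
static void example_media_changed(struct block_device *bdev)
{
	sync_blockdev(bdev);		/* start and wait for pending writeback */
	invalidate_bdev(bdev, 0);	/* 0: do not destroy dirty buffers */
}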
595 * Kick pdflush then try to free up some ZONE_NORMAL memory.
597 static void free_more_memory(void)
602 wakeup_bdflush(1024);
605 for_each_pgdat(pgdat) {
606 zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones;
608 try_to_free_pages(zones, GFP_NOFS, 0);
613 * I/O completion handler for block_read_full_page() - pages
614 * which come unlocked at the end of I/O.
616 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
618 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
620 struct buffer_head *tmp;
622 int page_uptodate = 1;
624 BUG_ON(!buffer_async_read(bh));
628 set_buffer_uptodate(bh);
630 clear_buffer_uptodate(bh);
636 * Be _very_ careful from here on. Bad things can happen if
637 * two buffer heads end IO at almost the same time and both
638 * decide that the page is now completely done.
640 spin_lock_irqsave(&page_uptodate_lock, flags);
641 clear_buffer_async_read(bh);
645 if (!buffer_uptodate(tmp))
647 if (buffer_async_read(tmp)) {
648 BUG_ON(!buffer_locked(tmp));
651 tmp = tmp->b_this_page;
653 spin_unlock_irqrestore(&page_uptodate_lock, flags);
656 * If none of the buffers had errors and they are all
657 * uptodate then we can set the page uptodate.
659 if (page_uptodate && !PageError(page))
660 SetPageUptodate(page);
665 spin_unlock_irqrestore(&page_uptodate_lock, flags);
670 * Completion handler for block_write_full_page() - pages which are unlocked
671 * during I/O, and which have PageWriteback cleared upon I/O completion.
673 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
675 char b[BDEVNAME_SIZE];
676 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
678 struct buffer_head *tmp;
681 BUG_ON(!buffer_async_write(bh));
685 set_buffer_uptodate(bh);
687 if (printk_ratelimit()) {
689 printk(KERN_WARNING "lost page write due to "
691 bdevname(bh->b_bdev, b));
693 set_bit(AS_EIO, &page->mapping->flags);
694 clear_buffer_uptodate(bh);
698 spin_lock_irqsave(&page_uptodate_lock, flags);
699 clear_buffer_async_write(bh);
701 tmp = bh->b_this_page;
703 if (buffer_async_write(tmp)) {
704 BUG_ON(!buffer_locked(tmp));
707 tmp = tmp->b_this_page;
709 spin_unlock_irqrestore(&page_uptodate_lock, flags);
710 end_page_writeback(page);
714 spin_unlock_irqrestore(&page_uptodate_lock, flags);
719 * If a page's buffers are under async readin (end_buffer_async_read
720 * completion) then there is a possibility that another thread of
721 * control could lock one of the buffers after it has completed
722 * but while some of the other buffers have not completed. This
723 * locked buffer would confuse end_buffer_async_read() into not unlocking
724 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
725 * that this buffer is not under async I/O.
727 * The page comes unlocked when it has no locked buffer_async buffers
730 * PageLocked prevents anyone starting new async I/O reads any of
733 * PageWriteback is used to prevent simultaneous writeout of the same
736 * PageLocked prevents anyone from starting writeback of a page which is
737 * under read I/O (PageWriteback is only ever set against a locked page).
739 static void mark_buffer_async_read(struct buffer_head *bh)
741 bh->b_end_io = end_buffer_async_read;
742 set_buffer_async_read(bh);
745 void mark_buffer_async_write(struct buffer_head *bh)
747 bh->b_end_io = end_buffer_async_write;
748 set_buffer_async_write(bh);
750 EXPORT_SYMBOL(mark_buffer_async_write);
754 * fs/buffer.c contains helper functions for buffer-backed address space's
755 * fsync functions. A common requirement for buffer-based filesystems is
756 * that certain data from the backing blockdev needs to be written out for
757 * a successful fsync(). For example, ext2 indirect blocks need to be
758 * written back and waited upon before fsync() returns.
760 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
761 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
762 * management of a list of dependent buffers at ->i_mapping->private_list.
764 * Locking is a little subtle: try_to_free_buffers() will remove buffers
765 * from their controlling inode's queue when they are being freed. But
766 * try_to_free_buffers() will be operating against the *blockdev* mapping
767 * at the time, not against the S_ISREG file which depends on those buffers.
768 * So the locking for private_list is via the private_lock in the address_space
769 * which backs the buffers. Which is different from the address_space
770 * against which the buffers are listed. So for a particular address_space,
771 * mapping->private_lock does *not* protect mapping->private_list! In fact,
772 * mapping->private_list will always be protected by the backing blockdev's
775 * Which introduces a requirement: all buffers on an address_space's
776 * ->private_list must be from the same address_space: the blockdev's.
778 * address_spaces which do not place buffers at ->private_list via these
779 * utility functions are free to use private_lock and private_list for
780 * whatever they want. The only requirement is that list_empty(private_list)
781 * be true at clear_inode() time.
783 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
784 * filesystems should do that. invalidate_inode_buffers() should just go
785 * BUG_ON(!list_empty).
787 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
788 * take an address_space, not an inode. And it should be called
789 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
792 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
793 * list if it is already on a list. Because if the buffer is on a list,
794 * it *must* already be on the right one. If not, the filesystem is being
795 * silly. This will save a ton of locking. But first we have to ensure
796 * that buffers are taken *off* the old inode's list when they are freed
797 * (presumably in truncate). That requires careful auditing of all
798 * filesystems (do it inside bforget()). It could also be done by bringing
803 * The buffer's backing address_space's private_lock must be held
805 static inline void __remove_assoc_queue(struct buffer_head *bh)
807 list_del_init(&bh->b_assoc_buffers);
810 int inode_has_buffers(struct inode *inode)
812 return !list_empty(&inode->i_data.private_list);
816 * osync is designed to support O_SYNC io. It waits synchronously for
817 * all already-submitted IO to complete, but does not queue any new
818 * writes to the disk.
820 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
821 * you dirty the buffers, and then use osync_inode_buffers to wait for
822 * completion. Any other dirty buffers which are not yet queued for
823 * write will not be flushed to disk by the osync.
825 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
827 struct buffer_head *bh;
833 list_for_each_prev(p, list) {
835 if (buffer_locked(bh)) {
839 if (!buffer_uptodate(bh))
851 * sync_mapping_buffers - write out and wait upon a mapping's "associated"
853 * @buffer_mapping - the mapping which backs the buffers' data
854 * @mapping - the mapping which wants those buffers written
856 * Starts I/O against the buffers at mapping->private_list, and waits upon
859 * Basically, this is a convenience function for fsync(). @buffer_mapping is
860 * the blockdev which "owns" the buffers and @mapping is a file or directory
861 * which needs those buffers to be written for a successful fsync().
863 int sync_mapping_buffers(struct address_space *mapping)
865 struct address_space *buffer_mapping = mapping->assoc_mapping;
867 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
870 return fsync_buffers_list(&buffer_mapping->private_lock,
871 &mapping->private_list);
873 EXPORT_SYMBOL(sync_mapping_buffers);
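/*
 * Illustrative sketch (not part of the original file): the fsync() pattern
 * sync_mapping_buffers() exists for.  A buffer-based filesystem (ext2 does
 * essentially this) writes and waits upon its dependent metadata buffers,
 * then writes the inode itself.  example_fsync() is a hypothetical
 * file_operations->fsync implementation.
 */
static int example_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;
	int err;

	err = sync_mapping_buffers(inode->i_mapping);	/* ->private_list buffers */
	if (!(inode->i_state & I_DIRTY))
		return err;
	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
		return err;
	write_inode_now(inode, 1);			/* the inode is dirty too */
	return err;
}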
876 * Called when we've recently written block `bblock', and it is known that
877 * `bblock' was for a buffer_boundary() buffer. This means that the block at
878 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
879 * dirty, schedule it for IO. So that indirects merge nicely with their data.
881 void write_boundary_block(struct block_device *bdev,
882 sector_t bblock, unsigned blocksize)
884 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
886 if (buffer_dirty(bh))
887 ll_rw_block(WRITE, 1, &bh);
892 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
894 struct address_space *mapping = inode->i_mapping;
895 struct address_space *buffer_mapping = bh->b_page->mapping;
897 mark_buffer_dirty(bh);
898 if (!mapping->assoc_mapping) {
899 mapping->assoc_mapping = buffer_mapping;
901 if (mapping->assoc_mapping != buffer_mapping)
904 if (list_empty(&bh->b_assoc_buffers)) {
905 spin_lock(&buffer_mapping->private_lock);
906 list_move_tail(&bh->b_assoc_buffers,
907 &mapping->private_list);
908 spin_unlock(&buffer_mapping->private_lock);
911 EXPORT_SYMBOL(mark_buffer_dirty_inode);
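/*
 * Illustrative sketch (not part of the original file): how a filesystem is
 * expected to use mark_buffer_dirty_inode(), so the buffer lands on the
 * inode's ->private_list and is later written by sync_mapping_buffers() at
 * fsync time.  example_update_indirect() and its arguments are hypothetical.
 */
static int example_update_indirect(struct inode *inode, sector_t blocknr,
				   int offset, __u32 new_block)
{
	struct buffer_head *bh;

	bh = sb_bread(inode->i_sb, blocknr);	/* read the indirect block */
	if (!bh)
		return -EIO;
	((__u32 *)bh->b_data)[offset] = cpu_to_le32(new_block);
	mark_buffer_dirty_inode(bh, inode);	/* queue on the inode's list */
	brelse(bh);
	return 0;
}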
914 * Add a page to the dirty page list.
916 * It is a sad fact of life that this function is called from several places
917 * deeply under spinlocking. It may not sleep.
919 * If the page has buffers, the uptodate buffers are set dirty, to preserve
920 * dirty-state coherency between the page and the buffers. If the page does
921 * not have buffers then when they are later attached they will all be set
924 * The buffers are dirtied before the page is dirtied. There's a small race
925 * window in which a writepage caller may see the page cleanness but not the
926 * buffer dirtiness. That's fine. If this code were to set the page dirty
927 * before the buffers, a concurrent writepage caller could clear the page dirty
928 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
929 * page on the dirty page list.
931 * We use private_lock to lock against try_to_free_buffers while using the
932 * page's buffer list. Also use this to protect against clean buffers being
933 * added to the page after it was set dirty.
935 * FIXME: may need to call ->reservepage here as well. That's rather up to the
936 * address_space though.
938 int __set_page_dirty_buffers(struct page *page)
940 struct address_space * const mapping = page->mapping;
942 spin_lock(&mapping->private_lock);
943 if (page_has_buffers(page)) {
944 struct buffer_head *head = page_buffers(page);
945 struct buffer_head *bh = head;
948 set_buffer_dirty(bh);
949 bh = bh->b_this_page;
950 } while (bh != head);
952 spin_unlock(&mapping->private_lock);
954 if (!TestSetPageDirty(page)) {
955 spin_lock_irq(&mapping->tree_lock);
956 if (page->mapping) { /* Race with truncate? */
957 if (!mapping->backing_dev_info->memory_backed)
958 inc_page_state(nr_dirty);
959 radix_tree_tag_set(&mapping->page_tree,
961 PAGECACHE_TAG_DIRTY);
963 spin_unlock_irq(&mapping->tree_lock);
964 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
969 EXPORT_SYMBOL(__set_page_dirty_buffers);
972 * Write out and wait upon a list of buffers.
974 * We have conflicting pressures: we want to make sure that all
975 * initially dirty buffers get waited on, but that any subsequently
976 * dirtied buffers don't. After all, we don't want fsync to last
977 * forever if somebody is actively writing to the file.
979 * Do this in two main stages: first we copy dirty buffers to a
980 * temporary inode list, queueing the writes as we go. Then we clean
981 * up, waiting for those writes to complete.
983 * During this second stage, any subsequent updates to the file may end
984 * up refiling the buffer on the original inode's dirty list again, so
985 * there is a chance we will end up with a buffer queued for write but
986 * not yet completed on that list. So, as a final cleanup we go through
987 * the osync code to catch these locked, dirty buffers without requeuing
988 * any newly dirty buffers for write.
990 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
992 struct buffer_head *bh;
993 struct list_head tmp;
996 INIT_LIST_HEAD(&tmp);
999 while (!list_empty(list)) {
1000 bh = BH_ENTRY(list->next);
1001 list_del_init(&bh->b_assoc_buffers);
1002 if (buffer_dirty(bh) || buffer_locked(bh)) {
1003 list_add(&bh->b_assoc_buffers, &tmp);
1004 if (buffer_dirty(bh)) {
1008 * Ensure any pending I/O completes so that
1009 * ll_rw_block() actually writes the current
1010 * contents - it is a noop if I/O is still in
1011 * flight on potentially older contents.
1014 ll_rw_block(WRITE, 1, &bh);
1021 while (!list_empty(&tmp)) {
1022 bh = BH_ENTRY(tmp.prev);
1023 __remove_assoc_queue(bh);
1027 if (!buffer_uptodate(bh))
1034 err2 = osync_buffers_list(lock, list);
1042 * Invalidate any and all dirty buffers on a given inode. We are
1043 * probably unmounting the fs, but that doesn't mean we have already
1044 * done a sync(). Just drop the buffers from the inode list.
1046 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
1047 * assumes that all the buffers are against the blockdev. Not true
1050 void invalidate_inode_buffers(struct inode *inode)
1052 if (inode_has_buffers(inode)) {
1053 struct address_space *mapping = &inode->i_data;
1054 struct list_head *list = &mapping->private_list;
1055 struct address_space *buffer_mapping = mapping->assoc_mapping;
1057 spin_lock(&buffer_mapping->private_lock);
1058 while (!list_empty(list))
1059 __remove_assoc_queue(BH_ENTRY(list->next));
1060 spin_unlock(&buffer_mapping->private_lock);
1065 * Remove any clean buffers from the inode's buffer list. This is called
1066 * when we're trying to free the inode itself. Those buffers can pin it.
1068 * Returns true if all buffers were removed.
1070 int remove_inode_buffers(struct inode *inode)
1074 if (inode_has_buffers(inode)) {
1075 struct address_space *mapping = &inode->i_data;
1076 struct list_head *list = &mapping->private_list;
1077 struct address_space *buffer_mapping = mapping->assoc_mapping;
1079 spin_lock(&buffer_mapping->private_lock);
1080 while (!list_empty(list)) {
1081 struct buffer_head *bh = BH_ENTRY(list->next);
1082 if (buffer_dirty(bh)) {
1086 __remove_assoc_queue(bh);
1088 spin_unlock(&buffer_mapping->private_lock);
1094 * Create the appropriate buffers when given a page for data area and
1095 * the size of each buffer.. Use the bh->b_this_page linked list to
1096 * follow the buffers created. Return NULL if unable to create more
1099 * The retry flag is used to differentiate async IO (paging, swapping)
1100 * which may not fail from ordinary buffer allocations.
1102 static struct buffer_head *
1103 create_buffers(struct page * page, unsigned long size, int retry)
1105 struct buffer_head *bh, *head;
1111 while ((offset -= size) >= 0) {
1112 bh = alloc_buffer_head(GFP_NOFS);
1117 bh->b_this_page = head;
1122 atomic_set(&bh->b_count, 0);
1125 /* Link the buffer to its page */
1126 set_bh_page(bh, page, offset);
1128 bh->b_end_io = NULL;
1132 * In case anything failed, we just free everything we got.
1138 head = head->b_this_page;
1139 free_buffer_head(bh);
1144 * Return failure for non-async IO requests. Async IO requests
1145 * are not allowed to fail, so we have to wait until buffer heads
1146 * become available. But we don't want tasks sleeping with
1147 * partially complete buffers, so all were released above.
1152 /* We're _really_ low on memory. Now we just
1153 * wait for old buffer heads to become free due to
1154 * finishing IO. Since this is an async request and
1155 * the reserve list is empty, we're sure there are
1156 * async buffer heads in use.
1163 link_dev_buffers(struct page *page, struct buffer_head *head)
1165 struct buffer_head *bh, *tail;
1170 bh = bh->b_this_page;
1172 tail->b_this_page = head;
1173 __set_page_buffers(page, head);
1177 * Initialise the state of a blockdev page's buffers.
1180 init_page_buffers(struct page *page, struct block_device *bdev,
1181 sector_t block, int size)
1183 struct buffer_head *head = page_buffers(page);
1184 struct buffer_head *bh = head;
1185 int uptodate = PageUptodate(page);
1188 if (!buffer_mapped(bh)) {
1189 init_buffer(bh, NULL, NULL);
1191 bh->b_blocknr = block;
1193 set_buffer_uptodate(bh);
1194 set_buffer_mapped(bh);
1197 bh = bh->b_this_page;
1198 } while (bh != head);
1202 * Create the page-cache page that contains the requested block.
1204 * This is used purely for blockdev mappings.
1206 static struct page *
1207 grow_dev_page(struct block_device *bdev, sector_t block,
1208 pgoff_t index, int size)
1210 struct inode *inode = bdev->bd_inode;
1212 struct buffer_head *bh;
1214 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1218 if (!PageLocked(page))
1221 if (page_has_buffers(page)) {
1222 bh = page_buffers(page);
1223 if (bh->b_size == size) {
1224 init_page_buffers(page, bdev, block, size);
1227 if (!try_to_free_buffers(page))
1232 * Allocate some buffers for this page
1234 bh = create_buffers(page, size, 0);
1239 * Link the page to the buffers and initialise them. Take the
1240 * lock to be atomic wrt __find_get_block(), which does not
1241 * run under the page lock.
1243 spin_lock(&inode->i_mapping->private_lock);
1244 link_dev_buffers(page, bh);
1245 init_page_buffers(page, bdev, block, size);
1246 spin_unlock(&inode->i_mapping->private_lock);
1252 page_cache_release(page);
1257 * Create buffers for the specified block device block's page. If
1258 * that page was dirty, the buffers are set dirty also.
1260 * Except that's a bug. Attaching dirty buffers to a dirty
1261 * blockdev's page can result in filesystem corruption, because
1262 * some of those buffers may be aliases of filesystem data.
1263 * grow_dev_page() will go BUG() if this happens.
1266 grow_buffers(struct block_device *bdev, sector_t block, int size)
1275 } while ((size << sizebits) < PAGE_SIZE);
1277 index = block >> sizebits;
1278 block = index << sizebits;
1280 /* Create a page with the proper size buffers.. */
1281 page = grow_dev_page(bdev, block, index, size);
1285 page_cache_release(page);
1289 struct buffer_head *
1290 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1292 /* Size must be a multiple of the hard sector size */
1293 if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1294 (size < 512 || size > PAGE_SIZE))) {
1295 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1297 printk(KERN_ERR "hardsect size: %d\n",
1298 bdev_hardsect_size(bdev));
1305 struct buffer_head * bh;
1307 bh = __find_get_block(bdev, block, size);
1311 if (!grow_buffers(bdev, block, size))
1317 * The relationship between dirty buffers and dirty pages:
1319 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1320 * the page is tagged dirty in its radix tree.
1322 * At all times, the dirtiness of the buffers represents the dirtiness of
1323 * subsections of the page. If the page has buffers, the page dirty bit is
1324 * merely a hint about the true dirty state.
1326 * When a page is set dirty in its entirety, all its buffers are marked dirty
1327 * (if the page has buffers).
1329 * When a buffer is marked dirty, its page is dirtied, but the page's other
1332 * Also. When blockdev buffers are explicitly read with bread(), they
1333 * individually become uptodate. But their backing page remains not
1334 * uptodate - even if all of its buffers are uptodate. A subsequent
1335 * block_read_full_page() against that page will discover all the uptodate
1336 * buffers, will set the page uptodate and will perform no I/O.
1340 * mark_buffer_dirty - mark a buffer_head as needing writeout
1342 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1343 * backing page dirty, then tag the page as dirty in its address_space's radix
1344 * tree and then attach the address_space's inode to its superblock's dirty
1347 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1348 * mapping->tree_lock and the global inode_lock.
1350 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1352 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1353 __set_page_dirty_nobuffers(bh->b_page);
1357 * Decrement a buffer_head's reference count. If all buffers against a page
1358 * have zero reference count, are clean and unlocked, and if the page is clean
1359 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1360 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1361 * a page but it ends up not being freed, and buffers may later be reattached).
1363 void __brelse(struct buffer_head * buf)
1365 if (atomic_read(&buf->b_count)) {
1369 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1374 * bforget() is like brelse(), except it discards any
1375 * potentially dirty data.
1377 void __bforget(struct buffer_head *bh)
1379 clear_buffer_dirty(bh);
1380 if (!list_empty(&bh->b_assoc_buffers)) {
1381 struct address_space *buffer_mapping = bh->b_page->mapping;
1383 spin_lock(&buffer_mapping->private_lock);
1384 list_del_init(&bh->b_assoc_buffers);
1385 spin_unlock(&buffer_mapping->private_lock);
1390 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1393 if (buffer_uptodate(bh)) {
1398 bh->b_end_io = end_buffer_read_sync;
1399 submit_bh(READ, bh);
1401 if (buffer_uptodate(bh))
1409 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1410 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1411 * refcount elevated by one when they're in an LRU. A buffer can only appear
1412 * once in a particular CPU's LRU. A single buffer can be present in multiple
1413 * CPU's LRUs at the same time.
1415 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1416 * sb_find_get_block().
1418 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1419 * a local interrupt disable for that.
1422 #define BH_LRU_SIZE 8
1425 struct buffer_head *bhs[BH_LRU_SIZE];
1428 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1431 #define bh_lru_lock() local_irq_disable()
1432 #define bh_lru_unlock() local_irq_enable()
1434 #define bh_lru_lock() preempt_disable()
1435 #define bh_lru_unlock() preempt_enable()
1438 static inline void check_irqs_on(void)
1440 #ifdef irqs_disabled
1441 BUG_ON(irqs_disabled());
1446 * The LRU management algorithm is dopey-but-simple. Sorry.
1448 static void bh_lru_install(struct buffer_head *bh)
1450 struct buffer_head *evictee = NULL;
1455 lru = &__get_cpu_var(bh_lrus);
1456 if (lru->bhs[0] != bh) {
1457 struct buffer_head *bhs[BH_LRU_SIZE];
1463 for (in = 0; in < BH_LRU_SIZE; in++) {
1464 struct buffer_head *bh2 = lru->bhs[in];
1469 if (out >= BH_LRU_SIZE) {
1470 BUG_ON(evictee != NULL);
1477 while (out < BH_LRU_SIZE)
1479 memcpy(lru->bhs, bhs, sizeof(bhs));
1488 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1490 static inline struct buffer_head *
1491 lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1493 struct buffer_head *ret = NULL;
1499 lru = &__get_cpu_var(bh_lrus);
1500 for (i = 0; i < BH_LRU_SIZE; i++) {
1501 struct buffer_head *bh = lru->bhs[i];
1503 if (bh && bh->b_bdev == bdev &&
1504 bh->b_blocknr == block && bh->b_size == size) {
1507 lru->bhs[i] = lru->bhs[i - 1];
1522 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1523 * it in the LRU and mark it as accessed. If it is not present then return
1526 struct buffer_head *
1527 __find_get_block(struct block_device *bdev, sector_t block, int size)
1529 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1532 bh = __find_get_block_slow(bdev, block, size);
1540 EXPORT_SYMBOL(__find_get_block);
1543 * __getblk will locate (and, if necessary, create) the buffer_head
1544 * which corresponds to the passed block_device, block and size. The
1545 * returned buffer has its reference count incremented.
1547 * __getblk() cannot fail - it just keeps trying. If you pass it an
1548 * illegal block number, __getblk() will happily return a buffer_head
1549 * which represents the non-existent block. Very weird.
1551 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1552 * attempt is failing. FIXME, perhaps?
1554 struct buffer_head *
1555 __getblk(struct block_device *bdev, sector_t block, int size)
1557 struct buffer_head *bh = __find_get_block(bdev, block, size);
1561 bh = __getblk_slow(bdev, block, size);
1564 EXPORT_SYMBOL(__getblk);
1567 * Do async read-ahead on a buffer..
1569 void __breadahead(struct block_device *bdev, sector_t block, int size)
1571 struct buffer_head *bh = __getblk(bdev, block, size);
1572 ll_rw_block(READA, 1, &bh);
1575 EXPORT_SYMBOL(__breadahead);
1578 * __bread() - reads a specified block and returns the bh
1579 * @block: number of block
1580 * @size: size (in bytes) to read
1582 * Reads a specified block, and returns buffer head that contains it.
1583 * It returns NULL if the block was unreadable.
1585 struct buffer_head *
1586 __bread(struct block_device *bdev, sector_t block, int size)
1588 struct buffer_head *bh = __getblk(bdev, block, size);
1590 if (!buffer_uptodate(bh))
1591 bh = __bread_slow(bh);
1594 EXPORT_SYMBOL(__bread);
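/*
 * Illustrative sketch (not part of the original file): typical use of the
 * lookup helpers above.  __bread() goes through the per-cpu LRU and returns
 * the buffer uptodate (or NULL on I/O error).  The block number and the
 * function name example_read_block_one() are hypothetical.
 */
static int example_read_block_one(struct block_device *bdev, int blocksize)
{
	struct buffer_head *bh;

	bh = __bread(bdev, 1, blocksize);	/* block 1, e.g. a superblock */
	if (!bh)
		return -EIO;			/* block was unreadable */
	/* ... interpret bh->b_data ... */
	brelse(bh);				/* drop the reference __bread took */
	return 0;
}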
1597 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1598 * This doesn't race because it runs in each cpu either in irq
1599 * or with preempt disabled.
1601 static void invalidate_bh_lru(void *arg)
1603 struct bh_lru *b = &get_cpu_var(bh_lrus);
1606 for (i = 0; i < BH_LRU_SIZE; i++) {
1610 put_cpu_var(bh_lrus);
1613 static void invalidate_bh_lrus(void)
1615 on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1618 void set_bh_page(struct buffer_head *bh,
1619 struct page *page, unsigned long offset)
1622 if (offset >= PAGE_SIZE)
1624 if (PageHighMem(page))
1626 * This catches illegal uses and preserves the offset:
1628 bh->b_data = (char *)(0 + offset);
1630 bh->b_data = page_address(page) + offset;
1632 EXPORT_SYMBOL(set_bh_page);
1635 * Called when truncating a buffer on a page completely.
1637 static inline void discard_buffer(struct buffer_head * bh)
1640 clear_buffer_dirty(bh);
1642 clear_buffer_mapped(bh);
1643 clear_buffer_req(bh);
1644 clear_buffer_new(bh);
1645 clear_buffer_delay(bh);
1650 * try_to_release_page() - release old fs-specific metadata on a page
1652 * @page: the page which the kernel is trying to free
1653 * @gfp_mask: memory allocation flags (and I/O mode)
1655 * The address_space is to try to release any data against the page
1656 * (presumably at page->private). If the release was successful, return `1'.
1657 * Otherwise return zero.
1659 * The @gfp_mask argument specifies whether I/O may be performed to release
1660 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1662 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1664 int try_to_release_page(struct page *page, int gfp_mask)
1666 struct address_space * const mapping = page->mapping;
1668 BUG_ON(!PageLocked(page));
1669 if (PageWriteback(page))
1672 if (mapping && mapping->a_ops->releasepage)
1673 return mapping->a_ops->releasepage(page, gfp_mask);
1674 return try_to_free_buffers(page);
1676 EXPORT_SYMBOL(try_to_release_page);
1679 * block_invalidatepage - invalidate part or all of a buffer-backed page
1681 * @page: the page which is affected
1682 * @offset: the index of the truncation point
1684 * block_invalidatepage() is called when all or part of the page has become
1685 * invalidated by a truncate operation.
1687 * block_invalidatepage() does not have to release all buffers, but it must
1688 * ensure that no dirty buffer is left outside @offset and that no I/O
1689 * is underway against any of the blocks which are outside the truncation
1690 * point. Because the caller is about to free (and possibly reuse) those
1693 int block_invalidatepage(struct page *page, unsigned long offset)
1695 struct buffer_head *head, *bh, *next;
1696 unsigned int curr_off = 0;
1699 BUG_ON(!PageLocked(page));
1700 if (!page_has_buffers(page))
1703 head = page_buffers(page);
1706 unsigned int next_off = curr_off + bh->b_size;
1707 next = bh->b_this_page;
1710 * is this block fully invalidated?
1712 if (offset <= curr_off)
1714 curr_off = next_off;
1716 } while (bh != head);
1719 * We release buffers only if the entire page is being invalidated.
1720 * The get_block cached value has been unconditionally invalidated,
1721 * so real IO is not possible anymore.
1724 ret = try_to_release_page(page, 0);
1728 EXPORT_SYMBOL(block_invalidatepage);
1731 * We attach and possibly dirty the buffers atomically wrt
1732 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1733 * is already excluded via the page lock.
1735 void create_empty_buffers(struct page *page,
1736 unsigned long blocksize, unsigned long b_state)
1738 struct buffer_head *bh, *head, *tail;
1740 head = create_buffers(page, blocksize, 1);
1743 bh->b_state |= b_state;
1745 bh = bh->b_this_page;
1747 tail->b_this_page = head;
1749 spin_lock(&page->mapping->private_lock);
1750 if (PageUptodate(page) || PageDirty(page)) {
1753 if (PageDirty(page))
1754 set_buffer_dirty(bh);
1755 if (PageUptodate(page))
1756 set_buffer_uptodate(bh);
1757 bh = bh->b_this_page;
1758 } while (bh != head);
1760 __set_page_buffers(page, head);
1761 spin_unlock(&page->mapping->private_lock);
1763 EXPORT_SYMBOL(create_empty_buffers);
1766 * We are taking a block for data and we don't want any output from any
1767 * buffer-cache aliases starting from return from that function and
1768 * until the moment when something will explicitly mark the buffer
1769 * dirty (hopefully that will not happen until we will free that block ;-)
1770 * We don't even need to mark it not-uptodate - nobody can expect
1771 * anything from a newly allocated buffer anyway. We used to use
1772 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1773 * don't want to mark the alias unmapped, for example - it would confuse
1774 * anyone who might pick it with bread() afterwards...
1776 * Also.. Note that bforget() doesn't lock the buffer. So there can
1777 * be writeout I/O going on against recently-freed buffers. We don't
1778 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1779 * only if we really need to. That happens here.
1781 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1783 struct buffer_head *old_bh;
1787 old_bh = __find_get_block_slow(bdev, block, 0);
1789 clear_buffer_dirty(old_bh);
1790 wait_on_buffer(old_bh);
1791 clear_buffer_req(old_bh);
1795 EXPORT_SYMBOL(unmap_underlying_metadata);
1798 * NOTE! All mapped/uptodate combinations are valid:
1800 *	Mapped	Uptodate	Meaning
1802 *	No	No		"unknown" - must do get_block()
1803 *	No	Yes		"hole" - zero-filled
1804 *	Yes	No		"allocated" - allocated on disk, not read in
1805 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1807 * "Dirty" is valid only with the last case (mapped+uptodate).
1811 * While block_write_full_page is writing back the dirty buffers under
1812 * the page lock, whoever dirtied the buffers may decide to clean them
1813 * again at any time. We handle that by only looking at the buffer
1814 * state inside lock_buffer().
1816 * If block_write_full_page() is called for regular writeback
1817 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1818 * locked buffer. This only can happen if someone has written the buffer
1819 * directly, with submit_bh(). At the address_space level PageWriteback
1820 * prevents this contention from occurring.
1822 static int __block_write_full_page(struct inode *inode, struct page *page,
1823 get_block_t *get_block, struct writeback_control *wbc)
1827 sector_t last_block;
1828 struct buffer_head *bh, *head;
1829 int nr_underway = 0;
1831 BUG_ON(!PageLocked(page));
1833 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1835 if (!page_has_buffers(page)) {
1836 create_empty_buffers(page, 1 << inode->i_blkbits,
1837 (1 << BH_Dirty)|(1 << BH_Uptodate));
1841 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1842 * here, and the (potentially unmapped) buffers may become dirty at
1843 * any time. If a buffer becomes dirty here after we've inspected it
1844 * then we just miss that fact, and the page stays dirty.
1846 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1847 * handle that here by just cleaning them.
1850 block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1851 head = page_buffers(page);
1855 * Get all the dirty buffers mapped to disk addresses and
1856 * handle any aliases from the underlying blockdev's mapping.
1859 if (block > last_block) {
1861 * mapped buffers outside i_size will occur, because
1862 * this page can be outside i_size when there is a
1863 * truncate in progress.
1866 * The buffer was zeroed by block_write_full_page()
1868 clear_buffer_dirty(bh);
1869 set_buffer_uptodate(bh);
1870 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1871 err = get_block(inode, block, bh, 1);
1874 if (buffer_new(bh)) {
1875 /* blockdev mappings never come here */
1876 clear_buffer_new(bh);
1877 unmap_underlying_metadata(bh->b_bdev,
1881 bh = bh->b_this_page;
1883 } while (bh != head);
1887 if (!buffer_mapped(bh))
1890 * If it's a fully non-blocking write attempt and we cannot
1891 * lock the buffer then redirty the page. Note that this can
1892 * potentially cause a busy-wait loop from pdflush and kswapd
1893 * activity, but those code paths have their own higher-level
1896 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1898 } else if (test_set_buffer_locked(bh)) {
1899 redirty_page_for_writepage(wbc, page);
1902 if (test_clear_buffer_dirty(bh)) {
1903 mark_buffer_async_write(bh);
1907 } while ((bh = bh->b_this_page) != head);
1910 * The page and its buffers are protected by PageWriteback(), so we can
1911 * drop the bh refcounts early.
1913 BUG_ON(PageWriteback(page));
1914 set_page_writeback(page);
1918 struct buffer_head *next = bh->b_this_page;
1919 if (buffer_async_write(bh)) {
1920 submit_bh(WRITE, bh);
1925 } while (bh != head);
1929 if (nr_underway == 0) {
1931 * The page was marked dirty, but the buffers were
1932 * clean. Someone wrote them back by hand with
1933 * ll_rw_block/submit_bh. A rare case.
1937 if (!buffer_uptodate(bh)) {
1941 bh = bh->b_this_page;
1942 } while (bh != head);
1944 SetPageUptodate(page);
1945 end_page_writeback(page);
1947 * The page and buffer_heads can be released at any time from
1950 wbc->pages_skipped++; /* We didn't write this page */
1956 * ENOSPC, or some other error. We may already have added some
1957 * blocks to the file, so we need to write these out to avoid
1958 * exposing stale data.
1959 * The page is currently locked and not marked for writeback
1962 /* Recovery: lock and submit the mapped buffers */
1965 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1967 mark_buffer_async_write(bh);
1970 * The buffer may have been set dirty during
1971 * attachment to a dirty page.
1973 clear_buffer_dirty(bh);
1975 } while ((bh = bh->b_this_page) != head);
1977 BUG_ON(PageWriteback(page));
1978 set_page_writeback(page);
1981 struct buffer_head *next = bh->b_this_page;
1982 if (buffer_async_write(bh)) {
1983 clear_buffer_dirty(bh);
1984 submit_bh(WRITE, bh);
1989 } while (bh != head);
1993 static int __block_prepare_write(struct inode *inode, struct page *page,
1994 unsigned from, unsigned to, get_block_t *get_block)
1996 unsigned block_start, block_end;
1999 unsigned blocksize, bbits;
2000 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
2002 BUG_ON(!PageLocked(page));
2003 BUG_ON(from > PAGE_CACHE_SIZE);
2004 BUG_ON(to > PAGE_CACHE_SIZE);
2007 blocksize = 1 << inode->i_blkbits;
2008 if (!page_has_buffers(page))
2009 create_empty_buffers(page, blocksize, 0);
2010 head = page_buffers(page);
2012 bbits = inode->i_blkbits;
2013 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
2015 for(bh = head, block_start = 0; bh != head || !block_start;
2016 block++, block_start=block_end, bh = bh->b_this_page) {
2017 block_end = block_start + blocksize;
2018 if (block_end <= from || block_start >= to) {
2019 if (PageUptodate(page)) {
2020 if (!buffer_uptodate(bh))
2021 set_buffer_uptodate(bh);
2026 clear_buffer_new(bh);
2027 if (!buffer_mapped(bh)) {
2028 err = get_block(inode, block, bh, 1);
2031 if (buffer_new(bh)) {
2032 clear_buffer_new(bh);
2033 unmap_underlying_metadata(bh->b_bdev,
2035 if (PageUptodate(page)) {
2036 set_buffer_uptodate(bh);
2039 if (block_end > to || block_start < from) {
2042 kaddr = kmap_atomic(page, KM_USER0);
2046 if (block_start < from)
2047 memset(kaddr+block_start,
2048 0, from-block_start);
2049 flush_dcache_page(page);
2050 kunmap_atomic(kaddr, KM_USER0);
2055 if (PageUptodate(page)) {
2056 if (!buffer_uptodate(bh))
2057 set_buffer_uptodate(bh);
2060 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2061 (block_start < from || block_end > to)) {
2062 ll_rw_block(READ, 1, &bh);
2067 * If we issued read requests - let them complete.
2069 while(wait_bh > wait) {
2070 wait_on_buffer(*--wait_bh);
2071 if (!buffer_uptodate(*wait_bh))
2077 * Zero out any newly allocated blocks to avoid exposing stale
2078 * data. If BH_New is set, we know that the block was newly
2079 * allocated in the above loop.
2084 block_end = block_start+blocksize;
2085 if (block_end <= from)
2087 if (block_start >= to)
2089 if (buffer_new(bh)) {
2092 clear_buffer_new(bh);
2093 kaddr = kmap_atomic(page, KM_USER0);
2094 memset(kaddr+block_start, 0, bh->b_size);
2095 kunmap_atomic(kaddr, KM_USER0);
2096 set_buffer_uptodate(bh);
2097 mark_buffer_dirty(bh);
2100 block_start = block_end;
2101 bh = bh->b_this_page;
2102 } while (bh != head);
2106 static int __block_commit_write(struct inode *inode, struct page *page,
2107 unsigned from, unsigned to)
2109 unsigned block_start, block_end;
2112 struct buffer_head *bh, *head;
2114 blocksize = 1 << inode->i_blkbits;
2116 for(bh = head = page_buffers(page), block_start = 0;
2117 bh != head || !block_start;
2118 block_start=block_end, bh = bh->b_this_page) {
2119 block_end = block_start + blocksize;
2120 if (block_end <= from || block_start >= to) {
2121 if (!buffer_uptodate(bh))
2124 set_buffer_uptodate(bh);
2125 mark_buffer_dirty(bh);
2130 * If this is a partial write which happened to make all buffers
2131 * uptodate then we can optimize away a bogus readpage() for
2132 * the next read(). Here we 'discover' whether the page went
2133 * uptodate as a result of this (potentially partial) write.
2136 SetPageUptodate(page);
2141 * Generic "read page" function for block devices that have the normal
2142 * get_block functionality. This is most of the block device filesystems.
2143 * Reads the page asynchronously --- the unlock_buffer() and
2144 * set/clear_buffer_uptodate() functions propagate buffer state into the
2145 * page struct once IO has completed.
2147 int block_read_full_page(struct page *page, get_block_t *get_block)
2149 struct inode *inode = page->mapping->host;
2150 sector_t iblock, lblock;
2151 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2152 unsigned int blocksize;
2154 int fully_mapped = 1;
2156 if (!PageLocked(page))
2158 blocksize = 1 << inode->i_blkbits;
2159 if (!page_has_buffers(page))
2160 create_empty_buffers(page, blocksize, 0);
2161 head = page_buffers(page);
2163 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2164 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2170 if (buffer_uptodate(bh))
2173 if (!buffer_mapped(bh)) {
2175 if (iblock < lblock) {
2176 if (get_block(inode, iblock, bh, 0))
2179 if (!buffer_mapped(bh)) {
2180 void *kaddr = kmap_atomic(page, KM_USER0);
2181 memset(kaddr + i * blocksize, 0, blocksize);
2182 flush_dcache_page(page);
2183 kunmap_atomic(kaddr, KM_USER0);
2184 set_buffer_uptodate(bh);
2188 * get_block() might have updated the buffer
2191 if (buffer_uptodate(bh))
2195 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2198 SetPageMappedToDisk(page);
2202 * All buffers are uptodate - we can set the page uptodate
2203 * as well. But not if get_block() returned an error.
2205 if (!PageError(page))
2206 SetPageUptodate(page);
2211 /* Stage two: lock the buffers */
2212 for (i = 0; i < nr; i++) {
2215 mark_buffer_async_read(bh);
2219 * Stage 3: start the IO. Check for uptodateness
2220 * inside the buffer lock in case another process reading
2221 * the underlying blockdev brought it uptodate (the sct fix).
2223 for (i = 0; i < nr; i++) {
2225 if (buffer_uptodate(bh))
2226 end_buffer_async_read(bh, 1);
2228 submit_bh(READ, bh);
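/*
 * Illustrative sketch (not part of the original file): the usual way a
 * block-based filesystem wires block_read_full_page() into its ->readpage,
 * in the ext2 style.  example_get_block() is a hypothetical get_block_t
 * supplied by the filesystem.
 */
static int example_get_block(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh_result, int create);

static int example_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, example_get_block);
}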
2233 /* utility function for filesystems that need to do work on expanding
2234 * truncates. Uses prepare/commit_write to allow the filesystem to
2235 * deal with the hole.
2237 int generic_cont_expand(struct inode *inode, loff_t size)
2239 struct address_space *mapping = inode->i_mapping;
2241 unsigned long index, offset, limit;
2245 limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2246 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2247 send_sig(SIGXFSZ, current, 0);
2250 if (size > inode->i_sb->s_maxbytes)
2253 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
2255 /* ugh. in prepare/commit_write, if from==to==start of block, we
2256 ** skip the prepare. make sure we never send an offset for the start
2259 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2262 index = size >> PAGE_CACHE_SHIFT;
2264 page = grab_cache_page(mapping, index);
2267 err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2269 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2272 page_cache_release(page);
2280 * For moronic filesystems that do not allow holes in files.
2281 * We may have to extend the file.
2284 int cont_prepare_write(struct page *page, unsigned offset,
2285 unsigned to, get_block_t *get_block, loff_t *bytes)
2287 struct address_space *mapping = page->mapping;
2288 struct inode *inode = mapping->host;
2289 struct page *new_page;
2293 unsigned blocksize = 1 << inode->i_blkbits;
2296 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2298 new_page = grab_cache_page(mapping, pgpos);
2301 /* we might sleep */
2302 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2303 unlock_page(new_page);
2304 page_cache_release(new_page);
2307 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2308 if (zerofrom & (blocksize-1)) {
2309 *bytes |= (blocksize-1);
2312 status = __block_prepare_write(inode, new_page, zerofrom,
2313 PAGE_CACHE_SIZE, get_block);
2316 kaddr = kmap_atomic(new_page, KM_USER0);
2317 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2318 flush_dcache_page(new_page);
2319 kunmap_atomic(kaddr, KM_USER0);
2320 __block_commit_write(inode, new_page,
2321 zerofrom, PAGE_CACHE_SIZE);
2322 unlock_page(new_page);
2323 page_cache_release(new_page);
2326 if (page->index < pgpos) {
2327 /* completely inside the area */
2330 /* page covers the boundary, find the boundary offset */
2331 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2333 /* if we will expand the thing, the last block will be filled */
2334 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2335 *bytes |= (blocksize-1);
2339 /* starting below the boundary? Nothing to zero out */
2340 if (offset <= zerofrom)
2343 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2346 if (zerofrom < offset) {
2347 kaddr = kmap_atomic(page, KM_USER0);
2348 memset(kaddr+zerofrom, 0, offset-zerofrom);
2349 flush_dcache_page(page);
2350 kunmap_atomic(kaddr, KM_USER0);
2351 __block_commit_write(inode, page, zerofrom, offset);
2355 ClearPageUptodate(page);
2359 ClearPageUptodate(new_page);
2360 unlock_page(new_page);
2361 page_cache_release(new_page);
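/*
 * Illustrative sketch, not part of this file: a hole-less filesystem
 * typically wires cont_prepare_write() into its ->prepare_write(),
 * passing its get_block routine and a pointer to a per-inode record of
 * how far the file has been zeroed/allocated.  Both "examplefs" helpers
 * are hypothetical.
 */
extern int examplefs_get_block(struct inode *, sector_t,
				struct buffer_head *, int);
extern loff_t *examplefs_zeroed_bytes(struct inode *inode);

static int examplefs_prepare_write(struct file *file, struct page *page,
				unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;

	return cont_prepare_write(page, from, to, examplefs_get_block,
				examplefs_zeroed_bytes(inode));
}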
2366 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2367 get_block_t *get_block)
2369 struct inode *inode = page->mapping->host;
2370 int err = __block_prepare_write(inode, page, from, to, get_block);
2372 ClearPageUptodate(page);
2376 int block_commit_write(struct page *page, unsigned from, unsigned to)
2378 struct inode *inode = page->mapping->host;
2379 __block_commit_write(inode,page,from,to);
2383 int generic_commit_write(struct file *file, struct page *page,
2384 unsigned from, unsigned to)
2386 struct inode *inode = page->mapping->host;
2387 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2388 __block_commit_write(inode,page,from,to);
2390 * No need to use i_size_read() here, the i_size
2391 * cannot change under us because we hold i_sem.
2393 if (pos > inode->i_size) {
2394 i_size_write(inode, pos);
2395 mark_inode_dirty(inode);
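/*
 * Illustrative sketch, not part of this file: the classic buffer-head
 * write path pairs block_prepare_write() with generic_commit_write() in
 * a filesystem's address_space_operations (other methods omitted, names
 * hypothetical).
 */
extern int examplefs_get_block(struct inode *, sector_t,
				struct buffer_head *, int);

static int examplefs_classic_prepare_write(struct file *file,
			struct page *page, unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, examplefs_get_block);
}

static struct address_space_operations examplefs_aops = {
	.prepare_write	= examplefs_classic_prepare_write,
	.commit_write	= generic_commit_write,
	/* .readpage, .writepage, .sync_page etc. would go here too */
};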
2402 * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
2403 * immediately, while under the page lock. So it needs a special end_io
2404 * handler which does not touch the bh after unlocking it.
2406 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
2407 * a race there is benign: unlock_buffer() only uses the bh's address for
2408 * hashing after unlocking the buffer, so it doesn't actually touch the bh itself.
2411 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2414 set_buffer_uptodate(bh);
2416 /* This happens, due to failed READA attempts. */
2417 clear_buffer_uptodate(bh);
2423 * On entry, the page is fully not uptodate.
2424 * On exit the page is fully uptodate in the areas outside (from,to)
2426 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2427 get_block_t *get_block)
2429 struct inode *inode = page->mapping->host;
2430 const unsigned blkbits = inode->i_blkbits;
2431 const unsigned blocksize = 1 << blkbits;
2432 struct buffer_head map_bh;
2433 struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2434 unsigned block_in_page;
2435 unsigned block_start;
2436 sector_t block_in_file;
2441 int is_mapped_to_disk = 1;
2444 if (PageMappedToDisk(page))
2447 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2448 map_bh.b_page = page;
2451 * We loop across all blocks in the page, whether or not they are
2452 * part of the affected region. This is so we can discover if the
2453 * page is fully mapped-to-disk.
2455 for (block_start = 0, block_in_page = 0;
2456 block_start < PAGE_CACHE_SIZE;
2457 block_in_page++, block_start += blocksize) {
2458 unsigned block_end = block_start + blocksize;
2463 if (block_start >= to)
2465 ret = get_block(inode, block_in_file + block_in_page,
2469 if (!buffer_mapped(&map_bh))
2470 is_mapped_to_disk = 0;
2471 if (buffer_new(&map_bh))
2472 unmap_underlying_metadata(map_bh.b_bdev,
2474 if (PageUptodate(page))
2476 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2477 kaddr = kmap_atomic(page, KM_USER0);
2478 if (block_start < from) {
2479 memset(kaddr+block_start, 0, from-block_start);
2482 if (block_end > to) {
2483 memset(kaddr + to, 0, block_end - to);
2486 flush_dcache_page(page);
2487 kunmap_atomic(kaddr, KM_USER0);
2490 if (buffer_uptodate(&map_bh))
2491 continue; /* reiserfs does this */
2492 if (block_start < from || block_end > to) {
2493 struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2499 bh->b_state = map_bh.b_state;
2500 atomic_set(&bh->b_count, 0);
2501 bh->b_this_page = NULL;
2503 bh->b_blocknr = map_bh.b_blocknr;
2504 bh->b_size = blocksize;
2505 bh->b_data = (char *)(long)block_start;
2506 bh->b_bdev = map_bh.b_bdev;
2507 bh->b_private = NULL;
2508 read_bh[nr_reads++] = bh;
2513 struct buffer_head *bh;
2516 * The page is locked, so these buffers are protected from
2517 * any VM or truncate activity. Hence we don't need to care
2518 * for the buffer_head refcounts.
2520 for (i = 0; i < nr_reads; i++) {
2523 bh->b_end_io = end_buffer_read_nobh;
2524 submit_bh(READ, bh);
2526 for (i = 0; i < nr_reads; i++) {
2529 if (!buffer_uptodate(bh))
2531 free_buffer_head(bh);
2538 if (is_mapped_to_disk)
2539 SetPageMappedToDisk(page);
2540 SetPageUptodate(page);
2543 * Setting the page dirty here isn't necessary for the prepare_write
2544 * function - commit_write will do that. But if/when this function is
2545 * used within the pagefault handler to ensure that all mmapped pages
2546 * have backing space in the filesystem, we will need to dirty the page
2547 * if its contents were altered.
2550 set_page_dirty(page);
2555 for (i = 0; i < nr_reads; i++) {
2557 free_buffer_head(read_bh[i]);
2561 * Error recovery is pretty slack. Clear the page and mark it dirty
2562 * so we'll later zero out any blocks which _were_ allocated.
2564 kaddr = kmap_atomic(page, KM_USER0);
2565 memset(kaddr, 0, PAGE_CACHE_SIZE);
2566 kunmap_atomic(kaddr, KM_USER0);
2567 SetPageUptodate(page);
2568 set_page_dirty(page);
2571 EXPORT_SYMBOL(nobh_prepare_write);
2573 int nobh_commit_write(struct file *file, struct page *page,
2574 unsigned from, unsigned to)
2576 struct inode *inode = page->mapping->host;
2577 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2579 set_page_dirty(page);
2580 if (pos > inode->i_size) {
2581 i_size_write(inode, pos);
2582 mark_inode_dirty(inode);
2586 EXPORT_SYMBOL(nobh_commit_write);
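/*
 * Illustrative sketch, not part of this file: a filesystem opting into
 * the nobh write path pairs a thin nobh_prepare_write() wrapper with
 * nobh_commit_write() in its address_space_operations (ext2's "nobh"
 * mount option does something similar; the names here are hypothetical).
 */
extern int examplefs_get_block(struct inode *, sector_t,
				struct buffer_head *, int);

static int examplefs_nobh_prepare_write(struct file *file,
			struct page *page, unsigned from, unsigned to)
{
	return nobh_prepare_write(page, from, to, examplefs_get_block);
}

static struct address_space_operations examplefs_nobh_aops = {
	.prepare_write	= examplefs_nobh_prepare_write,
	.commit_write	= nobh_commit_write,
	/* remaining methods omitted */
};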
2589 * This function assumes that ->prepare_write() uses nobh_prepare_write().
2591 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2593 struct inode *inode = mapping->host;
2594 unsigned blocksize = 1 << inode->i_blkbits;
2595 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2596 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2599 struct address_space_operations *a_ops = mapping->a_ops;
2603 if ((offset & (blocksize - 1)) == 0)
2607 page = grab_cache_page(mapping, index);
2611 to = (offset + blocksize) & ~(blocksize - 1);
2612 ret = a_ops->prepare_write(NULL, page, offset, to);
2614 kaddr = kmap_atomic(page, KM_USER0);
2615 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2616 flush_dcache_page(page);
2617 kunmap_atomic(kaddr, KM_USER0);
2618 set_page_dirty(page);
2621 page_cache_release(page);
2625 EXPORT_SYMBOL(nobh_truncate_page);
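/*
 * Illustrative sketch, not part of this file: a filesystem using the
 * nobh aops would zero the partial last block from its ->truncate()
 * roughly like this (examplefs_truncate is hypothetical; the block
 * freeing that follows is filesystem-specific).
 */
static void examplefs_truncate(struct inode *inode)
{
	nobh_truncate_page(inode->i_mapping, inode->i_size);
	/* ...free the now-unused blocks beyond i_size... */
}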
2627 int block_truncate_page(struct address_space *mapping,
2628 loff_t from, get_block_t *get_block)
2630 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2631 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2634 unsigned length, pos;
2635 struct inode *inode = mapping->host;
2637 struct buffer_head *bh;
2641 blocksize = 1 << inode->i_blkbits;
2642 length = offset & (blocksize - 1);
2644 /* Block boundary? Nothing to do */
2648 length = blocksize - length;
2649 iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2651 page = grab_cache_page(mapping, index);
2656 if (!page_has_buffers(page))
2657 create_empty_buffers(page, blocksize, 0);
2659 /* Find the buffer that contains "offset" */
2660 bh = page_buffers(page);
2662 while (offset >= pos) {
2663 bh = bh->b_this_page;
2669 if (!buffer_mapped(bh)) {
2670 err = get_block(inode, iblock, bh, 0);
2673 /* unmapped? It's a hole - nothing to do */
2674 if (!buffer_mapped(bh))
2678 /* Ok, it's mapped. Make sure it's up-to-date */
2679 if (PageUptodate(page))
2680 set_buffer_uptodate(bh);
2682 if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2684 ll_rw_block(READ, 1, &bh);
2686 /* Uhhuh. Read error. Complain and punt. */
2687 if (!buffer_uptodate(bh))
2691 kaddr = kmap_atomic(page, KM_USER0);
2692 memset(kaddr + offset, 0, length);
2693 flush_dcache_page(page);
2694 kunmap_atomic(kaddr, KM_USER0);
2696 mark_buffer_dirty(bh);
2701 page_cache_release(page);
2707 * The generic ->writepage function for buffer-backed address_spaces
2709 int block_write_full_page(struct page *page, get_block_t *get_block,
2710 struct writeback_control *wbc)
2712 struct inode * const inode = page->mapping->host;
2713 loff_t i_size = i_size_read(inode);
2714 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2718 /* Is the page fully inside i_size? */
2719 if (page->index < end_index)
2720 return __block_write_full_page(inode, page, get_block, wbc);
2722 /* Is the page fully outside i_size? (truncate in progress) */
2723 offset = i_size & (PAGE_CACHE_SIZE-1);
2724 if (page->index >= end_index+1 || !offset) {
2726 * The page may have dirty, unmapped buffers. For example,
2727 * they may have been added in ext3_writepage(). Make them
2728 * freeable here, so the page does not leak.
2730 block_invalidatepage(page, 0);
2732 return 0; /* don't care */
2736 * The page straddles i_size. It must be zeroed out on each and every
2737 * writepage invocation because it may be mmapped. "A file is mapped
2738 * in multiples of the page size. For a file that is not a multiple of
2739 * the page size, the remaining memory is zeroed when mapped, and
2740 * writes to that region are not written out to the file."
2742 kaddr = kmap_atomic(page, KM_USER0);
2743 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2744 flush_dcache_page(page);
2745 kunmap_atomic(kaddr, KM_USER0);
2746 return __block_write_full_page(inode, page, get_block, wbc);
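/*
 * Illustrative sketch, not part of this file: a filesystem's ->writepage
 * is usually a one-line wrapper that supplies its get_block routine
 * (names hypothetical).
 */
extern int examplefs_get_block(struct inode *, sector_t,
				struct buffer_head *, int);

static int examplefs_writepage(struct page *page,
			struct writeback_control *wbc)
{
	return block_write_full_page(page, examplefs_get_block, wbc);
}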
2749 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2750 get_block_t *get_block)
2752 struct buffer_head tmp;
2753 struct inode *inode = mapping->host;
2756 get_block(inode, block, &tmp, 0);
2757 return tmp.b_blocknr;
2760 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2762 struct buffer_head *bh = bio->bi_private;
2767 if (err == -EOPNOTSUPP) {
2768 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2769 set_bit(BH_Eopnotsupp, &bh->b_state);
2772 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2777 int submit_bh(int rw, struct buffer_head * bh)
2782 BUG_ON(!buffer_locked(bh));
2783 BUG_ON(!buffer_mapped(bh));
2784 BUG_ON(!bh->b_end_io);
2786 if (buffer_ordered(bh) && (rw == WRITE))
2790 * Only clear out a write error when rewriting, should this
2791 * include WRITE_SYNC as well?
2793 if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2794 clear_buffer_write_io_error(bh);
2797 * from here on down, it's all bio -- do the initial mapping,
2798 * submit_bio -> generic_make_request may further map this bio around
2800 bio = bio_alloc(GFP_NOIO, 1);
2802 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2803 bio->bi_bdev = bh->b_bdev;
2804 bio->bi_io_vec[0].bv_page = bh->b_page;
2805 bio->bi_io_vec[0].bv_len = bh->b_size;
2806 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2810 bio->bi_size = bh->b_size;
2812 bio->bi_end_io = end_bio_bh_io_sync;
2813 bio->bi_private = bh;
2816 submit_bio(rw, bio);
2818 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2826 * ll_rw_block: low-level access to block devices (DEPRECATED)
2827 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2828 * @nr: number of &struct buffer_heads in the array
2829 * @bhs: array of pointers to &struct buffer_head
2831 * ll_rw_block() takes an array of pointers to &struct buffer_heads,
2832 * and requests an I/O operation on them, either a %READ or a %WRITE.
2833 * The third %READA option is described in the documentation for
2834 * generic_make_request() which ll_rw_block() calls.
2836 * This function drops any buffer that it cannot get a lock on (with the
2837 * BH_Lock state bit), any buffer that appears to be clean when doing a
2838 * write request, and any buffer that appears to be up-to-date when doing
2839 * a read request. Further, it marks as clean the buffers that are processed for
2840 * writing (the buffer cache won't assume that they are actually clean until
2841 * the buffer gets unlocked).
2843 * ll_rw_block sets b_end_io to a simple completion handler that marks
2844 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes any waiters.
2847 * All of the buffers must be for the same device, and must also be a
2848 * multiple of the current approved size for the device.
2850 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2854 for (i = 0; i < nr; i++) {
2855 struct buffer_head *bh = bhs[i];
2857 if (test_set_buffer_locked(bh))
2862 bh->b_end_io = end_buffer_write_sync;
2863 if (test_clear_buffer_dirty(bh)) {
2864 submit_bh(WRITE, bh);
2868 bh->b_end_io = end_buffer_read_sync;
2869 if (!buffer_uptodate(bh)) {
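/*
 * Illustrative sketch, not part of this file: the classic (now
 * deprecated) pattern described above - submit a read on a buffer and
 * wait for it, re-checking uptodateness because ll_rw_block() silently
 * skips buffers it cannot lock.
 */
static int example_read_buffer(struct buffer_head *bh)
{
	if (buffer_uptodate(bh))
		return 0;
	ll_rw_block(READ, 1, &bh);
	wait_on_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}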
2880 * For a data-integrity writeout, we need to wait upon any in-progress I/O
2881 * and then start new I/O and then wait upon it. The caller must have a ref on
2884 int sync_dirty_buffer(struct buffer_head *bh)
2888 WARN_ON(atomic_read(&bh->b_count) < 1);
2890 if (test_clear_buffer_dirty(bh)) {
2892 bh->b_end_io = end_buffer_write_sync;
2893 ret = submit_bh(WRITE, bh);
2895 if (buffer_eopnotsupp(bh)) {
2896 clear_buffer_eopnotsupp(bh);
2899 if (!ret && !buffer_uptodate(bh))
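/*
 * Illustrative sketch, not part of this file: a data-integrity write of
 * a single metadata buffer - the caller already holds a reference and
 * has modified bh->b_data.
 */
static int example_flush_metadata(struct buffer_head *bh)
{
	mark_buffer_dirty(bh);
	return sync_dirty_buffer(bh);	/* writes and waits; 0 or -EIO */
}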
2908 * try_to_free_buffers() checks if all the buffers on this particular page
2909 * are unused, and releases them if so.
2911 * Exclusion against try_to_free_buffers may be obtained by either
2912 * locking the page or by holding its mapping's private_lock.
2914 * If the page is dirty but all the buffers are clean then we need to
2915 * be sure to mark the page clean as well. This is because the page
2916 * may be against a block device, and a later reattachment of buffers
2917 * to a dirty page will set *all* buffers dirty. Which would corrupt
2918 * filesystem data on the same device.
2920 * The same applies to regular filesystem pages: if all the buffers are
2921 * clean then we set the page clean and proceed. To do that, we require
2922 * total exclusion from __set_page_dirty_buffers(). That is obtained with
2925 * try_to_free_buffers() is non-blocking.
2927 static inline int buffer_busy(struct buffer_head *bh)
2929 return atomic_read(&bh->b_count) |
2930 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2934 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2936 struct buffer_head *head = page_buffers(page);
2937 struct buffer_head *bh;
2941 if (buffer_write_io_error(bh))
2942 set_bit(AS_EIO, &page->mapping->flags);
2943 if (buffer_busy(bh))
2945 bh = bh->b_this_page;
2946 } while (bh != head);
2949 struct buffer_head *next = bh->b_this_page;
2951 if (!list_empty(&bh->b_assoc_buffers))
2952 __remove_assoc_queue(bh);
2954 } while (bh != head);
2955 *buffers_to_free = head;
2956 __clear_page_buffers(page);
2962 int try_to_free_buffers(struct page *page)
2964 struct address_space * const mapping = page->mapping;
2965 struct buffer_head *buffers_to_free = NULL;
2968 BUG_ON(!PageLocked(page));
2969 if (PageWriteback(page))
2972 if (mapping == NULL) { /* can this still happen? */
2973 ret = drop_buffers(page, &buffers_to_free);
2977 spin_lock(&mapping->private_lock);
2978 ret = drop_buffers(page, &buffers_to_free);
2981 * If the filesystem writes its buffers by hand (eg ext3)
2982 * then we can have clean buffers against a dirty page. We
2983 * clean the page here; otherwise later reattachment of buffers
2984 * could encounter a non-uptodate page, which is unresolvable.
2985 * This only applies in the rare case where try_to_free_buffers
2986 * succeeds but the page is not freed.
2988 clear_page_dirty(page);
2990 spin_unlock(&mapping->private_lock);
2992 if (buffers_to_free) {
2993 struct buffer_head *bh = buffers_to_free;
2996 struct buffer_head *next = bh->b_this_page;
2997 free_buffer_head(bh);
2999 } while (bh != buffers_to_free);
3003 EXPORT_SYMBOL(try_to_free_buffers);
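/*
 * Illustrative sketch, not part of this file: a filesystem with no
 * private per-page state can point ->releasepage at a thin wrapper
 * around try_to_free_buffers() (names hypothetical; journalling
 * filesystems must quiesce their own references first).
 */
static int examplefs_releasepage(struct page *page, int gfp_mask)
{
	/* gfp_mask unused here: try_to_free_buffers() never blocks */
	return try_to_free_buffers(page);
}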
3005 int block_sync_page(struct page *page)
3007 struct address_space *mapping;
3010 mapping = page_mapping(page);
3012 blk_run_backing_dev(mapping->backing_dev_info, page);
3017 * There are no bdflush tunables left. But distributions are
3018 * still running obsolete flush daemons, so we terminate them here.
3020 * Use of bdflush() is deprecated and will be removed in a future kernel.
3021 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3023 asmlinkage long sys_bdflush(int func, long data)
3025 static int msg_count;
3027 if (!capable(CAP_SYS_ADMIN))
3030 if (msg_count < 5) {
3033 "warning: process `%s' used the obsolete bdflush"
3034 " system call\n", current->comm);
3035 printk(KERN_INFO "Fix your initscripts?\n");
3044 * Buffer-head allocation
3046 static kmem_cache_t *bh_cachep;
3049 * Once the number of bh's in the machine exceeds this level, we start
3050 * stripping them in writeback.
3052 static int max_buffer_heads;
3054 int buffer_heads_over_limit;
3056 struct bh_accounting {
3057 int nr; /* Number of live bh's */
3058 int ratelimit; /* Limit cacheline bouncing */
3061 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3063 static void recalc_bh_state(void)
3068 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3070 __get_cpu_var(bh_accounting).ratelimit = 0;
3072 tot += per_cpu(bh_accounting, i).nr;
3073 buffer_heads_over_limit = (tot > max_buffer_heads);
3076 struct buffer_head *alloc_buffer_head(int gfp_flags)
3078 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3081 __get_cpu_var(bh_accounting).nr++;
3087 EXPORT_SYMBOL(alloc_buffer_head);
3089 void free_buffer_head(struct buffer_head *bh)
3091 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3092 kmem_cache_free(bh_cachep, bh);
3094 __get_cpu_var(bh_accounting).nr--;
3098 EXPORT_SYMBOL(free_buffer_head);
3101 init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
3103 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
3104 SLAB_CTOR_CONSTRUCTOR) {
3105 struct buffer_head * bh = (struct buffer_head *)data;
3107 memset(bh, 0, sizeof(*bh));
3108 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3112 #ifdef CONFIG_HOTPLUG_CPU
3113 static void buffer_exit_cpu(int cpu)
3116 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3118 for (i = 0; i < BH_LRU_SIZE; i++) {
3124 static int buffer_cpu_notify(struct notifier_block *self,
3125 unsigned long action, void *hcpu)
3127 if (action == CPU_DEAD)
3128 buffer_exit_cpu((unsigned long)hcpu);
3131 #endif /* CONFIG_HOTPLUG_CPU */
3133 void __init buffer_init(void)
3138 bh_cachep = kmem_cache_create("buffer_head",
3139 sizeof(struct buffer_head), 0,
3140 SLAB_PANIC, init_buffer_head, NULL);
3141 for (i = 0; i < ARRAY_SIZE(bh_wait_queue_heads); i++)
3142 init_waitqueue_head(&bh_wait_queue_heads[i].wqh);
3145 * Limit the bh occupancy to 10% of ZONE_NORMAL
3147 nrpages = (nr_free_buffer_pages() * 10) / 100;
3148 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
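	/*
	 * Worked example (illustrative figures only): with 4 KiB pages and
	 * a buffer_head of roughly 50-60 bytes, PAGE_SIZE/sizeof(bh) is
	 * about 70, so a machine with ~100000 free buffer pages ends up
	 * with nrpages = 10000 and max_buffer_heads around 700000.
	 */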
3149 hotcpu_notifier(buffer_cpu_notify, 0);
3152 EXPORT_SYMBOL(__bforget);
3153 EXPORT_SYMBOL(__brelse);
3154 EXPORT_SYMBOL(__wait_on_buffer);
3155 EXPORT_SYMBOL(block_commit_write);
3156 EXPORT_SYMBOL(block_prepare_write);
3157 EXPORT_SYMBOL(block_read_full_page);
3158 EXPORT_SYMBOL(block_sync_page);
3159 EXPORT_SYMBOL(block_truncate_page);
3160 EXPORT_SYMBOL(block_write_full_page);
3161 EXPORT_SYMBOL(cont_prepare_write);
3162 EXPORT_SYMBOL(end_buffer_async_write);
3163 EXPORT_SYMBOL(end_buffer_read_sync);
3164 EXPORT_SYMBOL(end_buffer_write_sync);
3165 EXPORT_SYMBOL(file_fsync);
3166 EXPORT_SYMBOL(fsync_bdev);
3167 EXPORT_SYMBOL(generic_block_bmap);
3168 EXPORT_SYMBOL(generic_commit_write);
3169 EXPORT_SYMBOL(generic_cont_expand);
3170 EXPORT_SYMBOL(init_buffer);
3171 EXPORT_SYMBOL(invalidate_bdev);
3172 EXPORT_SYMBOL(ll_rw_block);
3173 EXPORT_SYMBOL(mark_buffer_dirty);
3174 EXPORT_SYMBOL(submit_bh);
3175 EXPORT_SYMBOL(sync_dirty_buffer);
3176 EXPORT_SYMBOL(unlock_buffer);