4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
21 #include <linux/config.h>
22 #include <linux/kernel.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/smp_lock.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/bio.h>
38 #include <linux/notifier.h>
39 #include <linux/cpu.h>
40 #include <asm/bitops.h>
42 static void invalidate_bh_lrus(void);
44 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
47 * Hashed waitqueue_head's for wait_on_buffer()
49 #define BH_WAIT_TABLE_ORDER 7
50 static struct bh_wait_queue_head {
51 wait_queue_head_t wqh;
52 } ____cacheline_aligned_in_smp bh_wait_queue_heads[1<<BH_WAIT_TABLE_ORDER];
55 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
57 bh->b_end_io = handler;
58 bh->b_private = private;
62 * Return the address of the waitqueue_head to be used for this
65 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh)
67 return &bh_wait_queue_heads[hash_ptr(bh, BH_WAIT_TABLE_ORDER)].wqh;
69 EXPORT_SYMBOL(bh_waitq_head);
71 void wake_up_buffer(struct buffer_head *bh)
73 wait_queue_head_t *wq = bh_waitq_head(bh);
76 if (waitqueue_active(wq))
79 EXPORT_SYMBOL(wake_up_buffer);
81 void fastcall unlock_buffer(struct buffer_head *bh)
83 clear_buffer_locked(bh);
84 smp_mb__after_clear_bit();
89 * Block until a buffer comes unlocked. This doesn't stop it
90 * from becoming locked again - you have to lock it yourself
91 * if you want to preserve its state.
93 void __wait_on_buffer(struct buffer_head * bh)
95 wait_queue_head_t *wqh = bh_waitq_head(bh);
99 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
100 if (buffer_locked(bh)) {
101 struct block_device *bd;
105 blk_run_address_space(bd->bd_inode->i_mapping);
108 } while (buffer_locked(bh));
109 finish_wait(wqh, &wait);
113 __set_page_buffers(struct page *page, struct buffer_head *head)
115 page_cache_get(page);
116 SetPagePrivate(page);
117 page->private = (unsigned long)head;
121 __clear_page_buffers(struct page *page)
123 ClearPagePrivate(page);
125 page_cache_release(page);
128 static void buffer_io_error(struct buffer_head *bh)
130 char b[BDEVNAME_SIZE];
132 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
133 bdevname(bh->b_bdev, b),
134 (unsigned long long)bh->b_blocknr);
138 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
139 * unlock the buffer. This is what ll_rw_block uses too.
141 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
144 set_buffer_uptodate(bh);
146 /* This happens, due to failed READA attempts. */
147 clear_buffer_uptodate(bh);
153 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
155 char b[BDEVNAME_SIZE];
158 set_buffer_uptodate(bh);
160 if (printk_ratelimit()) {
162 printk(KERN_WARNING "lost page write due to "
164 bdevname(bh->b_bdev, b));
166 set_buffer_write_io_error(bh);
167 clear_buffer_uptodate(bh);
174 * Write out and wait upon all the dirty data associated with a block
175 * device via its mapping. Does not take the superblock lock.
177 int sync_blockdev(struct block_device *bdev)
184 ret = filemap_fdatawrite(bdev->bd_inode->i_mapping);
185 err = filemap_fdatawait(bdev->bd_inode->i_mapping);
191 EXPORT_SYMBOL(sync_blockdev);
194 * Write out and wait upon all dirty data associated with this
195 * superblock. Filesystem data as well as the underlying block
196 * device. Takes the superblock lock.
198 int fsync_super(struct super_block *sb)
200 sync_inodes_sb(sb, 0);
203 if (sb->s_dirt && sb->s_op->write_super)
204 sb->s_op->write_super(sb);
206 if (sb->s_op->sync_fs)
207 sb->s_op->sync_fs(sb, 1);
208 sync_blockdev(sb->s_bdev);
209 sync_inodes_sb(sb, 1);
211 return sync_blockdev(sb->s_bdev);
215 * Write out and wait upon all dirty data associated with this
216 * device. Filesystem data as well as the underlying block
217 * device. Takes the superblock lock.
219 int fsync_bdev(struct block_device *bdev)
221 struct super_block *sb = get_super(bdev);
223 int res = fsync_super(sb);
227 return sync_blockdev(bdev);
231 * freeze_bdev -- lock a filesystem and force it into a consistent state
232 * @bdev: blockdevice to lock
234 * This takes the block device bd_mount_sem to make sure no new mounts
235 * happen on bdev until thaw_bdev() is called.
236 * If a superblock is found on this device, we take the s_umount semaphore
237 * on it to make sure nobody unmounts until the snapshot creation is done.
239 struct super_block *freeze_bdev(struct block_device *bdev)
241 struct super_block *sb;
243 down(&bdev->bd_mount_sem);
244 sb = get_super(bdev);
245 if (sb && !(sb->s_flags & MS_RDONLY)) {
246 sb->s_frozen = SB_FREEZE_WRITE;
249 sync_inodes_sb(sb, 0);
253 if (sb->s_dirt && sb->s_op->write_super)
254 sb->s_op->write_super(sb);
257 if (sb->s_op->sync_fs)
258 sb->s_op->sync_fs(sb, 1);
260 sync_blockdev(sb->s_bdev);
261 sync_inodes_sb(sb, 1);
263 sb->s_frozen = SB_FREEZE_TRANS;
266 sync_blockdev(sb->s_bdev);
268 if (sb->s_op->write_super_lockfs)
269 sb->s_op->write_super_lockfs(sb);
273 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
275 EXPORT_SYMBOL(freeze_bdev);
278 * thaw_bdev -- unlock filesystem
279 * @bdev: blockdevice to unlock
280 * @sb: associated superblock
282 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
284 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
287 BUG_ON(sb->s_bdev != bdev);
289 if (sb->s_op->unlockfs)
290 sb->s_op->unlockfs(sb);
291 sb->s_frozen = SB_UNFROZEN;
293 wake_up(&sb->s_wait_unfrozen);
297 up(&bdev->bd_mount_sem);
299 EXPORT_SYMBOL(thaw_bdev);
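/*
 * Hedged usage sketch (illustration only, not part of the original file):
 * a snapshot-style caller brackets its work with freeze_bdev()/thaw_bdev().
 * The "take_snapshot" callback is hypothetical; only the freeze/thaw calls
 * are real interfaces from this file.
 */
static int example_snapshot_bdev(struct block_device *bdev,
				 int (*take_snapshot)(struct block_device *))
{
	/* may return NULL if nothing is mounted on bdev */
	struct super_block *sb = freeze_bdev(bdev);
	int err = take_snapshot(bdev);	/* the device is quiescent here */

	thaw_bdev(bdev, sb);		/* drops bd_mount_sem again */
	return err;
}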
302 * sync everything. Start out by waking pdflush, because that writes back
303 * all queues in parallel.
305 static void do_sync(unsigned long wait)
308 sync_inodes(0); /* All mappings, inodes and their blockdevs */
310 sync_supers(); /* Write the superblocks */
311 sync_filesystems(0); /* Start syncing the filesystems */
312 sync_filesystems(wait); /* Waitingly sync the filesystems */
313 sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */
315 printk("Emergency Sync complete\n");
316 if (unlikely(laptop_mode))
317 laptop_sync_completion();
320 asmlinkage long sys_sync(void)
326 void emergency_sync(void)
328 pdflush_operation(do_sync, 0);
332 * Generic function to fsync a file.
334 * filp may be NULL if called via the msync of a vma.
337 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
339 struct inode * inode = dentry->d_inode;
340 struct super_block * sb;
343 /* sync the inode to buffers */
344 write_inode_now(inode, 0);
346 /* sync the superblock to buffers */
349 if (sb->s_op->write_super)
350 sb->s_op->write_super(sb);
353 /* .. finally sync the buffers to disk */
354 ret = sync_blockdev(sb->s_bdev);
358 asmlinkage long sys_fsync(unsigned int fd)
361 struct address_space *mapping;
369 mapping = file->f_mapping;
372 if (!file->f_op || !file->f_op->fsync) {
373 /* Why? We can still call filemap_fdatawrite */
377 /* We need to protect against concurrent writers.. */
378 down(&mapping->host->i_sem);
379 current->flags |= PF_SYNCWRITE;
380 ret = filemap_fdatawrite(mapping);
381 err = file->f_op->fsync(file, file->f_dentry, 0);
384 err = filemap_fdatawait(mapping);
387 current->flags &= ~PF_SYNCWRITE;
388 up(&mapping->host->i_sem);
396 asmlinkage long sys_fdatasync(unsigned int fd)
399 struct address_space *mapping;
408 if (!file->f_op || !file->f_op->fsync)
411 mapping = file->f_mapping;
413 down(&mapping->host->i_sem);
414 current->flags |= PF_SYNCWRITE;
415 ret = filemap_fdatawrite(mapping);
416 err = file->f_op->fsync(file, file->f_dentry, 1);
419 err = filemap_fdatawait(mapping);
422 current->flags &= ~PF_SYNCWRITE;
423 up(&mapping->host->i_sem);
432 * Various filesystems appear to want __find_get_block to be non-blocking.
433 * But it's the page lock which protects the buffers. To get around this,
434 * we get exclusion from try_to_free_buffers with the blockdev mapping's
437 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
438 * may be quite high. This code could TryLock the page, and if that
439 * succeeds, there is no need to take private_lock. (But if
440 * private_lock is contended then so is mapping->tree_lock).
442 static struct buffer_head *
443 __find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
445 struct inode *bd_inode = bdev->bd_inode;
446 struct address_space *bd_mapping = bd_inode->i_mapping;
447 struct buffer_head *ret = NULL;
449 struct buffer_head *bh;
450 struct buffer_head *head;
453 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
454 page = find_get_page(bd_mapping, index);
458 spin_lock(&bd_mapping->private_lock);
459 if (!page_has_buffers(page))
461 head = page_buffers(page);
464 if (bh->b_blocknr == block) {
469 bh = bh->b_this_page;
470 } while (bh != head);
472 printk("__find_get_block_slow() failed. "
473 "block=%llu, b_blocknr=%llu\n",
474 (unsigned long long)block, (unsigned long long)bh->b_blocknr);
475 printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
476 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
478 spin_unlock(&bd_mapping->private_lock);
479 page_cache_release(page);
484 /* If invalidate_buffers() will trash dirty buffers, it means some kind
485 of fs corruption is going on. Trashing dirty data always implies losing
486 information that was supposed to be just stored on the physical layer
489 Thus invalidate_buffers in general usage is not allowed to trash
490 dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
491 be preserved. These buffers are simply skipped.
493 We also skip buffers which are still in use. For example this can
494 happen if a userspace program is reading the block device.
496 NOTE: In the case where the user removed a removable-media disk while
497 there was still dirty data not synced to disk (due to a bug in the device
498 driver or to an error by the user), not destroying the dirty buffers could
499 also generate corruption on the next media inserted, thus a parameter is
500 necessary to handle this case as safely as possible (trying
501 not to corrupt the newly inserted disk with data belonging to
502 the old, now corrupted, disk). Also, for the ramdisk, the natural way
503 to release the ramdisk memory is to destroy its dirty buffers.
505 These are two special cases. Normal usage implies that the device driver
506 issues a sync on the device (without waiting for I/O completion) and
507 then an invalidate_buffers call that doesn't trash dirty buffers.
509 For handling cache coherency with the blkdev pagecache, the 'update' case
510 has been introduced. It is needed to re-read from disk any pinned
511 buffer. NOTE: re-reading from disk is destructive so we can do it only
512 when we assume nobody is changing the buffercache under our I/O and when
513 we think the disk contains more recent information than the buffercache.
514 The update == 1 pass marks the buffers we need to update, the update == 2
515 pass does the actual I/O. */
516 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
518 invalidate_bh_lrus();
520 * FIXME: what about destroy_dirty_buffers?
521 * We really want to use invalidate_inode_pages2() for
522 * that, but not until that's cleaned up.
524 invalidate_inode_pages(bdev->bd_inode->i_mapping);
528 * Kick pdflush then try to free up some ZONE_NORMAL memory.
530 static void free_more_memory(void)
535 wakeup_bdflush(1024);
538 for_each_pgdat(pgdat) {
539 zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones;
541 try_to_free_pages(zones, GFP_NOFS, 0);
546 * I/O completion handler for block_read_full_page() - pages
547 * which come unlocked at the end of I/O.
549 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
551 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
553 struct buffer_head *tmp;
555 int page_uptodate = 1;
557 BUG_ON(!buffer_async_read(bh));
561 set_buffer_uptodate(bh);
563 clear_buffer_uptodate(bh);
569 * Be _very_ careful from here on. Bad things can happen if
570 * two buffer heads end IO at almost the same time and both
571 * decide that the page is now completely done.
573 spin_lock_irqsave(&page_uptodate_lock, flags);
574 clear_buffer_async_read(bh);
578 if (!buffer_uptodate(tmp))
580 if (buffer_async_read(tmp)) {
581 BUG_ON(!buffer_locked(tmp));
584 tmp = tmp->b_this_page;
586 spin_unlock_irqrestore(&page_uptodate_lock, flags);
589 * If none of the buffers had errors and they are all
590 * uptodate then we can set the page uptodate.
592 if (page_uptodate && !PageError(page))
593 SetPageUptodate(page);
598 spin_unlock_irqrestore(&page_uptodate_lock, flags);
603 * Completion handler for block_write_full_page() - pages which are unlocked
604 * during I/O, and which have PageWriteback cleared upon I/O completion.
606 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
608 char b[BDEVNAME_SIZE];
609 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
611 struct buffer_head *tmp;
614 BUG_ON(!buffer_async_write(bh));
618 set_buffer_uptodate(bh);
620 if (printk_ratelimit()) {
622 printk(KERN_WARNING "lost page write due to "
624 bdevname(bh->b_bdev, b));
626 set_bit(AS_EIO, &page->mapping->flags);
627 clear_buffer_uptodate(bh);
631 spin_lock_irqsave(&page_uptodate_lock, flags);
632 clear_buffer_async_write(bh);
634 tmp = bh->b_this_page;
636 if (buffer_async_write(tmp)) {
637 BUG_ON(!buffer_locked(tmp));
640 tmp = tmp->b_this_page;
642 spin_unlock_irqrestore(&page_uptodate_lock, flags);
643 end_page_writeback(page);
647 spin_unlock_irqrestore(&page_uptodate_lock, flags);
652 * If a page's buffers are under async readin (end_buffer_async_read
653 * completion) then there is a possibility that another thread of
654 * control could lock one of the buffers after it has completed
655 * but while some of the other buffers have not completed. This
656 * locked buffer would confuse end_buffer_async_read() into not unlocking
657 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
658 * that this buffer is not under async I/O.
660 * The page comes unlocked when it has no locked buffer_async buffers
663 * PageLocked prevents anyone starting new async I/O reads any of
666 * PageWriteback is used to prevent simultaneous writeout of the same
669 * PageLocked prevents anyone from starting writeback of a page which is
670 * under read I/O (PageWriteback is only ever set against a locked page).
672 void mark_buffer_async_read(struct buffer_head *bh)
674 bh->b_end_io = end_buffer_async_read;
675 set_buffer_async_read(bh);
677 EXPORT_SYMBOL(mark_buffer_async_read);
679 void mark_buffer_async_write(struct buffer_head *bh)
681 bh->b_end_io = end_buffer_async_write;
682 set_buffer_async_write(bh);
684 EXPORT_SYMBOL(mark_buffer_async_write);
688 * fs/buffer.c contains helper functions for buffer-backed address space's
689 * fsync functions. A common requirement for buffer-based filesystems is
690 * that certain data from the backing blockdev needs to be written out for
691 * a successful fsync(). For example, ext2 indirect blocks need to be
692 * written back and waited upon before fsync() returns.
694 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
695 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
696 * management of a list of dependent buffers at ->i_mapping->private_list.
698 * Locking is a little subtle: try_to_free_buffers() will remove buffers
699 * from their controlling inode's queue when they are being freed. But
700 * try_to_free_buffers() will be operating against the *blockdev* mapping
701 * at the time, not against the S_ISREG file which depends on those buffers.
702 * So the locking for private_list is via the private_lock in the address_space
703 * which backs the buffers. Which is different from the address_space
704 * against which the buffers are listed. So for a particular address_space,
705 * mapping->private_lock does *not* protect mapping->private_list! In fact,
706 * mapping->private_list will always be protected by the backing blockdev's
709 * Which introduces a requirement: all buffers on an address_space's
710 * ->private_list must be from the same address_space: the blockdev's.
712 * address_spaces which do not place buffers at ->private_list via these
713 * utility functions are free to use private_lock and private_list for
714 * whatever they want. The only requirement is that list_empty(private_list)
715 * be true at clear_inode() time.
717 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
718 * filesystems should do that. invalidate_inode_buffers() should just go
719 * BUG_ON(!list_empty).
721 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
722 * take an address_space, not an inode. And it should be called
723 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
726 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
727 * list if it is already on a list. Because if the buffer is on a list,
728 * it *must* already be on the right one. If not, the filesystem is being
729 * silly. This will save a ton of locking. But first we have to ensure
730 * that buffers are taken *off* the old inode's list when they are freed
731 * (presumably in truncate). That requires careful auditing of all
732 * filesystems (do it inside bforget()). It could also be done by bringing
736 void buffer_insert_list(spinlock_t *lock,
737 struct buffer_head *bh, struct list_head *list)
740 list_move_tail(&bh->b_assoc_buffers, list);
745 * The buffer's backing address_space's private_lock must be held
747 static inline void __remove_assoc_queue(struct buffer_head *bh)
749 list_del_init(&bh->b_assoc_buffers);
752 int inode_has_buffers(struct inode *inode)
754 return !list_empty(&inode->i_data.private_list);
758 * osync is designed to support O_SYNC io. It waits synchronously for
759 * all already-submitted IO to complete, but does not queue any new
760 * writes to the disk.
762 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
763 * you dirty the buffers, and then use osync_inode_buffers to wait for
764 * completion. Any other dirty buffers which are not yet queued for
765 * write will not be flushed to disk by the osync.
767 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
769 struct buffer_head *bh;
775 list_for_each_prev(p, list) {
777 if (buffer_locked(bh)) {
781 if (!buffer_uptodate(bh))
793 * sync_mapping_buffers - write out and wait upon a mapping's "associated"
795 * @buffer_mapping - the mapping which backs the buffers' data
796 * @mapping - the mapping which wants those buffers written
798 * Starts I/O against the buffers at mapping->private_list, and waits upon
801 * Basically, this is a convenience function for fsync(). @buffer_mapping is
802 * the blockdev which "owns" the buffers and @mapping is a file or directory
803 * which needs those buffers to be written for a successful fsync().
805 int sync_mapping_buffers(struct address_space *mapping)
807 struct address_space *buffer_mapping = mapping->assoc_mapping;
809 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
812 return fsync_buffers_list(&buffer_mapping->private_lock,
813 &mapping->private_list);
815 EXPORT_SYMBOL(sync_mapping_buffers);
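/*
 * Hedged sketch (illustration, not part of the original file): a typical
 * buffer-backed filesystem's ->fsync() forwards to sync_mapping_buffers()
 * so that the metadata buffers it queued earlier with
 * mark_buffer_dirty_inode() are written out and waited upon.  The I_DIRTY
 * checks mirror common practice; writing the inode itself is elided.
 */
static int example_fs_fsync(struct file *file, struct dentry *dentry,
			    int datasync)
{
	struct inode *inode = dentry->d_inode;
	int err = sync_mapping_buffers(inode->i_mapping);

	if (!(inode->i_state & I_DIRTY))
		return err;
	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
		return err;
	/* a real filesystem would also write the inode here */
	return err;
}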
818 * Called when we've recently written block `bblock', and it is known that
819 * `bblock' was for a buffer_boundary() buffer. This means that the block at
820 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
821 * dirty, schedule it for IO. So that indirects merge nicely with their data.
823 void write_boundary_block(struct block_device *bdev,
824 sector_t bblock, unsigned blocksize)
826 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
828 if (buffer_dirty(bh))
829 ll_rw_block(WRITE, 1, &bh);
834 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
836 struct address_space *mapping = inode->i_mapping;
837 struct address_space *buffer_mapping = bh->b_page->mapping;
839 mark_buffer_dirty(bh);
840 if (!mapping->assoc_mapping) {
841 mapping->assoc_mapping = buffer_mapping;
843 if (mapping->assoc_mapping != buffer_mapping)
846 if (list_empty(&bh->b_assoc_buffers))
847 buffer_insert_list(&buffer_mapping->private_lock,
848 bh, &mapping->private_list);
850 EXPORT_SYMBOL(mark_buffer_dirty_inode);
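/*
 * Hedged sketch: when a filesystem dirties a metadata buffer on behalf of
 * a regular file's inode, queueing it with mark_buffer_dirty_inode()
 * (rather than plain mark_buffer_dirty()) is what later lets
 * sync_mapping_buffers() find and flush it at fsync time.
 */
static void example_dirty_metadata(struct buffer_head *bh,
				   struct inode *inode)
{
	/* ... bh->b_data has just been modified ... */
	mark_buffer_dirty_inode(bh, inode);
}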
853 * Add a page to the dirty page list.
855 * It is a sad fact of life that this function is called from several places
856 * deeply under spinlocking. It may not sleep.
858 * If the page has buffers, the uptodate buffers are set dirty, to preserve
859 * dirty-state coherency between the page and the buffers. If the page does
860 * not have buffers then when they are later attached they will all be set
863 * The buffers are dirtied before the page is dirtied. There's a small race
864 * window in which a writepage caller may see the page cleanness but not the
865 * buffer dirtiness. That's fine. If this code were to set the page dirty
866 * before the buffers, a concurrent writepage caller could clear the page dirty
867 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
868 * page on the dirty page list.
870 * We use private_lock to lock against try_to_free_buffers while using the
871 * page's buffer list. Also use this to protect against clean buffers being
872 * added to the page after it was set dirty.
874 * FIXME: may need to call ->reservepage here as well. That's rather up to the
875 * address_space though.
877 int __set_page_dirty_buffers(struct page *page)
879 struct address_space * const mapping = page->mapping;
881 spin_lock(&mapping->private_lock);
882 if (page_has_buffers(page)) {
883 struct buffer_head *head = page_buffers(page);
884 struct buffer_head *bh = head;
887 set_buffer_dirty(bh);
888 bh = bh->b_this_page;
889 } while (bh != head);
891 spin_unlock(&mapping->private_lock);
893 if (!TestSetPageDirty(page)) {
894 spin_lock_irq(&mapping->tree_lock);
895 if (page->mapping) { /* Race with truncate? */
896 if (!mapping->backing_dev_info->memory_backed)
897 inc_page_state(nr_dirty);
898 radix_tree_tag_set(&mapping->page_tree, page->index,
899 PAGECACHE_TAG_DIRTY);
901 spin_unlock_irq(&mapping->tree_lock);
902 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
907 EXPORT_SYMBOL(__set_page_dirty_buffers);
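/*
 * Hedged illustration (assumption): when an address_space supplies no
 * ->set_page_dirty operation the VM falls back to this helper for
 * buffer-backed pages, and a filesystem may also point at it explicitly:
 */
static struct address_space_operations example_dirty_aops = {
	.set_page_dirty	= __set_page_dirty_buffers,
};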
910 * Write out and wait upon a list of buffers.
912 * We have conflicting pressures: we want to make sure that all
913 * initially dirty buffers get waited on, but that any subsequently
914 * dirtied buffers don't. After all, we don't want fsync to last
915 * forever if somebody is actively writing to the file.
917 * Do this in two main stages: first we copy dirty buffers to a
918 * temporary inode list, queueing the writes as we go. Then we clean
919 * up, waiting for those writes to complete.
921 * During this second stage, any subsequent updates to the file may end
922 * up refiling the buffer on the original inode's dirty list again, so
923 * there is a chance we will end up with a buffer queued for write but
924 * not yet completed on that list. So, as a final cleanup we go through
925 * the osync code to catch these locked, dirty buffers without requeuing
926 * any newly dirty buffers for write.
928 int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
930 struct buffer_head *bh;
931 struct list_head tmp;
934 INIT_LIST_HEAD(&tmp);
937 while (!list_empty(list)) {
938 bh = BH_ENTRY(list->next);
939 list_del_init(&bh->b_assoc_buffers);
940 if (buffer_dirty(bh) || buffer_locked(bh)) {
941 list_add(&bh->b_assoc_buffers, &tmp);
942 if (buffer_dirty(bh)) {
946 * Ensure any pending I/O completes so that
947 * ll_rw_block() actually writes the current
948 * contents - it is a noop if I/O is still in
949 * flight on potentially older contents.
952 ll_rw_block(WRITE, 1, &bh);
959 while (!list_empty(&tmp)) {
960 bh = BH_ENTRY(tmp.prev);
961 __remove_assoc_queue(bh);
965 if (!buffer_uptodate(bh))
972 err2 = osync_buffers_list(lock, list);
980 * Invalidate any and all dirty buffers on a given inode. We are
981 * probably unmounting the fs, but that doesn't mean we have already
982 * done a sync(). Just drop the buffers from the inode list.
984 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
985 * assumes that all the buffers are against the blockdev. Not true
988 void invalidate_inode_buffers(struct inode *inode)
990 if (inode_has_buffers(inode)) {
991 struct address_space *mapping = &inode->i_data;
992 struct list_head *list = &mapping->private_list;
993 struct address_space *buffer_mapping = mapping->assoc_mapping;
995 spin_lock(&buffer_mapping->private_lock);
996 while (!list_empty(list))
997 __remove_assoc_queue(BH_ENTRY(list->next));
998 spin_unlock(&buffer_mapping->private_lock);
1003 * Remove any clean buffers from the inode's buffer list. This is called
1004 * when we're trying to free the inode itself. Those buffers can pin it.
1006 * Returns true if all buffers were removed.
1008 int remove_inode_buffers(struct inode *inode)
1012 if (inode_has_buffers(inode)) {
1013 struct address_space *mapping = &inode->i_data;
1014 struct list_head *list = &mapping->private_list;
1015 struct address_space *buffer_mapping = mapping->assoc_mapping;
1017 spin_lock(&buffer_mapping->private_lock);
1018 while (!list_empty(list)) {
1019 struct buffer_head *bh = BH_ENTRY(list->next);
1020 if (buffer_dirty(bh)) {
1024 __remove_assoc_queue(bh);
1026 spin_unlock(&buffer_mapping->private_lock);
1032 * Create the appropriate buffers when given a page for a data area and
1033 * the size of each buffer. Use the bh->b_this_page linked list to
1034 * follow the buffers created. Return NULL if unable to create more
1037 * The retry flag is used to differentiate async IO (paging, swapping)
1038 * which may not fail from ordinary buffer allocations.
1040 static struct buffer_head *
1041 create_buffers(struct page * page, unsigned long size, int retry)
1043 struct buffer_head *bh, *head;
1049 while ((offset -= size) >= 0) {
1050 bh = alloc_buffer_head(GFP_NOFS);
1055 bh->b_this_page = head;
1060 atomic_set(&bh->b_count, 0);
1063 /* Link the buffer to its page */
1064 set_bh_page(bh, page, offset);
1066 bh->b_end_io = NULL;
1070 * In case anything failed, we just free everything we got.
1076 head = head->b_this_page;
1077 free_buffer_head(bh);
1082 * Return failure for non-async IO requests. Async IO requests
1083 * are not allowed to fail, so we have to wait until buffer heads
1084 * become available. But we don't want tasks sleeping with
1085 * partially complete buffers, so all were released above.
1090 /* We're _really_ low on memory. Now we just
1091 * wait for old buffer heads to become free due to
1092 * finishing IO. Since this is an async request and
1093 * the reserve list is empty, we're sure there are
1094 * async buffer heads in use.
1101 link_dev_buffers(struct page *page, struct buffer_head *head)
1103 struct buffer_head *bh, *tail;
1108 bh = bh->b_this_page;
1110 tail->b_this_page = head;
1111 __set_page_buffers(page, head);
1115 * Initialise the state of a blockdev page's buffers.
1118 init_page_buffers(struct page *page, struct block_device *bdev,
1119 sector_t block, int size)
1121 struct buffer_head *head = page_buffers(page);
1122 struct buffer_head *bh = head;
1123 unsigned int b_state;
1125 b_state = 1 << BH_Mapped;
1126 if (PageUptodate(page))
1127 b_state |= 1 << BH_Uptodate;
1130 if (!(bh->b_state & (1 << BH_Mapped))) {
1131 init_buffer(bh, NULL, NULL);
1133 bh->b_blocknr = block;
1134 bh->b_state = b_state;
1137 bh = bh->b_this_page;
1138 } while (bh != head);
1142 * Create the page-cache page that contains the requested block.
1144 * This is used purely for blockdev mappings.
1146 static struct page *
1147 grow_dev_page(struct block_device *bdev, sector_t block,
1148 pgoff_t index, int size)
1150 struct inode *inode = bdev->bd_inode;
1152 struct buffer_head *bh;
1154 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1158 if (!PageLocked(page))
1161 if (page_has_buffers(page)) {
1162 bh = page_buffers(page);
1163 if (bh->b_size == size)
1165 if (!try_to_free_buffers(page))
1170 * Allocate some buffers for this page
1172 bh = create_buffers(page, size, 0);
1177 * Link the page to the buffers and initialise them. Take the
1178 * lock to be atomic wrt __find_get_block(), which does not
1179 * run under the page lock.
1181 spin_lock(&inode->i_mapping->private_lock);
1182 link_dev_buffers(page, bh);
1183 init_page_buffers(page, bdev, block, size);
1184 spin_unlock(&inode->i_mapping->private_lock);
1190 page_cache_release(page);
1195 * Create buffers for the specified block device block's page. If
1196 * that page was dirty, the buffers are set dirty also.
1198 * Except that's a bug. Attaching dirty buffers to a dirty
1199 * blockdev's page can result in filesystem corruption, because
1200 * some of those buffers may be aliases of filesystem data.
1201 * grow_dev_page() will go BUG() if this happens.
1204 grow_buffers(struct block_device *bdev, sector_t block, int size)
1210 /* Size must be multiple of hard sectorsize */
1211 if (size & (bdev_hardsect_size(bdev)-1))
1213 if (size < 512 || size > PAGE_SIZE)
1219 } while ((size << sizebits) < PAGE_SIZE);
1221 index = block >> sizebits;
1222 block = index << sizebits;
1224 /* Create a page with the proper size buffers.. */
1225 page = grow_dev_page(bdev, block, index, size);
1229 page_cache_release(page);
1233 struct buffer_head *
1234 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1237 struct buffer_head * bh;
1239 bh = __find_get_block(bdev, block, size);
1243 if (!grow_buffers(bdev, block, size))
1249 * The relationship between dirty buffers and dirty pages:
1251 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1252 * the page is tagged dirty in its radix tree.
1254 * At all times, the dirtiness of the buffers represents the dirtiness of
1255 * subsections of the page. If the page has buffers, the page dirty bit is
1256 * merely a hint about the true dirty state.
1258 * When a page is set dirty in its entirety, all its buffers are marked dirty
1259 * (if the page has buffers).
1261 * When a buffer is marked dirty, its page is dirtied, but the page's other
1264 * Also. When blockdev buffers are explicitly read with bread(), they
1265 * individually become uptodate. But their backing page remains not
1266 * uptodate - even if all of its buffers are uptodate. A subsequent
1267 * block_read_full_page() against that page will discover all the uptodate
1268 * buffers, will set the page uptodate and will perform no I/O.
1272 * mark_buffer_dirty - mark a buffer_head as needing writeout
1274 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1275 * backing page dirty, then tag the page as dirty in its address_space's radix
1276 * tree and then attach the address_space's inode to its superblock's dirty
1279 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1280 * mapping->tree_lock and the global inode_lock.
1282 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1284 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1285 __set_page_dirty_nobuffers(bh->b_page);
1289 * Decrement a buffer_head's reference count. If all buffers against a page
1290 * have zero reference count, are clean and unlocked, and if the page is clean
1291 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1292 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1293 * a page but it ends up not being freed, and buffers may later be reattached).
1295 void __brelse(struct buffer_head * buf)
1297 if (atomic_read(&buf->b_count)) {
1301 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1306 * bforget() is like brelse(), except it discards any
1307 * potentially dirty data.
1309 void __bforget(struct buffer_head *bh)
1311 clear_buffer_dirty(bh);
1312 if (!list_empty(&bh->b_assoc_buffers)) {
1313 struct address_space *buffer_mapping = bh->b_page->mapping;
1315 spin_lock(&buffer_mapping->private_lock);
1316 list_del_init(&bh->b_assoc_buffers);
1317 spin_unlock(&buffer_mapping->private_lock);
1322 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1325 if (buffer_uptodate(bh)) {
1330 bh->b_end_io = end_buffer_read_sync;
1331 submit_bh(READ, bh);
1333 if (buffer_uptodate(bh))
1341 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1342 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1343 * refcount elevated by one when they're in an LRU. A buffer can only appear
1344 * once in a particular CPU's LRU. A single buffer can be present in multiple
1345 * CPU's LRUs at the same time.
1347 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1348 * sb_find_get_block().
1350 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1351 * a local interrupt disable for that.
1354 #define BH_LRU_SIZE 8
1357 struct buffer_head *bhs[BH_LRU_SIZE];
1360 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{0}};
1363 #define bh_lru_lock() local_irq_disable()
1364 #define bh_lru_unlock() local_irq_enable()
1366 #define bh_lru_lock() preempt_disable()
1367 #define bh_lru_unlock() preempt_enable()
1370 static inline void check_irqs_on(void)
1372 #ifdef irqs_disabled
1373 BUG_ON(irqs_disabled());
1378 * The LRU management algorithm is dopey-but-simple. Sorry.
1380 static void bh_lru_install(struct buffer_head *bh)
1382 struct buffer_head *evictee = NULL;
1387 lru = &__get_cpu_var(bh_lrus);
1388 if (lru->bhs[0] != bh) {
1389 struct buffer_head *bhs[BH_LRU_SIZE];
1395 for (in = 0; in < BH_LRU_SIZE; in++) {
1396 struct buffer_head *bh2 = lru->bhs[in];
1401 if (out >= BH_LRU_SIZE) {
1402 BUG_ON(evictee != NULL);
1409 while (out < BH_LRU_SIZE)
1411 memcpy(lru->bhs, bhs, sizeof(bhs));
1420 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1422 static inline struct buffer_head *
1423 lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1425 struct buffer_head *ret = NULL;
1431 lru = &__get_cpu_var(bh_lrus);
1432 for (i = 0; i < BH_LRU_SIZE; i++) {
1433 struct buffer_head *bh = lru->bhs[i];
1435 if (bh && bh->b_bdev == bdev &&
1436 bh->b_blocknr == block && bh->b_size == size) {
1439 lru->bhs[i] = lru->bhs[i - 1];
1454 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1455 * it in the LRU and mark it as accessed. If it is not present then return
1458 struct buffer_head *
1459 __find_get_block(struct block_device *bdev, sector_t block, int size)
1461 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1464 bh = __find_get_block_slow(bdev, block, size);
1472 EXPORT_SYMBOL(__find_get_block);
1475 * __getblk will locate (and, if necessary, create) the buffer_head
1476 * which corresponds to the passed block_device, block and size. The
1477 * returned buffer has its reference count incremented.
1479 * __getblk() cannot fail - it just keeps trying. If you pass it an
1480 * illegal block number, __getblk() will happily return a buffer_head
1481 * which represents the non-existent block. Very weird.
1483 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1484 * attempt is failing. FIXME, perhaps?
1486 struct buffer_head *
1487 __getblk(struct block_device *bdev, sector_t block, int size)
1489 struct buffer_head *bh = __find_get_block(bdev, block, size);
1492 bh = __getblk_slow(bdev, block, size);
1495 EXPORT_SYMBOL(__getblk);
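/*
 * Hedged usage sketch (not part of the original file): completely
 * overwriting one block via __getblk().  Because __getblk() never reads
 * the block from disk, the caller must initialise the whole buffer before
 * marking it uptodate and dirty.
 */
static void example_overwrite_block(struct block_device *bdev,
				    sector_t block, int size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	lock_buffer(bh);
	memset(bh->b_data, 0, size);	/* new contents would go here */
	set_buffer_uptodate(bh);
	unlock_buffer(bh);
	mark_buffer_dirty(bh);
	brelse(bh);			/* drop the ref taken by __getblk() */
}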
1498 * Do async read-ahead on a buffer..
1500 void __breadahead(struct block_device *bdev, sector_t block, int size)
1502 struct buffer_head *bh = __getblk(bdev, block, size);
1503 ll_rw_block(READA, 1, &bh);
1506 EXPORT_SYMBOL(__breadahead);
1509 * __bread() - reads a specified block and returns the bh
1510 * @block: number of block
1511 * @size: size (in bytes) to read
1513 * Reads a specified block, and returns buffer head that contains it.
1514 * It returns NULL if the block was unreadable.
1516 struct buffer_head *
1517 __bread(struct block_device *bdev, sector_t block, int size)
1519 struct buffer_head *bh = __getblk(bdev, block, size);
1521 if (!buffer_uptodate(bh))
1522 bh = __bread_slow(bh);
1525 EXPORT_SYMBOL(__bread);
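/*
 * Hedged usage sketch: reading a single metadata block with __bread() and
 * dropping the reference afterwards.  The -EIO return value is this
 * sketch's own convention.
 */
static int example_read_block(struct block_device *bdev, sector_t block,
			      int size)
{
	struct buffer_head *bh = __bread(bdev, block, size);

	if (!bh)
		return -EIO;	/* __bread() returns NULL if unreadable */
	/* ... inspect bh->b_data here ... */
	brelse(bh);
	return 0;
}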
1528 * invalidate_bh_lrus() is called rarely - at unmount. Because it is only for
1529 * unmount it only needs to ensure that all buffers from the target device are
1530 * invalidated on return and it doesn't need to worry about new buffers from
1531 * that device being added - the unmount code has to prevent that.
1533 static void invalidate_bh_lru(void *arg)
1535 struct bh_lru *b = &get_cpu_var(bh_lrus);
1538 for (i = 0; i < BH_LRU_SIZE; i++) {
1542 put_cpu_var(bh_lrus);
1545 static void invalidate_bh_lrus(void)
1547 on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1550 void set_bh_page(struct buffer_head *bh,
1551 struct page *page, unsigned long offset)
1554 if (offset >= PAGE_SIZE)
1556 if (PageHighMem(page))
1558 * This catches illegal uses and preserves the offset:
1560 bh->b_data = (char *)(0 + offset);
1562 bh->b_data = page_address(page) + offset;
1564 EXPORT_SYMBOL(set_bh_page);
1567 * Called when truncating a buffer on a page completely.
1569 static inline void discard_buffer(struct buffer_head * bh)
1572 clear_buffer_dirty(bh);
1574 clear_buffer_mapped(bh);
1575 clear_buffer_req(bh);
1576 clear_buffer_new(bh);
1577 clear_buffer_delay(bh);
1582 * try_to_release_page() - release old fs-specific metadata on a page
1584 * @page: the page which the kernel is trying to free
1585 * @gfp_mask: memory allocation flags (and I/O mode)
1587 * The address_space is to try to release any data against the page
1588 * (presumably at page->private). If the release was successful, return `1'.
1589 * Otherwise return zero.
1591 * The @gfp_mask argument specifies whether I/O may be performed to release
1592 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1594 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1596 int try_to_release_page(struct page *page, int gfp_mask)
1598 struct address_space * const mapping = page->mapping;
1600 BUG_ON(!PageLocked(page));
1601 if (PageWriteback(page))
1604 if (mapping && mapping->a_ops->releasepage)
1605 return mapping->a_ops->releasepage(page, gfp_mask);
1606 return try_to_free_buffers(page);
1608 EXPORT_SYMBOL(try_to_release_page);
1611 * block_invalidatepage - invalidate part or all of a buffer-backed page
1613 * @page: the page which is affected
1614 * @offset: the index of the truncation point
1616 * block_invalidatepage() is called when all or part of the page has become
1617 * invalidated by a truncate operation.
1619 * block_invalidatepage() does not have to release all buffers, but it must
1620 * ensure that no dirty buffer is left outside @offset and that no I/O
1621 * is underway against any of the blocks which are outside the truncation
1622 * point. Because the caller is about to free (and possibly reuse) those
1625 int block_invalidatepage(struct page *page, unsigned long offset)
1627 struct buffer_head *head, *bh, *next;
1628 unsigned int curr_off = 0;
1631 BUG_ON(!PageLocked(page));
1632 if (!page_has_buffers(page))
1635 head = page_buffers(page);
1638 unsigned int next_off = curr_off + bh->b_size;
1639 next = bh->b_this_page;
1642 * is this block fully invalidated?
1644 if (offset <= curr_off)
1646 curr_off = next_off;
1648 } while (bh != head);
1651 * We release buffers only if the entire page is being invalidated.
1652 * The get_block cached value has been unconditionally invalidated,
1653 * so real IO is not possible anymore.
1656 ret = try_to_release_page(page, 0);
1660 EXPORT_SYMBOL(block_invalidatepage);
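/*
 * Hedged illustration (assumption): a filesystem with no per-page state
 * beyond its buffer_heads can plug this helper straight into its a_ops:
 */
static struct address_space_operations example_invalidate_aops = {
	.invalidatepage	= block_invalidatepage,
};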
1663 * We attach and possibly dirty the buffers atomically wrt
1664 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1665 * is already excluded via the page lock.
1667 void create_empty_buffers(struct page *page,
1668 unsigned long blocksize, unsigned long b_state)
1670 struct buffer_head *bh, *head, *tail;
1672 head = create_buffers(page, blocksize, 1);
1675 bh->b_state |= b_state;
1677 bh = bh->b_this_page;
1679 tail->b_this_page = head;
1681 spin_lock(&page->mapping->private_lock);
1682 if (PageUptodate(page) || PageDirty(page)) {
1685 if (PageDirty(page))
1686 set_buffer_dirty(bh);
1687 if (PageUptodate(page))
1688 set_buffer_uptodate(bh);
1689 bh = bh->b_this_page;
1690 } while (bh != head);
1692 __set_page_buffers(page, head);
1693 spin_unlock(&page->mapping->private_lock);
1695 EXPORT_SYMBOL(create_empty_buffers);
1698 * We are taking a block for data and we don't want any output from any
1699 * buffer-cache aliases starting from return from that function and
1700 * until the moment when something will explicitly mark the buffer
1701 * dirty (hopefully that will not happen until we free that block ;-)
1702 * We don't even need to mark it not-uptodate - nobody can expect
1703 * anything from a newly allocated buffer anyway. We used to use
1704 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1705 * don't want to mark the alias unmapped, for example - it would confuse
1706 * anyone who might pick it with bread() afterwards...
1708 * Also.. Note that bforget() doesn't lock the buffer. So there can
1709 * be writeout I/O going on against recently-freed buffers. We don't
1710 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1711 * only if we really need to. That happens here.
1713 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1715 struct buffer_head *old_bh;
1717 old_bh = __find_get_block_slow(bdev, block, 0);
1719 clear_buffer_dirty(old_bh);
1720 wait_on_buffer(old_bh);
1721 clear_buffer_req(old_bh);
1725 EXPORT_SYMBOL(unmap_underlying_metadata);
1728 * NOTE! All mapped/uptodate combinations are valid:
1730 * Mapped Uptodate Meaning
1732 * No No "unknown" - must do get_block()
1733 * No Yes "hole" - zero-filled
1734 * Yes No "allocated" - allocated on disk, not read in
1735 * Yes Yes "valid" - allocated and up-to-date in memory.
1737 * "Dirty" is valid only with the last case (mapped+uptodate).
1741 * While block_write_full_page is writing back the dirty buffers under
1742 * the page lock, whoever dirtied the buffers may decide to clean them
1743 * again at any time. We handle that by only looking at the buffer
1744 * state inside lock_buffer().
1746 * If block_write_full_page() is called for regular writeback
1747 * (called_for_sync() is false) then it will redirty a page which has a locked
1748 * buffer. This only can happen if someone has written the buffer directly,
1749 * with submit_bh(). At the address_space level PageWriteback prevents this
1750 * contention from occurring.
1752 static int __block_write_full_page(struct inode *inode, struct page *page,
1753 get_block_t *get_block, struct writeback_control *wbc)
1757 sector_t last_block;
1758 struct buffer_head *bh, *head;
1759 int nr_underway = 0;
1761 BUG_ON(!PageLocked(page));
1763 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1765 if (!page_has_buffers(page)) {
1766 create_empty_buffers(page, 1 << inode->i_blkbits,
1767 (1 << BH_Dirty)|(1 << BH_Uptodate));
1771 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1772 * here, and the (potentially unmapped) buffers may become dirty at
1773 * any time. If a buffer becomes dirty here after we've inspected it
1774 * then we just miss that fact, and the page stays dirty.
1776 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1777 * handle that here by just cleaning them.
1780 block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1781 head = page_buffers(page);
1785 * Get all the dirty buffers mapped to disk addresses and
1786 * handle any aliases from the underlying blockdev's mapping.
1789 if (block > last_block) {
1791 * mapped buffers outside i_size will occur, because
1792 * this page can be outside i_size when there is a
1793 * truncate in progress.
1796 * The buffer was zeroed by block_write_full_page()
1798 clear_buffer_dirty(bh);
1799 set_buffer_uptodate(bh);
1800 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1801 err = get_block(inode, block, bh, 1);
1804 if (buffer_new(bh)) {
1805 /* blockdev mappings never come here */
1806 clear_buffer_new(bh);
1807 unmap_underlying_metadata(bh->b_bdev,
1811 bh = bh->b_this_page;
1813 } while (bh != head);
1817 if (!buffer_mapped(bh))
1820 * If it's a fully non-blocking write attempt and we cannot
1821 * lock the buffer then redirty the page. Note that this can
1822 * potentially cause a busy-wait loop from pdflush and kswapd
1823 * activity, but those code paths have their own higher-level
1826 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1828 } else if (test_set_buffer_locked(bh)) {
1829 redirty_page_for_writepage(wbc, page);
1832 if (test_clear_buffer_dirty(bh)) {
1833 mark_buffer_async_write(bh);
1837 } while ((bh = bh->b_this_page) != head);
1839 BUG_ON(PageWriteback(page));
1840 set_page_writeback(page); /* Keeps try_to_free_buffers() away */
1844 * The page may come unlocked any time after the *first* submit_bh()
1845 * call. Be careful with its buffers.
1848 struct buffer_head *next = bh->b_this_page;
1849 if (buffer_async_write(bh)) {
1850 submit_bh(WRITE, bh);
1855 } while (bh != head);
1859 if (nr_underway == 0) {
1861 * The page was marked dirty, but the buffers were
1862 * clean. Someone wrote them back by hand with
1863 * ll_rw_block/submit_bh. A rare case.
1867 if (!buffer_uptodate(bh)) {
1871 bh = bh->b_this_page;
1872 } while (bh != head);
1874 SetPageUptodate(page);
1875 end_page_writeback(page);
1876 wbc->pages_skipped++; /* We didn't write this page */
1882 * ENOSPC, or some other error. We may already have added some
1883 * blocks to the file, so we need to write these out to avoid
1884 * exposing stale data.
1885 * The page is currently locked and not marked for writeback
1888 /* Recovery: lock and submit the mapped buffers */
1891 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1893 mark_buffer_async_write(bh);
1896 * The buffer may have been set dirty during
1897 * attachment to a dirty page.
1899 clear_buffer_dirty(bh);
1901 } while ((bh = bh->b_this_page) != head);
1903 BUG_ON(PageWriteback(page));
1904 set_page_writeback(page);
1907 struct buffer_head *next = bh->b_this_page;
1908 if (buffer_async_write(bh)) {
1909 clear_buffer_dirty(bh);
1910 submit_bh(WRITE, bh);
1915 } while (bh != head);
1919 static int __block_prepare_write(struct inode *inode, struct page *page,
1920 unsigned from, unsigned to, get_block_t *get_block)
1922 unsigned block_start, block_end;
1925 unsigned blocksize, bbits;
1926 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1928 BUG_ON(!PageLocked(page));
1929 BUG_ON(from > PAGE_CACHE_SIZE);
1930 BUG_ON(to > PAGE_CACHE_SIZE);
1933 blocksize = 1 << inode->i_blkbits;
1934 if (!page_has_buffers(page))
1935 create_empty_buffers(page, blocksize, 0);
1936 head = page_buffers(page);
1938 bbits = inode->i_blkbits;
1939 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1941 for(bh = head, block_start = 0; bh != head || !block_start;
1942 block++, block_start=block_end, bh = bh->b_this_page) {
1943 block_end = block_start + blocksize;
1944 if (block_end <= from || block_start >= to) {
1945 if (PageUptodate(page)) {
1946 if (!buffer_uptodate(bh))
1947 set_buffer_uptodate(bh);
1952 clear_buffer_new(bh);
1953 if (!buffer_mapped(bh)) {
1954 err = get_block(inode, block, bh, 1);
1957 if (buffer_new(bh)) {
1958 clear_buffer_new(bh);
1959 unmap_underlying_metadata(bh->b_bdev,
1961 if (PageUptodate(page)) {
1962 set_buffer_uptodate(bh);
1965 if (block_end > to || block_start < from) {
1968 kaddr = kmap_atomic(page, KM_USER0);
1972 if (block_start < from)
1973 memset(kaddr+block_start,
1974 0, from-block_start);
1975 flush_dcache_page(page);
1976 kunmap_atomic(kaddr, KM_USER0);
1981 if (PageUptodate(page)) {
1982 if (!buffer_uptodate(bh))
1983 set_buffer_uptodate(bh);
1986 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1987 (block_start < from || block_end > to)) {
1988 ll_rw_block(READ, 1, &bh);
1993 * If we issued read requests - let them complete.
1995 while(wait_bh > wait) {
1996 wait_on_buffer(*--wait_bh);
1997 if (!buffer_uptodate(*wait_bh))
2003 * Zero out any newly allocated blocks to avoid exposing stale
2004 * data. If BH_New is set, we know that the block was newly
2005 * allocated in the above loop.
2010 block_end = block_start+blocksize;
2011 if (block_end <= from)
2013 if (block_start >= to)
2015 if (buffer_new(bh)) {
2018 clear_buffer_new(bh);
2019 kaddr = kmap_atomic(page, KM_USER0);
2020 memset(kaddr+block_start, 0, bh->b_size);
2021 kunmap_atomic(kaddr, KM_USER0);
2022 set_buffer_uptodate(bh);
2023 mark_buffer_dirty(bh);
2026 block_start = block_end;
2027 bh = bh->b_this_page;
2028 } while (bh != head);
2032 static int __block_commit_write(struct inode *inode, struct page *page,
2033 unsigned from, unsigned to)
2035 unsigned block_start, block_end;
2038 struct buffer_head *bh, *head;
2040 blocksize = 1 << inode->i_blkbits;
2042 for(bh = head = page_buffers(page), block_start = 0;
2043 bh != head || !block_start;
2044 block_start=block_end, bh = bh->b_this_page) {
2045 block_end = block_start + blocksize;
2046 if (block_end <= from || block_start >= to) {
2047 if (!buffer_uptodate(bh))
2050 set_buffer_uptodate(bh);
2051 mark_buffer_dirty(bh);
2056 * If this is a partial write which happened to make all buffers
2057 * uptodate then we can optimize away a bogus readpage() for
2058 * the next read(). Here we 'discover' whether the page went
2059 * uptodate as a result of this (potentially partial) write.
2062 SetPageUptodate(page);
2067 * Generic "read page" function for block devices that have the normal
2068 * get_block functionality. This covers most of the block device filesystems.
2069 * Reads the page asynchronously --- the unlock_buffer() and
2070 * set/clear_buffer_uptodate() functions propagate buffer state into the
2071 * page struct once IO has completed.
2073 int block_read_full_page(struct page *page, get_block_t *get_block)
2075 struct inode *inode = page->mapping->host;
2076 sector_t iblock, lblock;
2077 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2078 unsigned int blocksize;
2080 int fully_mapped = 1;
2082 if (!PageLocked(page))
2084 blocksize = 1 << inode->i_blkbits;
2085 if (!page_has_buffers(page))
2086 create_empty_buffers(page, blocksize, 0);
2087 head = page_buffers(page);
2089 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2090 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2096 if (buffer_uptodate(bh))
2099 if (!buffer_mapped(bh)) {
2101 if (iblock < lblock) {
2102 if (get_block(inode, iblock, bh, 0))
2105 if (!buffer_mapped(bh)) {
2106 void *kaddr = kmap_atomic(page, KM_USER0);
2107 memset(kaddr + i * blocksize, 0, blocksize);
2108 flush_dcache_page(page);
2109 kunmap_atomic(kaddr, KM_USER0);
2110 set_buffer_uptodate(bh);
2114 * get_block() might have updated the buffer
2117 if (buffer_uptodate(bh))
2121 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2124 SetPageMappedToDisk(page);
2128 * All buffers are uptodate - we can set the page uptodate
2129 * as well. But not if get_block() returned an error.
2131 if (!PageError(page))
2132 SetPageUptodate(page);
2137 /* Stage two: lock the buffers */
2138 for (i = 0; i < nr; i++) {
2141 mark_buffer_async_read(bh);
2145 * Stage 3: start the IO. Check for uptodateness
2146 * inside the buffer lock in case another process reading
2147 * the underlying blockdev brought it uptodate (the sct fix).
2149 for (i = 0; i < nr; i++) {
2151 if (buffer_uptodate(bh))
2152 end_buffer_async_read(bh, 1);
2154 submit_bh(READ, bh);
2159 /* utility function for filesystems that need to do work on expanding
2160 * truncates. Uses prepare/commit_write to allow the filesystem to
2161 * deal with the hole.
2163 int generic_cont_expand(struct inode *inode, loff_t size)
2165 struct address_space *mapping = inode->i_mapping;
2167 unsigned long index, offset, limit;
2171 limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2172 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2173 send_sig(SIGXFSZ, current, 0);
2176 if (size > inode->i_sb->s_maxbytes)
2179 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
2181 /* ugh. in prepare/commit_write, if from==to==start of block, we
2182 ** skip the prepare. make sure we never send an offset for the start
2185 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2188 index = size >> PAGE_CACHE_SHIFT;
2190 page = grab_cache_page(mapping, index);
2193 err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2195 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2198 page_cache_release(page);
2206 * For moronic filesystems that do not allow holes in files.
2207 * We may have to extend the file.
2210 int cont_prepare_write(struct page *page, unsigned offset,
2211 unsigned to, get_block_t *get_block, loff_t *bytes)
2213 struct address_space *mapping = page->mapping;
2214 struct inode *inode = mapping->host;
2215 struct page *new_page;
2219 unsigned blocksize = 1 << inode->i_blkbits;
2222 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2224 new_page = grab_cache_page(mapping, pgpos);
2227 /* we might sleep */
2228 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2229 unlock_page(new_page);
2230 page_cache_release(new_page);
2233 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2234 if (zerofrom & (blocksize-1)) {
2235 *bytes |= (blocksize-1);
2238 status = __block_prepare_write(inode, new_page, zerofrom,
2239 PAGE_CACHE_SIZE, get_block);
2242 kaddr = kmap_atomic(new_page, KM_USER0);
2243 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2244 flush_dcache_page(new_page);
2245 kunmap_atomic(kaddr, KM_USER0);
2246 __block_commit_write(inode, new_page,
2247 zerofrom, PAGE_CACHE_SIZE);
2248 unlock_page(new_page);
2249 page_cache_release(new_page);
2252 if (page->index < pgpos) {
2253 /* completely inside the area */
2256 /* page covers the boundary, find the boundary offset */
2257 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2259 /* if we expand the thing, the last block will be filled */
2260 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2261 *bytes |= (blocksize-1);
2265 /* starting below the boundary? Nothing to zero out */
2266 if (offset <= zerofrom)
2269 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2272 if (zerofrom < offset) {
2273 kaddr = kmap_atomic(page, KM_USER0);
2274 memset(kaddr+zerofrom, 0, offset-zerofrom);
2275 flush_dcache_page(page);
2276 kunmap_atomic(kaddr, KM_USER0);
2277 __block_commit_write(inode, page, zerofrom, offset);
2281 ClearPageUptodate(page);
2285 ClearPageUptodate(new_page);
2286 unlock_page(new_page);
2287 page_cache_release(new_page);
2292 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2293 get_block_t *get_block)
2295 struct inode *inode = page->mapping->host;
2296 int err = __block_prepare_write(inode, page, from, to, get_block);
2298 ClearPageUptodate(page);
2302 int block_commit_write(struct page *page, unsigned from, unsigned to)
2304 struct inode *inode = page->mapping->host;
2305 __block_commit_write(inode,page,from,to);
2309 int generic_commit_write(struct file *file, struct page *page,
2310 unsigned from, unsigned to)
2312 struct inode *inode = page->mapping->host;
2313 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2314 __block_commit_write(inode,page,from,to);
2316 * No need to use i_size_read() here, the i_size
2317 * cannot change under us because we hold i_sem.
2319 if (pos > inode->i_size) {
2320 i_size_write(inode, pos);
2321 mark_inode_dirty(inode);
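/*
 * Hedged sketch (illustration only): how a filesystem typically wires
 * these helpers into its ->prepare_write/->commit_write a_ops.
 * "example_get_block" is a stand-in for the filesystem's real get_block_t
 * and is only a stub here.
 */
static int example_get_block(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh_result, int create)
{
	/* a real filesystem would map iblock to a disk block here */
	return -EIO;
}

static int example_prepare_write(struct file *file, struct page *page,
				 unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, example_get_block);
}
/* ->commit_write can then simply be generic_commit_write() itself */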
2328 * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
2329 * immediately, while under the page lock. So it needs a special end_io
2330 * handler which does not touch the bh after unlocking it.
2332 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
2333 * a race there is benign: unlock_buffer() only uses the bh's address for
2334 * hashing after unlocking the buffer, so it doesn't actually touch the bh itself.
2337 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2340 set_buffer_uptodate(bh);
2342 /* This happens, due to failed READA attempts. */
2343 clear_buffer_uptodate(bh);
2349 * On entry, the page is not uptodate at all.
2350 * On exit, the page is fully uptodate in the areas outside (from,to)
2352 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2353 get_block_t *get_block)
2355 struct inode *inode = page->mapping->host;
2356 const unsigned blkbits = inode->i_blkbits;
2357 const unsigned blocksize = 1 << blkbits;
2358 struct buffer_head map_bh;
2359 struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2360 unsigned block_in_page;
2361 unsigned block_start;
2362 sector_t block_in_file;
2367 int is_mapped_to_disk = 1;
2370 if (PageMappedToDisk(page))
2373 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2374 map_bh.b_page = page;
2377 * We loop across all blocks in the page, whether or not they are
2378 * part of the affected region. This is so we can discover if the
2379 * page is fully mapped-to-disk.
2381 for (block_start = 0, block_in_page = 0;
2382 block_start < PAGE_CACHE_SIZE;
2383 block_in_page++, block_start += blocksize) {
2384 unsigned block_end = block_start + blocksize;
2389 if (block_start >= to)
2391 ret = get_block(inode, block_in_file + block_in_page,
2395 if (!buffer_mapped(&map_bh))
2396 is_mapped_to_disk = 0;
2397 if (buffer_new(&map_bh))
2398 unmap_underlying_metadata(map_bh.b_bdev,
2400 if (PageUptodate(page))
2402 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2403 kaddr = kmap_atomic(page, KM_USER0);
2404 if (block_start < from) {
2405 memset(kaddr+block_start, 0, from-block_start);
2408 if (block_end > to) {
2409 memset(kaddr + to, 0, block_end - to);
2412 flush_dcache_page(page);
2413 kunmap_atomic(kaddr, KM_USER0);
2416 if (buffer_uptodate(&map_bh))
2417 continue; /* reiserfs does this */
2418 if (block_start < from || block_end > to) {
2419 struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2425 bh->b_state = map_bh.b_state;
2426 atomic_set(&bh->b_count, 0);
2427 bh->b_this_page = NULL;
2429 bh->b_blocknr = map_bh.b_blocknr;
2430 bh->b_size = blocksize;
2431 bh->b_data = (char *)(long)block_start;
2432 bh->b_bdev = map_bh.b_bdev;
2433 bh->b_private = NULL;
2434 read_bh[nr_reads++] = bh;
2439 struct buffer_head *bh;
2442 * The page is locked, so these buffers are protected from
2443 * any VM or truncate activity. Hence we don't need to care
2444 * for the buffer_head refcounts.
2446 for (i = 0; i < nr_reads; i++) {
2449 bh->b_end_io = end_buffer_read_nobh;
2450 submit_bh(READ, bh);
2452 for (i = 0; i < nr_reads; i++) {
2455 if (!buffer_uptodate(bh))
2457 free_buffer_head(bh);
2464 if (is_mapped_to_disk)
2465 SetPageMappedToDisk(page);
2466 SetPageUptodate(page);
2469 * Setting the page dirty here isn't necessary for the prepare_write
2470 * function - commit_write will do that. But if/when this function is
2471 * used within the pagefault handler to ensure that all mmapped pages
2472 * have backing space in the filesystem, we will need to dirty the page
2473 * if its contents were altered.
2476 set_page_dirty(page);
2481 for (i = 0; i < nr_reads; i++) {
2483 free_buffer_head(read_bh[i]);
2487 * Error recovery is pretty slack. Clear the page and mark it dirty
2488 * so we'll later zero out any blocks which _were_ allocated.
2490 kaddr = kmap_atomic(page, KM_USER0);
2491 memset(kaddr, 0, PAGE_CACHE_SIZE);
2492 kunmap_atomic(kaddr, KM_USER0);
2493 SetPageUptodate(page);
2494 set_page_dirty(page);
2497 EXPORT_SYMBOL(nobh_prepare_write);
2499 int nobh_commit_write(struct file *file, struct page *page,
2500 unsigned from, unsigned to)
2502 struct inode *inode = page->mapping->host;
2503 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2505 set_page_dirty(page);
2506 if (pos > inode->i_size) {
2507 i_size_write(inode, pos);
2508 mark_inode_dirty(inode);
2512 EXPORT_SYMBOL(nobh_commit_write);
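/*
 * Sketch of using the nobh variants: filesystems that want page-cache writes
 * without attaching buffer_heads (e.g. ext2's "nobh" mode) pair a wrapper
 * around nobh_prepare_write() with nobh_commit_write(), and must then use
 * nobh_truncate_page() for partial-block truncation, as noted below.  The
 * foo_nobh_* names and foo_get_block() are hypothetical.
 */
static int foo_get_block(struct inode *, sector_t, struct buffer_head *, int);

static int foo_nobh_prepare_write(struct file *file, struct page *page,
				  unsigned from, unsigned to)
{
	return nobh_prepare_write(page, from, to, foo_get_block);
}

static struct address_space_operations foo_nobh_aops = {
	.prepare_write	= foo_nobh_prepare_write,
	.commit_write	= nobh_commit_write,
	/* other methods omitted from this sketch */
};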
2515 * This function assumes that ->prepare_write() uses nobh_prepare_write().
2517 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2519 struct inode *inode = mapping->host;
2520 unsigned blocksize = 1 << inode->i_blkbits;
2521 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2522 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2525 struct address_space_operations *a_ops = mapping->a_ops;
2529 if ((offset & (blocksize - 1)) == 0)
2533 page = grab_cache_page(mapping, index);
2537 to = (offset + blocksize) & ~(blocksize - 1);
2538 ret = a_ops->prepare_write(NULL, page, offset, to);
2540 kaddr = kmap_atomic(page, KM_USER0);
2541 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2542 flush_dcache_page(page);
2543 kunmap_atomic(kaddr, KM_USER0);
2544 set_page_dirty(page);
2547 page_cache_release(page);
2551 EXPORT_SYMBOL(nobh_truncate_page);
2553 int block_truncate_page(struct address_space *mapping,
2554 loff_t from, get_block_t *get_block)
2556 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2557 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2560 unsigned length, pos;
2561 struct inode *inode = mapping->host;
2563 struct buffer_head *bh;
2567 blocksize = 1 << inode->i_blkbits;
2568 length = offset & (blocksize - 1);
2570 /* Block boundary? Nothing to do */
2574 length = blocksize - length;
2575 iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2577 page = grab_cache_page(mapping, index);
2582 if (!page_has_buffers(page))
2583 create_empty_buffers(page, blocksize, 0);
2585 /* Find the buffer that contains "offset" */
2586 bh = page_buffers(page);
2588 while (offset >= pos) {
2589 bh = bh->b_this_page;
2595 if (!buffer_mapped(bh)) {
2596 err = get_block(inode, iblock, bh, 0);
2599 /* unmapped? It's a hole - nothing to do */
2600 if (!buffer_mapped(bh))
2604 /* Ok, it's mapped. Make sure it's up-to-date */
2605 if (PageUptodate(page))
2606 set_buffer_uptodate(bh);
2608 if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2610 ll_rw_block(READ, 1, &bh);
2612 /* Uhhuh. Read error. Complain and punt. */
2613 if (!buffer_uptodate(bh))
2617 kaddr = kmap_atomic(page, KM_USER0);
2618 memset(kaddr + offset, 0, length);
2619 flush_dcache_page(page);
2620 kunmap_atomic(kaddr, KM_USER0);
2622 mark_buffer_dirty(bh);
2627 page_cache_release(page);
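/*
 * Sketch of the usual caller: a filesystem's ->truncate method calls
 * block_truncate_page() to zero the tail of the final partial block before
 * releasing the on-disk blocks beyond the new i_size.  foo_truncate() and
 * foo_get_block() are hypothetical names.
 */
static int foo_get_block(struct inode *, sector_t, struct buffer_head *, int);

static void foo_truncate(struct inode *inode)
{
	block_truncate_page(inode->i_mapping, inode->i_size, foo_get_block);
	/* freeing of the now-unused on-disk blocks is omitted here */
}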
2633 * The generic ->writepage function for buffer-backed address_spaces
2635 int block_write_full_page(struct page *page, get_block_t *get_block,
2636 struct writeback_control *wbc)
2638 struct inode * const inode = page->mapping->host;
2639 loff_t i_size = i_size_read(inode);
2640 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2644 /* Is the page fully inside i_size? */
2645 if (page->index < end_index)
2646 return __block_write_full_page(inode, page, get_block, wbc);
2648 /* Is the page fully outside i_size? (truncate in progress) */
2649 offset = i_size & (PAGE_CACHE_SIZE-1);
2650 if (page->index >= end_index+1 || !offset) {
2652 * The page may have dirty, unmapped buffers. For example,
2653 * they may have been added in ext3_writepage(). Make them
2654 * freeable here, so the page does not leak.
2656 block_invalidatepage(page, 0);
2658 return 0; /* don't care */
2662 * The page straddles i_size. It must be zeroed out on each and every
2663 * writepage invocation because it may be mmapped. "A file is mapped
2664 * in multiples of the page size. For a file that is not a multiple of
2665 * the page size, the remaining memory is zeroed when mapped, and
2666 * writes to that region are not written out to the file."
2668 kaddr = kmap_atomic(page, KM_USER0);
2669 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2670 flush_dcache_page(page);
2671 kunmap_atomic(kaddr, KM_USER0);
2672 return __block_write_full_page(inode, page, get_block, wbc);
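/*
 * Sketch: most block-backed filesystems implement ->writepage as a one-line
 * wrapper that supplies their get_block routine to block_write_full_page().
 * foo_writepage() and foo_get_block() are hypothetical names.
 */
static int foo_get_block(struct inode *, sector_t, struct buffer_head *, int);

static int foo_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, foo_get_block, wbc);
}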
2675 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2676 get_block_t *get_block)
2678 struct buffer_head tmp;
2679 struct inode *inode = mapping->host;
2682 get_block(inode, block, &tmp, 0);
2683 return tmp.b_blocknr;
2686 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2688 struct buffer_head *bh = bio->bi_private;
2693 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2698 void submit_bh(int rw, struct buffer_head * bh)
2702 BUG_ON(!buffer_locked(bh));
2703 BUG_ON(!buffer_mapped(bh));
2704 BUG_ON(!bh->b_end_io);
2706 /* Only clear out a write error when rewriting */
2707 if (test_set_buffer_req(bh) && rw == WRITE)
2708 clear_buffer_write_io_error(bh);
2711 * from here on down, it's all bio -- do the initial mapping,
2712 * submit_bio -> generic_make_request may further map this bio around
2714 bio = bio_alloc(GFP_NOIO, 1);
2716 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2717 bio->bi_bdev = bh->b_bdev;
2718 bio->bi_io_vec[0].bv_page = bh->b_page;
2719 bio->bi_io_vec[0].bv_len = bh->b_size;
2720 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2724 bio->bi_size = bh->b_size;
2726 bio->bi_end_io = end_bio_bh_io_sync;
2727 bio->bi_private = bh;
2729 submit_bio(rw, bio);
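/*
 * Sketch of driving submit_bh() by hand for a synchronous read: lock the
 * buffer, take an extra reference (end_buffer_read_sync() drops it), submit,
 * then wait for the I/O to complete.  foo_read_bh_sync() is a hypothetical
 * helper.
 */
static int foo_read_bh_sync(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 0;
	}
	get_bh(bh);			/* dropped by end_buffer_read_sync() */
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}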
2733 * ll_rw_block: low-level access to block devices (DEPRECATED)
2734 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2735 * @nr: number of &struct buffer_heads in the array
2736 * @bhs: array of pointers to &struct buffer_head
2738 * ll_rw_block() takes an array of pointers to &struct buffer_heads,
2739 * and requests an I/O operation on them, either a %READ or a %WRITE.
2740 * The third %READA option is described in the documentation for
2741 * generic_make_request(), which ll_rw_block() calls.
2743 * This function drops any buffer that it cannot get a lock on (with the
2744 * BH_Lock state bit), any buffer that appears to be clean when doing a
2745 * write request, and any buffer that appears to be up-to-date when doing
2746 * a read request. Further, it marks as clean buffers that are processed for
2747 * writing (the buffer cache won't assume that they are actually clean until
2748 * the buffer gets unlocked).
2750 * ll_rw_block() sets b_end_io to a simple completion handler that marks
2751 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes any waiters.
2754 * All of the buffers must be for the same device, and their size must
2755 * be a multiple of the current approved size for the device.
2757 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2761 for (i = 0; i < nr; i++) {
2762 struct buffer_head *bh = bhs[i];
2764 if (test_set_buffer_locked(bh))
2769 bh->b_end_io = end_buffer_write_sync;
2770 if (test_clear_buffer_dirty(bh)) {
2771 submit_bh(WRITE, bh);
2775 bh->b_end_io = end_buffer_read_sync;
2776 if (!buffer_uptodate(bh)) {
2787 * For a data-integrity writeout, we need to wait upon any in-progress I/O
2788 * and then start new I/O and then wait upon it.
2790 void sync_dirty_buffer(struct buffer_head *bh)
2792 WARN_ON(atomic_read(&bh->b_count) < 1);
2794 if (test_clear_buffer_dirty(bh)) {
2796 bh->b_end_io = end_buffer_write_sync;
2797 submit_bh(WRITE, bh);
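/*
 * Sketch of the usual way to consume ll_rw_block() for reads: submit a batch,
 * then wait on each buffer and check buffer_uptodate() afterwards.  Single
 * dirty buffers needing data-integrity writeout go through
 * sync_dirty_buffer() instead.  foo_read_buffers() is a hypothetical helper.
 */
static int foo_read_buffers(struct buffer_head *bhs[], int nr)
{
	int i, err = 0;

	ll_rw_block(READ, nr, bhs);	/* locks and submits what needs reading */
	for (i = 0; i < nr; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			err = -EIO;
	}
	return err;
}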
2805 * try_to_free_buffers() checks if all the buffers on this particular page
2806 * are unused, and releases them if so.
2808 * Exclusion against try_to_free_buffers may be obtained by either
2809 * locking the page or by holding its mapping's private_lock.
2811 * If the page is dirty but all the buffers are clean then we need to
2812 * be sure to mark the page clean as well. This is because the page
2813 * may be against a block device, and a later reattachment of buffers
2814 * to a dirty page will set *all* buffers dirty, which would corrupt
2815 * filesystem data on the same device.
2817 * The same applies to regular filesystem pages: if all the buffers are
2818 * clean then we set the page clean and proceed. To do that, we require
2819 * total exclusion from __set_page_dirty_buffers(). That is obtained with the mapping's private_lock.
2822 * try_to_free_buffers() is non-blocking.
2824 static inline int buffer_busy(struct buffer_head *bh)
2826 return atomic_read(&bh->b_count) |
2827 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2831 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2833 struct buffer_head *head = page_buffers(page);
2834 struct buffer_head *bh;
2835 int was_uptodate = 1;
2839 if (buffer_write_io_error(bh))
2840 set_bit(AS_EIO, &page->mapping->flags);
2841 if (buffer_busy(bh))
2843 if (!buffer_uptodate(bh) && !buffer_req(bh))
2845 bh = bh->b_this_page;
2846 } while (bh != head);
2849 struct buffer_head *next = bh->b_this_page;
2851 if (!list_empty(&bh->b_assoc_buffers))
2852 __remove_assoc_queue(bh);
2854 } while (bh != head);
2855 *buffers_to_free = head;
2856 __clear_page_buffers(page);
2862 int try_to_free_buffers(struct page *page)
2864 struct address_space * const mapping = page->mapping;
2865 struct buffer_head *buffers_to_free = NULL;
2868 BUG_ON(!PageLocked(page));
2869 if (PageWriteback(page))
2872 if (mapping == NULL) { /* can this still happen? */
2873 ret = drop_buffers(page, &buffers_to_free);
2877 spin_lock(&mapping->private_lock);
2878 ret = drop_buffers(page, &buffers_to_free);
2881 * If the filesystem writes its buffers by hand (eg ext3)
2882 * then we can have clean buffers against a dirty page. We
2883 * clean the page here; otherwise later reattachment of buffers
2884 * could encounter a non-uptodate page, which is unresolvable.
2885 * This only applies in the rare case where try_to_free_buffers
2886 * succeeds but the page is not freed.
2888 clear_page_dirty(page);
2890 spin_unlock(&mapping->private_lock);
2892 if (buffers_to_free) {
2893 struct buffer_head *bh = buffers_to_free;
2896 struct buffer_head *next = bh->b_this_page;
2897 free_buffer_head(bh);
2899 } while (bh != buffers_to_free);
2903 EXPORT_SYMBOL(try_to_free_buffers);
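/*
 * Sketch: a simple filesystem can route ->releasepage through
 * try_to_free_buffers(); journalling filesystems perform their own checks
 * first (e.g. refusing while buffers are pinned by a transaction).
 * foo_releasepage() is a hypothetical name, assuming the ->releasepage
 * prototype of this kernel (page plus gfp mask).
 */
static int foo_releasepage(struct page *page, int gfp_mask)
{
	return try_to_free_buffers(page);
}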
2905 int block_sync_page(struct page *page)
2907 struct address_space *mapping;
2909 mapping = page->mapping;
2910 blk_run_address_space(mapping);
2915 * There are no bdflush tunables left. But distributions are
2916 * still running obsolete flush daemons, so we terminate them here.
2918 * Use of bdflush() is deprecated and will be removed in a future kernel.
2919 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
2921 asmlinkage long sys_bdflush(int func, long data)
2923 static int msg_count;
2925 if (!capable(CAP_SYS_ADMIN))
2928 if (msg_count < 5) {
2931 "warning: process `%s' used the obsolete bdflush"
2932 " system call\n", current->comm);
2933 printk(KERN_INFO "Fix your initscripts?\n");
2942 * Buffer-head allocation
2944 static kmem_cache_t *bh_cachep;
2947 * Once the number of bh's in the machine exceeds this level, we start
2948 * stripping them in writeback.
2950 static int max_buffer_heads;
2952 int buffer_heads_over_limit;
2954 struct bh_accounting {
2955 int nr; /* Number of live bh's */
2956 int ratelimit; /* Limit cacheline bouncing */
2959 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2961 static void recalc_bh_state(void)
2966 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
2968 __get_cpu_var(bh_accounting).ratelimit = 0;
2970 tot += per_cpu(bh_accounting, i).nr;
2971 buffer_heads_over_limit = (tot > max_buffer_heads);
2974 struct buffer_head *alloc_buffer_head(int gfp_flags)
2976 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
2979 __get_cpu_var(bh_accounting).nr++;
2985 EXPORT_SYMBOL(alloc_buffer_head);
2987 void free_buffer_head(struct buffer_head *bh)
2989 BUG_ON(!list_empty(&bh->b_assoc_buffers));
2990 kmem_cache_free(bh_cachep, bh);
2992 __get_cpu_var(bh_accounting).nr--;
2996 EXPORT_SYMBOL(free_buffer_head);
2999 init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
3001 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
3002 SLAB_CTOR_CONSTRUCTOR) {
3003 struct buffer_head * bh = (struct buffer_head *)data;
3005 memset(bh, 0, sizeof(*bh));
3006 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3010 #ifdef CONFIG_HOTPLUG_CPU
3011 static void buffer_exit_cpu(int cpu)
3014 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3016 for (i = 0; i < BH_LRU_SIZE; i++) {
3022 static int buffer_cpu_notify(struct notifier_block *self,
3023 unsigned long action, void *hcpu)
3025 if (action == CPU_DEAD)
3026 buffer_exit_cpu((unsigned long)hcpu);
3029 #endif /* CONFIG_HOTPLUG_CPU */
3031 void __init buffer_init(void)
3036 bh_cachep = kmem_cache_create("buffer_head",
3037 sizeof(struct buffer_head), 0,
3038 0, init_buffer_head, NULL);
3039 for (i = 0; i < ARRAY_SIZE(bh_wait_queue_heads); i++)
3040 init_waitqueue_head(&bh_wait_queue_heads[i].wqh);
3043 * Limit the bh occupancy to 10% of ZONE_NORMAL
3045 nrpages = (nr_free_buffer_pages() * 10) / 100;
3046 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3047 hotcpu_notifier(buffer_cpu_notify, 0);
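/*
 * Worked example of the limit above, with illustrative numbers: on a machine
 * where nr_free_buffer_pages() reports 100000 4KB pages (roughly 390MB of
 * ZONE_NORMAL), and assuming a 64-byte buffer_head,
 *	nrpages = 100000 * 10 / 100 = 10000
 *	max_buffer_heads = 10000 * (4096 / 64) = 640000
 * which is the point at which buffer_heads_over_limit makes writeback start
 * stripping buffers.
 */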
3050 EXPORT_SYMBOL(__bforget);
3051 EXPORT_SYMBOL(__brelse);
3052 EXPORT_SYMBOL(__wait_on_buffer);
3053 EXPORT_SYMBOL(block_commit_write);
3054 EXPORT_SYMBOL(block_prepare_write);
3055 EXPORT_SYMBOL(block_read_full_page);
3056 EXPORT_SYMBOL(block_sync_page);
3057 EXPORT_SYMBOL(block_truncate_page);
3058 EXPORT_SYMBOL(block_write_full_page);
3059 EXPORT_SYMBOL(buffer_insert_list);
3060 EXPORT_SYMBOL(cont_prepare_write);
3061 EXPORT_SYMBOL(end_buffer_async_write);
3062 EXPORT_SYMBOL(end_buffer_read_sync);
3063 EXPORT_SYMBOL(end_buffer_write_sync);
3064 EXPORT_SYMBOL(file_fsync);
3065 EXPORT_SYMBOL(fsync_bdev);
3066 EXPORT_SYMBOL(fsync_buffers_list);
3067 EXPORT_SYMBOL(generic_block_bmap);
3068 EXPORT_SYMBOL(generic_commit_write);
3069 EXPORT_SYMBOL(generic_cont_expand);
3070 EXPORT_SYMBOL(init_buffer);
3071 EXPORT_SYMBOL(invalidate_bdev);
3072 EXPORT_SYMBOL(ll_rw_block);
3073 EXPORT_SYMBOL(mark_buffer_dirty);
3074 EXPORT_SYMBOL(submit_bh);
3075 EXPORT_SYMBOL(sync_dirty_buffer);
3076 EXPORT_SYMBOL(unlock_buffer);