1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5  */
6
7 /*
8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9  *
10  * Removed a lot of unnecessary code and simplified things now that
11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12  *
13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15  *
16  * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17  *
18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19  */
20
21 #include <linux/config.h>
22 #include <linux/kernel.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/smp_lock.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/bio.h>
38 #include <linux/notifier.h>
39 #include <linux/cpu.h>
40 #include <asm/bitops.h>
41
42 static void invalidate_bh_lrus(void);
43
44 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
45
46 /*
47  * Hashed waitqueue_heads for wait_on_buffer()
48  */
49 #define BH_WAIT_TABLE_ORDER     7
50 static struct bh_wait_queue_head {
51         wait_queue_head_t wqh;
52 } ____cacheline_aligned_in_smp bh_wait_queue_heads[1<<BH_WAIT_TABLE_ORDER];
53
54 inline void
55 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
56 {
57         bh->b_end_io = handler;
58         bh->b_private = private;
59 }
60
61 /*
62  * Return the address of the waitqueue_head to be used for this
63  * buffer_head
64  */
65 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh)
66 {
67         return &bh_wait_queue_heads[hash_ptr(bh, BH_WAIT_TABLE_ORDER)].wqh;
68 }
69 EXPORT_SYMBOL(bh_waitq_head);
70
71 void wake_up_buffer(struct buffer_head *bh)
72 {
73         wait_queue_head_t *wq = bh_waitq_head(bh);
74
75         smp_mb();
76         if (waitqueue_active(wq))
77                 wake_up_all(wq);
78 }
79 EXPORT_SYMBOL(wake_up_buffer);
80
81 void fastcall unlock_buffer(struct buffer_head *bh)
82 {
83         clear_buffer_locked(bh);
84         smp_mb__after_clear_bit();
85         wake_up_buffer(bh);
86 }
87
88 /*
89  * Block until a buffer comes unlocked.  This doesn't stop it
90  * from becoming locked again - you have to lock it yourself
91  * if you want to preserve its state.
92  */
93 void __wait_on_buffer(struct buffer_head * bh)
94 {
95         wait_queue_head_t *wqh = bh_waitq_head(bh);
96         DEFINE_WAIT(wait);
97
98         do {
99                 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
100                 if (buffer_locked(bh)) {
101                         struct block_device *bd;
102                         smp_mb();
103                         bd = bh->b_bdev;
104                         if (bd)
105                                 blk_run_address_space(bd->bd_inode->i_mapping);
106                         io_schedule();
107                 }
108         } while (buffer_locked(bh));
109         finish_wait(wqh, &wait);
110 }
111
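/*
 * Illustrative sketch (hypothetical, for exposition only): as the comment
 * above __wait_on_buffer() notes, waiting alone gives no guarantee that the
 * buffer stays unlocked.  A caller that needs a stable view takes the lock
 * itself, via lock_buffer() from <linux/buffer_head.h>.  The function name
 * below is made up.
 */
#if 0   /* example only -- hypothetical */
static void example_examine_buffer(struct buffer_head *bh)
{
        lock_buffer(bh);        /* sleeps until we own the buffer lock */
        /* ... safely inspect or modify bh->b_data here ... */
        unlock_buffer(bh);      /* wakes any waiters via wake_up_buffer() */
}
#endif
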
112 static void
113 __set_page_buffers(struct page *page, struct buffer_head *head)
114 {
115         page_cache_get(page);
116         SetPagePrivate(page);
117         page->private = (unsigned long)head;
118 }
119
120 static void
121 __clear_page_buffers(struct page *page)
122 {
123         ClearPagePrivate(page);
124         page->private = 0;
125         page_cache_release(page);
126 }
127
128 static void buffer_io_error(struct buffer_head *bh)
129 {
130         char b[BDEVNAME_SIZE];
131
132         printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
133                         bdevname(bh->b_bdev, b),
134                         (unsigned long long)bh->b_blocknr);
135 }
136
137 /*
138  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
139  * unlock the buffer. This is what ll_rw_block uses too.
140  */
141 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
142 {
143         if (uptodate) {
144                 set_buffer_uptodate(bh);
145         } else {
146                 /* This happens, due to failed READA attempts. */
147                 clear_buffer_uptodate(bh);
148         }
149         unlock_buffer(bh);
150         put_bh(bh);
151 }
152
153 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
154 {
155         char b[BDEVNAME_SIZE];
156
157         if (uptodate) {
158                 set_buffer_uptodate(bh);
159         } else {
160                 if (printk_ratelimit()) {
161                         buffer_io_error(bh);
162                         printk(KERN_WARNING "lost page write due to "
163                                         "I/O error on %s\n",
164                                        bdevname(bh->b_bdev, b));
165                 }
166                 set_buffer_write_io_error(bh);
167                 clear_buffer_uptodate(bh);
168         }
169         unlock_buffer(bh);
170         put_bh(bh);
171 }
172
173 /*
174  * Write out and wait upon all the dirty data associated with a block
175  * device via its mapping.  Does not take the superblock lock.
176  */
177 int sync_blockdev(struct block_device *bdev)
178 {
179         int ret = 0;
180
181         if (bdev) {
182                 int err;
183
184                 ret = filemap_fdatawrite(bdev->bd_inode->i_mapping);
185                 err = filemap_fdatawait(bdev->bd_inode->i_mapping);
186                 if (!ret)
187                         ret = err;
188         }
189         return ret;
190 }
191 EXPORT_SYMBOL(sync_blockdev);
192
193 /*
194  * Write out and wait upon all dirty data associated with this
195  * superblock.  Filesystem data as well as the underlying block
196  * device.  Takes the superblock lock.
197  */
198 int fsync_super(struct super_block *sb)
199 {
200         sync_inodes_sb(sb, 0);
201         DQUOT_SYNC(sb);
202         lock_super(sb);
203         if (sb->s_dirt && sb->s_op->write_super)
204                 sb->s_op->write_super(sb);
205         unlock_super(sb);
206         if (sb->s_op->sync_fs)
207                 sb->s_op->sync_fs(sb, 1);
208         sync_blockdev(sb->s_bdev);
209         sync_inodes_sb(sb, 1);
210
211         return sync_blockdev(sb->s_bdev);
212 }
213
214 /*
215  * Write out and wait upon all dirty data associated with this
216  * device.   Filesystem data as well as the underlying block
217  * device.  Takes the superblock lock.
218  */
219 int fsync_bdev(struct block_device *bdev)
220 {
221         struct super_block *sb = get_super(bdev);
222         if (sb) {
223                 int res = fsync_super(sb);
224                 drop_super(sb);
225                 return res;
226         }
227         return sync_blockdev(bdev);
228 }
229
230 /**
231  * freeze_bdev  --  lock a filesystem and force it into a consistent state
232  * @bdev:       blockdevice to lock
233  *
234  * This takes the block device bd_mount_sem to make sure no new mounts
235  * happen on bdev until thaw_bdev() is called.
236  * If a superblock is found on this device, we take the s_umount semaphore
237  * on it to make sure nobody unmounts until the snapshot creation is done.
238  */
239 struct super_block *freeze_bdev(struct block_device *bdev)
240 {
241         struct super_block *sb;
242
243         down(&bdev->bd_mount_sem);
244         sb = get_super(bdev);
245         if (sb && !(sb->s_flags & MS_RDONLY)) {
246                 sb->s_frozen = SB_FREEZE_WRITE;
247                 wmb();
248
249                 sync_inodes_sb(sb, 0);
250                 DQUOT_SYNC(sb);
251
252                 lock_super(sb);
253                 if (sb->s_dirt && sb->s_op->write_super)
254                         sb->s_op->write_super(sb);
255                 unlock_super(sb);
256
257                 if (sb->s_op->sync_fs)
258                         sb->s_op->sync_fs(sb, 1);
259
260                 sync_blockdev(sb->s_bdev);
261                 sync_inodes_sb(sb, 1);
262
263                 sb->s_frozen = SB_FREEZE_TRANS;
264                 wmb();
265
266                 sync_blockdev(sb->s_bdev);
267
268                 if (sb->s_op->write_super_lockfs)
269                         sb->s_op->write_super_lockfs(sb);
270         }
271
272         sync_blockdev(bdev);
273         return sb;      /* thaw_bdev releases s->s_umount and bd_mount_sem */
274 }
275 EXPORT_SYMBOL(freeze_bdev);
276
277 /**
278  * thaw_bdev  -- unlock filesystem
279  * @bdev:       blockdevice to unlock
280  * @sb:         associated superblock
281  *
282  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
283  */
284 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
285 {
286         if (sb) {
287                 BUG_ON(sb->s_bdev != bdev);
288
289                 if (sb->s_op->unlockfs)
290                         sb->s_op->unlockfs(sb);
291                 sb->s_frozen = SB_UNFROZEN;
292                 wmb();
293                 wake_up(&sb->s_wait_unfrozen);
294                 drop_super(sb);
295         }
296
297         up(&bdev->bd_mount_sem);
298 }
299 EXPORT_SYMBOL(thaw_bdev);
300
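/*
 * Illustrative sketch (hypothetical, for exposition only): how a snapshot
 * or backup driver is expected to bracket its work with freeze_bdev() and
 * thaw_bdev().  The function name is made up.
 */
#if 0   /* example only -- hypothetical */
static void example_snapshot(struct block_device *bdev)
{
        struct super_block *sb;

        sb = freeze_bdev(bdev);         /* flush and block writes/mounts */

        /* ... capture the now-consistent device contents here ... */

        thaw_bdev(bdev, sb);            /* sb may be NULL if nothing was mounted */
}
#endif
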
301 /*
302  * sync everything.  Start out by waking pdflush, because that writes back
303  * all queues in parallel.
304  */
305 static void do_sync(unsigned long wait)
306 {
307         wakeup_bdflush(0);
308         sync_inodes(0);         /* All mappings, inodes and their blockdevs */
309         DQUOT_SYNC(NULL);
310         sync_supers();          /* Write the superblocks */
311         sync_filesystems(0);    /* Start syncing the filesystems */
312         sync_filesystems(wait); /* Waitingly sync the filesystems */
313         sync_inodes(wait);      /* Mappings, inodes and blockdevs, again. */
314         if (!wait)
315                 printk("Emergency Sync complete\n");
316         if (unlikely(laptop_mode))
317                 laptop_sync_completion();
318 }
319
320 asmlinkage long sys_sync(void)
321 {
322         do_sync(1);
323         return 0;
324 }
325
326 void emergency_sync(void)
327 {
328         pdflush_operation(do_sync, 0);
329 }
330
331 /*
332  * Generic function to fsync a file.
333  *
334  * filp may be NULL if called via the msync of a vma.
335  */
336  
337 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
338 {
339         struct inode * inode = dentry->d_inode;
340         struct super_block * sb;
341         int ret;
342
343         /* sync the inode to buffers */
344         write_inode_now(inode, 0);
345
346         /* sync the superblock to buffers */
347         sb = inode->i_sb;
348         lock_super(sb);
349         if (sb->s_op->write_super)
350                 sb->s_op->write_super(sb);
351         unlock_super(sb);
352
353         /* .. finally sync the buffers to disk */
354         ret = sync_blockdev(sb->s_bdev);
355         return ret;
356 }
357
358 asmlinkage long sys_fsync(unsigned int fd)
359 {
360         struct file * file;
361         struct address_space *mapping;
362         int ret, err;
363
364         ret = -EBADF;
365         file = fget(fd);
366         if (!file)
367                 goto out;
368
369         mapping = file->f_mapping;
370
371         ret = -EINVAL;
372         if (!file->f_op || !file->f_op->fsync) {
373                 /* Why?  We can still call filemap_fdatawrite */
374                 goto out_putf;
375         }
376
377         /* We need to protect against concurrent writers.. */
378         down(&mapping->host->i_sem);
379         current->flags |= PF_SYNCWRITE;
380         ret = filemap_fdatawrite(mapping);
381         err = file->f_op->fsync(file, file->f_dentry, 0);
382         if (!ret)
383                 ret = err;
384         err = filemap_fdatawait(mapping);
385         if (!ret)
386                 ret = err;
387         current->flags &= ~PF_SYNCWRITE;
388         up(&mapping->host->i_sem);
389
390 out_putf:
391         fput(file);
392 out:
393         return ret;
394 }
395
396 asmlinkage long sys_fdatasync(unsigned int fd)
397 {
398         struct file * file;
399         struct address_space *mapping;
400         int ret, err;
401
402         ret = -EBADF;
403         file = fget(fd);
404         if (!file)
405                 goto out;
406
407         ret = -EINVAL;
408         if (!file->f_op || !file->f_op->fsync)
409                 goto out_putf;
410
411         mapping = file->f_mapping;
412
413         down(&mapping->host->i_sem);
414         current->flags |= PF_SYNCWRITE;
415         ret = filemap_fdatawrite(mapping);
416         err = file->f_op->fsync(file, file->f_dentry, 1);
417         if (!ret)
418                 ret = err;
419         err = filemap_fdatawait(mapping);
420         if (!ret)
421                 ret = err;
422         current->flags &= ~PF_SYNCWRITE;
423         up(&mapping->host->i_sem);
424
425 out_putf:
426         fput(file);
427 out:
428         return ret;
429 }
430
431 /*
432  * Various filesystems appear to want __find_get_block to be non-blocking.
433  * But it's the page lock which protects the buffers.  To get around this,
434  * we get exclusion from try_to_free_buffers with the blockdev mapping's
435  * private_lock.
436  *
437  * Hack idea: for the blockdev mapping, private_lock contention
438  * may be quite high.  This code could TryLock the page, and if that
439  * succeeds, there is no need to take private_lock. (But if
440  * private_lock is contended then so is mapping->tree_lock).
441  */
442 static struct buffer_head *
443 __find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
444 {
445         struct inode *bd_inode = bdev->bd_inode;
446         struct address_space *bd_mapping = bd_inode->i_mapping;
447         struct buffer_head *ret = NULL;
448         pgoff_t index;
449         struct buffer_head *bh;
450         struct buffer_head *head;
451         struct page *page;
452
453         index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
454         page = find_get_page(bd_mapping, index);
455         if (!page)
456                 goto out;
457
458         spin_lock(&bd_mapping->private_lock);
459         if (!page_has_buffers(page))
460                 goto out_unlock;
461         head = page_buffers(page);
462         bh = head;
463         do {
464                 if (bh->b_blocknr == block) {
465                         ret = bh;
466                         get_bh(bh);
467                         goto out_unlock;
468                 }
469                 bh = bh->b_this_page;
470         } while (bh != head);
471
472         printk("__find_get_block_slow() failed. "
473                 "block=%llu, b_blocknr=%llu\n",
474                 (unsigned long long)block, (unsigned long long)bh->b_blocknr);
475         printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
476         printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
477 out_unlock:
478         spin_unlock(&bd_mapping->private_lock);
479         page_cache_release(page);
480 out:
481         return ret;
482 }
483
484 /* If invalidate_buffers() trashes dirty buffers, it means some kind
485    of fs corruption is going on. Trashing dirty data always implies losing
486    information that the user expected to reach the physical storage
487    layer.
488
489    Thus invalidate_buffers in general usage is not allowed to trash
490    dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
491    be preserved.  These buffers are simply skipped.
492
493    We also skip buffers which are still in use.  For example this can
494    happen if a userspace program is reading the block device.
495
496    NOTE: if the user removes a removable-media disk while there is
497    still dirty data not yet synced to it (due to a bug in the device
498    driver or an error by the user), then by not destroying those dirty
499    buffers we could corrupt the next medium inserted as well. A parameter
500    is therefore needed to handle this case as safely as possible (trying
501    not to corrupt the newly inserted disk with data belonging to the
502    old, now-corrupted disk). Also, for a ramdisk the natural way to
503    release the ramdisk memory is to destroy its dirty buffers.
504
505    These are two special cases. Normal usage implies that the device
506    driver issues a sync on the device (without waiting for I/O completion)
507    and then an invalidate_buffers call that doesn't trash dirty buffers.
508
509    For handling cache coherency with the blkdev pagecache, the 'update'
510    case has been introduced. It is needed to re-read from disk any pinned
511    buffer. NOTE: re-reading from disk is destructive, so we can do it only
512    when we assume nobody is changing the buffercache under our I/O and when
513    we think the disk contains more recent information than the buffercache.
514    The update == 1 pass marks the buffers we need to update; the update == 2
515    pass does the actual I/O. */
516 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
517 {
518         invalidate_bh_lrus();
519         /*
520          * FIXME: what about destroy_dirty_buffers?
521          * We really want to use invalidate_inode_pages2() for
522          * that, but not until that's cleaned up.
523          */
524         invalidate_inode_pages(bdev->bd_inode->i_mapping);
525 }
526
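/*
 * Illustrative sketch (hypothetical, for exposition only) of the "normal
 * usage" described above, as a removable-media driver might do it on a
 * media change: sync what we can, then drop the clean cached data so the
 * next read hits the (possibly new) medium.  The function name is made up.
 */
#if 0   /* example only -- hypothetical */
static void example_media_changed(struct block_device *bdev)
{
        sync_blockdev(bdev);            /* write back what is still dirty */
        invalidate_bdev(bdev, 0);       /* 0: do not destroy dirty buffers */
}
#endif
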
527 /*
528  * Kick pdflush then try to free up some ZONE_NORMAL memory.
529  */
530 static void free_more_memory(void)
531 {
532         struct zone **zones;
533         pg_data_t *pgdat;
534
535         wakeup_bdflush(1024);
536         yield();
537
538         for_each_pgdat(pgdat) {
539                 zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones;
540                 if (*zones)
541                         try_to_free_pages(zones, GFP_NOFS, 0);
542         }
543 }
544
545 /*
546  * I/O completion handler for block_read_full_page() - pages
547  * which come unlocked at the end of I/O.
548  */
549 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
550 {
551         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
552         unsigned long flags;
553         struct buffer_head *tmp;
554         struct page *page;
555         int page_uptodate = 1;
556
557         BUG_ON(!buffer_async_read(bh));
558
559         page = bh->b_page;
560         if (uptodate) {
561                 set_buffer_uptodate(bh);
562         } else {
563                 clear_buffer_uptodate(bh);
564                 buffer_io_error(bh);
565                 SetPageError(page);
566         }
567
568         /*
569          * Be _very_ careful from here on. Bad things can happen if
570          * two buffer heads end IO at almost the same time and both
571          * decide that the page is now completely done.
572          */
573         spin_lock_irqsave(&page_uptodate_lock, flags);
574         clear_buffer_async_read(bh);
575         unlock_buffer(bh);
576         tmp = bh;
577         do {
578                 if (!buffer_uptodate(tmp))
579                         page_uptodate = 0;
580                 if (buffer_async_read(tmp)) {
581                         BUG_ON(!buffer_locked(tmp));
582                         goto still_busy;
583                 }
584                 tmp = tmp->b_this_page;
585         } while (tmp != bh);
586         spin_unlock_irqrestore(&page_uptodate_lock, flags);
587
588         /*
589          * If none of the buffers had errors and they are all
590          * uptodate then we can set the page uptodate.
591          */
592         if (page_uptodate && !PageError(page))
593                 SetPageUptodate(page);
594         unlock_page(page);
595         return;
596
597 still_busy:
598         spin_unlock_irqrestore(&page_uptodate_lock, flags);
599         return;
600 }
601
602 /*
603  * Completion handler for block_write_full_page() - pages which are unlocked
604  * during I/O, and which have PageWriteback cleared upon I/O completion.
605  */
606 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
607 {
608         char b[BDEVNAME_SIZE];
609         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
610         unsigned long flags;
611         struct buffer_head *tmp;
612         struct page *page;
613
614         BUG_ON(!buffer_async_write(bh));
615
616         page = bh->b_page;
617         if (uptodate) {
618                 set_buffer_uptodate(bh);
619         } else {
620                 if (printk_ratelimit()) {
621                         buffer_io_error(bh);
622                         printk(KERN_WARNING "lost page write due to "
623                                         "I/O error on %s\n",
624                                bdevname(bh->b_bdev, b));
625                 }
626                 set_bit(AS_EIO, &page->mapping->flags);
627                 clear_buffer_uptodate(bh);
628                 SetPageError(page);
629         }
630
631         spin_lock_irqsave(&page_uptodate_lock, flags);
632         clear_buffer_async_write(bh);
633         unlock_buffer(bh);
634         tmp = bh->b_this_page;
635         while (tmp != bh) {
636                 if (buffer_async_write(tmp)) {
637                         BUG_ON(!buffer_locked(tmp));
638                         goto still_busy;
639                 }
640                 tmp = tmp->b_this_page;
641         }
642         spin_unlock_irqrestore(&page_uptodate_lock, flags);
643         end_page_writeback(page);
644         return;
645
646 still_busy:
647         spin_unlock_irqrestore(&page_uptodate_lock, flags);
648         return;
649 }
650
651 /*
652  * If a page's buffers are under async read-in (end_buffer_async_read
653  * completion) then there is a possibility that another thread of
654  * control could lock one of the buffers after it has completed
655  * but while some of the other buffers have not completed.  This
656  * locked buffer would confuse end_buffer_async_read() into not unlocking
657  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
658  * that this buffer is not under async I/O.
659  *
660  * The page comes unlocked when it has no locked buffer_async buffers
661  * left.
662  *
663  * PageLocked prevents anyone from starting new async I/O against any
664  * of the buffers.
665  *
666  * PageWriteback is used to prevent simultaneous writeout of the same
667  * page.
668  *
669  * PageLocked prevents anyone from starting writeback of a page which is
670  * under read I/O (PageWriteback is only ever set against a locked page).
671  */
672 void mark_buffer_async_read(struct buffer_head *bh)
673 {
674         bh->b_end_io = end_buffer_async_read;
675         set_buffer_async_read(bh);
676 }
677 EXPORT_SYMBOL(mark_buffer_async_read);
678
679 void mark_buffer_async_write(struct buffer_head *bh)
680 {
681         bh->b_end_io = end_buffer_async_write;
682         set_buffer_async_write(bh);
683 }
684 EXPORT_SYMBOL(mark_buffer_async_write);
685
686
687 /*
688  * fs/buffer.c contains helper functions for buffer-backed address space's
689  * fsync functions.  A common requirement for buffer-based filesystems is
690  * that certain data from the backing blockdev needs to be written out for
691  * a successful fsync().  For example, ext2 indirect blocks need to be
692  * written back and waited upon before fsync() returns.
693  *
694  * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
695  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
696  * management of a list of dependent buffers at ->i_mapping->private_list.
697  *
698  * Locking is a little subtle: try_to_free_buffers() will remove buffers
699  * from their controlling inode's queue when they are being freed.  But
700  * try_to_free_buffers() will be operating against the *blockdev* mapping
701  * at the time, not against the S_ISREG file which depends on those buffers.
702  * So the locking for private_list is via the private_lock in the address_space
703  * which backs the buffers.  Which is different from the address_space 
704  * against which the buffers are listed.  So for a particular address_space,
705  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
706  * mapping->private_list will always be protected by the backing blockdev's
707  * ->private_lock.
708  *
709  * Which introduces a requirement: all buffers on an address_space's
710  * ->private_list must be from the same address_space: the blockdev's.
711  *
712  * address_spaces which do not place buffers at ->private_list via these
713  * utility functions are free to use private_lock and private_list for
714  * whatever they want.  The only requirement is that list_empty(private_list)
715  * be true at clear_inode() time.
716  *
717  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
718  * filesystems should do that.  invalidate_inode_buffers() should just go
719  * BUG_ON(!list_empty).
720  *
721  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
722  * take an address_space, not an inode.  And it should be called
723  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
724  * queued up.
725  *
726  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
727  * list if it is already on a list.  Because if the buffer is on a list,
728  * it *must* already be on the right one.  If not, the filesystem is being
729  * silly.  This will save a ton of locking.  But first we have to ensure
730  * that buffers are taken *off* the old inode's list when they are freed
731  * (presumably in truncate).  That requires careful auditing of all
732  * filesystems (do it inside bforget()).  It could also be done by bringing
733  * b_inode back.
734  */
735
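/*
 * Illustrative sketch (hypothetical, for exposition only) of the pattern
 * described above, roughly what an ext2-style filesystem does: at modify
 * time a dependent metadata buffer (e.g. an indirect block) is queued on
 * the inode's ->private_list with mark_buffer_dirty_inode(); the
 * filesystem's fsync then writes and waits on that list via
 * sync_mapping_buffers().  The "examplefs" names are made up.
 */
#if 0   /* example only -- hypothetical */
static void examplefs_dirty_indirect(struct buffer_head *bh,
                                     struct inode *inode)
{
        /* Dirty the buffer and queue it on inode->i_mapping->private_list */
        mark_buffer_dirty_inode(bh, inode);
}

static int examplefs_fsync(struct file *file, struct dentry *dentry,
                           int datasync)
{
        struct inode *inode = dentry->d_inode;

        /* Write out and wait upon the queued dependent buffers */
        return sync_mapping_buffers(inode->i_mapping);
}
#endif
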
736 void buffer_insert_list(spinlock_t *lock,
737                 struct buffer_head *bh, struct list_head *list)
738 {
739         spin_lock(lock);
740         list_move_tail(&bh->b_assoc_buffers, list);
741         spin_unlock(lock);
742 }
743
744 /*
745  * The buffer's backing address_space's private_lock must be held
746  */
747 static inline void __remove_assoc_queue(struct buffer_head *bh)
748 {
749         list_del_init(&bh->b_assoc_buffers);
750 }
751
752 int inode_has_buffers(struct inode *inode)
753 {
754         return !list_empty(&inode->i_data.private_list);
755 }
756
757 /*
758  * osync is designed to support O_SYNC io.  It waits synchronously for
759  * all already-submitted IO to complete, but does not queue any new
760  * writes to the disk.
761  *
762  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
763  * you dirty the buffers, and then use osync_inode_buffers to wait for
764  * completion.  Any other dirty buffers which are not yet queued for
765  * write will not be flushed to disk by the osync.
766  */
767 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
768 {
769         struct buffer_head *bh;
770         struct list_head *p;
771         int err = 0;
772
773         spin_lock(lock);
774 repeat:
775         list_for_each_prev(p, list) {
776                 bh = BH_ENTRY(p);
777                 if (buffer_locked(bh)) {
778                         get_bh(bh);
779                         spin_unlock(lock);
780                         wait_on_buffer(bh);
781                         if (!buffer_uptodate(bh))
782                                 err = -EIO;
783                         brelse(bh);
784                         spin_lock(lock);
785                         goto repeat;
786                 }
787         }
788         spin_unlock(lock);
789         return err;
790 }
791
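/*
 * Illustrative sketch (hypothetical, for exposition only) of the O_SYNC
 * pattern described above osync_buffers_list(): queue the write with
 * ll_rw_block() as the buffer is dirtied, then wait for the
 * already-submitted I/O afterwards without queueing anything new.
 * The function name is made up.
 */
#if 0   /* example only -- hypothetical */
static int example_osync_one_buffer(struct buffer_head *bh)
{
        mark_buffer_dirty(bh);
        ll_rw_block(WRITE, 1, &bh);     /* queue the write now */

        wait_on_buffer(bh);             /* later: wait, queue nothing new */
        return buffer_uptodate(bh) ? 0 : -EIO;
}
#endif
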
792 /**
793  * sync_mapping_buffers - write out and wait upon a mapping's "associated"
794  *                        buffers
795  * @mapping: the mapping which wants those buffers written
796  *           (the backing blockdev mapping is found via @mapping->assoc_mapping)
797  *
798  * Starts I/O against the buffers at mapping->private_list, and waits upon
799  * that I/O.
800  *
801  * Basically, this is a convenience function for fsync().  @buffer_mapping is
802  * the blockdev which "owns" the buffers and @mapping is a file or directory
803  * which needs those buffers to be written for a successful fsync().
804  */
805 int sync_mapping_buffers(struct address_space *mapping)
806 {
807         struct address_space *buffer_mapping = mapping->assoc_mapping;
808
809         if (buffer_mapping == NULL || list_empty(&mapping->private_list))
810                 return 0;
811
812         return fsync_buffers_list(&buffer_mapping->private_lock,
813                                         &mapping->private_list);
814 }
815 EXPORT_SYMBOL(sync_mapping_buffers);
816
817 /*
818  * Called when we've recently written block `bblock', and it is known that
819  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
820  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
821  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
822  */
823 void write_boundary_block(struct block_device *bdev,
824                         sector_t bblock, unsigned blocksize)
825 {
826         struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
827         if (bh) {
828                 if (buffer_dirty(bh))
829                         ll_rw_block(WRITE, 1, &bh);
830                 put_bh(bh);
831         }
832 }
833
834 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
835 {
836         struct address_space *mapping = inode->i_mapping;
837         struct address_space *buffer_mapping = bh->b_page->mapping;
838
839         mark_buffer_dirty(bh);
840         if (!mapping->assoc_mapping) {
841                 mapping->assoc_mapping = buffer_mapping;
842         } else {
843                 if (mapping->assoc_mapping != buffer_mapping)
844                         BUG();
845         }
846         if (list_empty(&bh->b_assoc_buffers))
847                 buffer_insert_list(&buffer_mapping->private_lock,
848                                 bh, &mapping->private_list);
849 }
850 EXPORT_SYMBOL(mark_buffer_dirty_inode);
851
852 /*
853  * Add a page to the dirty page list.
854  *
855  * It is a sad fact of life that this function is called from several places
856  * deeply under spinlocking.  It may not sleep.
857  *
858  * If the page has buffers, the uptodate buffers are set dirty, to preserve
859  * dirty-state coherency between the page and the buffers.  If the page does
860  * not have buffers then when they are later attached they will all be set
861  * dirty.
862  *
863  * The buffers are dirtied before the page is dirtied.  There's a small race
864  * window in which a writepage caller may see the page cleanness but not the
865  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
866  * before the buffers, a concurrent writepage caller could clear the page dirty
867  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
868  * page on the dirty page list.
869  *
870  * We use private_lock to lock against try_to_free_buffers while using the
871  * page's buffer list.  Also use this to protect against clean buffers being
872  * added to the page after it was set dirty.
873  *
874  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
875  * address_space though.
876  */
877 int __set_page_dirty_buffers(struct page *page)
878 {
879         struct address_space * const mapping = page->mapping;
880
881         spin_lock(&mapping->private_lock);
882         if (page_has_buffers(page)) {
883                 struct buffer_head *head = page_buffers(page);
884                 struct buffer_head *bh = head;
885
886                 do {
887                         set_buffer_dirty(bh);
888                         bh = bh->b_this_page;
889                 } while (bh != head);
890         }
891         spin_unlock(&mapping->private_lock);
892
893         if (!TestSetPageDirty(page)) {
894                 spin_lock_irq(&mapping->tree_lock);
895                 if (page->mapping) {    /* Race with truncate? */
896                         if (!mapping->backing_dev_info->memory_backed)
897                                 inc_page_state(nr_dirty);
898                         radix_tree_tag_set(&mapping->page_tree, page->index,
899                                                 PAGECACHE_TAG_DIRTY);
900                 }
901                 spin_unlock_irq(&mapping->tree_lock);
902                 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
903         }
904         
905         return 0;
906 }
907 EXPORT_SYMBOL(__set_page_dirty_buffers);
908
909 /*
910  * Write out and wait upon a list of buffers.
911  *
912  * We have conflicting pressures: we want to make sure that all
913  * initially dirty buffers get waited on, but that any subsequently
914  * dirtied buffers don't.  After all, we don't want fsync to last
915  * forever if somebody is actively writing to the file.
916  *
917  * Do this in two main stages: first we copy dirty buffers to a
918  * temporary inode list, queueing the writes as we go.  Then we clean
919  * up, waiting for those writes to complete.
920  * 
921  * During this second stage, any subsequent updates to the file may end
922  * up refiling the buffer on the original inode's dirty list again, so
923  * there is a chance we will end up with a buffer queued for write but
924  * not yet completed on that list.  So, as a final cleanup we go through
925  * the osync code to catch these locked, dirty buffers without requeuing
926  * any newly dirty buffers for write.
927  */
928 int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
929 {
930         struct buffer_head *bh;
931         struct list_head tmp;
932         int err = 0, err2;
933
934         INIT_LIST_HEAD(&tmp);
935
936         spin_lock(lock);
937         while (!list_empty(list)) {
938                 bh = BH_ENTRY(list->next);
939                 list_del_init(&bh->b_assoc_buffers);
940                 if (buffer_dirty(bh) || buffer_locked(bh)) {
941                         list_add(&bh->b_assoc_buffers, &tmp);
942                         if (buffer_dirty(bh)) {
943                                 get_bh(bh);
944                                 spin_unlock(lock);
945                                 /*
946                                  * Ensure any pending I/O completes so that
947                                  * ll_rw_block() actually writes the current
948                                  * contents - it is a noop if I/O is still in
949                                  * flight on potentially older contents.
950                                  */
951                                 wait_on_buffer(bh);
952                                 ll_rw_block(WRITE, 1, &bh);
953                                 brelse(bh);
954                                 spin_lock(lock);
955                         }
956                 }
957         }
958
959         while (!list_empty(&tmp)) {
960                 bh = BH_ENTRY(tmp.prev);
961                 __remove_assoc_queue(bh);
962                 get_bh(bh);
963                 spin_unlock(lock);
964                 wait_on_buffer(bh);
965                 if (!buffer_uptodate(bh))
966                         err = -EIO;
967                 brelse(bh);
968                 spin_lock(lock);
969         }
970         
971         spin_unlock(lock);
972         err2 = osync_buffers_list(lock, list);
973         if (err)
974                 return err;
975         else
976                 return err2;
977 }
978
979 /*
980  * Invalidate any and all dirty buffers on a given inode.  We are
981  * probably unmounting the fs, but that doesn't mean we have already
982  * done a sync().  Just drop the buffers from the inode list.
983  *
984  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
985  * assumes that all the buffers are against the blockdev.  Not true
986  * for reiserfs.
987  */
988 void invalidate_inode_buffers(struct inode *inode)
989 {
990         if (inode_has_buffers(inode)) {
991                 struct address_space *mapping = &inode->i_data;
992                 struct list_head *list = &mapping->private_list;
993                 struct address_space *buffer_mapping = mapping->assoc_mapping;
994
995                 spin_lock(&buffer_mapping->private_lock);
996                 while (!list_empty(list))
997                         __remove_assoc_queue(BH_ENTRY(list->next));
998                 spin_unlock(&buffer_mapping->private_lock);
999         }
1000 }
1001
1002 /*
1003  * Remove any clean buffers from the inode's buffer list.  This is called
1004  * when we're trying to free the inode itself.  Those buffers can pin it.
1005  *
1006  * Returns true if all buffers were removed.
1007  */
1008 int remove_inode_buffers(struct inode *inode)
1009 {
1010         int ret = 1;
1011
1012         if (inode_has_buffers(inode)) {
1013                 struct address_space *mapping = &inode->i_data;
1014                 struct list_head *list = &mapping->private_list;
1015                 struct address_space *buffer_mapping = mapping->assoc_mapping;
1016
1017                 spin_lock(&buffer_mapping->private_lock);
1018                 while (!list_empty(list)) {
1019                         struct buffer_head *bh = BH_ENTRY(list->next);
1020                         if (buffer_dirty(bh)) {
1021                                 ret = 0;
1022                                 break;
1023                         }
1024                         __remove_assoc_queue(bh);
1025                 }
1026                 spin_unlock(&buffer_mapping->private_lock);
1027         }
1028         return ret;
1029 }
1030
1031 /*
1032  * Create the appropriate buffers when given a page for data area and
1033  * the size of each buffer.. Use the bh->b_this_page linked list to
1034  * follow the buffers created.  Return NULL if unable to create more
1035  * buffers.
1036  *
1037  * The retry flag is used to differentiate async IO (paging, swapping)
1038  * which may not fail from ordinary buffer allocations.
1039  */
1040 static struct buffer_head *
1041 create_buffers(struct page * page, unsigned long size, int retry)
1042 {
1043         struct buffer_head *bh, *head;
1044         long offset;
1045
1046 try_again:
1047         head = NULL;
1048         offset = PAGE_SIZE;
1049         while ((offset -= size) >= 0) {
1050                 bh = alloc_buffer_head(GFP_NOFS);
1051                 if (!bh)
1052                         goto no_grow;
1053
1054                 bh->b_bdev = NULL;
1055                 bh->b_this_page = head;
1056                 bh->b_blocknr = -1;
1057                 head = bh;
1058
1059                 bh->b_state = 0;
1060                 atomic_set(&bh->b_count, 0);
1061                 bh->b_size = size;
1062
1063                 /* Link the buffer to its page */
1064                 set_bh_page(bh, page, offset);
1065
1066                 bh->b_end_io = NULL;
1067         }
1068         return head;
1069 /*
1070  * In case anything failed, we just free everything we got.
1071  */
1072 no_grow:
1073         if (head) {
1074                 do {
1075                         bh = head;
1076                         head = head->b_this_page;
1077                         free_buffer_head(bh);
1078                 } while (head);
1079         }
1080
1081         /*
1082          * Return failure for non-async IO requests.  Async IO requests
1083          * are not allowed to fail, so we have to wait until buffer heads
1084          * become available.  But we don't want tasks sleeping with 
1085          * partially complete buffers, so all were released above.
1086          */
1087         if (!retry)
1088                 return NULL;
1089
1090         /* We're _really_ low on memory. Now we just
1091          * wait for old buffer heads to become free due to
1092          * finishing IO.  Since this is an async request and
1093          * the reserve list is empty, we're sure there are 
1094          * async buffer heads in use.
1095          */
1096         free_more_memory();
1097         goto try_again;
1098 }
1099
1100 static inline void
1101 link_dev_buffers(struct page *page, struct buffer_head *head)
1102 {
1103         struct buffer_head *bh, *tail;
1104
1105         bh = head;
1106         do {
1107                 tail = bh;
1108                 bh = bh->b_this_page;
1109         } while (bh);
1110         tail->b_this_page = head;
1111         __set_page_buffers(page, head);
1112 }
1113
1114 /*
1115  * Initialise the state of a blockdev page's buffers.
1116  */ 
1117 static void
1118 init_page_buffers(struct page *page, struct block_device *bdev,
1119                         sector_t block, int size)
1120 {
1121         struct buffer_head *head = page_buffers(page);
1122         struct buffer_head *bh = head;
1123         unsigned int b_state;
1124
1125         b_state = 1 << BH_Mapped;
1126         if (PageUptodate(page))
1127                 b_state |= 1 << BH_Uptodate;
1128
1129         do {
1130                 if (!(bh->b_state & (1 << BH_Mapped))) {
1131                         init_buffer(bh, NULL, NULL);
1132                         bh->b_bdev = bdev;
1133                         bh->b_blocknr = block;
1134                         bh->b_state = b_state;
1135                 }
1136                 block++;
1137                 bh = bh->b_this_page;
1138         } while (bh != head);
1139 }
1140
1141 /*
1142  * Create the page-cache page that contains the requested block.
1143  *
1144  * This is used purely for blockdev mappings.
1145  */
1146 static struct page *
1147 grow_dev_page(struct block_device *bdev, sector_t block,
1148                 pgoff_t index, int size)
1149 {
1150         struct inode *inode = bdev->bd_inode;
1151         struct page *page;
1152         struct buffer_head *bh;
1153
1154         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1155         if (!page)
1156                 return NULL;
1157
1158         if (!PageLocked(page))
1159                 BUG();
1160
1161         if (page_has_buffers(page)) {
1162                 bh = page_buffers(page);
1163                 if (bh->b_size == size)
1164                         return page;
1165                 if (!try_to_free_buffers(page))
1166                         goto failed;
1167         }
1168
1169         /*
1170          * Allocate some buffers for this page
1171          */
1172         bh = create_buffers(page, size, 0);
1173         if (!bh)
1174                 goto failed;
1175
1176         /*
1177          * Link the page to the buffers and initialise them.  Take the
1178          * lock to be atomic wrt __find_get_block(), which does not
1179          * run under the page lock.
1180          */
1181         spin_lock(&inode->i_mapping->private_lock);
1182         link_dev_buffers(page, bh);
1183         init_page_buffers(page, bdev, block, size);
1184         spin_unlock(&inode->i_mapping->private_lock);
1185         return page;
1186
1187 failed:
1188         BUG();
1189         unlock_page(page);
1190         page_cache_release(page);
1191         return NULL;
1192 }
1193
1194 /*
1195  * Create buffers for the specified block device block's page.  If
1196  * that page was dirty, the buffers are set dirty also.
1197  *
1198  * Except that's a bug.  Attaching dirty buffers to a dirty
1199  * blockdev's page can result in filesystem corruption, because
1200  * some of those buffers may be aliases of filesystem data.
1201  * grow_dev_page() will go BUG() if this happens.
1202  */
1203 static inline int
1204 grow_buffers(struct block_device *bdev, sector_t block, int size)
1205 {
1206         struct page *page;
1207         pgoff_t index;
1208         int sizebits;
1209
1210         /* Size must be multiple of hard sectorsize */
1211         if (size & (bdev_hardsect_size(bdev)-1))
1212                 BUG();
1213         if (size < 512 || size > PAGE_SIZE)
1214                 BUG();
1215
1216         sizebits = -1;
1217         do {
1218                 sizebits++;
1219         } while ((size << sizebits) < PAGE_SIZE);
1220
1221         index = block >> sizebits;
1222         block = index << sizebits;
1223
1224         /* Create a page with the proper size buffers.. */
1225         page = grow_dev_page(bdev, block, index, size);
1226         if (!page)
1227                 return 0;
1228         unlock_page(page);
1229         page_cache_release(page);
1230         return 1;
1231 }
1232
1233 struct buffer_head *
1234 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1235 {
1236         for (;;) {
1237                 struct buffer_head * bh;
1238
1239                 bh = __find_get_block(bdev, block, size);
1240                 if (bh)
1241                         return bh;
1242
1243                 if (!grow_buffers(bdev, block, size))
1244                         free_more_memory();
1245         }
1246 }
1247
1248 /*
1249  * The relationship between dirty buffers and dirty pages:
1250  *
1251  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1252  * the page is tagged dirty in its radix tree.
1253  *
1254  * At all times, the dirtiness of the buffers represents the dirtiness of
1255  * subsections of the page.  If the page has buffers, the page dirty bit is
1256  * merely a hint about the true dirty state.
1257  *
1258  * When a page is set dirty in its entirety, all its buffers are marked dirty
1259  * (if the page has buffers).
1260  *
1261  * When a buffer is marked dirty, its page is dirtied, but the page's other
1262  * buffers are not.
1263  *
1264  * Also.  When blockdev buffers are explicitly read with bread(), they
1265  * individually become uptodate.  But their backing page remains not
1266  * uptodate - even if all of its buffers are uptodate.  A subsequent
1267  * block_read_full_page() against that page will discover all the uptodate
1268  * buffers, will set the page uptodate and will perform no I/O.
1269  */
1270
1271 /**
1272  * mark_buffer_dirty - mark a buffer_head as needing writeout
1273  * @bh: the buffer_head to mark dirty
1274  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1275  * backing page dirty, then tag the page as dirty in its address_space's radix
1276  * tree and then attach the address_space's inode to its superblock's dirty
1277  * inode list.
1278  *
1279  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1280  * mapping->tree_lock and the global inode_lock.
1281  */
1282 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1283 {
1284         if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1285                 __set_page_dirty_nobuffers(bh->b_page);
1286 }
1287
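/*
 * Illustrative sketch (hypothetical, for exposition only): the usual
 * read-modify-dirty sequence a filesystem uses for an on-disk metadata
 * block, ending in mark_buffer_dirty().  The function name is made up.
 */
#if 0   /* example only -- hypothetical */
static void example_update_byte(struct super_block *sb, sector_t block,
                                unsigned int offset, unsigned char value)
{
        struct buffer_head *bh;

        bh = __bread(sb->s_bdev, block, sb->s_blocksize);
        if (!bh)
                return;                         /* I/O error */
        ((unsigned char *)bh->b_data)[offset] = value;
        mark_buffer_dirty(bh);                  /* dirties buffer, page and inode */
        brelse(bh);
}
#endif
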
1288 /*
1289  * Decrement a buffer_head's reference count.  If all buffers against a page
1290  * have zero reference count, are clean and unlocked, and if the page is clean
1291  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1292  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1293  * a page but it ends up not being freed, and buffers may later be reattached).
1294  */
1295 void __brelse(struct buffer_head * buf)
1296 {
1297         if (atomic_read(&buf->b_count)) {
1298                 put_bh(buf);
1299                 return;
1300         }
1301         printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1302         WARN_ON(1);
1303 }
1304
1305 /*
1306  * bforget() is like brelse(), except it discards any
1307  * potentially dirty data.
1308  */
1309 void __bforget(struct buffer_head *bh)
1310 {
1311         clear_buffer_dirty(bh);
1312         if (!list_empty(&bh->b_assoc_buffers)) {
1313                 struct address_space *buffer_mapping = bh->b_page->mapping;
1314
1315                 spin_lock(&buffer_mapping->private_lock);
1316                 list_del_init(&bh->b_assoc_buffers);
1317                 spin_unlock(&buffer_mapping->private_lock);
1318         }
1319         __brelse(bh);
1320 }
1321
1322 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1323 {
1324         lock_buffer(bh);
1325         if (buffer_uptodate(bh)) {
1326                 unlock_buffer(bh);
1327                 return bh;
1328         } else {
1329                 get_bh(bh);
1330                 bh->b_end_io = end_buffer_read_sync;
1331                 submit_bh(READ, bh);
1332                 wait_on_buffer(bh);
1333                 if (buffer_uptodate(bh))
1334                         return bh;
1335         }
1336         brelse(bh);
1337         return NULL;
1338 }
1339
1340 /*
1341  * Per-cpu buffer LRU implementation, to reduce the cost of __find_get_block().
1342  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1343  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1344  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1345  * CPU's LRUs at the same time.
1346  *
1347  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1348  * sb_find_get_block().
1349  *
1350  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1351  * a local interrupt disable for that.
1352  */
1353
1354 #define BH_LRU_SIZE     8
1355
1356 struct bh_lru {
1357         struct buffer_head *bhs[BH_LRU_SIZE];
1358 };
1359
1360 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{0}};
1361
1362 #ifdef CONFIG_SMP
1363 #define bh_lru_lock()   local_irq_disable()
1364 #define bh_lru_unlock() local_irq_enable()
1365 #else
1366 #define bh_lru_lock()   preempt_disable()
1367 #define bh_lru_unlock() preempt_enable()
1368 #endif
1369
1370 static inline void check_irqs_on(void)
1371 {
1372 #ifdef irqs_disabled
1373         BUG_ON(irqs_disabled());
1374 #endif
1375 }
1376
1377 /*
1378  * The LRU management algorithm is dopey-but-simple.  Sorry.
1379  */
1380 static void bh_lru_install(struct buffer_head *bh)
1381 {
1382         struct buffer_head *evictee = NULL;
1383         struct bh_lru *lru;
1384
1385         check_irqs_on();
1386         bh_lru_lock();
1387         lru = &__get_cpu_var(bh_lrus);
1388         if (lru->bhs[0] != bh) {
1389                 struct buffer_head *bhs[BH_LRU_SIZE];
1390                 int in;
1391                 int out = 0;
1392
1393                 get_bh(bh);
1394                 bhs[out++] = bh;
1395                 for (in = 0; in < BH_LRU_SIZE; in++) {
1396                         struct buffer_head *bh2 = lru->bhs[in];
1397
1398                         if (bh2 == bh) {
1399                                 __brelse(bh2);
1400                         } else {
1401                                 if (out >= BH_LRU_SIZE) {
1402                                         BUG_ON(evictee != NULL);
1403                                         evictee = bh2;
1404                                 } else {
1405                                         bhs[out++] = bh2;
1406                                 }
1407                         }
1408                 }
1409                 while (out < BH_LRU_SIZE)
1410                         bhs[out++] = NULL;
1411                 memcpy(lru->bhs, bhs, sizeof(bhs));
1412         }
1413         bh_lru_unlock();
1414
1415         if (evictee)
1416                 __brelse(evictee);
1417 }
1418
1419 /*
1420  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1421  */
1422 static inline struct buffer_head *
1423 lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1424 {
1425         struct buffer_head *ret = NULL;
1426         struct bh_lru *lru;
1427         int i;
1428
1429         check_irqs_on();
1430         bh_lru_lock();
1431         lru = &__get_cpu_var(bh_lrus);
1432         for (i = 0; i < BH_LRU_SIZE; i++) {
1433                 struct buffer_head *bh = lru->bhs[i];
1434
1435                 if (bh && bh->b_bdev == bdev &&
1436                                 bh->b_blocknr == block && bh->b_size == size) {
1437                         if (i) {
1438                                 while (i) {
1439                                         lru->bhs[i] = lru->bhs[i - 1];
1440                                         i--;
1441                                 }
1442                                 lru->bhs[0] = bh;
1443                         }
1444                         get_bh(bh);
1445                         ret = bh;
1446                         break;
1447                 }
1448         }
1449         bh_lru_unlock();
1450         return ret;
1451 }
1452
1453 /*
1454  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1455  * it in the LRU and mark it as accessed.  If it is not present then return
1456  * NULL
1457  */
1458 struct buffer_head *
1459 __find_get_block(struct block_device *bdev, sector_t block, int size)
1460 {
1461         struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1462
1463         if (bh == NULL) {
1464                 bh = __find_get_block_slow(bdev, block, size);
1465                 if (bh)
1466                         bh_lru_install(bh);
1467         }
1468         if (bh)
1469                 touch_buffer(bh);
1470         return bh;
1471 }
1472 EXPORT_SYMBOL(__find_get_block);
1473
1474 /*
1475  * __getblk will locate (and, if necessary, create) the buffer_head
1476  * which corresponds to the passed block_device, block and size. The
1477  * returned buffer has its reference count incremented.
1478  *
1479  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1480  * illegal block number, __getblk() will happily return a buffer_head
1481  * which represents the non-existent block.  Very weird.
1482  *
1483  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1484  * attempt is failing.  FIXME, perhaps?
1485  */
1486 struct buffer_head *
1487 __getblk(struct block_device *bdev, sector_t block, int size)
1488 {
1489         struct buffer_head *bh = __find_get_block(bdev, block, size);
1490
1491         if (bh == NULL)
1492                 bh = __getblk_slow(bdev, block, size);
1493         return bh;
1494 }
1495 EXPORT_SYMBOL(__getblk);
1496
1497 /*
1498  * Do async read-ahead on a buffer..
1499  */
1500 void __breadahead(struct block_device *bdev, sector_t block, int size)
1501 {
1502         struct buffer_head *bh = __getblk(bdev, block, size);
1503         ll_rw_block(READA, 1, &bh);
1504         brelse(bh);
1505 }
1506 EXPORT_SYMBOL(__breadahead);
1507
1508 /**
1509  *  __bread() - reads a specified block and returns the bh
1510  *  @block: number of block
1511  *  @size: size (in bytes) to read
1512  * 
1513  *  Reads a specified block, and returns buffer head that contains it.
1514  *  It returns NULL if the block was unreadable.
1515  */
1516 struct buffer_head *
1517 __bread(struct block_device *bdev, sector_t block, int size)
1518 {
1519         struct buffer_head *bh = __getblk(bdev, block, size);
1520
1521         if (!buffer_uptodate(bh))
1522                 bh = __bread_slow(bh);
1523         return bh;
1524 }
1525 EXPORT_SYMBOL(__bread);
1526
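/*
 * Illustrative sketch (hypothetical, for exposition only): combining
 * __breadahead() with __bread() - start asynchronous reads for blocks we
 * expect to need soon, then do the blocking read for the one we need now.
 * The function name is made up.
 */
#if 0   /* example only -- hypothetical */
static struct buffer_head *
example_bread_with_readahead(struct block_device *bdev, sector_t block, int size)
{
        __breadahead(bdev, block + 1, size);    /* async, best-effort */
        __breadahead(bdev, block + 2, size);

        return __bread(bdev, block, size);      /* NULL on I/O error */
}
#endif
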
1527 /*
1528  * invalidate_bh_lrus() is called rarely - at unmount.  Because it is only for
1529  * unmount it only needs to ensure that all buffers from the target device are
1530  * invalidated on return and it doesn't need to worry about new buffers from
1531  * that device being added - the unmount code has to prevent that.
1532  */
1533 static void invalidate_bh_lru(void *arg)
1534 {
1535         struct bh_lru *b = &get_cpu_var(bh_lrus);
1536         int i;
1537
1538         for (i = 0; i < BH_LRU_SIZE; i++) {
1539                 brelse(b->bhs[i]);
1540                 b->bhs[i] = NULL;
1541         }
1542         put_cpu_var(bh_lrus);
1543 }
1544         
1545 static void invalidate_bh_lrus(void)
1546 {
1547         on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1548 }
1549
1550 void set_bh_page(struct buffer_head *bh,
1551                 struct page *page, unsigned long offset)
1552 {
1553         bh->b_page = page;
1554         if (offset >= PAGE_SIZE)
1555                 BUG();
1556         if (PageHighMem(page))
1557                 /*
1558                  * This catches illegal uses and preserves the offset:
1559                  */
1560                 bh->b_data = (char *)(0 + offset);
1561         else
1562                 bh->b_data = page_address(page) + offset;
1563 }
1564 EXPORT_SYMBOL(set_bh_page);
1565
1566 /*
1567  * Called when truncating a buffer on a page completely.
1568  */
1569 static inline void discard_buffer(struct buffer_head * bh)
1570 {
1571         lock_buffer(bh);
1572         clear_buffer_dirty(bh);
1573         bh->b_bdev = NULL;
1574         clear_buffer_mapped(bh);
1575         clear_buffer_req(bh);
1576         clear_buffer_new(bh);
1577         clear_buffer_delay(bh);
1578         unlock_buffer(bh);
1579 }
1580
1581 /**
1582  * try_to_release_page() - release old fs-specific metadata on a page
1583  *
1584  * @page: the page which the kernel is trying to free
1585  * @gfp_mask: memory allocation flags (and I/O mode)
1586  *
1587  * The address_space should try to release any data against the page
1588  * (presumably at page->private).  If the release was successful, return `1'.
1589  * Otherwise return zero.
1590  *
1591  * The @gfp_mask argument specifies whether I/O may be performed to release
1592  * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1593  *
1594  * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1595  */
1596 int try_to_release_page(struct page *page, int gfp_mask)
1597 {
1598         struct address_space * const mapping = page->mapping;
1599
1600         BUG_ON(!PageLocked(page));
1601         if (PageWriteback(page))
1602                 return 0;
1603         
1604         if (mapping && mapping->a_ops->releasepage)
1605                 return mapping->a_ops->releasepage(page, gfp_mask);
1606         return try_to_free_buffers(page);
1607 }
1608 EXPORT_SYMBOL(try_to_release_page);
1609
1610 /**
1611  * block_invalidatepage - invalidate part or all of a buffer-backed page
1612  *
1613  * @page: the page which is affected
1614  * @offset: the index of the truncation point
1615  *
1616  * block_invalidatepage() is called when all or part of the page has become
1617  * invalidated by a truncate operation.
1618  *
1619  * block_invalidatepage() does not have to release all buffers, but it must
1620  * ensure that no dirty buffer is left outside @offset and that no I/O
1621  * is underway against any of the blocks which are outside the truncation
1622  * point, because the caller is about to free (and possibly reuse) those
1623  * blocks on-disk.
1624  */
1625 int block_invalidatepage(struct page *page, unsigned long offset)
1626 {
1627         struct buffer_head *head, *bh, *next;
1628         unsigned int curr_off = 0;
1629         int ret = 1;
1630
1631         BUG_ON(!PageLocked(page));
1632         if (!page_has_buffers(page))
1633                 goto out;
1634
1635         head = page_buffers(page);
1636         bh = head;
1637         do {
1638                 unsigned int next_off = curr_off + bh->b_size;
1639                 next = bh->b_this_page;
1640
1641                 /*
1642                  * is this block fully invalidated?
1643                  */
1644                 if (offset <= curr_off)
1645                         discard_buffer(bh);
1646                 curr_off = next_off;
1647                 bh = next;
1648         } while (bh != head);
1649
1650         /*
1651          * We release buffers only if the entire page is being invalidated.
1652          * The get_block cached value has been unconditionally invalidated,
1653          * so real IO is not possible anymore.
1654          */
1655         if (offset == 0)
1656                 ret = try_to_release_page(page, 0);
1657 out:
1658         return ret;
1659 }
1660 EXPORT_SYMBOL(block_invalidatepage);
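
/*
 * Example (editor's sketch): a plain buffer-backed filesystem's
 * ->invalidatepage is typically just a wrapper around
 * block_invalidatepage(); "examplefs" is a made-up name.
 */
static int examplefs_invalidatepage(struct page *page, unsigned long offset)
{
	/* nothing filesystem-specific to tear down first */
	return block_invalidatepage(page, offset);
}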
1661
1662 /*
1663  * We attach and possibly dirty the buffers atomically wrt
1664  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1665  * is already excluded via the page lock.
1666  */
1667 void create_empty_buffers(struct page *page,
1668                         unsigned long blocksize, unsigned long b_state)
1669 {
1670         struct buffer_head *bh, *head, *tail;
1671
1672         head = create_buffers(page, blocksize, 1);
1673         bh = head;
1674         do {
1675                 bh->b_state |= b_state;
1676                 tail = bh;
1677                 bh = bh->b_this_page;
1678         } while (bh);
1679         tail->b_this_page = head;
1680
1681         spin_lock(&page->mapping->private_lock);
1682         if (PageUptodate(page) || PageDirty(page)) {
1683                 bh = head;
1684                 do {
1685                         if (PageDirty(page))
1686                                 set_buffer_dirty(bh);
1687                         if (PageUptodate(page))
1688                                 set_buffer_uptodate(bh);
1689                         bh = bh->b_this_page;
1690                 } while (bh != head);
1691         }
1692         __set_page_buffers(page, head);
1693         spin_unlock(&page->mapping->private_lock);
1694 }
1695 EXPORT_SYMBOL(create_empty_buffers);
1696
1697 /*
1698  * We are taking a block for data and we don't want any output from any
1699  * buffer-cache aliases from the moment this function returns until
1700  * the moment when something explicitly marks the buffer
1701  * dirty (hopefully that will not happen until we free that block ;-)
1702  * We don't even need to mark it not-uptodate - nobody can expect
1703  * anything from a newly allocated buffer anyway. We used to use
1704  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1705  * don't want to mark the alias unmapped, for example - it would confuse
1706  * anyone who might pick it with bread() afterwards...
1707  *
1708  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1709  * be writeout I/O going on against recently-freed buffers.  We don't
1710  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1711  * only if we really need to.  That happens here.
1712  */
1713 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1714 {
1715         struct buffer_head *old_bh;
1716
1717         old_bh = __find_get_block_slow(bdev, block, 0);
1718         if (old_bh) {
1719                 clear_buffer_dirty(old_bh);
1720                 wait_on_buffer(old_bh);
1721                 clear_buffer_req(old_bh);
1722                 __brelse(old_bh);
1723         }
1724 }
1725 EXPORT_SYMBOL(unmap_underlying_metadata);
1726
1727 /*
1728  * NOTE! All mapped/uptodate combinations are valid:
1729  *
1730  *      Mapped  Uptodate        Meaning
1731  *
1732  *      No      No              "unknown" - must do get_block()
1733  *      No      Yes             "hole" - zero-filled
1734  *      Yes     No              "allocated" - allocated on disk, not read in
1735  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1736  *
1737  * "Dirty" is valid only with the last case (mapped+uptodate).
1738  */
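
/*
 * Example (editor's sketch): the table above, spelled out as code.  A
 * hypothetical helper deciding how to treat a buffer before using it.
 */
static int example_buffer_state(struct buffer_head *bh)
{
	if (!buffer_mapped(bh) && !buffer_uptodate(bh))
		return 0;	/* "unknown" - must do get_block() */
	if (!buffer_mapped(bh) && buffer_uptodate(bh))
		return 1;	/* "hole" - treat contents as zeroes */
	if (buffer_mapped(bh) && !buffer_uptodate(bh))
		return 2;	/* "allocated" - must be read in */
	return 3;		/* "valid" - usable as-is */
}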
1739
1740 /*
1741  * While block_write_full_page is writing back the dirty buffers under
1742  * the page lock, whoever dirtied the buffers may decide to clean them
1743  * again at any time.  We handle that by only looking at the buffer
1744  * state inside lock_buffer().
1745  *
1746  * If block_write_full_page() is called for regular writeback
1747  * (called_for_sync() is false) then it will redirty a page which has a locked
1748  * buffer.   This only can happen if someone has written the buffer directly,
1749  * with submit_bh().  At the address_space level PageWriteback prevents this
1750  * contention from occurring.
1751  */
1752 static int __block_write_full_page(struct inode *inode, struct page *page,
1753                         get_block_t *get_block, struct writeback_control *wbc)
1754 {
1755         int err;
1756         sector_t block;
1757         sector_t last_block;
1758         struct buffer_head *bh, *head;
1759         int nr_underway = 0;
1760
1761         BUG_ON(!PageLocked(page));
1762
1763         last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1764
1765         if (!page_has_buffers(page)) {
1766                 create_empty_buffers(page, 1 << inode->i_blkbits,
1767                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
1768         }
1769
1770         /*
1771          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1772          * here, and the (potentially unmapped) buffers may become dirty at
1773          * any time.  If a buffer becomes dirty here after we've inspected it
1774          * then we just miss that fact, and the page stays dirty.
1775          *
1776          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1777          * handle that here by just cleaning them.
1778          */
1779
1780         block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1781         head = page_buffers(page);
1782         bh = head;
1783
1784         /*
1785          * Get all the dirty buffers mapped to disk addresses and
1786          * handle any aliases from the underlying blockdev's mapping.
1787          */
1788         do {
1789                 if (block > last_block) {
1790                         /*
1791                          * mapped buffers outside i_size will occur, because
1792                          * this page can be outside i_size when there is a
1793                          * truncate in progress.
1794                          */
1795                         /*
1796                          * The buffer was zeroed by block_write_full_page()
1797                          */
1798                         clear_buffer_dirty(bh);
1799                         set_buffer_uptodate(bh);
1800                 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1801                         err = get_block(inode, block, bh, 1);
1802                         if (err)
1803                                 goto recover;
1804                         if (buffer_new(bh)) {
1805                                 /* blockdev mappings never come here */
1806                                 clear_buffer_new(bh);
1807                                 unmap_underlying_metadata(bh->b_bdev,
1808                                                         bh->b_blocknr);
1809                         }
1810                 }
1811                 bh = bh->b_this_page;
1812                 block++;
1813         } while (bh != head);
1814
1815         do {
1816                 get_bh(bh);
1817                 if (!buffer_mapped(bh))
1818                         continue;
1819                 /*
1820                  * If it's a fully non-blocking write attempt and we cannot
1821                  * lock the buffer then redirty the page.  Note that this can
1822                  * potentially cause a busy-wait loop from pdflush and kswapd
1823                  * activity, but those code paths have their own higher-level
1824                  * throttling.
1825                  */
1826                 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1827                         lock_buffer(bh);
1828                 } else if (test_set_buffer_locked(bh)) {
1829                         redirty_page_for_writepage(wbc, page);
1830                         continue;
1831                 }
1832                 if (test_clear_buffer_dirty(bh)) {
1833                         mark_buffer_async_write(bh);
1834                 } else {
1835                         unlock_buffer(bh);
1836                 }
1837         } while ((bh = bh->b_this_page) != head);
1838
1839         BUG_ON(PageWriteback(page));
1840         set_page_writeback(page);       /* Keeps try_to_free_buffers() away */
1841         unlock_page(page);
1842
1843         /*
1844          * The page may come unlocked any time after the *first* submit_bh()
1845          * call.  Be careful with its buffers.
1846          */
1847         do {
1848                 struct buffer_head *next = bh->b_this_page;
1849                 if (buffer_async_write(bh)) {
1850                         submit_bh(WRITE, bh);
1851                         nr_underway++;
1852                 }
1853                 put_bh(bh);
1854                 bh = next;
1855         } while (bh != head);
1856
1857         err = 0;
1858 done:
1859         if (nr_underway == 0) {
1860                 /*
1861                  * The page was marked dirty, but the buffers were
1862                  * clean.  Someone wrote them back by hand with
1863                  * ll_rw_block/submit_bh.  A rare case.
1864                  */
1865                 int uptodate = 1;
1866                 do {
1867                         if (!buffer_uptodate(bh)) {
1868                                 uptodate = 0;
1869                                 break;
1870                         }
1871                         bh = bh->b_this_page;
1872                 } while (bh != head);
1873                 if (uptodate)
1874                         SetPageUptodate(page);
1875                 end_page_writeback(page);
1876                 wbc->pages_skipped++;   /* We didn't write this page */
1877         }
1878         return err;
1879
1880 recover:
1881         /*
1882          * ENOSPC, or some other error.  We may already have added some
1883          * blocks to the file, so we need to write these out to avoid
1884          * exposing stale data.
1885          * The page is currently locked and not marked for writeback
1886          */
1887         bh = head;
1888         /* Recovery: lock and submit the mapped buffers */
1889         do {
1890                 get_bh(bh);
1891                 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1892                         lock_buffer(bh);
1893                         mark_buffer_async_write(bh);
1894                 } else {
1895                         /*
1896                          * The buffer may have been set dirty during
1897                          * attachment to a dirty page.
1898                          */
1899                         clear_buffer_dirty(bh);
1900                 }
1901         } while ((bh = bh->b_this_page) != head);
1902         SetPageError(page);
1903         BUG_ON(PageWriteback(page));
1904         set_page_writeback(page);
1905         unlock_page(page);
1906         do {
1907                 struct buffer_head *next = bh->b_this_page;
1908                 if (buffer_async_write(bh)) {
1909                         clear_buffer_dirty(bh);
1910                         submit_bh(WRITE, bh);
1911                         nr_underway++;
1912                 }
1913                 put_bh(bh);
1914                 bh = next;
1915         } while (bh != head);
1916         goto done;
1917 }
1918
1919 static int __block_prepare_write(struct inode *inode, struct page *page,
1920                 unsigned from, unsigned to, get_block_t *get_block)
1921 {
1922         unsigned block_start, block_end;
1923         sector_t block;
1924         int err = 0;
1925         unsigned blocksize, bbits;
1926         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1927
1928         BUG_ON(!PageLocked(page));
1929         BUG_ON(from > PAGE_CACHE_SIZE);
1930         BUG_ON(to > PAGE_CACHE_SIZE);
1931         BUG_ON(from > to);
1932
1933         blocksize = 1 << inode->i_blkbits;
1934         if (!page_has_buffers(page))
1935                 create_empty_buffers(page, blocksize, 0);
1936         head = page_buffers(page);
1937
1938         bbits = inode->i_blkbits;
1939         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1940
1941         for(bh = head, block_start = 0; bh != head || !block_start;
1942             block++, block_start=block_end, bh = bh->b_this_page) {
1943                 block_end = block_start + blocksize;
1944                 if (block_end <= from || block_start >= to) {
1945                         if (PageUptodate(page)) {
1946                                 if (!buffer_uptodate(bh))
1947                                         set_buffer_uptodate(bh);
1948                         }
1949                         continue;
1950                 }
1951                 if (buffer_new(bh))
1952                         clear_buffer_new(bh);
1953                 if (!buffer_mapped(bh)) {
1954                         err = get_block(inode, block, bh, 1);
1955                         if (err)
1956                                 goto out;
1957                         if (buffer_new(bh)) {
1958                                 clear_buffer_new(bh);
1959                                 unmap_underlying_metadata(bh->b_bdev,
1960                                                         bh->b_blocknr);
1961                                 if (PageUptodate(page)) {
1962                                         set_buffer_uptodate(bh);
1963                                         continue;
1964                                 }
1965                                 if (block_end > to || block_start < from) {
1966                                         void *kaddr;
1967
1968                                         kaddr = kmap_atomic(page, KM_USER0);
1969                                         if (block_end > to)
1970                                                 memset(kaddr+to, 0,
1971                                                         block_end-to);
1972                                         if (block_start < from)
1973                                                 memset(kaddr+block_start,
1974                                                         0, from-block_start);
1975                                         flush_dcache_page(page);
1976                                         kunmap_atomic(kaddr, KM_USER0);
1977                                 }
1978                                 continue;
1979                         }
1980                 }
1981                 if (PageUptodate(page)) {
1982                         if (!buffer_uptodate(bh))
1983                                 set_buffer_uptodate(bh);
1984                         continue; 
1985                 }
1986                 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1987                      (block_start < from || block_end > to)) {
1988                         ll_rw_block(READ, 1, &bh);
1989                         *wait_bh++=bh;
1990                 }
1991         }
1992         /*
1993          * If we issued read requests - let them complete.
1994          */
1995         while(wait_bh > wait) {
1996                 wait_on_buffer(*--wait_bh);
1997                 if (!buffer_uptodate(*wait_bh))
1998                         return -EIO;
1999         }
2000         return 0;
2001 out:
2002         /*
2003          * Zero out any newly allocated blocks to avoid exposing stale
2004          * data.  If BH_New is set, we know that the block was newly
2005          * allocated in the above loop.
2006          */
2007         bh = head;
2008         block_start = 0;
2009         do {
2010                 block_end = block_start+blocksize;
2011                 if (block_end <= from)
2012                         goto next_bh;
2013                 if (block_start >= to)
2014                         break;
2015                 if (buffer_new(bh)) {
2016                         void *kaddr;
2017
2018                         clear_buffer_new(bh);
2019                         kaddr = kmap_atomic(page, KM_USER0);
2020                         memset(kaddr+block_start, 0, bh->b_size);
2021                         kunmap_atomic(kaddr, KM_USER0);
2022                         set_buffer_uptodate(bh);
2023                         mark_buffer_dirty(bh);
2024                 }
2025 next_bh:
2026                 block_start = block_end;
2027                 bh = bh->b_this_page;
2028         } while (bh != head);
2029         return err;
2030 }
2031
2032 static int __block_commit_write(struct inode *inode, struct page *page,
2033                 unsigned from, unsigned to)
2034 {
2035         unsigned block_start, block_end;
2036         int partial = 0;
2037         unsigned blocksize;
2038         struct buffer_head *bh, *head;
2039
2040         blocksize = 1 << inode->i_blkbits;
2041
2042         for(bh = head = page_buffers(page), block_start = 0;
2043             bh != head || !block_start;
2044             block_start=block_end, bh = bh->b_this_page) {
2045                 block_end = block_start + blocksize;
2046                 if (block_end <= from || block_start >= to) {
2047                         if (!buffer_uptodate(bh))
2048                                 partial = 1;
2049                 } else {
2050                         set_buffer_uptodate(bh);
2051                         mark_buffer_dirty(bh);
2052                 }
2053         }
2054
2055         /*
2056          * If this is a partial write which happened to make all buffers
2057          * uptodate then we can optimize away a bogus readpage() for
2058          * the next read(). Here we 'discover' whether the page went
2059          * uptodate as a result of this (potentially partial) write.
2060          */
2061         if (!partial)
2062                 SetPageUptodate(page);
2063         return 0;
2064 }
2065
2066 /*
2067  * Generic "read page" function for block devices that have the normal
2068  * get_block functionality. This is most of the block device filesystems.
2069  * Reads the page asynchronously --- the unlock_buffer() and
2070  * set/clear_buffer_uptodate() functions propagate buffer state into the
2071  * page struct once IO has completed.
2072  */
2073 int block_read_full_page(struct page *page, get_block_t *get_block)
2074 {
2075         struct inode *inode = page->mapping->host;
2076         sector_t iblock, lblock;
2077         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2078         unsigned int blocksize;
2079         int nr, i;
2080         int fully_mapped = 1;
2081
2082         if (!PageLocked(page))
2083                 PAGE_BUG(page);
2084         blocksize = 1 << inode->i_blkbits;
2085         if (!page_has_buffers(page))
2086                 create_empty_buffers(page, blocksize, 0);
2087         head = page_buffers(page);
2088
2089         iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2090         lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2091         bh = head;
2092         nr = 0;
2093         i = 0;
2094
2095         do {
2096                 if (buffer_uptodate(bh))
2097                         continue;
2098
2099                 if (!buffer_mapped(bh)) {
2100                         fully_mapped = 0;
2101                         if (iblock < lblock) {
2102                                 if (get_block(inode, iblock, bh, 0))
2103                                         SetPageError(page);
2104                         }
2105                         if (!buffer_mapped(bh)) {
2106                                 void *kaddr = kmap_atomic(page, KM_USER0);
2107                                 memset(kaddr + i * blocksize, 0, blocksize);
2108                                 flush_dcache_page(page);
2109                                 kunmap_atomic(kaddr, KM_USER0);
2110                                 set_buffer_uptodate(bh);
2111                                 continue;
2112                         }
2113                         /*
2114                          * get_block() might have updated the buffer
2115                          * synchronously
2116                          */
2117                         if (buffer_uptodate(bh))
2118                                 continue;
2119                 }
2120                 arr[nr++] = bh;
2121         } while (i++, iblock++, (bh = bh->b_this_page) != head);
2122
2123         if (fully_mapped)
2124                 SetPageMappedToDisk(page);
2125
2126         if (!nr) {
2127                 /*
2128                  * All buffers are uptodate - we can set the page uptodate
2129                  * as well. But not if get_block() returned an error.
2130                  */
2131                 if (!PageError(page))
2132                         SetPageUptodate(page);
2133                 unlock_page(page);
2134                 return 0;
2135         }
2136
2137         /* Stage two: lock the buffers */
2138         for (i = 0; i < nr; i++) {
2139                 bh = arr[i];
2140                 lock_buffer(bh);
2141                 mark_buffer_async_read(bh);
2142         }
2143
2144         /*
2145          * Stage 3: start the IO.  Check for uptodateness
2146          * inside the buffer lock in case another process reading
2147          * the underlying blockdev brought it uptodate (the sct fix).
2148          */
2149         for (i = 0; i < nr; i++) {
2150                 bh = arr[i];
2151                 if (buffer_uptodate(bh))
2152                         end_buffer_async_read(bh, 1);
2153                 else
2154                         submit_bh(READ, bh);
2155         }
2156         return 0;
2157 }
2158
2159 /* utility function for filesystems that need to do work on expanding
2160  * truncates.  Uses prepare/commit_write to allow the filesystem to
2161  * deal with the hole.  
2162  */
2163 int generic_cont_expand(struct inode *inode, loff_t size)
2164 {
2165         struct address_space *mapping = inode->i_mapping;
2166         struct page *page;
2167         unsigned long index, offset, limit;
2168         int err;
2169
2170         err = -EFBIG;
2171         limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2172         if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2173                 send_sig(SIGXFSZ, current, 0);
2174                 goto out;
2175         }
2176         if (size > inode->i_sb->s_maxbytes)
2177                 goto out;
2178
2179         offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
2180
2181         /* ugh.  in prepare/commit_write, if from==to==start of block, we 
2182         ** skip the prepare.  make sure we never send an offset for the start
2183         ** of a block
2184         */
2185         if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2186                 offset++;
2187         }
2188         index = size >> PAGE_CACHE_SHIFT;
2189         err = -ENOMEM;
2190         page = grab_cache_page(mapping, index);
2191         if (!page)
2192                 goto out;
2193         err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2194         if (!err) {
2195                 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2196         }
2197         unlock_page(page);
2198         page_cache_release(page);
2199         if (err > 0)
2200                 err = 0;
2201 out:
2202         return err;
2203 }
2204
2205 /*
2206  * For moronic filesystems that do not allow holes in files.
2207  * We may have to extend the file.
2208  */
2209
2210 int cont_prepare_write(struct page *page, unsigned offset,
2211                 unsigned to, get_block_t *get_block, loff_t *bytes)
2212 {
2213         struct address_space *mapping = page->mapping;
2214         struct inode *inode = mapping->host;
2215         struct page *new_page;
2216         pgoff_t pgpos;
2217         long status;
2218         unsigned zerofrom;
2219         unsigned blocksize = 1 << inode->i_blkbits;
2220         void *kaddr;
2221
2222         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2223                 status = -ENOMEM;
2224                 new_page = grab_cache_page(mapping, pgpos);
2225                 if (!new_page)
2226                         goto out;
2227                 /* we might sleep */
2228                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2229                         unlock_page(new_page);
2230                         page_cache_release(new_page);
2231                         continue;
2232                 }
2233                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2234                 if (zerofrom & (blocksize-1)) {
2235                         *bytes |= (blocksize-1);
2236                         (*bytes)++;
2237                 }
2238                 status = __block_prepare_write(inode, new_page, zerofrom,
2239                                                 PAGE_CACHE_SIZE, get_block);
2240                 if (status)
2241                         goto out_unmap;
2242                 kaddr = kmap_atomic(new_page, KM_USER0);
2243                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2244                 flush_dcache_page(new_page);
2245                 kunmap_atomic(kaddr, KM_USER0);
2246                 __block_commit_write(inode, new_page,
2247                                 zerofrom, PAGE_CACHE_SIZE);
2248                 unlock_page(new_page);
2249                 page_cache_release(new_page);
2250         }
2251
2252         if (page->index < pgpos) {
2253                 /* completely inside the area */
2254                 zerofrom = offset;
2255         } else {
2256                 /* page covers the boundary, find the boundary offset */
2257                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2258
2259                 /* if we will expand the thing last block will be filled */
2260                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2261                         *bytes |= (blocksize-1);
2262                         (*bytes)++;
2263                 }
2264
2265                 /* starting below the boundary? Nothing to zero out */
2266                 if (offset <= zerofrom)
2267                         zerofrom = offset;
2268         }
2269         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2270         if (status)
2271                 goto out1;
2272         if (zerofrom < offset) {
2273                 kaddr = kmap_atomic(page, KM_USER0);
2274                 memset(kaddr+zerofrom, 0, offset-zerofrom);
2275                 flush_dcache_page(page);
2276                 kunmap_atomic(kaddr, KM_USER0);
2277                 __block_commit_write(inode, page, zerofrom, offset);
2278         }
2279         return 0;
2280 out1:
2281         ClearPageUptodate(page);
2282         return status;
2283
2284 out_unmap:
2285         ClearPageUptodate(new_page);
2286         unlock_page(new_page);
2287         page_cache_release(new_page);
2288 out:
2289         return status;
2290 }
2291
2292 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2293                         get_block_t *get_block)
2294 {
2295         struct inode *inode = page->mapping->host;
2296         int err = __block_prepare_write(inode, page, from, to, get_block);
2297         if (err)
2298                 ClearPageUptodate(page);
2299         return err;
2300 }
2301
2302 int block_commit_write(struct page *page, unsigned from, unsigned to)
2303 {
2304         struct inode *inode = page->mapping->host;
2305         __block_commit_write(inode,page,from,to);
2306         return 0;
2307 }
2308
2309 int generic_commit_write(struct file *file, struct page *page,
2310                 unsigned from, unsigned to)
2311 {
2312         struct inode *inode = page->mapping->host;
2313         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2314         __block_commit_write(inode,page,from,to);
2315         /*
2316          * No need to use i_size_read() here, the i_size
2317          * cannot change under us because we hold i_sem.
2318          */
2319         if (pos > inode->i_size) {
2320                 i_size_write(inode, pos);
2321                 mark_inode_dirty(inode);
2322         }
2323         return 0;
2324 }
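
/*
 * Example (editor's sketch): how a filesystem typically wires the helpers
 * above into its address_space_operations.  "examplefs" and its
 * examplefs_get_block() (a toy 1:1 mapping) are made up for illustration;
 * a real filesystem supplies its own block-mapping logic.
 */
static int examplefs_get_block(struct inode *inode, sector_t block,
				struct buffer_head *bh, int create)
{
	map_bh(bh, inode->i_sb, block);		/* toy: logical == physical */
	return 0;
}

static int examplefs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, examplefs_get_block);
}

static int examplefs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, examplefs_get_block, wbc);
}

static int examplefs_prepare_write(struct file *file, struct page *page,
				unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, examplefs_get_block);
}

static struct address_space_operations examplefs_aops = {
	.readpage	= examplefs_readpage,
	.writepage	= examplefs_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= examplefs_prepare_write,
	.commit_write	= generic_commit_write,
};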
2325
2326
2327 /*
2328  * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
2329  * immediately, while under the page lock.  So it needs a special end_io
2330  * handler which does not touch the bh after unlocking it.
2331  *
2332  * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
2333  * a race there is benign: unlock_buffer() only uses the bh's address for
2334  * hashing after unlocking the buffer, so it doesn't actually touch the bh
2335  * itself.
2336  */
2337 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2338 {
2339         if (uptodate) {
2340                 set_buffer_uptodate(bh);
2341         } else {
2342                 /* This happens due to failed READA attempts. */
2343                 clear_buffer_uptodate(bh);
2344         }
2345         unlock_buffer(bh);
2346 }
2347
2348 /*
2349  * On entry, the page is fully not uptodate.
2350  * On exit the page is fully uptodate in the areas outside (from,to)
2351  */
2352 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2353                         get_block_t *get_block)
2354 {
2355         struct inode *inode = page->mapping->host;
2356         const unsigned blkbits = inode->i_blkbits;
2357         const unsigned blocksize = 1 << blkbits;
2358         struct buffer_head map_bh;
2359         struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2360         unsigned block_in_page;
2361         unsigned block_start;
2362         sector_t block_in_file;
2363         char *kaddr;
2364         int nr_reads = 0;
2365         int i;
2366         int ret = 0;
2367         int is_mapped_to_disk = 1;
2368         int dirtied_it = 0;
2369
2370         if (PageMappedToDisk(page))
2371                 return 0;
2372
2373         block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2374         map_bh.b_page = page;
2375
2376         /*
2377          * We loop across all blocks in the page, whether or not they are
2378          * part of the affected region.  This is so we can discover if the
2379          * page is fully mapped-to-disk.
2380          */
2381         for (block_start = 0, block_in_page = 0;
2382                   block_start < PAGE_CACHE_SIZE;
2383                   block_in_page++, block_start += blocksize) {
2384                 unsigned block_end = block_start + blocksize;
2385                 int create;
2386
2387                 map_bh.b_state = 0;
2388                 create = 1;
2389                 if (block_start >= to)
2390                         create = 0;
2391                 ret = get_block(inode, block_in_file + block_in_page,
2392                                         &map_bh, create);
2393                 if (ret)
2394                         goto failed;
2395                 if (!buffer_mapped(&map_bh))
2396                         is_mapped_to_disk = 0;
2397                 if (buffer_new(&map_bh))
2398                         unmap_underlying_metadata(map_bh.b_bdev,
2399                                                         map_bh.b_blocknr);
2400                 if (PageUptodate(page))
2401                         continue;
2402                 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2403                         kaddr = kmap_atomic(page, KM_USER0);
2404                         if (block_start < from) {
2405                                 memset(kaddr+block_start, 0, from-block_start);
2406                                 dirtied_it = 1;
2407                         }
2408                         if (block_end > to) {
2409                                 memset(kaddr + to, 0, block_end - to);
2410                                 dirtied_it = 1;
2411                         }
2412                         flush_dcache_page(page);
2413                         kunmap_atomic(kaddr, KM_USER0);
2414                         continue;
2415                 }
2416                 if (buffer_uptodate(&map_bh))
2417                         continue;       /* reiserfs does this */
2418                 if (block_start < from || block_end > to) {
2419                         struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2420
2421                         if (!bh) {
2422                                 ret = -ENOMEM;
2423                                 goto failed;
2424                         }
2425                         bh->b_state = map_bh.b_state;
2426                         atomic_set(&bh->b_count, 0);
2427                         bh->b_this_page = NULL;
2428                         bh->b_page = page;
2429                         bh->b_blocknr = map_bh.b_blocknr;
2430                         bh->b_size = blocksize;
2431                         bh->b_data = (char *)(long)block_start;
2432                         bh->b_bdev = map_bh.b_bdev;
2433                         bh->b_private = NULL;
2434                         read_bh[nr_reads++] = bh;
2435                 }
2436         }
2437
2438         if (nr_reads) {
2439                 struct buffer_head *bh;
2440
2441                 /*
2442                  * The page is locked, so these buffers are protected from
2443                  * any VM or truncate activity.  Hence we don't need to care
2444                  * for the buffer_head refcounts.
2445                  */
2446                 for (i = 0; i < nr_reads; i++) {
2447                         bh = read_bh[i];
2448                         lock_buffer(bh);
2449                         bh->b_end_io = end_buffer_read_nobh;
2450                         submit_bh(READ, bh);
2451                 }
2452                 for (i = 0; i < nr_reads; i++) {
2453                         bh = read_bh[i];
2454                         wait_on_buffer(bh);
2455                         if (!buffer_uptodate(bh))
2456                                 ret = -EIO;
2457                         free_buffer_head(bh);
2458                         read_bh[i] = NULL;
2459                 }
2460                 if (ret)
2461                         goto failed;
2462         }
2463
2464         if (is_mapped_to_disk)
2465                 SetPageMappedToDisk(page);
2466         SetPageUptodate(page);
2467
2468         /*
2469          * Setting the page dirty here isn't necessary for the prepare_write
2470          * function - commit_write will do that.  But if/when this function is
2471          * used within the pagefault handler to ensure that all mmapped pages
2472          * have backing space in the filesystem, we will need to dirty the page
2473          * if its contents were altered.
2474          */
2475         if (dirtied_it)
2476                 set_page_dirty(page);
2477
2478         return 0;
2479
2480 failed:
2481         for (i = 0; i < nr_reads; i++) {
2482                 if (read_bh[i])
2483                         free_buffer_head(read_bh[i]);
2484         }
2485
2486         /*
2487          * Error recovery is pretty slack.  Clear the page and mark it dirty
2488          * so we'll later zero out any blocks which _were_ allocated.
2489          */
2490         kaddr = kmap_atomic(page, KM_USER0);
2491         memset(kaddr, 0, PAGE_CACHE_SIZE);
2492         kunmap_atomic(kaddr, KM_USER0);
2493         SetPageUptodate(page);
2494         set_page_dirty(page);
2495         return ret;
2496 }
2497 EXPORT_SYMBOL(nobh_prepare_write);
2498
2499 int nobh_commit_write(struct file *file, struct page *page,
2500                 unsigned from, unsigned to)
2501 {
2502         struct inode *inode = page->mapping->host;
2503         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2504
2505         set_page_dirty(page);
2506         if (pos > inode->i_size) {
2507                 i_size_write(inode, pos);
2508                 mark_inode_dirty(inode);
2509         }
2510         return 0;
2511 }
2512 EXPORT_SYMBOL(nobh_commit_write);
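
/*
 * Example (editor's sketch): a filesystem opting into the buffer_head-less
 * write path substitutes the nobh helpers in its aops.  examplefs_get_block
 * is the made-up mapping from the sketch after generic_commit_write().
 */
static int examplefs_nobh_prepare_write(struct file *file, struct page *page,
				unsigned from, unsigned to)
{
	return nobh_prepare_write(page, from, to, examplefs_get_block);
}
/*
 * In the aops:	.prepare_write	= examplefs_nobh_prepare_write,
 *		.commit_write	= nobh_commit_write,
 */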
2513
2514 /*
2515  * This function assumes that ->prepare_write() uses nobh_prepare_write().
2516  */
2517 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2518 {
2519         struct inode *inode = mapping->host;
2520         unsigned blocksize = 1 << inode->i_blkbits;
2521         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2522         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2523         unsigned to;
2524         struct page *page;
2525         struct address_space_operations *a_ops = mapping->a_ops;
2526         char *kaddr;
2527         int ret = 0;
2528
2529         if ((offset & (blocksize - 1)) == 0)
2530                 goto out;
2531
2532         ret = -ENOMEM;
2533         page = grab_cache_page(mapping, index);
2534         if (!page)
2535                 goto out;
2536
2537         to = (offset + blocksize) & ~(blocksize - 1);
2538         ret = a_ops->prepare_write(NULL, page, offset, to);
2539         if (ret == 0) {
2540                 kaddr = kmap_atomic(page, KM_USER0);
2541                 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2542                 flush_dcache_page(page);
2543                 kunmap_atomic(kaddr, KM_USER0);
2544                 set_page_dirty(page);
2545         }
2546         unlock_page(page);
2547         page_cache_release(page);
2548 out:
2549         return ret;
2550 }
2551 EXPORT_SYMBOL(nobh_truncate_page);
2552
2553 int block_truncate_page(struct address_space *mapping,
2554                         loff_t from, get_block_t *get_block)
2555 {
2556         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2557         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2558         unsigned blocksize;
2559         pgoff_t iblock;
2560         unsigned length, pos;
2561         struct inode *inode = mapping->host;
2562         struct page *page;
2563         struct buffer_head *bh;
2564         void *kaddr;
2565         int err;
2566
2567         blocksize = 1 << inode->i_blkbits;
2568         length = offset & (blocksize - 1);
2569
2570         /* Block boundary? Nothing to do */
2571         if (!length)
2572                 return 0;
2573
2574         length = blocksize - length;
2575         iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2576         
2577         page = grab_cache_page(mapping, index);
2578         err = -ENOMEM;
2579         if (!page)
2580                 goto out;
2581
2582         if (!page_has_buffers(page))
2583                 create_empty_buffers(page, blocksize, 0);
2584
2585         /* Find the buffer that contains "offset" */
2586         bh = page_buffers(page);
2587         pos = blocksize;
2588         while (offset >= pos) {
2589                 bh = bh->b_this_page;
2590                 iblock++;
2591                 pos += blocksize;
2592         }
2593
2594         err = 0;
2595         if (!buffer_mapped(bh)) {
2596                 err = get_block(inode, iblock, bh, 0);
2597                 if (err)
2598                         goto unlock;
2599                 /* unmapped? It's a hole - nothing to do */
2600                 if (!buffer_mapped(bh))
2601                         goto unlock;
2602         }
2603
2604         /* Ok, it's mapped. Make sure it's up-to-date */
2605         if (PageUptodate(page))
2606                 set_buffer_uptodate(bh);
2607
2608         if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2609                 err = -EIO;
2610                 ll_rw_block(READ, 1, &bh);
2611                 wait_on_buffer(bh);
2612                 /* Uhhuh. Read error. Complain and punt. */
2613                 if (!buffer_uptodate(bh))
2614                         goto unlock;
2615         }
2616
2617         kaddr = kmap_atomic(page, KM_USER0);
2618         memset(kaddr + offset, 0, length);
2619         flush_dcache_page(page);
2620         kunmap_atomic(kaddr, KM_USER0);
2621
2622         mark_buffer_dirty(bh);
2623         err = 0;
2624
2625 unlock:
2626         unlock_page(page);
2627         page_cache_release(page);
2628 out:
2629         return err;
2630 }
2631
2632 /*
2633  * The generic ->writepage function for buffer-backed address_spaces
2634  */
2635 int block_write_full_page(struct page *page, get_block_t *get_block,
2636                         struct writeback_control *wbc)
2637 {
2638         struct inode * const inode = page->mapping->host;
2639         loff_t i_size = i_size_read(inode);
2640         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2641         unsigned offset;
2642         void *kaddr;
2643
2644         /* Is the page fully inside i_size? */
2645         if (page->index < end_index)
2646                 return __block_write_full_page(inode, page, get_block, wbc);
2647
2648         /* Is the page fully outside i_size? (truncate in progress) */
2649         offset = i_size & (PAGE_CACHE_SIZE-1);
2650         if (page->index >= end_index+1 || !offset) {
2651                 /*
2652                  * The page may have dirty, unmapped buffers.  For example,
2653                  * they may have been added in ext3_writepage().  Make them
2654                  * freeable here, so the page does not leak.
2655                  */
2656                 block_invalidatepage(page, 0);
2657                 unlock_page(page);
2658                 return 0; /* don't care */
2659         }
2660
2661         /*
2662          * The page straddles i_size.  It must be zeroed out on each and every
2663          * writepage invocation because it may be mmapped.  "A file is mapped
2664          * in multiples of the page size.  For a file that is not a multiple of
2665          * the  page size, the remaining memory is zeroed when mapped, and
2666          * writes to that region are not written out to the file."
2667          */
2668         kaddr = kmap_atomic(page, KM_USER0);
2669         memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2670         flush_dcache_page(page);
2671         kunmap_atomic(kaddr, KM_USER0);
2672         return __block_write_full_page(inode, page, get_block, wbc);
2673 }
2674
2675 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2676                             get_block_t *get_block)
2677 {
2678         struct buffer_head tmp;
2679         struct inode *inode = mapping->host;
2680         tmp.b_state = 0;
2681         tmp.b_blocknr = 0;
2682         get_block(inode, block, &tmp, 0);
2683         return tmp.b_blocknr;
2684 }
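
/*
 * Example (editor's sketch): ->bmap is normally a one-line wrapper that
 * hands the filesystem's get_block to generic_block_bmap().
 * examplefs_get_block is the made-up mapping used in the earlier sketches.
 */
static sector_t examplefs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, examplefs_get_block);
}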
2685
2686 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2687 {
2688         struct buffer_head *bh = bio->bi_private;
2689
2690         if (bio->bi_size)
2691                 return 1;
2692
2693         bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2694         bio_put(bio);
2695         return 0;
2696 }
2697
2698 void submit_bh(int rw, struct buffer_head * bh)
2699 {
2700         struct bio *bio;
2701
2702         BUG_ON(!buffer_locked(bh));
2703         BUG_ON(!buffer_mapped(bh));
2704         BUG_ON(!bh->b_end_io);
2705
2706         /* Only clear out a write error when rewriting */
2707         if (test_set_buffer_req(bh) && rw == WRITE)
2708                 clear_buffer_write_io_error(bh);
2709
2710         /*
2711          * from here on down, it's all bio -- do the initial mapping,
2712          * submit_bio -> generic_make_request may further map this bio around
2713          */
2714         bio = bio_alloc(GFP_NOIO, 1);
2715
2716         bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2717         bio->bi_bdev = bh->b_bdev;
2718         bio->bi_io_vec[0].bv_page = bh->b_page;
2719         bio->bi_io_vec[0].bv_len = bh->b_size;
2720         bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2721
2722         bio->bi_vcnt = 1;
2723         bio->bi_idx = 0;
2724         bio->bi_size = bh->b_size;
2725
2726         bio->bi_end_io = end_bio_bh_io_sync;
2727         bio->bi_private = bh;
2728
2729         submit_bio(rw, bio);
2730 }
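
/*
 * Example (editor's sketch): a hand-rolled synchronous read of a single
 * mapped buffer with submit_bh().  This mirrors what ll_rw_block() below
 * does for reads; end_buffer_read_sync() unlocks the buffer and drops the
 * reference taken here.
 */
static int example_read_bh(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {		/* raced with another reader */
		unlock_buffer(bh);
		return 0;
	}
	get_bh(bh);				/* reference for the I/O */
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}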
2731
2732 /**
2733  * ll_rw_block: low-level access to block devices (DEPRECATED)
2734  * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2735  * @nr: number of &struct buffer_heads in the array
2736  * @bhs: array of pointers to &struct buffer_head
2737  *
2738  * ll_rw_block() takes an array of pointers to &struct buffer_heads,
2739  * and requests an I/O operation on them, either a %READ or a %WRITE.
2740  * The third %READA option is described in the documentation for
2741  * generic_make_request() which ll_rw_block() calls.
2742  *
2743  * This function drops any buffer that it cannot get a lock on (with the
2744  * BH_Lock state bit), any buffer that appears to be clean when doing a
2745  * write request, and any buffer that appears to be up-to-date when doing
2746  * a read request.  Further it marks as clean buffers that are processed for
2747  * writing (the buffer cache won't assume that they are actually clean until
2748  * the buffer gets unlocked).
2749  *
2750  * ll_rw_block sets b_end_io to a simple completion handler that marks
2751  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2752  * any waiters. 
2753  *
2754  * All of the buffers must be for the same device, and must also be a
2755  * multiple of the current approved size for the device.
2756  */
2757 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2758 {
2759         int i;
2760
2761         for (i = 0; i < nr; i++) {
2762                 struct buffer_head *bh = bhs[i];
2763
2764                 if (test_set_buffer_locked(bh))
2765                         continue;
2766
2767                 get_bh(bh);
2768                 if (rw == WRITE) {
2769                         bh->b_end_io = end_buffer_write_sync;
2770                         if (test_clear_buffer_dirty(bh)) {
2771                                 submit_bh(WRITE, bh);
2772                                 continue;
2773                         }
2774                 } else {
2775                         bh->b_end_io = end_buffer_read_sync;
2776                         if (!buffer_uptodate(bh)) {
2777                                 submit_bh(rw, bh);
2778                                 continue;
2779                         }
2780                 }
2781                 unlock_buffer(bh);
2782                 put_bh(bh);
2783         }
2784 }
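
/*
 * Example (editor's sketch): the classic ll_rw_block() read pattern -
 * kick off I/O for a batch of buffers, then wait for each one and check
 * that it came back uptodate.
 */
static int example_read_many(struct buffer_head *bhs[], int nr)
{
	int i, err = 0;

	ll_rw_block(READ, nr, bhs);	/* already-uptodate buffers are skipped */
	for (i = 0; i < nr; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			err = -EIO;
	}
	return err;
}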
2785
2786 /*
2787  * For a data-integrity writeout, we need to wait upon any in-progress I/O
2788  * and then start new I/O and then wait upon it.
2789  */
2790 void sync_dirty_buffer(struct buffer_head *bh)
2791 {
2792         WARN_ON(atomic_read(&bh->b_count) < 1);
2793         lock_buffer(bh);
2794         if (test_clear_buffer_dirty(bh)) {
2795                 get_bh(bh);
2796                 bh->b_end_io = end_buffer_write_sync;
2797                 submit_bh(WRITE, bh);
2798                 wait_on_buffer(bh);
2799         } else {
2800                 unlock_buffer(bh);
2801         }
2802 }
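
/*
 * Example (editor's sketch): the usual data-integrity sequence for one
 * buffer - dirty it, push it out with sync_dirty_buffer(), then check for
 * a write error.  Checking buffer_uptodate() afterwards is one common way
 * to detect a failed write.
 */
static int example_sync_block(struct buffer_head *bh)
{
	mark_buffer_dirty(bh);
	sync_dirty_buffer(bh);		/* submits and waits */
	return buffer_uptodate(bh) ? 0 : -EIO;
}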
2803
2804 /*
2805  * try_to_free_buffers() checks if all the buffers on this particular page
2806  * are unused, and releases them if so.
2807  *
2808  * Exclusion against try_to_free_buffers may be obtained by either
2809  * locking the page or by holding its mapping's private_lock.
2810  *
2811  * If the page is dirty but all the buffers are clean then we need to
2812  * be sure to mark the page clean as well.  This is because the page
2813  * may be against a block device, and a later reattachment of buffers
2814  * to a dirty page will set *all* buffers dirty.  Which would corrupt
2815  * filesystem data on the same device.
2816  *
2817  * The same applies to regular filesystem pages: if all the buffers are
2818  * clean then we set the page clean and proceed.  To do that, we require
2819  * total exclusion from __set_page_dirty_buffers().  That is obtained with
2820  * private_lock.
2821  *
2822  * try_to_free_buffers() is non-blocking.
2823  */
2824 static inline int buffer_busy(struct buffer_head *bh)
2825 {
2826         return atomic_read(&bh->b_count) |
2827                 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2828 }
2829
2830 static int
2831 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2832 {
2833         struct buffer_head *head = page_buffers(page);
2834         struct buffer_head *bh;
2835         int was_uptodate = 1;
2836
2837         bh = head;
2838         do {
2839                 if (buffer_write_io_error(bh))
2840                         set_bit(AS_EIO, &page->mapping->flags);
2841                 if (buffer_busy(bh))
2842                         goto failed;
2843                 if (!buffer_uptodate(bh) && !buffer_req(bh))
2844                         was_uptodate = 0;
2845                 bh = bh->b_this_page;
2846         } while (bh != head);
2847
2848         do {
2849                 struct buffer_head *next = bh->b_this_page;
2850
2851                 if (!list_empty(&bh->b_assoc_buffers))
2852                         __remove_assoc_queue(bh);
2853                 bh = next;
2854         } while (bh != head);
2855         *buffers_to_free = head;
2856         __clear_page_buffers(page);
2857         return 1;
2858 failed:
2859         return 0;
2860 }
2861
2862 int try_to_free_buffers(struct page *page)
2863 {
2864         struct address_space * const mapping = page->mapping;
2865         struct buffer_head *buffers_to_free = NULL;
2866         int ret = 0;
2867
2868         BUG_ON(!PageLocked(page));
2869         if (PageWriteback(page))
2870                 return 0;
2871
2872         if (mapping == NULL) {          /* can this still happen? */
2873                 ret = drop_buffers(page, &buffers_to_free);
2874                 goto out;
2875         }
2876
2877         spin_lock(&mapping->private_lock);
2878         ret = drop_buffers(page, &buffers_to_free);
2879         if (ret) {
2880                 /*
2881                  * If the filesystem writes its buffers by hand (eg ext3)
2882                  * then we can have clean buffers against a dirty page.  We
2883                  * clean the page here; otherwise later reattachment of buffers
2884                  * could encounter a non-uptodate page, which is unresolvable.
2885                  * This only applies in the rare case where try_to_free_buffers
2886                  * succeeds but the page is not freed.
2887                  */
2888                 clear_page_dirty(page);
2889         }
2890         spin_unlock(&mapping->private_lock);
2891 out:
2892         if (buffers_to_free) {
2893                 struct buffer_head *bh = buffers_to_free;
2894
2895                 do {
2896                         struct buffer_head *next = bh->b_this_page;
2897                         free_buffer_head(bh);
2898                         bh = next;
2899                 } while (bh != buffers_to_free);
2900         }
2901         return ret;
2902 }
2903 EXPORT_SYMBOL(try_to_free_buffers);
2904
2905 int block_sync_page(struct page *page)
2906 {
2907         struct address_space *mapping;
2908         smp_mb();
2909         mapping = page->mapping;
2910         blk_run_address_space(mapping);
2911         return 0;
2912 }
2913
2914 /*
2915  * There are no bdflush tunables left.  But distributions are
2916  * still running obsolete flush daemons, so we terminate them here.
2917  *
2918  * Use of bdflush() is deprecated and will be removed in a future kernel.
2919  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
2920  */
2921 asmlinkage long sys_bdflush(int func, long data)
2922 {
2923         static int msg_count;
2924
2925         if (!capable(CAP_SYS_ADMIN))
2926                 return -EPERM;
2927
2928         if (msg_count < 5) {
2929                 msg_count++;
2930                 printk(KERN_INFO
2931                         "warning: process `%s' used the obsolete bdflush"
2932                         " system call\n", current->comm);
2933                 printk(KERN_INFO "Fix your initscripts?\n");
2934         }
2935
2936         if (func == 1)
2937                 do_exit(0);
2938         return 0;
2939 }
2940
2941 /*
2942  * Buffer-head allocation
2943  */
2944 static kmem_cache_t *bh_cachep;
2945
2946 /*
2947  * Once the number of bh's in the machine exceeds this level, we start
2948  * stripping them in writeback.
2949  */
2950 static int max_buffer_heads;
2951
2952 int buffer_heads_over_limit;
2953
2954 struct bh_accounting {
2955         int nr;                 /* Number of live bh's */
2956         int ratelimit;          /* Limit cacheline bouncing */
2957 };
2958
2959 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2960
2961 static void recalc_bh_state(void)
2962 {
2963         int i;
2964         int tot = 0;
2965
2966         if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
2967                 return;
2968         __get_cpu_var(bh_accounting).ratelimit = 0;
2969         for_each_cpu(i)
2970                 tot += per_cpu(bh_accounting, i).nr;
2971         buffer_heads_over_limit = (tot > max_buffer_heads);
2972 }
2973         
2974 struct buffer_head *alloc_buffer_head(int gfp_flags)
2975 {
2976         struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
2977         if (ret) {
2978                 preempt_disable();
2979                 __get_cpu_var(bh_accounting).nr++;
2980                 recalc_bh_state();
2981                 preempt_enable();
2982         }
2983         return ret;
2984 }
2985 EXPORT_SYMBOL(alloc_buffer_head);
2986
2987 void free_buffer_head(struct buffer_head *bh)
2988 {
2989         BUG_ON(!list_empty(&bh->b_assoc_buffers));
2990         kmem_cache_free(bh_cachep, bh);
2991         preempt_disable();
2992         __get_cpu_var(bh_accounting).nr--;
2993         recalc_bh_state();
2994         preempt_enable();
2995 }
2996 EXPORT_SYMBOL(free_buffer_head);
2997
2998 static void
2999 init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
3000 {
3001         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
3002                             SLAB_CTOR_CONSTRUCTOR) {
3003                 struct buffer_head * bh = (struct buffer_head *)data;
3004
3005                 memset(bh, 0, sizeof(*bh));
3006                 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3007         }
3008 }
3009
3010 #ifdef CONFIG_HOTPLUG_CPU
3011 static void buffer_exit_cpu(int cpu)
3012 {
3013         int i;
3014         struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3015
3016         for (i = 0; i < BH_LRU_SIZE; i++) {
3017                 brelse(b->bhs[i]);
3018                 b->bhs[i] = NULL;
3019         }
3020 }
3021
3022 static int buffer_cpu_notify(struct notifier_block *self,
3023                               unsigned long action, void *hcpu)
3024 {
3025         if (action == CPU_DEAD)
3026                 buffer_exit_cpu((unsigned long)hcpu);
3027         return NOTIFY_OK;
3028 }
3029 #endif /* CONFIG_HOTPLUG_CPU */
3030
3031 void __init buffer_init(void)
3032 {
3033         int i;
3034         int nrpages;
3035
3036         bh_cachep = kmem_cache_create("buffer_head",
3037                         sizeof(struct buffer_head), 0,
3038                         0, init_buffer_head, NULL);
3039         for (i = 0; i < ARRAY_SIZE(bh_wait_queue_heads); i++)
3040                 init_waitqueue_head(&bh_wait_queue_heads[i].wqh);
3041
3042         /*
3043          * Limit the bh occupancy to 10% of ZONE_NORMAL
3044          */
3045         nrpages = (nr_free_buffer_pages() * 10) / 100;
3046         max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3047         hotcpu_notifier(buffer_cpu_notify, 0);
3048 }
3049
3050 EXPORT_SYMBOL(__bforget);
3051 EXPORT_SYMBOL(__brelse);
3052 EXPORT_SYMBOL(__wait_on_buffer);
3053 EXPORT_SYMBOL(block_commit_write);
3054 EXPORT_SYMBOL(block_prepare_write);
3055 EXPORT_SYMBOL(block_read_full_page);
3056 EXPORT_SYMBOL(block_sync_page);
3057 EXPORT_SYMBOL(block_truncate_page);
3058 EXPORT_SYMBOL(block_write_full_page);
3059 EXPORT_SYMBOL(buffer_insert_list);
3060 EXPORT_SYMBOL(cont_prepare_write);
3061 EXPORT_SYMBOL(end_buffer_async_write);
3062 EXPORT_SYMBOL(end_buffer_read_sync);
3063 EXPORT_SYMBOL(end_buffer_write_sync);
3064 EXPORT_SYMBOL(file_fsync);
3065 EXPORT_SYMBOL(fsync_bdev);
3066 EXPORT_SYMBOL(fsync_buffers_list);
3067 EXPORT_SYMBOL(generic_block_bmap);
3068 EXPORT_SYMBOL(generic_commit_write);
3069 EXPORT_SYMBOL(generic_cont_expand);
3070 EXPORT_SYMBOL(init_buffer);
3071 EXPORT_SYMBOL(invalidate_bdev);
3072 EXPORT_SYMBOL(ll_rw_block);
3073 EXPORT_SYMBOL(mark_buffer_dirty);
3074 EXPORT_SYMBOL(submit_bh);
3075 EXPORT_SYMBOL(sync_dirty_buffer);
3076 EXPORT_SYMBOL(unlock_buffer);