1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5  */
6
7 /*
8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9  *
10  * Removed a lot of unnecessary code and simplified things now that
11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12  *
13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15  *
16  * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17  *
18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19  */
20
21 #include <linux/config.h>
22 #include <linux/kernel.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/smp_lock.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/bio.h>
38 #include <linux/notifier.h>
39 #include <linux/cpu.h>
40 #include <asm/bitops.h>
41
42 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
43 static void invalidate_bh_lrus(void);
44
45 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
46
47 struct bh_wait_queue {
48         struct buffer_head *bh;
49         wait_queue_t wait;
50 };
51
52 #define __DEFINE_BH_WAIT(name, b, f)                                    \
53         struct bh_wait_queue name = {                                   \
54                 .bh     = b,                                            \
55                 .wait   = {                                             \
56                                 .task   = current,                      \
57                                 .flags  = f,                            \
58                                 .func   = bh_wake_function,             \
59                                 .task_list =                            \
60                                         LIST_HEAD_INIT(name.wait.task_list),\
61                         },                                              \
62         }
63 #define DEFINE_BH_WAIT(name, bh)        __DEFINE_BH_WAIT(name, bh, 0)
64 #define DEFINE_BH_WAIT_EXCLUSIVE(name, bh) \
65                 __DEFINE_BH_WAIT(name, bh, WQ_FLAG_EXCLUSIVE)
66
67 /*
68  * Hashed waitqueue_head's for wait_on_buffer()
69  */
70 #define BH_WAIT_TABLE_ORDER     7
71 static struct bh_wait_queue_head {
72         wait_queue_head_t wqh;
73 } ____cacheline_aligned_in_smp bh_wait_queue_heads[1<<BH_WAIT_TABLE_ORDER];
74
75 inline void
76 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
77 {
78         bh->b_end_io = handler;
79         bh->b_private = private;
80 }
81
82 /*
83  * Return the address of the waitqueue_head to be used for this
84  * buffer_head
85  */
86 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh)
87 {
88         return &bh_wait_queue_heads[hash_ptr(bh, BH_WAIT_TABLE_ORDER)].wqh;
89 }
90 EXPORT_SYMBOL(bh_waitq_head);
91
92 void wake_up_buffer(struct buffer_head *bh)
93 {
94         wait_queue_head_t *wq = bh_waitq_head(bh);
95
96         smp_mb();
97         if (waitqueue_active(wq))
98                 __wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 1, bh);
99 }
100 EXPORT_SYMBOL(wake_up_buffer);
101
102 static int bh_wake_function(wait_queue_t *wait, unsigned mode,
103                                 int sync, void *key)
104 {
105         struct buffer_head *bh = key;
106         struct bh_wait_queue *wq;
107
108         wq = container_of(wait, struct bh_wait_queue, wait);
109         if (wq->bh != bh || buffer_locked(bh))
110                 return 0;
111         else
112                 return autoremove_wake_function(wait, mode, sync, key);
113 }
114
115 static void sync_buffer(struct buffer_head *bh)
116 {
117         struct block_device *bd;
118
119         smp_mb();
120         bd = bh->b_bdev;
121         if (bd)
122                 blk_run_address_space(bd->bd_inode->i_mapping);
123 }
124
125 void fastcall __lock_buffer(struct buffer_head *bh)
126 {
127         wait_queue_head_t *wqh = bh_waitq_head(bh);
128         DEFINE_BH_WAIT_EXCLUSIVE(wait, bh);
129
130         do {
131                 prepare_to_wait_exclusive(wqh, &wait.wait,
132                                         TASK_UNINTERRUPTIBLE);
133                 if (buffer_locked(bh)) {
134                         sync_buffer(bh);
135                         io_schedule();
136                 }
137         } while (test_set_buffer_locked(bh));
138         finish_wait(wqh, &wait.wait);
139 }
140 EXPORT_SYMBOL(__lock_buffer);
141
142 void fastcall unlock_buffer(struct buffer_head *bh)
143 {
144         clear_buffer_locked(bh);
145         smp_mb__after_clear_bit();
146         wake_up_buffer(bh);
147 }
148
149 /*
150  * Block until a buffer comes unlocked.  This doesn't stop it
151  * from becoming locked again - you have to lock it yourself
152  * if you want to preserve its state.
153  */
154 void __wait_on_buffer(struct buffer_head * bh)
155 {
156         wait_queue_head_t *wqh = bh_waitq_head(bh);
157         DEFINE_BH_WAIT(wait, bh);
158
159         do {
160                 prepare_to_wait(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
161                 if (buffer_locked(bh)) {
162                         sync_buffer(bh);
163                         io_schedule();
164                 }
165         } while (buffer_locked(bh));
166         finish_wait(wqh, &wait.wait);
167 }
168
169 static void
170 __set_page_buffers(struct page *page, struct buffer_head *head)
171 {
172         page_cache_get(page);
173         SetPagePrivate(page);
174         page->private = (unsigned long)head;
175 }
176
177 static void
178 __clear_page_buffers(struct page *page)
179 {
180         ClearPagePrivate(page);
181         page->private = 0;
182         page_cache_release(page);
183 }
184
185 static void buffer_io_error(struct buffer_head *bh)
186 {
187         char b[BDEVNAME_SIZE];
188
189         printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
190                         bdevname(bh->b_bdev, b),
191                         (unsigned long long)bh->b_blocknr);
192 }
193
194 /*
195  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
196  * unlock the buffer. This is what ll_rw_block uses too.
197  */
198 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
199 {
200         if (uptodate) {
201                 set_buffer_uptodate(bh);
202         } else {
203                 /* This happens due to failed READA attempts. */
204                 clear_buffer_uptodate(bh);
205         }
206         unlock_buffer(bh);
207         put_bh(bh);
208 }
209
210 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
211 {
212         char b[BDEVNAME_SIZE];
213
214         if (uptodate) {
215                 set_buffer_uptodate(bh);
216         } else {
217                 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
218                         buffer_io_error(bh);
219                         printk(KERN_WARNING "lost page write due to "
220                                         "I/O error on %s\n",
221                                        bdevname(bh->b_bdev, b));
222                 }
223                 set_buffer_write_io_error(bh);
224                 clear_buffer_uptodate(bh);
225         }
226         unlock_buffer(bh);
227         put_bh(bh);
228 }
229
230 /*
231  * Write out and wait upon all the dirty data associated with a block
232  * device via its mapping.  Does not take the superblock lock.
233  */
234 int sync_blockdev(struct block_device *bdev)
235 {
236         int ret = 0;
237
238         if (bdev) {
239                 int err;
240
241                 ret = filemap_fdatawrite(bdev->bd_inode->i_mapping);
242                 err = filemap_fdatawait(bdev->bd_inode->i_mapping);
243                 if (!ret)
244                         ret = err;
245         }
246         return ret;
247 }
248 EXPORT_SYMBOL(sync_blockdev);
249
250 /*
251  * Write out and wait upon all dirty data associated with this
252  * superblock.  Filesystem data as well as the underlying block
253  * device.  Takes the superblock lock.
254  */
255 int fsync_super(struct super_block *sb)
256 {
257         sync_inodes_sb(sb, 0);
258         DQUOT_SYNC(sb);
259         lock_super(sb);
260         if (sb->s_dirt && sb->s_op->write_super)
261                 sb->s_op->write_super(sb);
262         unlock_super(sb);
263         if (sb->s_op->sync_fs)
264                 sb->s_op->sync_fs(sb, 1);
265         sync_blockdev(sb->s_bdev);
266         sync_inodes_sb(sb, 1);
267
268         return sync_blockdev(sb->s_bdev);
269 }
270
271 /*
272  * Write out and wait upon all dirty data associated with this
273  * device.   Filesystem data as well as the underlying block
274  * device.  Takes the superblock lock.
275  */
276 int fsync_bdev(struct block_device *bdev)
277 {
278         struct super_block *sb = get_super(bdev);
279         if (sb) {
280                 int res = fsync_super(sb);
281                 drop_super(sb);
282                 return res;
283         }
284         return sync_blockdev(bdev);
285 }
286
287 /**
288  * freeze_bdev  --  lock a filesystem and force it into a consistent state
289  * @bdev:       blockdevice to lock
290  *
291  * This takes the block device bd_mount_sem to make sure no new mounts
292  * happen on bdev until thaw_bdev() is called.
293  * If a superblock is found on this device, we take the s_umount semaphore
294  * on it to make sure nobody unmounts until the snapshot creation is done.
295  */
296 struct super_block *freeze_bdev(struct block_device *bdev)
297 {
298         struct super_block *sb;
299
300         down(&bdev->bd_mount_sem);
301         sb = get_super(bdev);
302         if (sb && !(sb->s_flags & MS_RDONLY)) {
303                 sb->s_frozen = SB_FREEZE_WRITE;
304                 wmb();
305
306                 sync_inodes_sb(sb, 0);
307                 DQUOT_SYNC(sb);
308
309                 lock_super(sb);
310                 if (sb->s_dirt && sb->s_op->write_super)
311                         sb->s_op->write_super(sb);
312                 unlock_super(sb);
313
314                 if (sb->s_op->sync_fs)
315                         sb->s_op->sync_fs(sb, 1);
316
317                 sync_blockdev(sb->s_bdev);
318                 sync_inodes_sb(sb, 1);
319
320                 sb->s_frozen = SB_FREEZE_TRANS;
321                 wmb();
322
323                 sync_blockdev(sb->s_bdev);
324
325                 if (sb->s_op->write_super_lockfs)
326                         sb->s_op->write_super_lockfs(sb);
327         }
328
329         sync_blockdev(bdev);
330         return sb;      /* thaw_bdev releases s->s_umount and bd_mount_sem */
331 }
332 EXPORT_SYMBOL(freeze_bdev);
333
334 /**
335  * thaw_bdev  -- unlock filesystem
336  * @bdev:       blockdevice to unlock
337  * @sb:         associated superblock
338  *
339  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
340  */
341 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
342 {
343         if (sb) {
344                 BUG_ON(sb->s_bdev != bdev);
345
346                 if (sb->s_op->unlockfs)
347                         sb->s_op->unlockfs(sb);
348                 sb->s_frozen = SB_UNFROZEN;
349                 wmb();
350                 wake_up(&sb->s_wait_unfrozen);
351                 drop_super(sb);
352         }
353
354         up(&bdev->bd_mount_sem);
355 }
356 EXPORT_SYMBOL(thaw_bdev);
357
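/*
 * Editor's illustrative sketch (not part of the original file): how a
 * snapshot driver might bracket snapshot creation with freeze_bdev() and
 * thaw_bdev().  The function name is hypothetical; only the two exported
 * helpers above are assumed.
 */
static void example_snapshot_bdev(struct block_device *bdev)
{
        /* blocks new mounts and writes; NULL if no filesystem is mounted */
        struct super_block *sb = freeze_bdev(bdev);

        /* ... device is quiescent here: copy or COW-protect its blocks ... */

        thaw_bdev(bdev, sb);    /* a NULL sb is handled by thaw_bdev() */
}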
358 /*
359  * sync everything.  Start out by waking pdflush, because that writes back
360  * all queues in parallel.
361  */
362 static void do_sync(unsigned long wait)
363 {
364         wakeup_bdflush(0);
365         sync_inodes(0);         /* All mappings, inodes and their blockdevs */
366         DQUOT_SYNC(NULL);
367         sync_supers();          /* Write the superblocks */
368         sync_filesystems(0);    /* Start syncing the filesystems */
369         sync_filesystems(wait); /* Waitingly sync the filesystems */
370         sync_inodes(wait);      /* Mappings, inodes and blockdevs, again. */
371         if (!wait)
372                 printk("Emergency Sync complete\n");
373         if (unlikely(laptop_mode))
374                 laptop_sync_completion();
375 }
376
377 asmlinkage long sys_sync(void)
378 {
379         do_sync(1);
380         return 0;
381 }
382
383 void emergency_sync(void)
384 {
385         pdflush_operation(do_sync, 0);
386 }
387
388 /*
389  * Generic function to fsync a file.
390  *
391  * filp may be NULL if called via the msync of a vma.
392  */
393  
394 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
395 {
396         struct inode * inode = dentry->d_inode;
397         struct super_block * sb;
398         int ret;
399
400         /* sync the inode to buffers */
401         write_inode_now(inode, 0);
402
403         /* sync the superblock to buffers */
404         sb = inode->i_sb;
405         lock_super(sb);
406         if (sb->s_op->write_super)
407                 sb->s_op->write_super(sb);
408         unlock_super(sb);
409
410         /* .. finally sync the buffers to disk */
411         ret = sync_blockdev(sb->s_bdev);
412         return ret;
413 }
414
415 asmlinkage long sys_fsync(unsigned int fd)
416 {
417         struct file * file;
418         struct address_space *mapping;
419         int ret, err;
420
421         ret = -EBADF;
422         file = fget(fd);
423         if (!file)
424                 goto out;
425
426         mapping = file->f_mapping;
427
428         ret = -EINVAL;
429         if (!file->f_op || !file->f_op->fsync) {
430                 /* Why?  We can still call filemap_fdatawrite */
431                 goto out_putf;
432         }
433
434         /* We need to protect against concurrent writers.. */
435         down(&mapping->host->i_sem);
436         current->flags |= PF_SYNCWRITE;
437         ret = filemap_fdatawrite(mapping);
438         err = file->f_op->fsync(file, file->f_dentry, 0);
439         if (!ret)
440                 ret = err;
441         err = filemap_fdatawait(mapping);
442         if (!ret)
443                 ret = err;
444         current->flags &= ~PF_SYNCWRITE;
445         up(&mapping->host->i_sem);
446
447 out_putf:
448         fput(file);
449 out:
450         return ret;
451 }
452
453 asmlinkage long sys_fdatasync(unsigned int fd)
454 {
455         struct file * file;
456         struct address_space *mapping;
457         int ret, err;
458
459         ret = -EBADF;
460         file = fget(fd);
461         if (!file)
462                 goto out;
463
464         ret = -EINVAL;
465         if (!file->f_op || !file->f_op->fsync)
466                 goto out_putf;
467
468         mapping = file->f_mapping;
469
470         down(&mapping->host->i_sem);
471         current->flags |= PF_SYNCWRITE;
472         ret = filemap_fdatawrite(mapping);
473         err = file->f_op->fsync(file, file->f_dentry, 1);
474         if (!ret)
475                 ret = err;
476         err = filemap_fdatawait(mapping);
477         if (!ret)
478                 ret = err;
479         current->flags &= ~PF_SYNCWRITE;
480         up(&mapping->host->i_sem);
481
482 out_putf:
483         fput(file);
484 out:
485         return ret;
486 }
487
488 /*
489  * Various filesystems appear to want __find_get_block to be non-blocking.
490  * But it's the page lock which protects the buffers.  To get around this,
491  * we get exclusion from try_to_free_buffers with the blockdev mapping's
492  * private_lock.
493  *
494  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
495  * may be quite high.  This code could TryLock the page, and if that
496  * succeeds, there is no need to take private_lock. (But if
497  * private_lock is contended then so is mapping->tree_lock).
498  */
499 static struct buffer_head *
500 __find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
501 {
502         struct inode *bd_inode = bdev->bd_inode;
503         struct address_space *bd_mapping = bd_inode->i_mapping;
504         struct buffer_head *ret = NULL;
505         pgoff_t index;
506         struct buffer_head *bh;
507         struct buffer_head *head;
508         struct page *page;
509         int all_mapped = 1;
510
511         index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
512         page = find_get_page(bd_mapping, index);
513         if (!page)
514                 goto out;
515
516         spin_lock(&bd_mapping->private_lock);
517         if (!page_has_buffers(page))
518                 goto out_unlock;
519         head = page_buffers(page);
520         bh = head;
521         do {
522                 if (bh->b_blocknr == block) {
523                         ret = bh;
524                         get_bh(bh);
525                         goto out_unlock;
526                 }
527                 if (!buffer_mapped(bh))
528                         all_mapped = 0;
529                 bh = bh->b_this_page;
530         } while (bh != head);
531
532         /* We might be here because some of the buffers on this page are
533          * not mapped.  This is due to various races between
534          * file I/O on the block device and getblk.  It gets dealt with
535          * elsewhere, so don't buffer_error if we had some unmapped buffers.
536          */
537         if (all_mapped) {
538                 printk("__find_get_block_slow() failed. "
539                         "block=%llu, b_blocknr=%llu\n",
540                         (unsigned long long)block, (unsigned long long)bh->b_blocknr);
541                 printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
542                 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
543         }
544 out_unlock:
545         spin_unlock(&bd_mapping->private_lock);
546         page_cache_release(page);
547 out:
548         return ret;
549 }
550
551 /* If invalidate_buffers() trashes dirty buffers, it means some kind
552    of fs corruption is going on. Trashing dirty data always implies losing
553    information that the user intended to be stored on the physical
554    layer.
555
556    Thus invalidate_buffers in general usage is not allowed to trash
557    dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
558    be preserved.  These buffers are simply skipped.
559
560    We also skip buffers which are still in use.  For example this can
561    happen if a userspace program is reading the block device.
562
563    NOTE: if the user removes a removable-media disk while there is still
564    dirty data that has not been synced to disk (due to a bug in the device
565    driver or to a user error), then by not destroying the dirty buffers we
566    could also corrupt the next medium inserted.  A parameter is therefore
567    needed to handle this case as safely as possible, i.e. without writing
568    the old, now-corrupt disk's data onto the newly inserted one.  For the
569    ramdisk, on the other hand, the natural way to release its memory is
570    to destroy dirty buffers.
571
572    These are two special cases. Normal usage implies that the device driver
573    issues a sync on the device (without waiting for I/O completion) and
574    then an invalidate_buffers call that doesn't trash dirty buffers.
575
576    For handling cache coherency with the blkdev pagecache, the 'update' case
577    has been introduced. It is needed to re-read from disk any pinned
578    buffer. NOTE: re-reading from disk is destructive, so we can do it only
579    when we assume nobody is changing the buffercache under our I/O and when
580    we think the disk contains more recent information than the buffercache.
581    The update == 1 pass marks the buffers we need to update; the update == 2
582    pass does the actual I/O. */
583 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
584 {
585         invalidate_bh_lrus();
586         /*
587          * FIXME: what about destroy_dirty_buffers?
588          * We really want to use invalidate_inode_pages2() for
589          * that, but not until that's cleaned up.
590          */
591         invalidate_inode_pages(bdev->bd_inode->i_mapping);
592 }
593
594 /*
595  * Kick pdflush then try to free up some ZONE_NORMAL memory.
596  */
597 static void free_more_memory(void)
598 {
599         struct zone **zones;
600         pg_data_t *pgdat;
601
602         wakeup_bdflush(1024);
603         yield();
604
605         for_each_pgdat(pgdat) {
606                 zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones;
607                 if (*zones)
608                         try_to_free_pages(zones, GFP_NOFS, 0);
609         }
610 }
611
612 /*
613  * I/O completion handler for block_read_full_page() - pages
614  * which come unlocked at the end of I/O.
615  */
616 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
617 {
618         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
619         unsigned long flags;
620         struct buffer_head *tmp;
621         struct page *page;
622         int page_uptodate = 1;
623
624         BUG_ON(!buffer_async_read(bh));
625
626         page = bh->b_page;
627         if (uptodate) {
628                 set_buffer_uptodate(bh);
629         } else {
630                 clear_buffer_uptodate(bh);
631                 buffer_io_error(bh);
632                 SetPageError(page);
633         }
634
635         /*
636          * Be _very_ careful from here on. Bad things can happen if
637          * two buffer heads end IO at almost the same time and both
638          * decide that the page is now completely done.
639          */
640         spin_lock_irqsave(&page_uptodate_lock, flags);
641         clear_buffer_async_read(bh);
642         unlock_buffer(bh);
643         tmp = bh;
644         do {
645                 if (!buffer_uptodate(tmp))
646                         page_uptodate = 0;
647                 if (buffer_async_read(tmp)) {
648                         BUG_ON(!buffer_locked(tmp));
649                         goto still_busy;
650                 }
651                 tmp = tmp->b_this_page;
652         } while (tmp != bh);
653         spin_unlock_irqrestore(&page_uptodate_lock, flags);
654
655         /*
656          * If none of the buffers had errors and they are all
657          * uptodate then we can set the page uptodate.
658          */
659         if (page_uptodate && !PageError(page))
660                 SetPageUptodate(page);
661         unlock_page(page);
662         return;
663
664 still_busy:
665         spin_unlock_irqrestore(&page_uptodate_lock, flags);
666         return;
667 }
668
669 /*
670  * Completion handler for block_write_full_page() - pages which are unlocked
671  * during I/O, and which have PageWriteback cleared upon I/O completion.
672  */
673 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
674 {
675         char b[BDEVNAME_SIZE];
676         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
677         unsigned long flags;
678         struct buffer_head *tmp;
679         struct page *page;
680
681         BUG_ON(!buffer_async_write(bh));
682
683         page = bh->b_page;
684         if (uptodate) {
685                 set_buffer_uptodate(bh);
686         } else {
687                 if (printk_ratelimit()) {
688                         buffer_io_error(bh);
689                         printk(KERN_WARNING "lost page write due to "
690                                         "I/O error on %s\n",
691                                bdevname(bh->b_bdev, b));
692                 }
693                 set_bit(AS_EIO, &page->mapping->flags);
694                 clear_buffer_uptodate(bh);
695                 SetPageError(page);
696         }
697
698         spin_lock_irqsave(&page_uptodate_lock, flags);
699         clear_buffer_async_write(bh);
700         unlock_buffer(bh);
701         tmp = bh->b_this_page;
702         while (tmp != bh) {
703                 if (buffer_async_write(tmp)) {
704                         BUG_ON(!buffer_locked(tmp));
705                         goto still_busy;
706                 }
707                 tmp = tmp->b_this_page;
708         }
709         spin_unlock_irqrestore(&page_uptodate_lock, flags);
710         end_page_writeback(page);
711         return;
712
713 still_busy:
714         spin_unlock_irqrestore(&page_uptodate_lock, flags);
715         return;
716 }
717
718 /*
719  * If a page's buffers are under async read-in (end_buffer_async_read
720  * completion) then there is a possibility that another thread of
721  * control could lock one of the buffers after it has completed
722  * but while some of the other buffers have not completed.  This
723  * locked buffer would confuse end_buffer_async_read() into not unlocking
724  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
725  * that this buffer is not under async I/O.
726  *
727  * The page comes unlocked when it has no locked buffer_async buffers
728  * left.
729  *
730  * PageLocked prevents anyone from starting new async I/O against any of
731  * the buffers.
732  *
733  * PageWriteback is used to prevent simultaneous writeout of the same
734  * page.
735  *
736  * PageLocked prevents anyone from starting writeback of a page which is
737  * under read I/O (PageWriteback is only ever set against a locked page).
738  */
739 static void mark_buffer_async_read(struct buffer_head *bh)
740 {
741         bh->b_end_io = end_buffer_async_read;
742         set_buffer_async_read(bh);
743 }
744
745 void mark_buffer_async_write(struct buffer_head *bh)
746 {
747         bh->b_end_io = end_buffer_async_write;
748         set_buffer_async_write(bh);
749 }
750 EXPORT_SYMBOL(mark_buffer_async_write);
751
752
753 /*
754  * fs/buffer.c contains helper functions for buffer-backed address space's
755  * fsync functions.  A common requirement for buffer-based filesystems is
756  * that certain data from the backing blockdev needs to be written out for
757  * a successful fsync().  For example, ext2 indirect blocks need to be
758  * written back and waited upon before fsync() returns.
759  *
760  * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
761  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
762  * management of a list of dependent buffers at ->i_mapping->private_list.
763  *
764  * Locking is a little subtle: try_to_free_buffers() will remove buffers
765  * from their controlling inode's queue when they are being freed.  But
766  * try_to_free_buffers() will be operating against the *blockdev* mapping
767  * at the time, not against the S_ISREG file which depends on those buffers.
768  * So the locking for private_list is via the private_lock in the address_space
769  * which backs the buffers.  Which is different from the address_space 
770  * against which the buffers are listed.  So for a particular address_space,
771  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
772  * mapping->private_list will always be protected by the backing blockdev's
773  * ->private_lock.
774  *
775  * Which introduces a requirement: all buffers on an address_space's
776  * ->private_list must be from the same address_space: the blockdev's.
777  *
778  * address_spaces which do not place buffers at ->private_list via these
779  * utility functions are free to use private_lock and private_list for
780  * whatever they want.  The only requirement is that list_empty(private_list)
781  * be true at clear_inode() time.
782  *
783  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
784  * filesystems should do that.  invalidate_inode_buffers() should just go
785  * BUG_ON(!list_empty).
786  *
787  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
788  * take an address_space, not an inode.  And it should be called
789  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
790  * queued up.
791  *
792  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
793  * list if it is already on a list.  Because if the buffer is on a list,
794  * it *must* already be on the right one.  If not, the filesystem is being
795  * silly.  This will save a ton of locking.  But first we have to ensure
796  * that buffers are taken *off* the old inode's list when they are freed
797  * (presumably in truncate).  That requires careful auditing of all
798  * filesystems (do it inside bforget()).  It could also be done by bringing
799  * b_inode back.
800  */
801
802 /*
803  * The buffer's backing address_space's private_lock must be held
804  */
805 static inline void __remove_assoc_queue(struct buffer_head *bh)
806 {
807         list_del_init(&bh->b_assoc_buffers);
808 }
809
810 int inode_has_buffers(struct inode *inode)
811 {
812         return !list_empty(&inode->i_data.private_list);
813 }
814
815 /*
816  * osync is designed to support O_SYNC io.  It waits synchronously for
817  * all already-submitted IO to complete, but does not queue any new
818  * writes to the disk.
819  *
820  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
821  * you dirty the buffers, and then use osync_inode_buffers to wait for
822  * completion.  Any other dirty buffers which are not yet queued for
823  * write will not be flushed to disk by the osync.
824  */
825 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
826 {
827         struct buffer_head *bh;
828         struct list_head *p;
829         int err = 0;
830
831         spin_lock(lock);
832 repeat:
833         list_for_each_prev(p, list) {
834                 bh = BH_ENTRY(p);
835                 if (buffer_locked(bh)) {
836                         get_bh(bh);
837                         spin_unlock(lock);
838                         wait_on_buffer(bh);
839                         if (!buffer_uptodate(bh))
840                                 err = -EIO;
841                         brelse(bh);
842                         spin_lock(lock);
843                         goto repeat;
844                 }
845         }
846         spin_unlock(lock);
847         return err;
848 }
849
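/*
 * Editor's illustrative sketch (not part of the original file): the
 * queue-then-wait pattern described above for O_SYNC-style writes.
 * In-tree users go through fsync_buffers_list()/osync_buffers_list();
 * this open-codes the wait for clarity.  `bhs' and `nr' are
 * caller-supplied; only ll_rw_block() and wait_on_buffer() from
 * <linux/buffer_head.h> are assumed.
 */
static int example_osync_write(struct buffer_head **bhs, int nr)
{
        int i, err = 0;

        ll_rw_block(WRITE, nr, bhs);            /* queue the dirty buffers */
        for (i = 0; i < nr; i++) {
                wait_on_buffer(bhs[i]);         /* wait for the submitted I/O */
                if (!buffer_uptodate(bhs[i]))
                        err = -EIO;
        }
        return err;
}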
850 /**
851  * sync_mapping_buffers - write out and wait upon a mapping's "associated"
852  *                        buffers
853  * @mapping: the mapping which wants its associated buffers written
854  *
855  * Starts I/O against the buffers at mapping->private_list, and waits upon
856  * that I/O.
857  *
858  * Basically, this is a convenience function for fsync().  The blockdev
859  * mapping at mapping->assoc_mapping "owns" the buffers, and @mapping is a
860  * file or directory which needs those buffers to be written for a
861  * successful fsync().
862  */
863 int sync_mapping_buffers(struct address_space *mapping)
864 {
865         struct address_space *buffer_mapping = mapping->assoc_mapping;
866
867         if (buffer_mapping == NULL || list_empty(&mapping->private_list))
868                 return 0;
869
870         return fsync_buffers_list(&buffer_mapping->private_lock,
871                                         &mapping->private_list);
872 }
873 EXPORT_SYMBOL(sync_mapping_buffers);
874
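/*
 * Editor's illustrative sketch (not part of the original file): a minimal
 * filesystem ->fsync() built on sync_mapping_buffers(), in the style of
 * ext2.  The function name is hypothetical.
 */
static int example_fs_fsync(struct file *file, struct dentry *dentry,
                                int datasync)
{
        struct inode *inode = dentry->d_inode;

        /*
         * Writes out and waits on the metadata buffers that
         * mark_buffer_dirty_inode() queued on ->private_list.
         */
        return sync_mapping_buffers(inode->i_mapping);
}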
875 /*
876  * Called when we've recently written block `bblock', and it is known that
877  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
878  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
879  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
880  */
881 void write_boundary_block(struct block_device *bdev,
882                         sector_t bblock, unsigned blocksize)
883 {
884         struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
885         if (bh) {
886                 if (buffer_dirty(bh))
887                         ll_rw_block(WRITE, 1, &bh);
888                 put_bh(bh);
889         }
890 }
891
892 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
893 {
894         struct address_space *mapping = inode->i_mapping;
895         struct address_space *buffer_mapping = bh->b_page->mapping;
896
897         mark_buffer_dirty(bh);
898         if (!mapping->assoc_mapping) {
899                 mapping->assoc_mapping = buffer_mapping;
900         } else {
901                 if (mapping->assoc_mapping != buffer_mapping)
902                         BUG();
903         }
904         if (list_empty(&bh->b_assoc_buffers)) {
905                 spin_lock(&buffer_mapping->private_lock);
906                 list_move_tail(&bh->b_assoc_buffers,
907                                 &mapping->private_list);
908                 spin_unlock(&buffer_mapping->private_lock);
909         }
910 }
911 EXPORT_SYMBOL(mark_buffer_dirty_inode);
912
913 /*
914  * Add a page to the dirty page list.
915  *
916  * It is a sad fact of life that this function is called from several places
917  * deeply under spinlocking.  It may not sleep.
918  *
919  * If the page has buffers, the uptodate buffers are set dirty, to preserve
920  * dirty-state coherency between the page and the buffers.  If the page does
921  * not have buffers then when they are later attached they will all be set
922  * dirty.
923  *
924  * The buffers are dirtied before the page is dirtied.  There's a small race
925  * window in which a writepage caller may see the page cleanness but not the
926  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
927  * before the buffers, a concurrent writepage caller could clear the page dirty
928  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
929  * page on the dirty page list.
930  *
931  * We use private_lock to lock against try_to_free_buffers while using the
932  * page's buffer list.  Also use this to protect against clean buffers being
933  * added to the page after it was set dirty.
934  *
935  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
936  * address_space though.
937  */
938 int __set_page_dirty_buffers(struct page *page)
939 {
940         struct address_space * const mapping = page->mapping;
941
942         spin_lock(&mapping->private_lock);
943         if (page_has_buffers(page)) {
944                 struct buffer_head *head = page_buffers(page);
945                 struct buffer_head *bh = head;
946
947                 do {
948                         set_buffer_dirty(bh);
949                         bh = bh->b_this_page;
950                 } while (bh != head);
951         }
952         spin_unlock(&mapping->private_lock);
953
954         if (!TestSetPageDirty(page)) {
955                 spin_lock_irq(&mapping->tree_lock);
956                 if (page->mapping) {    /* Race with truncate? */
957                         if (!mapping->backing_dev_info->memory_backed)
958                                 inc_page_state(nr_dirty);
959                         radix_tree_tag_set(&mapping->page_tree,
960                                                 page_index(page),
961                                                 PAGECACHE_TAG_DIRTY);
962                 }
963                 spin_unlock_irq(&mapping->tree_lock);
964                 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
965         }
966         
967         return 0;
968 }
969 EXPORT_SYMBOL(__set_page_dirty_buffers);
970
971 /*
972  * Write out and wait upon a list of buffers.
973  *
974  * We have conflicting pressures: we want to make sure that all
975  * initially dirty buffers get waited on, but that any subsequently
976  * dirtied buffers don't.  After all, we don't want fsync to last
977  * forever if somebody is actively writing to the file.
978  *
979  * Do this in two main stages: first we copy dirty buffers to a
980  * temporary inode list, queueing the writes as we go.  Then we clean
981  * up, waiting for those writes to complete.
982  * 
983  * During this second stage, any subsequent updates to the file may end
984  * up refiling the buffer on the original inode's dirty list again, so
985  * there is a chance we will end up with a buffer queued for write but
986  * not yet completed on that list.  So, as a final cleanup we go through
987  * the osync code to catch these locked, dirty buffers without requeuing
988  * any newly dirty buffers for write.
989  */
990 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
991 {
992         struct buffer_head *bh;
993         struct list_head tmp;
994         int err = 0, err2;
995
996         INIT_LIST_HEAD(&tmp);
997
998         spin_lock(lock);
999         while (!list_empty(list)) {
1000                 bh = BH_ENTRY(list->next);
1001                 list_del_init(&bh->b_assoc_buffers);
1002                 if (buffer_dirty(bh) || buffer_locked(bh)) {
1003                         list_add(&bh->b_assoc_buffers, &tmp);
1004                         if (buffer_dirty(bh)) {
1005                                 get_bh(bh);
1006                                 spin_unlock(lock);
1007                                 /*
1008                                  * Ensure any pending I/O completes so that
1009                                  * ll_rw_block() actually writes the current
1010                                  * contents - it is a noop if I/O is still in
1011                                  * flight on potentially older contents.
1012                                  */
1013                                 wait_on_buffer(bh);
1014                                 ll_rw_block(WRITE, 1, &bh);
1015                                 brelse(bh);
1016                                 spin_lock(lock);
1017                         }
1018                 }
1019         }
1020
1021         while (!list_empty(&tmp)) {
1022                 bh = BH_ENTRY(tmp.prev);
1023                 __remove_assoc_queue(bh);
1024                 get_bh(bh);
1025                 spin_unlock(lock);
1026                 wait_on_buffer(bh);
1027                 if (!buffer_uptodate(bh))
1028                         err = -EIO;
1029                 brelse(bh);
1030                 spin_lock(lock);
1031         }
1032         
1033         spin_unlock(lock);
1034         err2 = osync_buffers_list(lock, list);
1035         if (err)
1036                 return err;
1037         else
1038                 return err2;
1039 }
1040
1041 /*
1042  * Invalidate any and all dirty buffers on a given inode.  We are
1043  * probably unmounting the fs, but that doesn't mean we have already
1044  * done a sync().  Just drop the buffers from the inode list.
1045  *
1046  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
1047  * assumes that all the buffers are against the blockdev.  Not true
1048  * for reiserfs.
1049  */
1050 void invalidate_inode_buffers(struct inode *inode)
1051 {
1052         if (inode_has_buffers(inode)) {
1053                 struct address_space *mapping = &inode->i_data;
1054                 struct list_head *list = &mapping->private_list;
1055                 struct address_space *buffer_mapping = mapping->assoc_mapping;
1056
1057                 spin_lock(&buffer_mapping->private_lock);
1058                 while (!list_empty(list))
1059                         __remove_assoc_queue(BH_ENTRY(list->next));
1060                 spin_unlock(&buffer_mapping->private_lock);
1061         }
1062 }
1063
1064 /*
1065  * Remove any clean buffers from the inode's buffer list.  This is called
1066  * when we're trying to free the inode itself.  Those buffers can pin it.
1067  *
1068  * Returns true if all buffers were removed.
1069  */
1070 int remove_inode_buffers(struct inode *inode)
1071 {
1072         int ret = 1;
1073
1074         if (inode_has_buffers(inode)) {
1075                 struct address_space *mapping = &inode->i_data;
1076                 struct list_head *list = &mapping->private_list;
1077                 struct address_space *buffer_mapping = mapping->assoc_mapping;
1078
1079                 spin_lock(&buffer_mapping->private_lock);
1080                 while (!list_empty(list)) {
1081                         struct buffer_head *bh = BH_ENTRY(list->next);
1082                         if (buffer_dirty(bh)) {
1083                                 ret = 0;
1084                                 break;
1085                         }
1086                         __remove_assoc_queue(bh);
1087                 }
1088                 spin_unlock(&buffer_mapping->private_lock);
1089         }
1090         return ret;
1091 }
1092
1093 /*
1094  * Create the appropriate buffers when given a page for data area and
1095  * the size of each buffer.. Use the bh->b_this_page linked list to
1096  * follow the buffers created.  Return NULL if unable to create more
1097  * buffers.
1098  *
1099  * The retry flag is used to differentiate async IO (paging, swapping)
1100  * which may not fail from ordinary buffer allocations.
1101  */
1102 static struct buffer_head *
1103 create_buffers(struct page * page, unsigned long size, int retry)
1104 {
1105         struct buffer_head *bh, *head;
1106         long offset;
1107
1108 try_again:
1109         head = NULL;
1110         offset = PAGE_SIZE;
1111         while ((offset -= size) >= 0) {
1112                 bh = alloc_buffer_head(GFP_NOFS);
1113                 if (!bh)
1114                         goto no_grow;
1115
1116                 bh->b_bdev = NULL;
1117                 bh->b_this_page = head;
1118                 bh->b_blocknr = -1;
1119                 head = bh;
1120
1121                 bh->b_state = 0;
1122                 atomic_set(&bh->b_count, 0);
1123                 bh->b_size = size;
1124
1125                 /* Link the buffer to its page */
1126                 set_bh_page(bh, page, offset);
1127
1128                 bh->b_end_io = NULL;
1129         }
1130         return head;
1131 /*
1132  * In case anything failed, we just free everything we got.
1133  */
1134 no_grow:
1135         if (head) {
1136                 do {
1137                         bh = head;
1138                         head = head->b_this_page;
1139                         free_buffer_head(bh);
1140                 } while (head);
1141         }
1142
1143         /*
1144          * Return failure for non-async IO requests.  Async IO requests
1145          * are not allowed to fail, so we have to wait until buffer heads
1146          * become available.  But we don't want tasks sleeping with 
1147          * partially complete buffers, so all were released above.
1148          */
1149         if (!retry)
1150                 return NULL;
1151
1152         /* We're _really_ low on memory. Now we just
1153          * wait for old buffer heads to become free due to
1154          * finishing IO.  Since this is an async request and
1155          * the reserve list is empty, we're sure there are 
1156          * async buffer heads in use.
1157          */
1158         free_more_memory();
1159         goto try_again;
1160 }
1161
1162 static inline void
1163 link_dev_buffers(struct page *page, struct buffer_head *head)
1164 {
1165         struct buffer_head *bh, *tail;
1166
1167         bh = head;
1168         do {
1169                 tail = bh;
1170                 bh = bh->b_this_page;
1171         } while (bh);
1172         tail->b_this_page = head;
1173         __set_page_buffers(page, head);
1174 }
1175
1176 /*
1177  * Initialise the state of a blockdev page's buffers.
1178  */ 
1179 static void
1180 init_page_buffers(struct page *page, struct block_device *bdev,
1181                         sector_t block, int size)
1182 {
1183         struct buffer_head *head = page_buffers(page);
1184         struct buffer_head *bh = head;
1185         int uptodate = PageUptodate(page);
1186
1187         do {
1188                 if (!buffer_mapped(bh)) {
1189                         init_buffer(bh, NULL, NULL);
1190                         bh->b_bdev = bdev;
1191                         bh->b_blocknr = block;
1192                         if (uptodate)
1193                                 set_buffer_uptodate(bh);
1194                         set_buffer_mapped(bh);
1195                 }
1196                 block++;
1197                 bh = bh->b_this_page;
1198         } while (bh != head);
1199 }
1200
1201 /*
1202  * Create the page-cache page that contains the requested block.
1203  *
1204  * This is used purely for blockdev mappings.
1205  */
1206 static struct page *
1207 grow_dev_page(struct block_device *bdev, sector_t block,
1208                 pgoff_t index, int size)
1209 {
1210         struct inode *inode = bdev->bd_inode;
1211         struct page *page;
1212         struct buffer_head *bh;
1213
1214         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1215         if (!page)
1216                 return NULL;
1217
1218         if (!PageLocked(page))
1219                 BUG();
1220
1221         if (page_has_buffers(page)) {
1222                 bh = page_buffers(page);
1223                 if (bh->b_size == size) {
1224                         init_page_buffers(page, bdev, block, size);
1225                         return page;
1226                 }
1227                 if (!try_to_free_buffers(page))
1228                         goto failed;
1229         }
1230
1231         /*
1232          * Allocate some buffers for this page
1233          */
1234         bh = create_buffers(page, size, 0);
1235         if (!bh)
1236                 goto failed;
1237
1238         /*
1239          * Link the page to the buffers and initialise them.  Take the
1240          * lock to be atomic wrt __find_get_block(), which does not
1241          * run under the page lock.
1242          */
1243         spin_lock(&inode->i_mapping->private_lock);
1244         link_dev_buffers(page, bh);
1245         init_page_buffers(page, bdev, block, size);
1246         spin_unlock(&inode->i_mapping->private_lock);
1247         return page;
1248
1249 failed:
1250         BUG();
1251         unlock_page(page);
1252         page_cache_release(page);
1253         return NULL;
1254 }
1255
1256 /*
1257  * Create buffers for the specified block device block's page.  If
1258  * that page was dirty, the buffers are set dirty also.
1259  *
1260  * Except that's a bug.  Attaching dirty buffers to a dirty
1261  * blockdev's page can result in filesystem corruption, because
1262  * some of those buffers may be aliases of filesystem data.
1263  * grow_dev_page() will go BUG() if this happens.
1264  */
1265 static inline int
1266 grow_buffers(struct block_device *bdev, sector_t block, int size)
1267 {
1268         struct page *page;
1269         pgoff_t index;
1270         int sizebits;
1271
1272         sizebits = -1;
1273         do {
1274                 sizebits++;
1275         } while ((size << sizebits) < PAGE_SIZE);
1276
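        /*
         * Editor's note: e.g. with 1k buffers and 4k pages, sizebits == 2,
         * so the page index is block/4 and `block' is rounded down to the
         * first block covered by that page.
         */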
1277         index = block >> sizebits;
1278         block = index << sizebits;
1279
1280         /* Create a page with the proper size buffers.. */
1281         page = grow_dev_page(bdev, block, index, size);
1282         if (!page)
1283                 return 0;
1284         unlock_page(page);
1285         page_cache_release(page);
1286         return 1;
1287 }
1288
1289 struct buffer_head *
1290 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1291 {
1292         /* Size must be multiple of hard sectorsize */
1293         if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1294                         (size < 512 || size > PAGE_SIZE))) {
1295                 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1296                                         size);
1297                 printk(KERN_ERR "hardsect size: %d\n",
1298                                         bdev_hardsect_size(bdev));
1299
1300                 dump_stack();
1301                 return NULL;
1302         }
1303
1304         for (;;) {
1305                 struct buffer_head * bh;
1306
1307                 bh = __find_get_block(bdev, block, size);
1308                 if (bh)
1309                         return bh;
1310
1311                 if (!grow_buffers(bdev, block, size))
1312                         free_more_memory();
1313         }
1314 }
1315
1316 /*
1317  * The relationship between dirty buffers and dirty pages:
1318  *
1319  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1320  * the page is tagged dirty in its radix tree.
1321  *
1322  * At all times, the dirtiness of the buffers represents the dirtiness of
1323  * subsections of the page.  If the page has buffers, the page dirty bit is
1324  * merely a hint about the true dirty state.
1325  *
1326  * When a page is set dirty in its entirety, all its buffers are marked dirty
1327  * (if the page has buffers).
1328  *
1329  * When a buffer is marked dirty, its page is dirtied, but the page's other
1330  * buffers are not.
1331  *
1332  * Also.  When blockdev buffers are explicitly read with bread(), they
1333  * individually become uptodate.  But their backing page remains not
1334  * uptodate - even if all of its buffers are uptodate.  A subsequent
1335  * block_read_full_page() against that page will discover all the uptodate
1336  * buffers, will set the page uptodate and will perform no I/O.
1337  */
1338
1339 /**
1340  * mark_buffer_dirty - mark a buffer_head as needing writeout
1341  *
1342  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1343  * backing page dirty, then tag the page as dirty in its address_space's radix
1344  * tree and then attach the address_space's inode to its superblock's dirty
1345  * inode list.
1346  *
1347  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1348  * mapping->tree_lock and the global inode_lock.
1349  */
1350 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1351 {
1352         if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1353                 __set_page_dirty_nobuffers(bh->b_page);
1354 }
1355
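/*
 * Editor's illustrative sketch (not part of the original file): the usual
 * modify-then-dirty sequence for a metadata buffer.  The function name and
 * parameters are hypothetical.
 */
static void example_update_block(struct buffer_head *bh, const void *data,
                                 size_t len)
{
        memcpy(bh->b_data, data, len);  /* modify the buffer contents */
        mark_buffer_dirty(bh);          /* dirty buffer, page and inode */
}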
1356 /*
1357  * Decrement a buffer_head's reference count.  If all buffers against a page
1358  * have zero reference count, are clean and unlocked, and if the page is clean
1359  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1360  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1361  * a page but it ends up not being freed, and buffers may later be reattached).
1362  */
1363 void __brelse(struct buffer_head * buf)
1364 {
1365         if (atomic_read(&buf->b_count)) {
1366                 put_bh(buf);
1367                 return;
1368         }
1369         printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1370         WARN_ON(1);
1371 }
1372
1373 /*
1374  * bforget() is like brelse(), except it discards any
1375  * potentially dirty data.
1376  */
1377 void __bforget(struct buffer_head *bh)
1378 {
1379         clear_buffer_dirty(bh);
1380         if (!list_empty(&bh->b_assoc_buffers)) {
1381                 struct address_space *buffer_mapping = bh->b_page->mapping;
1382
1383                 spin_lock(&buffer_mapping->private_lock);
1384                 list_del_init(&bh->b_assoc_buffers);
1385                 spin_unlock(&buffer_mapping->private_lock);
1386         }
1387         __brelse(bh);
1388 }
1389
1390 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1391 {
1392         lock_buffer(bh);
1393         if (buffer_uptodate(bh)) {
1394                 unlock_buffer(bh);
1395                 return bh;
1396         } else {
1397                 get_bh(bh);
1398                 bh->b_end_io = end_buffer_read_sync;
1399                 submit_bh(READ, bh);
1400                 wait_on_buffer(bh);
1401                 if (buffer_uptodate(bh))
1402                         return bh;
1403         }
1404         brelse(bh);
1405         return NULL;
1406 }
1407
1408 /*
1409  * Per-cpu buffer LRU implementation, used to reduce the cost of __find_get_block().
1410  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1411  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1412  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1413  * CPU's LRUs at the same time.
1414  *
1415  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1416  * sb_find_get_block().
1417  *
1418  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1419  * a local interrupt disable for that.
1420  */
1421
1422 #define BH_LRU_SIZE     8
1423
1424 struct bh_lru {
1425         struct buffer_head *bhs[BH_LRU_SIZE];
1426 };
1427
1428 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1429
1430 #ifdef CONFIG_SMP
1431 #define bh_lru_lock()   local_irq_disable()
1432 #define bh_lru_unlock() local_irq_enable()
1433 #else
1434 #define bh_lru_lock()   preempt_disable()
1435 #define bh_lru_unlock() preempt_enable()
1436 #endif
1437
1438 static inline void check_irqs_on(void)
1439 {
1440 #ifdef irqs_disabled
1441         BUG_ON(irqs_disabled());
1442 #endif
1443 }
1444
1445 /*
1446  * The LRU management algorithm is dopey-but-simple.  Sorry.
1447  */
1448 static void bh_lru_install(struct buffer_head *bh)
1449 {
1450         struct buffer_head *evictee = NULL;
1451         struct bh_lru *lru;
1452
1453         check_irqs_on();
1454         bh_lru_lock();
1455         lru = &__get_cpu_var(bh_lrus);
1456         if (lru->bhs[0] != bh) {
1457                 struct buffer_head *bhs[BH_LRU_SIZE];
1458                 int in;
1459                 int out = 0;
1460
1461                 get_bh(bh);
1462                 bhs[out++] = bh;
1463                 for (in = 0; in < BH_LRU_SIZE; in++) {
1464                         struct buffer_head *bh2 = lru->bhs[in];
1465
1466                         if (bh2 == bh) {
1467                                 __brelse(bh2);
1468                         } else {
1469                                 if (out >= BH_LRU_SIZE) {
1470                                         BUG_ON(evictee != NULL);
1471                                         evictee = bh2;
1472                                 } else {
1473                                         bhs[out++] = bh2;
1474                                 }
1475                         }
1476                 }
1477                 while (out < BH_LRU_SIZE)
1478                         bhs[out++] = NULL;
1479                 memcpy(lru->bhs, bhs, sizeof(bhs));
1480         }
1481         bh_lru_unlock();
1482
1483         if (evictee)
1484                 __brelse(evictee);
1485 }
1486
1487 /*
1488  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1489  */
1490 static inline struct buffer_head *
1491 lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1492 {
1493         struct buffer_head *ret = NULL;
1494         struct bh_lru *lru;
1495         int i;
1496
1497         check_irqs_on();
1498         bh_lru_lock();
1499         lru = &__get_cpu_var(bh_lrus);
1500         for (i = 0; i < BH_LRU_SIZE; i++) {
1501                 struct buffer_head *bh = lru->bhs[i];
1502
1503                 if (bh && bh->b_bdev == bdev &&
1504                                 bh->b_blocknr == block && bh->b_size == size) {
1505                         if (i) {
1506                                 while (i) {
1507                                         lru->bhs[i] = lru->bhs[i - 1];
1508                                         i--;
1509                                 }
1510                                 lru->bhs[0] = bh;
1511                         }
1512                         get_bh(bh);
1513                         ret = bh;
1514                         break;
1515                 }
1516         }
1517         bh_lru_unlock();
1518         return ret;
1519 }
1520
1521 /*
1522  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1523  * it in the LRU and mark it as accessed.  If it is not present then return
1524  * NULL
1525  */
1526 struct buffer_head *
1527 __find_get_block(struct block_device *bdev, sector_t block, int size)
1528 {
1529         struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1530
1531         if (bh == NULL) {
1532                 bh = __find_get_block_slow(bdev, block, size);
1533                 if (bh)
1534                         bh_lru_install(bh);
1535         }
1536         if (bh)
1537                 touch_buffer(bh);
1538         return bh;
1539 }
1540 EXPORT_SYMBOL(__find_get_block);
1541
1542 /*
1543  * __getblk will locate (and, if necessary, create) the buffer_head
1544  * which corresponds to the passed block_device, block and size. The
1545  * returned buffer has its reference count incremented.
1546  *
1547  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1548  * illegal block number, __getblk() will happily return a buffer_head
1549  * which represents the non-existent block.  Very weird.
1550  *
1551  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1552  * attempt is failing.  FIXME, perhaps?
1553  */
1554 struct buffer_head *
1555 __getblk(struct block_device *bdev, sector_t block, int size)
1556 {
1557         struct buffer_head *bh = __find_get_block(bdev, block, size);
1558
1559         might_sleep();
1560         if (bh == NULL)
1561                 bh = __getblk_slow(bdev, block, size);
1562         return bh;
1563 }
1564 EXPORT_SYMBOL(__getblk);
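
/*
 * Editor's illustrative sketch (not part of the original file): the usual
 * pattern for initializing a freshly allocated metadata block with
 * __getblk().  The function name, block number and 512-byte size are
 * hypothetical; compare ext2/ext3, which zero new indirect blocks this way.
 */
#if 0	/* example only, not compiled */
static void example_init_new_block(struct block_device *bdev, sector_t blocknr)
{
        struct buffer_head *bh = __getblk(bdev, blocknr, 512);

        /* __getblk() cannot fail; bh comes back with an extra reference */
        lock_buffer(bh);
        memset(bh->b_data, 0, bh->b_size);
        set_buffer_uptodate(bh);
        unlock_buffer(bh);
        mark_buffer_dirty(bh);          /* writeback will pick it up later */
        brelse(bh);
}
#endif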
1565
1566 /*
1567  * Do async read-ahead on a buffer.
1568  */
1569 void __breadahead(struct block_device *bdev, sector_t block, int size)
1570 {
1571         struct buffer_head *bh = __getblk(bdev, block, size);
1572         ll_rw_block(READA, 1, &bh);
1573         brelse(bh);
1574 }
1575 EXPORT_SYMBOL(__breadahead);
1576
1577 /**
1578  *  __bread() - reads a specified block and returns the bh
1579  *  @block: number of block
1580  *  @size: size (in bytes) to read
1581  * 
1582  *  Reads a specified block, and returns the buffer head that contains it.
1583  *  It returns NULL if the block was unreadable.
1584  */
1585 struct buffer_head *
1586 __bread(struct block_device *bdev, sector_t block, int size)
1587 {
1588         struct buffer_head *bh = __getblk(bdev, block, size);
1589
1590         if (!buffer_uptodate(bh))
1591                 bh = __bread_slow(bh);
1592         return bh;
1593 }
1594 EXPORT_SYMBOL(__bread);
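
/*
 * Editor's illustrative sketch (not part of the original file): reading one
 * block synchronously with __bread().  The block number and 1024-byte size
 * are hypothetical; filesystems normally use the sb_bread() wrapper from
 * buffer_head.h, which supplies the superblock's blocksize.
 */
#if 0	/* example only, not compiled */
static int example_read_block(struct block_device *bdev, sector_t blocknr)
{
        struct buffer_head *bh = __bread(bdev, blocknr, 1024);

        if (bh == NULL)
                return -EIO;            /* the read failed */
        /* bh->b_data now holds the block's contents */
        brelse(bh);                     /* drop the reference from __bread() */
        return 0;
}
#endif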
1595
1596 /*
1597  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1598  * This doesn't race because it runs on each cpu either in irq
1599  * or with preempt disabled.
1600  */
1601 static void invalidate_bh_lru(void *arg)
1602 {
1603         struct bh_lru *b = &get_cpu_var(bh_lrus);
1604         int i;
1605
1606         for (i = 0; i < BH_LRU_SIZE; i++) {
1607                 brelse(b->bhs[i]);
1608                 b->bhs[i] = NULL;
1609         }
1610         put_cpu_var(bh_lrus);
1611 }
1612         
1613 static void invalidate_bh_lrus(void)
1614 {
1615         on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1616 }
1617
1618 void set_bh_page(struct buffer_head *bh,
1619                 struct page *page, unsigned long offset)
1620 {
1621         bh->b_page = page;
1622         if (offset >= PAGE_SIZE)
1623                 BUG();
1624         if (PageHighMem(page))
1625                 /*
1626                  * This catches illegal uses and preserves the offset:
1627                  */
1628                 bh->b_data = (char *)(0 + offset);
1629         else
1630                 bh->b_data = page_address(page) + offset;
1631 }
1632 EXPORT_SYMBOL(set_bh_page);
1633
1634 /*
1635  * Called when truncating a buffer on a page completely.
1636  */
1637 static inline void discard_buffer(struct buffer_head * bh)
1638 {
1639         lock_buffer(bh);
1640         clear_buffer_dirty(bh);
1641         bh->b_bdev = NULL;
1642         clear_buffer_mapped(bh);
1643         clear_buffer_req(bh);
1644         clear_buffer_new(bh);
1645         clear_buffer_delay(bh);
1646         unlock_buffer(bh);
1647 }
1648
1649 /**
1650  * try_to_release_page() - release old fs-specific metadata on a page
1651  *
1652  * @page: the page which the kernel is trying to free
1653  * @gfp_mask: memory allocation flags (and I/O mode)
1654  *
1655  * The address_space is asked to try to release any data held against the page
1656  * (presumably at page->private).  If the release was successful, return `1'.
1657  * Otherwise return zero.
1658  *
1659  * The @gfp_mask argument specifies whether I/O may be performed to release
1660  * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1661  *
1662  * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1663  */
1664 int try_to_release_page(struct page *page, int gfp_mask)
1665 {
1666         struct address_space * const mapping = page->mapping;
1667
1668         BUG_ON(!PageLocked(page));
1669         if (PageWriteback(page))
1670                 return 0;
1671         
1672         if (mapping && mapping->a_ops->releasepage)
1673                 return mapping->a_ops->releasepage(page, gfp_mask);
1674         return try_to_free_buffers(page);
1675 }
1676 EXPORT_SYMBOL(try_to_release_page);
1677
1678 /**
1679  * block_invalidatepage - invalidate part or all of a buffer-backed page
1680  *
1681  * @page: the page which is affected
1682  * @offset: the index of the truncation point
1683  *
1684  * block_invalidatepage() is called when all or part of the page has become
1685  * invalidated by a truncate operation.
1686  *
1687  * block_invalidatepage() does not have to release all buffers, but it must
1688  * ensure that no dirty buffer is left outside @offset and that no I/O
1689  * is underway against any of the blocks which are outside the truncation
1690  * point.  Because the caller is about to free (and possibly reuse) those
1691  * blocks on-disk.
1692  */
1693 int block_invalidatepage(struct page *page, unsigned long offset)
1694 {
1695         struct buffer_head *head, *bh, *next;
1696         unsigned int curr_off = 0;
1697         int ret = 1;
1698
1699         BUG_ON(!PageLocked(page));
1700         if (!page_has_buffers(page))
1701                 goto out;
1702
1703         head = page_buffers(page);
1704         bh = head;
1705         do {
1706                 unsigned int next_off = curr_off + bh->b_size;
1707                 next = bh->b_this_page;
1708
1709                 /*
1710                  * is this block fully invalidated?
1711                  */
1712                 if (offset <= curr_off)
1713                         discard_buffer(bh);
1714                 curr_off = next_off;
1715                 bh = next;
1716         } while (bh != head);
1717
1718         /*
1719          * We release buffers only if the entire page is being invalidated.
1720          * The get_block cached value has been unconditionally invalidated,
1721          * so real IO is not possible anymore.
1722          */
1723         if (offset == 0)
1724                 ret = try_to_release_page(page, 0);
1725 out:
1726         return ret;
1727 }
1728 EXPORT_SYMBOL(block_invalidatepage);
1729
1730 /*
1731  * We attach and possibly dirty the buffers atomically wrt
1732  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1733  * is already excluded via the page lock.
1734  */
1735 void create_empty_buffers(struct page *page,
1736                         unsigned long blocksize, unsigned long b_state)
1737 {
1738         struct buffer_head *bh, *head, *tail;
1739
1740         head = create_buffers(page, blocksize, 1);
1741         bh = head;
1742         do {
1743                 bh->b_state |= b_state;
1744                 tail = bh;
1745                 bh = bh->b_this_page;
1746         } while (bh);
1747         tail->b_this_page = head;
1748
1749         spin_lock(&page->mapping->private_lock);
1750         if (PageUptodate(page) || PageDirty(page)) {
1751                 bh = head;
1752                 do {
1753                         if (PageDirty(page))
1754                                 set_buffer_dirty(bh);
1755                         if (PageUptodate(page))
1756                                 set_buffer_uptodate(bh);
1757                         bh = bh->b_this_page;
1758                 } while (bh != head);
1759         }
1760         __set_page_buffers(page, head);
1761         spin_unlock(&page->mapping->private_lock);
1762 }
1763 EXPORT_SYMBOL(create_empty_buffers);
1764
1765 /*
1766  * We are taking a block for data and we don't want any output from any
1767  * buffer-cache aliases from the moment this function returns until
1768  * something explicitly marks the buffer dirty (hopefully that will not
1769  * happen until we free that block ;-)
1770  * We don't even need to mark it not-uptodate - nobody can expect
1771  * anything from a newly allocated buffer anyway. We used to use
1772  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1773  * don't want to mark the alias unmapped, for example - it would confuse
1774  * anyone who might pick it with bread() afterwards...
1775  *
1776  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1777  * be writeout I/O going on against recently-freed buffers.  We don't
1778  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1779  * only if we really need to.  That happens here.
1780  */
1781 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1782 {
1783         struct buffer_head *old_bh;
1784
1785         might_sleep();
1786
1787         old_bh = __find_get_block_slow(bdev, block, 0);
1788         if (old_bh) {
1789                 clear_buffer_dirty(old_bh);
1790                 wait_on_buffer(old_bh);
1791                 clear_buffer_req(old_bh);
1792                 __brelse(old_bh);
1793         }
1794 }
1795 EXPORT_SYMBOL(unmap_underlying_metadata);
1796
1797 /*
1798  * NOTE! All mapped/uptodate combinations are valid:
1799  *
1800  *      Mapped  Uptodate        Meaning
1801  *
1802  *      No      No              "unknown" - must do get_block()
1803  *      No      Yes             "hole" - zero-filled
1804  *      Yes     No              "allocated" - allocated on disk, not read in
1805  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1806  *
1807  * "Dirty" is valid only with the last case (mapped+uptodate).
1808  */
1809
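/*
 * Editor's illustrative sketch of how a reader acts on the table above; it
 * mirrors the logic of block_read_full_page() below (the helper name is
 * hypothetical).
 */
#if 0	/* example only, not compiled */
static int example_reader_action(struct buffer_head *bh)
{
        if (buffer_uptodate(bh))
                return 0;       /* "hole" or "valid": data already in memory */
        if (!buffer_mapped(bh))
                return 1;       /* "unknown": call get_block() first */
        return 2;               /* "allocated": mapped on disk, must be read */
}
#endif
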
1810 /*
1811  * While block_write_full_page is writing back the dirty buffers under
1812  * the page lock, whoever dirtied the buffers may decide to clean them
1813  * again at any time.  We handle that by only looking at the buffer
1814  * state inside lock_buffer().
1815  *
1816  * If block_write_full_page() is called for regular writeback
1817  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1818  * locked buffer.  This can only happen if someone has written the buffer
1819  * directly, with submit_bh().  At the address_space level PageWriteback
1820  * prevents this contention from occurring.
1821  */
1822 static int __block_write_full_page(struct inode *inode, struct page *page,
1823                         get_block_t *get_block, struct writeback_control *wbc)
1824 {
1825         int err;
1826         sector_t block;
1827         sector_t last_block;
1828         struct buffer_head *bh, *head;
1829         int nr_underway = 0;
1830
1831         BUG_ON(!PageLocked(page));
1832
1833         last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1834
1835         if (!page_has_buffers(page)) {
1836                 create_empty_buffers(page, 1 << inode->i_blkbits,
1837                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
1838         }
1839
1840         /*
1841          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1842          * here, and the (potentially unmapped) buffers may become dirty at
1843          * any time.  If a buffer becomes dirty here after we've inspected it
1844          * then we just miss that fact, and the page stays dirty.
1845          *
1846          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1847          * handle that here by just cleaning them.
1848          */
1849
1850         block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1851         head = page_buffers(page);
1852         bh = head;
1853
1854         /*
1855          * Get all the dirty buffers mapped to disk addresses and
1856          * handle any aliases from the underlying blockdev's mapping.
1857          */
1858         do {
1859                 if (block > last_block) {
1860                         /*
1861                          * mapped buffers outside i_size will occur, because
1862                          * this page can be outside i_size when there is a
1863                          * truncate in progress.
1864                          */
1865                         /*
1866                          * The buffer was zeroed by block_write_full_page()
1867                          */
1868                         clear_buffer_dirty(bh);
1869                         set_buffer_uptodate(bh);
1870                 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1871                         err = get_block(inode, block, bh, 1);
1872                         if (err)
1873                                 goto recover;
1874                         if (buffer_new(bh)) {
1875                                 /* blockdev mappings never come here */
1876                                 clear_buffer_new(bh);
1877                                 unmap_underlying_metadata(bh->b_bdev,
1878                                                         bh->b_blocknr);
1879                         }
1880                 }
1881                 bh = bh->b_this_page;
1882                 block++;
1883         } while (bh != head);
1884
1885         do {
1886                 get_bh(bh);
1887                 if (!buffer_mapped(bh))
1888                         continue;
1889                 /*
1890                  * If it's a fully non-blocking write attempt and we cannot
1891                  * lock the buffer then redirty the page.  Note that this can
1892                  * potentially cause a busy-wait loop from pdflush and kswapd
1893                  * activity, but those code paths have their own higher-level
1894                  * throttling.
1895                  */
1896                 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1897                         lock_buffer(bh);
1898                 } else if (test_set_buffer_locked(bh)) {
1899                         redirty_page_for_writepage(wbc, page);
1900                         continue;
1901                 }
1902                 if (test_clear_buffer_dirty(bh)) {
1903                         mark_buffer_async_write(bh);
1904                 } else {
1905                         unlock_buffer(bh);
1906                 }
1907         } while ((bh = bh->b_this_page) != head);
1908
1909         /*
1910          * The page and its buffers are protected by PageWriteback(), so we can
1911          * drop the bh refcounts early.
1912          */
1913         BUG_ON(PageWriteback(page));
1914         set_page_writeback(page);
1915         unlock_page(page);
1916
1917         do {
1918                 struct buffer_head *next = bh->b_this_page;
1919                 if (buffer_async_write(bh)) {
1920                         submit_bh(WRITE, bh);
1921                         nr_underway++;
1922                 }
1923                 put_bh(bh);
1924                 bh = next;
1925         } while (bh != head);
1926
1927         err = 0;
1928 done:
1929         if (nr_underway == 0) {
1930                 /*
1931                  * The page was marked dirty, but the buffers were
1932                  * clean.  Someone wrote them back by hand with
1933                  * ll_rw_block/submit_bh.  A rare case.
1934                  */
1935                 int uptodate = 1;
1936                 do {
1937                         if (!buffer_uptodate(bh)) {
1938                                 uptodate = 0;
1939                                 break;
1940                         }
1941                         bh = bh->b_this_page;
1942                 } while (bh != head);
1943                 if (uptodate)
1944                         SetPageUptodate(page);
1945                 end_page_writeback(page);
1946                 /*
1947                  * The page and buffer_heads can be released at any time from
1948                  * here on.
1949                  */
1950                 wbc->pages_skipped++;   /* We didn't write this page */
1951         }
1952         return err;
1953
1954 recover:
1955         /*
1956          * ENOSPC, or some other error.  We may already have added some
1957          * blocks to the file, so we need to write these out to avoid
1958          * exposing stale data.
1959          * The page is currently locked and not marked for writeback
1960          */
1961         bh = head;
1962         /* Recovery: lock and submit the mapped buffers */
1963         do {
1964                 get_bh(bh);
1965                 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1966                         lock_buffer(bh);
1967                         mark_buffer_async_write(bh);
1968                 } else {
1969                         /*
1970                          * The buffer may have been set dirty during
1971                          * attachment to a dirty page.
1972                          */
1973                         clear_buffer_dirty(bh);
1974                 }
1975         } while ((bh = bh->b_this_page) != head);
1976         SetPageError(page);
1977         BUG_ON(PageWriteback(page));
1978         set_page_writeback(page);
1979         unlock_page(page);
1980         do {
1981                 struct buffer_head *next = bh->b_this_page;
1982                 if (buffer_async_write(bh)) {
1983                         clear_buffer_dirty(bh);
1984                         submit_bh(WRITE, bh);
1985                         nr_underway++;
1986                 }
1987                 put_bh(bh);
1988                 bh = next;
1989         } while (bh != head);
1990         goto done;
1991 }
1992
1993 static int __block_prepare_write(struct inode *inode, struct page *page,
1994                 unsigned from, unsigned to, get_block_t *get_block)
1995 {
1996         unsigned block_start, block_end;
1997         sector_t block;
1998         int err = 0;
1999         unsigned blocksize, bbits;
2000         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
2001
2002         BUG_ON(!PageLocked(page));
2003         BUG_ON(from > PAGE_CACHE_SIZE);
2004         BUG_ON(to > PAGE_CACHE_SIZE);
2005         BUG_ON(from > to);
2006
2007         blocksize = 1 << inode->i_blkbits;
2008         if (!page_has_buffers(page))
2009                 create_empty_buffers(page, blocksize, 0);
2010         head = page_buffers(page);
2011
2012         bbits = inode->i_blkbits;
2013         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
2014
2015         for(bh = head, block_start = 0; bh != head || !block_start;
2016             block++, block_start=block_end, bh = bh->b_this_page) {
2017                 block_end = block_start + blocksize;
2018                 if (block_end <= from || block_start >= to) {
2019                         if (PageUptodate(page)) {
2020                                 if (!buffer_uptodate(bh))
2021                                         set_buffer_uptodate(bh);
2022                         }
2023                         continue;
2024                 }
2025                 if (buffer_new(bh))
2026                         clear_buffer_new(bh);
2027                 if (!buffer_mapped(bh)) {
2028                         err = get_block(inode, block, bh, 1);
2029                         if (err)
2030                                 goto out;
2031                         if (buffer_new(bh)) {
2032                                 clear_buffer_new(bh);
2033                                 unmap_underlying_metadata(bh->b_bdev,
2034                                                         bh->b_blocknr);
2035                                 if (PageUptodate(page)) {
2036                                         set_buffer_uptodate(bh);
2037                                         continue;
2038                                 }
2039                                 if (block_end > to || block_start < from) {
2040                                         void *kaddr;
2041
2042                                         kaddr = kmap_atomic(page, KM_USER0);
2043                                         if (block_end > to)
2044                                                 memset(kaddr+to, 0,
2045                                                         block_end-to);
2046                                         if (block_start < from)
2047                                                 memset(kaddr+block_start,
2048                                                         0, from-block_start);
2049                                         flush_dcache_page(page);
2050                                         kunmap_atomic(kaddr, KM_USER0);
2051                                 }
2052                                 continue;
2053                         }
2054                 }
2055                 if (PageUptodate(page)) {
2056                         if (!buffer_uptodate(bh))
2057                                 set_buffer_uptodate(bh);
2058                         continue; 
2059                 }
2060                 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2061                      (block_start < from || block_end > to)) {
2062                         ll_rw_block(READ, 1, &bh);
2063                         *wait_bh++=bh;
2064                 }
2065         }
2066         /*
2067          * If we issued read requests - let them complete.
2068          */
2069         while(wait_bh > wait) {
2070                 wait_on_buffer(*--wait_bh);
2071                 if (!buffer_uptodate(*wait_bh))
2072                         return -EIO;
2073         }
2074         return 0;
2075 out:
2076         /*
2077          * Zero out any newly allocated blocks to avoid exposing stale
2078          * data.  If BH_New is set, we know that the block was newly
2079          * allocated in the above loop.
2080          */
2081         bh = head;
2082         block_start = 0;
2083         do {
2084                 block_end = block_start+blocksize;
2085                 if (block_end <= from)
2086                         goto next_bh;
2087                 if (block_start >= to)
2088                         break;
2089                 if (buffer_new(bh)) {
2090                         void *kaddr;
2091
2092                         clear_buffer_new(bh);
2093                         kaddr = kmap_atomic(page, KM_USER0);
2094                         memset(kaddr+block_start, 0, bh->b_size);
2095                         kunmap_atomic(kaddr, KM_USER0);
2096                         set_buffer_uptodate(bh);
2097                         mark_buffer_dirty(bh);
2098                 }
2099 next_bh:
2100                 block_start = block_end;
2101                 bh = bh->b_this_page;
2102         } while (bh != head);
2103         return err;
2104 }
2105
2106 static int __block_commit_write(struct inode *inode, struct page *page,
2107                 unsigned from, unsigned to)
2108 {
2109         unsigned block_start, block_end;
2110         int partial = 0;
2111         unsigned blocksize;
2112         struct buffer_head *bh, *head;
2113
2114         blocksize = 1 << inode->i_blkbits;
2115
2116         for(bh = head = page_buffers(page), block_start = 0;
2117             bh != head || !block_start;
2118             block_start=block_end, bh = bh->b_this_page) {
2119                 block_end = block_start + blocksize;
2120                 if (block_end <= from || block_start >= to) {
2121                         if (!buffer_uptodate(bh))
2122                                 partial = 1;
2123                 } else {
2124                         set_buffer_uptodate(bh);
2125                         mark_buffer_dirty(bh);
2126                 }
2127         }
2128
2129         /*
2130          * If this is a partial write which happened to make all buffers
2131          * uptodate then we can optimize away a bogus readpage() for
2132          * the next read(). Here we 'discover' whether the page went
2133          * uptodate as a result of this (potentially partial) write.
2134          */
2135         if (!partial)
2136                 SetPageUptodate(page);
2137         return 0;
2138 }
2139
2140 /*
2141  * Generic "read page" function for block devices that have the normal
2142  * get_block functionality. This covers most block-device-backed filesystems.
2143  * Reads the page asynchronously --- the unlock_buffer() and
2144  * set/clear_buffer_uptodate() functions propagate buffer state into the
2145  * page struct once IO has completed.
2146  */
2147 int block_read_full_page(struct page *page, get_block_t *get_block)
2148 {
2149         struct inode *inode = page->mapping->host;
2150         sector_t iblock, lblock;
2151         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2152         unsigned int blocksize;
2153         int nr, i;
2154         int fully_mapped = 1;
2155
2156         if (!PageLocked(page))
2157                 PAGE_BUG(page);
2158         blocksize = 1 << inode->i_blkbits;
2159         if (!page_has_buffers(page))
2160                 create_empty_buffers(page, blocksize, 0);
2161         head = page_buffers(page);
2162
2163         iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2164         lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2165         bh = head;
2166         nr = 0;
2167         i = 0;
2168
2169         do {
2170                 if (buffer_uptodate(bh))
2171                         continue;
2172
2173                 if (!buffer_mapped(bh)) {
2174                         fully_mapped = 0;
2175                         if (iblock < lblock) {
2176                                 if (get_block(inode, iblock, bh, 0))
2177                                         SetPageError(page);
2178                         }
2179                         if (!buffer_mapped(bh)) {
2180                                 void *kaddr = kmap_atomic(page, KM_USER0);
2181                                 memset(kaddr + i * blocksize, 0, blocksize);
2182                                 flush_dcache_page(page);
2183                                 kunmap_atomic(kaddr, KM_USER0);
2184                                 set_buffer_uptodate(bh);
2185                                 continue;
2186                         }
2187                         /*
2188                          * get_block() might have updated the buffer
2189                          * synchronously
2190                          */
2191                         if (buffer_uptodate(bh))
2192                                 continue;
2193                 }
2194                 arr[nr++] = bh;
2195         } while (i++, iblock++, (bh = bh->b_this_page) != head);
2196
2197         if (fully_mapped)
2198                 SetPageMappedToDisk(page);
2199
2200         if (!nr) {
2201                 /*
2202                  * All buffers are uptodate - we can set the page uptodate
2203                  * as well. But not if get_block() returned an error.
2204                  */
2205                 if (!PageError(page))
2206                         SetPageUptodate(page);
2207                 unlock_page(page);
2208                 return 0;
2209         }
2210
2211         /* Stage two: lock the buffers */
2212         for (i = 0; i < nr; i++) {
2213                 bh = arr[i];
2214                 lock_buffer(bh);
2215                 mark_buffer_async_read(bh);
2216         }
2217
2218         /*
2219          * Stage 3: start the IO.  Check for uptodateness
2220          * inside the buffer lock in case another process reading
2221          * the underlying blockdev brought it uptodate (the sct fix).
2222          */
2223         for (i = 0; i < nr; i++) {
2224                 bh = arr[i];
2225                 if (buffer_uptodate(bh))
2226                         end_buffer_async_read(bh, 1);
2227                 else
2228                         submit_bh(READ, bh);
2229         }
2230         return 0;
2231 }
2232
2233 /* Utility function for filesystems that need to do work on expanding
2234  * truncates.  Uses prepare/commit_write to allow the filesystem to
2235  * deal with the hole.
2236  */
2237 int generic_cont_expand(struct inode *inode, loff_t size)
2238 {
2239         struct address_space *mapping = inode->i_mapping;
2240         struct page *page;
2241         unsigned long index, offset, limit;
2242         int err;
2243
2244         err = -EFBIG;
2245         limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2246         if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2247                 send_sig(SIGXFSZ, current, 0);
2248                 goto out;
2249         }
2250         if (size > inode->i_sb->s_maxbytes)
2251                 goto out;
2252
2253         offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
2254
2255         /* ugh.  in prepare/commit_write, if from==to==start of block, we 
2256         ** skip the prepare.  make sure we never send an offset for the start
2257         ** of a block
2258         */
2259         if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2260                 offset++;
2261         }
2262         index = size >> PAGE_CACHE_SHIFT;
2263         err = -ENOMEM;
2264         page = grab_cache_page(mapping, index);
2265         if (!page)
2266                 goto out;
2267         err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2268         if (!err) {
2269                 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2270         }
2271         unlock_page(page);
2272         page_cache_release(page);
2273         if (err > 0)
2274                 err = 0;
2275 out:
2276         return err;
2277 }
2278
2279 /*
2280  * For moronic filesystems that do not allow holes in files.
2281  * We may have to extend the file.
2282  */
2283
2284 int cont_prepare_write(struct page *page, unsigned offset,
2285                 unsigned to, get_block_t *get_block, loff_t *bytes)
2286 {
2287         struct address_space *mapping = page->mapping;
2288         struct inode *inode = mapping->host;
2289         struct page *new_page;
2290         pgoff_t pgpos;
2291         long status;
2292         unsigned zerofrom;
2293         unsigned blocksize = 1 << inode->i_blkbits;
2294         void *kaddr;
2295
2296         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2297                 status = -ENOMEM;
2298                 new_page = grab_cache_page(mapping, pgpos);
2299                 if (!new_page)
2300                         goto out;
2301                 /* we might sleep */
2302                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2303                         unlock_page(new_page);
2304                         page_cache_release(new_page);
2305                         continue;
2306                 }
2307                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2308                 if (zerofrom & (blocksize-1)) {
2309                         *bytes |= (blocksize-1);
2310                         (*bytes)++;
2311                 }
2312                 status = __block_prepare_write(inode, new_page, zerofrom,
2313                                                 PAGE_CACHE_SIZE, get_block);
2314                 if (status)
2315                         goto out_unmap;
2316                 kaddr = kmap_atomic(new_page, KM_USER0);
2317                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2318                 flush_dcache_page(new_page);
2319                 kunmap_atomic(kaddr, KM_USER0);
2320                 __block_commit_write(inode, new_page,
2321                                 zerofrom, PAGE_CACHE_SIZE);
2322                 unlock_page(new_page);
2323                 page_cache_release(new_page);
2324         }
2325
2326         if (page->index < pgpos) {
2327                 /* completely inside the area */
2328                 zerofrom = offset;
2329         } else {
2330                 /* page covers the boundary, find the boundary offset */
2331                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2332
2333                 /* if we will expand the thing last block will be filled */
2334                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2335                         *bytes |= (blocksize-1);
2336                         (*bytes)++;
2337                 }
2338
2339                 /* starting below the boundary? Nothing to zero out */
2340                 if (offset <= zerofrom)
2341                         zerofrom = offset;
2342         }
2343         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2344         if (status)
2345                 goto out1;
2346         if (zerofrom < offset) {
2347                 kaddr = kmap_atomic(page, KM_USER0);
2348                 memset(kaddr+zerofrom, 0, offset-zerofrom);
2349                 flush_dcache_page(page);
2350                 kunmap_atomic(kaddr, KM_USER0);
2351                 __block_commit_write(inode, page, zerofrom, offset);
2352         }
2353         return 0;
2354 out1:
2355         ClearPageUptodate(page);
2356         return status;
2357
2358 out_unmap:
2359         ClearPageUptodate(new_page);
2360         unlock_page(new_page);
2361         page_cache_release(new_page);
2362 out:
2363         return status;
2364 }
2365
2366 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2367                         get_block_t *get_block)
2368 {
2369         struct inode *inode = page->mapping->host;
2370         int err = __block_prepare_write(inode, page, from, to, get_block);
2371         if (err)
2372                 ClearPageUptodate(page);
2373         return err;
2374 }
2375
2376 int block_commit_write(struct page *page, unsigned from, unsigned to)
2377 {
2378         struct inode *inode = page->mapping->host;
2379         __block_commit_write(inode,page,from,to);
2380         return 0;
2381 }
2382
2383 int generic_commit_write(struct file *file, struct page *page,
2384                 unsigned from, unsigned to)
2385 {
2386         struct inode *inode = page->mapping->host;
2387         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2388         __block_commit_write(inode,page,from,to);
2389         /*
2390          * No need to use i_size_read() here, the i_size
2391          * cannot change under us because we hold i_sem.
2392          */
2393         if (pos > inode->i_size) {
2394                 i_size_write(inode, pos);
2395                 mark_inode_dirty(inode);
2396         }
2397         return 0;
2398 }
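
/*
 * Editor's illustrative sketch (not part of the original file): how a simple
 * filesystem wires the generic helpers in this file into its
 * address_space_operations.  The "examplefs_" names and the trivial 1:1
 * block mapping are hypothetical; a real filesystem's get_block() performs
 * actual block lookup and allocation.
 */
#if 0	/* example only, not compiled */
static int examplefs_get_block(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create)
{
        /* pretend file block N lives at disk block N */
        map_bh(bh_result, inode->i_sb, iblock);
        return 0;
}

static int examplefs_readpage(struct file *file, struct page *page)
{
        return block_read_full_page(page, examplefs_get_block);
}

static int examplefs_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, examplefs_get_block, wbc);
}

static int examplefs_prepare_write(struct file *file, struct page *page,
                        unsigned from, unsigned to)
{
        return block_prepare_write(page, from, to, examplefs_get_block);
}

static struct address_space_operations examplefs_aops = {
        .readpage       = examplefs_readpage,
        .writepage      = examplefs_writepage,
        .sync_page      = block_sync_page,
        .prepare_write  = examplefs_prepare_write,
        .commit_write   = generic_commit_write,
};
#endif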
2399
2400
2401 /*
2402  * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
2403  * immediately, while under the page lock.  So it needs a special end_io
2404  * handler which does not touch the bh after unlocking it.
2405  *
2406  * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
2407  * a race there is benign: unlock_buffer() only uses the bh's address for
2408  * hashing after unlocking the buffer, so it doesn't actually touch the bh
2409  * itself.
2410  */
2411 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2412 {
2413         if (uptodate) {
2414                 set_buffer_uptodate(bh);
2415         } else {
2416                 /* This happens, due to failed READA attempts. */
2417                 clear_buffer_uptodate(bh);
2418         }
2419         unlock_buffer(bh);
2420 }
2421
2422 /*
2423  * On entry, the page is fully not uptodate.
2424  * On exit the page is fully uptodate in the areas outside (from, to).
2425  */
2426 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2427                         get_block_t *get_block)
2428 {
2429         struct inode *inode = page->mapping->host;
2430         const unsigned blkbits = inode->i_blkbits;
2431         const unsigned blocksize = 1 << blkbits;
2432         struct buffer_head map_bh;
2433         struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2434         unsigned block_in_page;
2435         unsigned block_start;
2436         sector_t block_in_file;
2437         char *kaddr;
2438         int nr_reads = 0;
2439         int i;
2440         int ret = 0;
2441         int is_mapped_to_disk = 1;
2442         int dirtied_it = 0;
2443
2444         if (PageMappedToDisk(page))
2445                 return 0;
2446
2447         block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2448         map_bh.b_page = page;
2449
2450         /*
2451          * We loop across all blocks in the page, whether or not they are
2452          * part of the affected region.  This is so we can discover if the
2453          * page is fully mapped-to-disk.
2454          */
2455         for (block_start = 0, block_in_page = 0;
2456                   block_start < PAGE_CACHE_SIZE;
2457                   block_in_page++, block_start += blocksize) {
2458                 unsigned block_end = block_start + blocksize;
2459                 int create;
2460
2461                 map_bh.b_state = 0;
2462                 create = 1;
2463                 if (block_start >= to)
2464                         create = 0;
2465                 ret = get_block(inode, block_in_file + block_in_page,
2466                                         &map_bh, create);
2467                 if (ret)
2468                         goto failed;
2469                 if (!buffer_mapped(&map_bh))
2470                         is_mapped_to_disk = 0;
2471                 if (buffer_new(&map_bh))
2472                         unmap_underlying_metadata(map_bh.b_bdev,
2473                                                         map_bh.b_blocknr);
2474                 if (PageUptodate(page))
2475                         continue;
2476                 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2477                         kaddr = kmap_atomic(page, KM_USER0);
2478                         if (block_start < from) {
2479                                 memset(kaddr+block_start, 0, from-block_start);
2480                                 dirtied_it = 1;
2481                         }
2482                         if (block_end > to) {
2483                                 memset(kaddr + to, 0, block_end - to);
2484                                 dirtied_it = 1;
2485                         }
2486                         flush_dcache_page(page);
2487                         kunmap_atomic(kaddr, KM_USER0);
2488                         continue;
2489                 }
2490                 if (buffer_uptodate(&map_bh))
2491                         continue;       /* reiserfs does this */
2492                 if (block_start < from || block_end > to) {
2493                         struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2494
2495                         if (!bh) {
2496                                 ret = -ENOMEM;
2497                                 goto failed;
2498                         }
2499                         bh->b_state = map_bh.b_state;
2500                         atomic_set(&bh->b_count, 0);
2501                         bh->b_this_page = NULL;
2502                         bh->b_page = page;
2503                         bh->b_blocknr = map_bh.b_blocknr;
2504                         bh->b_size = blocksize;
2505                         bh->b_data = (char *)(long)block_start;
2506                         bh->b_bdev = map_bh.b_bdev;
2507                         bh->b_private = NULL;
2508                         read_bh[nr_reads++] = bh;
2509                 }
2510         }
2511
2512         if (nr_reads) {
2513                 struct buffer_head *bh;
2514
2515                 /*
2516                  * The page is locked, so these buffers are protected from
2517                  * any VM or truncate activity.  Hence we don't need to care
2518                  * for the buffer_head refcounts.
2519                  */
2520                 for (i = 0; i < nr_reads; i++) {
2521                         bh = read_bh[i];
2522                         lock_buffer(bh);
2523                         bh->b_end_io = end_buffer_read_nobh;
2524                         submit_bh(READ, bh);
2525                 }
2526                 for (i = 0; i < nr_reads; i++) {
2527                         bh = read_bh[i];
2528                         wait_on_buffer(bh);
2529                         if (!buffer_uptodate(bh))
2530                                 ret = -EIO;
2531                         free_buffer_head(bh);
2532                         read_bh[i] = NULL;
2533                 }
2534                 if (ret)
2535                         goto failed;
2536         }
2537
2538         if (is_mapped_to_disk)
2539                 SetPageMappedToDisk(page);
2540         SetPageUptodate(page);
2541
2542         /*
2543          * Setting the page dirty here isn't necessary for the prepare_write
2544          * function - commit_write will do that.  But if/when this function is
2545          * used within the pagefault handler to ensure that all mmapped pages
2546          * have backing space in the filesystem, we will need to dirty the page
2547          * if its contents were altered.
2548          */
2549         if (dirtied_it)
2550                 set_page_dirty(page);
2551
2552         return 0;
2553
2554 failed:
2555         for (i = 0; i < nr_reads; i++) {
2556                 if (read_bh[i])
2557                         free_buffer_head(read_bh[i]);
2558         }
2559
2560         /*
2561          * Error recovery is pretty slack.  Clear the page and mark it dirty
2562          * so we'll later zero out any blocks which _were_ allocated.
2563          */
2564         kaddr = kmap_atomic(page, KM_USER0);
2565         memset(kaddr, 0, PAGE_CACHE_SIZE);
2566         kunmap_atomic(kaddr, KM_USER0);
2567         SetPageUptodate(page);
2568         set_page_dirty(page);
2569         return ret;
2570 }
2571 EXPORT_SYMBOL(nobh_prepare_write);
2572
2573 int nobh_commit_write(struct file *file, struct page *page,
2574                 unsigned from, unsigned to)
2575 {
2576         struct inode *inode = page->mapping->host;
2577         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2578
2579         set_page_dirty(page);
2580         if (pos > inode->i_size) {
2581                 i_size_write(inode, pos);
2582                 mark_inode_dirty(inode);
2583         }
2584         return 0;
2585 }
2586 EXPORT_SYMBOL(nobh_commit_write);
2587
2588 /*
2589  * This function assumes that ->prepare_write() uses nobh_prepare_write().
2590  */
2591 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2592 {
2593         struct inode *inode = mapping->host;
2594         unsigned blocksize = 1 << inode->i_blkbits;
2595         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2596         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2597         unsigned to;
2598         struct page *page;
2599         struct address_space_operations *a_ops = mapping->a_ops;
2600         char *kaddr;
2601         int ret = 0;
2602
2603         if ((offset & (blocksize - 1)) == 0)
2604                 goto out;
2605
2606         ret = -ENOMEM;
2607         page = grab_cache_page(mapping, index);
2608         if (!page)
2609                 goto out;
2610
2611         to = (offset + blocksize) & ~(blocksize - 1);
2612         ret = a_ops->prepare_write(NULL, page, offset, to);
2613         if (ret == 0) {
2614                 kaddr = kmap_atomic(page, KM_USER0);
2615                 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2616                 flush_dcache_page(page);
2617                 kunmap_atomic(kaddr, KM_USER0);
2618                 set_page_dirty(page);
2619         }
2620         unlock_page(page);
2621         page_cache_release(page);
2622 out:
2623         return ret;
2624 }
2625 EXPORT_SYMBOL(nobh_truncate_page);
2626
2627 int block_truncate_page(struct address_space *mapping,
2628                         loff_t from, get_block_t *get_block)
2629 {
2630         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2631         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2632         unsigned blocksize;
2633         pgoff_t iblock;
2634         unsigned length, pos;
2635         struct inode *inode = mapping->host;
2636         struct page *page;
2637         struct buffer_head *bh;
2638         void *kaddr;
2639         int err;
2640
2641         blocksize = 1 << inode->i_blkbits;
2642         length = offset & (blocksize - 1);
2643
2644         /* Block boundary? Nothing to do */
2645         if (!length)
2646                 return 0;
2647
2648         length = blocksize - length;
2649         iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2650         
2651         page = grab_cache_page(mapping, index);
2652         err = -ENOMEM;
2653         if (!page)
2654                 goto out;
2655
2656         if (!page_has_buffers(page))
2657                 create_empty_buffers(page, blocksize, 0);
2658
2659         /* Find the buffer that contains "offset" */
2660         bh = page_buffers(page);
2661         pos = blocksize;
2662         while (offset >= pos) {
2663                 bh = bh->b_this_page;
2664                 iblock++;
2665                 pos += blocksize;
2666         }
2667
2668         err = 0;
2669         if (!buffer_mapped(bh)) {
2670                 err = get_block(inode, iblock, bh, 0);
2671                 if (err)
2672                         goto unlock;
2673                 /* unmapped? It's a hole - nothing to do */
2674                 if (!buffer_mapped(bh))
2675                         goto unlock;
2676         }
2677
2678         /* Ok, it's mapped. Make sure it's up-to-date */
2679         if (PageUptodate(page))
2680                 set_buffer_uptodate(bh);
2681
2682         if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2683                 err = -EIO;
2684                 ll_rw_block(READ, 1, &bh);
2685                 wait_on_buffer(bh);
2686                 /* Uhhuh. Read error. Complain and punt. */
2687                 if (!buffer_uptodate(bh))
2688                         goto unlock;
2689         }
2690
2691         kaddr = kmap_atomic(page, KM_USER0);
2692         memset(kaddr + offset, 0, length);
2693         flush_dcache_page(page);
2694         kunmap_atomic(kaddr, KM_USER0);
2695
2696         mark_buffer_dirty(bh);
2697         err = 0;
2698
2699 unlock:
2700         unlock_page(page);
2701         page_cache_release(page);
2702 out:
2703         return err;
2704 }
2705
2706 /*
2707  * The generic ->writepage function for buffer-backed address_spaces
2708  */
2709 int block_write_full_page(struct page *page, get_block_t *get_block,
2710                         struct writeback_control *wbc)
2711 {
2712         struct inode * const inode = page->mapping->host;
2713         loff_t i_size = i_size_read(inode);
2714         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2715         unsigned offset;
2716         void *kaddr;
2717
2718         /* Is the page fully inside i_size? */
2719         if (page->index < end_index)
2720                 return __block_write_full_page(inode, page, get_block, wbc);
2721
2722         /* Is the page fully outside i_size? (truncate in progress) */
2723         offset = i_size & (PAGE_CACHE_SIZE-1);
2724         if (page->index >= end_index+1 || !offset) {
2725                 /*
2726                  * The page may have dirty, unmapped buffers.  For example,
2727                  * they may have been added in ext3_writepage().  Make them
2728                  * freeable here, so the page does not leak.
2729                  */
2730                 block_invalidatepage(page, 0);
2731                 unlock_page(page);
2732                 return 0; /* don't care */
2733         }
2734
2735         /*
2736          * The page straddles i_size.  It must be zeroed out on each and every
2737  * writepage invocation because it may be mmapped.  "A file is mapped
2738  * in multiples of the page size.  For a file that is not a multiple of
2739  * the page size, the remaining memory is zeroed when mapped, and
2740          * writes to that region are not written out to the file."
2741          */
2742         kaddr = kmap_atomic(page, KM_USER0);
2743         memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2744         flush_dcache_page(page);
2745         kunmap_atomic(kaddr, KM_USER0);
2746         return __block_write_full_page(inode, page, get_block, wbc);
2747 }
2748
2749 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2750                             get_block_t *get_block)
2751 {
2752         struct buffer_head tmp;
2753         struct inode *inode = mapping->host;
2754         tmp.b_state = 0;
2755         tmp.b_blocknr = 0;
2756         get_block(inode, block, &tmp, 0);
2757         return tmp.b_blocknr;
2758 }
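
/*
 * Editor's illustrative sketch (not part of the original file): a
 * filesystem's ->bmap method is usually a one-line wrapper around
 * generic_block_bmap().  The examplefs_get_block name refers to the
 * hypothetical get_block from the earlier sketch.
 */
#if 0	/* example only, not compiled */
static sector_t examplefs_bmap(struct address_space *mapping, sector_t block)
{
        return generic_block_bmap(mapping, block, examplefs_get_block);
}
#endif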
2759
2760 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2761 {
2762         struct buffer_head *bh = bio->bi_private;
2763
2764         if (bio->bi_size)
2765                 return 1;
2766
2767         if (err == -EOPNOTSUPP) {
2768                 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2769                 set_bit(BH_Eopnotsupp, &bh->b_state);
2770         }
2771
2772         bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2773         bio_put(bio);
2774         return 0;
2775 }
2776
2777 int submit_bh(int rw, struct buffer_head * bh)
2778 {
2779         struct bio *bio;
2780         int ret = 0;
2781
2782         BUG_ON(!buffer_locked(bh));
2783         BUG_ON(!buffer_mapped(bh));
2784         BUG_ON(!bh->b_end_io);
2785
2786         if (buffer_ordered(bh) && (rw == WRITE))
2787                 rw = WRITE_BARRIER;
2788
2789         /*
2790          * Only clear out a write error when rewriting, should this
2791          * include WRITE_SYNC as well?
2792          */
2793         if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2794                 clear_buffer_write_io_error(bh);
2795
2796         /*
2797          * from here on down, it's all bio -- do the initial mapping,
2798          * submit_bio -> generic_make_request may further map this bio around
2799          */
2800         bio = bio_alloc(GFP_NOIO, 1);
2801
2802         bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2803         bio->bi_bdev = bh->b_bdev;
2804         bio->bi_io_vec[0].bv_page = bh->b_page;
2805         bio->bi_io_vec[0].bv_len = bh->b_size;
2806         bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2807
2808         bio->bi_vcnt = 1;
2809         bio->bi_idx = 0;
2810         bio->bi_size = bh->b_size;
2811
2812         bio->bi_end_io = end_bio_bh_io_sync;
2813         bio->bi_private = bh;
2814
2815         bio_get(bio);
2816         submit_bio(rw, bio);
2817
2818         if (bio_flagged(bio, BIO_EOPNOTSUPP))
2819                 ret = -EOPNOTSUPP;
2820
2821         bio_put(bio);
2822         return ret;
2823 }
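
/*
 * Editor's illustrative sketch (not part of the original file): the common
 * synchronous-read idiom built on submit_bh(); this mirrors what
 * __bread_slow() does.  The helper name is hypothetical.
 */
#if 0	/* example only, not compiled */
static int example_sync_read(struct buffer_head *bh)
{
        lock_buffer(bh);
        if (buffer_uptodate(bh)) {
                unlock_buffer(bh);      /* someone read it while we waited */
                return 0;
        }
        get_bh(bh);                     /* end_buffer_read_sync() drops this */
        bh->b_end_io = end_buffer_read_sync;
        submit_bh(READ, bh);
        wait_on_buffer(bh);
        return buffer_uptodate(bh) ? 0 : -EIO;
}
#endif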
2824
2825 /**
2826  * ll_rw_block: low-level access to block devices (DEPRECATED)
2827  * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2828  * @nr: number of &struct buffer_heads in the array
2829  * @bhs: array of pointers to &struct buffer_head
2830  *
2831  * ll_rw_block() takes an array of pointers to &struct buffer_heads,
2832  * and requests an I/O operation on them, either a %READ or a %WRITE.
2833  * The third %READA option is described in the documentation for
2834  * generic_make_request() which ll_rw_block() calls.
2835  *
2836  * This function drops any buffer that it cannot get a lock on (with the
2837  * BH_Lock state bit), any buffer that appears to be clean when doing a
2838  * write request, and any buffer that appears to be up-to-date when doing
2839  * a read request.  Further it marks as clean buffers that are processed for
2840  * writing (the buffer cache won't assume that they are actually clean until
2841  * the buffer gets unlocked).
2842  *
2843  * ll_rw_block sets b_end_io to a simple completion handler that marks
2844  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2845  * any waiters.
2846  *
2847  * All of the buffers must be for the same device, and must also be a
2848  * multiple of the current approved size for the device.
2849  */
2850 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2851 {
2852         int i;
2853
2854         for (i = 0; i < nr; i++) {
2855                 struct buffer_head *bh = bhs[i];
2856
2857                 if (test_set_buffer_locked(bh))
2858                         continue;
2859
2860                 get_bh(bh);
2861                 if (rw == WRITE) {
2862                         bh->b_end_io = end_buffer_write_sync;
2863                         if (test_clear_buffer_dirty(bh)) {
2864                                 submit_bh(WRITE, bh);
2865                                 continue;
2866                         }
2867                 } else {
2868                         bh->b_end_io = end_buffer_read_sync;
2869                         if (!buffer_uptodate(bh)) {
2870                                 submit_bh(rw, bh);
2871                                 continue;
2872                         }
2873                 }
2874                 unlock_buffer(bh);
2875                 put_bh(bh);
2876         }
2877 }
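
/*
 * Editor's illustrative sketch (not part of the original file): issuing a
 * batch of reads with ll_rw_block() and then waiting for completion.  The
 * helper name is hypothetical.
 */
#if 0	/* example only, not compiled */
static int example_read_batch(struct buffer_head *bhs[], int nr)
{
        int i;

        ll_rw_block(READ, nr, bhs);     /* skips locked/already-uptodate buffers */
        for (i = 0; i < nr; i++) {
                wait_on_buffer(bhs[i]);
                if (!buffer_uptodate(bhs[i]))
                        return -EIO;
        }
        return 0;
}
#endif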
2878
2879 /*
2880  * For a data-integrity writeout, we need to wait for any in-progress I/O to
2881  * complete, start new I/O, and then wait for that to complete too.  The caller
2882  * must have a ref on the buffer_head.
2883  */
2884 int sync_dirty_buffer(struct buffer_head *bh)
2885 {
2886         int ret = 0;
2887
2888         WARN_ON(atomic_read(&bh->b_count) < 1);
2889         lock_buffer(bh);
2890         if (test_clear_buffer_dirty(bh)) {
2891                 get_bh(bh);
2892                 bh->b_end_io = end_buffer_write_sync;
2893                 ret = submit_bh(WRITE, bh);
2894                 wait_on_buffer(bh);
2895                 if (buffer_eopnotsupp(bh)) {
2896                         clear_buffer_eopnotsupp(bh);
2897                         ret = -EOPNOTSUPP;
2898                 }
2899                 if (!ret && !buffer_uptodate(bh))
2900                         ret = -EIO;
2901         } else {
2902                 unlock_buffer(bh);
2903         }
2904         return ret;
2905 }
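
/*
 * Editor's illustrative sketch (not part of the original file): after
 * modifying an already-uptodate metadata buffer, a filesystem dirties it
 * and, when synchronous behaviour is required, pushes it out with
 * sync_dirty_buffer().  The helper name is hypothetical.
 */
#if 0	/* example only, not compiled */
static int example_update_block(struct buffer_head *bh, int sync)
{
        /* ... modify bh->b_data ... */
        mark_buffer_dirty(bh);
        if (sync)
                return sync_dirty_buffer(bh);   /* submits and waits */
        return 0;
}
#endif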
2906
2907 /*
2908  * try_to_free_buffers() checks if all the buffers on this particular page
2909  * are unused, and releases them if so.
2910  *
2911  * Exclusion against try_to_free_buffers may be obtained by either
2912  * locking the page or by holding its mapping's private_lock.
2913  *
2914  * If the page is dirty but all the buffers are clean then we need to
2915  * be sure to mark the page clean as well.  This is because the page
2916  * may be against a block device, and a later reattachment of buffers
2917  * to a dirty page will set *all* buffers dirty.  Which would corrupt
2918  * filesystem data on the same device.
2919  *
2920  * The same applies to regular filesystem pages: if all the buffers are
2921  * clean then we set the page clean and proceed.  To do that, we require
2922  * total exclusion from __set_page_dirty_buffers().  That is obtained with
2923  * private_lock.
2924  *
2925  * try_to_free_buffers() is non-blocking.
2926  */
2927 static inline int buffer_busy(struct buffer_head *bh)
2928 {
2929         return atomic_read(&bh->b_count) |
2930                 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2931 }
2932
2933 static int
2934 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2935 {
2936         struct buffer_head *head = page_buffers(page);
2937         struct buffer_head *bh;
2938
2939         bh = head;
2940         do {
2941                 if (buffer_write_io_error(bh))
2942                         set_bit(AS_EIO, &page->mapping->flags);
2943                 if (buffer_busy(bh))
2944                         goto failed;
2945                 bh = bh->b_this_page;
2946         } while (bh != head);
2947
2948         do {
2949                 struct buffer_head *next = bh->b_this_page;
2950
2951                 if (!list_empty(&bh->b_assoc_buffers))
2952                         __remove_assoc_queue(bh);
2953                 bh = next;
2954         } while (bh != head);
2955         *buffers_to_free = head;
2956         __clear_page_buffers(page);
2957         return 1;
2958 failed:
2959         return 0;
2960 }
2961
2962 int try_to_free_buffers(struct page *page)
2963 {
2964         struct address_space * const mapping = page->mapping;
2965         struct buffer_head *buffers_to_free = NULL;
2966         int ret = 0;
2967
2968         BUG_ON(!PageLocked(page));
2969         if (PageWriteback(page))
2970                 return 0;
2971
2972         if (mapping == NULL) {          /* can this still happen? */
2973                 ret = drop_buffers(page, &buffers_to_free);
2974                 goto out;
2975         }
2976
2977         spin_lock(&mapping->private_lock);
2978         ret = drop_buffers(page, &buffers_to_free);
2979         if (ret) {
2980                 /*
2981                  * If the filesystem writes its buffers by hand (eg ext3)
2982                  * then we can have clean buffers against a dirty page.  We
2983                  * clean the page here; otherwise later reattachment of buffers
2984                  * could encounter a non-uptodate page, which is unresolvable.
2985                  * This only applies in the rare case where try_to_free_buffers
2986                  * succeeds but the page is not freed.
2987                  */
2988                 clear_page_dirty(page);
2989         }
2990         spin_unlock(&mapping->private_lock);
2991 out:
2992         if (buffers_to_free) {
2993                 struct buffer_head *bh = buffers_to_free;
2994
2995                 do {
2996                         struct buffer_head *next = bh->b_this_page;
2997                         free_buffer_head(bh);
2998                         bh = next;
2999                 } while (bh != buffers_to_free);
3000         }
3001         return ret;
3002 }
3003 EXPORT_SYMBOL(try_to_free_buffers);
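
/*
 * Hedged example (editorial sketch, not part of the original file): a
 * minimal ->releasepage-style helper for a filesystem whose only per-page
 * private state is its buffers.  The page is expected to be locked by the
 * caller, which provides the exclusion described above.
 */
static int example_release_page(struct page *page)
{
	BUG_ON(!PageLocked(page));
	if (!page_has_buffers(page))
		return 1;		/* no buffers attached: trivially freeable */
	return try_to_free_buffers(page);
}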
3004
3005 int block_sync_page(struct page *page)
3006 {
3007         struct address_space *mapping;
3008
3009         smp_mb();
3010         mapping = page_mapping(page);
3011         if (mapping)
3012                 blk_run_backing_dev(mapping->backing_dev_info, page);
3013         return 0;
3014 }
3015
3016 /*
3017  * There are no bdflush tunables left.  But distributions are
3018  * still running obsolete flush daemons, so we terminate them here.
3019  *
3020  * Use of bdflush() is deprecated and will be removed in a future kernel.
3021  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3022  */
3023 asmlinkage long sys_bdflush(int func, long data)
3024 {
3025         static int msg_count;
3026
3027         if (!capable(CAP_SYS_ADMIN))
3028                 return -EPERM;
3029
3030         if (msg_count < 5) {
3031                 msg_count++;
3032                 printk(KERN_INFO
3033                         "warning: process `%s' used the obsolete bdflush"
3034                         " system call\n", current->comm);
3035                 printk(KERN_INFO "Fix your initscripts?\n");
3036         }
3037
3038         if (func == 1)
3039                 do_exit(0);
3040         return 0;
3041 }
3042
3043 /*
3044  * Buffer-head allocation
3045  */
3046 static kmem_cache_t *bh_cachep;
3047
3048 /*
3049  * Once the number of bh's in the machine exceeds this level, we start
3050  * stripping them in writeback.
3051  */
3052 static int max_buffer_heads;
3053
3054 int buffer_heads_over_limit;
3055
3056 struct bh_accounting {
3057         int nr;                 /* Number of live bh's */
3058         int ratelimit;          /* Limit cacheline bouncing */
3059 };
3060
3061 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3062
3063 static void recalc_bh_state(void)
3064 {
3065         int i;
3066         int tot = 0;
3067
3068         if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3069                 return;
3070         __get_cpu_var(bh_accounting).ratelimit = 0;
3071         for_each_cpu(i)
3072                 tot += per_cpu(bh_accounting, i).nr;
3073         buffer_heads_over_limit = (tot > max_buffer_heads);
3074 }
3075
3076 struct buffer_head *alloc_buffer_head(int gfp_flags)
3077 {
3078         struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3079         if (ret) {
3080                 preempt_disable();
3081                 __get_cpu_var(bh_accounting).nr++;
3082                 recalc_bh_state();
3083                 preempt_enable();
3084         }
3085         return ret;
3086 }
3087 EXPORT_SYMBOL(alloc_buffer_head);
3088
3089 void free_buffer_head(struct buffer_head *bh)
3090 {
3091         BUG_ON(!list_empty(&bh->b_assoc_buffers));
3092         kmem_cache_free(bh_cachep, bh);
3093         preempt_disable();
3094         __get_cpu_var(bh_accounting).nr--;
3095         recalc_bh_state();
3096         preempt_enable();
3097 }
3098 EXPORT_SYMBOL(free_buffer_head);
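
/*
 * Hedged example (editorial sketch, not part of the original file): the
 * minimal allocate/free pairing for a standalone buffer_head.  Real callers
 * such as alloc_page_buffers() also set b_size/b_data and attach the bh to a
 * page; the point here is only that every alloc_buffer_head() must be
 * balanced by a free_buffer_head() so the per-CPU accounting stays correct.
 */
static int example_bh_alloc_free(void)
{
	struct buffer_head *bh = alloc_buffer_head(GFP_KERNEL);

	if (!bh)
		return -ENOMEM;
	/* ... initialise and use the buffer_head ... */
	free_buffer_head(bh);		/* b_assoc_buffers is still empty */
	return 0;
}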
3099
3100 static void
3101 init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
3102 {
3103         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
3104                             SLAB_CTOR_CONSTRUCTOR) {
3105                 struct buffer_head * bh = (struct buffer_head *)data;
3106
3107                 memset(bh, 0, sizeof(*bh));
3108                 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3109         }
3110 }
3111
3112 #ifdef CONFIG_HOTPLUG_CPU
3113 static void buffer_exit_cpu(int cpu)
3114 {
3115         int i;
3116         struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3117
3118         for (i = 0; i < BH_LRU_SIZE; i++) {
3119                 brelse(b->bhs[i]);
3120                 b->bhs[i] = NULL;
3121         }
3122 }
3123
3124 static int buffer_cpu_notify(struct notifier_block *self,
3125                               unsigned long action, void *hcpu)
3126 {
3127         if (action == CPU_DEAD)
3128                 buffer_exit_cpu((unsigned long)hcpu);
3129         return NOTIFY_OK;
3130 }
3131 #endif /* CONFIG_HOTPLUG_CPU */
3132
3133 void __init buffer_init(void)
3134 {
3135         int i;
3136         int nrpages;
3137
3138         bh_cachep = kmem_cache_create("buffer_head",
3139                         sizeof(struct buffer_head), 0,
3140                         SLAB_PANIC, init_buffer_head, NULL);
3141         for (i = 0; i < ARRAY_SIZE(bh_wait_queue_heads); i++)
3142                 init_waitqueue_head(&bh_wait_queue_heads[i].wqh);
3143
3144         /*
3145          * Limit the bh occupancy to 10% of ZONE_NORMAL
3146          */
3147         nrpages = (nr_free_buffer_pages() * 10) / 100;
3148         max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3149         hotcpu_notifier(buffer_cpu_notify, 0);
3150 }
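
/*
 * Hedged worked example (editorial, with assumed numbers): if
 * nr_free_buffer_pages() reports 100,000 ZONE_NORMAL pages, PAGE_SIZE is
 * 4096 and sizeof(struct buffer_head) is roughly 64 bytes, the limit
 * computed in buffer_init() becomes
 *
 *	nrpages          = 100,000 * 10 / 100   = 10,000 pages
 *	max_buffer_heads = 10,000 * (4096 / 64) = 640,000 buffer_heads
 *
 * so buffer_heads_over_limit trips once live buffer_heads would occupy
 * more than about 10% of low memory.
 */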
3151
3152 EXPORT_SYMBOL(__bforget);
3153 EXPORT_SYMBOL(__brelse);
3154 EXPORT_SYMBOL(__wait_on_buffer);
3155 EXPORT_SYMBOL(block_commit_write);
3156 EXPORT_SYMBOL(block_prepare_write);
3157 EXPORT_SYMBOL(block_read_full_page);
3158 EXPORT_SYMBOL(block_sync_page);
3159 EXPORT_SYMBOL(block_truncate_page);
3160 EXPORT_SYMBOL(block_write_full_page);
3161 EXPORT_SYMBOL(cont_prepare_write);
3162 EXPORT_SYMBOL(end_buffer_async_write);
3163 EXPORT_SYMBOL(end_buffer_read_sync);
3164 EXPORT_SYMBOL(end_buffer_write_sync);
3165 EXPORT_SYMBOL(file_fsync);
3166 EXPORT_SYMBOL(fsync_bdev);
3167 EXPORT_SYMBOL(generic_block_bmap);
3168 EXPORT_SYMBOL(generic_commit_write);
3169 EXPORT_SYMBOL(generic_cont_expand);
3170 EXPORT_SYMBOL(init_buffer);
3171 EXPORT_SYMBOL(invalidate_bdev);
3172 EXPORT_SYMBOL(ll_rw_block);
3173 EXPORT_SYMBOL(mark_buffer_dirty);
3174 EXPORT_SYMBOL(submit_bh);
3175 EXPORT_SYMBOL(sync_dirty_buffer);
3176 EXPORT_SYMBOL(unlock_buffer);