1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5  */
6
7 /*
8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9  *
10  * Removed a lot of unnecessary code and simplified things now that
11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12  *
13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15  *
16  * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17  *
18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19  */
20
21 #include <linux/config.h>
22 #include <linux/kernel.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/smp_lock.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/bio.h>
38 #include <linux/notifier.h>
39 #include <linux/cpu.h>
40 #include <asm/bitops.h>
41
42 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
43 static void invalidate_bh_lrus(void);
44
45 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
46
47 struct bh_wait_queue {
48         struct buffer_head *bh;
49         wait_queue_t wait;
50 };
51
52 #define __DEFINE_BH_WAIT(name, b, f)                                    \
53         struct bh_wait_queue name = {                                   \
54                 .bh     = b,                                            \
55                 .wait   = {                                             \
56                                 .task   = current,                      \
57                                 .flags  = f,                            \
58                                 .func   = bh_wake_function,             \
59                                 .task_list =                            \
60                                         LIST_HEAD_INIT(name.wait.task_list),\
61                         },                                              \
62         }
63 #define DEFINE_BH_WAIT(name, bh)        __DEFINE_BH_WAIT(name, bh, 0)
64 #define DEFINE_BH_WAIT_EXCLUSIVE(name, bh) \
65                 __DEFINE_BH_WAIT(name, bh, WQ_FLAG_EXCLUSIVE)
66
67 /*
68  * Hashed waitqueue_heads for wait_on_buffer()
69  */
70 #define BH_WAIT_TABLE_ORDER     7
71 static struct bh_wait_queue_head {
72         wait_queue_head_t wqh;
73 } ____cacheline_aligned_in_smp bh_wait_queue_heads[1<<BH_WAIT_TABLE_ORDER];
74
75 inline void
76 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
77 {
78         bh->b_end_io = handler;
79         bh->b_private = private;
80 }
81
82 /*
83  * Return the address of the waitqueue_head to be used for this
84  * buffer_head
85  */
86 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh)
87 {
88         return &bh_wait_queue_heads[hash_ptr(bh, BH_WAIT_TABLE_ORDER)].wqh;
89 }
90 EXPORT_SYMBOL(bh_waitq_head);
91
92 void wake_up_buffer(struct buffer_head *bh)
93 {
94         wait_queue_head_t *wq = bh_waitq_head(bh);
95
96         smp_mb();
97         if (waitqueue_active(wq))
98                 __wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 1, bh);
99 }
100 EXPORT_SYMBOL(wake_up_buffer);
101
102 static int bh_wake_function(wait_queue_t *wait, unsigned mode,
103                                 int sync, void *key)
104 {
105         struct buffer_head *bh = key;
106         struct bh_wait_queue *wq;
107
108         wq = container_of(wait, struct bh_wait_queue, wait);
109         if (wq->bh != bh || buffer_locked(bh))
110                 return 0;
111         else
112                 return autoremove_wake_function(wait, mode, sync, key);
113 }
114
115 static void sync_buffer(struct buffer_head *bh)
116 {
117         struct block_device *bd;
118
119         smp_mb();
120         bd = bh->b_bdev;
121         if (bd)
122                 blk_run_address_space(bd->bd_inode->i_mapping);
123 }
124
125 void fastcall __lock_buffer(struct buffer_head *bh)
126 {
127         wait_queue_head_t *wqh = bh_waitq_head(bh);
128         DEFINE_BH_WAIT_EXCLUSIVE(wait, bh);
129
130         do {
131                 prepare_to_wait_exclusive(wqh, &wait.wait,
132                                         TASK_UNINTERRUPTIBLE);
133                 if (buffer_locked(bh)) {
134                         sync_buffer(bh);
135                         io_schedule();
136                 }
137         } while (test_set_buffer_locked(bh));
138         finish_wait(wqh, &wait.wait);
139 }
140 EXPORT_SYMBOL(__lock_buffer);
141
142 void fastcall unlock_buffer(struct buffer_head *bh)
143 {
144         clear_buffer_locked(bh);
145         smp_mb__after_clear_bit();
146         wake_up_buffer(bh);
147 }
148
149 /*
150  * Block until a buffer comes unlocked.  This doesn't stop it
151  * from becoming locked again - you have to lock it yourself
152  * if you want to preserve its state.
153  */
154 void __wait_on_buffer(struct buffer_head * bh)
155 {
156         wait_queue_head_t *wqh = bh_waitq_head(bh);
157         DEFINE_BH_WAIT(wait, bh);
158
159         do {
160                 prepare_to_wait(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
161                 if (buffer_locked(bh)) {
162                         sync_buffer(bh);
163                         io_schedule();
164                 }
165         } while (buffer_locked(bh));
166         finish_wait(wqh, &wait.wait);
167 }
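
/*
 * Illustrative sketch (not part of the original file): as the comment above
 * notes, __wait_on_buffer() does not keep the buffer unlocked afterwards.
 * A caller that needs the buffer to stay in a known state must take the
 * lock itself; the helper name below is hypothetical.
 */
static void example_stabilize_buffer(struct buffer_head *bh)
{
	lock_buffer(bh);	/* waits if needed, then leaves bh locked for us */
	/* ... inspect or modify the buffer while nobody else can lock it ... */
	unlock_buffer(bh);
}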
168
169 static void
170 __set_page_buffers(struct page *page, struct buffer_head *head)
171 {
172         page_cache_get(page);
173         SetPagePrivate(page);
174         page->private = (unsigned long)head;
175 }
176
177 static void
178 __clear_page_buffers(struct page *page)
179 {
180         ClearPagePrivate(page);
181         page->private = 0;
182         page_cache_release(page);
183 }
184
185 static void buffer_io_error(struct buffer_head *bh)
186 {
187         char b[BDEVNAME_SIZE];
188
189         printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
190                         bdevname(bh->b_bdev, b),
191                         (unsigned long long)bh->b_blocknr);
192 }
193
194 /*
195  * Default synchronous end-of-IO handler.  Just mark it up-to-date and
196  * unlock the buffer. This is what ll_rw_block uses too.
197  */
198 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
199 {
200         if (uptodate) {
201                 set_buffer_uptodate(bh);
202         } else {
203                 /* This happens due to failed READA attempts. */
204                 clear_buffer_uptodate(bh);
205         }
206         unlock_buffer(bh);
207         put_bh(bh);
208 }
209
210 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
211 {
212         char b[BDEVNAME_SIZE];
213
214         if (uptodate) {
215                 set_buffer_uptodate(bh);
216         } else {
217                 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
218                         buffer_io_error(bh);
219                         printk(KERN_WARNING "lost page write due to "
220                                         "I/O error on %s\n",
221                                        bdevname(bh->b_bdev, b));
222                 }
223                 set_buffer_write_io_error(bh);
224                 clear_buffer_uptodate(bh);
225         }
226         unlock_buffer(bh);
227         put_bh(bh);
228 }
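
/*
 * Illustrative sketch (not part of the original file): how a caller drives a
 * single synchronous read using end_buffer_read_sync() as the completion
 * handler.  This mirrors __bread_slow() further down; the function name is
 * hypothetical.
 */
static struct buffer_head *example_read_bh_sync(struct buffer_head *bh)
{
	lock_buffer(bh);
	get_bh(bh);				/* dropped by end_buffer_read_sync() */
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ, bh);
	wait_on_buffer(bh);			/* sleeps until unlock_buffer() in the handler */
	if (buffer_uptodate(bh))
		return bh;
	brelse(bh);
	return NULL;
}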
229
230 /*
231  * Write out and wait upon all the dirty data associated with a block
232  * device via its mapping.  Does not take the superblock lock.
233  */
234 int sync_blockdev(struct block_device *bdev)
235 {
236         int ret = 0;
237
238         if (bdev) {
239                 int err;
240
241                 ret = filemap_fdatawrite(bdev->bd_inode->i_mapping);
242                 err = filemap_fdatawait(bdev->bd_inode->i_mapping);
243                 if (!ret)
244                         ret = err;
245         }
246         return ret;
247 }
248 EXPORT_SYMBOL(sync_blockdev);
249
250 /*
251  * Write out and wait upon all dirty data associated with this
252  * superblock.  Filesystem data as well as the underlying block
253  * device.  Takes the superblock lock.
254  */
255 int fsync_super(struct super_block *sb)
256 {
257         sync_inodes_sb(sb, 0);
258         DQUOT_SYNC(sb);
259         lock_super(sb);
260         if (sb->s_dirt && sb->s_op->write_super)
261                 sb->s_op->write_super(sb);
262         unlock_super(sb);
263         if (sb->s_op->sync_fs)
264                 sb->s_op->sync_fs(sb, 1);
265         sync_blockdev(sb->s_bdev);
266         sync_inodes_sb(sb, 1);
267
268         return sync_blockdev(sb->s_bdev);
269 }
270
271 /*
272  * Write out and wait upon all dirty data associated with this
273  * device.   Filesystem data as well as the underlying block
274  * device.  Takes the superblock lock.
275  */
276 int fsync_bdev(struct block_device *bdev)
277 {
278         struct super_block *sb = get_super(bdev);
279         if (sb) {
280                 int res = fsync_super(sb);
281                 drop_super(sb);
282                 return res;
283         }
284         return sync_blockdev(bdev);
285 }
286
287 /**
288  * freeze_bdev  --  lock a filesystem and force it into a consistent state
289  * @bdev:       blockdevice to lock
290  *
291  * This takes the block device bd_mount_sem to make sure no new mounts
292  * happen on bdev until thaw_bdev() is called.
293  * If a superblock is found on this device, we take the s_umount semaphore
294  * on it to make sure nobody unmounts until the snapshot creation is done.
295  */
296 struct super_block *freeze_bdev(struct block_device *bdev)
297 {
298         struct super_block *sb;
299
300         down(&bdev->bd_mount_sem);
301         sb = get_super(bdev);
302         if (sb && !(sb->s_flags & MS_RDONLY)) {
303                 sb->s_frozen = SB_FREEZE_WRITE;
304                 wmb();
305
306                 sync_inodes_sb(sb, 0);
307                 DQUOT_SYNC(sb);
308
309                 lock_super(sb);
310                 if (sb->s_dirt && sb->s_op->write_super)
311                         sb->s_op->write_super(sb);
312                 unlock_super(sb);
313
314                 if (sb->s_op->sync_fs)
315                         sb->s_op->sync_fs(sb, 1);
316
317                 sync_blockdev(sb->s_bdev);
318                 sync_inodes_sb(sb, 1);
319
320                 sb->s_frozen = SB_FREEZE_TRANS;
321                 wmb();
322
323                 sync_blockdev(sb->s_bdev);
324
325                 if (sb->s_op->write_super_lockfs)
326                         sb->s_op->write_super_lockfs(sb);
327         }
328
329         sync_blockdev(bdev);
330         return sb;      /* thaw_bdev releases s->s_umount and bd_mount_sem */
331 }
332 EXPORT_SYMBOL(freeze_bdev);
333
334 /**
335  * thaw_bdev  -- unlock filesystem
336  * @bdev:       blockdevice to unlock
337  * @sb:         associated superblock
338  *
339  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
340  */
341 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
342 {
343         if (sb) {
344                 BUG_ON(sb->s_bdev != bdev);
345
346                 if (sb->s_op->unlockfs)
347                         sb->s_op->unlockfs(sb);
348                 sb->s_frozen = SB_UNFROZEN;
349                 wmb();
350                 wake_up(&sb->s_wait_unfrozen);
351                 drop_super(sb);
352         }
353
354         up(&bdev->bd_mount_sem);
355 }
356 EXPORT_SYMBOL(thaw_bdev);
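
/*
 * Illustrative sketch (not part of the original file): the intended pairing
 * of freeze_bdev()/thaw_bdev(), as used by snapshot code such as the device
 * mapper.  The snapshot step itself is a placeholder.
 */
static void example_snapshot_bdev(struct block_device *bdev)
{
	struct super_block *sb;

	sb = freeze_bdev(bdev);		/* block new writers, flush dirty data */
	/* ... take the block-level snapshot here ... */
	thaw_bdev(bdev, sb);		/* sb may be NULL if nothing was mounted */
}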
357
358 /*
359  * sync everything.  Start out by waking pdflush, because that writes back
360  * all queues in parallel.
361  */
362 static void do_sync(unsigned long wait)
363 {
364         wakeup_bdflush(0);
365         sync_inodes(0);         /* All mappings, inodes and their blockdevs */
366         DQUOT_SYNC(NULL);
367         sync_supers();          /* Write the superblocks */
368         sync_filesystems(0);    /* Start syncing the filesystems */
369         sync_filesystems(wait); /* Waitingly sync the filesystems */
370         sync_inodes(wait);      /* Mappings, inodes and blockdevs, again. */
371         if (!wait)
372                 printk("Emergency Sync complete\n");
373         if (unlikely(laptop_mode))
374                 laptop_sync_completion();
375 }
376
377 asmlinkage long sys_sync(void)
378 {
379         do_sync(1);
380         return 0;
381 }
382
383 void emergency_sync(void)
384 {
385         pdflush_operation(do_sync, 0);
386 }
387
388 /*
389  * Generic function to fsync a file.
390  *
391  * filp may be NULL if called via the msync of a vma.
392  */
393  
394 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
395 {
396         struct inode * inode = dentry->d_inode;
397         struct super_block * sb;
398         int ret;
399
400         /* sync the inode to buffers */
401         write_inode_now(inode, 0);
402
403         /* sync the superblock to buffers */
404         sb = inode->i_sb;
405         lock_super(sb);
406         if (sb->s_op->write_super)
407                 sb->s_op->write_super(sb);
408         unlock_super(sb);
409
410         /* .. finally sync the buffers to disk */
411         ret = sync_blockdev(sb->s_bdev);
412         return ret;
413 }
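
/*
 * Illustrative sketch (hypothetical filesystem code, not part of buffer.c):
 * simple filesystems can point their file_operations directly at
 * file_fsync() to get the inode/superblock/blockdev flush above.
 */
static struct file_operations example_file_ops = {
	.fsync	= file_fsync,
};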
414
415 asmlinkage long sys_fsync(unsigned int fd)
416 {
417         struct file * file;
418         struct address_space *mapping;
419         int ret, err;
420
421         ret = -EBADF;
422         file = fget(fd);
423         if (!file)
424                 goto out;
425
426         mapping = file->f_mapping;
427
428         ret = -EINVAL;
429         if (!file->f_op || !file->f_op->fsync) {
430                 /* Why?  We can still call filemap_fdatawrite */
431                 goto out_putf;
432         }
433
434         /* We need to protect against concurrent writers.. */
435         down(&mapping->host->i_sem);
436         current->flags |= PF_SYNCWRITE;
437         ret = filemap_fdatawrite(mapping);
438         err = file->f_op->fsync(file, file->f_dentry, 0);
439         if (!ret)
440                 ret = err;
441         err = filemap_fdatawait(mapping);
442         if (!ret)
443                 ret = err;
444         current->flags &= ~PF_SYNCWRITE;
445         up(&mapping->host->i_sem);
446
447 out_putf:
448         fput(file);
449 out:
450         return ret;
451 }
452
453 asmlinkage long sys_fdatasync(unsigned int fd)
454 {
455         struct file * file;
456         struct address_space *mapping;
457         int ret, err;
458
459         ret = -EBADF;
460         file = fget(fd);
461         if (!file)
462                 goto out;
463
464         ret = -EINVAL;
465         if (!file->f_op || !file->f_op->fsync)
466                 goto out_putf;
467
468         mapping = file->f_mapping;
469
470         down(&mapping->host->i_sem);
471         current->flags |= PF_SYNCWRITE;
472         ret = filemap_fdatawrite(mapping);
473         err = file->f_op->fsync(file, file->f_dentry, 1);
474         if (!ret)
475                 ret = err;
476         err = filemap_fdatawait(mapping);
477         if (!ret)
478                 ret = err;
479         current->flags &= ~PF_SYNCWRITE;
480         up(&mapping->host->i_sem);
481
482 out_putf:
483         fput(file);
484 out:
485         return ret;
486 }
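
/*
 * Illustrative sketch (not part of the original file): sys_fsync() and
 * sys_fdatasync() above differ only in the datasync flag passed to
 * ->fsync().  A hypothetical shared helper would look like this.
 */
static int example_do_fsync(struct file *file, int datasync)
{
	struct address_space *mapping = file->f_mapping;
	int ret, err;

	if (!file->f_op || !file->f_op->fsync)
		return -EINVAL;

	down(&mapping->host->i_sem);		/* protect against concurrent writers */
	current->flags |= PF_SYNCWRITE;
	ret = filemap_fdatawrite(mapping);	/* start writeback of dirty pages */
	err = file->f_op->fsync(file, file->f_dentry, datasync);
	if (!ret)
		ret = err;
	err = filemap_fdatawait(mapping);	/* wait for the writes to complete */
	if (!ret)
		ret = err;
	current->flags &= ~PF_SYNCWRITE;
	up(&mapping->host->i_sem);
	return ret;
}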
487
488 /*
489  * Various filesystems appear to want __find_get_block to be non-blocking.
490  * But it's the page lock which protects the buffers.  To get around this,
491  * we get exclusion from try_to_free_buffers with the blockdev mapping's
492  * private_lock.
493  *
494  * Hack idea: for the blockdev mapping, private_lock contention
495  * may be quite high.  This code could TryLock the page, and if that
496  * succeeds, there is no need to take private_lock. (But if
497  * private_lock is contended then so is mapping->tree_lock).
498  */
499 static struct buffer_head *
500 __find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
501 {
502         struct inode *bd_inode = bdev->bd_inode;
503         struct address_space *bd_mapping = bd_inode->i_mapping;
504         struct buffer_head *ret = NULL;
505         pgoff_t index;
506         struct buffer_head *bh;
507         struct buffer_head *head;
508         struct page *page;
509
510         index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
511         page = find_get_page(bd_mapping, index);
512         if (!page)
513                 goto out;
514
515         spin_lock(&bd_mapping->private_lock);
516         if (!page_has_buffers(page))
517                 goto out_unlock;
518         head = page_buffers(page);
519         bh = head;
520         do {
521                 if (bh->b_blocknr == block) {
522                         ret = bh;
523                         get_bh(bh);
524                         goto out_unlock;
525                 }
526                 bh = bh->b_this_page;
527         } while (bh != head);
528
529         printk("__find_get_block_slow() failed. "
530                 "block=%llu, b_blocknr=%llu\n",
531                 (unsigned long long)block, (unsigned long long)bh->b_blocknr);
532         printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
533         printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
534 out_unlock:
535         spin_unlock(&bd_mapping->private_lock);
536         page_cache_release(page);
537 out:
538         return ret;
539 }
540
541 /* If invalidate_buffers() will trash dirty buffers, it means some kind
542    of fs corruption is going on.  Trashing dirty data always implies
543    losing information that the user expected to reach the physical
544    layer.
545
546    Thus invalidate_buffers in general usage is not allowed to trash
547    dirty buffers. For example, ioctl(BLKFLSBUF) expects dirty data to
548    be preserved.  These buffers are simply skipped.
549   
550    We also skip buffers which are still in use.  For example this can
551    happen if a userspace program is reading the block device.
552
553    NOTE: in the case where the user removes removable media while there
554    is still dirty data not synced to disk (due to a bug in the device
555    driver or to a user error), not destroying the dirty buffers could
556    also corrupt the next media inserted.  Thus a parameter is necessary
557    to handle this case in the safest way possible (trying not to corrupt
558    the newly inserted disk with data belonging to the old, now corrupted
559    disk).  Also, for the ramdisk, the natural way to release the ramdisk
560    memory is to destroy its dirty buffers.
561
562    These are two special cases. Normal usage requires the device driver
563    to issue a sync on the device (without waiting for I/O completion) and
564    then an invalidate_buffers call that doesn't trash dirty buffers.
565
566    For handling cache coherency with the blkdev pagecache, the 'update'
567    case has been introduced. It is needed to re-read any pinned buffer
568    from disk. NOTE: re-reading from disk is destructive, so we can do it only
569    when we assume nobody is changing the buffercache under our I/O and when
570    we think the disk contains more recent information than the buffercache.
571    The update == 1 pass marks the buffers we need to update, the update == 2
572    pass does the actual I/O. */
573 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
574 {
575         invalidate_bh_lrus();
576         /*
577          * FIXME: what about destroy_dirty_buffers?
578          * We really want to use invalidate_inode_pages2() for
579          * that, but not until that's cleaned up.
580          */
581         invalidate_inode_pages(bdev->bd_inode->i_mapping);
582 }
583
584 /*
585  * Kick pdflush then try to free up some ZONE_NORMAL memory.
586  */
587 static void free_more_memory(void)
588 {
589         struct zone **zones;
590         pg_data_t *pgdat;
591
592         wakeup_bdflush(1024);
593         yield();
594
595         for_each_pgdat(pgdat) {
596                 zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones;
597                 if (*zones)
598                         try_to_free_pages(zones, GFP_NOFS, 0);
599         }
600 }
601
602 /*
603  * I/O completion handler for block_read_full_page() - pages
604  * which come unlocked at the end of I/O.
605  */
606 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
607 {
608         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
609         unsigned long flags;
610         struct buffer_head *tmp;
611         struct page *page;
612         int page_uptodate = 1;
613
614         BUG_ON(!buffer_async_read(bh));
615
616         page = bh->b_page;
617         if (uptodate) {
618                 set_buffer_uptodate(bh);
619         } else {
620                 clear_buffer_uptodate(bh);
621                 buffer_io_error(bh);
622                 SetPageError(page);
623         }
624
625         /*
626          * Be _very_ careful from here on. Bad things can happen if
627          * two buffer heads end IO at almost the same time and both
628          * decide that the page is now completely done.
629          */
630         spin_lock_irqsave(&page_uptodate_lock, flags);
631         clear_buffer_async_read(bh);
632         unlock_buffer(bh);
633         tmp = bh;
634         do {
635                 if (!buffer_uptodate(tmp))
636                         page_uptodate = 0;
637                 if (buffer_async_read(tmp)) {
638                         BUG_ON(!buffer_locked(tmp));
639                         goto still_busy;
640                 }
641                 tmp = tmp->b_this_page;
642         } while (tmp != bh);
643         spin_unlock_irqrestore(&page_uptodate_lock, flags);
644
645         /*
646          * If none of the buffers had errors and they are all
647          * uptodate then we can set the page uptodate.
648          */
649         if (page_uptodate && !PageError(page))
650                 SetPageUptodate(page);
651         unlock_page(page);
652         return;
653
654 still_busy:
655         spin_unlock_irqrestore(&page_uptodate_lock, flags);
656         return;
657 }
658
659 /*
660  * Completion handler for block_write_full_page() - pages which are unlocked
661  * during I/O, and which have PageWriteback cleared upon I/O completion.
662  */
663 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
664 {
665         char b[BDEVNAME_SIZE];
666         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
667         unsigned long flags;
668         struct buffer_head *tmp;
669         struct page *page;
670
671         BUG_ON(!buffer_async_write(bh));
672
673         page = bh->b_page;
674         if (uptodate) {
675                 set_buffer_uptodate(bh);
676         } else {
677                 if (printk_ratelimit()) {
678                         buffer_io_error(bh);
679                         printk(KERN_WARNING "lost page write due to "
680                                         "I/O error on %s\n",
681                                bdevname(bh->b_bdev, b));
682                 }
683                 set_bit(AS_EIO, &page->mapping->flags);
684                 clear_buffer_uptodate(bh);
685                 SetPageError(page);
686         }
687
688         spin_lock_irqsave(&page_uptodate_lock, flags);
689         clear_buffer_async_write(bh);
690         unlock_buffer(bh);
691         tmp = bh->b_this_page;
692         while (tmp != bh) {
693                 if (buffer_async_write(tmp)) {
694                         BUG_ON(!buffer_locked(tmp));
695                         goto still_busy;
696                 }
697                 tmp = tmp->b_this_page;
698         }
699         spin_unlock_irqrestore(&page_uptodate_lock, flags);
700         end_page_writeback(page);
701         return;
702
703 still_busy:
704         spin_unlock_irqrestore(&page_uptodate_lock, flags);
705         return;
706 }
707
708 /*
709  * If a page's buffers are under async readin (end_buffer_async_read
710  * completion) then there is a possibility that another thread of
711  * control could lock one of the buffers after it has completed
712  * but while some of the other buffers have not completed.  This
713  * locked buffer would confuse end_buffer_async_read() into not unlocking
714  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
715  * that this buffer is not under async I/O.
716  *
717  * The page comes unlocked when it has no locked buffer_async buffers
718  * left.
719  *
720  * PageLocked prevents anyone starting new async I/O reads any of
721  * the buffers.
722  *
723  * PageWriteback is used to prevent simultaneous writeout of the same
724  * page.
725  *
726  * PageLocked prevents anyone from starting writeback of a page which is
727  * under read I/O (PageWriteback is only ever set against a locked page).
728  */
729 static void mark_buffer_async_read(struct buffer_head *bh)
730 {
731         bh->b_end_io = end_buffer_async_read;
732         set_buffer_async_read(bh);
733 }
734
735 void mark_buffer_async_write(struct buffer_head *bh)
736 {
737         bh->b_end_io = end_buffer_async_write;
738         set_buffer_async_write(bh);
739 }
740 EXPORT_SYMBOL(mark_buffer_async_write);
741
742
743 /*
744  * fs/buffer.c contains helper functions for buffer-backed address space's
745  * fsync functions.  A common requirement for buffer-based filesystems is
746  * that certain data from the backing blockdev needs to be written out for
747  * a successful fsync().  For example, ext2 indirect blocks need to be
748  * written back and waited upon before fsync() returns.
749  *
750  * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
751  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
752  * management of a list of dependent buffers at ->i_mapping->private_list.
753  *
754  * Locking is a little subtle: try_to_free_buffers() will remove buffers
755  * from their controlling inode's queue when they are being freed.  But
756  * try_to_free_buffers() will be operating against the *blockdev* mapping
757  * at the time, not against the S_ISREG file which depends on those buffers.
758  * So the locking for private_list is via the private_lock in the address_space
759  * which backs the buffers.  Which is different from the address_space 
760  * against which the buffers are listed.  So for a particular address_space,
761  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
762  * mapping->private_list will always be protected by the backing blockdev's
763  * ->private_lock.
764  *
765  * Which introduces a requirement: all buffers on an address_space's
766  * ->private_list must be from the same address_space: the blockdev's.
767  *
768  * address_spaces which do not place buffers at ->private_list via these
769  * utility functions are free to use private_lock and private_list for
770  * whatever they want.  The only requirement is that list_empty(private_list)
771  * be true at clear_inode() time.
772  *
773  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
774  * filesystems should do that.  invalidate_inode_buffers() should just go
775  * BUG_ON(!list_empty).
776  *
777  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
778  * take an address_space, not an inode.  And it should be called
779  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
780  * queued up.
781  *
782  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
783  * list if it is already on a list.  Because if the buffer is on a list,
784  * it *must* already be on the right one.  If not, the filesystem is being
785  * silly.  This will save a ton of locking.  But first we have to ensure
786  * that buffers are taken *off* the old inode's list when they are freed
787  * (presumably in truncate).  That requires careful auditing of all
788  * filesystems (do it inside bforget()).  It could also be done by bringing
789  * b_inode back.
790  */
791
792 /*
793  * The buffer's backing address_space's private_lock must be held
794  */
795 static inline void __remove_assoc_queue(struct buffer_head *bh)
796 {
797         list_del_init(&bh->b_assoc_buffers);
798 }
799
800 int inode_has_buffers(struct inode *inode)
801 {
802         return !list_empty(&inode->i_data.private_list);
803 }
804
805 /*
806  * osync is designed to support O_SYNC io.  It waits synchronously for
807  * all already-submitted IO to complete, but does not queue any new
808  * writes to the disk.
809  *
810  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
811  * you dirty the buffers, and then use osync_inode_buffers to wait for
812  * completion.  Any other dirty buffers which are not yet queued for
813  * write will not be flushed to disk by the osync.
814  */
815 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
816 {
817         struct buffer_head *bh;
818         struct list_head *p;
819         int err = 0;
820
821         spin_lock(lock);
822 repeat:
823         list_for_each_prev(p, list) {
824                 bh = BH_ENTRY(p);
825                 if (buffer_locked(bh)) {
826                         get_bh(bh);
827                         spin_unlock(lock);
828                         wait_on_buffer(bh);
829                         if (!buffer_uptodate(bh))
830                                 err = -EIO;
831                         brelse(bh);
832                         spin_lock(lock);
833                         goto repeat;
834                 }
835         }
836         spin_unlock(lock);
837         return err;
838 }
839
840 /**
841  * sync_mapping_buffers - write out and wait upon a mapping's "associated"
842  *                        buffers
843  * @mapping: the mapping which wants those buffers written (the blockdev
844  *           mapping which backs those buffers is @mapping->assoc_mapping)
845  *
846  * Starts I/O against the buffers at mapping->private_list, and waits upon
847  * that I/O.
848  *
849  * Basically, this is a convenience function for fsync().  The blockdev
850  * mapping "owns" the buffers, and @mapping is a file or directory
851  * which needs those buffers to be written for a successful fsync().
852  */
853 int sync_mapping_buffers(struct address_space *mapping)
854 {
855         struct address_space *buffer_mapping = mapping->assoc_mapping;
856
857         if (buffer_mapping == NULL || list_empty(&mapping->private_list))
858                 return 0;
859
860         return fsync_buffers_list(&buffer_mapping->private_lock,
861                                         &mapping->private_list);
862 }
863 EXPORT_SYMBOL(sync_mapping_buffers);
864
865 /*
866  * Called when we've recently written block `bblock', and it is known that
867  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
868  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
869  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
870  */
871 void write_boundary_block(struct block_device *bdev,
872                         sector_t bblock, unsigned blocksize)
873 {
874         struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
875         if (bh) {
876                 if (buffer_dirty(bh))
877                         ll_rw_block(WRITE, 1, &bh);
878                 put_bh(bh);
879         }
880 }
881
882 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
883 {
884         struct address_space *mapping = inode->i_mapping;
885         struct address_space *buffer_mapping = bh->b_page->mapping;
886
887         mark_buffer_dirty(bh);
888         if (!mapping->assoc_mapping) {
889                 mapping->assoc_mapping = buffer_mapping;
890         } else {
891                 if (mapping->assoc_mapping != buffer_mapping)
892                         BUG();
893         }
894         if (list_empty(&bh->b_assoc_buffers)) {
895                 spin_lock(&buffer_mapping->private_lock);
896                 list_move_tail(&bh->b_assoc_buffers,
897                                 &mapping->private_list);
898                 spin_unlock(&buffer_mapping->private_lock);
899         }
900 }
901 EXPORT_SYMBOL(mark_buffer_dirty_inode);
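
/*
 * Illustrative sketch (hypothetical filesystem code, not part of buffer.c):
 * how the private_list machinery described above is typically used.  An
 * indirect (or other metadata) block is dirtied against the owning inode's
 * mapping, and the filesystem's fsync later writes and waits on that list.
 */
static void example_dirty_indirect(struct buffer_head *bh, struct inode *inode)
{
	/* queues bh on inode->i_mapping->private_list as well as dirtying it */
	mark_buffer_dirty_inode(bh, inode);
}

static int example_fs_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	/* writes out and waits upon everything queued by the helper above */
	return sync_mapping_buffers(dentry->d_inode->i_mapping);
}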
902
903 /*
904  * Add a page to the dirty page list.
905  *
906  * It is a sad fact of life that this function is called from several places
907  * deeply under spinlocking.  It may not sleep.
908  *
909  * If the page has buffers, the uptodate buffers are set dirty, to preserve
910  * dirty-state coherency between the page and the buffers.  If the page does
911  * not have buffers then when they are later attached they will all be set
912  * dirty.
913  *
914  * The buffers are dirtied before the page is dirtied.  There's a small race
915  * window in which a writepage caller may see the page cleanness but not the
916  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
917  * before the buffers, a concurrent writepage caller could clear the page dirty
918  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
919  * page on the dirty page list.
920  *
921  * We use private_lock to lock against try_to_free_buffers while using the
922  * page's buffer list.  Also use this to protect against clean buffers being
923  * added to the page after it was set dirty.
924  *
925  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
926  * address_space though.
927  */
928 int __set_page_dirty_buffers(struct page *page)
929 {
930         struct address_space * const mapping = page->mapping;
931
932         spin_lock(&mapping->private_lock);
933         if (page_has_buffers(page)) {
934                 struct buffer_head *head = page_buffers(page);
935                 struct buffer_head *bh = head;
936
937                 do {
938                         set_buffer_dirty(bh);
939                         bh = bh->b_this_page;
940                 } while (bh != head);
941         }
942         spin_unlock(&mapping->private_lock);
943
944         if (!TestSetPageDirty(page)) {
945                 spin_lock_irq(&mapping->tree_lock);
946                 if (page->mapping) {    /* Race with truncate? */
947                         if (!mapping->backing_dev_info->memory_backed)
948                                 inc_page_state(nr_dirty);
949                         radix_tree_tag_set(&mapping->page_tree,
950                                                 page_index(page),
951                                                 PAGECACHE_TAG_DIRTY);
952                 }
953                 spin_unlock_irq(&mapping->tree_lock);
954                 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
955         }
956         
957         return 0;
958 }
959 EXPORT_SYMBOL(__set_page_dirty_buffers);
960
961 /*
962  * Write out and wait upon a list of buffers.
963  *
964  * We have conflicting pressures: we want to make sure that all
965  * initially dirty buffers get waited on, but that any subsequently
966  * dirtied buffers don't.  After all, we don't want fsync to last
967  * forever if somebody is actively writing to the file.
968  *
969  * Do this in two main stages: first we copy dirty buffers to a
970  * temporary inode list, queueing the writes as we go.  Then we clean
971  * up, waiting for those writes to complete.
972  * 
973  * During this second stage, any subsequent updates to the file may end
974  * up refiling the buffer on the original inode's dirty list again, so
975  * there is a chance we will end up with a buffer queued for write but
976  * not yet completed on that list.  So, as a final cleanup we go through
977  * the osync code to catch these locked, dirty buffers without requeuing
978  * any newly dirty buffers for write.
979  */
980 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
981 {
982         struct buffer_head *bh;
983         struct list_head tmp;
984         int err = 0, err2;
985
986         INIT_LIST_HEAD(&tmp);
987
988         spin_lock(lock);
989         while (!list_empty(list)) {
990                 bh = BH_ENTRY(list->next);
991                 list_del_init(&bh->b_assoc_buffers);
992                 if (buffer_dirty(bh) || buffer_locked(bh)) {
993                         list_add(&bh->b_assoc_buffers, &tmp);
994                         if (buffer_dirty(bh)) {
995                                 get_bh(bh);
996                                 spin_unlock(lock);
997                                 /*
998                                  * Ensure any pending I/O completes so that
999                                  * ll_rw_block() actually writes the current
1000                                  * contents - it is a noop if I/O is still in
1001                                  * flight on potentially older contents.
1002                                  */
1003                                 wait_on_buffer(bh);
1004                                 ll_rw_block(WRITE, 1, &bh);
1005                                 brelse(bh);
1006                                 spin_lock(lock);
1007                         }
1008                 }
1009         }
1010
1011         while (!list_empty(&tmp)) {
1012                 bh = BH_ENTRY(tmp.prev);
1013                 __remove_assoc_queue(bh);
1014                 get_bh(bh);
1015                 spin_unlock(lock);
1016                 wait_on_buffer(bh);
1017                 if (!buffer_uptodate(bh))
1018                         err = -EIO;
1019                 brelse(bh);
1020                 spin_lock(lock);
1021         }
1022         
1023         spin_unlock(lock);
1024         err2 = osync_buffers_list(lock, list);
1025         if (err)
1026                 return err;
1027         else
1028                 return err2;
1029 }
1030
1031 /*
1032  * Invalidate any and all dirty buffers on a given inode.  We are
1033  * probably unmounting the fs, but that doesn't mean we have already
1034  * done a sync().  Just drop the buffers from the inode list.
1035  *
1036  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
1037  * assumes that all the buffers are against the blockdev.  Not true
1038  * for reiserfs.
1039  */
1040 void invalidate_inode_buffers(struct inode *inode)
1041 {
1042         if (inode_has_buffers(inode)) {
1043                 struct address_space *mapping = &inode->i_data;
1044                 struct list_head *list = &mapping->private_list;
1045                 struct address_space *buffer_mapping = mapping->assoc_mapping;
1046
1047                 spin_lock(&buffer_mapping->private_lock);
1048                 while (!list_empty(list))
1049                         __remove_assoc_queue(BH_ENTRY(list->next));
1050                 spin_unlock(&buffer_mapping->private_lock);
1051         }
1052 }
1053
1054 /*
1055  * Remove any clean buffers from the inode's buffer list.  This is called
1056  * when we're trying to free the inode itself.  Those buffers can pin it.
1057  *
1058  * Returns true if all buffers were removed.
1059  */
1060 int remove_inode_buffers(struct inode *inode)
1061 {
1062         int ret = 1;
1063
1064         if (inode_has_buffers(inode)) {
1065                 struct address_space *mapping = &inode->i_data;
1066                 struct list_head *list = &mapping->private_list;
1067                 struct address_space *buffer_mapping = mapping->assoc_mapping;
1068
1069                 spin_lock(&buffer_mapping->private_lock);
1070                 while (!list_empty(list)) {
1071                         struct buffer_head *bh = BH_ENTRY(list->next);
1072                         if (buffer_dirty(bh)) {
1073                                 ret = 0;
1074                                 break;
1075                         }
1076                         __remove_assoc_queue(bh);
1077                 }
1078                 spin_unlock(&buffer_mapping->private_lock);
1079         }
1080         return ret;
1081 }
1082
1083 /*
1084  * Create the appropriate buffers when given a page for the data area and
1085  * the size of each buffer.  Use the bh->b_this_page linked list to
1086  * follow the buffers created.  Return NULL if unable to create more
1087  * buffers.
1088  *
1089  * The retry flag is used to differentiate async IO (paging, swapping)
1090  * which may not fail from ordinary buffer allocations.
1091  */
1092 static struct buffer_head *
1093 create_buffers(struct page * page, unsigned long size, int retry)
1094 {
1095         struct buffer_head *bh, *head;
1096         long offset;
1097
1098 try_again:
1099         head = NULL;
1100         offset = PAGE_SIZE;
1101         while ((offset -= size) >= 0) {
1102                 bh = alloc_buffer_head(GFP_NOFS);
1103                 if (!bh)
1104                         goto no_grow;
1105
1106                 bh->b_bdev = NULL;
1107                 bh->b_this_page = head;
1108                 bh->b_blocknr = -1;
1109                 head = bh;
1110
1111                 bh->b_state = 0;
1112                 atomic_set(&bh->b_count, 0);
1113                 bh->b_size = size;
1114
1115                 /* Link the buffer to its page */
1116                 set_bh_page(bh, page, offset);
1117
1118                 bh->b_end_io = NULL;
1119         }
1120         return head;
1121 /*
1122  * In case anything failed, we just free everything we got.
1123  */
1124 no_grow:
1125         if (head) {
1126                 do {
1127                         bh = head;
1128                         head = head->b_this_page;
1129                         free_buffer_head(bh);
1130                 } while (head);
1131         }
1132
1133         /*
1134          * Return failure for non-async IO requests.  Async IO requests
1135          * are not allowed to fail, so we have to wait until buffer heads
1136          * become available.  But we don't want tasks sleeping with 
1137          * partially complete buffers, so all were released above.
1138          */
1139         if (!retry)
1140                 return NULL;
1141
1142         /* We're _really_ low on memory. Now we just
1143          * wait for old buffer heads to become free due to
1144          * finishing IO.  Since this is an async request and
1145          * the reserve list is empty, we're sure there are 
1146          * async buffer heads in use.
1147          */
1148         free_more_memory();
1149         goto try_again;
1150 }
1151
1152 static inline void
1153 link_dev_buffers(struct page *page, struct buffer_head *head)
1154 {
1155         struct buffer_head *bh, *tail;
1156
1157         bh = head;
1158         do {
1159                 tail = bh;
1160                 bh = bh->b_this_page;
1161         } while (bh);
1162         tail->b_this_page = head;
1163         __set_page_buffers(page, head);
1164 }
1165
1166 /*
1167  * Initialise the state of a blockdev page's buffers.
1168  */ 
1169 static void
1170 init_page_buffers(struct page *page, struct block_device *bdev,
1171                         sector_t block, int size)
1172 {
1173         struct buffer_head *head = page_buffers(page);
1174         struct buffer_head *bh = head;
1175         unsigned int b_state;
1176
1177         b_state = 1 << BH_Mapped;
1178         if (PageUptodate(page))
1179                 b_state |= 1 << BH_Uptodate;
1180
1181         do {
1182                 if (!(bh->b_state & (1 << BH_Mapped))) {
1183                         init_buffer(bh, NULL, NULL);
1184                         bh->b_bdev = bdev;
1185                         bh->b_blocknr = block;
1186                         bh->b_state = b_state;
1187                 }
1188                 block++;
1189                 bh = bh->b_this_page;
1190         } while (bh != head);
1191 }
1192
1193 /*
1194  * Create the page-cache page that contains the requested block.
1195  *
1196  * This is used purely for blockdev mappings.
1197  */
1198 static struct page *
1199 grow_dev_page(struct block_device *bdev, sector_t block,
1200                 pgoff_t index, int size)
1201 {
1202         struct inode *inode = bdev->bd_inode;
1203         struct page *page;
1204         struct buffer_head *bh;
1205
1206         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1207         if (!page)
1208                 return NULL;
1209
1210         if (!PageLocked(page))
1211                 BUG();
1212
1213         if (page_has_buffers(page)) {
1214                 bh = page_buffers(page);
1215                 if (bh->b_size == size)
1216                         return page;
1217                 if (!try_to_free_buffers(page))
1218                         goto failed;
1219         }
1220
1221         /*
1222          * Allocate some buffers for this page
1223          */
1224         bh = create_buffers(page, size, 0);
1225         if (!bh)
1226                 goto failed;
1227
1228         /*
1229          * Link the page to the buffers and initialise them.  Take the
1230          * lock to be atomic wrt __find_get_block(), which does not
1231          * run under the page lock.
1232          */
1233         spin_lock(&inode->i_mapping->private_lock);
1234         link_dev_buffers(page, bh);
1235         init_page_buffers(page, bdev, block, size);
1236         spin_unlock(&inode->i_mapping->private_lock);
1237         return page;
1238
1239 failed:
1240         BUG();
1241         unlock_page(page);
1242         page_cache_release(page);
1243         return NULL;
1244 }
1245
1246 /*
1247  * Create buffers for the specified block device block's page.  If
1248  * that page was dirty, the buffers are set dirty also.
1249  *
1250  * Except that's a bug.  Attaching dirty buffers to a dirty
1251  * blockdev's page can result in filesystem corruption, because
1252  * some of those buffers may be aliases of filesystem data.
1253  * grow_dev_page() will go BUG() if this happens.
1254  */
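/*
 * Worked example (added for clarity): with PAGE_SIZE 4096 and size 1024,
 * the sizebits loop below yields 2, so four blocks share one page.  A
 * request for block 1001 then maps to page index 1001 >> 2 = 250, and the
 * page is populated starting at block 250 << 2 = 1000.
 */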
1255 static inline int
1256 grow_buffers(struct block_device *bdev, sector_t block, int size)
1257 {
1258         struct page *page;
1259         pgoff_t index;
1260         int sizebits;
1261
1262         sizebits = -1;
1263         do {
1264                 sizebits++;
1265         } while ((size << sizebits) < PAGE_SIZE);
1266
1267         index = block >> sizebits;
1268         block = index << sizebits;
1269
1270         /* Create a page with the proper size buffers.. */
1271         page = grow_dev_page(bdev, block, index, size);
1272         if (!page)
1273                 return 0;
1274         unlock_page(page);
1275         page_cache_release(page);
1276         return 1;
1277 }
1278
1279 struct buffer_head *
1280 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1281 {
1282         /* Size must be a multiple of the hard sector size */
1283         if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1284                         (size < 512 || size > PAGE_SIZE))) {
1285                 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1286                                         size);
1287                 printk(KERN_ERR "hardsect size: %d\n",
1288                                         bdev_hardsect_size(bdev));
1289
1290                 dump_stack();
1291                 return NULL;
1292         }
1293
1294         for (;;) {
1295                 struct buffer_head * bh;
1296
1297                 bh = __find_get_block(bdev, block, size);
1298                 if (bh)
1299                         return bh;
1300
1301                 if (!grow_buffers(bdev, block, size))
1302                         free_more_memory();
1303         }
1304 }
1305
1306 /*
1307  * The relationship between dirty buffers and dirty pages:
1308  *
1309  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1310  * the page is tagged dirty in its radix tree.
1311  *
1312  * At all times, the dirtiness of the buffers represents the dirtiness of
1313  * subsections of the page.  If the page has buffers, the page dirty bit is
1314  * merely a hint about the true dirty state.
1315  *
1316  * When a page is set dirty in its entirety, all its buffers are marked dirty
1317  * (if the page has buffers).
1318  *
1319  * When a buffer is marked dirty, its page is dirtied, but the page's other
1320  * buffers are not.
1321  *
1322  * Also.  When blockdev buffers are explicitly read with bread(), they
1323  * individually become uptodate.  But their backing page remains not
1324  * uptodate - even if all of its buffers are uptodate.  A subsequent
1325  * block_read_full_page() against that page will discover all the uptodate
1326  * buffers, will set the page uptodate and will perform no I/O.
1327  */
1328
1329 /**
1330  * mark_buffer_dirty - mark a buffer_head as needing writeout
1331  *
1332  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1333  * backing page dirty, then tag the page as dirty in its address_space's radix
1334  * tree and then attach the address_space's inode to its superblock's dirty
1335  * inode list.
1336  *
1337  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1338  * mapping->tree_lock and the global inode_lock.
1339  */
1340 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1341 {
1342         if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1343                 __set_page_dirty_nobuffers(bh->b_page);
1344 }
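
/*
 * Illustrative sketch (not part of the original file): the common pattern of
 * modifying a cached metadata block.  The block is read with __bread(),
 * changed in memory, then handed to writeback with mark_buffer_dirty();
 * brelse() drops our reference.  The function name is hypothetical.
 */
static int example_zero_block(struct block_device *bdev, sector_t block, int size)
{
	struct buffer_head *bh = __bread(bdev, block, size);

	if (!bh)
		return -EIO;
	memset(bh->b_data, 0, bh->b_size);	/* modify the cached contents */
	mark_buffer_dirty(bh);			/* buffer, page and inode become dirty */
	brelse(bh);
	return 0;
}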
1345
1346 /*
1347  * Decrement a buffer_head's reference count.  If all buffers against a page
1348  * have zero reference count, are clean and unlocked, and if the page is clean
1349  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1350  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1351  * a page but it ends up not being freed, and buffers may later be reattached).
1352  */
1353 void __brelse(struct buffer_head * buf)
1354 {
1355         if (atomic_read(&buf->b_count)) {
1356                 put_bh(buf);
1357                 return;
1358         }
1359         printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1360         WARN_ON(1);
1361 }
1362
1363 /*
1364  * bforget() is like brelse(), except it discards any
1365  * potentially dirty data.
1366  */
1367 void __bforget(struct buffer_head *bh)
1368 {
1369         clear_buffer_dirty(bh);
1370         if (!list_empty(&bh->b_assoc_buffers)) {
1371                 struct address_space *buffer_mapping = bh->b_page->mapping;
1372
1373                 spin_lock(&buffer_mapping->private_lock);
1374                 list_del_init(&bh->b_assoc_buffers);
1375                 spin_unlock(&buffer_mapping->private_lock);
1376         }
1377         __brelse(bh);
1378 }
1379
1380 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1381 {
1382         lock_buffer(bh);
1383         if (buffer_uptodate(bh)) {
1384                 unlock_buffer(bh);
1385                 return bh;
1386         } else {
1387                 get_bh(bh);
1388                 bh->b_end_io = end_buffer_read_sync;
1389                 submit_bh(READ, bh);
1390                 wait_on_buffer(bh);
1391                 if (buffer_uptodate(bh))
1392                         return bh;
1393         }
1394         brelse(bh);
1395         return NULL;
1396 }
1397
1398 /*
1399  * Per-cpu buffer LRU implementation, to reduce the cost of __find_get_block().
1400  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1401  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1402  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1403  * CPU's LRUs at the same time.
1404  *
1405  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1406  * sb_find_get_block().
1407  *
1408  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1409  * a local interrupt disable for that.
1410  */
1411
1412 #define BH_LRU_SIZE     8
1413
1414 struct bh_lru {
1415         struct buffer_head *bhs[BH_LRU_SIZE];
1416 };
1417
1418 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1419
1420 #ifdef CONFIG_SMP
1421 #define bh_lru_lock()   local_irq_disable()
1422 #define bh_lru_unlock() local_irq_enable()
1423 #else
1424 #define bh_lru_lock()   preempt_disable()
1425 #define bh_lru_unlock() preempt_enable()
1426 #endif
1427
1428 static inline void check_irqs_on(void)
1429 {
1430 #ifdef irqs_disabled
1431         BUG_ON(irqs_disabled());
1432 #endif
1433 }
1434
1435 /*
1436  * The LRU management algorithm is dopey-but-simple.  Sorry.
1437  */
1438 static void bh_lru_install(struct buffer_head *bh)
1439 {
1440         struct buffer_head *evictee = NULL;
1441         struct bh_lru *lru;
1442
1443         check_irqs_on();
1444         bh_lru_lock();
1445         lru = &__get_cpu_var(bh_lrus);
1446         if (lru->bhs[0] != bh) {
1447                 struct buffer_head *bhs[BH_LRU_SIZE];
1448                 int in;
1449                 int out = 0;
1450
1451                 get_bh(bh);
1452                 bhs[out++] = bh;
1453                 for (in = 0; in < BH_LRU_SIZE; in++) {
1454                         struct buffer_head *bh2 = lru->bhs[in];
1455
1456                         if (bh2 == bh) {
1457                                 __brelse(bh2);
1458                         } else {
1459                                 if (out >= BH_LRU_SIZE) {
1460                                         BUG_ON(evictee != NULL);
1461                                         evictee = bh2;
1462                                 } else {
1463                                         bhs[out++] = bh2;
1464                                 }
1465                         }
1466                 }
1467                 while (out < BH_LRU_SIZE)
1468                         bhs[out++] = NULL;
1469                 memcpy(lru->bhs, bhs, sizeof(bhs));
1470         }
1471         bh_lru_unlock();
1472
1473         if (evictee)
1474                 __brelse(evictee);
1475 }
1476
1477 /*
1478  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1479  */
1480 static inline struct buffer_head *
1481 lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1482 {
1483         struct buffer_head *ret = NULL;
1484         struct bh_lru *lru;
1485         int i;
1486
1487         check_irqs_on();
1488         bh_lru_lock();
1489         lru = &__get_cpu_var(bh_lrus);
1490         for (i = 0; i < BH_LRU_SIZE; i++) {
1491                 struct buffer_head *bh = lru->bhs[i];
1492
1493                 if (bh && bh->b_bdev == bdev &&
1494                                 bh->b_blocknr == block && bh->b_size == size) {
1495                         if (i) {
1496                                 while (i) {
1497                                         lru->bhs[i] = lru->bhs[i - 1];
1498                                         i--;
1499                                 }
1500                                 lru->bhs[0] = bh;
1501                         }
1502                         get_bh(bh);
1503                         ret = bh;
1504                         break;
1505                 }
1506         }
1507         bh_lru_unlock();
1508         return ret;
1509 }
1510
1511 /*
1512  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1513  * it in the LRU and mark it as accessed.  If it is not present then return
1514  * NULL
1515  */
1516 struct buffer_head *
1517 __find_get_block(struct block_device *bdev, sector_t block, int size)
1518 {
1519         struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1520
1521         if (bh == NULL) {
1522                 bh = __find_get_block_slow(bdev, block, size);
1523                 if (bh)
1524                         bh_lru_install(bh);
1525         }
1526         if (bh)
1527                 touch_buffer(bh);
1528         return bh;
1529 }
1530 EXPORT_SYMBOL(__find_get_block);
1531
1532 /*
1533  * __getblk will locate (and, if necessary, create) the buffer_head
1534  * which corresponds to the passed block_device, block and size. The
1535  * returned buffer has its reference count incremented.
1536  *
1537  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1538  * illegal block number, __getblk() will happily return a buffer_head
1539  * which represents the non-existent block.  Very weird.
1540  *
1541  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1542  * attempt is failing.  FIXME, perhaps?
1543  */
1544 struct buffer_head *
1545 __getblk(struct block_device *bdev, sector_t block, int size)
1546 {
1547         struct buffer_head *bh = __find_get_block(bdev, block, size);
1548
1549         might_sleep();
1550         if (bh == NULL)
1551                 bh = __getblk_slow(bdev, block, size);
1552         return bh;
1553 }
1554 EXPORT_SYMBOL(__getblk);
1555
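/*
 * Illustrative sketch (not part of the original file): a typical
 * __getblk() caller which overwrites a whole block, so no read of the
 * old contents is needed.  The function name and the data source are
 * hypothetical; only the exported buffer-cache interfaces are used.
 */
static void example_overwrite_block(struct block_device *bdev,
                                sector_t block, int size, const void *data)
{
        struct buffer_head *bh = __getblk(bdev, block, size);

        /* __getblk() cannot return NULL - it just keeps trying */
        lock_buffer(bh);
        memcpy(bh->b_data, data, size);
        set_buffer_uptodate(bh);
        unlock_buffer(bh);
        mark_buffer_dirty(bh);
        brelse(bh);
}
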
1556 /*
1557  * Do async read-ahead on a buffer.
1558  */
1559 void __breadahead(struct block_device *bdev, sector_t block, int size)
1560 {
1561         struct buffer_head *bh = __getblk(bdev, block, size);
1562         ll_rw_block(READA, 1, &bh);
1563         brelse(bh);
1564 }
1565 EXPORT_SYMBOL(__breadahead);
1566
1567 /**
1568  *  __bread() - reads a specified block and returns the bh
1569  *  @block: number of the block to read
1570  *  @size: size (in bytes) to read
1571  *
1572  *  Reads a specified block, and returns the buffer_head that contains it.
1573  *  It returns NULL if the block was unreadable.
1574  */
1575 struct buffer_head *
1576 __bread(struct block_device *bdev, sector_t block, int size)
1577 {
1578         struct buffer_head *bh = __getblk(bdev, block, size);
1579
1580         if (!buffer_uptodate(bh))
1581                 bh = __bread_slow(bh);
1582         return bh;
1583 }
1584 EXPORT_SYMBOL(__bread);
1585
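/*
 * Illustrative sketch (not part of the original file): the usual
 * __bread() calling pattern - read a block, check for I/O failure, use
 * the data, then drop the reference.  example_read_first_byte() is a
 * hypothetical helper.
 */
static int example_read_first_byte(struct block_device *bdev,
                                sector_t block, int size, char *out)
{
        struct buffer_head *bh = __bread(bdev, block, size);

        if (!bh)
                return -EIO;    /* the block was unreadable */
        *out = bh->b_data[0];
        brelse(bh);
        return 0;
}
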
1586 /*
1587  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1588  * This doesn't race because it runs on each cpu either in irq
1589  * context or with preemption disabled.
1590  */
1591 static void invalidate_bh_lru(void *arg)
1592 {
1593         struct bh_lru *b = &get_cpu_var(bh_lrus);
1594         int i;
1595
1596         for (i = 0; i < BH_LRU_SIZE; i++) {
1597                 brelse(b->bhs[i]);
1598                 b->bhs[i] = NULL;
1599         }
1600         put_cpu_var(bh_lrus);
1601 }
1602         
1603 static void invalidate_bh_lrus(void)
1604 {
1605         on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1606 }
1607
1608 void set_bh_page(struct buffer_head *bh,
1609                 struct page *page, unsigned long offset)
1610 {
1611         bh->b_page = page;
1612         if (offset >= PAGE_SIZE)
1613                 BUG();
1614         if (PageHighMem(page))
1615                 /*
1616                  * This catches illegal uses and preserves the offset:
1617                  */
1618                 bh->b_data = (char *)(0 + offset);
1619         else
1620                 bh->b_data = page_address(page) + offset;
1621 }
1622 EXPORT_SYMBOL(set_bh_page);
1623
1624 /*
1625  * Called when truncating a buffer on a page completely.
1626  */
1627 static inline void discard_buffer(struct buffer_head * bh)
1628 {
1629         lock_buffer(bh);
1630         clear_buffer_dirty(bh);
1631         bh->b_bdev = NULL;
1632         clear_buffer_mapped(bh);
1633         clear_buffer_req(bh);
1634         clear_buffer_new(bh);
1635         clear_buffer_delay(bh);
1636         unlock_buffer(bh);
1637 }
1638
1639 /**
1640  * try_to_release_page() - release old fs-specific metadata on a page
1641  *
1642  * @page: the page which the kernel is trying to free
1643  * @gfp_mask: memory allocation flags (and I/O mode)
1644  *
1645  * The address_space is asked to try to release any data held against the page
1646  * (presumably at page->private).  If the release was successful, return `1'.
1647  * Otherwise return zero.
1648  *
1649  * The @gfp_mask argument specifies whether I/O may be performed to release
1650  * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1651  *
1652  * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1653  */
1654 int try_to_release_page(struct page *page, int gfp_mask)
1655 {
1656         struct address_space * const mapping = page->mapping;
1657
1658         BUG_ON(!PageLocked(page));
1659         if (PageWriteback(page))
1660                 return 0;
1661         
1662         if (mapping && mapping->a_ops->releasepage)
1663                 return mapping->a_ops->releasepage(page, gfp_mask);
1664         return try_to_free_buffers(page);
1665 }
1666 EXPORT_SYMBOL(try_to_release_page);
1667
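/*
 * Illustrative sketch (not part of the original file): a minimal
 * ->releasepage() method which simply falls back to
 * try_to_free_buffers(), exactly as the default path above does when no
 * method is provided.  A real filesystem (ext3, for instance) first
 * checks its own private state - journalling commitments and the like -
 * and may consult @gfp_mask before agreeing to release.  The name
 * example_releasepage() is hypothetical.
 */
static int example_releasepage(struct page *page, int gfp_mask)
{
        return try_to_free_buffers(page);
}
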
1668 /**
1669  * block_invalidatepage - invalidate part or all of a buffer-backed page
1670  *
1671  * @page: the page which is affected
1672  * @offset: the index of the truncation point
1673  *
1674  * block_invalidatepage() is called when all or part of the page has become
1675  * invalidated by a truncate operation.
1676  *
1677  * block_invalidatepage() does not have to release all buffers, but it must
1678  * ensure that no dirty buffer is left outside @offset and that no I/O
1679  * is underway against any of the blocks which are outside the truncation
1680  * point, because the caller is about to free (and possibly reuse) those
1681  * blocks on-disk.
1682  */
1683 int block_invalidatepage(struct page *page, unsigned long offset)
1684 {
1685         struct buffer_head *head, *bh, *next;
1686         unsigned int curr_off = 0;
1687         int ret = 1;
1688
1689         BUG_ON(!PageLocked(page));
1690         if (!page_has_buffers(page))
1691                 goto out;
1692
1693         head = page_buffers(page);
1694         bh = head;
1695         do {
1696                 unsigned int next_off = curr_off + bh->b_size;
1697                 next = bh->b_this_page;
1698
1699                 /*
1700                  * is this block fully invalidated?
1701                  */
1702                 if (offset <= curr_off)
1703                         discard_buffer(bh);
1704                 curr_off = next_off;
1705                 bh = next;
1706         } while (bh != head);
1707
1708         /*
1709          * We release buffers only if the entire page is being invalidated.
1710          * The get_block cached value has been unconditionally invalidated,
1711          * so real IO is not possible anymore.
1712          */
1713         if (offset == 0)
1714                 ret = try_to_release_page(page, 0);
1715 out:
1716         return ret;
1717 }
1718 EXPORT_SYMBOL(block_invalidatepage);
1719
1720 /*
1721  * We attach and possibly dirty the buffers atomically wrt
1722  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1723  * is already excluded via the page lock.
1724  */
1725 void create_empty_buffers(struct page *page,
1726                         unsigned long blocksize, unsigned long b_state)
1727 {
1728         struct buffer_head *bh, *head, *tail;
1729
1730         head = create_buffers(page, blocksize, 1);
1731         bh = head;
1732         do {
1733                 bh->b_state |= b_state;
1734                 tail = bh;
1735                 bh = bh->b_this_page;
1736         } while (bh);
1737         tail->b_this_page = head;
1738
1739         spin_lock(&page->mapping->private_lock);
1740         if (PageUptodate(page) || PageDirty(page)) {
1741                 bh = head;
1742                 do {
1743                         if (PageDirty(page))
1744                                 set_buffer_dirty(bh);
1745                         if (PageUptodate(page))
1746                                 set_buffer_uptodate(bh);
1747                         bh = bh->b_this_page;
1748                 } while (bh != head);
1749         }
1750         __set_page_buffers(page, head);
1751         spin_unlock(&page->mapping->private_lock);
1752 }
1753 EXPORT_SYMBOL(create_empty_buffers);
1754
1755 /*
1756  * We are taking a block for data and we don't want any output from any
1757  * buffer-cache aliases starting from return from that function and
1758  * until the moment when something will explicitly mark the buffer
1759  * dirty (hopefully that will not happen until we free that block ;-)
1760  * We don't even need to mark it not-uptodate - nobody can expect
1761  * anything from a newly allocated buffer anyway. We used to use
1762  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1763  * don't want to mark the alias unmapped, for example - it would confuse
1764  * anyone who might pick it with bread() afterwards...
1765  *
1766  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1767  * be writeout I/O going on against recently-freed buffers.  We don't
1768  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1769  * only if we really need to.  That happens here.
1770  */
1771 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1772 {
1773         struct buffer_head *old_bh;
1774
1775         might_sleep();
1776
1777         old_bh = __find_get_block_slow(bdev, block, 0);
1778         if (old_bh) {
1779                 clear_buffer_dirty(old_bh);
1780                 wait_on_buffer(old_bh);
1781                 clear_buffer_req(old_bh);
1782                 __brelse(old_bh);
1783         }
1784 }
1785 EXPORT_SYMBOL(unmap_underlying_metadata);
1786
1787 /*
1788  * NOTE! All mapped/uptodate combinations are valid:
1789  *
1790  *      Mapped  Uptodate        Meaning
1791  *
1792  *      No      No              "unknown" - must do get_block()
1793  *      No      Yes             "hole" - zero-filled
1794  *      Yes     No              "allocated" - allocated on disk, not read in
1795  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1796  *
1797  * "Dirty" is valid only with the last case (mapped+uptodate).
1798  */
1799
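/*
 * Illustrative sketch (not part of the original file): the four
 * combinations from the table above, spelled out as code.  Purely
 * expository - the real decisions are made inline in the helpers below.
 */
static const char *example_describe_bh(struct buffer_head *bh)
{
        if (!buffer_mapped(bh))
                return buffer_uptodate(bh) ?
                        "hole - zero-filled" : "unknown - must do get_block()";
        return buffer_uptodate(bh) ?
                "valid - allocated and up-to-date" : "allocated - not read in";
}
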
1800 /*
1801  * While block_write_full_page is writing back the dirty buffers under
1802  * the page lock, whoever dirtied the buffers may decide to clean them
1803  * again at any time.  We handle that by only looking at the buffer
1804  * state inside lock_buffer().
1805  *
1806  * If block_write_full_page() is called for regular writeback
1807  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1808  * locked buffer.   This only can happen if someone has written the buffer
1809  * directly, with submit_bh().  At the address_space level PageWriteback
1810  * prevents this contention from occurring.
1811  */
1812 static int __block_write_full_page(struct inode *inode, struct page *page,
1813                         get_block_t *get_block, struct writeback_control *wbc)
1814 {
1815         int err;
1816         sector_t block;
1817         sector_t last_block;
1818         struct buffer_head *bh, *head;
1819         int nr_underway = 0;
1820
1821         BUG_ON(!PageLocked(page));
1822
1823         last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1824
1825         if (!page_has_buffers(page)) {
1826                 create_empty_buffers(page, 1 << inode->i_blkbits,
1827                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
1828         }
1829
1830         /*
1831          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1832          * here, and the (potentially unmapped) buffers may become dirty at
1833          * any time.  If a buffer becomes dirty here after we've inspected it
1834          * then we just miss that fact, and the page stays dirty.
1835          *
1836          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1837          * handle that here by just cleaning them.
1838          */
1839
1840         block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1841         head = page_buffers(page);
1842         bh = head;
1843
1844         /*
1845          * Get all the dirty buffers mapped to disk addresses and
1846          * handle any aliases from the underlying blockdev's mapping.
1847          */
1848         do {
1849                 if (block > last_block) {
1850                         /*
1851                          * mapped buffers outside i_size will occur, because
1852                          * this page can be outside i_size when there is a
1853                          * truncate in progress.
1854                          */
1855                         /*
1856                          * The buffer was zeroed by block_write_full_page()
1857                          */
1858                         clear_buffer_dirty(bh);
1859                         set_buffer_uptodate(bh);
1860                 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1861                         err = get_block(inode, block, bh, 1);
1862                         if (err)
1863                                 goto recover;
1864                         if (buffer_new(bh)) {
1865                                 /* blockdev mappings never come here */
1866                                 clear_buffer_new(bh);
1867                                 unmap_underlying_metadata(bh->b_bdev,
1868                                                         bh->b_blocknr);
1869                         }
1870                 }
1871                 bh = bh->b_this_page;
1872                 block++;
1873         } while (bh != head);
1874
1875         do {
1876                 get_bh(bh);
1877                 if (!buffer_mapped(bh))
1878                         continue;
1879                 /*
1880                  * If it's a fully non-blocking write attempt and we cannot
1881                  * lock the buffer then redirty the page.  Note that this can
1882                  * potentially cause a busy-wait loop from pdflush and kswapd
1883                  * activity, but those code paths have their own higher-level
1884                  * throttling.
1885                  */
1886                 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1887                         lock_buffer(bh);
1888                 } else if (test_set_buffer_locked(bh)) {
1889                         redirty_page_for_writepage(wbc, page);
1890                         continue;
1891                 }
1892                 if (test_clear_buffer_dirty(bh)) {
1893                         mark_buffer_async_write(bh);
1894                 } else {
1895                         unlock_buffer(bh);
1896                 }
1897         } while ((bh = bh->b_this_page) != head);
1898
1899         /*
1900          * The page and its buffers are protected by PageWriteback(), so we can
1901          * drop the bh refcounts early.
1902          */
1903         BUG_ON(PageWriteback(page));
1904         set_page_writeback(page);
1905         unlock_page(page);
1906
1907         do {
1908                 struct buffer_head *next = bh->b_this_page;
1909                 if (buffer_async_write(bh)) {
1910                         submit_bh(WRITE, bh);
1911                         nr_underway++;
1912                 }
1913                 put_bh(bh);
1914                 bh = next;
1915         } while (bh != head);
1916
1917         err = 0;
1918 done:
1919         if (nr_underway == 0) {
1920                 /*
1921                  * The page was marked dirty, but the buffers were
1922                  * clean.  Someone wrote them back by hand with
1923                  * ll_rw_block/submit_bh.  A rare case.
1924                  */
1925                 int uptodate = 1;
1926                 do {
1927                         if (!buffer_uptodate(bh)) {
1928                                 uptodate = 0;
1929                                 break;
1930                         }
1931                         bh = bh->b_this_page;
1932                 } while (bh != head);
1933                 if (uptodate)
1934                         SetPageUptodate(page);
1935                 end_page_writeback(page);
1936                 /*
1937                  * The page and buffer_heads can be released at any time from
1938                  * here on.
1939                  */
1940                 wbc->pages_skipped++;   /* We didn't write this page */
1941         }
1942         return err;
1943
1944 recover:
1945         /*
1946          * ENOSPC, or some other error.  We may already have added some
1947          * blocks to the file, so we need to write these out to avoid
1948          * exposing stale data.
1949          * The page is currently locked and not marked for writeback
1950          */
1951         bh = head;
1952         /* Recovery: lock and submit the mapped buffers */
1953         do {
1954                 get_bh(bh);
1955                 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1956                         lock_buffer(bh);
1957                         mark_buffer_async_write(bh);
1958                 } else {
1959                         /*
1960                          * The buffer may have been set dirty during
1961                          * attachment to a dirty page.
1962                          */
1963                         clear_buffer_dirty(bh);
1964                 }
1965         } while ((bh = bh->b_this_page) != head);
1966         SetPageError(page);
1967         BUG_ON(PageWriteback(page));
1968         set_page_writeback(page);
1969         unlock_page(page);
1970         do {
1971                 struct buffer_head *next = bh->b_this_page;
1972                 if (buffer_async_write(bh)) {
1973                         clear_buffer_dirty(bh);
1974                         submit_bh(WRITE, bh);
1975                         nr_underway++;
1976                 }
1977                 put_bh(bh);
1978                 bh = next;
1979         } while (bh != head);
1980         goto done;
1981 }
1982
1983 static int __block_prepare_write(struct inode *inode, struct page *page,
1984                 unsigned from, unsigned to, get_block_t *get_block)
1985 {
1986         unsigned block_start, block_end;
1987         sector_t block;
1988         int err = 0;
1989         unsigned blocksize, bbits;
1990         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1991
1992         BUG_ON(!PageLocked(page));
1993         BUG_ON(from > PAGE_CACHE_SIZE);
1994         BUG_ON(to > PAGE_CACHE_SIZE);
1995         BUG_ON(from > to);
1996
1997         blocksize = 1 << inode->i_blkbits;
1998         if (!page_has_buffers(page))
1999                 create_empty_buffers(page, blocksize, 0);
2000         head = page_buffers(page);
2001
2002         bbits = inode->i_blkbits;
2003         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
2004
2005         for(bh = head, block_start = 0; bh != head || !block_start;
2006             block++, block_start=block_end, bh = bh->b_this_page) {
2007                 block_end = block_start + blocksize;
2008                 if (block_end <= from || block_start >= to) {
2009                         if (PageUptodate(page)) {
2010                                 if (!buffer_uptodate(bh))
2011                                         set_buffer_uptodate(bh);
2012                         }
2013                         continue;
2014                 }
2015                 if (buffer_new(bh))
2016                         clear_buffer_new(bh);
2017                 if (!buffer_mapped(bh)) {
2018                         err = get_block(inode, block, bh, 1);
2019                         if (err)
2020                                 goto out;
2021                         if (buffer_new(bh)) {
2022                                 clear_buffer_new(bh);
2023                                 unmap_underlying_metadata(bh->b_bdev,
2024                                                         bh->b_blocknr);
2025                                 if (PageUptodate(page)) {
2026                                         set_buffer_uptodate(bh);
2027                                         continue;
2028                                 }
2029                                 if (block_end > to || block_start < from) {
2030                                         void *kaddr;
2031
2032                                         kaddr = kmap_atomic(page, KM_USER0);
2033                                         if (block_end > to)
2034                                                 memset(kaddr+to, 0,
2035                                                         block_end-to);
2036                                         if (block_start < from)
2037                                                 memset(kaddr+block_start,
2038                                                         0, from-block_start);
2039                                         flush_dcache_page(page);
2040                                         kunmap_atomic(kaddr, KM_USER0);
2041                                 }
2042                                 continue;
2043                         }
2044                 }
2045                 if (PageUptodate(page)) {
2046                         if (!buffer_uptodate(bh))
2047                                 set_buffer_uptodate(bh);
2048                         continue; 
2049                 }
2050                 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2051                      (block_start < from || block_end > to)) {
2052                         ll_rw_block(READ, 1, &bh);
2053                         *wait_bh++=bh;
2054                 }
2055         }
2056         /*
2057          * If we issued read requests - let them complete.
2058          */
2059         while(wait_bh > wait) {
2060                 wait_on_buffer(*--wait_bh);
2061                 if (!buffer_uptodate(*wait_bh))
2062                         return -EIO;
2063         }
2064         return 0;
2065 out:
2066         /*
2067          * Zero out any newly allocated blocks to avoid exposing stale
2068          * data.  If BH_New is set, we know that the block was newly
2069          * allocated in the above loop.
2070          */
2071         bh = head;
2072         block_start = 0;
2073         do {
2074                 block_end = block_start+blocksize;
2075                 if (block_end <= from)
2076                         goto next_bh;
2077                 if (block_start >= to)
2078                         break;
2079                 if (buffer_new(bh)) {
2080                         void *kaddr;
2081
2082                         clear_buffer_new(bh);
2083                         kaddr = kmap_atomic(page, KM_USER0);
2084                         memset(kaddr+block_start, 0, bh->b_size);
2085                         kunmap_atomic(kaddr, KM_USER0);
2086                         set_buffer_uptodate(bh);
2087                         mark_buffer_dirty(bh);
2088                 }
2089 next_bh:
2090                 block_start = block_end;
2091                 bh = bh->b_this_page;
2092         } while (bh != head);
2093         return err;
2094 }
2095
2096 static int __block_commit_write(struct inode *inode, struct page *page,
2097                 unsigned from, unsigned to)
2098 {
2099         unsigned block_start, block_end;
2100         int partial = 0;
2101         unsigned blocksize;
2102         struct buffer_head *bh, *head;
2103
2104         blocksize = 1 << inode->i_blkbits;
2105
2106         for(bh = head = page_buffers(page), block_start = 0;
2107             bh != head || !block_start;
2108             block_start=block_end, bh = bh->b_this_page) {
2109                 block_end = block_start + blocksize;
2110                 if (block_end <= from || block_start >= to) {
2111                         if (!buffer_uptodate(bh))
2112                                 partial = 1;
2113                 } else {
2114                         set_buffer_uptodate(bh);
2115                         mark_buffer_dirty(bh);
2116                 }
2117         }
2118
2119         /*
2120          * If this is a partial write which happened to make all buffers
2121          * uptodate then we can optimize away a bogus readpage() for
2122          * the next read(). Here we 'discover' whether the page went
2123          * uptodate as a result of this (potentially partial) write.
2124          */
2125         if (!partial)
2126                 SetPageUptodate(page);
2127         return 0;
2128 }
2129
2130 /*
2131  * Generic "read page" function for block devices that have the normal
2132  * get_block functionality. This is most of the block device filesystems.
2133  * Reads the page asynchronously --- the unlock_buffer() and
2134  * set/clear_buffer_uptodate() functions propagate buffer state into the
2135  * page struct once IO has completed.
2136  */
2137 int block_read_full_page(struct page *page, get_block_t *get_block)
2138 {
2139         struct inode *inode = page->mapping->host;
2140         sector_t iblock, lblock;
2141         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2142         unsigned int blocksize;
2143         int nr, i;
2144         int fully_mapped = 1;
2145
2146         if (!PageLocked(page))
2147                 PAGE_BUG(page);
2148         blocksize = 1 << inode->i_blkbits;
2149         if (!page_has_buffers(page))
2150                 create_empty_buffers(page, blocksize, 0);
2151         head = page_buffers(page);
2152
2153         iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2154         lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2155         bh = head;
2156         nr = 0;
2157         i = 0;
2158
2159         do {
2160                 if (buffer_uptodate(bh))
2161                         continue;
2162
2163                 if (!buffer_mapped(bh)) {
2164                         fully_mapped = 0;
2165                         if (iblock < lblock) {
2166                                 if (get_block(inode, iblock, bh, 0))
2167                                         SetPageError(page);
2168                         }
2169                         if (!buffer_mapped(bh)) {
2170                                 void *kaddr = kmap_atomic(page, KM_USER0);
2171                                 memset(kaddr + i * blocksize, 0, blocksize);
2172                                 flush_dcache_page(page);
2173                                 kunmap_atomic(kaddr, KM_USER0);
2174                                 set_buffer_uptodate(bh);
2175                                 continue;
2176                         }
2177                         /*
2178                          * get_block() might have updated the buffer
2179                          * synchronously
2180                          */
2181                         if (buffer_uptodate(bh))
2182                                 continue;
2183                 }
2184                 arr[nr++] = bh;
2185         } while (i++, iblock++, (bh = bh->b_this_page) != head);
2186
2187         if (fully_mapped)
2188                 SetPageMappedToDisk(page);
2189
2190         if (!nr) {
2191                 /*
2192                  * All buffers are uptodate - we can set the page uptodate
2193                  * as well. But not if get_block() returned an error.
2194                  */
2195                 if (!PageError(page))
2196                         SetPageUptodate(page);
2197                 unlock_page(page);
2198                 return 0;
2199         }
2200
2201         /* Stage two: lock the buffers */
2202         for (i = 0; i < nr; i++) {
2203                 bh = arr[i];
2204                 lock_buffer(bh);
2205                 mark_buffer_async_read(bh);
2206         }
2207
2208         /*
2209          * Stage 3: start the IO.  Check for uptodateness
2210          * inside the buffer lock in case another process reading
2211          * the underlying blockdev brought it uptodate (the sct fix).
2212          */
2213         for (i = 0; i < nr; i++) {
2214                 bh = arr[i];
2215                 if (buffer_uptodate(bh))
2216                         end_buffer_async_read(bh, 1);
2217                 else
2218                         submit_bh(READ, bh);
2219         }
2220         return 0;
2221 }
2222
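/*
 * Illustrative sketch (not part of the original file): this is how a
 * buffer-backed filesystem typically wires its ->readpage() up to
 * block_read_full_page().  examplefs_get_block() stands in for the
 * filesystem's own block mapper and is hypothetical; it only has to
 * follow the get_block_t contract.
 */
extern int examplefs_get_block(struct inode *inode, sector_t iblock,
                                struct buffer_head *bh_result, int create);

static int examplefs_readpage(struct file *file, struct page *page)
{
        return block_read_full_page(page, examplefs_get_block);
}
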
2223 /* utility function for filesystems that need to do work on expanding
2224  * truncates.  Uses prepare/commit_write to allow the filesystem to
2225  * deal with the hole.  
2226  */
2227 int generic_cont_expand(struct inode *inode, loff_t size)
2228 {
2229         struct address_space *mapping = inode->i_mapping;
2230         struct page *page;
2231         unsigned long index, offset, limit;
2232         int err;
2233
2234         err = -EFBIG;
2235         limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2236         if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2237                 send_sig(SIGXFSZ, current, 0);
2238                 goto out;
2239         }
2240         if (size > inode->i_sb->s_maxbytes)
2241                 goto out;
2242
2243         offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
2244
2245         /* ugh.  in prepare/commit_write, if from==to==start of block, we
2246          * skip the prepare.  make sure we never send an offset for the
2247          * start of a block.
2248          */
2249         if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2250                 offset++;
2251         }
2252         index = size >> PAGE_CACHE_SHIFT;
2253         err = -ENOMEM;
2254         page = grab_cache_page(mapping, index);
2255         if (!page)
2256                 goto out;
2257         err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2258         if (!err) {
2259                 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2260         }
2261         unlock_page(page);
2262         page_cache_release(page);
2263         if (err > 0)
2264                 err = 0;
2265 out:
2266         return err;
2267 }
2268
2269 /*
2270  * For moronic filesystems that do not allow holes in files.
2271  * We may have to extend the file.
2272  */
2273
2274 int cont_prepare_write(struct page *page, unsigned offset,
2275                 unsigned to, get_block_t *get_block, loff_t *bytes)
2276 {
2277         struct address_space *mapping = page->mapping;
2278         struct inode *inode = mapping->host;
2279         struct page *new_page;
2280         pgoff_t pgpos;
2281         long status;
2282         unsigned zerofrom;
2283         unsigned blocksize = 1 << inode->i_blkbits;
2284         void *kaddr;
2285
2286         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2287                 status = -ENOMEM;
2288                 new_page = grab_cache_page(mapping, pgpos);
2289                 if (!new_page)
2290                         goto out;
2291                 /* we might sleep */
2292                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2293                         unlock_page(new_page);
2294                         page_cache_release(new_page);
2295                         continue;
2296                 }
2297                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2298                 if (zerofrom & (blocksize-1)) {
2299                         *bytes |= (blocksize-1);
2300                         (*bytes)++;
2301                 }
2302                 status = __block_prepare_write(inode, new_page, zerofrom,
2303                                                 PAGE_CACHE_SIZE, get_block);
2304                 if (status)
2305                         goto out_unmap;
2306                 kaddr = kmap_atomic(new_page, KM_USER0);
2307                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2308                 flush_dcache_page(new_page);
2309                 kunmap_atomic(kaddr, KM_USER0);
2310                 __block_commit_write(inode, new_page,
2311                                 zerofrom, PAGE_CACHE_SIZE);
2312                 unlock_page(new_page);
2313                 page_cache_release(new_page);
2314         }
2315
2316         if (page->index < pgpos) {
2317                 /* completely inside the area */
2318                 zerofrom = offset;
2319         } else {
2320                 /* page covers the boundary, find the boundary offset */
2321                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2322
2323                 /* if we will expand the thing last block will be filled */
2324                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2325                         *bytes |= (blocksize-1);
2326                         (*bytes)++;
2327                 }
2328
2329                 /* starting below the boundary? Nothing to zero out */
2330                 if (offset <= zerofrom)
2331                         zerofrom = offset;
2332         }
2333         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2334         if (status)
2335                 goto out1;
2336         if (zerofrom < offset) {
2337                 kaddr = kmap_atomic(page, KM_USER0);
2338                 memset(kaddr+zerofrom, 0, offset-zerofrom);
2339                 flush_dcache_page(page);
2340                 kunmap_atomic(kaddr, KM_USER0);
2341                 __block_commit_write(inode, page, zerofrom, offset);
2342         }
2343         return 0;
2344 out1:
2345         ClearPageUptodate(page);
2346         return status;
2347
2348 out_unmap:
2349         ClearPageUptodate(new_page);
2350         unlock_page(new_page);
2351         page_cache_release(new_page);
2352 out:
2353         return status;
2354 }
2355
2356 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2357                         get_block_t *get_block)
2358 {
2359         struct inode *inode = page->mapping->host;
2360         int err = __block_prepare_write(inode, page, from, to, get_block);
2361         if (err)
2362                 ClearPageUptodate(page);
2363         return err;
2364 }
2365
2366 int block_commit_write(struct page *page, unsigned from, unsigned to)
2367 {
2368         struct inode *inode = page->mapping->host;
2369         __block_commit_write(inode,page,from,to);
2370         return 0;
2371 }
2372
2373 int generic_commit_write(struct file *file, struct page *page,
2374                 unsigned from, unsigned to)
2375 {
2376         struct inode *inode = page->mapping->host;
2377         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2378         __block_commit_write(inode,page,from,to);
2379         /*
2380          * No need to use i_size_read() here, the i_size
2381          * cannot change under us because we hold i_sem.
2382          */
2383         if (pos > inode->i_size) {
2384                 i_size_write(inode, pos);
2385                 mark_inode_dirty(inode);
2386         }
2387         return 0;
2388 }
2389
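/*
 * Illustrative sketch (not part of the original file): wiring the
 * generic helpers above into a hypothetical filesystem's address_space
 * methods.  examplefs_get_block() was declared with the readpage sketch
 * earlier.  generic_commit_write() and block_sync_page() already have
 * the right signatures, so they are used directly.
 */
static int examplefs_writepage(struct page *page,
                                struct writeback_control *wbc)
{
        return block_write_full_page(page, examplefs_get_block, wbc);
}

static int examplefs_prepare_write(struct file *file, struct page *page,
                                unsigned from, unsigned to)
{
        return block_prepare_write(page, from, to, examplefs_get_block);
}

static struct address_space_operations examplefs_aops = {
        .readpage       = examplefs_readpage,
        .writepage      = examplefs_writepage,
        .sync_page      = block_sync_page,
        .prepare_write  = examplefs_prepare_write,
        .commit_write   = generic_commit_write,
};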
2390
2391 /*
2392  * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
2393  * immediately, while under the page lock.  So it needs a special end_io
2394  * handler which does not touch the bh after unlocking it.
2395  *
2396  * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
2397  * a race there is benign: unlock_buffer() only uses the bh's address for
2398  * hashing after unlocking the buffer, so it doesn't actually touch the bh
2399  * itself.
2400  */
2401 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2402 {
2403         if (uptodate) {
2404                 set_buffer_uptodate(bh);
2405         } else {
2406                 /* This happens, due to failed READA attempts. */
2407                 clear_buffer_uptodate(bh);
2408         }
2409         unlock_buffer(bh);
2410 }
2411
2412 /*
2413  * On entry, the page is fully not uptodate.
2414  * On exit the page is fully uptodate in the areas outside (from,to)
2415  */
2416 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2417                         get_block_t *get_block)
2418 {
2419         struct inode *inode = page->mapping->host;
2420         const unsigned blkbits = inode->i_blkbits;
2421         const unsigned blocksize = 1 << blkbits;
2422         struct buffer_head map_bh;
2423         struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2424         unsigned block_in_page;
2425         unsigned block_start;
2426         sector_t block_in_file;
2427         char *kaddr;
2428         int nr_reads = 0;
2429         int i;
2430         int ret = 0;
2431         int is_mapped_to_disk = 1;
2432         int dirtied_it = 0;
2433
2434         if (PageMappedToDisk(page))
2435                 return 0;
2436
2437         block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2438         map_bh.b_page = page;
2439
2440         /*
2441          * We loop across all blocks in the page, whether or not they are
2442          * part of the affected region.  This is so we can discover if the
2443          * page is fully mapped-to-disk.
2444          */
2445         for (block_start = 0, block_in_page = 0;
2446                   block_start < PAGE_CACHE_SIZE;
2447                   block_in_page++, block_start += blocksize) {
2448                 unsigned block_end = block_start + blocksize;
2449                 int create;
2450
2451                 map_bh.b_state = 0;
2452                 create = 1;
2453                 if (block_start >= to)
2454                         create = 0;
2455                 ret = get_block(inode, block_in_file + block_in_page,
2456                                         &map_bh, create);
2457                 if (ret)
2458                         goto failed;
2459                 if (!buffer_mapped(&map_bh))
2460                         is_mapped_to_disk = 0;
2461                 if (buffer_new(&map_bh))
2462                         unmap_underlying_metadata(map_bh.b_bdev,
2463                                                         map_bh.b_blocknr);
2464                 if (PageUptodate(page))
2465                         continue;
2466                 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2467                         kaddr = kmap_atomic(page, KM_USER0);
2468                         if (block_start < from) {
2469                                 memset(kaddr+block_start, 0, from-block_start);
2470                                 dirtied_it = 1;
2471                         }
2472                         if (block_end > to) {
2473                                 memset(kaddr + to, 0, block_end - to);
2474                                 dirtied_it = 1;
2475                         }
2476                         flush_dcache_page(page);
2477                         kunmap_atomic(kaddr, KM_USER0);
2478                         continue;
2479                 }
2480                 if (buffer_uptodate(&map_bh))
2481                         continue;       /* reiserfs does this */
2482                 if (block_start < from || block_end > to) {
2483                         struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2484
2485                         if (!bh) {
2486                                 ret = -ENOMEM;
2487                                 goto failed;
2488                         }
2489                         bh->b_state = map_bh.b_state;
2490                         atomic_set(&bh->b_count, 0);
2491                         bh->b_this_page = NULL;
2492                         bh->b_page = page;
2493                         bh->b_blocknr = map_bh.b_blocknr;
2494                         bh->b_size = blocksize;
2495                         bh->b_data = (char *)(long)block_start;
2496                         bh->b_bdev = map_bh.b_bdev;
2497                         bh->b_private = NULL;
2498                         read_bh[nr_reads++] = bh;
2499                 }
2500         }
2501
2502         if (nr_reads) {
2503                 struct buffer_head *bh;
2504
2505                 /*
2506                  * The page is locked, so these buffers are protected from
2507                  * any VM or truncate activity.  Hence we don't need to care
2508                  * for the buffer_head refcounts.
2509                  */
2510                 for (i = 0; i < nr_reads; i++) {
2511                         bh = read_bh[i];
2512                         lock_buffer(bh);
2513                         bh->b_end_io = end_buffer_read_nobh;
2514                         submit_bh(READ, bh);
2515                 }
2516                 for (i = 0; i < nr_reads; i++) {
2517                         bh = read_bh[i];
2518                         wait_on_buffer(bh);
2519                         if (!buffer_uptodate(bh))
2520                                 ret = -EIO;
2521                         free_buffer_head(bh);
2522                         read_bh[i] = NULL;
2523                 }
2524                 if (ret)
2525                         goto failed;
2526         }
2527
2528         if (is_mapped_to_disk)
2529                 SetPageMappedToDisk(page);
2530         SetPageUptodate(page);
2531
2532         /*
2533          * Setting the page dirty here isn't necessary for the prepare_write
2534          * function - commit_write will do that.  But if/when this function is
2535          * used within the pagefault handler to ensure that all mmapped pages
2536          * have backing space in the filesystem, we will need to dirty the page
2537          * if its contents were altered.
2538          */
2539         if (dirtied_it)
2540                 set_page_dirty(page);
2541
2542         return 0;
2543
2544 failed:
2545         for (i = 0; i < nr_reads; i++) {
2546                 if (read_bh[i])
2547                         free_buffer_head(read_bh[i]);
2548         }
2549
2550         /*
2551          * Error recovery is pretty slack.  Clear the page and mark it dirty
2552          * so we'll later zero out any blocks which _were_ allocated.
2553          */
2554         kaddr = kmap_atomic(page, KM_USER0);
2555         memset(kaddr, 0, PAGE_CACHE_SIZE);
2556         kunmap_atomic(kaddr, KM_USER0);
2557         SetPageUptodate(page);
2558         set_page_dirty(page);
2559         return ret;
2560 }
2561 EXPORT_SYMBOL(nobh_prepare_write);
2562
2563 int nobh_commit_write(struct file *file, struct page *page,
2564                 unsigned from, unsigned to)
2565 {
2566         struct inode *inode = page->mapping->host;
2567         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2568
2569         set_page_dirty(page);
2570         if (pos > inode->i_size) {
2571                 i_size_write(inode, pos);
2572                 mark_inode_dirty(inode);
2573         }
2574         return 0;
2575 }
2576 EXPORT_SYMBOL(nobh_commit_write);
2577
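/*
 * Illustrative sketch (not part of the original file): a filesystem
 * opting into the nobh write path points ->prepare_write at a wrapper
 * for nobh_prepare_write() and can use nobh_commit_write() as its
 * ->commit_write directly (ext2, for example, does this for its "nobh"
 * mount option).  examplefs_get_block() is the hypothetical mapper
 * declared earlier.
 */
static int examplefs_nobh_prepare_write(struct file *file, struct page *page,
                                unsigned from, unsigned to)
{
        return nobh_prepare_write(page, from, to, examplefs_get_block);
}
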
2578 /*
2579  * This function assumes that ->prepare_write() uses nobh_prepare_write().
2580  */
2581 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2582 {
2583         struct inode *inode = mapping->host;
2584         unsigned blocksize = 1 << inode->i_blkbits;
2585         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2586         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2587         unsigned to;
2588         struct page *page;
2589         struct address_space_operations *a_ops = mapping->a_ops;
2590         char *kaddr;
2591         int ret = 0;
2592
2593         if ((offset & (blocksize - 1)) == 0)
2594                 goto out;
2595
2596         ret = -ENOMEM;
2597         page = grab_cache_page(mapping, index);
2598         if (!page)
2599                 goto out;
2600
2601         to = (offset + blocksize) & ~(blocksize - 1);
2602         ret = a_ops->prepare_write(NULL, page, offset, to);
2603         if (ret == 0) {
2604                 kaddr = kmap_atomic(page, KM_USER0);
2605                 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2606                 flush_dcache_page(page);
2607                 kunmap_atomic(kaddr, KM_USER0);
2608                 set_page_dirty(page);
2609         }
2610         unlock_page(page);
2611         page_cache_release(page);
2612 out:
2613         return ret;
2614 }
2615 EXPORT_SYMBOL(nobh_truncate_page);
2616
2617 int block_truncate_page(struct address_space *mapping,
2618                         loff_t from, get_block_t *get_block)
2619 {
2620         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2621         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2622         unsigned blocksize;
2623         pgoff_t iblock;
2624         unsigned length, pos;
2625         struct inode *inode = mapping->host;
2626         struct page *page;
2627         struct buffer_head *bh;
2628         void *kaddr;
2629         int err;
2630
2631         blocksize = 1 << inode->i_blkbits;
2632         length = offset & (blocksize - 1);
2633
2634         /* Block boundary? Nothing to do */
2635         if (!length)
2636                 return 0;
2637
2638         length = blocksize - length;
2639         iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2640         
2641         page = grab_cache_page(mapping, index);
2642         err = -ENOMEM;
2643         if (!page)
2644                 goto out;
2645
2646         if (!page_has_buffers(page))
2647                 create_empty_buffers(page, blocksize, 0);
2648
2649         /* Find the buffer that contains "offset" */
2650         bh = page_buffers(page);
2651         pos = blocksize;
2652         while (offset >= pos) {
2653                 bh = bh->b_this_page;
2654                 iblock++;
2655                 pos += blocksize;
2656         }
2657
2658         err = 0;
2659         if (!buffer_mapped(bh)) {
2660                 err = get_block(inode, iblock, bh, 0);
2661                 if (err)
2662                         goto unlock;
2663                 /* unmapped? It's a hole - nothing to do */
2664                 if (!buffer_mapped(bh))
2665                         goto unlock;
2666         }
2667
2668         /* Ok, it's mapped. Make sure it's up-to-date */
2669         if (PageUptodate(page))
2670                 set_buffer_uptodate(bh);
2671
2672         if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2673                 err = -EIO;
2674                 ll_rw_block(READ, 1, &bh);
2675                 wait_on_buffer(bh);
2676                 /* Uhhuh. Read error. Complain and punt. */
2677                 if (!buffer_uptodate(bh))
2678                         goto unlock;
2679         }
2680
2681         kaddr = kmap_atomic(page, KM_USER0);
2682         memset(kaddr + offset, 0, length);
2683         flush_dcache_page(page);
2684         kunmap_atomic(kaddr, KM_USER0);
2685
2686         mark_buffer_dirty(bh);
2687         err = 0;
2688
2689 unlock:
2690         unlock_page(page);
2691         page_cache_release(page);
2692 out:
2693         return err;
2694 }
2695
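/*
 * Illustrative sketch (not part of the original file):
 * block_truncate_page() is normally called from a filesystem's truncate
 * path to zero out the tail of the final, partial block before the
 * on-disk block pointers are trimmed.  examplefs_get_block() is the
 * hypothetical mapper declared earlier.
 */
static int examplefs_zero_truncate_tail(struct inode *inode, loff_t newsize)
{
        return block_truncate_page(inode->i_mapping, newsize,
                                        examplefs_get_block);
}
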
2696 /*
2697  * The generic ->writepage function for buffer-backed address_spaces
2698  */
2699 int block_write_full_page(struct page *page, get_block_t *get_block,
2700                         struct writeback_control *wbc)
2701 {
2702         struct inode * const inode = page->mapping->host;
2703         loff_t i_size = i_size_read(inode);
2704         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2705         unsigned offset;
2706         void *kaddr;
2707
2708         /* Is the page fully inside i_size? */
2709         if (page->index < end_index)
2710                 return __block_write_full_page(inode, page, get_block, wbc);
2711
2712         /* Is the page fully outside i_size? (truncate in progress) */
2713         offset = i_size & (PAGE_CACHE_SIZE-1);
2714         if (page->index >= end_index+1 || !offset) {
2715                 /*
2716                  * The page may have dirty, unmapped buffers.  For example,
2717                  * they may have been added in ext3_writepage().  Make them
2718                  * freeable here, so the page does not leak.
2719                  */
2720                 block_invalidatepage(page, 0);
2721                 unlock_page(page);
2722                 return 0; /* don't care */
2723         }
2724
2725         /*
2726          * The page straddles i_size.  It must be zeroed out on each and every
2727  * writepage invocation because it may be mmapped.  "A file is mapped
2728          * in multiples of the page size.  For a file that is not a multiple of
2729          * the  page size, the remaining memory is zeroed when mapped, and
2730          * writes to that region are not written out to the file."
2731          */
2732         kaddr = kmap_atomic(page, KM_USER0);
2733         memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2734         flush_dcache_page(page);
2735         kunmap_atomic(kaddr, KM_USER0);
2736         return __block_write_full_page(inode, page, get_block, wbc);
2737 }
2738
2739 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2740                             get_block_t *get_block)
2741 {
2742         struct buffer_head tmp;
2743         struct inode *inode = mapping->host;
2744         tmp.b_state = 0;
2745         tmp.b_blocknr = 0;
2746         get_block(inode, block, &tmp, 0);
2747         return tmp.b_blocknr;
2748 }
2749
2750 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2751 {
2752         struct buffer_head *bh = bio->bi_private;
2753
2754         if (bio->bi_size)
2755                 return 1;
2756
2757         if (err == -EOPNOTSUPP) {
2758                 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2759                 set_bit(BH_Eopnotsupp, &bh->b_state);
2760         }
2761
2762         bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2763         bio_put(bio);
2764         return 0;
2765 }
2766
2767 int submit_bh(int rw, struct buffer_head * bh)
2768 {
2769         struct bio *bio;
2770         int ret = 0;
2771
2772         BUG_ON(!buffer_locked(bh));
2773         BUG_ON(!buffer_mapped(bh));
2774         BUG_ON(!bh->b_end_io);
2775
2776         if (buffer_ordered(bh) && (rw == WRITE))
2777                 rw = WRITE_BARRIER;
2778
2779         /*
2780  * Only clear out a write error when rewriting; should this
2781          * include WRITE_SYNC as well?
2782          */
2783         if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2784                 clear_buffer_write_io_error(bh);
2785
2786         /*
2787          * from here on down, it's all bio -- do the initial mapping,
2788          * submit_bio -> generic_make_request may further map this bio around
2789          */
2790         bio = bio_alloc(GFP_NOIO, 1);
2791
2792         bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2793         bio->bi_bdev = bh->b_bdev;
2794         bio->bi_io_vec[0].bv_page = bh->b_page;
2795         bio->bi_io_vec[0].bv_len = bh->b_size;
2796         bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2797
2798         bio->bi_vcnt = 1;
2799         bio->bi_idx = 0;
2800         bio->bi_size = bh->b_size;
2801
2802         bio->bi_end_io = end_bio_bh_io_sync;
2803         bio->bi_private = bh;
2804
2805         bio_get(bio);
2806         submit_bio(rw, bio);
2807
2808         if (bio_flagged(bio, BIO_EOPNOTSUPP))
2809                 ret = -EOPNOTSUPP;
2810
2811         bio_put(bio);
2812         return ret;
2813 }
2814
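/*
 * Illustrative sketch (not part of the original file): driving
 * submit_bh() by hand for one synchronous read - lock the buffer, set a
 * completion handler, submit, then wait.  end_buffer_read_sync() is the
 * stock completion handler (it also drops the reference taken here).
 */
static int example_read_bh_sync(struct buffer_head *bh)
{
        lock_buffer(bh);
        if (buffer_uptodate(bh)) {
                /* someone else read it while we waited for the lock */
                unlock_buffer(bh);
                return 0;
        }
        get_bh(bh);             /* dropped by end_buffer_read_sync() */
        bh->b_end_io = end_buffer_read_sync;
        submit_bh(READ, bh);
        wait_on_buffer(bh);
        return buffer_uptodate(bh) ? 0 : -EIO;
}
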
2815 /**
2816  * ll_rw_block: low-level access to block devices (DEPRECATED)
2817  * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2818  * @nr: number of &struct buffer_heads in the array
2819  * @bhs: array of pointers to &struct buffer_head
2820  *
2821  * ll_rw_block() takes an array of pointers to &struct buffer_heads,
2822  * and requests an I/O operation on them, either a %READ or a %WRITE.
2823  * The third %READA option is described in the documentation for
2824  * generic_make_request() which ll_rw_block() calls.
2825  *
2826  * This function drops any buffer that it cannot get a lock on (with the
2827  * BH_Lock state bit), any buffer that appears to be clean when doing a
2828  * write request, and any buffer that appears to be up-to-date when doing
2829  * a read request.  Further it marks as clean buffers that are processed for
2830  * writing (the buffer cache won't assume that they are actually clean until
2831  * the buffer gets unlocked).
2832  *
2833  * ll_rw_block sets b_end_io to a simple completion handler that marks
2834  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2835  * any waiters.
2836  *
2837  * All of the buffers must be for the same device, and must also be a
2838  * multiple of the current approved size for the device.
2839  */
2840 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2841 {
2842         int i;
2843
2844         for (i = 0; i < nr; i++) {
2845                 struct buffer_head *bh = bhs[i];
2846
2847                 if (test_set_buffer_locked(bh))
2848                         continue;
2849
2850                 get_bh(bh);
2851                 if (rw == WRITE) {
2852                         bh->b_end_io = end_buffer_write_sync;
2853                         if (test_clear_buffer_dirty(bh)) {
2854                                 submit_bh(WRITE, bh);
2855                                 continue;
2856                         }
2857                 } else {
2858                         bh->b_end_io = end_buffer_read_sync;
2859                         if (!buffer_uptodate(bh)) {
2860                                 submit_bh(rw, bh);
2861                                 continue;
2862                         }
2863                 }
2864                 unlock_buffer(bh);
2865                 put_bh(bh);
2866         }
2867 }
2868
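/*
 * Illustrative sketch (not part of the original file): the usual
 * ll_rw_block() pattern - start reads for a whole batch of buffers in
 * one call, then wait for and check each of them.  The caller is
 * assumed to already hold a reference on every buffer in the array.
 */
static int example_read_bh_batch(struct buffer_head *bhs[], int nr)
{
        int i;
        int err = 0;

        ll_rw_block(READ, nr, bhs);
        for (i = 0; i < nr; i++) {
                wait_on_buffer(bhs[i]);
                if (!buffer_uptodate(bhs[i]))
                        err = -EIO;
        }
        return err;
}
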
2869 /*
2870  * For a data-integrity writeout, we need to wait upon any in-progress I/O
2871  * and then start new I/O and then wait upon it.  The caller must have a ref on
2872  * the buffer_head.
2873  */
2874 int sync_dirty_buffer(struct buffer_head *bh)
2875 {
2876         int ret = 0;
2877
2878         WARN_ON(atomic_read(&bh->b_count) < 1);
2879         lock_buffer(bh);
2880         if (test_clear_buffer_dirty(bh)) {
2881                 get_bh(bh);
2882                 bh->b_end_io = end_buffer_write_sync;
2883                 ret = submit_bh(WRITE, bh);
2884                 wait_on_buffer(bh);
2885                 if (buffer_eopnotsupp(bh)) {
2886                         clear_buffer_eopnotsupp(bh);
2887                         ret = -EOPNOTSUPP;
2888                 }
2889                 if (!ret && !buffer_uptodate(bh))
2890                         ret = -EIO;
2891         } else {
2892                 unlock_buffer(bh);
2893         }
2894         return ret;
2895 }
2896
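/*
 * Illustrative sketch (not part of the original file): a typical
 * sync_dirty_buffer() caller - read a metadata block, modify it, then
 * force it to disk and wait, as superblock-update and fsync paths do.
 * sb_bread() is the per-superblock __bread() wrapper from
 * <linux/buffer_head.h>; the function name here is hypothetical.
 */
static int example_update_block_sync(struct super_block *sb, sector_t block,
                                const void *data, unsigned len)
{
        struct buffer_head *bh = sb_bread(sb, block);
        int err;

        if (!bh)
                return -EIO;
        lock_buffer(bh);
        memcpy(bh->b_data, data, len);
        unlock_buffer(bh);
        mark_buffer_dirty(bh);
        err = sync_dirty_buffer(bh);
        brelse(bh);
        return err;
}
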
2897 /*
2898  * try_to_free_buffers() checks if all the buffers on this particular page
2899  * are unused, and releases them if so.
2900  *
2901  * Exclusion against try_to_free_buffers may be obtained by either
2902  * locking the page or by holding its mapping's private_lock.
2903  *
2904  * If the page is dirty but all the buffers are clean then we need to
2905  * be sure to mark the page clean as well.  This is because the page
2906  * may be against a block device, and a later reattachment of buffers
2907  * to a dirty page will set *all* buffers dirty.  Which would corrupt
2908  * filesystem data on the same device.
2909  *
2910  * The same applies to regular filesystem pages: if all the buffers are
2911  * clean then we set the page clean and proceed.  To do that, we require
2912  * total exclusion from __set_page_dirty_buffers().  That is obtained with
2913  * private_lock.
2914  *
2915  * try_to_free_buffers() is non-blocking.
2916  */
2917 static inline int buffer_busy(struct buffer_head *bh)
2918 {
2919         return atomic_read(&bh->b_count) |
2920                 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2921 }
2922
2923 static int
2924 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2925 {
2926         struct buffer_head *head = page_buffers(page);
2927         struct buffer_head *bh;
2928
2929         bh = head;
2930         do {
2931                 if (buffer_write_io_error(bh))
2932                         set_bit(AS_EIO, &page->mapping->flags);
2933                 if (buffer_busy(bh))
2934                         goto failed;
2935                 bh = bh->b_this_page;
2936         } while (bh != head);
2937
2938         do {
2939                 struct buffer_head *next = bh->b_this_page;
2940
2941                 if (!list_empty(&bh->b_assoc_buffers))
2942                         __remove_assoc_queue(bh);
2943                 bh = next;
2944         } while (bh != head);
2945         *buffers_to_free = head;
2946         __clear_page_buffers(page);
2947         return 1;
2948 failed:
2949         return 0;
2950 }
2951
2952 int try_to_free_buffers(struct page *page)
2953 {
2954         struct address_space * const mapping = page->mapping;
2955         struct buffer_head *buffers_to_free = NULL;
2956         int ret = 0;
2957
2958         BUG_ON(!PageLocked(page));
2959         if (PageWriteback(page))
2960                 return 0;
2961
2962         if (mapping == NULL) {          /* can this still happen? */
2963                 ret = drop_buffers(page, &buffers_to_free);
2964                 goto out;
2965         }
2966
2967         spin_lock(&mapping->private_lock);
2968         ret = drop_buffers(page, &buffers_to_free);
2969         if (ret) {
2970                 /*
2971                  * If the filesystem writes its buffers by hand (eg ext3)
2972                  * then we can have clean buffers against a dirty page.  We
2973                  * clean the page here; otherwise later reattachment of buffers
2974                  * could encounter a non-uptodate page, which is unresolvable.
2975                  * This only applies in the rare case where try_to_free_buffers
2976                  * succeeds but the page is not freed.
2977                  */
2978                 clear_page_dirty(page);
2979         }
2980         spin_unlock(&mapping->private_lock);
2981 out:
2982         if (buffers_to_free) {
2983                 struct buffer_head *bh = buffers_to_free;
2984
2985                 do {
2986                         struct buffer_head *next = bh->b_this_page;
2987                         free_buffer_head(bh);
2988                         bh = next;
2989                 } while (bh != buffers_to_free);
2990         }
2991         return ret;
2992 }
2993 EXPORT_SYMBOL(try_to_free_buffers);
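
/*
 * Illustrative sketch (not part of the original file): a filesystem with
 * nothing of its own attached to the page can implement ->releasepage by
 * deferring straight to try_to_free_buffers().  The VM calls this with the
 * page locked, which is one of the two forms of exclusion described above.
 * "examplefs_releasepage" is a hypothetical name.
 */
static int examplefs_releasepage(struct page *page, int gfp_mask)
{
        /* gfp_mask is unused here; try_to_free_buffers() never blocks */
        return try_to_free_buffers(page);
}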
2994
2995 int block_sync_page(struct page *page)
2996 {
2997         struct address_space *mapping;
2998
2999         smp_mb();
3000         mapping = page_mapping(page);
3001         if (mapping)
3002                 blk_run_backing_dev(mapping->backing_dev_info, page);
3003         return 0;
3004 }
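
/*
 * Illustrative sketch (not part of the original file): block_sync_page()
 * is typically wired up as the ->sync_page method of an address_space so
 * that waiters can unplug the underlying request queue.  The examplefs_aops
 * name is hypothetical; the other methods are omitted for brevity.
 */
static struct address_space_operations examplefs_aops = {
        /* ->readpage, ->writepage, ->prepare_write etc. omitted */
        .sync_page      = block_sync_page,
};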
3005
3006 /*
3007  * There are no bdflush tunables left.  But distributions are
3008  * still running obsolete flush daemons, so we terminate them here.
3009  *
3010  * Use of bdflush() is deprecated and will be removed in a future kernel.
3011  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3012  */
3013 asmlinkage long sys_bdflush(int func, long data)
3014 {
3015         static int msg_count;
3016
3017         if (!capable(CAP_SYS_ADMIN))
3018                 return -EPERM;
3019
3020         if (msg_count < 5) {
3021                 msg_count++;
3022                 printk(KERN_INFO
3023                         "warning: process `%s' used the obsolete bdflush"
3024                         " system call\n", current->comm);
3025                 printk(KERN_INFO "Fix your initscripts?\n");
3026         }
3027
3028         if (func == 1)
3029                 do_exit(0);
3030         return 0;
3031 }
3032
3033 /*
3034  * Buffer-head allocation
3035  */
3036 static kmem_cache_t *bh_cachep;
3037
3038 /*
3039  * Once the number of bh's in the machine exceeds this level, we start
3040  * stripping them in writeback.
3041  */
3042 static int max_buffer_heads;
3043
3044 int buffer_heads_over_limit;
3045
3046 struct bh_accounting {
3047         int nr;                 /* Number of live bh's */
3048         int ratelimit;          /* Limit cacheline bouncing */
3049 };
3050
3051 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3052
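/*
 * Recompute buffer_heads_over_limit from the per-CPU counters.  Summing
 * across all CPUs pulls in every CPU's cacheline, so each CPU only does
 * the full walk once every 4096 calls (the ->ratelimit counter above).
 */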
3053 static void recalc_bh_state(void)
3054 {
3055         int i;
3056         int tot = 0;
3057
3058         if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3059                 return;
3060         __get_cpu_var(bh_accounting).ratelimit = 0;
3061         for_each_cpu(i)
3062                 tot += per_cpu(bh_accounting, i).nr;
3063         buffer_heads_over_limit = (tot > max_buffer_heads);
3064 }
3065
3066 struct buffer_head *alloc_buffer_head(int gfp_flags)
3067 {
3068         struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3069         if (ret) {
3070                 preempt_disable();
3071                 __get_cpu_var(bh_accounting).nr++;
3072                 recalc_bh_state();
3073                 preempt_enable();
3074         }
3075         return ret;
3076 }
3077 EXPORT_SYMBOL(alloc_buffer_head);
3078
3079 void free_buffer_head(struct buffer_head *bh)
3080 {
3081         BUG_ON(!list_empty(&bh->b_assoc_buffers));
3082         kmem_cache_free(bh_cachep, bh);
3083         preempt_disable();
3084         __get_cpu_var(bh_accounting).nr--;
3085         recalc_bh_state();
3086         preempt_enable();
3087 }
3088 EXPORT_SYMBOL(free_buffer_head);
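
/*
 * Illustrative sketch (not part of the original file): building a
 * standalone buffer_head by hand, roughly the way journalling code does.
 * The function name and the page/bdev/blocknr/size arguments are
 * hypothetical; the caller is expected to drop the result with
 * free_buffer_head() once the last reference is gone.
 */
static struct buffer_head *example_make_bh(struct page *page,
                                           struct block_device *bdev,
                                           sector_t blocknr, unsigned size)
{
        struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);

        if (!bh)
                return NULL;
        atomic_set(&bh->b_count, 1);
        set_bh_page(bh, page, 0);       /* point b_data at the page */
        bh->b_size = size;
        bh->b_bdev = bdev;
        bh->b_blocknr = blocknr;
        return bh;
}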
3089
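/*
 * Slab constructor for the buffer_head cache: it runs when the cache adds
 * a new slab of objects, not on every alloc_buffer_head().  Because
 * free_buffer_head() insists that b_assoc_buffers is empty, the constructed
 * state remains valid across free/alloc cycles.
 */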
3090 static void
3091 init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
3092 {
3093         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
3094                             SLAB_CTOR_CONSTRUCTOR) {
3095                 struct buffer_head * bh = (struct buffer_head *)data;
3096
3097                 memset(bh, 0, sizeof(*bh));
3098                 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3099         }
3100 }
3101
3102 #ifdef CONFIG_HOTPLUG_CPU
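/*
 * When a CPU goes offline, release the buffer_head references held in that
 * CPU's per-cpu LRU (bh_lrus) so the buffers do not stay pinned.
 */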
3103 static void buffer_exit_cpu(int cpu)
3104 {
3105         int i;
3106         struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3107
3108         for (i = 0; i < BH_LRU_SIZE; i++) {
3109                 brelse(b->bhs[i]);
3110                 b->bhs[i] = NULL;
3111         }
3112 }
3113
3114 static int buffer_cpu_notify(struct notifier_block *self,
3115                               unsigned long action, void *hcpu)
3116 {
3117         if (action == CPU_DEAD)
3118                 buffer_exit_cpu((unsigned long)hcpu);
3119         return NOTIFY_OK;
3120 }
3121 #endif /* CONFIG_HOTPLUG_CPU */
3122
3123 void __init buffer_init(void)
3124 {
3125         int i;
3126         int nrpages;
3127
3128         bh_cachep = kmem_cache_create("buffer_head",
3129                         sizeof(struct buffer_head), 0,
3130                         SLAB_PANIC, init_buffer_head, NULL);
3131         for (i = 0; i < ARRAY_SIZE(bh_wait_queue_heads); i++)
3132                 init_waitqueue_head(&bh_wait_queue_heads[i].wqh);
3133
3134         /*
3135          * Limit the bh occupancy to 10% of ZONE_NORMAL
3136          */
3137         nrpages = (nr_free_buffer_pages() * 10) / 100;
3138         max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
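        /*
         * Worked example (hypothetical numbers): with PAGE_SIZE = 4096,
         * a 64-byte struct buffer_head and 100000 free buffer pages,
         * nrpages = 10000 and max_buffer_heads = 10000 * 64 = 640000.
         */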
3139         hotcpu_notifier(buffer_cpu_notify, 0);
3140 }
3141
3142 EXPORT_SYMBOL(__bforget);
3143 EXPORT_SYMBOL(__brelse);
3144 EXPORT_SYMBOL(__wait_on_buffer);
3145 EXPORT_SYMBOL(block_commit_write);
3146 EXPORT_SYMBOL(block_prepare_write);
3147 EXPORT_SYMBOL(block_read_full_page);
3148 EXPORT_SYMBOL(block_sync_page);
3149 EXPORT_SYMBOL(block_truncate_page);
3150 EXPORT_SYMBOL(block_write_full_page);
3151 EXPORT_SYMBOL(cont_prepare_write);
3152 EXPORT_SYMBOL(end_buffer_async_write);
3153 EXPORT_SYMBOL(end_buffer_read_sync);
3154 EXPORT_SYMBOL(end_buffer_write_sync);
3155 EXPORT_SYMBOL(file_fsync);
3156 EXPORT_SYMBOL(fsync_bdev);
3157 EXPORT_SYMBOL(generic_block_bmap);
3158 EXPORT_SYMBOL(generic_commit_write);
3159 EXPORT_SYMBOL(generic_cont_expand);
3160 EXPORT_SYMBOL(init_buffer);
3161 EXPORT_SYMBOL(invalidate_bdev);
3162 EXPORT_SYMBOL(ll_rw_block);
3163 EXPORT_SYMBOL(mark_buffer_dirty);
3164 EXPORT_SYMBOL(submit_bh);
3165 EXPORT_SYMBOL(sync_dirty_buffer);
3166 EXPORT_SYMBOL(unlock_buffer);