fedora core 6 1.2949 + vserver 2.2.0
[linux-2.6.git] / drivers / block / rd.c
index 3dd9163..485aa87 100644 (file)
  * and set blk_size for -ENOSPC,     Werner Fink <werner@suse.de>, Apr '99
  */
 
-#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <asm/atomic.h>
 #include <linux/bio.h>
 #include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/init.h>
-#include <linux/devfs_fs_kernel.h>
 #include <linux/pagemap.h>
 #include <linux/blkdev.h>
 #include <linux/genhd.h>
-#include <linux/bio.h>
 #include <linux/buffer_head.h>         /* for invalidate_bdev() */
 #include <linux/backing-dev.h>
 #include <linux/blkpg.h>
-#include <asm/uaccess.h>
+#include <linux/writeback.h>
 
-/* The RAM disk size is now a parameter */
-#define NUM_RAMDISKS 16                /* This cannot be overridden (yet) */
+#include <asm/uaccess.h>
 
 /* Various static variables go here.  Most are used only in the RAM disk code.
  */
 
-static struct gendisk *rd_disks[NUM_RAMDISKS];
-static struct block_device *rd_bdev[NUM_RAMDISKS];/* Protected device data */
-static struct request_queue *rd_queue[NUM_RAMDISKS];
+static struct gendisk *rd_disks[CONFIG_BLK_DEV_RAM_COUNT];
+static struct block_device *rd_bdev[CONFIG_BLK_DEV_RAM_COUNT];/* Protected device data */
+static struct request_queue *rd_queue[CONFIG_BLK_DEV_RAM_COUNT];
 
 /*
  * Parameters for the boot-loading of the RAM disk.  These are set by
@@ -87,23 +84,53 @@ int rd_size = CONFIG_BLK_DEV_RAM_SIZE;              /* Size of the RAM disks */
  * behaviour. The default is still BLOCK_SIZE (needed by rd_load_image that
  * supposes the filesystem in the image uses a BLOCK_SIZE blocksize).
  */
-int rd_blocksize = BLOCK_SIZE;                 /* blocksize of the RAM disks */
+static int rd_blocksize = CONFIG_BLK_DEV_RAM_BLOCKSIZE;
 
 /*
  * Copyright (C) 2000 Linus Torvalds.
  *               2000 Transmeta Corp.
  * aops copied from ramfs.
  */
-static int ramdisk_readpage(struct file *file, struct page *page)
-{
-       if (!PageUptodate(page)) {
-               void *kaddr = kmap_atomic(page, KM_USER0);
 
-               memset(kaddr, 0, PAGE_CACHE_SIZE);
-               flush_dcache_page(page);
-               kunmap_atomic(kaddr, KM_USER0);
-               SetPageUptodate(page);
+/*
+ * If a ramdisk page has buffers, some may be uptodate and some may be not.
+ * To bring the page uptodate we zero out the non-uptodate buffers.  The
+ * page must be locked.
+ */
+static void make_page_uptodate(struct page *page)
+{
+       if (page_has_buffers(page)) {
+               struct buffer_head *bh = page_buffers(page);
+               struct buffer_head *head = bh;
+
+               do {
+                       if (!buffer_uptodate(bh)) {
+                               memset(bh->b_data, 0, bh->b_size);
+                               /*
+                                * akpm: I'm totally undecided about this.  The
+                                * buffer has just been magically brought "up to
+                                * date", but nobody should want to be reading
+                                * it anyway, because it hasn't been used for
+                                * anything yet.  It is still in a "not read
+                                * from disk yet" state.
+                                *
+                                * But non-uptodate buffers against an uptodate
+                                * page are against the rules.  So do it anyway.
+                                */
+                                set_buffer_uptodate(bh);
+                       }
+               } while ((bh = bh->b_this_page) != head);
+       } else {
+               memset(page_address(page), 0, PAGE_CACHE_SIZE);
        }
+       flush_dcache_page(page);
+       SetPageUptodate(page);
+}
+
+static int ramdisk_readpage(struct file *file, struct page *page)
+{
+       if (!PageUptodate(page))
+               make_page_uptodate(page);
        unlock_page(page);
        return 0;
 }
@@ -111,34 +138,70 @@ static int ramdisk_readpage(struct file *file, struct page *page)
 static int ramdisk_prepare_write(struct file *file, struct page *page,
                                unsigned offset, unsigned to)
 {
-       if (!PageUptodate(page)) {
-               void *kaddr = kmap_atomic(page, KM_USER0);
-
-               memset(kaddr, 0, PAGE_CACHE_SIZE);
-               flush_dcache_page(page);
-               kunmap_atomic(kaddr, KM_USER0);
-               SetPageUptodate(page);
-       }
-       SetPageDirty(page);
+       if (!PageUptodate(page))
+               make_page_uptodate(page);
        return 0;
 }
 
 static int ramdisk_commit_write(struct file *file, struct page *page,
                                unsigned offset, unsigned to)
 {
+       set_page_dirty(page);
+       return 0;
+}
+
+/*
+ * ->writepage to the blockdev's mapping has to redirty the page so that the
+ * VM doesn't go and steal it.  We return AOP_WRITEPAGE_ACTIVATE so that the VM
+ * won't try to (pointlessly) write the page again for a while.
+ *
+ * Really, these pages should not be on the LRU at all.
+ */
+static int ramdisk_writepage(struct page *page, struct writeback_control *wbc)
+{
+       if (!PageUptodate(page))
+               make_page_uptodate(page);
+       SetPageDirty(page);
+       if (wbc->for_reclaim)
+               return AOP_WRITEPAGE_ACTIVATE;
+       unlock_page(page);
+       return 0;
+}
+
+/*
+ * This is a little speedup thing: short-circuit attempts to write back the
+ * ramdisk blockdev inode to its non-existent backing store.
+ */
+static int ramdisk_writepages(struct address_space *mapping,
+                               struct writeback_control *wbc)
+{
+       return 0;
+}
+
+/*
+ * ramdisk blockdev pages have their own ->set_page_dirty() because we don't
+ * want them to contribute to dirty memory accounting.
+ */
+static int ramdisk_set_page_dirty(struct page *page)
+{
+       if (!TestSetPageDirty(page))
+               return 1;
        return 0;
 }
 
-static struct address_space_operations ramdisk_aops = {
-       .readpage = ramdisk_readpage,
-       .prepare_write = ramdisk_prepare_write,
-       .commit_write = ramdisk_commit_write,
+static const struct address_space_operations ramdisk_aops = {
+       .readpage       = ramdisk_readpage,
+       .prepare_write  = ramdisk_prepare_write,
+       .commit_write   = ramdisk_commit_write,
+       .writepage      = ramdisk_writepage,
+       .set_page_dirty = ramdisk_set_page_dirty,
+       .writepages     = ramdisk_writepages,
 };
 
 static int rd_blkdev_pagecache_IO(int rw, struct bio_vec *vec, sector_t sector,
                                struct address_space *mapping)
 {
-       unsigned long index = sector >> (PAGE_CACHE_SHIFT - 9);
+       pgoff_t index = sector >> (PAGE_CACHE_SHIFT - 9);
        unsigned int vec_offset = vec->bv_offset;
        int offset = (sector << 9) & ~PAGE_CACHE_MASK;
        int size = vec->bv_len;
@@ -146,60 +209,47 @@ static int rd_blkdev_pagecache_IO(int rw, struct bio_vec *vec, sector_t sector,
 
        do {
                int count;
-               struct page * page;
-               char * src, * dst;
-               int unlock = 0;
+               struct page *page;
+               char *src;
+               char *dst;
 
                count = PAGE_CACHE_SIZE - offset;
                if (count > size)
                        count = size;
                size -= count;
 
-               page = find_get_page(mapping, index);
+               page = grab_cache_page(mapping, index);
                if (!page) {
-                       page = grab_cache_page(mapping, index);
                        err = -ENOMEM;
-                       if (!page)
-                               goto out;
-                       err = 0;
-
-                       if (!PageUptodate(page)) {
-                               void *kaddr = kmap_atomic(page, KM_USER0);
-
-                               memset(kaddr, 0, PAGE_CACHE_SIZE);
-                               flush_dcache_page(page);
-                               kunmap_atomic(kaddr, KM_USER0);
-                               SetPageUptodate(page);
-                       }
-
-                       unlock = 1;
+                       goto out;
                }
 
+               if (!PageUptodate(page))
+                       make_page_uptodate(page);
+
                index++;
 
                if (rw == READ) {
-                       src = kmap(page) + offset;
-                       dst = kmap(vec->bv_page) + vec_offset;
+                       src = kmap_atomic(page, KM_USER0) + offset;
+                       dst = kmap_atomic(vec->bv_page, KM_USER1) + vec_offset;
                } else {
-                       dst = kmap(page) + offset;
-                       src = kmap(vec->bv_page) + vec_offset;
+                       src = kmap_atomic(vec->bv_page, KM_USER0) + vec_offset;
+                       dst = kmap_atomic(page, KM_USER1) + offset;
                }
                offset = 0;
                vec_offset += count;
 
                memcpy(dst, src, count);
 
-               kunmap(page);
-               kunmap(vec->bv_page);
+               kunmap_atomic(src, KM_USER0);
+               kunmap_atomic(dst, KM_USER1);
 
-               if (rw == READ) {
+               if (rw == READ)
                        flush_dcache_page(vec->bv_page);
-               } else {
-                       SetPageDirty(page);
-               }
-               if (unlock)
-                       unlock_page(page);
-               __free_page(page);
+               else
+                       set_page_dirty(page);
+               unlock_page(page);
+               put_page(page);
        } while (size);
 
  out:
@@ -251,7 +301,7 @@ static int rd_ioctl(struct inode *inode, struct file *file,
        struct block_device *bdev = inode->i_bdev;
 
        if (cmd != BLKFLSBUF)
-               return -EINVAL;
+               return -ENOTTY;
 
        /*
         * special: we want to release the ramdisk memory, it's not like with
@@ -259,37 +309,78 @@ static int rd_ioctl(struct inode *inode, struct file *file,
         * cache
         */
        error = -EBUSY;
-       down(&bdev->bd_sem);
+       mutex_lock(&bdev->bd_mutex);
        if (bdev->bd_openers <= 2) {
                truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
                error = 0;
        }
-       up(&bdev->bd_sem);
+       mutex_unlock(&bdev->bd_mutex);
        return error;
 }
 
+/*
+ * This is the backing_dev_info for the blockdev inode itself.  It doesn't need
+ * writeback and it does not contribute to dirty memory accounting.
+ */
 static struct backing_dev_info rd_backing_dev_info = {
        .ra_pages       = 0,    /* No readahead */
-       .memory_backed  = 1,    /* Does not contribute to dirty memory */
-       .unplug_io_fn = default_unplug_io_fn,
+       .capabilities   = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK | BDI_CAP_MAP_COPY,
+       .unplug_io_fn   = default_unplug_io_fn,
+};
+
+/*
+ * This is the backing_dev_info for the files which live atop the ramdisk
+ * "device".  These files do need writeback and they do contribute to dirty
+ * memory accounting.
+ */
+static struct backing_dev_info rd_file_backing_dev_info = {
+       .ra_pages       = 0,    /* No readahead */
+       .capabilities   = BDI_CAP_MAP_COPY,     /* Does contribute to dirty memory */
+       .unplug_io_fn   = default_unplug_io_fn,
 };
 
 static int rd_open(struct inode *inode, struct file *filp)
 {
        unsigned unit = iminor(inode);
 
-       /*
-        * Immunize device against invalidate_buffers() and prune_icache().
-        */
        if (rd_bdev[unit] == NULL) {
                struct block_device *bdev = inode->i_bdev;
+               struct address_space *mapping;
+               unsigned bsize;
+               gfp_t gfp_mask;
+
                inode = igrab(bdev->bd_inode);
                rd_bdev[unit] = bdev;
                bdev->bd_openers++;
-               bdev->bd_block_size = rd_blocksize;
-               inode->i_size = get_capacity(rd_disks[unit])<<9;
-               inode->i_mapping->a_ops = &ramdisk_aops;
-               inode->i_mapping->backing_dev_info = &rd_backing_dev_info;
+               bsize = bdev_hardsect_size(bdev);
+               bdev->bd_block_size = bsize;
+               inode->i_blkbits = blksize_bits(bsize);
+               inode->i_size = get_capacity(bdev->bd_disk)<<9;
+
+               mapping = inode->i_mapping;
+               mapping->a_ops = &ramdisk_aops;
+               mapping->backing_dev_info = &rd_backing_dev_info;
+               bdev->bd_inode_backing_dev_info = &rd_file_backing_dev_info;
+
+               /*
+                * Deep badness.  rd_blkdev_pagecache_IO() needs to allocate
+                * pagecache pages within a request_fn.  We cannot recur back
+                * into the filesystem which is mounted atop the ramdisk, because
+                * that would deadlock on fs locks.  And we really don't want
+                * to reenter rd_blkdev_pagecache_IO when we're already within
+                * that function.
+                *
+                * So we turn off __GFP_FS and __GFP_IO.
+                *
+                * And to give this thing a hope of working, turn on __GFP_HIGH.
+                * Hopefully, there's enough regular memory allocation going on
+                * for the page allocator emergency pools to keep the ramdisk
+                * driver happy.
+                */
+               gfp_mask = mapping_gfp_mask(mapping);
+               gfp_mask &= ~(__GFP_FS|__GFP_IO);
+               gfp_mask |= __GFP_HIGH;
+               mapping_set_gfp_mask(mapping, gfp_mask);
        }
 
        return 0;
@@ -308,7 +399,7 @@ static void __exit rd_cleanup(void)
 {
        int i;
 
-       for (i = 0; i < NUM_RAMDISKS; i++) {
+       for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
                struct block_device *bdev = rd_bdev[i];
                rd_bdev[i] = NULL;
                if (bdev) {
@@ -319,7 +410,6 @@ static void __exit rd_cleanup(void)
                put_disk(rd_disks[i]);
                blk_cleanup_queue(rd_queue[i]);
        }
-       devfs_remove("rd");
        unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
 }
 
@@ -338,10 +428,16 @@ static int __init rd_init(void)
                rd_blocksize = BLOCK_SIZE;
        }
 
-       for (i = 0; i < NUM_RAMDISKS; i++) {
+       for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
                rd_disks[i] = alloc_disk(1);
                if (!rd_disks[i])
                        goto out;
+
+               rd_queue[i] = blk_alloc_queue(GFP_KERNEL);
+               if (!rd_queue[i]) {
+                       put_disk(rd_disks[i]);
+                       goto out;
+               }
        }
 
        if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) {
@@ -349,16 +445,11 @@ static int __init rd_init(void)
                goto out;
        }
 
-       devfs_mk_dir("rd");
-
-       for (i = 0; i < NUM_RAMDISKS; i++) {
+       for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
                struct gendisk *disk = rd_disks[i];
 
-               rd_queue[i] = blk_alloc_queue(GFP_KERNEL);
-               if (!rd_queue[i])
-                       goto out_queue;
-
                blk_queue_make_request(rd_queue[i], &rd_make_request);
+               blk_queue_hardsect_size(rd_queue[i], rd_blocksize);
 
                /* rd_size is given in kB */
                disk->major = RAMDISK_MAJOR;
@@ -367,7 +458,6 @@ static int __init rd_init(void)
                disk->queue = rd_queue[i];
                disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
                sprintf(disk->disk_name, "ram%d", i);
-               sprintf(disk->devfs_name, "rd/%d", i);
                set_capacity(disk, rd_size * 2);
                add_disk(rd_disks[i]);
        }
@@ -375,11 +465,9 @@ static int __init rd_init(void)
        /* rd_size is given in kB */
        printk("RAMDISK driver initialized: "
                "%d RAM disks of %dK size %d blocksize\n",
-               NUM_RAMDISKS, rd_size, rd_blocksize);
+               CONFIG_BLK_DEV_RAM_COUNT, rd_size, rd_blocksize);
 
        return 0;
-out_queue:
-       unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
 out:
        while (i--) {
                put_disk(rd_disks[i]);
@@ -413,9 +501,10 @@ __setup("ramdisk_blocksize=", ramdisk_blocksize);
 #endif
 
 /* options - modular */
-MODULE_PARM     (rd_size, "1i");
+module_param(rd_size, int, 0);
 MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
-MODULE_PARM     (rd_blocksize, "i");
+module_param(rd_blocksize, int, 0);
 MODULE_PARM_DESC(rd_blocksize, "Blocksize of each RAM disk in bytes.");
+MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
 
 MODULE_LICENSE("GPL");