Merge to Fedora kernel-2.6.18-1.2255_FC5 patched with stable patch-2.6.18.5-vs2.0...
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3efc2ea..c774121 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
  * Support up to 256 loop devices
  * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
  *
+ * Support for falling back on the write file operation when the address space
+ * operations prepare_write and/or commit_write are not available on the
+ * backing filesystem.
+ * Anton Altaparmakov, 16 Feb 2005
+ *
  * Still To Fix:
  * - Advisory locking is ignored here.
  * - Should use an own CAP_* category instead of CAP_SYS_ADMIN
  *
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/sched.h>
@@ -58,7 +62,6 @@
 #include <linux/blkdev.h>
 #include <linux/blkpg.h>
 #include <linux/init.h>
-#include <linux/devfs_fs_kernel.h>
 #include <linux/smp_lock.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
@@ -67,6 +70,8 @@
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>         /* for invalidate_bdev() */
 #include <linux/completion.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
 
 #include <asm/uaccess.h>
 
@@ -127,7 +132,7 @@ static int transfer_xor(struct loop_device *lo, int cmd,
 
 static int xor_init(struct loop_device *lo, const struct loop_info64 *info)
 {
-       if (info->lo_encrypt_key_size <= 0)
+       if (unlikely(info->lo_encrypt_key_size <= 0))
                return -EINVAL;
        return 0;
 }
@@ -173,7 +178,7 @@ figure_loop_size(struct loop_device *lo)
        loff_t size = get_loop_size(lo, lo->lo_backing_file);
        sector_t x = (sector_t)size;
 
-       if ((loff_t)x != size)
+       if (unlikely((loff_t)x != size))
                return -EFBIG;
 
        set_capacity(disks[lo->lo_number], x);
@@ -186,48 +191,57 @@ lo_do_transfer(struct loop_device *lo, int cmd,
               struct page *lpage, unsigned loffs,
               int size, sector_t rblock)
 {
-       if (!lo->transfer)
+       if (unlikely(!lo->transfer))
                return 0;
 
        return lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock);
 }
 
-static int
-do_lo_send(struct loop_device *lo, struct bio_vec *bvec, int bsize, loff_t pos)
+/**
+ * do_lo_send_aops - helper for writing data to a loop device
+ *
+ * This is the fast version for backing filesystems which implement the address
+ * space operations prepare_write and commit_write.
+ */
+static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
+               int bsize, loff_t pos, struct page *page)
 {
        struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
        struct address_space *mapping = file->f_mapping;
-       struct address_space_operations *aops = mapping->a_ops;
-       struct page *page;
+       const struct address_space_operations *aops = mapping->a_ops;
        pgoff_t index;
-       unsigned size, offset, bv_offs;
-       int len;
-       int ret = 0;
+       unsigned offset, bv_offs;
+       int len, ret;
 
-       down(&mapping->host->i_sem);
+       mutex_lock(&mapping->host->i_mutex);
        index = pos >> PAGE_CACHE_SHIFT;
        offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1);
        bv_offs = bvec->bv_offset;
        len = bvec->bv_len;
        while (len > 0) {
                sector_t IV;
+               unsigned size;
                int transfer_result;
 
                IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
-
                size = PAGE_CACHE_SIZE - offset;
                if (size > len)
                        size = len;
-
                page = grab_cache_page(mapping, index);
-               if (!page)
+               if (unlikely(!page))
                        goto fail;
-               if (aops->prepare_write(file, page, offset, offset+size))
+               ret = aops->prepare_write(file, page, offset,
+                                         offset + size);
+               if (unlikely(ret)) {
+                       if (ret == AOP_TRUNCATED_PAGE) {
+                               page_cache_release(page);
+                               continue;
+                       }
                        goto unlock;
+               }
                transfer_result = lo_do_transfer(lo, WRITE, page, offset,
-                                                bvec->bv_page, bv_offs,
-                                                size, IV);
-               if (transfer_result) {
+                               bvec->bv_page, bv_offs, size, IV);
+               if (unlikely(transfer_result)) {
                        char *kaddr;
 
                        /*
@@ -241,9 +255,16 @@ do_lo_send(struct loop_device *lo, struct bio_vec *bvec, int bsize, loff_t pos)
                        kunmap_atomic(kaddr, KM_USER0);
                }
                flush_dcache_page(page);
-               if (aops->commit_write(file, page, offset, offset+size))
+               ret = aops->commit_write(file, page, offset,
+                                        offset + size);
+               if (unlikely(ret)) {
+                       if (ret == AOP_TRUNCATED_PAGE) {
+                               page_cache_release(page);
+                               continue;
+                       }
                        goto unlock;
-               if (transfer_result)
+               }
+               if (unlikely(transfer_result))
                        goto unlock;
                bv_offs += size;
                len -= size;
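
Both aops calls in do_lo_send_aops() can now return AOP_TRUNCATED_PAGE, meaning the backing file was truncated while we held a reference and the aop has already unlocked the page; the right reaction is to drop the reference and redo the same chunk. A minimal sketch of that retry, pulled out into a hypothetical helper (lo_prepare_page() is not part of the patch; file, index, offset and size are assumed to be set up as in the loop above):

#include <linux/fs.h>
#include <linux/pagemap.h>

/*
 * Sketch only: isolates the AOP_TRUNCATED_PAGE retry used twice above.
 * On AOP_TRUNCATED_PAGE the aop has already unlocked the (now truncated)
 * page, so we only drop our reference and look the page up again.
 */
static struct page *lo_prepare_page(struct file *file, pgoff_t index,
				    unsigned offset, unsigned size)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page *page;
	int ret;

	for (;;) {
		page = grab_cache_page(mapping, index);	/* returned locked */
		if (!page)
			return NULL;
		ret = aops->prepare_write(file, page, offset, offset + size);
		if (!ret)
			return page;		/* caller copies data in */
		if (ret == AOP_TRUNCATED_PAGE) {
			page_cache_release(page);
			continue;		/* retry the same chunk */
		}
		unlock_page(page);		/* genuine error */
		page_cache_release(page);
		return NULL;
	}
}
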
@@ -253,32 +274,126 @@ do_lo_send(struct loop_device *lo, struct bio_vec *bvec, int bsize, loff_t pos)
                unlock_page(page);
                page_cache_release(page);
        }
-       up(&mapping->host->i_sem);
+       ret = 0;
 out:
+       mutex_unlock(&mapping->host->i_mutex);
        return ret;
-
 unlock:
        unlock_page(page);
        page_cache_release(page);
 fail:
-       up(&mapping->host->i_sem);
        ret = -1;
        goto out;
 }
 
-static int
-lo_send(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
+/**
+ * __do_lo_send_write - helper for writing data to a loop device
+ *
+ * This helper just factors out common code between do_lo_send_direct_write()
+ * and do_lo_send_write().
+ */
+static int __do_lo_send_write(struct file *file,
+               u8 __user *buf, const int len, loff_t pos)
 {
+       ssize_t bw;
+       mm_segment_t old_fs = get_fs();
+
+       set_fs(get_ds());
+       bw = file->f_op->write(file, buf, len, &pos);
+       set_fs(old_fs);
+       if (likely(bw == len))
+               return 0;
+       printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n",
+                       (unsigned long long)pos, len);
+       if (bw >= 0)
+               bw = -EIO;
+       return bw;
+}
+
+/**
+ * do_lo_send_direct_write - helper for writing data to a loop device
+ *
+ * This is the fast, non-transforming version for backing filesystems which do
+ * not implement the address space operations prepare_write and commit_write.
+ * It uses the write file operation which should be present on all writeable
+ * filesystems.
+ */
+static int do_lo_send_direct_write(struct loop_device *lo,
+               struct bio_vec *bvec, int bsize, loff_t pos, struct page *page)
+{
+       ssize_t bw = __do_lo_send_write(lo->lo_backing_file,
+                       (u8 __user *)kmap(bvec->bv_page) + bvec->bv_offset,
+                       bvec->bv_len, pos);
+       kunmap(bvec->bv_page);
+       cond_resched();
+       return bw;
+}
+
+/**
+ * do_lo_send_write - helper for writing data to a loop device
+ *
+ * This is the slow, transforming version for filesystems which do not
+ * implement the address space operations prepare_write and commit_write.  It
+ * uses the write file operation which should be present on all writeable
+ * filesystems.
+ *
+ * Using fops->write is slower than using aops->{prepare,commit}_write in the
+ * transforming case because we need to double buffer the data as we cannot do
+ * the transformations in place as we do not have direct access to the
+ * destination pages of the backing file.
+ */
+static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
+               int bsize, loff_t pos, struct page *page)
+{
+       int ret = lo_do_transfer(lo, WRITE, page, 0, bvec->bv_page,
+                       bvec->bv_offset, bvec->bv_len, pos >> 9);
+       if (likely(!ret))
+               return __do_lo_send_write(lo->lo_backing_file,
+                               (u8 __user *)page_address(page), bvec->bv_len,
+                               pos);
+       printk(KERN_ERR "loop: Transfer error at byte offset %llu, "
+                       "length %i.\n", (unsigned long long)pos, bvec->bv_len);
+       if (ret > 0)
+               ret = -EIO;
+       return ret;
+}
+
+static int lo_send(struct loop_device *lo, struct bio *bio, int bsize,
+               loff_t pos)
+{
+       int (*do_lo_send)(struct loop_device *, struct bio_vec *, int, loff_t,
+                       struct page *page);
        struct bio_vec *bvec;
+       struct page *page = NULL;
        int i, ret = 0;
 
+       do_lo_send = do_lo_send_aops;
+       if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) {
+               do_lo_send = do_lo_send_direct_write;
+               if (lo->transfer != transfer_none) {
+                       page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
+                       if (unlikely(!page))
+                               goto fail;
+                       kmap(page);
+                       do_lo_send = do_lo_send_write;
+               }
+       }
        bio_for_each_segment(bvec, bio, i) {
-               ret = do_lo_send(lo, bvec, bsize, pos);
+               ret = do_lo_send(lo, bvec, bsize, pos, page);
                if (ret < 0)
                        break;
                pos += bvec->bv_len;
        }
+       if (page) {
+               kunmap(page);
+               __free_page(page);
+       }
+out:
        return ret;
+fail:
+       printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n");
+       ret = -ENOMEM;
+       goto out;
 }
 
 struct lo_read_data {
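
__do_lo_send_write() has to hand a kernel buffer to the backing file's ->write(), which normally rejects anything above the current user address limit, so it widens the limit with set_fs(get_ds()) for the duration of the call and restores it afterwards. A stand-alone sketch of that pattern (kernel_write_buf() is a hypothetical name, not a helper added by the patch):

#include <linux/fs.h>
#include <asm/uaccess.h>

/* Hypothetical example of writing a kernel buffer through a file's write
 * file operation, as __do_lo_send_write() does above. */
static ssize_t kernel_write_buf(struct file *file, const void *buf,
				size_t len, loff_t pos)
{
	mm_segment_t old_fs = get_fs();		/* remember caller's limit */
	ssize_t bw;

	set_fs(get_ds());			/* accept kernel addresses */
	bw = file->f_op->write(file, (const char __user *)buf, len, &pos);
	set_fs(old_fs);				/* always restore */
	return bw;
}
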
@@ -368,17 +483,11 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
  */
 static void loop_add_bio(struct loop_device *lo, struct bio *bio)
 {
-       unsigned long flags;
-
-       spin_lock_irqsave(&lo->lo_lock, flags);
        if (lo->lo_biotail) {
                lo->lo_biotail->bi_next = bio;
                lo->lo_biotail = bio;
        } else
                lo->lo_bio = lo->lo_biotail = bio;
-       spin_unlock_irqrestore(&lo->lo_lock, flags);
-
-       up(&lo->lo_bh_mutex);
 }
 
 /*
@@ -388,14 +497,12 @@ static struct bio *loop_get_bio(struct loop_device *lo)
 {
        struct bio *bio;
 
-       spin_lock_irq(&lo->lo_lock);
        if ((bio = lo->lo_bio)) {
                if (bio == lo->lo_biotail)
                        lo->lo_biotail = NULL;
                lo->lo_bio = bio->bi_next;
                bio->bi_next = NULL;
        }
-       spin_unlock_irq(&lo->lo_lock);
 
        return bio;
 }
@@ -405,35 +512,28 @@ static int loop_make_request(request_queue_t *q, struct bio *old_bio)
        struct loop_device *lo = q->queuedata;
        int rw = bio_rw(old_bio);
 
-       if (!lo)
-               goto out;
+       if (rw == READA)
+               rw = READ;
+
+       BUG_ON(!lo || (rw != READ && rw != WRITE));
 
        spin_lock_irq(&lo->lo_lock);
        if (lo->lo_state != Lo_bound)
-               goto inactive;
-       atomic_inc(&lo->lo_pending);
-       spin_unlock_irq(&lo->lo_lock);
-
-       if (rw == WRITE) {
-               if (lo->lo_flags & LO_FLAGS_READ_ONLY)
-                       goto err;
-       } else if (rw == READA) {
-               rw = READ;
-       } else if (rw != READ) {
-               printk(KERN_ERR "loop: unknown command (%x)\n", rw);
-               goto err;
-       }
+               goto out;
+       if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
+               goto out;
+       lo->lo_pending++;
        loop_add_bio(lo, old_bio);
+       spin_unlock_irq(&lo->lo_lock);
+       complete(&lo->lo_bh_done);
        return 0;
-err:
-       if (atomic_dec_and_test(&lo->lo_pending))
-               up(&lo->lo_bh_mutex);
+
 out:
+       if (lo->lo_pending == 0)
+               complete(&lo->lo_bh_done);
+       spin_unlock_irq(&lo->lo_lock);
        bio_io_error(old_bio, old_bio->bi_size);
        return 0;
-inactive:
-       spin_unlock_irq(&lo->lo_lock);
-       goto out;
 }
 
 /*
@@ -456,13 +556,11 @@ static void do_loop_switch(struct loop_device *, struct switch_request *);
 
 static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
 {
-       int ret;
-
        if (unlikely(!bio->bi_bdev)) {
                do_loop_switch(lo, bio->bi_private);
                bio_put(bio);
        } else {
-               ret = do_bio_filebacked(lo, bio);
+               int ret = do_bio_filebacked(lo, bio);
                bio_endio(bio, bio->bi_size, ret);
        }
 }
@@ -490,38 +588,46 @@ static int loop_thread(void *data)
        set_user_nice(current, -20);
 
        lo->lo_state = Lo_bound;
-       atomic_inc(&lo->lo_pending);
+       lo->lo_pending = 1;
 
        /*
-        * up sem, we are running
+        * complete it, we are running
         */
-       up(&lo->lo_sem);
+       complete(&lo->lo_done);
 
        for (;;) {
-               down_interruptible(&lo->lo_bh_mutex);
+               int pending;
+
+               if (wait_for_completion_interruptible(&lo->lo_bh_done))
+                       continue;
+
+               spin_lock_irq(&lo->lo_lock);
+
                /*
-                * could be upped because of tear-down, not because of
-                * pending work
+                * could be completed because of tear-down, not pending work
                 */
-               if (!atomic_read(&lo->lo_pending))
+               if (unlikely(!lo->lo_pending)) {
+                       spin_unlock_irq(&lo->lo_lock);
                        break;
+               }
 
                bio = loop_get_bio(lo);
-               if (!bio) {
-                       printk("loop: missing bio\n");
-                       continue;
-               }
+               lo->lo_pending--;
+               pending = lo->lo_pending;
+               spin_unlock_irq(&lo->lo_lock);
+
+               BUG_ON(!bio);
                loop_handle_bio(lo, bio);
 
                /*
                 * upped both for pending work and tear-down, lo_pending
                 * will hit zero then
                 */
-               if (atomic_dec_and_test(&lo->lo_pending))
+               if (unlikely(!pending))
                        break;
        }
 
-       up(&lo->lo_sem);
+       complete(&lo->lo_done);
        return 0;
 }
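
The lo_sem and lo_bh_mutex semaphores become completions (lo_done, lo_bh_done), the atomic lo_pending becomes a plain int, and all request bookkeeping now happens under lo_lock in loop_make_request() itself, which is why the spin locking disappeared from loop_add_bio() and loop_get_bio(). A stripped-down sketch of the resulting handshake, using hypothetical miniq names rather than the driver's own fields:

#include <linux/completion.h>
#include <linux/spinlock.h>

/* Hypothetical miniature of the submit/worker handshake above. */
struct miniq {
	spinlock_t lock;
	int pending;			/* queued work, plus 1 while running */
	struct completion ready;	/* kicked for new work and teardown */
};

static void miniq_submit(struct miniq *q)
{
	spin_lock_irq(&q->lock);
	q->pending++;
	/* ...queue the actual work item here... */
	spin_unlock_irq(&q->lock);
	complete(&q->ready);		/* wake the worker thread */
}

static int miniq_thread(void *data)
{
	struct miniq *q = data;
	int pending;

	for (;;) {
		if (wait_for_completion_interruptible(&q->ready))
			continue;
		spin_lock_irq(&q->lock);
		if (!q->pending) {	/* completed by teardown only */
			spin_unlock_irq(&q->lock);
			break;
		}
		pending = --q->pending;
		spin_unlock_irq(&q->lock);
		/* ...handle one work item here... */
		if (!pending)		/* teardown dropped the last ref */
			break;
	}
	return 0;
}
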
 
@@ -556,7 +662,8 @@ static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
 
        mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
        lo->lo_backing_file = file;
-       lo->lo_blocksize = mapping->host->i_blksize;
+       lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
+               mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
        lo->old_gfp_mask = mapping_gfp_mask(mapping);
        mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
        complete(&p->wait);
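
inode->i_blksize is gone, so the default loop block size is now taken from the backing object itself: the soft block size of the underlying block device, or PAGE_SIZE for a regular file. The same choice is repeated in loop_set_fd() below; condensed into a hypothetical helper (not in the patch) it is simply:

static unsigned loop_default_blocksize(struct inode *inode)
{
	if (S_ISBLK(inode->i_mode))
		return inode->i_bdev->bd_block_size;	/* backing blockdev */
	return PAGE_SIZE;				/* regular file */
}
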
@@ -584,7 +691,7 @@ static int loop_change_fd(struct loop_device *lo, struct file *lo_file,
 
        /* the loop device has to be read-only */
        error = -EINVAL;
-       if (lo->lo_flags != LO_FLAGS_READ_ONLY)
+       if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
                goto out;
 
        error = -EBADF;
@@ -676,18 +783,21 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 
        error = -EINVAL;
        if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) {
-               struct address_space_operations *aops = mapping->a_ops;
+               const struct address_space_operations *aops = mapping->a_ops;
                /*
                 * If we can't read - sorry. If we only can't write - well,
                 * it's going to be read-only.
                 */
                if (!file->f_op->sendfile)
                        goto out_putf;
-
-               if (!aops->prepare_write || !aops->commit_write)
+               if (aops->prepare_write && aops->commit_write)
+                       lo_flags |= LO_FLAGS_USE_AOPS;
+               if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
                        lo_flags |= LO_FLAGS_READ_ONLY;
 
-               lo_blocksize = inode->i_blksize;
+               lo_blocksize = S_ISBLK(inode->i_mode) ?
+                       inode->i_bdev->bd_block_size : PAGE_SIZE;
+
                error = 0;
        } else {
                goto out_putf;
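
loop_set_fd() now records which write path the device can use: LO_FLAGS_USE_AOPS when the backing filesystem provides both prepare_write and commit_write, the plain write fop otherwise, and read-only if even that is missing. Restated as a hypothetical helper (not added by the patch):

/* Hypothetical restatement of the selection done in loop_set_fd() above;
 * lo_send() picks do_lo_send_aops(), do_lo_send_direct_write() or
 * do_lo_send_write() based on these flags. */
static int loop_write_flags(struct file *file)
{
	const struct address_space_operations *aops = file->f_mapping->a_ops;
	int lo_flags = 0;

	if (aops->prepare_write && aops->commit_write)
		lo_flags |= LO_FLAGS_USE_AOPS;		/* fast page-cache path */
	else if (!file->f_op->write)
		lo_flags |= LO_FLAGS_READ_ONLY;		/* no fallback either */
	return lo_flags;
}
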
@@ -709,7 +819,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
        lo->lo_device = bdev;
        lo->lo_flags = lo_flags;
        lo->lo_backing_file = file;
-       lo->transfer = NULL;
+       lo->transfer = transfer_none;
        lo->ioctl = NULL;
        lo->lo_sizelimit = 0;
        lo->old_gfp_mask = mapping_gfp_mask(mapping);
@@ -730,8 +840,10 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 
        set_blocksize(bdev, lo_blocksize);
 
-       kernel_thread(loop_thread, lo, CLONE_KERNEL);
-       down(&lo->lo_sem);
+       error = kernel_thread(loop_thread, lo, CLONE_KERNEL);
+       if (error < 0)
+               goto out_putf;
+       wait_for_completion(&lo->lo_done);
        return 0;
 
  out_putf:
@@ -782,7 +894,7 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
 static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 {
        struct file *filp = lo->lo_backing_file;
-       int gfp = lo->old_gfp_mask;
+       gfp_t gfp = lo->old_gfp_mask;
 
        if (lo->lo_state != Lo_bound)
                return -ENXIO;
@@ -795,11 +907,12 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 
        spin_lock_irq(&lo->lo_lock);
        lo->lo_state = Lo_rundown;
-       if (atomic_dec_and_test(&lo->lo_pending))
-               up(&lo->lo_bh_mutex);
+       lo->lo_pending--;
+       if (!lo->lo_pending)
+               complete(&lo->lo_bh_done);
        spin_unlock_irq(&lo->lo_lock);
 
-       down(&lo->lo_sem);
+       wait_for_completion(&lo->lo_done);
 
        lo->lo_backing_file = NULL;
 
@@ -1034,7 +1147,7 @@ static int lo_ioctl(struct inode * inode, struct file * file,
        struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
        int err;
 
-       down(&lo->lo_ctl_mutex);
+       mutex_lock(&lo->lo_ctl_mutex);
        switch (cmd) {
        case LOOP_SET_FD:
                err = loop_set_fd(lo, file, inode->i_bdev, arg);
@@ -1060,7 +1173,7 @@ static int lo_ioctl(struct inode * inode, struct file * file,
        default:
                err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
        }
-       up(&lo->lo_ctl_mutex);
+       mutex_unlock(&lo->lo_ctl_mutex);
        return err;
 }
 
@@ -1068,9 +1181,9 @@ static int lo_open(struct inode *inode, struct file *file)
 {
        struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
 
-       down(&lo->lo_ctl_mutex);
+       mutex_lock(&lo->lo_ctl_mutex);
        lo->lo_refcnt++;
-       up(&lo->lo_ctl_mutex);
+       mutex_unlock(&lo->lo_ctl_mutex);
 
        return 0;
 }
@@ -1079,9 +1192,9 @@ static int lo_release(struct inode *inode, struct file *file)
 {
        struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
 
-       down(&lo->lo_ctl_mutex);
+       mutex_lock(&lo->lo_ctl_mutex);
        --lo->lo_refcnt;
-       up(&lo->lo_ctl_mutex);
+       mutex_unlock(&lo->lo_ctl_mutex);
 
        return 0;
 }
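
lo_ctl_mutex was a semaphore used purely as a mutex; with the mutex API it is set up with mutex_init() (see loop_init() below) and each down()/up() pair becomes mutex_lock()/mutex_unlock(). A minimal, hypothetical illustration of the converted pattern (the member type change itself lives in the loop_device definition, not in this file's diff):

#include <linux/mutex.h>

struct example_dev {
	struct mutex	ctl_mutex;	/* was: struct semaphore ctl_mutex */
	int		refcnt;
};

/* Hypothetical example only: assumes mutex_init(&dev->ctl_mutex) was done
 * at device setup, as loop_init() does for lo_ctl_mutex below. */
static void example_get(struct example_dev *dev)
{
	mutex_lock(&dev->ctl_mutex);	/* was down(&dev->ctl_mutex) */
	dev->refcnt++;
	mutex_unlock(&dev->ctl_mutex);	/* was up(&dev->ctl_mutex) */
}
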
@@ -1123,12 +1236,12 @@ int loop_unregister_transfer(int number)
        xfer_funcs[n] = NULL;
 
        for (lo = &loop_dev[0]; lo < &loop_dev[max_loop]; lo++) {
-               down(&lo->lo_ctl_mutex);
+               mutex_lock(&lo->lo_ctl_mutex);
 
                if (lo->lo_encryption == xfer)
                        loop_release_xfer(lo);
 
-               up(&lo->lo_ctl_mutex);
+               mutex_unlock(&lo->lo_ctl_mutex);
        }
 
        return 0;
@@ -1137,7 +1250,7 @@ int loop_unregister_transfer(int number)
 EXPORT_SYMBOL(loop_register_transfer);
 EXPORT_SYMBOL(loop_unregister_transfer);
 
-int __init loop_init(void)
+static int __init loop_init(void)
 {
        int     i;
 
@@ -1165,8 +1278,6 @@ int __init loop_init(void)
                        goto out_mem3;
        }
 
-       devfs_mk_dir("loop");
-
        for (i = 0; i < max_loop; i++) {
                struct loop_device *lo = &loop_dev[i];
                struct gendisk *disk = disks[i];
@@ -1175,16 +1286,15 @@ int __init loop_init(void)
                lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
                if (!lo->lo_queue)
                        goto out_mem4;
-               init_MUTEX(&lo->lo_ctl_mutex);
-               init_MUTEX_LOCKED(&lo->lo_sem);
-               init_MUTEX_LOCKED(&lo->lo_bh_mutex);
+               mutex_init(&lo->lo_ctl_mutex);
+               init_completion(&lo->lo_done);
+               init_completion(&lo->lo_bh_done);
                lo->lo_number = i;
                spin_lock_init(&lo->lo_lock);
                disk->major = LOOP_MAJOR;
                disk->first_minor = i;
                disk->fops = &lo_fops;
                sprintf(disk->disk_name, "loop%d", i);
-               sprintf(disk->devfs_name, "loop/%d", i);
                disk->private_data = lo;
                disk->queue = lo->lo_queue;
        }
@@ -1197,8 +1307,7 @@ int __init loop_init(void)
 
 out_mem4:
        while (i--)
-               blk_put_queue(loop_dev[i].lo_queue);
-       devfs_remove("loop");
+               blk_cleanup_queue(loop_dev[i].lo_queue);
        i = max_loop;
 out_mem3:
        while (i--)
@@ -1212,16 +1321,15 @@ out_mem1:
        return -ENOMEM;
 }
 
-void loop_exit(void)
+static void loop_exit(void)
 {
        int i;
 
        for (i = 0; i < max_loop; i++) {
                del_gendisk(disks[i]);
-               blk_put_queue(loop_dev[i].lo_queue);
+               blk_cleanup_queue(loop_dev[i].lo_queue);
                put_disk(disks[i]);
        }
-       devfs_remove("loop");
        if (unregister_blkdev(LOOP_MAJOR, "loop"))
                printk(KERN_WARNING "loop: cannot unregister blkdev\n");