linux 2.6.16.38 w/ vs2.0.3-rc1

[linux-2.6.git] / drivers / block / loop.c
diff --git a/drivers/block/loop.c b/drivers/block/loop.c

index f125094..0d4838a 100644 (file)
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -39,6 +39,11 @@
   * Support up to 256 loop devices
   * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
   *
+ * Support for falling back on the write file operation when the address space
+ * operations prepare_write and/or commit_write are not available on the
+ * backing filesystem.
+ * Anton Altaparmakov, 16 Feb 2005
+ *
   * Still To Fix:
   * - Advisory locking is ignored here.
   * - Should use an own CAP_* category instead of CAP_SYS_ADMIN
@@ -47,7 +52,7 @@
  
  #include <linux/config.h>
  #include <linux/module.h>
-
+#include <linux/moduleparam.h>
  #include <linux/sched.h>
  #include <linux/fs.h>
  #include <linux/file.h>
@@ -67,6 +72,10 @@
  #include <linux/writeback.h>
  #include <linux/buffer_head.h>         /* for invalidate_bdev() */
  #include <linux/completion.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/vs_base.h>
+#include <linux/vs_context.h>
  
  #include <asm/uaccess.h>
  
@@ -127,7 +136,7 @@ static int transfer_xor(struct loop_device *lo, int cmd,
  
  static int xor_init(struct loop_device *lo, const struct loop_info64 *info)
  {
-       if (info->lo_encrypt_key_size <= 0)
+       if (unlikely(info->lo_encrypt_key_size <= 0))
                 return -EINVAL;
         return 0;
  }
@@ -173,7 +182,7 @@ figure_loop_size(struct loop_device *lo)
         loff_t size = get_loop_size(lo, lo->lo_backing_file);
         sector_t x = (sector_t)size;
  
-       if ((loff_t)x != size)
+       if (unlikely((loff_t)x != size))
                 return -EFBIG;
  
         set_capacity(disks[lo->lo_number], x);
@@ -186,48 +195,57 @@ lo_do_transfer(struct loop_device *lo, int cmd,
                struct page *lpage, unsigned loffs,
                int size, sector_t rblock)
  {
-       if (!lo->transfer)
+       if (unlikely(!lo->transfer))
                 return 0;
  
         return lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock);
  }
  
-static int
-do_lo_send(struct loop_device *lo, struct bio_vec *bvec, int bsize, loff_t pos)
+/**
+ * do_lo_send_aops - helper for writing data to a loop device
+ *
+ * This is the fast version for backing filesystems which implement the address
+ * space operations prepare_write and commit_write.
+ */
+static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
+               int bsize, loff_t pos, struct page *page)
  {
         struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
         struct address_space *mapping = file->f_mapping;
         struct address_space_operations *aops = mapping->a_ops;
-       struct page *page;
         pgoff_t index;
-       unsigned size, offset, bv_offs;
-       int len;
-       int ret = 0;
+       unsigned offset, bv_offs;
+       int len, ret;
  
-       down(&mapping->host->i_sem);
+       mutex_lock(&mapping->host->i_mutex);
         index = pos >> PAGE_CACHE_SHIFT;
         offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1);
         bv_offs = bvec->bv_offset;
         len = bvec->bv_len;
         while (len > 0) {
                 sector_t IV;
+               unsigned size;
                 int transfer_result;
  
                 IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
-
                 size = PAGE_CACHE_SIZE - offset;
                 if (size > len)
                         size = len;
-
                 page = grab_cache_page(mapping, index);
-               if (!page)
+               if (unlikely(!page))
                         goto fail;
-               if (aops->prepare_write(file, page, offset, offset+size))
+               ret = aops->prepare_write(file, page, offset,
+                                         offset + size);
+               if (unlikely(ret)) {
+                       if (ret == AOP_TRUNCATED_PAGE) {
+                               page_cache_release(page);
+                               continue;
+                       }
                         goto unlock;
+               }
                 transfer_result = lo_do_transfer(lo, WRITE, page, offset,
-                                                bvec->bv_page, bv_offs,
-                                                size, IV);
-               if (transfer_result) {
+                               bvec->bv_page, bv_offs, size, IV);
+               if (unlikely(transfer_result)) {
                         char *kaddr;
  
                         /*
@@ -241,9 +259,16 @@ do_lo_send(struct loop_device *lo, struct bio_vec *bvec, int bsize, loff_t pos)
                         kunmap_atomic(kaddr, KM_USER0);
                 }
                 flush_dcache_page(page);
-               if (aops->commit_write(file, page, offset, offset+size))
+               ret = aops->commit_write(file, page, offset,
+                                        offset + size);
+               if (unlikely(ret)) {
+                       if (ret == AOP_TRUNCATED_PAGE) {
+                               page_cache_release(page);
+                               continue;
+                       }
                         goto unlock;
-               if (transfer_result)
+               }
+               if (unlikely(transfer_result))
                         goto unlock;
                 bv_offs += size;
                 len -= size;
@@ -253,32 +278,126 @@ do_lo_send(struct loop_device *lo, struct bio_vec *bvec, int bsize, loff_t pos)
                 unlock_page(page);
                 page_cache_release(page);
         }
-       up(&mapping->host->i_sem);
+       ret = 0;
  out:
+       mutex_unlock(&mapping->host->i_mutex);
         return ret;
-
  unlock:
         unlock_page(page);
         page_cache_release(page);
  fail:
-       up(&mapping->host->i_sem);
         ret = -1;
         goto out;
  }
  
-static int
-lo_send(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
+/**
+ * __do_lo_send_write - write @len bytes from @buf to @file at offset @pos
+ *
+ * Common code of do_lo_send_direct_write() and do_lo_send_write().  Returns 0
+ * if all @len bytes were written, -EIO on a short write, else ->write's error.
+ */
+static int __do_lo_send_write(struct file *file,
+               u8 __user *buf, const int len, loff_t pos)
+{
+       ssize_t bw;
+       mm_segment_t old_fs = get_fs();
+
+       set_fs(get_ds());       /* @buf is a kernel pointer cast to __user by the callers */
+       bw = file->f_op->write(file, buf, len, &pos);
+       set_fs(old_fs);         /* put back the segment saved on entry */
+       if (likely(bw == len))
+               return 0;
+       printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n",
+                       (unsigned long long)pos, len); /* NOTE(review): ->write() got &pos and may have moved it - confirm */
+       if (bw >= 0)            /* short write: ->write gave no error code */
+               bw = -EIO;
+       return bw;
+}
+
+/**
+ * do_lo_send_direct_write - write one bio segment via the write file operation
+ *
+ * This is the fast, non-transforming version for backing filesystems which do
+ * not implement the address space operations prepare_write and commit_write.
+ * The segment's page is mapped with kmap() and handed straight to
+ * __do_lo_send_write(); @bsize and @page are unused here.  Returns its result.
+ */
+static int do_lo_send_direct_write(struct loop_device *lo,
+               struct bio_vec *bvec, int bsize, loff_t pos, struct page *page)
+{
+       ssize_t bw = __do_lo_send_write(lo->lo_backing_file,
+                       (u8 __user *)kmap(bvec->bv_page) + bvec->bv_offset,
+                       bvec->bv_len, pos);
+       kunmap(bvec->bv_page);  /* pairs with the kmap() in the call above */
+       cond_resched();
+       return bw;
+}
+
+/**
+ * do_lo_send_write - helper for writing data to a loop device
+ *
+ * This is the slow, transforming version for filesystems which do not
+ * implement the address space operations prepare_write and commit_write.  It
+ * transforms the segment into @page and writes that copy out with the write
+ * file operation, which should be present on all writeable filesystems.
+ *
+ * Using fops->write is slower than using aops->{prepare,commit}_write in the
+ * transforming case because we need to double buffer the data: we have no
+ * direct access to the destination pages of the backing file, so we cannot
+ * transform in place.  Returns 0 on success and a negative value on failure.
+ */
+static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
+               int bsize, loff_t pos, struct page *page)
+{
+       int ret = lo_do_transfer(lo, WRITE, page, 0, bvec->bv_page,
+                       bvec->bv_offset, bvec->bv_len, pos >> 9); /* pos / 512 is the rblock argument */
+       if (likely(!ret))       /* transformed copy is in @page: write it out */
+               return __do_lo_send_write(lo->lo_backing_file,
+                               (u8 __user *)page_address(page), bvec->bv_len,
+                               pos);
+       printk(KERN_ERR "loop: Transfer error at byte offset %llu, "
+                       "length %i.\n", (unsigned long long)pos, bvec->bv_len);
+       if (ret > 0)            /* callers only treat negative values as errors */
+               ret = -EIO;
+       return ret;
+}
+
+static int lo_send(struct loop_device *lo, struct bio *bio, int bsize,
+               loff_t pos)     /* write each segment of @bio, starting at @pos */
+{
+       int (*do_lo_send)(struct loop_device *, struct bio_vec *, int, loff_t,
+                       struct page *page);     /* per-segment writer picked below */
         struct bio_vec *bvec;
+       struct page *page = NULL;       /* bounce page, only for do_lo_send_write */
         int i, ret = 0;
  
+       do_lo_send = do_lo_send_aops;   /* default: prepare_write/commit_write path */
+       if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) {
+               do_lo_send = do_lo_send_direct_write;   /* no aops: plain ->write() */
+               if (lo->transfer != transfer_none) {    /* real transfer: needs a second buffer */
+                       page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
+                       if (unlikely(!page))
+                               goto fail;
+                       kmap(page);     /* stays mapped until the kunmap() after the loop */
+                       do_lo_send = do_lo_send_write;
+               }
+       }
         bio_for_each_segment(bvec, bio, i) {
-               ret = do_lo_send(lo, bvec, bsize, pos);
+               ret = do_lo_send(lo, bvec, bsize, pos, page);   /* page is NULL unless do_lo_send_write was chosen */
                 if (ret < 0)
                         break;
                 pos += bvec->bv_len;
         }
+       if (page) {     /* release the bounce page on success and on error alike */
+               kunmap(page);
+               __free_page(page);
+       }
+out:
         return ret;
+fail:  /* reached only when alloc_page() failed, so there is nothing to free */
+       printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n");
+       ret = -ENOMEM;
+       goto out;
  }
  
  struct lo_read_data {
@@ -293,7 +412,7 @@ lo_read_actor(read_descriptor_t *desc, struct page *page,
               unsigned long offset, unsigned long size)
  {
         unsigned long count = desc->count;
-       struct lo_read_data *p = (struct lo_read_data*)desc->buf;
+       struct lo_read_data *p = desc->arg.data;
         struct loop_device *lo = p->lo;
         sector_t IV;
  
@@ -368,17 +487,11 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
   */
  static void loop_add_bio(struct loop_device *lo, struct bio *bio)
  {
-       unsigned long flags;
-
-       spin_lock_irqsave(&lo->lo_lock, flags);
         if (lo->lo_biotail) {
                 lo->lo_biotail->bi_next = bio;
                 lo->lo_biotail = bio;
         } else
                 lo->lo_bio = lo->lo_biotail = bio;
-       spin_unlock_irqrestore(&lo->lo_lock, flags);
-
-       up(&lo->lo_bh_mutex);
  }
  
  /*
@@ -388,14 +501,12 @@ static struct bio *loop_get_bio(struct loop_device *lo)
  {
         struct bio *bio;
  
-       spin_lock_irq(&lo->lo_lock);
         if ((bio = lo->lo_bio)) {
                 if (bio == lo->lo_biotail)
                         lo->lo_biotail = NULL;
                 lo->lo_bio = bio->bi_next;
                 bio->bi_next = NULL;
         }
-       spin_unlock_irq(&lo->lo_lock);
  
         return bio;
  }
@@ -405,35 +516,28 @@ static int loop_make_request(request_queue_t *q, struct bio *old_bio)
         struct loop_device *lo = q->queuedata;
         int rw = bio_rw(old_bio);
  
-       if (!lo)
-               goto out;
+       if (rw == READA)
+               rw = READ;
+
+       BUG_ON(!lo || (rw != READ && rw != WRITE));
  
         spin_lock_irq(&lo->lo_lock);
         if (lo->lo_state != Lo_bound)
-               goto inactive;
-       atomic_inc(&lo->lo_pending);
-       spin_unlock_irq(&lo->lo_lock);
-
-       if (rw == WRITE) {
-               if (lo->lo_flags & LO_FLAGS_READ_ONLY)
-                       goto err;
-       } else if (rw == READA) {
-               rw = READ;
-       } else if (rw != READ) {
-               printk(KERN_ERR "loop: unknown command (%x)\n", rw);
-               goto err;
-       }
+               goto out;
+       if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
+               goto out;
+       lo->lo_pending++;
         loop_add_bio(lo, old_bio);
+       spin_unlock_irq(&lo->lo_lock);
+       complete(&lo->lo_bh_done);
         return 0;
-err:
-       if (atomic_dec_and_test(&lo->lo_pending))
-               up(&lo->lo_bh_mutex);
+
  out:
+       if (lo->lo_pending == 0)
+               complete(&lo->lo_bh_done);
+       spin_unlock_irq(&lo->lo_lock);
         bio_io_error(old_bio, old_bio->bi_size);
         return 0;
-inactive:
-       spin_unlock_irq(&lo->lo_lock);
-       goto out;
  }
  
  /*
@@ -456,13 +560,11 @@ static void do_loop_switch(struct loop_device *, struct switch_request *);
  
  static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
  {
-       int ret;
-
         if (unlikely(!bio->bi_bdev)) {
                 do_loop_switch(lo, bio->bi_private);
                 bio_put(bio);
         } else {
-               ret = do_bio_filebacked(lo, bio);
+               int ret = do_bio_filebacked(lo, bio);
                 bio_endio(bio, bio->bi_size, ret);
         }
  }
@@ -490,38 +592,46 @@ static int loop_thread(void *data)
         set_user_nice(current, -20);
  
         lo->lo_state = Lo_bound;
-       atomic_inc(&lo->lo_pending);
+       lo->lo_pending = 1;
  
         /*
-        * up sem, we are running
+        * complete it, we are running
          */
-       up(&lo->lo_sem);
+       complete(&lo->lo_done);
  
         for (;;) {
-               down_interruptible(&lo->lo_bh_mutex);
+               int pending;
+
+               if (wait_for_completion_interruptible(&lo->lo_bh_done))
+                       continue;
+
+               spin_lock_irq(&lo->lo_lock);
+
                 /*
-                * could be upped because of tear-down, not because of
-                * pending work
+                * could be completed because of tear-down, not pending work
                  */
-               if (!atomic_read(&lo->lo_pending))
+               if (unlikely(!lo->lo_pending)) {
+                       spin_unlock_irq(&lo->lo_lock);
                         break;
+               }
  
                 bio = loop_get_bio(lo);
-               if (!bio) {
-                       printk("loop: missing bio\n");
-                       continue;
-               }
+               lo->lo_pending--;
+               pending = lo->lo_pending;
+               spin_unlock_irq(&lo->lo_lock);
+
+               BUG_ON(!bio);
                 loop_handle_bio(lo, bio);
  
                 /*
                  * upped both for pending work and tear-down, lo_pending
                  * will hit zero then
                  */
-               if (atomic_dec_and_test(&lo->lo_pending))
+               if (unlikely(!pending))
                         break;
         }
  
-       up(&lo->lo_sem);
+       complete(&lo->lo_done);
         return 0;
  }
  
@@ -584,7 +694,7 @@ static int loop_change_fd(struct loop_device *lo, struct file *lo_file,
  
         /* the loop device has to be read-only */
         error = -EINVAL;
-       if (lo->lo_flags != LO_FLAGS_READ_ONLY)
+       if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
                 goto out;
  
         error = -EBADF;
@@ -622,10 +732,17 @@ static int loop_change_fd(struct loop_device *lo, struct file *lo_file,
         return error;
  }
  
+static inline int is_loop_device(struct file *file)    /* nonzero iff @file's host inode is a loop block device */
+{
+       struct inode *i = file->f_mapping->host;        /* inode that owns the file's mapping */
+
+       return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR; /* block device with the loop major */
+}
+
  static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
                        struct block_device *bdev, unsigned int arg)
  {
-       struct file     *file;
+       struct file     *file, *f;
         struct inode    *inode;
         struct address_space *mapping;
         unsigned lo_blocksize;
@@ -636,15 +753,31 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
         /* This is safe, since we have a reference from open(). */
         __module_get(THIS_MODULE);
  
-       error = -EBUSY;
-       if (lo->lo_state != Lo_unbound)
-               goto out;
-
         error = -EBADF;
         file = fget(arg);
         if (!file)
                 goto out;
  
+       error = -EBUSY;
+       if (lo->lo_state != Lo_unbound)
+               goto out_putf;
+
+       /* Avoid recursion */
+       f = file;
+       while (is_loop_device(f)) {
+               struct loop_device *l;
+
+               if (f->f_mapping->host->i_rdev == lo_file->f_mapping->host->i_rdev)
+                       goto out_putf;
+
+               l = f->f_mapping->host->i_bdev->bd_disk->private_data;
+               if (l->lo_state == Lo_unbound) {
+                       error = -EINVAL;
+                       goto out_putf;
+               }
+               f = l->lo_backing_file;
+       }
+
         mapping = file->f_mapping;
         inode = mapping->host;
  
@@ -660,8 +793,9 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
                  */
                 if (!file->f_op->sendfile)
                         goto out_putf;
-
-               if (!aops->prepare_write || !aops->commit_write)
+               if (aops->prepare_write && aops->commit_write)
+                       lo_flags |= LO_FLAGS_USE_AOPS;
+               if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
                         lo_flags |= LO_FLAGS_READ_ONLY;
  
                 lo_blocksize = inode->i_blksize;
@@ -708,7 +842,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
         set_blocksize(bdev, lo_blocksize);
  
         kernel_thread(loop_thread, lo, CLONE_KERNEL);
-       down(&lo->lo_sem);
+       wait_for_completion(&lo->lo_done);
         return 0;
  
   out_putf:
@@ -759,7 +893,7 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
  static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
  {
         struct file *filp = lo->lo_backing_file;
-       int gfp = lo->old_gfp_mask;
+       gfp_t gfp = lo->old_gfp_mask;
  
         if (lo->lo_state != Lo_bound)
                 return -ENXIO;
@@ -772,11 +906,12 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
  
         spin_lock_irq(&lo->lo_lock);
         lo->lo_state = Lo_rundown;
-       if (atomic_dec_and_test(&lo->lo_pending))
-               up(&lo->lo_bh_mutex);
+       lo->lo_pending--;
+       if (!lo->lo_pending)
+               complete(&lo->lo_bh_done);
         spin_unlock_irq(&lo->lo_lock);
  
-       down(&lo->lo_sem);
+       wait_for_completion(&lo->lo_done);
  
         lo->lo_backing_file = NULL;
  
@@ -1073,7 +1208,7 @@ static struct block_device_operations lo_fops = {
  /*
   * And now the modules code and kernel interface.
   */
-MODULE_PARM(max_loop, "i");
+module_param(max_loop, int, 0);
  MODULE_PARM_DESC(max_loop, "Maximum number of loop devices (1-256)");
  MODULE_LICENSE("GPL");
  MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);
@@ -1114,7 +1249,7 @@ int loop_unregister_transfer(int number)
  EXPORT_SYMBOL(loop_register_transfer);
  EXPORT_SYMBOL(loop_unregister_transfer);
  
-int __init loop_init(void)
+static int __init loop_init(void)
  {
         int     i;
  
@@ -1153,8 +1288,8 @@ int __init loop_init(void)
                 if (!lo->lo_queue)
                         goto out_mem4;
                 init_MUTEX(&lo->lo_ctl_mutex);
-               init_MUTEX_LOCKED(&lo->lo_sem);
-               init_MUTEX_LOCKED(&lo->lo_bh_mutex);
+               init_completion(&lo->lo_done);
+               init_completion(&lo->lo_bh_done);
                 lo->lo_number = i;
                 spin_lock_init(&lo->lo_lock);
                 disk->major = LOOP_MAJOR;
@@ -1189,7 +1324,7 @@ out_mem1:
         return -ENOMEM;
  }
  
-void loop_exit(void)
+static void loop_exit(void)
  {
         int i;