vserver 1.9.5.x5
[linux-2.6.git] / fs / direct-io.c
index afc38f4..2832683 100644 (file)
@@ -215,7 +215,7 @@ static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes)
 {
        if (dio->end_io && dio->result)
                dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private);
-       if (dio->lock_type != DIO_NO_LOCKING)
+       if (dio->lock_type == DIO_LOCKING)
                up_read(&dio->inode->i_alloc_sem);
 }
 
@@ -844,8 +844,10 @@ do_holes:
                                char *kaddr;
 
                                /* AKPM: eargh, -ENOTBLK is a hack */
-                               if (dio->rw == WRITE)
+                               if (dio->rw == WRITE) {
+                                       page_cache_release(page);
                                        return -ENOTBLK;
+                               }
 
                                if (dio->block_in_file >=
                                        i_size_read(dio->inode)>>blkbits) {
@@ -1124,11 +1126,23 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 
 /*
  * This is a library function for use by filesystem drivers.
+ * The locking rules are governed by the dio_lock_type parameter.
+ *
+ * DIO_NO_LOCKING (no locking, for raw block device access)
+ * For writes, i_sem is not held on entry; it is never taken.
+ *
+ * DIO_LOCKING (simple locking for regular files)
+ * For writes we are called under i_sem and return with i_sem held, even though
+ * it is internally dropped.
+ * For reads, i_sem is not held on entry, but it is taken and dropped before
+ * returning.
  *
- * For writes to S_ISREG files, we are called under i_sem and return with i_sem
- * held, even though it is internally dropped.
+ * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
+ *     uninitialised data, allowing parallel direct readers and writers)
+ * For writes we are called without i_sem, return without it, never touch it.
+ * For reads, i_sem is held on entry and will be released before returning.
  *
- * For writes to S_ISBLK files, i_sem is not held on entry; it is never taken.
+ * Additional i_alloc_sem locking requirements described inline below.
  */
 ssize_t
 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
@@ -1145,6 +1159,10 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        ssize_t retval = -EINVAL;
        loff_t end = offset;
        struct dio *dio;
+       int reader_with_isem = (rw == READ && dio_lock_type == DIO_OWN_LOCKING);
+
+       if (rw & WRITE)
+               current->flags |= PF_SYNCWRITE;
 
        if (bdev)
                bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
@@ -1177,12 +1195,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
                goto out;
 
        /*
+        * For block device access DIO_NO_LOCKING is used,
+        *      neither readers nor writers do any locking at all
         * For regular files using DIO_LOCKING,
         *      readers need to grab i_sem and i_alloc_sem
         *      writers need to grab i_alloc_sem only (i_sem is already held)
         * For regular files using DIO_OWN_LOCKING,
-        *      both readers and writers need to grab i_alloc_sem
-        *      neither readers nor writers hold i_sem on entry (nor exit)
+        *      neither readers nor writers take any locks here
+        *      (readers enter holding i_sem; it is released here)
         */
        dio->lock_type = dio_lock_type;
        if (dio_lock_type != DIO_NO_LOCKING) {
@@ -1190,20 +1210,26 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
                        struct address_space *mapping;
 
                        mapping = iocb->ki_filp->f_mapping;
-                       down(&inode->i_sem);
+                       if (dio_lock_type != DIO_OWN_LOCKING) {
+                               down(&inode->i_sem);
+                               reader_with_isem = 1;
+                       }
                        retval = filemap_write_and_wait(mapping);
                        if (retval) {
-                               up(&inode->i_sem);
                                kfree(dio);
                                goto out;
                        }
-                       down_read(&inode->i_alloc_sem);
-                       if (dio_lock_type == DIO_OWN_LOCKING)
+
+                       if (dio_lock_type == DIO_OWN_LOCKING) {
                                up(&inode->i_sem);
-               } else {
-                       down_read(&inode->i_alloc_sem);
+                               reader_with_isem = 0;
+                       }
                }
+
+               if (dio_lock_type == DIO_LOCKING)
+                       down_read(&inode->i_alloc_sem);
        }
+
        /*
         * For file extending writes updating i_size before data
         * writeouts complete can expose uninitialized blocks. So
@@ -1215,7 +1241,15 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 
        retval = direct_io_worker(rw, iocb, inode, iov, offset,
                                nr_segs, blkbits, get_blocks, end_io, dio);
+
+       if (rw == READ && dio_lock_type == DIO_LOCKING)
+               reader_with_isem = 0;
+
 out:
+       if (reader_with_isem)
+               up(&inode->i_sem);
+       if (rw & WRITE)
+               current->flags &= ~PF_SYNCWRITE;
        return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);