vserver 1.9.3
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 17c403e..26fdf6b 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -154,6 +154,8 @@ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
        return ret;
 }
 
+EXPORT_SYMBOL(blk_get_backing_dev_info);
+
 void blk_queue_activity_fn(request_queue_t *q, activity_fn *fn, void *data)
 {
        q->activity_fn = fn;
@@ -262,6 +264,45 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
 
 EXPORT_SYMBOL(blk_queue_make_request);
 
+/**
+ * blk_queue_ordered - set ordered write (barrier) support for a queue
+ * @q:     the request queue
+ * @flag:  non-zero to enable ordered write support, zero to disable it
+ *
+ * Description:
+ *   For journalled file systems, doing ordered writes on a commit
+ *   block instead of explicitly doing wait_on_buffer (which is bad
+ *   for performance) can be a big win. Block drivers that support this
+ *   feature should call this function to say so.
+ *
+ **/
+void blk_queue_ordered(request_queue_t *q, int flag)
+{
+       if (flag)
+               set_bit(QUEUE_FLAG_ORDERED, &q->queue_flags);
+       else
+               clear_bit(QUEUE_FLAG_ORDERED, &q->queue_flags);
+}
+
+EXPORT_SYMBOL(blk_queue_ordered);
+
+/**
+ * blk_queue_issue_flush_fn - set function for issuing a flush
+ * @q:     the request queue
+ * @iff:   the function to be called when issuing the flush
+ *
+ * Description:
+ *   If a driver supports issuing a flush command, it notifies the block
+ *   layer of that support by registering the function through this call.
+ *
+ **/
+void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff)
+{
+       q->issue_flush_fn = iff;
+}
+
+EXPORT_SYMBOL(blk_queue_issue_flush_fn);
+
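
Note: taken together, the two hooks above let a low-level driver advertise barrier support when it sets up its queue. A minimal sketch follows; all mydrv_* names are hypothetical and not part of this patch.

	/* sketch only: a driver that can order writes and flush its write cache */
	static int mydrv_issue_flush(request_queue_t *q, struct gendisk *disk,
				     sector_t *error_sector)
	{
		/* tell the hardware to write back its cache; on failure, store
		 * the offending sector through error_sector if it is known */
		return 0;
	}

	static void mydrv_init_queue(request_queue_t *q)
	{
		blk_queue_ordered(q, 1);
		blk_queue_issue_flush_fn(q, mydrv_issue_flush);
	}
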
 /**
  * blk_queue_bounce_limit - set bounce buffer limit for queue
  * @q:  the request queue for the device
@@ -311,7 +352,7 @@ void blk_queue_max_sectors(request_queue_t *q, unsigned short max_sectors)
                printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
        }
 
-       q->max_sectors = max_sectors;
+       q->max_sectors = q->max_hw_sectors = max_sectors;
 }
 
 EXPORT_SYMBOL(blk_queue_max_sectors);
@@ -413,7 +454,8 @@ EXPORT_SYMBOL(blk_queue_hardsect_size);
 void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b)
 {
        /* zero is "infinity" */
-       t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors);
+       t->max_sectors = t->max_hw_sectors =
+               min_not_zero(t->max_sectors,b->max_sectors);
 
        t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
        t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
@@ -482,15 +524,14 @@ struct request *blk_queue_find_tag(request_queue_t *q, int tag)
 EXPORT_SYMBOL(blk_queue_find_tag);
 
 /**
- * blk_queue_free_tags - release tag maintenance info
+ * __blk_queue_free_tags - release tag maintenance info
  * @q:  the request queue for the device
  *
  *  Notes:
  *    blk_cleanup_queue() will take care of calling this function, if tagging
- *    has been used. So there's usually no need to call this directly, unless
- *    tagging is just being disabled but the queue remains in function.
+ *    has been used. So there's no need to call this directly.
  **/
-void blk_queue_free_tags(request_queue_t *q)
+static void __blk_queue_free_tags(request_queue_t *q)
 {
        struct blk_queue_tag *bqt = q->queue_tags;
 
@@ -514,12 +555,27 @@ void blk_queue_free_tags(request_queue_t *q)
        q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
 }
 
+/**
+ * blk_queue_free_tags - release tag maintenance info
+ * @q:  the request queue for the device
+ *
+ *  Notes:
+ *     This is used to disable tagged queuing for a device, while leaving
+ *     the queue itself in function.
+ **/
+void blk_queue_free_tags(request_queue_t *q)
+{
+       clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+}
+
 EXPORT_SYMBOL(blk_queue_free_tags);
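
Note: since only the flag is cleared, a driver can switch a misbehaving device back to untagged operation at run time and re-enable tagging later via blk_queue_init_tags(); the map itself is only torn down by blk_cleanup_queue(). A small hedged sketch (helper name and condition are hypothetical):

	/* sketch only: drop back to untagged operation; the queue keeps
	 * working and the tag map stays allocated for later reuse */
	static void mydrv_disable_tcq(request_queue_t *q, int device_rejects_tcq)
	{
		if (device_rejects_tcq)
			blk_queue_free_tags(q);
	}
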
 
 static int
 init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
 {
        int bits, i;
+       struct request **tag_index;
+       unsigned long *tag_map;
 
        if (depth > q->nr_requests * 2) {
                depth = q->nr_requests * 2;
@@ -527,32 +583,31 @@ init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
                                __FUNCTION__, depth);
        }
 
-       tags->tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC);
-       if (!tags->tag_index)
+       tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC);
+       if (!tag_index)
                goto fail;
 
        bits = (depth / BLK_TAGS_PER_LONG) + 1;
-       tags->tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC);
-       if (!tags->tag_map)
+       tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC);
+       if (!tag_map)
                goto fail;
 
-       memset(tags->tag_index, 0, depth * sizeof(struct request *));
-       memset(tags->tag_map, 0, bits * sizeof(unsigned long));
+       memset(tag_index, 0, depth * sizeof(struct request *));
+       memset(tag_map, 0, bits * sizeof(unsigned long));
        tags->max_depth = depth;
        tags->real_max_depth = bits * BITS_PER_LONG;
+       tags->tag_index = tag_index;
+       tags->tag_map = tag_map;
 
        /*
         * set the upper bits if the depth isn't a multiple of the word size
         */
        for (i = depth; i < bits * BLK_TAGS_PER_LONG; i++)
-               __set_bit(i, tags->tag_map);
+               __set_bit(i, tag_map);
 
-       INIT_LIST_HEAD(&tags->busy_list);
-       tags->busy = 0;
-       atomic_set(&tags->refcnt, 1);
        return 0;
 fail:
-       kfree(tags->tag_index);
+       kfree(tag_index);
        return -ENOMEM;
 }
 
@@ -564,13 +619,26 @@ fail:
 int blk_queue_init_tags(request_queue_t *q, int depth,
                        struct blk_queue_tag *tags)
 {
-       if (!tags) {
+       int rc;
+
+       BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
+
+       if (!tags && !q->queue_tags) {
                tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
                if (!tags)
                        goto fail;
 
                if (init_tag_map(q, tags, depth))
                        goto fail;
+
+               INIT_LIST_HEAD(&tags->busy_list);
+               tags->busy = 0;
+               atomic_set(&tags->refcnt, 1);
+       } else if (q->queue_tags) {
+               if ((rc = blk_queue_resize_tags(q, depth)))
+                       return rc;
+               set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+               return 0;
        } else
                atomic_inc(&tags->refcnt);
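
Note: with the BUG_ON() and the resize branch above, calling blk_queue_init_tags() on a queue that already owns a tag map re-enables tagging and resizes the map rather than leaking it. A minimal sketch (the depths and the helper name are hypothetical):

	/* sketch only: mydrv_setup_tags() is not part of this patch */
	static int mydrv_setup_tags(request_queue_t *q)
	{
		/* first call allocates the tag map and enables tagged queuing */
		if (blk_queue_init_tags(q, 64, NULL))
			return -ENOMEM;

		/* a later call on the live queue goes through blk_queue_resize_tags() */
		if (blk_queue_init_tags(q, 128, NULL))
			return -ENOMEM;

		return 0;
	}
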
 
@@ -1335,8 +1403,8 @@ void blk_cleanup_queue(request_queue_t * q)
        if (rl->rq_pool)
                mempool_destroy(rl->rq_pool);
 
-       if (blk_queue_tagged(q))
-               blk_queue_free_tags(q);
+       if (q->queue_tags)
+               __blk_queue_free_tags(q);
 
        kmem_cache_free(requestq_cachep, q);
 }
@@ -1925,10 +1993,11 @@ int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk,
        }
 
        rq->flags |= REQ_NOMERGE;
-       rq->waiting = &wait;
+       if (!rq->waiting)
+               rq->waiting = &wait;
        elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1);
        generic_unplug_device(q);
-       wait_for_completion(&wait);
+       wait_for_completion(rq->waiting);
        rq->waiting = NULL;
 
        if (rq->errors)
@@ -1939,6 +2008,72 @@ int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk,
 
 EXPORT_SYMBOL(blk_execute_rq);
 
+/**
+ * blkdev_issue_flush - queue a flush
+ * @bdev:      blockdev to issue flush for
+ * @error_sector:      optional pointer for returning the failed sector
+ *
+ * Description:
+ *    Issue a flush for the block device in question. Caller can supply
+ *    room for storing the error offset in case of a flush error, if they
+ *    wish to.  Caller must run wait_for_completion() on its own.
+ */
+int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
+{
+       request_queue_t *q;
+
+       if (bdev->bd_disk == NULL)
+               return -ENXIO;
+
+       q = bdev_get_queue(bdev);
+       if (!q)
+               return -ENXIO;
+       if (!q->issue_flush_fn)
+               return -EOPNOTSUPP;
+
+       return q->issue_flush_fn(q, bdev->bd_disk, error_sector);
+}
+
+EXPORT_SYMBOL(blkdev_issue_flush);
+
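
Note: a journalling file system can use this to force its commit block to stable media once the write itself has completed. A hedged sketch; the superblock-based helper and the fallback policy are illustrative only.

	/* sketch: flush the device backing a file system after a journal commit */
	static int myfs_flush_device(struct super_block *sb)
	{
		sector_t error_sector;
		int err = blkdev_issue_flush(sb->s_bdev, &error_sector);

		if (err == -EOPNOTSUPP)
			err = 0;	/* no flush hook: nothing more we can do */
		return err;
	}
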
+/**
+ * blkdev_scsi_issue_flush_fn - issue flush for SCSI devices
+ * @q:         device queue
+ * @disk:      gendisk
+ * @error_sector:      optional pointer for returning the failed sector
+ *
+ * Description:
+ *    Drivers for devices that understand the SCSI command set can use this
+ *    function as a helper for issuing a cache flush. Note: the driver is
+ *    required to store the error offset (in case of a flush error) in the
+ *    ->sector field of struct request.
+ */
+int blkdev_scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk,
+                              sector_t *error_sector)
+{
+       struct request *rq = blk_get_request(q, WRITE, __GFP_WAIT);
+       int ret;
+
+       rq->flags |= REQ_BLOCK_PC | REQ_SOFTBARRIER;
+       rq->sector = 0;
+       memset(rq->cmd, 0, sizeof(rq->cmd));
+       rq->cmd[0] = 0x35;              /* SCSI SYNCHRONIZE CACHE */
+       rq->cmd_len = 12;
+       rq->data = NULL;
+       rq->data_len = 0;
+       rq->timeout = 60 * HZ;
+
+       ret = blk_execute_rq(q, disk, rq);
+
+       if (ret && error_sector)
+               *error_sector = rq->sector;
+
+       blk_put_request(rq);
+       return ret;
+}
+
+EXPORT_SYMBOL(blkdev_scsi_issue_flush_fn);
+
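
Note: a driver whose device speaks the SCSI command set does not need its own callback; it can register the helper above directly. Sketch only, with a hypothetical init function name:

	/* sketch: reuse the generic SCSI flush helper as the queue's flush hook */
	static void mydrv_scsi_init_queue(request_queue_t *q)
	{
		blk_queue_ordered(q, 1);
		blk_queue_issue_flush_fn(q, blkdev_scsi_issue_flush_fn);
	}
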
 void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
 {
        int rw = rq_data_dir(rq);
@@ -2192,7 +2327,7 @@ EXPORT_SYMBOL(__blk_attempt_remerge);
 static int __make_request(request_queue_t *q, struct bio *bio)
 {
        struct request *req, *freereq = NULL;
-       int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra;
+       int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err;
        sector_t sector;
 
        sector = bio->bi_sector;
@@ -2210,9 +2345,11 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 
        spin_lock_prefetch(q->queue_lock);
 
-       barrier = test_bit(BIO_RW_BARRIER, &bio->bi_rw);
-
-       ra = bio->bi_rw & (1 << BIO_RW_AHEAD);
+       barrier = bio_barrier(bio);
+       if (barrier && !(q->queue_flags & (1 << QUEUE_FLAG_ORDERED))) {
+               err = -EOPNOTSUPP;
+               goto end_io;
+       }
 
 again:
        spin_lock_irq(q->queue_lock);
@@ -2292,7 +2429,8 @@ get_rq:
                        /*
                         * READA bit set
                         */
-                       if (ra)
+                       err = -EWOULDBLOCK;
+                       if (bio_rw_ahead(bio))
                                goto end_io;
        
                        freereq = get_request_wait(q, rw);
@@ -2303,10 +2441,9 @@ get_rq:
        req->flags |= REQ_CMD;
 
        /*
-        * inherit FAILFAST from bio and don't stack up
-        * retries for read ahead
+        * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
         */
-       if (ra || test_bit(BIO_RW_FAILFAST, &bio->bi_rw))       
+       if (bio_rw_ahead(bio) || bio_failfast(bio))
                req->flags |= REQ_FAILFAST;
 
        /*
@@ -2340,7 +2477,7 @@ out:
        return 0;
 
 end_io:
-       bio_endio(bio, nr_sectors << 9, -EWOULDBLOCK);
+       bio_endio(bio, nr_sectors << 9, err);
        return 0;
 }
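
Note: on the submitting side, a barrier bio (sent with the BIO_RW_BARRIER bit in bi_rw) now fails fast with -EOPNOTSUPP when the queue never called blk_queue_ordered(), so callers should be ready to fall back to a plain write. A hedged sketch of a completion handler noticing that; all myfs_* names are hypothetical, and bi_private is assumed to point at a struct completion set by the submitter.

	/* sketch: completion handler for a commit block sent as a barrier write */
	static int myfs_barrier_unsupported;	/* set when the queue lacks ordered support */

	static int myfs_commit_end_io(struct bio *bio, unsigned int bytes_done, int err)
	{
		if (bio->bi_size)
			return 1;			/* partial completion, wait for the rest */

		if (err == -EOPNOTSUPP)			/* __make_request() rejected the barrier */
			myfs_barrier_unsupported = 1;	/* caller may resubmit as a plain write */

		complete((struct completion *) bio->bi_private);
		return 0;
	}
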
 
@@ -2399,6 +2536,7 @@ void generic_make_request(struct bio *bio)
        sector_t maxsector;
        int ret, nr_sectors = bio_sectors(bio);
 
+       might_sleep();
        /* Test device or partition size, when known. */
        maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
        if (maxsector) {
@@ -2446,11 +2584,11 @@ end_io:
                        break;
                }
 
-               if (unlikely(bio_sectors(bio) > q->max_sectors)) {
+               if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) {
                        printk("bio too big device %s (%u > %u)\n", 
                                bdevname(bio->bi_bdev, b),
                                bio_sectors(bio),
-                               q->max_sectors);
+                               q->max_hw_sectors);
                        goto end_io;
                }
 
@@ -2647,9 +2785,16 @@ void blk_recalc_rq_sectors(struct request *rq, int nsect)
 static int __end_that_request_first(struct request *req, int uptodate,
                                    int nr_bytes)
 {
-       int total_bytes, bio_nbytes, error = 0, next_idx = 0;
+       int total_bytes, bio_nbytes, error, next_idx = 0;
        struct bio *bio;
 
+       /*
+        * extend uptodate bool to allow < 0 value to be direct io error
+        */
+       error = 0;
+       if (end_io_error(uptodate))
+               error = !uptodate ? -EIO : uptodate;
+
        /*
         * for a REQ_BLOCK_PC request, we want to carry any eventual
         * sense key with us all the way through
@@ -2658,7 +2803,6 @@ static int __end_that_request_first(struct request *req, int uptodate,
                req->errors = 0;
 
        if (!uptodate) {
-               error = -EIO;
                if (blk_fs_request(req) && !(req->flags & REQ_QUIET))
                        printk("end_request: I/O error, dev %s, sector %llu\n",
                                req->rq_disk ? req->rq_disk->disk_name : "?",
@@ -2741,7 +2885,7 @@ static int __end_that_request_first(struct request *req, int uptodate,
 /**
  * end_that_request_first - end I/O on a request
  * @req:      the request being processed
- * @uptodate: 0 for I/O error
+ * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
  * @nr_sectors: number of sectors to end I/O on
  *
  * Description:
@@ -2762,7 +2906,7 @@ EXPORT_SYMBOL(end_that_request_first);
 /**
  * end_that_request_chunk - end I/O on a request
  * @req:      the request being processed
- * @uptodate: 0 for I/O error
+ * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
  * @nr_bytes: number of bytes to complete
  *
  * Description:
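
Note: with the widened @uptodate convention, a driver that knows the precise failure reason can hand it straight to the bio layer instead of collapsing everything to -EIO. A rough sketch; the helper name is hypothetical and the caller is assumed to hold the queue lock, as usual for completion.

	/* sketch: pass 1 for success, 0 for a generic error, or a negative
	 * errno such as -EOPNOTSUPP for a specific failure */
	static void mydrv_end_request(struct request *rq, int uptodate)
	{
		if (!end_that_request_first(rq, uptodate, rq->hard_nr_sectors)) {
			blkdev_dequeue_request(rq);
			end_that_request_last(rq);
		}
	}
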
@@ -3063,13 +3207,61 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
        unsigned long ra_kb;
        ssize_t ret = queue_var_store(&ra_kb, page, count);
 
+       spin_lock_irq(q->queue_lock);
        if (ra_kb > (q->max_sectors >> 1))
                ra_kb = (q->max_sectors >> 1);
 
        q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
+       spin_unlock_irq(q->queue_lock);
+
        return ret;
 }
 
+static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
+{
+       int max_sectors_kb = q->max_sectors >> 1;
+
+       return queue_var_show(max_sectors_kb, (page));
+}
+
+static ssize_t
+queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
+{
+       unsigned long max_sectors_kb,
+                       max_hw_sectors_kb = q->max_hw_sectors >> 1,
+                       page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
+       ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
+       int ra_kb;
+
+       if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
+               return -EINVAL;
+       /*
+        * Take the queue lock to update the readahead and max_sectors
+        * values synchronously:
+        */
+       spin_lock_irq(q->queue_lock);
+       /*
+        * Trim readahead window as well, if necessary:
+        */
+       ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
+       if (ra_kb > max_sectors_kb)
+               q->backing_dev_info.ra_pages =
+                               max_sectors_kb >> (PAGE_CACHE_SHIFT - 10);
+
+       q->max_sectors = max_sectors_kb << 1;
+       spin_unlock_irq(q->queue_lock);
+
+       return ret;
+}
+
+static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
+{
+       int max_hw_sectors_kb = q->max_hw_sectors >> 1;
+
+       return queue_var_show(max_hw_sectors_kb, (page));
+}
+
+
 static struct queue_sysfs_entry queue_requests_entry = {
        .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
        .show = queue_requests_show,
@@ -3082,9 +3274,22 @@ static struct queue_sysfs_entry queue_ra_entry = {
        .store = queue_ra_store,
 };
 
+static struct queue_sysfs_entry queue_max_sectors_entry = {
+       .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
+       .show = queue_max_sectors_show,
+       .store = queue_max_sectors_store,
+};
+
+static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
+       .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
+       .show = queue_max_hw_sectors_show,
+};
+
 static struct attribute *default_attrs[] = {
        &queue_requests_entry.attr,
        &queue_ra_entry.attr,
+       &queue_max_hw_sectors_entry.attr,
+       &queue_max_sectors_entry.attr,
        NULL,
 };
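
Note: the two attributes added above appear as /sys/block/<dev>/queue/max_sectors_kb (writable) and max_hw_sectors_kb (read-only). A hedged userspace sketch for capping request size; the device name and value are illustrative, and the written value must stay between the page size and max_hw_sectors_kb.

	#include <stdio.h>

	/* sketch: clamp sda's per-request size to 128 KiB via the new attribute */
	int main(void)
	{
		FILE *f = fopen("/sys/block/sda/queue/max_sectors_kb", "w");

		if (!f)
			return 1;
		fprintf(f, "128\n");
		return fclose(f) ? 1 : 0;
	}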