vserver 1.9.3
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 17c403e..26fdf6b 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -154,6 +154,8 @@ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
        return ret;
 }
 
+EXPORT_SYMBOL(blk_get_backing_dev_info);
+
 void blk_queue_activity_fn(request_queue_t *q, activity_fn *fn, void *data)
 {
        q->activity_fn = fn;
@@ -262,6 +264,45 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
 
 EXPORT_SYMBOL(blk_queue_make_request);
 
+/**
+ * blk_queue_ordered - set ordered write (barrier) support for a queue
+ * @q:     the request queue
+ * @flag:  non-zero to enable ordered write support, zero to disable it
+ *
+ * Description:
+ *   For journalled file systems, doing ordered writes on a commit
+ *   block instead of explicitly doing wait_on_buffer (which is bad
+ *   for performance) can be a big win. Block drivers that support this
+ *   feature should call this function to say so.
+ *
+ **/
+void blk_queue_ordered(request_queue_t *q, int flag)
+{
+       if (flag)
+               set_bit(QUEUE_FLAG_ORDERED, &q->queue_flags);
+       else
+               clear_bit(QUEUE_FLAG_ORDERED, &q->queue_flags);
+}
+
+EXPORT_SYMBOL(blk_queue_ordered);
+
+/**
+ * blk_queue_issue_flush_fn - set function for issuing a flush
+ * @q:     the request queue
+ * @iff:   the function to be called when issuing the flush
+ *
+ * Description:
+ *   If a driver supports issuing a flush command, it notifies the block
+ *   layer of that support by registering the function through this call.
+ *
+ **/
+void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff)
+{
+       q->issue_flush_fn = iff;
+}
+
+EXPORT_SYMBOL(blk_queue_issue_flush_fn);
+
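
Note: taken together, the two hooks above let a low-level driver advertise barrier support when it sets up its queue. A minimal sketch follows; all mydrv_* names are hypothetical and not part of this patch.

	/* sketch only: a driver that can order writes and flush its write cache */
	static int mydrv_issue_flush(request_queue_t *q, struct gendisk *disk,
				     sector_t *error_sector)
	{
		/* tell the hardware to write back its cache; on failure, store
		 * the offending sector through error_sector if it is known */
		return 0;
	}

	static void mydrv_init_queue(request_queue_t *q)
	{
		blk_queue_ordered(q, 1);
		blk_queue_issue_flush_fn(q, mydrv_issue_flush);
	}
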
 /**
  * blk_queue_bounce_limit - set bounce buffer limit for queue
  * @q:  the request queue for the device
@@ -311,7 +352,7 @@ void blk_queue_max_sectors(request_queue_t *q, unsigned short max_sectors)
                printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
        }
 
-       q->max_sectors = max_sectors;
+       q->max_sectors = q->max_hw_sectors = max_sectors;
 }
 
 EXPORT_SYMBOL(blk_queue_max_sectors);
@@ -413,7 +454,8 @@ EXPORT_SYMBOL(blk_queue_hardsect_size);
 void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b)
 {
        /* zero is "infinity" */
-       t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors);
+       t->max_sectors = t->max_hw_sectors =
+               min_not_zero(t->max_sectors,b->max_sectors);
 
        t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
        t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
@@ -482,15 +524,14 @@ struct request *blk_queue_find_tag(request_queue_t *q, int tag)
 EXPORT_SYMBOL(blk_queue_find_tag);
 
 /**
- * blk_queue_free_tags - release tag maintenance info
+ * __blk_queue_free_tags - release tag maintenance info
  * @q:  the request queue for the device
  *
  *  Notes:
  *    blk_cleanup_queue() will take care of calling this function, if tagging
- *    has been used. So there's usually no need to call this directly, unless
- *    tagging is just being disabled but the queue remains in function.
+ *    has been used. So there's no need to call this directly.
  **/
-void blk_queue_free_tags(request_queue_t *q)
+static void __blk_queue_free_tags(request_queue_t *q)
 {
        struct blk_queue_tag *bqt = q->queue_tags;
 
@@ -514,12 +555,27 @@ void blk_queue_free_tags(request_queue_t *q)
        q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
 }
 
+/**
+ * blk_queue_free_tags - release tag maintenance info
+ * @q:  the request queue for the device
+ *
+ *  Notes:
+ *     This is used to disable tagged queuing for a device, while leaving
+ *     the queue itself in function.
+ **/
+void blk_queue_free_tags(request_queue_t *q)
+{
+       clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+}
+
 EXPORT_SYMBOL(blk_queue_free_tags);
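
Note: since only the flag is cleared, a driver can switch a misbehaving device back to untagged operation at run time and re-enable tagging later via blk_queue_init_tags(); the map itself is only torn down by blk_cleanup_queue(). A small hedged sketch (helper name and condition are hypothetical):

	/* sketch only: drop back to untagged operation; the queue keeps
	 * working and the tag map stays allocated for later reuse */
	static void mydrv_disable_tcq(request_queue_t *q, int device_rejects_tcq)
	{
		if (device_rejects_tcq)
			blk_queue_free_tags(q);
	}
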
 
 static int
 init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
 {
        int bits, i;
+       struct request **tag_index;
+       unsigned long *tag_map;
 
        if (depth > q->nr_requests * 2) {
                depth = q->nr_requests * 2;
@@ -527,32 +583,31 @@ init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
                                __FUNCTION__, depth);
        }
 
-       tags->tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC);
-       if (!tags->tag_index)
+       tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC);
+       if (!tag_index)
                goto fail;
 
        bits = (depth / BLK_TAGS_PER_LONG) + 1;
-       tags->tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC);
-       if (!tags->tag_map)
+       tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC);
+       if (!tag_map)
                goto fail;
 
-       memset(tags->tag_index, 0, depth * sizeof(struct request *));
-       memset(tags->tag_map, 0, bits * sizeof(unsigned long));
+       memset(tag_index, 0, depth * sizeof(struct request *));
+       memset(tag_map, 0, bits * sizeof(unsigned long));
        tags->max_depth = depth;
        tags->real_max_depth = bits * BITS_PER_LONG;
+       tags->tag_index = tag_index;
+       tags->tag_map = tag_map;
 
        /*
         * set the upper bits if the depth isn't a multiple of the word size
         */
        for (i = depth; i < bits * BLK_TAGS_PER_LONG; i++)
-               __set_bit(i, tags->tag_map);
+               __set_bit(i, tag_map);
 
-       INIT_LIST_HEAD(&tags->busy_list);
-       tags->busy = 0;
-       atomic_set(&tags->refcnt, 1);
        return 0;
 fail:
-       kfree(tags->tag_index);
+       kfree(tag_index);
        return -ENOMEM;
 }
 
@@ -564,13 +619,26 @@ fail:
 int blk_queue_init_tags(request_queue_t *q, int depth,
                        struct blk_queue_tag *tags)
 {
-       if (!tags) {
+       int rc;
+
+       BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
+
+       if (!tags && !q->queue_tags) {
                tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
                if (!tags)
                        goto fail;
 
                if (init_tag_map(q, tags, depth))
                        goto fail;
+
+               INIT_LIST_HEAD(&tags->busy_list);
+               tags->busy = 0;
+               atomic_set(&tags->refcnt, 1);
+       } else if (q->queue_tags) {
+               if ((rc = blk_queue_resize_tags(q, depth)))
+                       return rc;
+               set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+               return 0;
        } else
                atomic_inc(&tags->refcnt);
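
Note: with the BUG_ON() and the resize branch above, calling blk_queue_init_tags() on a queue that already owns a tag map re-enables tagging and resizes the map rather than leaking it. A minimal sketch (the depths and the helper name are hypothetical):

	/* sketch only: mydrv_setup_tags() is not part of this patch */
	static int mydrv_setup_tags(request_queue_t *q)
	{
		/* first call allocates the tag map and enables tagged queuing */
		if (blk_queue_init_tags(q, 64, NULL))
			return -ENOMEM;

		/* a later call on the live queue goes through blk_queue_resize_tags() */
		if (blk_queue_init_tags(q, 128, NULL))
			return -ENOMEM;

		return 0;
	}
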
 
@@ -1335,8 +1403,8 @@ void blk_cleanup_queue(request_queue_t * q)
        if (rl->rq_pool)
                mempool_destroy(rl->rq_pool);
 
-       if (blk_queue_tagged(q))
-               blk_queue_free_tags(q);
+       if (q->queue_tags)
+               __blk_queue_free_tags(q);
 
        kmem_cache_free(requestq_cachep, q);
 }
@@ -1925,10 +1993,11 @@ int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk,
        }
 
        rq->flags |= REQ_NOMERGE;
-       rq->waiting = &wait;
+       if (!rq->waiting)
+               rq->waiting = &wait;
        elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1);
        generic_unplug_device(q);
-       wait_for_completion(&wait);
+       wait_for_completion(rq->waiting);
        rq->waiting = NULL;
 
        if (rq->errors)
@@ -1939,6 +2008,72 @@ int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk,
 
 EXPORT_SYMBOL(blk_execute_rq);
 
+/**
+ * blkdev_issue_flush - queue a flush
+ * @bdev:      blockdev to issue flush for
+ * @error_sector:      optional pointer for returning the failed sector
+ *
+ * Description:
+ *    Issue a flush for the block device in question. Caller can supply
+ *    room for storing the error offset in case of a flush error, if they
+ *    wish to.  Caller must run wait_for_completion() on its own.
+ */
+int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
+{
+       request_queue_t *q;
+
+       if (bdev->bd_disk == NULL)
+               return -ENXIO;
+
+       q = bdev_get_queue(bdev);
+       if (!q)
+               return -ENXIO;
+       if (!q->issue_flush_fn)
+               return -EOPNOTSUPP;
+
+       return q->issue_flush_fn(q, bdev->bd_disk, error_sector);
+}
+
+EXPORT_SYMBOL(blkdev_issue_flush);
+
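
Note: a journalling file system can use this to force its commit block to stable media once the write itself has completed. A hedged sketch; the superblock-based helper and the fallback policy are illustrative only.

	/* sketch: flush the device backing a file system after a journal commit */
	static int myfs_flush_device(struct super_block *sb)
	{
		sector_t error_sector;
		int err = blkdev_issue_flush(sb->s_bdev, &error_sector);

		if (err == -EOPNOTSUPP)
			err = 0;	/* no flush hook: nothing more we can do */
		return err;
	}
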
+/**
+ * blkdev_scsi_issue_flush_fn - issue flush for SCSI devices
+ * @q:         device queue
+ * @disk:      gendisk
+ * @error_sector:      optional pointer for returning the failed sector
+ *
+ * Description:
+ *    Drivers for devices that understand the SCSI command set can use this
+ *    function as a helper for issuing a cache flush. Note: the driver is
+ *    required to store the error offset (in case of a flush error) in the
+ *    ->sector field of struct request.
+ */
+int blkdev_scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk,
+                              sector_t *error_sector)
+{
+       struct request *rq = blk_get_request(q, WRITE, __GFP_WAIT);
+       int ret;
+
+       rq->flags |= REQ_BLOCK_PC | REQ_SOFTBARRIER;
+       rq->sector = 0;
+       memset(rq->cmd, 0, sizeof(rq->cmd));
+       rq->cmd[0] = 0x35;              /* SCSI SYNCHRONIZE CACHE */
+       rq->cmd_len = 12;
+       rq->data = NULL;
+       rq->data_len = 0;
+       rq->timeout = 60 * HZ;
+
+       ret = blk_execute_rq(q, disk, rq);
+
+       if (ret && error_sector)
+               *error_sector = rq->sector;
+
+       blk_put_request(rq);
+       return ret;
+}
+
+EXPORT_SYMBOL(blkdev_scsi_issue_flush_fn);
+
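
Note: a driver whose device speaks the SCSI command set does not need its own callback; it can register the helper above directly. Sketch only, with a hypothetical init function name:

	/* sketch: reuse the generic SCSI flush helper as the queue's flush hook */
	static void mydrv_scsi_init_queue(request_queue_t *q)
	{
		blk_queue_ordered(q, 1);
		blk_queue_issue_flush_fn(q, blkdev_scsi_issue_flush_fn);
	}
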
 void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
 {
        int rw = rq_data_dir(rq);
@@ -2192,7 +2327,7 @@ EXPORT_SYMBOL(__blk_attempt_remerge);
 static int __make_request(request_queue_t *q, struct bio *bio)
 {
        struct request *req, *freereq = NULL;
-       int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra;
+       int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err;
        sector_t sector;
 
        sector = bio->bi_sector;
@@ -2210,9 +2345,11 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 
        spin_lock_prefetch(q->queue_lock);
 
-       barrier = test_bit(BIO_RW_BARRIER, &bio->bi_rw);
-
-       ra = bio->bi_rw & (1 << BIO_RW_AHEAD);
+       barrier = bio_barrier(bio);
+       if (barrier && !(q->queue_flags & (1 << QUEUE_FLAG_ORDERED))) {
+               err = -EOPNOTSUPP;
+               goto end_io;
+       }
 
 again:
        spin_lock_irq(q->queue_lock);
@@ -2292,7 +2429,8 @@ get_rq:
                        /*
                         * READA bit set
                         */
-                       if (ra)
+                       err = -EWOULDBLOCK;
+                       if (bio_rw_ahead(bio))
                                goto end_io;
        
                        freereq = get_request_wait(q, rw);
@@ -2303,10 +2441,9 @@ get_rq:
        req->flags |= REQ_CMD;
 
        /*
-        * inherit FAILFAST from bio and don't stack up
-        * retries for read ahead
+        * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
         */
-       if (ra || test_bit(BIO_RW_FAILFAST, &bio->bi_rw))       
+       if (bio_rw_ahead(bio) || bio_failfast(bio))
                req->flags |= REQ_FAILFAST;
 
        /*
@@ -2340,7 +2477,7 @@ out:
        return 0;
 
 end_io:
-       bio_endio(bio, nr_sectors << 9, -EWOULDBLOCK);
+       bio_endio(bio, nr_sectors << 9, err);
        return 0;
 }
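
Note: on the submitting side, a barrier bio (sent with the BIO_RW_BARRIER bit in bi_rw) now fails fast with -EOPNOTSUPP when the queue never called blk_queue_ordered(), so callers should be ready to fall back to a plain write. A hedged sketch of a completion handler noticing that; all myfs_* names are hypothetical, and bi_private is assumed to point at a struct completion set by the submitter.

	/* sketch: completion handler for a commit block sent as a barrier write */
	static int myfs_barrier_unsupported;	/* set when the queue lacks ordered support */

	static int myfs_commit_end_io(struct bio *bio, unsigned int bytes_done, int err)
	{
		if (bio->bi_size)
			return 1;			/* partial completion, wait for the rest */

		if (err == -EOPNOTSUPP)			/* __make_request() rejected the barrier */
			myfs_barrier_unsupported = 1;	/* caller may resubmit as a plain write */

		complete((struct completion *) bio->bi_private);
		return 0;
	}
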
 
@@ -2399,6 +2536,7 @@ void generic_make_request(struct bio *bio)
        sector_t maxsector;
        int ret, nr_sectors = bio_sectors(bio);
 
+       might_sleep();
        /* Test device or partition size, when known. */
        maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
        if (maxsector) {
@@ -2446,11 +2584,11 @@ end_io:
                        break;
                }
 
-               if (unlikely(bio_sectors(bio) > q->max_sectors)) {
+               if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) {
                        printk("bio too big device %s (%u > %u)\n", 
                                bdevname(bio->bi_bdev, b),
                                bio_sectors(bio),
-                               q->max_sectors);
+                               q->max_hw_sectors);
                        goto end_io;
                }
 
@@ -2647,9 +2785,16 @@ void blk_recalc_rq_sectors(struct request *rq, int nsect)
 static int __end_that_request_first(struct request *req, int uptodate,
                                    int nr_bytes)
 {
-       int total_bytes, bio_nbytes, error = 0, next_idx = 0;
+       int total_bytes, bio_nbytes, error, next_idx = 0;
        struct bio *bio;
 
+       /*
+        * extend uptodate bool to allow < 0 value to be direct io error
+        */
+       error = 0;
+       if (end_io_error(uptodate))
+               error = !uptodate ? -EIO : uptodate;
+
        /*
         * for a REQ_BLOCK_PC request, we want to carry any eventual
         * sense key with us all the way through
@@ -2658,7 +2803,6 @@ static int __end_that_request_first(struct request *req, int uptodate,
                req->errors = 0;
 
        if (!uptodate) {
-               error = -EIO;
                if (blk_fs_request(req) && !(req->flags & REQ_QUIET))
                        printk("end_request: I/O error, dev %s, sector %llu\n",
                                req->rq_disk ? req->rq_disk->disk_name : "?",
@@ -2741,7 +2885,7 @@ static int __end_that_request_first(struct request *req, int uptodate,
 /**
  * end_that_request_first - end I/O on a request
  * @req:      the request being processed
- * @uptodate: 0 for I/O error
+ * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
  * @nr_sectors: number of sectors to end I/O on
  *
  * Description:
@@ -2762,7 +2906,7 @@ EXPORT_SYMBOL(end_that_request_first);
 /**
  * end_that_request_chunk - end I/O on a request
  * @req:      the request being processed
- * @uptodate: 0 for I/O error
+ * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
  * @nr_bytes: number of bytes to complete
  *
  * Description:
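
Note: with the widened @uptodate convention, a driver that knows the precise failure reason can hand it straight to the bio layer instead of collapsing everything to -EIO. A rough sketch; the helper name is hypothetical and the caller is assumed to hold the queue lock, as usual for completion.

	/* sketch: pass 1 for success, 0 for a generic error, or a negative
	 * errno such as -EOPNOTSUPP for a specific failure */
	static void mydrv_end_request(struct request *rq, int uptodate)
	{
		if (!end_that_request_first(rq, uptodate, rq->hard_nr_sectors)) {
			blkdev_dequeue_request(rq);
			end_that_request_last(rq);
		}
	}
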
@@ -3063,13 +3207,61 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
        unsigned long ra_kb;
        ssize_t ret = queue_var_store(&ra_kb, page, count);
 
+       spin_lock_irq(q->queue_lock);
        if (ra_kb > (q->max_sectors >> 1))
                ra_kb = (q->max_sectors >> 1);
 
        q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
+       spin_unlock_irq(q->queue_lock);
+
        return ret;
 }
 
+static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
+{
+       int max_sectors_kb = q->max_sectors >> 1;
+
+       return queue_var_show(max_sectors_kb, (page));
+}
+
+static ssize_t
+queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
+{
+       unsigned long max_sectors_kb,
+                       max_hw_sectors_kb = q->max_hw_sectors >> 1,
+                       page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
+       ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
+       int ra_kb;
+
+       if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
+               return -EINVAL;
+       /*
+        * Take the queue lock to update the readahead and max_sectors
+        * values synchronously:
+        */
+       spin_lock_irq(q->queue_lock);
+       /*
+        * Trim readahead window as well, if necessary:
+        */
+       ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
+       if (ra_kb > max_sectors_kb)
+               q->backing_dev_info.ra_pages =
+                               max_sectors_kb >> (PAGE_CACHE_SHIFT - 10);
+
+       q->max_sectors = max_sectors_kb << 1;
+       spin_unlock_irq(q->queue_lock);
+
+       return ret;
+}
+
+static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
+{
+       int max_hw_sectors_kb = q->max_hw_sectors >> 1;
+
+       return queue_var_show(max_hw_sectors_kb, (page));
+}
+
+
 static struct queue_sysfs_entry queue_requests_entry = {
        .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
        .show = queue_requests_show,
@@ -3082,9 +3274,22 @@ static struct queue_sysfs_entry queue_ra_entry = {
        .store = queue_ra_store,
 };
 
+static struct queue_sysfs_entry queue_max_sectors_entry = {
+       .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
+       .show = queue_max_sectors_show,
+       .store = queue_max_sectors_store,
+};
+
+static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
+       .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
+       .show = queue_max_hw_sectors_show,
+};
+
 static struct attribute *default_attrs[] = {
        &queue_requests_entry.attr,
        &queue_ra_entry.attr,
+       &queue_max_hw_sectors_entry.attr,
+       &queue_max_sectors_entry.attr,
        NULL,
 };
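
Note: the two attributes added above appear as /sys/block/<dev>/queue/max_sectors_kb (writable) and max_hw_sectors_kb (read-only). A hedged userspace sketch for capping request size; the device name and value are illustrative, and the written value must stay between the page size and max_hw_sectors_kb.

	#include <stdio.h>

	/* sketch: clamp sda's per-request size to 128 KiB via the new attribute */
	int main(void)
	{
		FILE *f = fopen("/sys/block/sda/queue/max_sectors_kb", "w");

		if (!f)
			return 1;
		fprintf(f, "128\n");
		return fclose(f) ? 1 : 0;
	}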