1 /******************************************************************************
4 * XenLinux virtual block-device driver.
6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8 * Copyright (c) 2004, Christian Limpach
9 * Copyright (c) 2004, Andrew Warfield
10 * Copyright (c) 2005, Christopher Clark
11 * Copyright (c) 2005, XenSource Ltd
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License version 2
15 * as published by the Free Software Foundation; or, when distributed
16 * separately from the Linux kernel or incorporated into other
17 * software packages, subject to the following license:
19 * Permission is hereby granted, free of charge, to any person obtaining a copy
20 * of this source file (the "Software"), to deal in the Software without
21 * restriction, including without limitation the rights to use, copy, modify,
22 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
23 * and to permit persons to whom the Software is furnished to do so, subject to
24 * the following conditions:
26 * The above copyright notice and this permission notice shall be included in
27 * all copies or substantial portions of the Software.
29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
38 #include <linux/version.h>
40 #include <linux/cdrom.h>
41 #include <linux/sched.h>
42 #include <linux/interrupt.h>
43 #include <scsi/scsi.h>
44 #include <xen/evtchn.h>
45 #include <xen/xenbus.h>
46 #include <xen/interface/grant_table.h>
47 #include <xen/gnttab.h>
48 #include <asm/hypervisor.h>
49 #include <asm/maddr.h>
51 #define BLKIF_STATE_DISCONNECTED 0
52 #define BLKIF_STATE_CONNECTED 1
53 #define BLKIF_STATE_SUSPENDED 2
55 #define MAXIMUM_OUTSTANDING_BLOCK_REQS \
56 (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
57 #define GRANT_INVALID_REF 0
59 static void connect(struct blkfront_info *);
60 static void blkfront_closing(struct xenbus_device *);
61 static int blkfront_remove(struct xenbus_device *);
62 static int talk_to_backend(struct xenbus_device *, struct blkfront_info *);
63 static int setup_blkring(struct xenbus_device *, struct blkfront_info *);
65 static void kick_pending_request_queues(struct blkfront_info *);
67 static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs);
68 static void blkif_restart_queue(void *arg);
69 static void blkif_recover(struct blkfront_info *);
70 static void blkif_completion(struct blk_shadow *);
71 static void blkif_free(struct blkfront_info *, int);
75 * Entry point to this code when a new device is created. Allocate the basic
76 * structures and the ring buffer for communication with the backend, and
77 * inform the backend of the appropriate details for those. Switch to
80 static int blkfront_probe(struct xenbus_device *dev,
81 const struct xenbus_device_id *id)
84 struct blkfront_info *info;
86 /* FIXME: Use dynamic device id if this is not set. */
87 err = xenbus_scanf(XBT_NIL, dev->nodename,
88 "virtual-device", "%i", &vdevice);
90 xenbus_dev_fatal(dev, err, "reading virtual-device");
94 info = kzalloc(sizeof(*info), GFP_KERNEL);
96 xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
101 info->vdevice = vdevice;
102 info->connected = BLKIF_STATE_DISCONNECTED;
103 INIT_WORK(&info->work, blkif_restart_queue, (void *)info);
105 for (i = 0; i < BLK_RING_SIZE; i++)
106 info->shadow[i].req.id = i+1;
107 info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
109 /* Front end dir is a number, which is used as the id. */
110 info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0);
111 dev->dev.driver_data = info;
113 err = talk_to_backend(dev, info);
116 dev->dev.driver_data = NULL;
125 * We are reconnecting to the backend, due to a suspend/resume, or a backend
126 * driver restart. We tear down our blkif structure and recreate it, but
127 * leave the device-layer structures intact so that this is transparent to the
128 * rest of the kernel.
130 static int blkfront_resume(struct xenbus_device *dev)
132 struct blkfront_info *info = dev->dev.driver_data;
135 DPRINTK("blkfront_resume: %s\n", dev->nodename);
139 err = talk_to_backend(dev, info);
147 /* Common code used when first setting up, and when resuming. */
148 static int talk_to_backend(struct xenbus_device *dev,
149 struct blkfront_info *info)
151 const char *message = NULL;
152 struct xenbus_transaction xbt;
155 /* Create shared ring, alloc event channel. */
156 err = setup_blkring(dev, info);
161 err = xenbus_transaction_start(&xbt);
163 xenbus_dev_fatal(dev, err, "starting transaction");
164 goto destroy_blkring;
167 err = xenbus_printf(xbt, dev->nodename,
168 "ring-ref","%u", info->ring_ref);
170 message = "writing ring-ref";
171 goto abort_transaction;
173 err = xenbus_printf(xbt, dev->nodename,
174 "event-channel", "%u", info->evtchn);
176 message = "writing event-channel";
177 goto abort_transaction;
180 err = xenbus_transaction_end(xbt, 0);
184 xenbus_dev_fatal(dev, err, "completing transaction");
185 goto destroy_blkring;
188 xenbus_switch_state(dev, XenbusStateInitialised);
193 xenbus_transaction_end(xbt, 1);
195 xenbus_dev_fatal(dev, err, "%s", message);
203 static int setup_blkring(struct xenbus_device *dev,
204 struct blkfront_info *info)
206 blkif_sring_t *sring;
209 info->ring_ref = GRANT_INVALID_REF;
211 sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
213 xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
216 SHARED_RING_INIT(sring);
217 FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
219 err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
221 free_page((unsigned long)sring);
222 info->ring.sring = NULL;
225 info->ring_ref = err;
227 err = xenbus_alloc_evtchn(dev, &info->evtchn);
231 err = bind_evtchn_to_irqhandler(
232 info->evtchn, blkif_int, SA_SAMPLE_RANDOM, "blkif", info);
234 xenbus_dev_fatal(dev, err,
235 "bind_evtchn_to_irqhandler failed");
248 * Callback received when the backend's state changes.
250 static void backend_changed(struct xenbus_device *dev,
251 enum xenbus_state backend_state)
253 struct blkfront_info *info = dev->dev.driver_data;
254 struct block_device *bd;
256 DPRINTK("blkfront:backend_changed.\n");
258 switch (backend_state) {
259 case XenbusStateInitialising:
260 case XenbusStateInitWait:
261 case XenbusStateInitialised:
262 case XenbusStateUnknown:
263 case XenbusStateClosed:
266 case XenbusStateConnected:
270 case XenbusStateClosing:
271 bd = bdget(info->dev);
273 xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
275 mutex_lock(&bd->bd_mutex);
277 xenbus_dev_error(dev, -EBUSY,
278 "Device in use; refusing to close");
280 blkfront_closing(dev);
281 mutex_unlock(&bd->bd_mutex);
288 /* ** Connection ** */
292 * Invoked when the backend is finally 'ready' (and has told produced
293 * the details about the physical device - #sectors, size, etc).
295 static void connect(struct blkfront_info *info)
297 unsigned long sectors, sector_size;
301 if ((info->connected == BLKIF_STATE_CONNECTED) ||
302 (info->connected == BLKIF_STATE_SUSPENDED) )
305 DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend);
307 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
308 "sectors", "%lu", §ors,
309 "info", "%u", &binfo,
310 "sector-size", "%lu", §or_size,
313 xenbus_dev_fatal(info->xbdev, err,
314 "reading backend fields at %s",
315 info->xbdev->otherend);
319 err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
321 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
322 info->xbdev->otherend);
326 (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
328 /* Kick pending requests. */
329 spin_lock_irq(&blkif_io_lock);
330 info->connected = BLKIF_STATE_CONNECTED;
331 kick_pending_request_queues(info);
332 spin_unlock_irq(&blkif_io_lock);
338 * Handle the change of state of the backend to Closing. We must delete our
339 * device-layer structures now, to ensure that writes are flushed through to
340 * the backend. Once is this done, we can switch to Closed in
343 static void blkfront_closing(struct xenbus_device *dev)
345 struct blkfront_info *info = dev->dev.driver_data;
348 DPRINTK("blkfront_closing: %s removed\n", dev->nodename);
350 if (info->rq == NULL)
353 spin_lock_irqsave(&blkif_io_lock, flags);
354 /* No more blkif_request(). */
355 blk_stop_queue(info->rq);
356 /* No more gnttab callback work. */
357 gnttab_cancel_free_callback(&info->callback);
358 spin_unlock_irqrestore(&blkif_io_lock, flags);
360 /* Flush gnttab callback work. Must be done with no locks held. */
361 flush_scheduled_work();
365 xenbus_frontend_closed(dev);
369 static int blkfront_remove(struct xenbus_device *dev)
371 struct blkfront_info *info = dev->dev.driver_data;
373 DPRINTK("blkfront_remove: %s removed\n", dev->nodename);
383 static inline int GET_ID_FROM_FREELIST(
384 struct blkfront_info *info)
386 unsigned long free = info->shadow_free;
387 BUG_ON(free > BLK_RING_SIZE);
388 info->shadow_free = info->shadow[free].req.id;
389 info->shadow[free].req.id = 0x0fffffee; /* debug */
393 static inline void ADD_ID_TO_FREELIST(
394 struct blkfront_info *info, unsigned long id)
396 info->shadow[id].req.id = info->shadow_free;
397 info->shadow[id].request = 0;
398 info->shadow_free = id;
401 static inline void flush_requests(struct blkfront_info *info)
405 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
408 notify_remote_via_irq(info->irq);
411 static void kick_pending_request_queues(struct blkfront_info *info)
413 if (!RING_FULL(&info->ring)) {
414 /* Re-enable calldowns. */
415 blk_start_queue(info->rq);
416 /* Kick things off immediately. */
417 do_blkif_request(info->rq);
421 static void blkif_restart_queue(void *arg)
423 struct blkfront_info *info = (struct blkfront_info *)arg;
424 spin_lock_irq(&blkif_io_lock);
425 if (info->connected == BLKIF_STATE_CONNECTED)
426 kick_pending_request_queues(info);
427 spin_unlock_irq(&blkif_io_lock);
430 static void blkif_restart_queue_callback(void *arg)
432 struct blkfront_info *info = (struct blkfront_info *)arg;
433 schedule_work(&info->work);
436 int blkif_open(struct inode *inode, struct file *filep)
438 struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
444 int blkif_release(struct inode *inode, struct file *filep)
446 struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
448 if (info->users == 0) {
449 /* Check whether we have been instructed to close. We will
450 have ignored this request initially, as the device was
452 struct xenbus_device * dev = info->xbdev;
453 enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
455 if (state == XenbusStateClosing)
456 blkfront_closing(dev);
462 int blkif_ioctl(struct inode *inode, struct file *filep,
463 unsigned command, unsigned long argument)
467 DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
468 command, (long)argument, inode->i_rdev);
471 case CDROMMULTISESSION:
472 DPRINTK("FIXME: support multisession CDs later\n");
473 for (i = 0; i < sizeof(struct cdrom_multisession); i++)
474 if (put_user(0, (char __user *)(argument + i)))
479 /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
481 return -EINVAL; /* same return as native Linux */
488 int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
490 /* We don't have real geometry info, but let's at least return
491 values consistent with the size of the device */
492 sector_t nsect = get_capacity(bd->bd_disk);
493 sector_t cylinders = nsect;
497 sector_div(cylinders, hg->heads * hg->sectors);
498 hg->cylinders = cylinders;
499 if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
500 hg->cylinders = 0xffff;
506 * blkif_queue_request
510 * id: for guest use only.
511 * operation: BLKIF_OP_{READ,WRITE,PROBE}
512 * buffer: buffer to read/write into. this should be a
513 * virtual address in the guest os.
515 static int blkif_queue_request(struct request *req)
517 struct blkfront_info *info = req->rq_disk->private_data;
518 unsigned long buffer_mfn;
519 blkif_request_t *ring_req;
521 struct bio_vec *bvec;
524 unsigned int fsect, lsect;
526 grant_ref_t gref_head;
528 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
531 if (gnttab_alloc_grant_references(
532 BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
533 gnttab_request_free_callback(
535 blkif_restart_queue_callback,
537 BLKIF_MAX_SEGMENTS_PER_REQUEST);
541 /* Fill out a communications ring structure. */
542 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
543 id = GET_ID_FROM_FREELIST(info);
544 info->shadow[id].request = (unsigned long)req;
547 ring_req->operation = rq_data_dir(req) ?
548 BLKIF_OP_WRITE : BLKIF_OP_READ;
549 ring_req->sector_number = (blkif_sector_t)req->sector;
550 ring_req->handle = info->handle;
552 ring_req->nr_segments = 0;
553 rq_for_each_bio (bio, req) {
554 bio_for_each_segment (bvec, bio, idx) {
555 BUG_ON(ring_req->nr_segments
556 == BLKIF_MAX_SEGMENTS_PER_REQUEST);
557 buffer_mfn = page_to_phys(bvec->bv_page) >> PAGE_SHIFT;
558 fsect = bvec->bv_offset >> 9;
559 lsect = fsect + (bvec->bv_len >> 9) - 1;
560 /* install a grant reference. */
561 ref = gnttab_claim_grant_reference(&gref_head);
562 BUG_ON(ref == -ENOSPC);
564 gnttab_grant_foreign_access_ref(
566 info->xbdev->otherend_id,
570 info->shadow[id].frame[ring_req->nr_segments] =
571 mfn_to_pfn(buffer_mfn);
573 ring_req->seg[ring_req->nr_segments] =
574 (struct blkif_request_segment) {
577 .last_sect = lsect };
579 ring_req->nr_segments++;
583 info->ring.req_prod_pvt++;
585 /* Keep a private copy so we can reissue requests when recovering. */
586 info->shadow[id].req = *ring_req;
588 gnttab_free_grant_references(gref_head);
595 * read a block; request is in a request queue
597 void do_blkif_request(request_queue_t *rq)
599 struct blkfront_info *info = NULL;
603 DPRINTK("Entered do_blkif_request\n");
607 while ((req = elv_next_request(rq)) != NULL) {
608 info = req->rq_disk->private_data;
609 if (!blk_fs_request(req)) {
614 if (RING_FULL(&info->ring))
617 DPRINTK("do_blk_req %p: cmd %p, sec %lx, "
618 "(%u/%li) buffer:%p [%s]\n",
619 req, req->cmd, req->sector, req->current_nr_sectors,
620 req->nr_sectors, req->buffer,
621 rq_data_dir(req) ? "write" : "read");
624 blkdev_dequeue_request(req);
625 if (blkif_queue_request(req)) {
626 blk_requeue_request(rq, req);
628 /* Avoid pointless unplugs. */
637 flush_requests(info);
641 static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
644 blkif_response_t *bret;
647 struct blkfront_info *info = (struct blkfront_info *)dev_id;
649 spin_lock_irqsave(&blkif_io_lock, flags);
651 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
652 spin_unlock_irqrestore(&blkif_io_lock, flags);
657 rp = info->ring.sring->rsp_prod;
658 rmb(); /* Ensure we see queued responses up to 'rp'. */
660 for (i = info->ring.rsp_cons; i != rp; i++) {
664 bret = RING_GET_RESPONSE(&info->ring, i);
666 req = (struct request *)info->shadow[id].request;
668 blkif_completion(&info->shadow[id]);
670 ADD_ID_TO_FREELIST(info, id);
672 switch (bret->operation) {
675 if (unlikely(bret->status != BLKIF_RSP_OKAY))
676 DPRINTK("Bad return from blkdev data "
677 "request: %x\n", bret->status);
679 ret = end_that_request_first(
680 req, (bret->status == BLKIF_RSP_OKAY),
681 req->hard_nr_sectors);
683 end_that_request_last(
684 req, (bret->status == BLKIF_RSP_OKAY));
691 info->ring.rsp_cons = i;
693 if (i != info->ring.req_prod_pvt) {
695 RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
699 info->ring.sring->rsp_event = i + 1;
701 kick_pending_request_queues(info);
703 spin_unlock_irqrestore(&blkif_io_lock, flags);
708 static void blkif_free(struct blkfront_info *info, int suspend)
710 /* Prevent new requests being issued until we fix things up. */
711 spin_lock_irq(&blkif_io_lock);
712 info->connected = suspend ?
713 BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
714 /* No more blkif_request(). */
716 blk_stop_queue(info->rq);
717 /* No more gnttab callback work. */
718 gnttab_cancel_free_callback(&info->callback);
719 spin_unlock_irq(&blkif_io_lock);
721 /* Flush gnttab callback work. Must be done with no locks held. */
722 flush_scheduled_work();
724 /* Free resources associated with old device channel. */
725 if (info->ring_ref != GRANT_INVALID_REF) {
726 gnttab_end_foreign_access(info->ring_ref, 0,
727 (unsigned long)info->ring.sring);
728 info->ring_ref = GRANT_INVALID_REF;
729 info->ring.sring = NULL;
732 unbind_from_irqhandler(info->irq, info);
733 info->evtchn = info->irq = 0;
737 static void blkif_completion(struct blk_shadow *s)
740 for (i = 0; i < s->req.nr_segments; i++)
741 gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
744 static void blkif_recover(struct blkfront_info *info)
747 blkif_request_t *req;
748 struct blk_shadow *copy;
751 /* Stage 1: Make a safe copy of the shadow state. */
752 copy = kmalloc(sizeof(info->shadow), GFP_KERNEL | __GFP_NOFAIL);
753 memcpy(copy, info->shadow, sizeof(info->shadow));
755 /* Stage 2: Set up free list. */
756 memset(&info->shadow, 0, sizeof(info->shadow));
757 for (i = 0; i < BLK_RING_SIZE; i++)
758 info->shadow[i].req.id = i+1;
759 info->shadow_free = info->ring.req_prod_pvt;
760 info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
762 /* Stage 3: Find pending requests and requeue them. */
763 for (i = 0; i < BLK_RING_SIZE; i++) {
765 if (copy[i].request == 0)
768 /* Grab a request slot and copy shadow state into it. */
769 req = RING_GET_REQUEST(
770 &info->ring, info->ring.req_prod_pvt);
773 /* We get a new request id, and must reset the shadow state. */
774 req->id = GET_ID_FROM_FREELIST(info);
775 memcpy(&info->shadow[req->id], ©[i], sizeof(copy[i]));
777 /* Rewrite any grant references invalidated by susp/resume. */
778 for (j = 0; j < req->nr_segments; j++)
779 gnttab_grant_foreign_access_ref(
781 info->xbdev->otherend_id,
782 pfn_to_mfn(info->shadow[req->id].frame[j]),
785 info->shadow[req->id].request));
786 info->shadow[req->id].req = *req;
788 info->ring.req_prod_pvt++;
793 (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
795 spin_lock_irq(&blkif_io_lock);
797 /* Now safe for us to use the shared ring */
798 info->connected = BLKIF_STATE_CONNECTED;
800 /* Send off requeued requests */
801 flush_requests(info);
803 /* Kick any other new requests queued since we resumed */
804 kick_pending_request_queues(info);
806 spin_unlock_irq(&blkif_io_lock);
810 /* ** Driver Registration ** */
813 static struct xenbus_device_id blkfront_ids[] = {
819 static struct xenbus_driver blkfront = {
821 .owner = THIS_MODULE,
823 .probe = blkfront_probe,
824 .remove = blkfront_remove,
825 .resume = blkfront_resume,
826 .otherend_changed = backend_changed,
830 static int __init xlblk_init(void)
832 if (!is_running_on_xen())
835 return xenbus_register_frontend(&blkfront);
837 module_init(xlblk_init);
840 static void xlblk_exit(void)
842 return xenbus_unregister_driver(&blkfront);
844 module_exit(xlblk_exit);
846 MODULE_LICENSE("Dual BSD/GPL");