/******************************************************************************
 * blkfront.c
 *
 * XenLinux virtual block-device driver.
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
 * Copyright (c) 2004, Christian Limpach
 * Copyright (c) 2004, Andrew Warfield
 * Copyright (c) 2005, Christopher Clark
 *
 * This file may be distributed separately from the Linux kernel, or
 * incorporated into other software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#define ASSERT(_p) \
    if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
    __LINE__, __FILE__); *(int*)0=0; }
#include <linux/version.h>

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
#include "block.h"
#else
#include "common.h"
#include <linux/blk.h>
#include <linux/tqueue.h>
#endif

#include <linux/cdrom.h>
#include <linux/sched.h>
#include <linux/interrupt.h>
#include <scsi/scsi.h>
#include <asm-xen/ctrl_if.h>
#include <asm-xen/evtchn.h>
#ifdef CONFIG_XEN_BLKDEV_GRANT
#include <asm-xen/xen-public/grant_table.h>
#include <asm-xen/gnttab.h>
#endif

typedef unsigned char byte; /* from linux/ide.h */
/* Control whether runtime update of vbds is enabled. */
#define ENABLE_VBD_UPDATE 1

#if ENABLE_VBD_UPDATE
static void vbd_update(void);
#else
static void vbd_update(void){}
#endif
#define BLKIF_STATE_CLOSED       0
#define BLKIF_STATE_DISCONNECTED 1
#define BLKIF_STATE_CONNECTED    2

#define WPRINTK(fmt, args...) printk(KERN_WARNING "xen_blk: " fmt, ##args)
static int blkif_handle = 0;
static unsigned int blkif_state = BLKIF_STATE_CLOSED;
static unsigned int blkif_evtchn = 0;
static unsigned int blkif_irq = 0;

static int blkif_control_rsp_valid;
static blkif_response_t blkif_control_rsp;

static blkif_front_ring_t blk_ring;

#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
#ifdef CONFIG_XEN_BLKDEV_GRANT
static domid_t rdomid = 0;
static grant_ref_t gref_head, gref_terminal;
#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
    (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLKIF_RING_SIZE)
#define GRANTREF_INVALID (1<<15)
#endif
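
/*
 * With CONFIG_XEN_BLKDEV_GRANT, data pages are exposed to the backend
 * domain (rdomid) via grant references rather than raw machine frames.
 * Enough references are reserved up front for a full ring of requests,
 * each carrying up to BLKIF_MAX_SEGMENTS_PER_REQUEST segments.  Bit 15
 * of a frame_and_sects entry is otherwise unused, so GRANTREF_INVALID
 * borrows it to flag references that must be re-established after a
 * suspend/resume cycle.
 */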

static struct blk_shadow {
    blkif_request_t req;
    unsigned long request;
    unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
} blk_shadow[BLK_RING_SIZE];
unsigned long blk_shadow_free;
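
/*
 * One shadow entry per ring slot: 'req' holds a sanitised copy of the
 * in-flight request so it can be reissued if the backend reconnects,
 * 'request' points back at the kernel I/O structure (a struct request on
 * 2.6, a buffer_head chain on 2.4), and 'frame' remembers the granted
 * page frames so references can be rewritten on resume.
 */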

static int recovery = 0; /* Recovery in progress: protected by blkif_io_lock */

static void kick_pending_request_queues(void);

int __init xlblk_init(void);

static void blkif_completion(struct blk_shadow *s);

static inline int GET_ID_FROM_FREELIST(void)
{
    unsigned long free = blk_shadow_free;
    BUG_ON(free > BLK_RING_SIZE);
    blk_shadow_free = blk_shadow[free].req.id;
    blk_shadow[free].req.id = 0x0fffffee; /* debug */
    return free;
}

static inline void ADD_ID_TO_FREELIST(unsigned long id)
{
    blk_shadow[id].req.id  = blk_shadow_free;
    blk_shadow[id].request = 0;
    blk_shadow_free = id;
}
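
/*
 * Free shadow entries are threaded through the otherwise-unused req.id
 * field: blk_shadow_free names the first free slot and each free slot's
 * req.id names the next.  For example, with slots 2 and 5 free,
 * blk_shadow_free == 2 and blk_shadow[2].req.id == 5.  The 0x0fffffee
 * poison value makes accidental reuse of an allocated id easy to spot.
 */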

/************************  COMMON CODE (inlined)  ************************/

/* Kernel-specific definitions used in the common code */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
#define DISABLE_SCATTERGATHER()
#else
static int sg_operation = -1;
#define DISABLE_SCATTERGATHER() (sg_operation = -1)
#endif

static inline void pickle_request(struct blk_shadow *s, blkif_request_t *r)
{
#ifndef CONFIG_XEN_BLKDEV_GRANT
    int i;
#endif

    s->req = *r;

#ifndef CONFIG_XEN_BLKDEV_GRANT
    for ( i = 0; i < r->nr_segments; i++ )
        s->req.frame_and_sects[i] = machine_to_phys(r->frame_and_sects[i]);
#endif
}

static inline void unpickle_request(blkif_request_t *r, struct blk_shadow *s)
{
#ifndef CONFIG_XEN_BLKDEV_GRANT
    int i;
#endif

    *r = s->req;

#ifndef CONFIG_XEN_BLKDEV_GRANT
    for ( i = 0; i < s->req.nr_segments; i++ )
        r->frame_and_sects[i] = phys_to_machine(s->req.frame_and_sects[i]);
#endif
}
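
/*
 * Rationale: without grant tables the ring carries machine frame numbers,
 * which are not stable across save/restore.  The shadow copy therefore
 * stores pseudo-physical frames (machine_to_phys) and unpickle_request
 * translates back when a request is reissued.  With grant tables the
 * request can be copied verbatim; stale references are fixed up in
 * blkif_recover() instead.
 */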

static inline void flush_requests(void)
{
    DISABLE_SCATTERGATHER();
    RING_PUSH_REQUESTS(&blk_ring);
    notify_via_evtchn(blkif_evtchn);
}
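
/*
 * RING_PUSH_REQUESTS publishes req_prod_pvt to the shared ring (with the
 * required write barrier) and the event-channel notification wakes the
 * backend.  Callers queue as many requests as they can and flush once,
 * so a batch of I/O costs a single notification.
 */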

/**************************  KERNEL VERSION 2.6  **************************/

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)

module_init(xlblk_init);

#if ENABLE_VBD_UPDATE
static void update_vbds_task(void *unused)
{
    xlvbd_update_vbds();
}

static void vbd_update(void)
{
    static DECLARE_WORK(update_tq, update_vbds_task, NULL);
    schedule_work(&update_tq);
}
#endif /* ENABLE_VBD_UPDATE */

static void kick_pending_request_queues(void)
{
    if ( (xlbd_blk_queue != NULL) &&
         test_bit(QUEUE_FLAG_STOPPED, &xlbd_blk_queue->queue_flags) )
    {
        blk_start_queue(xlbd_blk_queue);
        /* XXXcl call to request_fn should not be needed but
         * we get stuck without... needs investigating
         */
        xlbd_blk_queue->request_fn(xlbd_blk_queue);
    }
}

int blkif_open(struct inode *inode, struct file *filep)
{
    struct gendisk *gd = inode->i_bdev->bd_disk;
    struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;

    /* Update of usage count is protected by per-device semaphore. */
    di->mi->usage++;

    return 0;
}


int blkif_release(struct inode *inode, struct file *filep)
{
    struct gendisk *gd = inode->i_bdev->bd_disk;
    struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;

    /*
     * When usage drops to zero it may allow more VBD updates to occur.
     * Update of usage count is protected by a per-device semaphore.
     */
    if ( --di->mi->usage == 0 )
    {
        vbd_update();
    }

    return 0;
}

int blkif_ioctl(struct inode *inode, struct file *filep,
                unsigned command, unsigned long argument)
{
    int i;

    DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
                  command, (long)argument, inode->i_rdev);

    switch ( command )
    {
    case HDIO_GETGEO:
        /* return ENOSYS to use defaults */
        return -ENOSYS;

    case CDROMMULTISESSION:
        DPRINTK("FIXME: support multisession CDs later\n");
        for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
            if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
        return 0;

    default:
        printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
               command);
        return -ENOSYS;
    }
}

/*
 * blkif_queue_request
 *
 * id: for guest use only.
 * operation: BLKIF_OP_{READ,WRITE,PROBE}
 * buffer: buffer to read/write into. this should be a
 *   virtual address in the guest os.
 */
static int blkif_queue_request(struct request *req)
{
    struct xlbd_disk_info *di =
        (struct xlbd_disk_info *)req->rq_disk->private_data;
    unsigned long buffer_ma;
    blkif_request_t *ring_req;
    struct bio *bio;
    struct bio_vec *bvec;
    int idx;
    unsigned long id;
    unsigned int fsect, lsect;
#ifdef CONFIG_XEN_BLKDEV_GRANT
    int ref;
#endif

    if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
        return 1;

    /* Fill out a communications ring structure. */
    ring_req = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt);
    id = GET_ID_FROM_FREELIST();
    blk_shadow[id].request = (unsigned long)req;

    ring_req->id = id;
    ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE :
        BLKIF_OP_READ;
    ring_req->sector_number = (blkif_sector_t)req->sector;
    ring_req->device = di->xd_device;

    ring_req->nr_segments = 0;
    rq_for_each_bio(bio, req)
    {
        bio_for_each_segment(bvec, bio, idx)
        {
            if ( ring_req->nr_segments == BLKIF_MAX_SEGMENTS_PER_REQUEST )
                BUG();
            buffer_ma = page_to_phys(bvec->bv_page);
            fsect = bvec->bv_offset >> 9;
            lsect = fsect + (bvec->bv_len >> 9) - 1;
#ifdef CONFIG_XEN_BLKDEV_GRANT
            /* install a grant reference. */
            ref = gnttab_claim_grant_reference(&gref_head, gref_terminal);
            ASSERT( ref != -ENOSPC );

            gnttab_grant_foreign_access_ref(
                ref,
                rdomid,
                buffer_ma >> PAGE_SHIFT,
                rq_data_dir(req) );

            blk_shadow[id].frame[ring_req->nr_segments] =
                buffer_ma >> PAGE_SHIFT;

            ring_req->frame_and_sects[ring_req->nr_segments++] =
                (((u32) ref) << 16) | (fsect << 3) | lsect;
#else
            ring_req->frame_and_sects[ring_req->nr_segments++] =
                buffer_ma | (fsect << 3) | lsect;
#endif
        }
    }

    blk_ring.req_prod_pvt++;

    /* Keep a private copy so we can reissue requests when recovering. */
    pickle_request(&blk_shadow[id], ring_req);

    return 0;
}
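
/*
 * Each frame_and_sects entry packs one page and a sector range into a
 * single word.  With grant tables the grant reference occupies the top
 * 16 bits (ref << 16); without them the page-aligned machine address is
 * used directly, leaving its low 12 bits clear.  Bits 5..3 hold the
 * first 512-byte sector touched in the page and bits 2..0 the last, so
 * fsect == 0, lsect == 7 describes a whole 4KB page.
 */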

/*
 * do_blkif_request
 *  read a block; request is in a request queue
 */
void do_blkif_request(request_queue_t *rq)
{
    struct request *req;
    int queued;

    DPRINTK("Entered do_blkif_request\n");

    queued = 0;

    while ( (req = elv_next_request(rq)) != NULL )
    {
        if ( !blk_fs_request(req) )
        {
            end_request(req, 0);
            continue;
        }

        if ( RING_FULL(&blk_ring) )
        {
            blk_stop_queue(rq);
            break;
        }

        DPRINTK("do_blk_req %p: cmd %p, sec %lx, (%u/%li) buffer:%p [%s]\n",
                req, req->cmd, req->sector, req->current_nr_sectors,
                req->nr_sectors, req->buffer,
                rq_data_dir(req) ? "write" : "read");

        blkdev_dequeue_request(req);
        if ( blkif_queue_request(req) )
        {
            blk_stop_queue(rq);
            break;
        }

        queued++;
    }

    if ( queued != 0 )
        flush_requests();
}

static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
{
    struct request *req;
    blkif_response_t *bret;
    RING_IDX i, rp;
    unsigned long flags;

    spin_lock_irqsave(&blkif_io_lock, flags);

    if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) ||
         unlikely(recovery) )
    {
        spin_unlock_irqrestore(&blkif_io_lock, flags);
        return IRQ_HANDLED;
    }

    rp = blk_ring.sring->rsp_prod;
    rmb(); /* Ensure we see queued responses up to 'rp'. */

    for ( i = blk_ring.rsp_cons; i != rp; i++ )
    {
        unsigned long id;

        bret = RING_GET_RESPONSE(&blk_ring, i);
        id = bret->id;
        req = (struct request *)blk_shadow[id].request;

        blkif_completion(&blk_shadow[id]);

        ADD_ID_TO_FREELIST(id);

        switch ( bret->operation )
        {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
            if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
                DPRINTK("Bad return from blkdev data request: %x\n",
                        bret->status);
            if ( unlikely(end_that_request_first
                          (req,
                           (bret->status == BLKIF_RSP_OKAY),
                           req->hard_nr_sectors)) )
                BUG();
            end_that_request_last(req);
            break;
        case BLKIF_OP_PROBE:
            memcpy(&blkif_control_rsp, bret, sizeof(*bret));
            blkif_control_rsp_valid = 1;
            break;
        default:
            BUG();
        }
    }

    blk_ring.rsp_cons = i;

    kick_pending_request_queues();

    spin_unlock_irqrestore(&blkif_io_lock, flags);

    return IRQ_HANDLED;
}
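
/*
 * Note the ordering in the handler above: the shadow entry is completed
 * (grants released) and its id recycled before the Linux request is
 * ended, and rsp_cons is advanced only once, after the whole batch of
 * responses has been consumed.
 */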

#else

/**************************  KERNEL VERSION 2.4  **************************/

static kdev_t        sg_dev;
static unsigned long sg_next_sect;

/*
 * Request queues with outstanding work, but ring is currently full.
 * We need no special lock here, as we always access this with the
 * blkif_io_lock held. We only need a small maximum list.
 */
#define MAX_PENDING 8
static request_queue_t *pending_queues[MAX_PENDING];
static int nr_pending;

#define blkif_io_lock io_request_lock

/*============================================================================*/
#if ENABLE_VBD_UPDATE

/*
 * blkif_update_int/update_vbds_task - handle VBD update events.
 *  Schedule a task for keventd to run, which will update the VBDs and perform
 *  the corresponding updates to our view of VBD state.
 */
static void update_vbds_task(void *unused)
{
    xlvbd_update_vbds();
}

static void vbd_update(void)
{
    static struct tq_struct update_tq;
    update_tq.routine = update_vbds_task;
    schedule_task(&update_tq);
}
#endif /* ENABLE_VBD_UPDATE */
/*============================================================================*/

static void kick_pending_request_queues(void)
{
    /* We kick pending request queues if the ring is reasonably empty. */
    if ( (nr_pending != 0) &&
         (RING_PENDING_REQUESTS(&blk_ring) < (BLK_RING_SIZE >> 1)) )
    {
        /* Attempt to drain the queue, but bail if the ring becomes full. */
        while ( (nr_pending != 0) && !RING_FULL(&blk_ring) )
            do_blkif_request(pending_queues[--nr_pending]);
    }
}
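
/*
 * The 2.4 path has no blk_stop_queue()-style plugging, so queues that hit
 * a full ring are parked in pending_queues[] by do_blkif_request() and
 * replayed here once the ring has drained below half occupancy.
 */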

int blkif_open(struct inode *inode, struct file *filep)
{
    short xldev = inode->i_rdev;
    struct gendisk *gd = get_gendisk(xldev);
    xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
    short minor = MINOR(xldev);

    if ( gd->part[minor].nr_sects == 0 )
    {
        /*
         * Device either doesn't exist, or has zero capacity; we use a few
         * cheesy heuristics to return the relevant error code.
         */
        if ( (gd->sizes[minor >> gd->minor_shift] != 0) ||
             ((minor & (gd->max_p - 1)) != 0) )
        {
            /*
             * We have a real device, but no such partition, or we just have a
             * partition number so guess this is the problem.
             */
            return -ENXIO;     /* no such device or address */
        }
        else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE )
        {
            /* This is a removable device => assume that media is missing. */
            return -ENOMEDIUM; /* media not present (this is a guess) */
        }
        else
        {
            /* Just go for the general 'no such device' error. */
            return -ENODEV;    /* no such device */
        }
    }

    /* Update of usage count is protected by per-device semaphore. */
    disk->usage++;

    return 0;
}

int blkif_release(struct inode *inode, struct file *filep)
{
    xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);

    /*
     * When usage drops to zero it may allow more VBD updates to occur.
     * Update of usage count is protected by a per-device semaphore.
     */
    if ( --disk->usage == 0 )
    {
        vbd_update();
    }

    return 0;
}

int blkif_ioctl(struct inode *inode, struct file *filep,
                unsigned command, unsigned long argument)
{
    kdev_t dev = inode->i_rdev;
    struct hd_geometry *geo = (struct hd_geometry *)argument;
    struct gendisk *gd;
    struct hd_struct *part;
    int i;
    unsigned short cylinders;
    byte heads, sectors;

    /* NB. No need to check permissions. That is done for us. */

    DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
                  command, (long) argument, dev);

    gd = get_gendisk(dev);
    part = &gd->part[MINOR(dev)];

    switch ( command )
    {
    case BLKGETSIZE:
        DPRINTK_IOCTL("   BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects);
        return put_user(part->nr_sects, (unsigned long *) argument);

    case BLKGETSIZE64:
        DPRINTK_IOCTL("   BLKGETSIZE64: %x %llx\n", BLKGETSIZE64,
                      (u64)part->nr_sects * 512);
        return put_user((u64)part->nr_sects * 512, (u64 *) argument);

    case BLKRRPART:                               /* re-read partition table */
        DPRINTK_IOCTL("   BLKRRPART: %x\n", BLKRRPART);
        return blkif_revalidate(dev);

    case BLKSSZGET:
        return hardsect_size[MAJOR(dev)][MINOR(dev)];

    case BLKBSZGET:                                        /* get block size */
        DPRINTK_IOCTL("   BLKBSZGET: %x\n", BLKBSZGET);
        break;

    case BLKBSZSET:                                        /* set block size */
        DPRINTK_IOCTL("   BLKBSZSET: %x\n", BLKBSZSET);
        break;

    case BLKRASET:                                         /* set read-ahead */
        DPRINTK_IOCTL("   BLKRASET: %x\n", BLKRASET);
        break;

    case BLKRAGET:                                         /* get read-ahead */
        DPRINTK_IOCTL("   BLKRAGET: %x\n", BLKRAGET);
        break;

    case HDIO_GETGEO:
        DPRINTK_IOCTL("   HDIO_GETGEO: %x\n", HDIO_GETGEO);
        if (!argument) return -EINVAL;

        /* We don't have real geometry info, but let's at least return
           values consistent with the size of the device. */
        heads = 0xff;
        sectors = 0x3f;
        cylinders = part->nr_sects / (heads * sectors);

        if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT;
        if (put_user(heads, (byte *)&geo->heads)) return -EFAULT;
        if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT;
        if (put_user(cylinders, (unsigned short *)&geo->cylinders)) return -EFAULT;
        return 0;

    case HDIO_GETGEO_BIG:
        DPRINTK_IOCTL("   HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG);
        if (!argument) return -EINVAL;

        /* We don't have real geometry info, but let's at least return
           values consistent with the size of the device. */
        heads = 0xff;
        sectors = 0x3f;
        cylinders = part->nr_sects / (heads * sectors);

        if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT;
        if (put_user(heads, (byte *)&geo->heads)) return -EFAULT;
        if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT;
        if (put_user(cylinders, (unsigned int *) &geo->cylinders)) return -EFAULT;
        return 0;

    case CDROMMULTISESSION:
        DPRINTK("FIXME: support multisession CDs later\n");
        for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
            if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
        return 0;

    case SCSI_IOCTL_GET_BUS_NUMBER:
        DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in XL blkif");
        return -ENOSYS;

    default:
        printk(KERN_ALERT "ioctl %08x not supported by XL blkif\n", command);
        return -ENOSYS;
    }

    return 0;
}

/* check media change: should probably do something here in some cases :-) */
int blkif_check(kdev_t dev)
{
    DPRINTK("blkif_check\n");
    return 0;
}

int blkif_revalidate(kdev_t dev)
{
    struct block_device *bd;
    struct gendisk *gd;
    xl_disk_t *disk;
    unsigned long capacity;
    int i, rc = 0;

    if ( (bd = bdget(dev)) == NULL )
        return -EINVAL;

    /*
     * Update of partition info, and check of usage count, is protected
     * by the per-block-device semaphore.
     */
    down(&bd->bd_sem);

    if ( ((gd = get_gendisk(dev)) == NULL) ||
         ((disk = xldev_to_xldisk(dev)) == NULL) ||
         ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
    {
        rc = -EINVAL;
        goto out;
    }

    if ( disk->usage > 1 )
    {
        rc = -EBUSY;
        goto out;
    }

    /* Only reread partition table if VBDs aren't mapped to partitions. */
    if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
    {
        for ( i = gd->max_p - 1; i >= 0; i-- )
        {
            invalidate_device(dev+i, 1);
            gd->part[MINOR(dev+i)].start_sect = 0;
            gd->part[MINOR(dev+i)].nr_sects   = 0;
            gd->sizes[MINOR(dev+i)]           = 0;
        }

        grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
    }

 out:
    up(&bd->bd_sem);
    bdput(bd);
    return rc;
}

/*
 * blkif_queue_request
 *
 * id: for guest use only.
 * operation: BLKIF_OP_{READ,WRITE,PROBE}
 * buffer: buffer to read/write into. this should be a
 *   virtual address in the guest os.
 */
static int blkif_queue_request(unsigned long   id,
                               int             operation,
                               char *          buffer,
                               unsigned long   sector_number,
                               unsigned short  nr_sectors,
                               kdev_t          device)
{
    unsigned long       buffer_ma = virt_to_bus(buffer);
    unsigned long       xid;
    struct gendisk     *gd;
    blkif_request_t    *req;
    struct buffer_head *bh;
    unsigned int        fsect, lsect;
#ifdef CONFIG_XEN_BLKDEV_GRANT
    int ref;
#endif

    fsect = (buffer_ma & ~PAGE_MASK) >> 9;
    lsect = fsect + nr_sectors - 1;

    /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */
    if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
        BUG();
    if ( lsect > 7 )
        BUG();

    buffer_ma &= PAGE_MASK;

    if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
        return 1;

    switch ( operation )
    {
    case BLKIF_OP_READ:
    case BLKIF_OP_WRITE:
        gd = get_gendisk(device);

        /*
         * Update the sector_number we'll pass down as appropriate; note that
         * we could sanity check that resulting sector will be in this
         * partition, but this will happen in driver backend anyhow.
         */
        sector_number += gd->part[MINOR(device)].start_sect;

        /*
         * If this unit doesn't consist of virtual partitions then we clear
         * the partn bits from the device number.
         */
        if ( !(gd->flags[MINOR(device)>>gd->minor_shift] &
               GENHD_FL_VIRT_PARTNS) )
            device &= ~(gd->max_p - 1);

        if ( (sg_operation == operation) &&
             (sg_dev == device) &&
             (sg_next_sect == sector_number) )
        {
            req = RING_GET_REQUEST(&blk_ring,
                                   blk_ring.req_prod_pvt - 1);
            bh = (struct buffer_head *)id;

            bh->b_reqnext = (struct buffer_head *)blk_shadow[req->id].request;
            blk_shadow[req->id].request = (unsigned long)id;

#ifdef CONFIG_XEN_BLKDEV_GRANT
            /* install a grant reference. */
            ref = gnttab_claim_grant_reference(&gref_head, gref_terminal);
            ASSERT( ref != -ENOSPC );

            gnttab_grant_foreign_access_ref(
                ref,
                rdomid,
                buffer_ma >> PAGE_SHIFT,
                ( operation == BLKIF_OP_WRITE ? 1 : 0 ) );

            blk_shadow[id].frame[req->nr_segments] =
                buffer_ma >> PAGE_SHIFT;

            req->frame_and_sects[req->nr_segments] =
                (((u32) ref) << 16) | (fsect << 3) | lsect;
#else
            req->frame_and_sects[req->nr_segments] =
                buffer_ma | (fsect << 3) | lsect;
#endif
            if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST )
                sg_next_sect += nr_sectors;
            else
                DISABLE_SCATTERGATHER();

            /* Update the copy of the request in the recovery ring. */
            pickle_request(&blk_shadow[req->id], req);

            return 0;
        }
        else if ( RING_FULL(&blk_ring) )
        {
            return 1;
        }
        else
        {
            sg_operation = operation;
            sg_dev       = device;
            sg_next_sect = sector_number + nr_sectors;
        }
        break;

    default:
        panic("unknown op %d\n", operation);
    }

    /* Fill out a communications ring structure. */
    req = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt);

    xid = GET_ID_FROM_FREELIST();
    blk_shadow[xid].request = (unsigned long)id;

    req->id            = xid;
    req->operation     = operation;
    req->sector_number = (blkif_sector_t)sector_number;
    req->device        = device;
    req->nr_segments   = 1;
#ifdef CONFIG_XEN_BLKDEV_GRANT
    /* install a grant reference. */
    ref = gnttab_claim_grant_reference(&gref_head, gref_terminal);
    ASSERT( ref != -ENOSPC );

    gnttab_grant_foreign_access_ref(
        ref,
        rdomid,
        buffer_ma >> PAGE_SHIFT,
        ( operation == BLKIF_OP_WRITE ? 1 : 0 ) );

    blk_shadow[xid].frame[0] = buffer_ma >> PAGE_SHIFT;

    req->frame_and_sects[0] = (((u32) ref)<<16) | (fsect<<3) | lsect;
#else
    req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect;
#endif

    /* Keep a private copy so we can reissue requests when recovering. */
    pickle_request(&blk_shadow[xid], req);

    blk_ring.req_prod_pvt++;

    return 0;
}
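
/*
 * The sg_{operation,dev,next_sect} trio above implements simple run-length
 * coalescing: a buffer head that continues the previous transfer on the
 * same device is folded into the most recent ring request as an extra
 * segment, with the merged buffer heads chained through b_reqnext on the
 * shadow entry.  One ring slot can thus carry up to
 * BLKIF_MAX_SEGMENTS_PER_REQUEST page-sized segments.
 */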

/*
 * do_blkif_request
 *  read a block; request is in a request queue
 */
void do_blkif_request(request_queue_t *rq)
{
    struct request *req;
    struct buffer_head *bh, *next_bh;
    int rw, nsect, full, queued = 0;

    DPRINTK("Entered do_blkif_request\n");

    while ( !rq->plugged && !list_empty(&rq->queue_head))
    {
        if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL )
            goto out;

        DPRINTK("do_blkif_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n",
                req, req->cmd, req->sector,
                req->current_nr_sectors, req->nr_sectors, req->bh);

        rw = req->cmd;
        if ( rw == READA )
            rw = READ;
        if ( unlikely((rw != READ) && (rw != WRITE)) )
            panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw);

        req->errors = 0;

        bh = req->bh;
        while ( bh != NULL )
        {
            next_bh = bh->b_reqnext;
            bh->b_reqnext = NULL;

            full = blkif_queue_request(
                (unsigned long)bh,
                (rw == READ) ? BLKIF_OP_READ : BLKIF_OP_WRITE,
                bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev);

            if ( full )
            {
                bh->b_reqnext = next_bh;
                pending_queues[nr_pending++] = rq;
                if ( unlikely(nr_pending >= MAX_PENDING) )
                    BUG();
                goto out;
            }

            queued++;

            /* Dequeue the buffer head from the request. */
            nsect = bh->b_size >> 9;
            bh = req->bh = next_bh;

            if ( bh != NULL )
            {
                /* There's another buffer head to do. Update the request. */
                req->hard_sector += nsect;
                req->hard_nr_sectors -= nsect;
                req->sector = req->hard_sector;
                req->nr_sectors = req->hard_nr_sectors;
                req->current_nr_sectors = bh->b_size >> 9;
                req->buffer = bh->b_data;
            }
            else
            {
                /* That was the last buffer head. Finalise the request. */
                if ( unlikely(end_that_request_first(req, 1, "XenBlk")) )
                    BUG();
                blkdev_dequeue_request(req);
                end_that_request_last(req);
            }
        }
    }

 out:
    if ( queued != 0 )
        flush_requests();
}

static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
{
    RING_IDX i, rp;
    unsigned long flags;
    struct buffer_head *bh, *next_bh;

    spin_lock_irqsave(&io_request_lock, flags);

    if ( unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery) )
    {
        spin_unlock_irqrestore(&io_request_lock, flags);
        return;
    }

    rp = blk_ring.sring->rsp_prod;
    rmb(); /* Ensure we see queued responses up to 'rp'. */

    for ( i = blk_ring.rsp_cons; i != rp; i++ )
    {
        unsigned long id;
        blkif_response_t *bret;

        bret = RING_GET_RESPONSE(&blk_ring, i);
        id = bret->id;
        bh = (struct buffer_head *)blk_shadow[id].request;

        blkif_completion(&blk_shadow[id]);

        ADD_ID_TO_FREELIST(id);

        switch ( bret->operation )
        {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
            if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
                DPRINTK("Bad return from blkdev data request: %lx\n",
                        bret->status);
            for ( ; bh != NULL; bh = next_bh )
            {
                next_bh = bh->b_reqnext;
                bh->b_reqnext = NULL;
                bh->b_end_io(bh, bret->status == BLKIF_RSP_OKAY);
            }
            break;
        case BLKIF_OP_PROBE:
            memcpy(&blkif_control_rsp, bret, sizeof(*bret));
            blkif_control_rsp_valid = 1;
            break;
        default:
            BUG();
        }
    }

    blk_ring.rsp_cons = i;

    kick_pending_request_queues();

    spin_unlock_irqrestore(&io_request_lock, flags);
}

#endif  /* LINUX_VERSION_CODE */

/*****************************  COMMON CODE  *******************************/

#ifdef CONFIG_XEN_BLKDEV_GRANT
void blkif_control_probe_send(blkif_request_t *req, blkif_response_t *rsp,
                              unsigned long address)
{
    int ref = gnttab_claim_grant_reference(&gref_head, gref_terminal);
    ASSERT( ref != -ENOSPC );

    gnttab_grant_foreign_access_ref( ref, rdomid, address >> PAGE_SHIFT, 0 );

    req->frame_and_sects[0] = (((u32) ref) << 16) | 7;

    blkif_control_send(req, rsp);
}
#endif
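
/*
 * (((u32) ref) << 16) | 7 is the usual frame_and_sects packing with
 * fsect == 0 and lsect == 7: the probe buffer spans the full page
 * (8 x 512-byte sectors).
 */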

void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp)
{
    unsigned long flags, id;
    blkif_request_t *req_d;

 retry:
    while ( RING_FULL(&blk_ring) )
    {
        set_current_state(TASK_INTERRUPTIBLE);
        schedule_timeout(1);
    }

    spin_lock_irqsave(&blkif_io_lock, flags);
    if ( RING_FULL(&blk_ring) )
    {
        spin_unlock_irqrestore(&blkif_io_lock, flags);
        goto retry;
    }

    DISABLE_SCATTERGATHER();
    req_d = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt);
    *req_d = *req;

    id = GET_ID_FROM_FREELIST();
    req_d->id = id;
    blk_shadow[id].request = (unsigned long)req;

    pickle_request(&blk_shadow[id], req);

    blk_ring.req_prod_pvt++;
    flush_requests();

    spin_unlock_irqrestore(&blkif_io_lock, flags);

    while ( !blkif_control_rsp_valid )
    {
        set_current_state(TASK_INTERRUPTIBLE);
        schedule_timeout(1);
    }

    memcpy(rsp, &blkif_control_rsp, sizeof(*rsp));
    blkif_control_rsp_valid = 0;
}
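
/*
 * Control requests (e.g. probes) are multiplexed over the data ring: the
 * sender politely spins, one tick at a time, first for ring space and
 * then on blkif_control_rsp_valid, which blkif_int() sets when the
 * BLKIF_OP_PROBE response arrives.  The single response buffer means
 * only one control transaction can usefully be outstanding at a time.
 */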

/* Send a driver status notification to the domain controller. */
static void send_driver_status(int ok)
{
    ctrl_msg_t cmsg = {
        .type    = CMSG_BLKIF_FE,
        .subtype = CMSG_BLKIF_FE_DRIVER_STATUS,
        .length  = sizeof(blkif_fe_driver_status_t),
    };
    blkif_fe_driver_status_t *msg = (void*)cmsg.msg;

    msg->status = (ok ? BLKIF_DRIVER_STATUS_UP : BLKIF_DRIVER_STATUS_DOWN);

    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
}

/* Tell the controller to bring up the interface. */
static void blkif_send_interface_connect(void)
{
    ctrl_msg_t cmsg = {
        .type    = CMSG_BLKIF_FE,
        .subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT,
        .length  = sizeof(blkif_fe_interface_connect_t),
    };
    blkif_fe_interface_connect_t *msg = (void*)cmsg.msg;

    msg->handle      = blkif_handle;
    msg->shmem_frame = (virt_to_machine(blk_ring.sring) >> PAGE_SHIFT);

    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
}

static void blkif_free(void)
{
    /* Prevent new requests being issued until we fix things up. */
    spin_lock_irq(&blkif_io_lock);
    recovery = 1;
    blkif_state = BLKIF_STATE_DISCONNECTED;
    spin_unlock_irq(&blkif_io_lock);

    /* Free resources associated with old device channel. */
    if ( blk_ring.sring != NULL )
    {
        free_page((unsigned long)blk_ring.sring);
        blk_ring.sring = NULL;
    }
    free_irq(blkif_irq, NULL);
    blkif_irq = 0;

    unbind_evtchn_from_irq(blkif_evtchn);
    blkif_evtchn = 0;
}

static void blkif_close(void)
{
}

/* Move from CLOSED to DISCONNECTED state. */
static void blkif_disconnect(void)
{
    blkif_sring_t *sring;

    if ( blk_ring.sring != NULL )
        free_page((unsigned long)blk_ring.sring);

    sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
    SHARED_RING_INIT(sring);
    FRONT_RING_INIT(&blk_ring, sring, PAGE_SIZE);
    blkif_state = BLKIF_STATE_DISCONNECTED;
    blkif_send_interface_connect();
}

static void blkif_reset(void)
{
    blkif_free();
    blkif_disconnect();
}

static void blkif_recover(void)
{
    int i;
    blkif_request_t *req;
    struct blk_shadow *copy;
#ifdef CONFIG_XEN_BLKDEV_GRANT
    int j;
#endif

    /* Stage 1: Make a safe copy of the shadow state. */
    copy = (struct blk_shadow *)kmalloc(sizeof(blk_shadow), GFP_KERNEL);
    BUG_ON(copy == NULL);
    memcpy(copy, blk_shadow, sizeof(blk_shadow));

    /* Stage 2: Set up free list. */
    memset(&blk_shadow, 0, sizeof(blk_shadow));
    for ( i = 0; i < BLK_RING_SIZE; i++ )
        blk_shadow[i].req.id = i+1;
    blk_shadow_free = blk_ring.req_prod_pvt;
    blk_shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;

    /* Stage 3: Find pending requests and requeue them. */
    for ( i = 0; i < BLK_RING_SIZE; i++ )
    {
        /* Not in use? */
        if ( copy[i].request == 0 )
            continue;

        /* Grab a request slot and unpickle shadow state into it. */
        req = RING_GET_REQUEST(
            &blk_ring, blk_ring.req_prod_pvt);
        unpickle_request(req, &copy[i]);

        /* We get a new request id, and must reset the shadow state. */
        req->id = GET_ID_FROM_FREELIST();
        memcpy(&blk_shadow[req->id], &copy[i], sizeof(copy[i]));

#ifdef CONFIG_XEN_BLKDEV_GRANT
        /* Rewrite any grant references invalidated by suspend/resume. */
        for ( j = 0; j < req->nr_segments; j++ )
        {
            if ( req->frame_and_sects[j] & GRANTREF_INVALID )
                gnttab_grant_foreign_access_ref(
                    blkif_gref_from_fas(req->frame_and_sects[j]),
                    rdomid,
                    blk_shadow[req->id].frame[j],
                    rq_data_dir((struct request *)
                                blk_shadow[req->id].request));
            req->frame_and_sects[j] &= ~GRANTREF_INVALID;
        }
        blk_shadow[req->id].req = *req;
#endif

        blk_ring.req_prod_pvt++;
    }

    kfree(copy);

    recovery = 0;

    /* blk_ring->req_prod will be set when we flush_requests(). */
    wmb();

    /* Kicks things back into life. */
    flush_requests();

    /* Now it's safe to let other people use the interface. */
    blkif_state = BLKIF_STATE_CONNECTED;
}
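
/*
 * Recovery in summary: snapshot the shadow state, rebuild the free list,
 * then requeue every shadow entry that still had I/O in flight onto the
 * fresh ring, re-granting any references the suspend invalidated.  New
 * I/O is held off by the 'recovery' flag until the state change above.
 */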

static void blkif_connect(blkif_fe_interface_status_t *status)
{
    int err = 0;

    blkif_evtchn = status->evtchn;
    blkif_irq    = bind_evtchn_to_irq(blkif_evtchn);
#ifdef CONFIG_XEN_BLKDEV_GRANT
    rdomid       = status->domid;
#endif

    err = request_irq(blkif_irq, blkif_int, SA_SAMPLE_RANDOM, "blkif", NULL);
    if ( err )
    {
        printk(KERN_ALERT "xen_blk: request_irq failed (err=%d)\n", err);
        return;
    }

    if ( recovery )
    {
        blkif_recover();
    }
    else
    {
        /* Transition to connected in case we need to do
         * a partition probe on a whole disk. */
        blkif_state = BLKIF_STATE_CONNECTED;

        /* Probe for discs attached to the interface. */
        xlvbd_init();
    }

    /* Kick pending requests. */
    spin_lock_irq(&blkif_io_lock);
    kick_pending_request_queues();
    spin_unlock_irq(&blkif_io_lock);
}

static void unexpected(blkif_fe_interface_status_t *status)
{
    DPRINTK(" Unexpected blkif status %u in state %u\n",
            status->status, blkif_state);
}

static void blkif_status(blkif_fe_interface_status_t *status)
{
    if ( status->handle != blkif_handle )
    {
        WPRINTK(" Invalid blkif: handle=%u\n", status->handle);
        return;
    }

    switch ( status->status )
    {
    case BLKIF_INTERFACE_STATUS_CLOSED:
        switch ( blkif_state )
        {
        case BLKIF_STATE_CLOSED:
            unexpected(status);
            break;
        case BLKIF_STATE_DISCONNECTED:
        case BLKIF_STATE_CONNECTED:
            unexpected(status);
            blkif_close();
            break;
        }
        break;

    case BLKIF_INTERFACE_STATUS_DISCONNECTED:
        switch ( blkif_state )
        {
        case BLKIF_STATE_CLOSED:
            blkif_disconnect();
            break;
        case BLKIF_STATE_DISCONNECTED:
        case BLKIF_STATE_CONNECTED:
            /* unexpected(status); */ /* occurs during suspend/resume */
            blkif_reset();
            break;
        }
        break;

    case BLKIF_INTERFACE_STATUS_CONNECTED:
        switch ( blkif_state )
        {
        case BLKIF_STATE_CLOSED:
            unexpected(status);
            blkif_disconnect();
            blkif_connect(status);
            break;
        case BLKIF_STATE_DISCONNECTED:
            blkif_connect(status);
            break;
        case BLKIF_STATE_CONNECTED:
            unexpected(status);
            blkif_connect(status);
            break;
        }
        break;

    case BLKIF_INTERFACE_STATUS_CHANGED:
        switch ( blkif_state )
        {
        case BLKIF_STATE_CLOSED:
        case BLKIF_STATE_DISCONNECTED:
            unexpected(status);
            break;
        case BLKIF_STATE_CONNECTED:
            vbd_update();
            break;
        }
        break;

    default:
        WPRINTK(" Invalid blkif status: %d\n", status->status);
        break;
    }
}

static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
{
    switch ( msg->subtype )
    {
    case CMSG_BLKIF_FE_INTERFACE_STATUS:
        blkif_status((blkif_fe_interface_status_t *)
                     &msg->msg[0]);
        break;
    default:
        msg->length = 0;
        break;
    }

    ctrl_if_send_response(msg);
}

int wait_for_blkif(void)
{
    int err = 0;
    int i;

    send_driver_status(1);

    /*
     * We should read 'nr_interfaces' from response message and wait
     * for notifications before proceeding. For now we assume that we
     * will be notified of exactly one interface.
     */
    for ( i = 0; (blkif_state != BLKIF_STATE_CONNECTED) && (i < 10*HZ); i++ )
    {
        set_current_state(TASK_INTERRUPTIBLE);
        schedule_timeout(1);
    }

    if ( blkif_state != BLKIF_STATE_CONNECTED )
    {
        printk(KERN_INFO "xen_blk: Timeout connecting to device!\n");
        err = -ENOSYS;
    }
    return err;
}

int __init xlblk_init(void)
{
    int i;

#ifdef CONFIG_XEN_BLKDEV_GRANT
    if ( 0 > gnttab_alloc_grant_references( MAXIMUM_OUTSTANDING_BLOCK_REQS,
                                            &gref_head, &gref_terminal ))
        return 1;
    printk(KERN_ALERT "Blkif frontend is using grant tables.\n");
#endif

    if ( (xen_start_info.flags & SIF_INITDOMAIN) ||
         (xen_start_info.flags & SIF_BLK_BE_DOMAIN) )
        return 0;

    printk(KERN_INFO "xen_blk: Initialising virtual block device driver\n");

    blk_shadow_free = 0;
    memset(blk_shadow, 0, sizeof(blk_shadow));
    for ( i = 0; i < BLK_RING_SIZE; i++ )
        blk_shadow[i].req.id = i+1;
    blk_shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;

    (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
                                    CALLBACK_IN_BLOCKING_CONTEXT);

    wait_for_blkif();

    return 0;
}

void blkdev_suspend(void)
{
}

void blkdev_resume(void)
{
#ifdef CONFIG_XEN_BLKDEV_GRANT
    int i, j;
    for ( i = 0; i < BLK_RING_SIZE; i++ )
        for ( j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++ )
            blk_shadow[i].req.frame_and_sects[j] |= GRANTREF_INVALID;
#endif
    send_driver_status(1);
}
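
/*
 * Tagging every shadowed segment with GRANTREF_INVALID on resume is what
 * triggers the re-granting pass in blkif_recover() once the backend
 * reconnects.
 */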

static void blkif_completion(struct blk_shadow *s)
{
    int i;

#ifdef CONFIG_XEN_BLKDEV_GRANT
    for ( i = 0; i < s->req.nr_segments; i++ )
        gnttab_release_grant_reference(
            &gref_head, blkif_gref_from_fas(s->req.frame_and_sects[i]));
#else
    /* This is a hack to get the dirty logging bits set */
    if ( s->req.operation == BLKIF_OP_READ )
    {
        for ( i = 0; i < s->req.nr_segments; i++ )
        {
            unsigned long pfn = s->req.frame_and_sects[i] >> PAGE_SHIFT;
            unsigned long mfn = phys_to_machine_mapping[pfn];
            xen_machphys_update(mfn, pfn);
        }
    }
#endif
}
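
/*
 * Presumably the machphys update above exists to nudge Xen's dirty
 * logging: rewriting the M2P entry for a page that a READ landed in marks
 * it dirty, so pages filled by block I/O are not missed during live
 * migration ("a hack", as the comment admits).
 */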