/******************************************************************************
 * drivers/xen/blktap/blktap.c
 *
 * Back-end driver for user level virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. Requests
 * are remapped to a user-space memory region.
 *
 * Based on the blkback driver code.
 *
 * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <asm/hypervisor.h>
#include "common.h"
#include <xen/balloon.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/gfp.h>
#include <linux/poll.h>
#include <asm/tlbflush.h>
#include <linux/devfs_fs_kernel.h>
#define MAX_TAP_DEV 100     /* the maximum number of tapdisk ring devices    */
#define MAX_DEV_NAME 100    /* the max tapdisk ring device name e.g. blktap0 */
/*
 * The maximum number of requests that can be outstanding at any time
 * is determined by
 *
 *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
 *
 * where mmap_alloc < MAX_DYNAMIC_MEM.
 *
 * TODO:
 * mmap_alloc is initialised to 2 and should be adjustable on the fly via
 * sysfs.
 */
#define MAX_DYNAMIC_MEM 64
#define MAX_PENDING_REQS 64
#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
#define MMAP_VADDR(_start, _req, _seg)                                  \
	((_start) +                                                     \
	 ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
	 ((_seg) * PAGE_SIZE))
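/*
 * Worked example of the mmap-area layout (illustrative only; assumes
 * BLKIF_MAX_SEGMENTS_PER_REQUEST is 11, as in the blkif interface headers
 * this driver is built against):
 *
 *	MMAP_PAGES = 64 * 11 = 704 pages per allocated mmap area, and
 *	MMAP_VADDR(start, 3, 2) = start + (3*11 + 2) * PAGE_SIZE
 *
 * i.e. each pending request owns a contiguous 11-page slot, and segment
 * _seg of request _req lives at a fixed page offset within that slot.
 */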
static int blkif_reqs = MAX_PENDING_REQS;
static int mmap_pages = MMAP_PAGES;

#define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
		      * have a bunch of pages reserved for shared
		      * memory rings. */
/*Data struct associated with each of the tapdisk devices*/
typedef struct tap_blkif {
	struct vm_area_struct *vma;   /*Shared memory area                  */
	unsigned long rings_vstart;   /*Kernel memory mapping               */
	unsigned long user_vstart;    /*User memory mapping                 */
	unsigned long dev_inuse;      /*One process opens device at a time. */
	unsigned long dev_pending;    /*In process of being opened          */
	unsigned long ring_ok;        /*make this ring->state               */
	blkif_front_ring_t ufe_ring;  /*Rings up to user space.             */
	wait_queue_head_t wait;       /*for poll                            */
	unsigned long mode;           /*current switching mode              */
	int minor;                    /*Minor number for tapdisk device     */
	pid_t pid;                    /*tapdisk process id                  */
	enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
						  shutdown                  */
	unsigned long *idx_map;       /*Record the user ring id to kern
					[req id, idx] tuple                 */
	blkif_t *blkif;               /*Associate blkif with tapdev         */
} tap_blkif_t;
/*Private data struct associated with the inode*/
typedef struct private_info {
	int idx;                      /*index into tapfds[]                 */
} private_info_t;
/*Data struct handed back to userspace for tapdisk device to VBD mapping*/
typedef struct domid_translate {
	unsigned short domid;
	unsigned short busid;
} domid_translate_t;
static domid_translate_t translate_domid[MAX_TAP_DEV];
static tap_blkif_t *tapfds[MAX_TAP_DEV];
static int __init set_blkif_reqs(char *str)
{
	get_option(&str, &blkif_reqs);
	return 1;
}
__setup("blkif_reqs=", set_blkif_reqs);
/* Run-time switchable: /sys/module/blktap/parameters/ */
static int log_stats = 0;
static int debug_lvl = 0;
module_param(log_stats, int, 0644);
module_param(debug_lvl, int, 0644);
/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
	blkif_t       *blkif;
	unsigned long  id;
	unsigned short mem_idx;
	int            nr_pages;
	atomic_t       pendcnt;
	unsigned short operation;
	int            status;
	int            inuse;
	struct list_head free_list;
} pending_req_t;
static pending_req_t *pending_reqs[MAX_PENDING_REQS];
static struct list_head pending_free;
static DEFINE_SPINLOCK(pending_free_lock);
static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
static int alloc_pending_reqs;
typedef unsigned int PEND_RING_IDX;

static inline int MASK_PEND_IDX(int i)
{
	return (i & (MAX_PENDING_REQS-1));
}

static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx)
{
	return (req - pending_reqs[idx]);
}

#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
#define BLKBACK_INVALID_HANDLE (~0)

typedef struct mmap_page {
	unsigned long start;
	struct page *mpage;
} mmap_page_t;

static mmap_page_t mmap_start[MAX_DYNAMIC_MEM];
static unsigned short mmap_alloc = 0;
static unsigned short mmap_lock = 0;
static unsigned short mmap_inuse = 0;
static unsigned long *pending_addrs[MAX_DYNAMIC_MEM];
/******************************************************************
 * GRANT HANDLES
 */

/* When using grant tables to map a frame for device access then the
 * handle returned must be used to unmap the frame. This is needed to
 * drop the ref count on the frame.
 */
struct grant_handle_pair
{
	grant_handle_t kernel;
	grant_handle_t user;
};

static struct grant_handle_pair
    pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
#define pending_handle(_id, _idx, _i) \
    (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
    + (_i)])
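/*
 * pending_handle() flattens the [request, segment] pair into a single
 * index into one MMAP_PAGES-sized row. An illustrative example (again
 * assuming 11 segments per request): pending_handle(id, 5, 3) selects
 * pending_grant_handles[id][5*11 + 3], mirroring the page layout
 * produced by MMAP_VADDR() above.
 */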
static int blktap_read_ufe_ring(int idx); /*local prototypes*/

#define BLKTAP_MINOR 0       /* /dev/xen/blktap resides at device number
				major=254, minor numbers begin at 0       */
#define BLKTAP_DEV_MAJOR 254 /* TODO: Make major number dynamic          *
			      * and create devices in the kernel         */
#define BLKTAP_DEV_DIR  "/dev/xen"

/* blktap IOCTLs: */
#define BLKTAP_IOCTL_KICK_FE         1
#define BLKTAP_IOCTL_KICK_BE         2 /* currently unused */
#define BLKTAP_IOCTL_SETMODE         3
#define BLKTAP_IOCTL_SENDPID         4
#define BLKTAP_IOCTL_NEWINTF         5
#define BLKTAP_IOCTL_MINOR           6
#define BLKTAP_IOCTL_MAJOR           7
#define BLKTAP_QUERY_ALLOC_REQS      8
#define BLKTAP_IOCTL_FREEINTF        9
#define BLKTAP_IOCTL_PRINT_IDXS      100
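/*
 * Sketch of how a userspace tapdisk process might drive these ioctls
 * (illustrative only -- error handling omitted; the real control flow
 * lives in the userspace tapdisk tools, not in this file):
 *
 *	int fd = open("/dev/xen/blktap0", O_RDWR);
 *	ioctl(fd, BLKTAP_IOCTL_SENDPID, getpid());
 *	ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE);
 *	... mmap() the ring + data region, then on each poll() wakeup
 *	    process requests and push responses, finally: ...
 *	ioctl(fd, BLKTAP_IOCTL_KICK_FE, 0);
 */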
/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)          */
#define BLKTAP_MODE_PASSTHROUGH  0x00000000  /* default             */
#define BLKTAP_MODE_INTERCEPT_FE 0x00000001
#define BLKTAP_MODE_INTERCEPT_BE 0x00000002  /* unimplemented       */

#define BLKTAP_MODE_INTERPOSE \
	(BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)


static inline int BLKTAP_MODE_VALID(unsigned long arg)
{
	return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
		(arg == BLKTAP_MODE_INTERCEPT_FE) ||
		(arg == BLKTAP_MODE_INTERPOSE   ));
}
/* Requests passing through the tap to userspace are re-assigned an ID.
 * We must record a mapping between the BE [IDX,ID] tuple and the userspace
 * ring [ID].
 */
static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
{
	return ((fe_dom << 16) | MASK_PEND_IDX(idx));
}

static inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
{
	return (PEND_RING_IDX)(id & 0x0000ffff);
}

static inline int ID_TO_MIDX(unsigned long id)
{
	return (int)(id >> 16);
}
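/*
 * Worked example of the ID packing (illustrative): for mmap area 2 and
 * pending ring slot 70, MAKE_ID(2, 70) == (2 << 16) | (70 & 63) == 0x20006,
 * and ID_TO_MIDX()/ID_TO_IDX() recover 2 and 6 respectively. Note that the
 * slot index is masked into [0, MAX_PENDING_REQS) on the way in.
 */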
#define INVALID_REQ 0xdead0000

/*TODO: Convert to a free list*/
static inline int GET_NEXT_REQ(unsigned long *idx_map)
{
	int i;
	for (i = 0; i < MAX_PENDING_REQS; i++)
		if (idx_map[i] == INVALID_REQ)
			return i;
	return INVALID_REQ;
}


#define BLKTAP_INVALID_HANDLE(_g) \
    (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF))

#define BLKTAP_INVALIDATE_HANDLE(_g) do {       \
    (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \
    } while(0)
/******************************************************************
 * BLKTAP VM OPS
 */

static struct page *blktap_nopage(struct vm_area_struct *vma,
				  unsigned long address,
				  int *type)
{
	/*
	 * if the page has not been mapped in by the driver then return
	 * NOPAGE_SIGBUS to the domain.
	 */
	return NOPAGE_SIGBUS;
}

struct vm_operations_struct blktap_vm_ops = {
	.nopage = blktap_nopage,
};
/******************************************************************
 * BLKTAP FILE OPS
 */

/*Function Declarations*/
static int get_next_free_dev(void);
static int blktap_open(struct inode *inode, struct file *filp);
static int blktap_release(struct inode *inode, struct file *filp);
static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
static int blktap_ioctl(struct inode *inode, struct file *filp,
			unsigned int cmd, unsigned long arg);
static unsigned int blktap_poll(struct file *file, poll_table *wait);

struct miscdevice *set_misc(int minor, char *name, int dev);

static struct file_operations blktap_fops = {
	.owner   = THIS_MODULE,
	.poll    = blktap_poll,
	.ioctl   = blktap_ioctl,
	.open    = blktap_open,
	.release = blktap_release,
	.mmap    = blktap_mmap,
};
static int get_next_free_dev(void)
{
	tap_blkif_t *info;
	int i = 0, ret = -1;
	unsigned long flags;

	spin_lock_irqsave(&pending_free_lock, flags);
	while (i < MAX_TAP_DEV) {
		info = tapfds[i];
		if ( (tapfds[i] != NULL) && (info->dev_inuse == 0)
		     && (info->dev_pending == 0) ) {
			info->dev_pending = 1;
			ret = i;
			break;
		}
		i++;
	}
	spin_unlock_irqrestore(&pending_free_lock, flags);
	return ret;
}
int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif)
{
	int i;

	for (i = 0; i < MAX_TAP_DEV; i++)
		if ( (translate_domid[i].domid == domid)
		     && (translate_domid[i].busid == xenbus_id) ) {
			tapfds[i]->blkif = blkif;
			tapfds[i]->status = RUNNING;
			return i;
		}
	return -1;
}
void signal_tapdisk(int idx)
{
	tap_blkif_t *info;
	struct task_struct *ptask;

	/* Bounds-check idx before indexing tapfds[]. */
	if ( (idx <= 0) || (idx >= MAX_TAP_DEV) )
		return;

	info = tapfds[idx];
	if (info->pid > 0) {
		ptask = find_task_by_pid(info->pid);
		if (ptask)
			info->status = CLEANSHUTDOWN;
	}
	info->blkif = NULL;
	return;
}
static int blktap_open(struct inode *inode, struct file *filp)
{
	blkif_sring_t *sring;
	int idx = iminor(inode) - BLKTAP_MINOR;
	tap_blkif_t *info;
	private_info_t *prv;
	int i;

	if (tapfds[idx] == NULL) {
		WPRINTK("Unable to open device /dev/xen/blktap%d\n",
			idx);
		return -ENOMEM;
	}
	DPRINTK("Opening device /dev/xen/blktap%d\n", idx);

	info = tapfds[idx];

	/*Only one process can access device at a time*/
	if (test_and_set_bit(0, &info->dev_inuse))
		return -EBUSY;

	info->dev_pending = 0;

	/* Allocate the fe ring. */
	sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
	if (sring == NULL)
		return -ENOMEM;
	SetPageReserved(virt_to_page(sring));

	SHARED_RING_INIT(sring);
	FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);

	prv = kzalloc(sizeof(private_info_t), GFP_KERNEL);
	prv->idx = idx;
	filp->private_data = prv;
	info->vma = NULL;

	info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS,
				GFP_KERNEL);

	init_waitqueue_head(&info->wait);
	for (i = 0; i < MAX_PENDING_REQS; i++)
		info->idx_map[i] = INVALID_REQ;

	DPRINTK("Tap open: device /dev/xen/blktap%d\n", idx);
	return 0;
}
static int blktap_release(struct inode *inode, struct file *filp)
{
	int idx = iminor(inode) - BLKTAP_MINOR;
	tap_blkif_t *info;

	if (tapfds[idx] == NULL) {
		WPRINTK("Trying to free device that doesn't exist "
			"[/dev/xen/blktap%d]\n", idx);
		return -1;
	}
	info = tapfds[idx];
	info->dev_inuse = 0;
	DPRINTK("Freeing device [/dev/xen/blktap%d]\n", idx);

	/* Free the ring page. */
	ClearPageReserved(virt_to_page(info->ufe_ring.sring));
	free_page((unsigned long) info->ufe_ring.sring);

	/* Clear any active mappings and free foreign map table */
	if (info->vma) {
		zap_page_range(
			info->vma, info->vma->vm_start,
			info->vma->vm_end - info->vma->vm_start, NULL);
		info->vma = NULL;
	}

	if (filp->private_data) kfree(filp->private_data);

	if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
		kthread_stop(info->blkif->xenblkd);
		info->blkif->xenblkd = NULL;
		info->status = CLEANSHUTDOWN;
	}
	return 0;
}
/* Note on mmap:
 * We need to map pages to user space in a way that will allow the block
 * subsystem set up direct IO to them. This couldn't be done before, because
 * there isn't really a sane way to translate a user virtual address down to a
 * physical address when the page belongs to another domain.
 *
 * My first approach was to map the page in to kernel memory, add an entry
 * for it in the physical frame list (using alloc_lomem_region as in blkback)
 * and then attempt to map that page up to user space. This is disallowed
 * by xen though, which realizes that we don't really own the machine frame
 * underlying the physical page.
 *
 * The new approach is to provide explicit support for this in xen linux.
 * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
 * mapped from other VMs. vma->vm_private_data is set up as a mapping
 * from pages to actual page structs. There is a new clause in get_user_pages
 * that does the right thing for this sort of mapping.
 */
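/*
 * Minimal sketch of the VM_FOREIGN clause described above (illustrative
 * only, not the actual mm/memory.c patch): a foreign-aware get_user_pages
 * can bypass the page-table walk and take the struct page straight from
 * the per-VMA map:
 *
 *	if (vma->vm_flags & VM_FOREIGN) {
 *		struct page **fmap = vma->vm_private_data;
 *		struct page *pg =
 *			fmap[(start - vma->vm_start) >> PAGE_SHIFT];
 *		if (pg)
 *			pages[i] = pg;   ... etc ...
 *	}
 */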
static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
{
	int size;
	struct page **map;
	int i;
	private_info_t *prv;
	tap_blkif_t *info;

	/*Retrieve the dev info*/
	prv = (private_info_t *)filp->private_data;
	if (prv == NULL) {
		WPRINTK("blktap: mmap, retrieving idx failed\n");
		return -ENOMEM;
	}
	info = tapfds[prv->idx];

	vma->vm_flags |= VM_RESERVED;
	vma->vm_ops = &blktap_vm_ops;

	size = vma->vm_end - vma->vm_start;
	if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
		WPRINTK("you _must_ map exactly %d pages!\n",
			mmap_pages + RING_PAGES);
		return -EAGAIN;
	}

	info->rings_vstart = vma->vm_start;
	info->user_vstart  = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);

	/* Map the ring pages to the start of the region and reserve it. */
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

	if (remap_pfn_range(vma, vma->vm_start,
			    __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
			    PAGE_SIZE, vma->vm_page_prot)) {
		WPRINTK("Mapping user ring failed!\n");
		goto fail;
	}

	/* Mark this VM as containing foreign pages, and set up mappings. */
	map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
		      * sizeof(struct page *),
		      GFP_KERNEL);
	if (map == NULL) {
		WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
		goto fail;
	}

	for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
		map[i] = NULL;

	vma->vm_private_data = map;
	vma->vm_flags |= VM_FOREIGN;

	info->vma = vma;
	info->ring_ok = 1;
	return 0;
 fail:
	/* Clear any active mappings. */
	zap_page_range(vma, vma->vm_start,
		       vma->vm_end - vma->vm_start, NULL);

	return -ENOMEM;
}
static int blktap_ioctl(struct inode *inode, struct file *filp,
			unsigned int cmd, unsigned long arg)
{
	int idx = iminor(inode) - BLKTAP_MINOR;

	switch(cmd) {
	case BLKTAP_IOCTL_KICK_FE:
	{
		/* There are fe messages to process. */
		return blktap_read_ufe_ring(idx);
	}
	case BLKTAP_IOCTL_SETMODE:
	{
		tap_blkif_t *info = tapfds[idx];

		if ( (idx > 0) && (idx < MAX_TAP_DEV)
		     && (tapfds[idx] != NULL) )
		{
			if (BLKTAP_MODE_VALID(arg)) {
				info->mode = arg;
				/* XXX: may need to flush rings here. */
				DPRINTK("blktap: set mode to %lx\n",
					arg);
				return 0;
			}
		}
		return 0;
	}
	case BLKTAP_IOCTL_PRINT_IDXS:
	{
		tap_blkif_t *info = tapfds[idx];

		if ( (idx > 0) && (idx < MAX_TAP_DEV)
		     && (tapfds[idx] != NULL) )
		{
			printk(KERN_INFO "User Rings: \n-----------\n");
			printk(KERN_INFO "UF: rsp_cons: %2d, req_prod_pvt: %2d "
				"| req_prod: %2d, rsp_prod: %2d\n",
				info->ufe_ring.rsp_cons,
				info->ufe_ring.req_prod_pvt,
				info->ufe_ring.sring->req_prod,
				info->ufe_ring.sring->rsp_prod);
		}
		return 0;
	}
	case BLKTAP_IOCTL_SENDPID:
	{
		tap_blkif_t *info = tapfds[idx];

		if ( (idx > 0) && (idx < MAX_TAP_DEV)
		     && (tapfds[idx] != NULL) )
		{
			info->pid = (pid_t)arg;
			DPRINTK("blktap: pid received %d\n",
				info->pid);
		}
		return 0;
	}
	case BLKTAP_IOCTL_NEWINTF:
	{
		uint64_t val = (uint64_t)arg;
		domid_translate_t *tr = (domid_translate_t *)&val;
		int newdev;

		DPRINTK("NEWINTF Req for domid %d and bus id %d\n",
			tr->domid, tr->busid);
		newdev = get_next_free_dev();
		if (newdev < 1) {
			WPRINTK("Error initialising /dev/xen/blktap - "
				"No more devices\n");
			return -1;
		}
		translate_domid[newdev].domid = tr->domid;
		translate_domid[newdev].busid = tr->busid;
		return newdev;
	}
	case BLKTAP_IOCTL_FREEINTF:
	{
		unsigned long dev = arg;
		tap_blkif_t *info = NULL;

		if ( (dev > 0) && (dev < MAX_TAP_DEV) ) info = tapfds[dev];

		if ( (info != NULL) && (info->dev_pending) )
			info->dev_pending = 0;
		return 0;
	}
	case BLKTAP_IOCTL_MINOR:
	{
		unsigned long dev = arg;
		tap_blkif_t *info = NULL;

		if ( (dev > 0) && (dev < MAX_TAP_DEV) ) info = tapfds[dev];

		if (info != NULL) return info->minor;
		else return -1;
	}
	case BLKTAP_IOCTL_MAJOR:
		return BLKTAP_DEV_MAJOR;

	case BLKTAP_QUERY_ALLOC_REQS:
	{
		WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n",
			alloc_pending_reqs, blkif_reqs);
		/* Multiply before dividing so integer division
		 * doesn't truncate the percentage to zero. */
		return (alloc_pending_reqs * 100) / blkif_reqs;
	}
	}
	return -ENOIOCTLCMD;
}
static unsigned int blktap_poll(struct file *file, poll_table *wait)
{
	private_info_t *prv;
	tap_blkif_t *info;

	/*Retrieve the dev info*/
	prv = (private_info_t *)file->private_data;
	if (prv == NULL) {
		WPRINTK(" poll, retrieving idx failed\n");
		return 0;
	}

	if (prv->idx == 0) return 0;

	info = tapfds[prv->idx];

	poll_wait(file, &info->wait, wait);
	if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
		flush_tlb_all();
		RING_PUSH_REQUESTS(&info->ufe_ring);
		return POLLIN | POLLRDNORM;
	}

	return 0;
}
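/*
 * The kernel drives the *front* of the ufe ring (see FRONT_RING_INIT in
 * blktap_open), so userspace acts as the back end. A hedged sketch of the
 * tapdisk consumption loop (illustrative only; assumes userspace has
 * mmap()ed the ring page, run BACK_RING_INIT() on it, and uses the
 * standard xen/interface/io/ring.h macros):
 *
 *	poll(&pfd, 1, -1);                       ... wait for POLLIN
 *	rp = ring.sring->req_prod; rmb();
 *	for (i = ring.req_cons; i != rp; i++) {
 *		req = RING_GET_REQUEST(&ring, i);
 *		... issue the I/O, later queue a blkif_response_t ...
 *	}
 *	ring.req_cons = i;
 *	ioctl(fd, BLKTAP_IOCTL_KICK_FE, 0);      ... hand responses back
 */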
void blktap_kick_user(int idx)
{
	tap_blkif_t *info;

	if (idx == 0) return;

	info = tapfds[idx];

	if (info != NULL) wake_up_interruptible(&info->wait);
	return;
}

static int do_block_io_op(blkif_t *blkif);
static void dispatch_rw_block_io(blkif_t *blkif,
				 blkif_request_t *req,
				 pending_req_t *pending_req);
static void make_response(blkif_t *blkif, unsigned long id,
			  unsigned short op, int st);
/******************************************************************
 * REQUEST POOL MANAGEMENT
 */

static int req_increase(void)
{
	int i, j;
	struct page *page;
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&pending_free_lock, flags);

	ret = -EINVAL;
	if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock)
		goto done;

#ifdef __ia64__
	extern unsigned long alloc_empty_foreign_map_page_range(
		unsigned long pages);
	mmap_start[mmap_alloc].start = (unsigned long)
		alloc_empty_foreign_map_page_range(mmap_pages);
#else /* ! ia64 */
	page = balloon_alloc_empty_page_range(mmap_pages);
	ret = -ENOMEM;
	if (page == NULL) {
		printk("%s balloon_alloc_empty_page_range gave NULL\n",
		       __FUNCTION__);
		goto done;
	}

	/* Pin all of the pages. */
	for (i=0; i<mmap_pages; i++)
		get_page(&page[i]);

	mmap_start[mmap_alloc].start =
		(unsigned long)pfn_to_kaddr(page_to_pfn(page));
	mmap_start[mmap_alloc].mpage = page;
#endif

	pending_reqs[mmap_alloc]  = kzalloc(sizeof(pending_req_t) *
					    blkif_reqs, GFP_KERNEL);
	pending_addrs[mmap_alloc] = kzalloc(sizeof(unsigned long) *
					    mmap_pages, GFP_KERNEL);

	ret = -ENOMEM;
	if (!pending_reqs[mmap_alloc] || !pending_addrs[mmap_alloc]) {
		kfree(pending_reqs[mmap_alloc]);
		kfree(pending_addrs[mmap_alloc]);
		WPRINTK("%s: out of memory\n", __FUNCTION__);
		goto done;
	}

	ret = 0;

	DPRINTK("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n",
		__FUNCTION__, blkif_reqs, mmap_pages,
		mmap_start[mmap_alloc].start);

	BUG_ON(mmap_start[mmap_alloc].start == 0);

	for (i = 0; i < mmap_pages; i++)
		pending_addrs[mmap_alloc][i] =
			mmap_start[mmap_alloc].start + (i << PAGE_SHIFT);

	for (i = 0; i < MAX_PENDING_REQS; i++) {
		list_add_tail(&pending_reqs[mmap_alloc][i].free_list,
			      &pending_free);
		pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
		for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
			BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc,
								 i, j));
	}

	mmap_alloc++;
	DPRINTK("# MMAPs increased to %d\n", mmap_alloc);
 done:
	spin_unlock_irqrestore(&pending_free_lock, flags);
	return ret;
}
static void mmap_req_del(int mmap)
{
	int i;
	struct page *page;

	/*Spinlock already acquired*/
	kfree(pending_reqs[mmap]);
	kfree(pending_addrs[mmap]);

#ifdef __ia64__
	/*Not sure what goes here yet!*/
#else
	/* Unpin all of the pages. */
	page = mmap_start[mmap].mpage;
	for (i=0; i<mmap_pages; i++)
		put_page(&page[i]);

	balloon_dealloc_empty_page_range(mmap_start[mmap].mpage, mmap_pages);
#endif

	mmap_lock = 0;
	mmap_alloc--;
	DPRINTK("# MMAPs decreased to %d\n", mmap_alloc);
}
/*N.B. Currently unused - will be accessed via sysfs*/
static void req_decrease(void)
{
	pending_req_t *req;
	int i;
	unsigned long flags;

	spin_lock_irqsave(&pending_free_lock, flags);

	DPRINTK("Req decrease called.\n");
	if (mmap_lock || mmap_alloc == 1)
		goto done;

	mmap_lock = 1;
	mmap_inuse = MAX_PENDING_REQS;

	/*Go through reqs and remove any that aren't in use*/
	for (i = 0; i < MAX_PENDING_REQS; i++) {
		req = &pending_reqs[mmap_alloc-1][i];
		if (req->inuse == 0) {
			list_del(&req->free_list);
			mmap_inuse--;
		}
	}
	if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
 done:
	spin_unlock_irqrestore(&pending_free_lock, flags);
	return;
}
static pending_req_t* alloc_req(void)
{
	pending_req_t *req = NULL;
	unsigned long flags;

	spin_lock_irqsave(&pending_free_lock, flags);

	if (!list_empty(&pending_free)) {
		req = list_entry(pending_free.next, pending_req_t, free_list);
		list_del(&req->free_list);
	}

	if (req) {
		req->inuse = 1;
		alloc_pending_reqs++;
	}

	spin_unlock_irqrestore(&pending_free_lock, flags);

	return req;
}
static void free_req(pending_req_t *req)
{
	unsigned long flags;
	int was_empty;

	spin_lock_irqsave(&pending_free_lock, flags);

	alloc_pending_reqs--;
	req->inuse = 0;

	if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
		mmap_inuse--;
		if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
		spin_unlock_irqrestore(&pending_free_lock, flags);
		return;
	}
	was_empty = list_empty(&pending_free);
	list_add(&req->free_list, &pending_free);

	spin_unlock_irqrestore(&pending_free_lock, flags);

	if (was_empty)
		wake_up(&pending_free_wq);
}
static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx,
			    int tapidx)
{
	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
	unsigned int i, invcount = 0;
	struct grant_handle_pair *khandle;
	uint64_t ptep;
	int ret, mmap_idx;
	unsigned long kvaddr, uvaddr;

	tap_blkif_t *info = tapfds[tapidx];

	if (info == NULL) {
		WPRINTK("fast_flush: Couldn't get info!\n");
		return;
	}
	mmap_idx = req->mem_idx;

	for (i = 0; i < req->nr_pages; i++) {
		kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, k_idx, i);
		uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);

		khandle = &pending_handle(mmap_idx, k_idx, i);
		if (BLKTAP_INVALID_HANDLE(khandle)) {
			WPRINTK("BLKTAP_INVALID_HANDLE\n");
			continue;
		}
		/* Unmap the kernel-side mapping of this frame... */
		gnttab_set_unmap_op(&unmap[invcount],
			MMAP_VADDR(mmap_start[mmap_idx].start, k_idx, i),
			GNTMAP_host_map, khandle->kernel);
		invcount++;

		if (create_lookup_pte_addr(
			info->vma->vm_mm,
			MMAP_VADDR(info->user_vstart, u_idx, i),
			&ptep) != 0) {
			WPRINTK("Couldn't get a pte addr!\n");
			return;
		}

		/* ...and the user-side mapping, via its pte. */
		gnttab_set_unmap_op(&unmap[invcount],
			ptep, GNTMAP_host_map,
			khandle->user);
		invcount++;

		BLKTAP_INVALIDATE_HANDLE(khandle);
	}
	ret = HYPERVISOR_grant_table_op(
		GNTTABOP_unmap_grant_ref, unmap, invcount);
	BUG_ON(ret);

	if (info->vma != NULL)
		zap_page_range(info->vma,
			       MMAP_VADDR(info->user_vstart, u_idx, 0),
			       req->nr_pages << PAGE_SHIFT, NULL);
}
/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static void print_stats(blkif_t *blkif)
{
	printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
	       current->comm, blkif->st_oo_req,
	       blkif->st_rd_req, blkif->st_wr_req);
	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
	blkif->st_rd_req = 0;
	blkif->st_wr_req = 0;
	blkif->st_oo_req = 0;
}
int tap_blkif_schedule(void *arg)
{
	blkif_t *blkif = arg;

	blkif_get(blkif);

	if (debug_lvl)
		printk(KERN_DEBUG "%s: started\n", current->comm);

	while (!kthread_should_stop()) {
		wait_event_interruptible(
			blkif->wq,
			blkif->waiting_reqs || kthread_should_stop());
		wait_event_interruptible(
			pending_free_wq,
			!list_empty(&pending_free) || kthread_should_stop());

		blkif->waiting_reqs = 0;
		smp_mb(); /* clear flag *before* checking for work */

		if (do_block_io_op(blkif))
			blkif->waiting_reqs = 1;

		if (log_stats && time_after(jiffies, blkif->st_print))
			print_stats(blkif);
	}

	if (log_stats)
		print_stats(blkif);
	if (debug_lvl)
		printk(KERN_DEBUG "%s: exiting\n", current->comm);

	blkif->xenblkd = NULL;
	blkif_put(blkif);

	return 0;
}
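/*
 * The per-blkif kernel thread running tap_blkif_schedule() is created in
 * the xenbus/interface glue, not in this file. A hedged sketch of what
 * that creation looks like, assuming the usual kthread API (names here
 * are illustrative):
 *
 *	blkif->xenblkd = kthread_run(tap_blkif_schedule, blkif,
 *				     "xvd %d", blkif->domid);
 *	if (IS_ERR(blkif->xenblkd))
 *		... fail the connect ...
 */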
/******************************************************************
 * COMPLETION CALLBACK -- Called by user level ioctl()
 */

static int blktap_read_ufe_ring(int idx)
{
	/* This is called to read responses from the UFE ring. */
	RING_IDX i, j, rp;
	blkif_response_t *resp;
	blkif_t *blkif=NULL;
	int pending_idx, usr_idx, mmap_idx;
	pending_req_t *pending_req;
	tap_blkif_t *info;

	info = tapfds[idx];
	if (info == NULL)
		return 0;

	/* We currently only forward packets in INTERCEPT_FE mode. */
	if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
		return 0;

	/* for each outstanding message on the UFEring  */
	rp = info->ufe_ring.sring->rsp_prod;
	rmb();

	for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
		resp = RING_GET_RESPONSE(&info->ufe_ring, i);
		++info->ufe_ring.rsp_cons;

		/*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
		usr_idx = (int)resp->id;
		pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
		mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);

		if ( (mmap_idx >= mmap_alloc) ||
		     (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) )
			WPRINTK("Incorrect req map "
				"[%d], internal map [%d,%d (%d)]\n",
				usr_idx, mmap_idx,
				ID_TO_IDX(info->idx_map[usr_idx]),
				MASK_PEND_IDX(
					ID_TO_IDX(info->idx_map[usr_idx])));

		pending_req = &pending_reqs[mmap_idx][pending_idx];
		blkif = pending_req->blkif;

		for (j = 0; j < pending_req->nr_pages; j++) {
			unsigned long kvaddr, uvaddr;
			struct page **map = info->vma->vm_private_data;
			struct page *pg;
			int offset;

			uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
			kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start,
					    pending_idx, j);

			pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
			ClearPageReserved(pg);
			offset = (uvaddr - info->vma->vm_start)
				>> PAGE_SHIFT;
			map[offset] = NULL;
		}
		fast_flush_area(pending_req, pending_idx, usr_idx, idx);
		make_response(blkif, pending_req->id, resp->operation,
			      resp->status);
		info->idx_map[usr_idx] = INVALID_REQ;
		blkif_put(pending_req->blkif);
		free_req(pending_req);
	}

	return 0;
}
/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

static void blkif_notify_work(blkif_t *blkif)
{
	blkif->waiting_reqs = 1;
	wake_up(&blkif->wq);
}

irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
{
	blkif_notify_work(dev_id);
	return IRQ_HANDLED;
}
/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */

static int print_dbug = 1;
static int do_block_io_op(blkif_t *blkif)
{
	blkif_back_ring_t *blk_ring = &blkif->blk_ring;
	blkif_request_t *req;
	pending_req_t *pending_req;
	RING_IDX rc, rp;
	int more_to_do = 0;
	tap_blkif_t *info;

	rc = blk_ring->req_cons;
	rp = blk_ring->sring->req_prod;
	rmb(); /* Ensure we see queued requests up to 'rp'. */

	/*Check blkif has corresponding UE ring*/
	if (blkif->dev_num == -1) {
		if (print_dbug) {
			WPRINTK("Corresponding UE "
				"ring does not exist!\n");
			print_dbug = 0; /*We only print this message once*/
		}
		return 0;
	}

	info = tapfds[blkif->dev_num];
	if (info == NULL || !info->dev_inuse) {
		if (print_dbug) {
			WPRINTK("Can't get UE info!\n");
			print_dbug = 0;
		}
		return 0;
	}

	while (rc != rp) {

		if (RING_FULL(&info->ufe_ring)) {
			WPRINTK("RING_FULL! More to do\n");
			more_to_do = 1;
			break;
		}

		if (RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
			WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
				" More to do\n");
			more_to_do = 1;
			break;
		}

		pending_req = alloc_req();
		if (NULL == pending_req) {
			blkif->st_oo_req++;
			more_to_do = 1;
			break;
		}

		req = RING_GET_REQUEST(blk_ring, rc);
		blk_ring->req_cons = ++rc; /* before make_response() */

		switch (req->operation) {
		case BLKIF_OP_READ:
			blkif->st_rd_req++;
			dispatch_rw_block_io(blkif, req, pending_req);
			break;

		case BLKIF_OP_WRITE:
			blkif->st_wr_req++;
			dispatch_rw_block_io(blkif, req, pending_req);
			break;

		default:
			WPRINTK("unknown operation [%d]\n",
				req->operation);
			make_response(blkif, req->id, req->operation,
				      BLKIF_RSP_ERROR);
			free_req(pending_req);
			break;
		}
	}

	blktap_kick_user(blkif->dev_num);

	return more_to_do;
}
static void dispatch_rw_block_io(blkif_t *blkif,
				 blkif_request_t *req,
				 pending_req_t *pending_req)
{
	extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
	int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
	unsigned int nseg;
	int ret, i;
	tap_blkif_t *info = tapfds[blkif->dev_num];

	blkif_request_t *target;
	int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
	int usr_idx = GET_NEXT_REQ(info->idx_map);
	uint16_t mmap_idx = pending_req->mem_idx;

	/*Check we have space on user ring - should never fail*/
	if (usr_idx == INVALID_REQ)
		goto fail_flush;

	/* Check that number of segments is sane. */
	nseg = req->nr_segments;
	if ( unlikely(nseg == 0) ||
	     unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
		WPRINTK("Bad number of segments in request (%d)\n", nseg);
		goto fail_response;
	}

	/* Make sure userspace is ready. */
	if (!info->ring_ok) {
		WPRINTK("blktap: ring not ready for requests!\n");
		goto fail_response;
	}

	if (RING_FULL(&info->ufe_ring)) {
		WPRINTK("blktap: fe_ring is full, can't add. "
			"IO Request will be dropped. %d %d\n",
			RING_SIZE(&info->ufe_ring),
			RING_SIZE(&blkif->blk_ring));
		goto fail_response;
	}

	pending_req->blkif     = blkif;
	pending_req->id        = req->id;
	pending_req->operation = operation;
	pending_req->status    = BLKIF_RSP_OKAY;
	pending_req->nr_pages  = nseg;
	op = 0;
	for (i = 0; i < nseg; i++) {
		unsigned long uvaddr;
		unsigned long kvaddr;
		uint64_t ptep;
		struct page *page;
		uint32_t flags;
		uint64_t sector;

		uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
		kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start,
				    pending_idx, i);
		page = virt_to_page(kvaddr);

		sector = req->sector_number + (8*i);
		if ( (blkif->sectors > 0) && (sector >= blkif->sectors) ) {
			WPRINTK("BLKTAP: Sector request greater "
				"than size\n");
			WPRINTK("BLKTAP: %s request sector "
				"[%llu,%llu], Total [%llu]\n",
				(req->operation ==
				 BLKIF_OP_WRITE ? "WRITE" : "READ"),
				(long long unsigned) sector,
				(long long unsigned) sector>>9,
				(long long unsigned) blkif->sectors);
		}

		/* Map the frame into the kernel-side slot... */
		flags = GNTMAP_host_map;
		if (operation == WRITE)
			flags |= GNTMAP_readonly;
		gnttab_set_map_op(&map[op], kvaddr, flags,
				  req->seg[i].gref, blkif->domid);
		op++;

		/* Now map it to user. */
		ret = create_lookup_pte_addr(info->vma->vm_mm,
					     uvaddr, &ptep);
		if (ret) {
			WPRINTK("Couldn't get a pte addr!\n");
			fast_flush_area(pending_req, pending_idx, usr_idx,
					blkif->dev_num);
			goto fail_flush;
		}

		flags = GNTMAP_host_map | GNTMAP_application_map
			| GNTMAP_contains_pte;
		if (operation == WRITE)
			flags |= GNTMAP_readonly;
		gnttab_set_map_op(&map[op], ptep, flags,
				  req->seg[i].gref, blkif->domid);
		op++;
	}

	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
	BUG_ON(ret);

	/* Each segment produced two map ops: [i] kernel, [i+1] user. */
	for (i = 0; i < (nseg*2); i+=2) {
		unsigned long uvaddr;
		unsigned long kvaddr;
		unsigned long offset;
		struct page *pg;

		uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
		kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start,
				    pending_idx, i/2);

		if (unlikely(map[i].status != 0)) {
			WPRINTK("invalid kernel buffer -- "
				"could not remap it\n");
			goto fail_flush;
		}

		if (unlikely(map[i+1].status != 0)) {
			WPRINTK("invalid user buffer -- "
				"could not remap it\n");
			goto fail_flush;
		}

		pending_handle(mmap_idx, pending_idx, i/2).kernel
			= map[i].handle;
		pending_handle(mmap_idx, pending_idx, i/2).user
			= map[i+1].handle;
		set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
			FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
		offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
		pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
		((struct page **)info->vma->vm_private_data)[offset] =
			pg;
	}
	/* Mark mapped pages as reserved: */
	for (i = 0; i < req->nr_segments; i++) {
		unsigned long kvaddr;
		struct page *pg;

		kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start,
				    pending_idx, i);
		pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
		SetPageReserved(pg);
	}

	/*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
	info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);

	/* Hold a ref until the response comes back via the user ring
	 * (dropped by blkif_put() in blktap_read_ufe_ring()). */
	blkif_get(blkif);

	/* Finally, write the request message to the user ring. */
	target = RING_GET_REQUEST(&info->ufe_ring,
				  info->ufe_ring.req_prod_pvt);
	memcpy(target, req, sizeof(*req));
	target->id = usr_idx;
	info->ufe_ring.req_prod_pvt++;
	return;

 fail_flush:
	WPRINTK("Reached Fail_flush\n");
	fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
 fail_response:
	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
	free_req(pending_req);
}
/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */


static void make_response(blkif_t *blkif, unsigned long id,
			  unsigned short op, int st)
{
	blkif_response_t *resp;
	unsigned long     flags;
	blkif_back_ring_t *blk_ring = &blkif->blk_ring;
	int more_to_do = 0;
	int notify;

	spin_lock_irqsave(&blkif->blk_ring_lock, flags);
	/* Place on the response ring for the relevant domain. */
	resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
	resp->id        = id;
	resp->operation = op;
	resp->status    = st;
	blk_ring->rsp_prod_pvt++;
	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);

	if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
		/*
		 * Tail check for pending requests. Allows frontend to avoid
		 * notifications if requests are already in flight (lower
		 * overheads and promotes batching).
		 */
		RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
	} else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
		more_to_do = 1;
	}

	spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
	if (more_to_do)
		blkif_notify_work(blkif);
	if (notify)
		notify_remote_via_irq(blkif->irq);
}
static int __init blkif_init(void)
{
	int i, ret, blktap_dir;
	tap_blkif_t *info;

	if (!is_running_on_xen())
		return -ENODEV;

	INIT_LIST_HEAD(&pending_free);
	for (i = 0; i < 2; i++) {
		ret = req_increase();
		if (ret)
			break;
	}
	if (i == 0)
		return ret;

	tap_blkif_interface_init();

	alloc_pending_reqs = 0;

	tap_blkif_xenbus_init();

	/*Create the blktap devices, but do not map memory or waitqueue*/
	for (i = 0; i < MAX_TAP_DEV; i++)
		translate_domid[i].domid = 0xFFFF;

	ret = register_chrdev(BLKTAP_DEV_MAJOR, "blktap", &blktap_fops);
	blktap_dir = devfs_mk_dir(NULL, "xen", 0, NULL);

	if ( (ret < 0) || (blktap_dir < 0) ) {
		WPRINTK("Couldn't register /dev/xen/blktap\n");
		return -ENOMEM;
	}

	for (i = 0; i < MAX_TAP_DEV; i++) {
		/* kzalloc() zeroes the struct, so pid/blkif/etc. start NULL. */
		info = tapfds[i] = kzalloc(sizeof(tap_blkif_t), GFP_KERNEL);
		if (tapfds[i] == NULL)
			return -ENOMEM;
		info->minor = i;

		ret = devfs_mk_cdev(MKDEV(BLKTAP_DEV_MAJOR, i),
				    S_IFCHR|S_IRUGO|S_IWUSR,
				    "xen/blktap%d", i);

		if (ret != 0)
			return -ENOMEM;
		info->dev_pending = info->dev_inuse = 0;

		DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n", i);
	}

	DPRINTK("Blktap device successfully created\n");

	return 0;
}

module_init(blkif_init);

MODULE_LICENSE("Dual BSD/GPL");