X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=drivers%2Fxen%2Fblktap%2Fblktap.c;fp=drivers%2Fxen%2Fblktap%2Fblktap.c;h=8c5cf68d7fb445661b15da5b1ab8e36ba21e5f30;hb=4e76c8a9fa413ccc09d3f7f664183dcce3555d57;hp=a9a00677bc0fbc6bdca6d1564ee9a915aca47902;hpb=1db395853d4f30d6120458bd279ede1f882a8525;p=linux-2.6.git diff --git a/drivers/xen/blktap/blktap.c b/drivers/xen/blktap/blktap.c index a9a00677b..8c5cf68d7 100644 --- a/drivers/xen/blktap/blktap.c +++ b/drivers/xen/blktap/blktap.c @@ -1,87 +1,1450 @@ /****************************************************************************** - * blktap.c + * drivers/xen/blktap/blktap.c * - * XenLinux virtual block-device tap. + * Back-end driver for user level virtual block devices. This portion of the + * driver exports a 'unified' block-device interface that can be accessed + * by any operating system that implements a compatible front end. Requests + * are remapped to a user-space memory region. + * + * Based on the blkback driver code. * - * Copyright (c) 2004, Andrew Warfield + * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield * - * Based on the original split block driver: - * Copyright (c) 2003-2004, Keir Fraser & Steve Hand - * Modifications by Mark A. Williamson are (c) Intel Research Cambridge - * Copyright (c) 2004, Christian Limpach + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * - * Note that unlike the split block driver code, this driver has been developed - * strictly for Linux 2.6 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include "common.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_TAP_DEV 100 /*the maximum number of tapdisk ring devices */ +#define MAX_DEV_NAME 100 /*the max tapdisk ring device name e.g. blktap0 */ + +/* + * The maximum number of requests that can be outstanding at any time + * is determined by + * + * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] + * + * where mmap_alloc < MAX_DYNAMIC_MEM. + * + * TODO: + * mmap_alloc is initialised to 2 and should be adjustable on the fly via + * sysfs. 
*/ +#define MAX_DYNAMIC_MEM 64 +#define MAX_PENDING_REQS 64 +#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) +#define MMAP_VADDR(_start, _req,_seg) \ + (_start + \ + ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * PAGE_SIZE)) +static int blkif_reqs = MAX_PENDING_REQS; +static int mmap_pages = MMAP_PAGES; + +#define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we + * have a bunch of pages reserved for shared + * memory rings. + */ + +/*Data struct associated with each of the tapdisk devices*/ +typedef struct tap_blkif { + struct vm_area_struct *vma; /*Shared memory area */ + unsigned long rings_vstart; /*Kernel memory mapping */ + unsigned long user_vstart; /*User memory mapping */ + unsigned long dev_inuse; /*One process opens device at a time. */ + unsigned long dev_pending; /*In process of being opened */ + unsigned long ring_ok; /*make this ring->state */ + blkif_front_ring_t ufe_ring; /*Rings up to user space. */ + wait_queue_head_t wait; /*for poll */ + unsigned long mode; /*current switching mode */ + int minor; /*Minor number for tapdisk device */ + pid_t pid; /*tapdisk process id */ + enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace + shutdown */ + unsigned long *idx_map; /*Record the user ring id to kern + [req id, idx] tuple */ + blkif_t *blkif; /*Associate blkif with tapdev */ +} tap_blkif_t; + +/*Private data struct associated with the inode*/ +typedef struct private_info { + int idx; +} private_info_t; + +/*Data struct handed back to userspace for tapdisk device to VBD mapping*/ +typedef struct domid_translate { + unsigned short domid; + unsigned short busid; +} domid_translate_t ; -#include "blktap.h" -int __init xlblktap_init(void) +domid_translate_t translate_domid[MAX_TAP_DEV]; +tap_blkif_t *tapfds[MAX_TAP_DEV]; + +static int __init set_blkif_reqs(char *str) { - ctrl_msg_t cmsg; - blkif_fe_driver_status_t fe_st; - blkif_be_driver_status_t be_st; + get_option(&str, &blkif_reqs); + return 1; +} +__setup("blkif_reqs=", set_blkif_reqs); + +/* Run-time switchable: /sys/module/blktap/parameters/ */ +static unsigned int log_stats = 0; +static unsigned int debug_lvl = 0; +module_param(log_stats, int, 0644); +module_param(debug_lvl, int, 0644); + +/* + * Each outstanding request that we've passed to the lower device layers has a + * 'pending_req' allocated to it. Each buffer_head that completes decrements + * the pendcnt towards zero. When it hits zero, the specified domain has a + * response queued for it, with the saved 'id' passed back. 
+ */ +typedef struct { + blkif_t *blkif; + unsigned long id; + unsigned short mem_idx; + int nr_pages; + atomic_t pendcnt; + unsigned short operation; + int status; + struct list_head free_list; + int inuse; +} pending_req_t; + +static pending_req_t *pending_reqs[MAX_PENDING_REQS]; +static struct list_head pending_free; +static DEFINE_SPINLOCK(pending_free_lock); +static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq); +static int alloc_pending_reqs; + +typedef unsigned int PEND_RING_IDX; + +static inline int MASK_PEND_IDX(int i) { + return (i & (MAX_PENDING_REQS-1)); +} + +static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) { + return (req - pending_reqs[idx]); +} + +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) + +#define BLKBACK_INVALID_HANDLE (~0) + +typedef struct mmap_page { + unsigned long start; + struct page *mpage; +} mmap_page_t; + +static mmap_page_t mmap_start[MAX_DYNAMIC_MEM]; +static unsigned short mmap_alloc = 0; +static unsigned short mmap_lock = 0; +static unsigned short mmap_inuse = 0; +static unsigned long *pending_addrs[MAX_DYNAMIC_MEM]; + +/****************************************************************** + * GRANT HANDLES + */ + +/* When using grant tables to map a frame for device access then the + * handle returned must be used to unmap the frame. This is needed to + * drop the ref count on the frame. + */ +struct grant_handle_pair +{ + grant_handle_t kernel; + grant_handle_t user; +}; + +static struct grant_handle_pair + pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES]; +#define pending_handle(_id, _idx, _i) \ + (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \ + + (_i)]) - printk(KERN_INFO "Initialising Xen block tap device\n"); - DPRINTK(" tap - Backend connection init:\n"); +static int blktap_read_ufe_ring(int idx); /*local prototypes*/ +#define BLKTAP_MINOR 0 /*/dev/xen/blktap resides at device number + major=254, minor numbers begin at 0 */ +#define BLKTAP_DEV_MAJOR 254 /* TODO: Make major number dynamic * + * and create devices in the kernel * + */ +#define BLKTAP_DEV_DIR "/dev/xen" - (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx, - CALLBACK_IN_BLOCKING_CONTEXT); +/* blktap IOCTLs: */ +#define BLKTAP_IOCTL_KICK_FE 1 +#define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */ +#define BLKTAP_IOCTL_SETMODE 3 +#define BLKTAP_IOCTL_SENDPID 4 +#define BLKTAP_IOCTL_NEWINTF 5 +#define BLKTAP_IOCTL_MINOR 6 +#define BLKTAP_IOCTL_MAJOR 7 +#define BLKTAP_QUERY_ALLOC_REQS 8 +#define BLKTAP_IOCTL_FREEINTF 9 +#define BLKTAP_IOCTL_PRINT_IDXS 100 - /* Send a driver-UP notification to the domain controller. */ - cmsg.type = CMSG_BLKIF_FE; - cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS; - cmsg.length = sizeof(blkif_fe_driver_status_t); - fe_st.status = BLKIF_DRIVER_STATUS_UP; - memcpy(cmsg.msg, &fe_st, sizeof(fe_st)); - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); +/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */ +#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */ +#define BLKTAP_MODE_INTERCEPT_FE 0x00000001 +#define BLKTAP_MODE_INTERCEPT_BE 0x00000002 /* unimp. */ + +#define BLKTAP_MODE_INTERPOSE \ + (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE) + + +static inline int BLKTAP_MODE_VALID(unsigned long arg) +{ + return ((arg == BLKTAP_MODE_PASSTHROUGH ) || + (arg == BLKTAP_MODE_INTERCEPT_FE) || + (arg == BLKTAP_MODE_INTERPOSE )); +} + +/* Requests passing through the tap to userspace are re-assigned an ID. 
+ * We must record a mapping between the BE [IDX,ID] tuple and the userspace + * ring ID. + */ + +static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx) +{ + return ((fe_dom << 16) | MASK_PEND_IDX(idx)); +} + +extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id) +{ + return (PEND_RING_IDX)(id & 0x0000ffff); +} + +extern inline int ID_TO_MIDX(unsigned long id) +{ + return (int)(id >> 16); +} + +#define INVALID_REQ 0xdead0000 + +/*TODO: Convert to a free list*/ +static inline int GET_NEXT_REQ(unsigned long *idx_map) +{ + int i; + for (i = 0; i < MAX_PENDING_REQS; i++) + if (idx_map[i] == INVALID_REQ) return i; + + return INVALID_REQ; +} + + +#define BLKTAP_INVALID_HANDLE(_g) \ + (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF)) + +#define BLKTAP_INVALIDATE_HANDLE(_g) do { \ + (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \ + } while(0) + + +/****************************************************************** + * BLKTAP VM OPS + */ + +static struct page *blktap_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + /* + * if the page has not been mapped in by the driver then return + * NOPAGE_SIGBUS to the domain. + */ + + return NOPAGE_SIGBUS; +} + +struct vm_operations_struct blktap_vm_ops = { + nopage: blktap_nopage, +}; + +/****************************************************************** + * BLKTAP FILE OPS + */ + +/*Function Declarations*/ +static int get_next_free_dev(void); +static int blktap_open(struct inode *inode, struct file *filp); +static int blktap_release(struct inode *inode, struct file *filp); +static int blktap_mmap(struct file *filp, struct vm_area_struct *vma); +static int blktap_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); +static unsigned int blktap_poll(struct file *file, poll_table *wait); - DPRINTK(" tap - Frontend connection init:\n"); +struct miscdevice *set_misc(int minor, char *name, int dev); + +static struct file_operations blktap_fops = { + .owner = THIS_MODULE, + .poll = blktap_poll, + .ioctl = blktap_ioctl, + .open = blktap_open, + .release = blktap_release, + .mmap = blktap_mmap, +}; + + +static int get_next_free_dev(void) +{ + tap_blkif_t *info; + int i = 0, ret = -1; + unsigned long flags; + + spin_lock_irqsave(&pending_free_lock, flags); + + while (i < MAX_TAP_DEV) { + info = tapfds[i]; + if ( (tapfds[i] != NULL) && (info->dev_inuse == 0) + && (info->dev_pending == 0) ) { + info->dev_pending = 1; + ret = i; + goto done; + } + i++; + } + +done: + spin_unlock_irqrestore(&pending_free_lock, flags); + return ret; +} + +int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif) +{ + int i; + + for (i = 0; i < MAX_TAP_DEV; i++) + if ( (translate_domid[i].domid == domid) + && (translate_domid[i].busid == xenbus_id) ) { + tapfds[i]->blkif = blkif; + tapfds[i]->status = RUNNING; + return i; + } + return -1; +} + +void signal_tapdisk(int idx) +{ + tap_blkif_t *info; + struct task_struct *ptask; + + info = tapfds[idx]; + if ( (idx > 0) && (idx < MAX_TAP_DEV) && (info->pid > 0) ) { + ptask = find_task_by_pid(info->pid); + if (ptask) { + info->status = CLEANSHUTDOWN; + } + } + info->blkif = NULL; + return; +} + +static int blktap_open(struct inode *inode, struct file *filp) +{ + blkif_sring_t *sring; + int idx = iminor(inode) - BLKTAP_MINOR; + tap_blkif_t *info; + private_info_t *prv; + int i; + + if (tapfds[idx] == NULL) { + WPRINTK("Unable to open device /dev/xen/blktap%d\n", + idx); + return -ENOMEM; + } + DPRINTK("Opening device /dev/xen/blktap%d\n",idx); + + info = 
tapfds[idx]; + + /*Only one process can access device at a time*/ + if (test_and_set_bit(0, &info->dev_inuse)) + return -EBUSY; + + info->dev_pending = 0; + + /* Allocate the fe ring. */ + sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL); + if (sring == NULL) + goto fail_nomem; + + SetPageReserved(virt_to_page(sring)); + + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE); + + prv = kzalloc(sizeof(private_info_t),GFP_KERNEL); + prv->idx = idx; + filp->private_data = prv; + info->vma = NULL; + + info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS, + GFP_KERNEL); + + if (idx > 0) { + init_waitqueue_head(&info->wait); + for (i = 0; i < MAX_PENDING_REQS; i++) + info->idx_map[i] = INVALID_REQ; + } + + DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx); + return 0; + + fail_nomem: + return -ENOMEM; +} + +static int blktap_release(struct inode *inode, struct file *filp) +{ + int idx = iminor(inode) - BLKTAP_MINOR; + tap_blkif_t *info; + + if (tapfds[idx] == NULL) { + WPRINTK("Trying to free device that doesn't exist " + "[/dev/xen/blktap%d]\n",idx); + return -1; + } + info = tapfds[idx]; + info->dev_inuse = 0; + DPRINTK("Freeing device [/dev/xen/blktap%d]\n",idx); + + /* Free the ring page. */ + ClearPageReserved(virt_to_page(info->ufe_ring.sring)); + free_page((unsigned long) info->ufe_ring.sring); + + /* Clear any active mappings and free foreign map table */ + if (info->vma) { + zap_page_range( + info->vma, info->vma->vm_start, + info->vma->vm_end - info->vma->vm_start, NULL); + info->vma = NULL; + } + + if (filp->private_data) kfree(filp->private_data); + + if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) { + kthread_stop(info->blkif->xenblkd); + info->blkif->xenblkd = NULL; + info->status = CLEANSHUTDOWN; + } + return 0; +} + + +/* Note on mmap: + * We need to map pages to user space in a way that will allow the block + * subsystem set up direct IO to them. This couldn't be done before, because + * there isn't really a sane way to translate a user virtual address down to a + * physical address when the page belongs to another domain. + * + * My first approach was to map the page in to kernel memory, add an entry + * for it in the physical frame list (using alloc_lomem_region as in blkback) + * and then attempt to map that page up to user space. This is disallowed + * by xen though, which realizes that we don't really own the machine frame + * underlying the physical page. + * + * The new approach is to provide explicit support for this in xen linux. + * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages + * mapped from other vms. vma->vm_private_data is set up as a mapping + * from pages to actual page structs. There is a new clause in get_user_pages + * that does the right thing for this sort of mapping. 
+ */ +static int blktap_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int size; + struct page **map; + int i; + private_info_t *prv; + tap_blkif_t *info; + + /*Retrieve the dev info*/ + prv = (private_info_t *)filp->private_data; + if (prv == NULL) { + WPRINTK("blktap: mmap, retrieving idx failed\n"); + return -ENOMEM; + } + info = tapfds[prv->idx]; + + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &blktap_vm_ops; + + size = vma->vm_end - vma->vm_start; + if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) { + WPRINTK("you _must_ map exactly %d pages!\n", + mmap_pages + RING_PAGES); + return -EAGAIN; + } + + size >>= PAGE_SHIFT; + info->rings_vstart = vma->vm_start; + info->user_vstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT); - active_reqs_init(); - blkif_interface_init(); - blkdev_schedule_init(); + /* Map the ring pages to the start of the region and reserve it. */ + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (remap_pfn_range(vma, vma->vm_start, + __pa(info->ufe_ring.sring) >> PAGE_SHIFT, + PAGE_SIZE, vma->vm_page_prot)) { + WPRINTK("Mapping user ring failed!\n"); + goto fail; + } + + /* Mark this VM as containing foreign pages, and set up mappings. */ + map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + * sizeof(struct page_struct*), + GFP_KERNEL); + if (map == NULL) { + WPRINTK("Couldn't alloc VM_FOREIGN map.\n"); + goto fail; + } + + for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++) + map[i] = NULL; - (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, - CALLBACK_IN_BLOCKING_CONTEXT); + vma->vm_private_data = map; + vma->vm_flags |= VM_FOREIGN; + + info->vma = vma; + info->ring_ok = 1; + return 0; + fail: + /* Clear any active mappings. */ + zap_page_range(vma, vma->vm_start, + vma->vm_end - vma->vm_start, NULL); - /* Send a driver-UP notification to the domain controller. */ - cmsg.type = CMSG_BLKIF_BE; - cmsg.subtype = CMSG_BLKIF_BE_DRIVER_STATUS; - cmsg.length = sizeof(blkif_be_driver_status_t); - be_st.status = BLKIF_DRIVER_STATUS_UP; - memcpy(cmsg.msg, &be_st, sizeof(be_st)); - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); + return -ENOMEM; +} - DPRINTK(" tap - Userland channel init:\n"); - blktap_init(); +static int blktap_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + int idx = iminor(inode) - BLKTAP_MINOR; + switch(cmd) { + case BLKTAP_IOCTL_KICK_FE: + { + /* There are fe messages to process. */ + return blktap_read_ufe_ring(idx); + } + case BLKTAP_IOCTL_SETMODE: + { + tap_blkif_t *info = tapfds[idx]; + + if ( (idx > 0) && (idx < MAX_TAP_DEV) + && (tapfds[idx] != NULL) ) + { + if (BLKTAP_MODE_VALID(arg)) { + info->mode = arg; + /* XXX: may need to flush rings here. 
*/ + DPRINTK("blktap: set mode to %lx\n", + arg); + return 0; + } + } + return 0; + } + case BLKTAP_IOCTL_PRINT_IDXS: + { + tap_blkif_t *info = tapfds[idx]; + + if ( (idx > 0) && (idx < MAX_TAP_DEV) + && (tapfds[idx] != NULL) ) + { + printk("User Rings: \n-----------\n"); + printk("UF: rsp_cons: %2d, req_prod_prv: %2d " + "| req_prod: %2d, rsp_prod: %2d\n", + info->ufe_ring.rsp_cons, + info->ufe_ring.req_prod_pvt, + info->ufe_ring.sring->req_prod, + info->ufe_ring.sring->rsp_prod); + } + return 0; + } + case BLKTAP_IOCTL_SENDPID: + { + tap_blkif_t *info = tapfds[idx]; + + if ( (idx > 0) && (idx < MAX_TAP_DEV) + && (tapfds[idx] != NULL) ) + { + info->pid = (pid_t)arg; + DPRINTK("blktap: pid received %d\n", + info->pid); + } + return 0; + } + case BLKTAP_IOCTL_NEWINTF: + { + uint64_t val = (uint64_t)arg; + domid_translate_t *tr = (domid_translate_t *)&val; + int newdev; - DPRINTK("Blkif tap device initialized.\n"); + DPRINTK("NEWINTF Req for domid %d and bus id %d\n", + tr->domid, tr->busid); + newdev = get_next_free_dev(); + if (newdev < 1) { + WPRINTK("Error initialising /dev/xen/blktap - " + "No more devices\n"); + return -1; + } + translate_domid[newdev].domid = tr->domid; + translate_domid[newdev].busid = tr->busid; + return newdev; + } + case BLKTAP_IOCTL_FREEINTF: + { + unsigned long dev = arg; + tap_blkif_t *info = NULL; - return 0; + if ( (dev > 0) && (dev < MAX_TAP_DEV) ) info = tapfds[dev]; + + if ( (info != NULL) && (info->dev_pending) ) + info->dev_pending = 0; + return 0; + } + case BLKTAP_IOCTL_MINOR: + { + unsigned long dev = arg; + tap_blkif_t *info = NULL; + + if ( (dev > 0) && (dev < MAX_TAP_DEV) ) info = tapfds[dev]; + + if (info != NULL) return info->minor; + else return -1; + } + case BLKTAP_IOCTL_MAJOR: + return BLKTAP_DEV_MAJOR; + + case BLKTAP_QUERY_ALLOC_REQS: + { + WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n", + alloc_pending_reqs, blkif_reqs); + return (alloc_pending_reqs/blkif_reqs) * 100; + } + } + return -ENOIOCTLCMD; } -#if 0 /* tap doesn't handle suspend/resume */ -void blkdev_suspend(void) +static unsigned int blktap_poll(struct file *file, poll_table *wait) { + private_info_t *prv; + tap_blkif_t *info; + + /*Retrieve the dev info*/ + prv = (private_info_t *)file->private_data; + if (prv == NULL) { + WPRINTK(" poll, retrieving idx failed\n"); + return 0; + } + + if (prv->idx == 0) return 0; + + info = tapfds[prv->idx]; + + poll_wait(file, &info->wait, wait); + if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) { + flush_tlb_all(); + RING_PUSH_REQUESTS(&info->ufe_ring); + return POLLIN | POLLRDNORM; + } + return 0; } -void blkdev_resume(void) +void blktap_kick_user(int idx) { - ctrl_msg_t cmsg; - blkif_fe_driver_status_t st; + tap_blkif_t *info; - /* Send a driver-UP notification to the domain controller. 
*/ - cmsg.type = CMSG_BLKIF_FE; - cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS; - cmsg.length = sizeof(blkif_fe_driver_status_t); - st.status = BLKIF_DRIVER_STATUS_UP; - memcpy(cmsg.msg, &st, sizeof(st)); - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); + if (idx == 0) return; + + info = tapfds[idx]; + + if (info != NULL) wake_up_interruptible(&info->wait); + return; } + +static int do_block_io_op(blkif_t *blkif); +static void dispatch_rw_block_io(blkif_t *blkif, + blkif_request_t *req, + pending_req_t *pending_req); +static void make_response(blkif_t *blkif, unsigned long id, + unsigned short op, int st); + +/****************************************************************** + * misc small helpers + */ +static int req_increase(void) +{ + int i, j; + struct page *page; + unsigned long flags; + int ret; + + spin_lock_irqsave(&pending_free_lock, flags); + + ret = -EINVAL; + if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock) + goto done; + +#ifdef __ia64__ + extern unsigned long alloc_empty_foreign_map_page_range( + unsigned long pages); + mmap_start[mmap_alloc].start = (unsigned long) + alloc_empty_foreign_map_page_range(mmap_pages); +#else /* ! ia64 */ + page = balloon_alloc_empty_page_range(mmap_pages); + ret = -ENOMEM; + if (page == NULL) { + printk("%s balloon_alloc_empty_page_range gave NULL\n", __FUNCTION__); + goto done; + } + + /* Pin all of the pages. */ + for (i=0; iinuse == 0) { + list_del(&req->free_list); + mmap_inuse--; + } + } + if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1); + done: + spin_unlock_irqrestore(&pending_free_lock, flags); + return; +} + +static pending_req_t* alloc_req(void) +{ + pending_req_t *req = NULL; + unsigned long flags; + + spin_lock_irqsave(&pending_free_lock, flags); + + if (!list_empty(&pending_free)) { + req = list_entry(pending_free.next, pending_req_t, free_list); + list_del(&req->free_list); + } + + if (req) { + req->inuse = 1; + alloc_pending_reqs++; + } + spin_unlock_irqrestore(&pending_free_lock, flags); + + return req; +} + +static void free_req(pending_req_t *req) +{ + unsigned long flags; + int was_empty; + + spin_lock_irqsave(&pending_free_lock, flags); + + alloc_pending_reqs--; + req->inuse = 0; + if (mmap_lock && (req->mem_idx == mmap_alloc-1)) { + mmap_inuse--; + if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1); + spin_unlock_irqrestore(&pending_free_lock, flags); + return; + } + was_empty = list_empty(&pending_free); + list_add(&req->free_list, &pending_free); + + spin_unlock_irqrestore(&pending_free_lock, flags); + + if (was_empty) + wake_up(&pending_free_wq); +} + +static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx, int + tapidx) +{ + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; + unsigned int i, invcount = 0; + struct grant_handle_pair *khandle; + uint64_t ptep; + int ret, mmap_idx; + unsigned long kvaddr, uvaddr; + + tap_blkif_t *info = tapfds[tapidx]; + + if (info == NULL) { + WPRINTK("fast_flush: Couldn't get info!\n"); + return; + } + mmap_idx = req->mem_idx; + + for (i = 0; i < req->nr_pages; i++) { + kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, k_idx, i); + uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i); + + khandle = &pending_handle(mmap_idx, k_idx, i); + if (BLKTAP_INVALID_HANDLE(khandle)) { + WPRINTK("BLKTAP_INVALID_HANDLE\n"); + continue; + } + gnttab_set_unmap_op(&unmap[invcount], + MMAP_VADDR(mmap_start[mmap_idx].start, k_idx, i), + GNTMAP_host_map, khandle->kernel); + invcount++; + + if (create_lookup_pte_addr( + info->vma->vm_mm, + 
MMAP_VADDR(info->user_vstart, u_idx, i), + &ptep) !=0) { + WPRINTK("Couldn't get a pte addr!\n"); + return; + } + + gnttab_set_unmap_op(&unmap[invcount], + ptep, GNTMAP_host_map, + khandle->user); + invcount++; + + BLKTAP_INVALIDATE_HANDLE(khandle); + } + ret = HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, unmap, invcount); + BUG_ON(ret); + + if (info->vma != NULL) + zap_page_range(info->vma, + MMAP_VADDR(info->user_vstart, u_idx, 0), + req->nr_pages << PAGE_SHIFT, NULL); +} + +/****************************************************************** + * SCHEDULER FUNCTIONS + */ + +static void print_stats(blkif_t *blkif) +{ + printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d\n", + current->comm, blkif->st_oo_req, + blkif->st_rd_req, blkif->st_wr_req); + blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); + blkif->st_rd_req = 0; + blkif->st_wr_req = 0; + blkif->st_oo_req = 0; +} + +int tap_blkif_schedule(void *arg) +{ + blkif_t *blkif = arg; + + blkif_get(blkif); + + if (debug_lvl) + printk(KERN_DEBUG "%s: started\n", current->comm); + + while (!kthread_should_stop()) { + wait_event_interruptible( + blkif->wq, + blkif->waiting_reqs || kthread_should_stop()); + wait_event_interruptible( + pending_free_wq, + !list_empty(&pending_free) || kthread_should_stop()); + + blkif->waiting_reqs = 0; + smp_mb(); /* clear flag *before* checking for work */ + + if (do_block_io_op(blkif)) + blkif->waiting_reqs = 1; + + if (log_stats && time_after(jiffies, blkif->st_print)) + print_stats(blkif); + } + + if (log_stats) + print_stats(blkif); + if (debug_lvl) + printk(KERN_DEBUG "%s: exiting\n", current->comm); + + blkif->xenblkd = NULL; + blkif_put(blkif); + + return 0; +} + +/****************************************************************** + * COMPLETION CALLBACK -- Called by user level ioctl() + */ + +static int blktap_read_ufe_ring(int idx) +{ + /* This is called to read responses from the UFE ring. */ + RING_IDX i, j, rp; + blkif_response_t *resp; + blkif_t *blkif=NULL; + int pending_idx, usr_idx, mmap_idx; + pending_req_t *pending_req; + tap_blkif_t *info; + + info = tapfds[idx]; + if (info == NULL) { + return 0; + } + + /* We currently only forward packets in INTERCEPT_FE mode. 
*/ + if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE)) + return 0; + + /* for each outstanding message on the UFEring */ + rp = info->ufe_ring.sring->rsp_prod; + rmb(); + + for (i = info->ufe_ring.rsp_cons; i != rp; i++) { + resp = RING_GET_RESPONSE(&info->ufe_ring, i); + ++info->ufe_ring.rsp_cons; + + /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/ + usr_idx = (int)resp->id; + pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx])); + mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]); + + if ( (mmap_idx >= mmap_alloc) || + (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) ) + WPRINTK("Incorrect req map" + "[%d], internal map [%d,%d (%d)]\n", + usr_idx, mmap_idx, + ID_TO_IDX(info->idx_map[usr_idx]), + MASK_PEND_IDX( + ID_TO_IDX(info->idx_map[usr_idx]))); + + pending_req = &pending_reqs[mmap_idx][pending_idx]; + blkif = pending_req->blkif; + + for (j = 0; j < pending_req->nr_pages; j++) { + + unsigned long kvaddr, uvaddr; + struct page **map = info->vma->vm_private_data; + struct page *pg; + int offset; + + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j); + kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, + pending_idx, j); + + pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); + ClearPageReserved(pg); + offset = (uvaddr - info->vma->vm_start) + >> PAGE_SHIFT; + map[offset] = NULL; + } + fast_flush_area(pending_req, pending_idx, usr_idx, idx); + make_response(blkif, pending_req->id, resp->operation, + resp->status); + info->idx_map[usr_idx] = INVALID_REQ; + blkif_put(pending_req->blkif); + free_req(pending_req); + } + + return 0; +} + + +/****************************************************************************** + * NOTIFICATION FROM GUEST OS. + */ + +static void blkif_notify_work(blkif_t *blkif) +{ + blkif->waiting_reqs = 1; + wake_up(&blkif->wq); +} + +irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + blkif_notify_work(dev_id); + return IRQ_HANDLED; +} + + + +/****************************************************************** + * DOWNWARD CALLS -- These interface with the block-device layer proper. + */ +static int print_dbug = 1; +static int do_block_io_op(blkif_t *blkif) +{ + blkif_back_ring_t *blk_ring = &blkif->blk_ring; + blkif_request_t *req; + pending_req_t *pending_req; + RING_IDX rc, rp; + int more_to_do = 0; + tap_blkif_t *info; + + rc = blk_ring->req_cons; + rp = blk_ring->sring->req_prod; + rmb(); /* Ensure we see queued requests up to 'rp'. */ + + /*Check blkif has corresponding UE ring*/ + if (blkif->dev_num == -1) { + /*oops*/ + if (print_dbug) { + WPRINTK("Corresponding UE " + "ring does not exist!\n"); + print_dbug = 0; /*We only print this message once*/ + } + return 1; + } + + info = tapfds[blkif->dev_num]; + if (info == NULL || !info->dev_inuse) { + if (print_dbug) { + WPRINTK("Can't get UE info!\n"); + print_dbug = 0; + } + return 1; + } + + while (rc != rp) { + + if (RING_FULL(&info->ufe_ring)) { + WPRINTK("RING_FULL! More to do\n"); + more_to_do = 1; + break; + } + + if (RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) { + WPRINTK("RING_REQUEST_CONS_OVERFLOW!" 
+ " More to do\n"); + more_to_do = 1; + break; + } + + pending_req = alloc_req(); + if (NULL == pending_req) { + blkif->st_oo_req++; + more_to_do = 1; + break; + } + + req = RING_GET_REQUEST(blk_ring, rc); + blk_ring->req_cons = ++rc; /* before make_response() */ + + switch (req->operation) { + case BLKIF_OP_READ: + blkif->st_rd_req++; + dispatch_rw_block_io(blkif, req, pending_req); + break; + + case BLKIF_OP_WRITE: + blkif->st_wr_req++; + dispatch_rw_block_io(blkif, req, pending_req); + break; + + default: + WPRINTK("unknown operation [%d]\n", + req->operation); + make_response(blkif, req->id, req->operation, + BLKIF_RSP_ERROR); + free_req(pending_req); + break; + } + } + + blktap_kick_user(blkif->dev_num); + + return more_to_do; +} + +static void dispatch_rw_block_io(blkif_t *blkif, + blkif_request_t *req, + pending_req_t *pending_req) +{ + extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); + int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ; + struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; + unsigned int nseg; + int ret, i; + tap_blkif_t *info = tapfds[blkif->dev_num]; + uint64_t sector; + + blkif_request_t *target; + int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx); + int usr_idx = GET_NEXT_REQ(info->idx_map); + uint16_t mmap_idx = pending_req->mem_idx; + + /*Check we have space on user ring - should never fail*/ + if(usr_idx == INVALID_REQ) goto fail_flush; + + /* Check that number of segments is sane. */ + nseg = req->nr_segments; + if ( unlikely(nseg == 0) || + unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) { + WPRINTK("Bad number of segments in request (%d)\n", nseg); + goto fail_response; + } + + /* Make sure userspace is ready. */ + if (!info->ring_ok) { + WPRINTK("blktap: ring not ready for requests!\n"); + goto fail_response; + } + + if (RING_FULL(&info->ufe_ring)) { + WPRINTK("blktap: fe_ring is full, can't add " + "IO Request will be dropped. %d %d\n", + RING_SIZE(&info->ufe_ring), + RING_SIZE(&blkif->blk_ring)); + goto fail_response; + } + + pending_req->blkif = blkif; + pending_req->id = req->id; + pending_req->operation = operation; + pending_req->status = BLKIF_RSP_OKAY; + pending_req->nr_pages = nseg; + op = 0; + for (i = 0; i < nseg; i++) { + unsigned long uvaddr; + unsigned long kvaddr; + uint64_t ptep; + struct page *page; + uint32_t flags; + + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i); + kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, + pending_idx, i); + page = virt_to_page(kvaddr); + + sector = req->sector_number + (8*i); + if( (blkif->sectors > 0) && (sector >= blkif->sectors) ) { + WPRINTK("BLKTAP: Sector request greater" + "than size\n"); + WPRINTK("BLKTAP: %s request sector" + "[%llu,%llu], Total [%llu]\n", + (req->operation == + BLKIF_OP_WRITE ? "WRITE" : "READ"), + (long long unsigned) sector, + (long long unsigned) sector>>9, + blkif->sectors); + } + + flags = GNTMAP_host_map; + if (operation == WRITE) + flags |= GNTMAP_readonly; + gnttab_set_map_op(&map[op], kvaddr, flags, + req->seg[i].gref, blkif->domid); + op++; + + /* Now map it to user. 
*/ + ret = create_lookup_pte_addr(info->vma->vm_mm, + uvaddr, &ptep); + if (ret) { + WPRINTK("Couldn't get a pte addr!\n"); + fast_flush_area(pending_req, pending_idx, usr_idx, + blkif->dev_num); + goto fail_flush; + } + + flags = GNTMAP_host_map | GNTMAP_application_map + | GNTMAP_contains_pte; + if (operation == WRITE) + flags |= GNTMAP_readonly; + gnttab_set_map_op(&map[op], ptep, flags, + req->seg[i].gref, blkif->domid); + op++; + } + + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op); + BUG_ON(ret); + + for (i = 0; i < (nseg*2); i+=2) { + unsigned long uvaddr; + unsigned long kvaddr; + unsigned long offset; + struct page *pg; + + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2); + kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, + pending_idx, i/2); + + if (unlikely(map[i].status != 0)) { + WPRINTK("invalid kernel buffer -- " + "could not remap it\n"); + goto fail_flush; + } + + if (unlikely(map[i+1].status != 0)) { + WPRINTK("invalid user buffer -- " + "could not remap it\n"); + goto fail_flush; + } + + pending_handle(mmap_idx, pending_idx, i/2).kernel + = map[i].handle; + pending_handle(mmap_idx, pending_idx, i/2).user + = map[i+1].handle; + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, + FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); + offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT; + pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); + ((struct page **)info->vma->vm_private_data)[offset] = + pg; + } + /* Mark mapped pages as reserved: */ + for (i = 0; i < req->nr_segments; i++) { + unsigned long kvaddr; + struct page *pg; + + kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, + pending_idx, i); + pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); + SetPageReserved(pg); + } + + /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/ + info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx); + + blkif_get(blkif); + /* Finally, write the request message to the user ring. */ + target = RING_GET_REQUEST(&info->ufe_ring, + info->ufe_ring.req_prod_pvt); + memcpy(target, req, sizeof(*req)); + target->id = usr_idx; + info->ufe_ring.req_prod_pvt++; + return; + + fail_flush: + WPRINTK("Reached Fail_flush\n"); + fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num); + fail_response: + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); + free_req(pending_req); +} + + + +/****************************************************************** + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING + */ + + +static void make_response(blkif_t *blkif, unsigned long id, + unsigned short op, int st) +{ + blkif_response_t *resp; + unsigned long flags; + blkif_back_ring_t *blk_ring = &blkif->blk_ring; + int more_to_do = 0; + int notify; + + spin_lock_irqsave(&blkif->blk_ring_lock, flags); + /* Place on the response ring for the relevant domain. */ + resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt); + resp->id = id; + resp->operation = op; + resp->status = st; + blk_ring->rsp_prod_pvt++; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify); + + if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) { + /* + * Tail check for pending requests. Allows frontend to avoid + * notifications if requests are already in flight (lower + * overheads and promotes batching). 
+ */ + RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do); + } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) { + more_to_do = 1; + + } + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + if (more_to_do) + blkif_notify_work(blkif); + if (notify) + notify_remote_via_irq(blkif->irq); +} + +static int __init blkif_init(void) +{ + int i,ret,blktap_dir; + tap_blkif_t *info; + + if (!is_running_on_xen()) + return -ENODEV; + + INIT_LIST_HEAD(&pending_free); + for(i = 0; i < 2; i++) { + ret = req_increase(); + if (ret) + break; + } + if (i == 0) + return ret; + + tap_blkif_interface_init(); + + alloc_pending_reqs = 0; + + tap_blkif_xenbus_init(); + + /*Create the blktap devices, but do not map memory or waitqueue*/ + for(i = 0; i < MAX_TAP_DEV; i++) translate_domid[i].domid = 0xFFFF; + + ret = register_chrdev(BLKTAP_DEV_MAJOR,"blktap",&blktap_fops); + blktap_dir = devfs_mk_dir(NULL, "xen", 0, NULL); + + if ( (ret < 0)||(blktap_dir < 0) ) { + WPRINTK("Couldn't register /dev/xen/blktap\n"); + return -ENOMEM; + } + + for(i = 0; i < MAX_TAP_DEV; i++ ) { + info = tapfds[i] = kzalloc(sizeof(tap_blkif_t),GFP_KERNEL); + if(tapfds[i] == NULL) return -ENOMEM; + info->minor = i; + info->pid = 0; + info->blkif = NULL; + + ret = devfs_mk_cdev(MKDEV(BLKTAP_DEV_MAJOR, i), + S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", i); + + if(ret != 0) return -ENOMEM; + info->dev_pending = info->dev_inuse = 0; + + DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i); + } + + DPRINTK("Blktap device successfully created\n"); + + return 0; +} + +module_init(blkif_init); + +MODULE_LICENSE("Dual BSD/GPL");
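
Illustrative sketch (not part of the patch): the driver keeps two pieces of per-request bookkeeping that are easy to lose track of when reading the code above. MAKE_ID()/ID_TO_IDX()/ID_TO_MIDX() pack a memory-pool index and a pending-slot index into the single id carried on the user ring, and MMAP_VADDR() places segment seg of request req at a fixed page offset inside the mapped region. The stand-alone C below simply mirrors that arithmetic; PAGE_SIZE (4096) and BLKIF_MAX_SEGMENTS_PER_REQUEST (11) are assumed values here, not taken from this diff. With the defaults above (mmap_alloc starting at 2, MAX_PENDING_REQS = 64) the layout reserves 2 * 64 * 11 = 1408 foreign pages, roughly 5.5 MB at 4 KB pages, for at most 128 requests in flight.

#include <stdio.h>
#include <assert.h>

#define PAGE_SIZE_ASSUMED	4096UL	/* assumed, not from the patch */
#define SEGS_PER_REQ_ASSUMED	11	/* assumed BLKIF_MAX_SEGMENTS_PER_REQUEST */
#define MAX_PENDING_REQS	64	/* matches the driver above */

/* Mirrors MAKE_ID / ID_TO_IDX / ID_TO_MIDX from the driver. */
static unsigned long make_id(int mmap_idx, int pending_idx)
{
	return ((unsigned long)mmap_idx << 16) |
	       (pending_idx & (MAX_PENDING_REQS - 1));
}
static int id_to_idx(unsigned long id)  { return id & 0xffff; }
static int id_to_midx(unsigned long id) { return id >> 16; }

/* Mirrors MMAP_VADDR(_start, _req, _seg). */
static unsigned long mmap_vaddr(unsigned long start, int req, int seg)
{
	return start + (req * SEGS_PER_REQ_ASSUMED + seg) * PAGE_SIZE_ASSUMED;
}

int main(void)
{
	unsigned long id = make_id(1, 37);	/* pool 1, pending slot 37 */
	assert(id_to_midx(id) == 1 && id_to_idx(id) == 37);

	/* Page backing segment 3 of user-ring request 37, as an offset
	 * from user_vstart. */
	printf("offset = %#lx\n", mmap_vaddr(0, 37, 3));
	return 0;
}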
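
Illustrative sketch (not part of the patch) of the user-space side implied by the ioctl and mmap interface above: a tapdisk-like process opens its /dev/xen/blktapN node, registers its pid, selects INTERCEPT_FE mode, maps exactly mmap_pages + RING_PAGES pages (shared ring first, data pages after it), then polls for requests and uses BLKTAP_IOCTL_KICK_FE so the kernel reaps the responses queued on the ring. The ring handling itself is omitted, and BLKIF_MAX_SEGMENTS_PER_REQUEST (11) and the device path are assumptions, so treat this as a minimal sketch rather than the real tapdisk code.

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#define BLKTAP_IOCTL_KICK_FE		1	/* numbers match the driver above */
#define BLKTAP_IOCTL_SETMODE		3
#define BLKTAP_IOCTL_SENDPID		4
#define BLKTAP_MODE_INTERCEPT_FE	0x00000001

#define MAX_PENDING_REQS	64	/* from the driver */
#define SEGS_PER_REQ_ASSUMED	11	/* assumed blkif constant */
#define RING_PAGES		1	/* from the driver */

int main(int argc, char **argv)
{
	long page_size = sysconf(_SC_PAGESIZE);
	size_t pages = MAX_PENDING_REQS * SEGS_PER_REQ_ASSUMED + RING_PAGES;
	const char *dev = (argc > 1) ? argv[1] : "/dev/xen/blktap1";

	int fd = open(dev, O_RDWR);
	if (fd < 0) { perror("open"); return 1; }

	ioctl(fd, BLKTAP_IOCTL_SENDPID, getpid());
	ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE);

	/* blktap_mmap() insists on exactly mmap_pages + RING_PAGES pages:
	 * the shared ring occupies the first page, data pages follow. */
	void *region = mmap(NULL, pages * page_size, PROT_READ | PROT_WRITE,
			    MAP_SHARED, fd, 0);
	if (region == MAP_FAILED) { perror("mmap"); return 1; }

	for (;;) {
		struct pollfd pfd = { .fd = fd, .events = POLLIN };
		if (poll(&pfd, 1, -1) <= 0)
			continue;
		/* ...consume requests from the front ring in the first page,
		 * service them, queue responses on the same ring... */
		ioctl(fd, BLKTAP_IOCTL_KICK_FE, 0);	/* kernel reads responses */
	}
	/* not reached in this sketch; a real daemon would munmap/close on exit */
}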
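
A further hedged sketch of the management-side call implied by BLKTAP_IOCTL_NEWINTF: the driver reinterprets the ioctl argument as a domid_translate_t, i.e. two 16-bit fields (domid, then the xenbus id) packed into one word, and returns the minor number of the tapdisk device it allocated. The packing below assumes a little-endian host, and opening minor 0 (/dev/xen/blktap0) as the control node is an assumption, not something this diff states.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>

#define BLKTAP_IOCTL_NEWINTF	5	/* matches the driver above */

struct domid_translate {		/* mirrors the driver's layout */
	unsigned short domid;
	unsigned short busid;
};

int main(void)
{
	struct domid_translate tr = { .domid = 7, .busid = 768 };
	unsigned long arg = 0;

	/* The driver casts the argument back to domid_translate_t, so on a
	 * little-endian host copying the struct into the low bytes of the
	 * argument reproduces that layout. */
	memcpy(&arg, &tr, sizeof(tr));

	int fd = open("/dev/xen/blktap0", O_RDWR);	/* assumed control node */
	if (fd < 0) { perror("open"); return 1; }

	int minor = ioctl(fd, BLKTAP_IOCTL_NEWINTF, arg);
	if (minor < 1)
		fprintf(stderr, "no free tapdisk device\n");
	else
		printf("allocated /dev/xen/blktap%d\n", minor);

	close(fd);
	return 0;
}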