/******************************************************************************
- * blktap.c
+ * drivers/xen/blktap/blktap.c
*
- * XenLinux virtual block-device tap.
+ * Back-end driver for user level virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. Requests
+ * are remapped to a user-space memory region.
+ *
+ * Based on the blkback driver code.
*
- * Copyright (c) 2004, Andrew Warfield
+ * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
*
- * Based on the original split block driver:
- * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
- * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
- * Copyright (c) 2004, Christian Limpach
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
- * Note that unlike the split block driver code, this driver has been developed
- * strictly for Linux 2.6
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <asm/hypervisor.h>
+#include "common.h"
+#include <xen/balloon.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/gfp.h>
+#include <linux/poll.h>
+#include <asm/tlbflush.h>
+#include <linux/devfs_fs_kernel.h>
+
+#define MAX_TAP_DEV 100 /*the maximum number of tapdisk ring devices */
+#define MAX_DEV_NAME 100 /*the max tapdisk ring device name e.g. blktap0 */
+
+/*
+ * The maximum number of requests that can be outstanding at any time
+ * is determined by
+ *
+ * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
+ *
+ * where mmap_alloc < MAX_DYNAMIC_MEM.
+ *
+ * TODO:
+ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
+ * sysfs.
*/
+#define MAX_DYNAMIC_MEM 64
+#define MAX_PENDING_REQS 64
+#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_start, _req,_seg) \
+ (_start + \
+ ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
+ ((_seg) * PAGE_SIZE))
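+/*
+ * Layout sketch: each request owns BLKIF_MAX_SEGMENTS_PER_REQUEST
+ * consecutive pages, so with e.g. 11 segments per request,
+ * MMAP_VADDR(start, 2, 3) = start + (2*11 + 3) * PAGE_SIZE.
+ */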
+static int blkif_reqs = MAX_PENDING_REQS;
+static int mmap_pages = MMAP_PAGES;
+
+#define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
+ * have a bunch of pages reserved for shared
+ * memory rings.
+ */
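+/*
+ * Per-device VMA layout (set up in blktap_mmap below):
+ *
+ *	rings_vstart = vma->vm_start	(RING_PAGES of shared ring)
+ *	user_vstart  = rings_vstart + (RING_PAGES << PAGE_SHIFT)
+ *					(mmap_pages of request data)
+ */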
+
+/*Data struct associated with each of the tapdisk devices*/
+typedef struct tap_blkif {
+ struct vm_area_struct *vma; /*Shared memory area */
+ unsigned long rings_vstart; /*Kernel memory mapping */
+ unsigned long user_vstart; /*User memory mapping */
+ unsigned long dev_inuse; /*One process opens device at a time. */
+ unsigned long dev_pending; /*In process of being opened */
+ unsigned long ring_ok; /*make this ring->state */
+ blkif_front_ring_t ufe_ring; /*Rings up to user space. */
+ wait_queue_head_t wait; /*for poll */
+ unsigned long mode; /*current switching mode */
+ int minor; /*Minor number for tapdisk device */
+ pid_t pid; /*tapdisk process id */
+ enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
+ shutdown */
+ unsigned long *idx_map; /*Record the user ring id to kern
+ [req id, idx] tuple */
+ blkif_t *blkif; /*Associate blkif with tapdev */
+} tap_blkif_t;
+
+/*Private data struct associated with the inode*/
+typedef struct private_info {
+ int idx;
+} private_info_t;
+
+/*Data struct handed back to userspace for tapdisk device to VBD mapping*/
+typedef struct domid_translate {
+ unsigned short domid;
+ unsigned short busid;
+} domid_translate_t ;
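+/* Filled in via BLKTAP_IOCTL_NEWINTF and consumed by dom_to_devid(). */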
-#include "blktap.h"
-int __init xlblktap_init(void)
+domid_translate_t translate_domid[MAX_TAP_DEV];
+tap_blkif_t *tapfds[MAX_TAP_DEV];
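+/* tapfds[i] backs minor i; every entry is allocated at module init. */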
+
+static int __init set_blkif_reqs(char *str)
{
- ctrl_msg_t cmsg;
- blkif_fe_driver_status_t fe_st;
- blkif_be_driver_status_t be_st;
+ get_option(&str, &blkif_reqs);
+ return 1;
+}
+__setup("blkif_reqs=", set_blkif_reqs);
+
+/* Run-time switchable: /sys/module/blktap/parameters/ */
+static int log_stats = 0;
+static int debug_lvl = 0;
+module_param(log_stats, int, 0644);
+module_param(debug_lvl, int, 0644);
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct {
+ blkif_t *blkif;
+ unsigned long id;
+ unsigned short mem_idx;
+ int nr_pages;
+ atomic_t pendcnt;
+ unsigned short operation;
+ int status;
+ struct list_head free_list;
+ int inuse;
+} pending_req_t;
+
+static pending_req_t *pending_reqs[MAX_DYNAMIC_MEM];
+static struct list_head pending_free;
+static DEFINE_SPINLOCK(pending_free_lock);
+static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
+static int alloc_pending_reqs;
+
+typedef unsigned int PEND_RING_IDX;
+
+static inline int MASK_PEND_IDX(int i)
+{
+	return i & (MAX_PENDING_REQS - 1);
+}
+
+static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx)
+{
+	return req - pending_reqs[idx];
+}
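+/* N.B. MASK_PEND_IDX relies on MAX_PENDING_REQS being a power of two. */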
+
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
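+/* (pending_prod/pending_cons are not defined in this file; the macro
+ * above appears to be an unused holdover from blkback.) */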
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+typedef struct mmap_page {
+ unsigned long start;
+ struct page *mpage;
+} mmap_page_t;
+
+static mmap_page_t mmap_start[MAX_DYNAMIC_MEM];
+static unsigned short mmap_alloc = 0;
+static unsigned short mmap_lock = 0;
+static unsigned short mmap_inuse = 0;
+static unsigned long *pending_addrs[MAX_DYNAMIC_MEM];
+
+/******************************************************************
+ * GRANT HANDLES
+ */
+
+/* When using grant tables to map a frame for device access then the
+ * handle returned must be used to unmap the frame. This is needed to
+ * drop the ref count on the frame.
+ */
+struct grant_handle_pair
+{
+ grant_handle_t kernel;
+ grant_handle_t user;
+};
+
+static struct grant_handle_pair
+ pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
+#define pending_handle(_id, _idx, _i) \
+ (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
+ + (_i)])
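+/*
+ * pending_handle(m, idx, seg) selects the kernel/user handle pair for
+ * segment 'seg' of pending request 'idx' in mapping area 'm', i.e. entry
+ * idx * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg of the per-area array.
+ */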
- printk(KERN_INFO "Initialising Xen block tap device\n");
- DPRINTK(" tap - Backend connection init:\n");
+static int blktap_read_ufe_ring(int idx); /*local prototypes*/
+#define BLKTAP_MINOR 0 /*/dev/xen/blktap resides at device number
+ major=254, minor numbers begin at 0 */
+#define BLKTAP_DEV_MAJOR 254 /* TODO: Make major number dynamic *
+ * and create devices in the kernel *
+ */
+#define BLKTAP_DEV_DIR "/dev/xen"
- (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
- CALLBACK_IN_BLOCKING_CONTEXT);
+/* blktap IOCTLs: */
+#define BLKTAP_IOCTL_KICK_FE 1
+#define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */
+#define BLKTAP_IOCTL_SETMODE 3
+#define BLKTAP_IOCTL_SENDPID 4
+#define BLKTAP_IOCTL_NEWINTF 5
+#define BLKTAP_IOCTL_MINOR 6
+#define BLKTAP_IOCTL_MAJOR 7
+#define BLKTAP_QUERY_ALLOC_REQS 8
+#define BLKTAP_IOCTL_FREEINTF 9
+#define BLKTAP_IOCTL_PRINT_IDXS 100
- /* Send a driver-UP notification to the domain controller. */
- cmsg.type = CMSG_BLKIF_FE;
- cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS;
- cmsg.length = sizeof(blkif_fe_driver_status_t);
- fe_st.status = BLKIF_DRIVER_STATUS_UP;
- memcpy(cmsg.msg, &fe_st, sizeof(fe_st));
- ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */
+#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */
+#define BLKTAP_MODE_INTERCEPT_FE 0x00000001
+#define BLKTAP_MODE_INTERCEPT_BE 0x00000002 /* unimp. */
+
+#define BLKTAP_MODE_INTERPOSE \
+ (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
+
+
+static inline int BLKTAP_MODE_VALID(unsigned long arg)
+{
+ return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
+ (arg == BLKTAP_MODE_INTERCEPT_FE) ||
+ (arg == BLKTAP_MODE_INTERPOSE ));
+}
+
+/* Requests passing through the tap to userspace are re-assigned an ID.
+ * We must record a mapping between the BE [IDX,ID] tuple and the userspace
+ * ring ID.
+ */
+
+static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
+{
+ return ((fe_dom << 16) | MASK_PEND_IDX(idx));
+}
+
+static inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
+{
+	return (PEND_RING_IDX)(id & 0x0000ffff);
+}
+
+static inline int ID_TO_MIDX(unsigned long id)
+{
+	return (int)(id >> 16);
+}
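+/*
+ * Round trip: for id = MAKE_ID(m, i), ID_TO_MIDX(id) == m and
+ * ID_TO_IDX(id) == MASK_PEND_IDX(i); each half is 16 bits wide.
+ */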
+
+#define INVALID_REQ 0xdead0000
+
+/*TODO: Convert to a free list*/
+static inline int GET_NEXT_REQ(unsigned long *idx_map)
+{
+	int i;
+
+	for (i = 0; i < MAX_PENDING_REQS; i++)
+		if (idx_map[i] == INVALID_REQ)
+			return i;
+
+	return INVALID_REQ;
+}
+
+
+#define BLKTAP_INVALID_HANDLE(_g) \
+ (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF))
+
+#define BLKTAP_INVALIDATE_HANDLE(_g) do { \
+ (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \
+ } while(0)
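+/* 0xFFFF serves as the 'no grant handle' sentinel for both halves. */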
+
+
+/******************************************************************
+ * BLKTAP VM OPS
+ */
+
+static struct page *blktap_nopage(struct vm_area_struct *vma,
+ unsigned long address,
+ int *type)
+{
+ /*
+ * if the page has not been mapped in by the driver then return
+ * NOPAGE_SIGBUS to the domain.
+ */
+
+ return NOPAGE_SIGBUS;
+}
+
+struct vm_operations_struct blktap_vm_ops = {
+	.nopage = blktap_nopage,
+};
+
+/******************************************************************
+ * BLKTAP FILE OPS
+ */
+
+/*Function Declarations*/
+static int get_next_free_dev(void);
+static int blktap_open(struct inode *inode, struct file *filp);
+static int blktap_release(struct inode *inode, struct file *filp);
+static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
+static int blktap_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg);
+static unsigned int blktap_poll(struct file *file, poll_table *wait);
- DPRINTK(" tap - Frontend connection init:\n");
+
+static struct file_operations blktap_fops = {
+ .owner = THIS_MODULE,
+ .poll = blktap_poll,
+ .ioctl = blktap_ioctl,
+ .open = blktap_open,
+ .release = blktap_release,
+ .mmap = blktap_mmap,
+};
+
+
+static int get_next_free_dev(void)
+{
+ tap_blkif_t *info;
+ int i = 0, ret = -1;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pending_free_lock, flags);
+
+ while (i < MAX_TAP_DEV) {
+ info = tapfds[i];
+ if ( (tapfds[i] != NULL) && (info->dev_inuse == 0)
+ && (info->dev_pending == 0) ) {
+ info->dev_pending = 1;
+ ret = i;
+ goto done;
+ }
+ i++;
+ }
+
+done:
+ spin_unlock_irqrestore(&pending_free_lock, flags);
+ return ret;
+}
+
+int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif)
+{
+ int i;
+
+ for (i = 0; i < MAX_TAP_DEV; i++)
+ if ( (translate_domid[i].domid == domid)
+ && (translate_domid[i].busid == xenbus_id) ) {
+ tapfds[i]->blkif = blkif;
+ tapfds[i]->status = RUNNING;
+ return i;
+ }
+ return -1;
+}
+
+void signal_tapdisk(int idx)
+{
+	tap_blkif_t *info;
+	struct task_struct *ptask;
+
+	if ((idx < 0) || (idx >= MAX_TAP_DEV))
+		return;
+
+	info = tapfds[idx];
+	if (info == NULL)
+		return;
+
+	if ((idx > 0) && (info->pid > 0)) {
+		ptask = find_task_by_pid(info->pid);
+		if (ptask)
+			info->status = CLEANSHUTDOWN;
+	}
+	info->blkif = NULL;
+}
+
+static int blktap_open(struct inode *inode, struct file *filp)
+{
+ blkif_sring_t *sring;
+ int idx = iminor(inode) - BLKTAP_MINOR;
+ tap_blkif_t *info;
+ private_info_t *prv;
+ int i;
+
+	if ((idx < 0) || (idx >= MAX_TAP_DEV) || (tapfds[idx] == NULL)) {
+		WPRINTK("Unable to open device /dev/xen/blktap%d\n",
+			idx);
+		return -ENODEV;
+	}
+ DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
+
+ info = tapfds[idx];
+
+ /*Only one process can access device at a time*/
+ if (test_and_set_bit(0, &info->dev_inuse))
+ return -EBUSY;
+
+ info->dev_pending = 0;
+
+ /* Allocate the fe ring. */
+ sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
+ if (sring == NULL)
+ goto fail_nomem;
+
+ SetPageReserved(virt_to_page(sring));
+
+ SHARED_RING_INIT(sring);
+ FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
+
+	prv = kzalloc(sizeof(private_info_t), GFP_KERNEL);
+	info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS,
+				GFP_KERNEL);
+	if ((prv == NULL) || (info->idx_map == NULL)) {
+		kfree(prv);
+		kfree(info->idx_map);
+		info->idx_map = NULL;
+		ClearPageReserved(virt_to_page(sring));
+		free_page((unsigned long)sring);
+		goto fail_nomem;
+	}
+	prv->idx = idx;
+	filp->private_data = prv;
+	info->vma = NULL;
+
+ if (idx > 0) {
+ init_waitqueue_head(&info->wait);
+ for (i = 0; i < MAX_PENDING_REQS; i++)
+ info->idx_map[i] = INVALID_REQ;
+ }
+
+ DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
+ return 0;
+
+ fail_nomem:
+	info->dev_inuse = 0;
+	return -ENOMEM;
+}
+
+static int blktap_release(struct inode *inode, struct file *filp)
+{
+ int idx = iminor(inode) - BLKTAP_MINOR;
+ tap_blkif_t *info;
+
+	if ((idx < 0) || (idx >= MAX_TAP_DEV) || (tapfds[idx] == NULL)) {
+		WPRINTK("Trying to free device that doesn't exist "
+			"[/dev/xen/blktap%d]\n",idx);
+		return -EBADF;
+	}
+ info = tapfds[idx];
+ info->dev_inuse = 0;
+ DPRINTK("Freeing device [/dev/xen/blktap%d]\n",idx);
+
+ /* Free the ring page. */
+ ClearPageReserved(virt_to_page(info->ufe_ring.sring));
+ free_page((unsigned long) info->ufe_ring.sring);
+
+ /* Clear any active mappings and free foreign map table */
+ if (info->vma) {
+ zap_page_range(
+ info->vma, info->vma->vm_start,
+ info->vma->vm_end - info->vma->vm_start, NULL);
+ info->vma = NULL;
+ }
+
+	kfree(filp->private_data);
+	filp->private_data = NULL;
+
+	kfree(info->idx_map);
+	info->idx_map = NULL;
+
+ if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
+ kthread_stop(info->blkif->xenblkd);
+ info->blkif->xenblkd = NULL;
+ info->status = CLEANSHUTDOWN;
+ }
+ return 0;
+}
+
+
+/* Note on mmap:
+ * We need to map pages to user space in a way that will allow the block
+ * subsystem to set up direct IO to them. This couldn't be done before,
+ * because there isn't really a sane way to translate a user virtual address
+ * down to a physical address when the page belongs to another domain.
+ *
+ * My first approach was to map the page in to kernel memory, add an entry
+ * for it in the physical frame list (using alloc_lomem_region as in blkback)
+ * and then attempt to map that page up to user space. This is disallowed
+ * by Xen though, which realizes that we don't really own the machine frame
+ * underlying the physical page.
+ *
+ * The new approach is to provide explicit support for this in XenLinux.
+ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
+ * mapped from other VMs. vma->vm_private_data is set up as a mapping
+ * from pages to actual page structs. There is a new clause in get_user_pages
+ * that does the right thing for this sort of mapping.
+ */
+static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ int size;
+ struct page **map;
+ int i;
+ private_info_t *prv;
+ tap_blkif_t *info;
+
+ /*Retrieve the dev info*/
+ prv = (private_info_t *)filp->private_data;
+ if (prv == NULL) {
+ WPRINTK("blktap: mmap, retrieving idx failed\n");
+ return -ENOMEM;
+ }
+ info = tapfds[prv->idx];
+
+ vma->vm_flags |= VM_RESERVED;
+ vma->vm_ops = &blktap_vm_ops;
+
+ size = vma->vm_end - vma->vm_start;
+ if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
+ WPRINTK("you _must_ map exactly %d pages!\n",
+ mmap_pages + RING_PAGES);
+ return -EAGAIN;
+ }
+
+ size >>= PAGE_SHIFT;
+ info->rings_vstart = vma->vm_start;
+ info->user_vstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
- active_reqs_init();
- blkif_interface_init();
- blkdev_schedule_init();
+ /* Map the ring pages to the start of the region and reserve it. */
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ if (remap_pfn_range(vma, vma->vm_start,
+ __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
+ PAGE_SIZE, vma->vm_page_prot)) {
+ WPRINTK("Mapping user ring failed!\n");
+ goto fail;
+ }
+
+ /* Mark this VM as containing foreign pages, and set up mappings. */
+	map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
+		      * sizeof(struct page *),
+		      GFP_KERNEL);
+ if (map == NULL) {
+ WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
+ goto fail;
+ }
+
+ for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
+ map[i] = NULL;
- (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx,
- CALLBACK_IN_BLOCKING_CONTEXT);
+ vma->vm_private_data = map;
+ vma->vm_flags |= VM_FOREIGN;
+
+ info->vma = vma;
+ info->ring_ok = 1;
+ return 0;
+ fail:
+ /* Clear any active mappings. */
+ zap_page_range(vma, vma->vm_start,
+ vma->vm_end - vma->vm_start, NULL);
- /* Send a driver-UP notification to the domain controller. */
- cmsg.type = CMSG_BLKIF_BE;
- cmsg.subtype = CMSG_BLKIF_BE_DRIVER_STATUS;
- cmsg.length = sizeof(blkif_be_driver_status_t);
- be_st.status = BLKIF_DRIVER_STATUS_UP;
- memcpy(cmsg.msg, &be_st, sizeof(be_st));
- ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+ return -ENOMEM;
+}
- DPRINTK(" tap - Userland channel init:\n");
- blktap_init();
+static int blktap_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+	int idx = iminor(inode) - BLKTAP_MINOR;
+
+	if ((idx < 0) || (idx >= MAX_TAP_DEV))
+		return -EINVAL;
+
+	switch(cmd) {
+ case BLKTAP_IOCTL_KICK_FE:
+ {
+ /* There are fe messages to process. */
+ return blktap_read_ufe_ring(idx);
+ }
+ case BLKTAP_IOCTL_SETMODE:
+ {
+ tap_blkif_t *info = tapfds[idx];
+
+ if ( (idx > 0) && (idx < MAX_TAP_DEV)
+ && (tapfds[idx] != NULL) )
+ {
+ if (BLKTAP_MODE_VALID(arg)) {
+ info->mode = arg;
+ /* XXX: may need to flush rings here. */
+ DPRINTK("blktap: set mode to %lx\n",
+ arg);
+ return 0;
+ }
+ }
+ return 0;
+ }
+ case BLKTAP_IOCTL_PRINT_IDXS:
+ {
+ tap_blkif_t *info = tapfds[idx];
+
+ if ( (idx > 0) && (idx < MAX_TAP_DEV)
+ && (tapfds[idx] != NULL) )
+ {
+ printk("User Rings: \n-----------\n");
+ printk("UF: rsp_cons: %2d, req_prod_prv: %2d "
+ "| req_prod: %2d, rsp_prod: %2d\n",
+ info->ufe_ring.rsp_cons,
+ info->ufe_ring.req_prod_pvt,
+ info->ufe_ring.sring->req_prod,
+ info->ufe_ring.sring->rsp_prod);
+ }
+ return 0;
+ }
+ case BLKTAP_IOCTL_SENDPID:
+ {
+ tap_blkif_t *info = tapfds[idx];
+
+ if ( (idx > 0) && (idx < MAX_TAP_DEV)
+ && (tapfds[idx] != NULL) )
+ {
+ info->pid = (pid_t)arg;
+ DPRINTK("blktap: pid received %d\n",
+ info->pid);
+ }
+ return 0;
+ }
+ case BLKTAP_IOCTL_NEWINTF:
+ {
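+		/* The ioctl argument packs a domid_translate_t by value. */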
+ uint64_t val = (uint64_t)arg;
+ domid_translate_t *tr = (domid_translate_t *)&val;
+ int newdev;
- DPRINTK("Blkif tap device initialized.\n");
+ DPRINTK("NEWINTF Req for domid %d and bus id %d\n",
+ tr->domid, tr->busid);
+ newdev = get_next_free_dev();
+ if (newdev < 1) {
+ WPRINTK("Error initialising /dev/xen/blktap - "
+ "No more devices\n");
+ return -1;
+ }
+ translate_domid[newdev].domid = tr->domid;
+ translate_domid[newdev].busid = tr->busid;
+ return newdev;
+ }
+ case BLKTAP_IOCTL_FREEINTF:
+ {
+ unsigned long dev = arg;
+ tap_blkif_t *info = NULL;
- return 0;
+ if ( (dev > 0) && (dev < MAX_TAP_DEV) ) info = tapfds[dev];
+
+ if ( (info != NULL) && (info->dev_pending) )
+ info->dev_pending = 0;
+ return 0;
+ }
+ case BLKTAP_IOCTL_MINOR:
+ {
+ unsigned long dev = arg;
+ tap_blkif_t *info = NULL;
+
+ if ( (dev > 0) && (dev < MAX_TAP_DEV) ) info = tapfds[dev];
+
+ if (info != NULL) return info->minor;
+ else return -1;
+ }
+ case BLKTAP_IOCTL_MAJOR:
+ return BLKTAP_DEV_MAJOR;
+
+ case BLKTAP_QUERY_ALLOC_REQS:
+ {
+ WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n",
+ alloc_pending_reqs, blkif_reqs);
+		return (alloc_pending_reqs * 100) / blkif_reqs;
+ }
+ }
+ return -ENOIOCTLCMD;
}
-#if 0 /* tap doesn't handle suspend/resume */
-void blkdev_suspend(void)
+static unsigned int blktap_poll(struct file *file, poll_table *wait)
{
+ private_info_t *prv;
+ tap_blkif_t *info;
+
+ /*Retrieve the dev info*/
+ prv = (private_info_t *)file->private_data;
+ if (prv == NULL) {
+ WPRINTK(" poll, retrieving idx failed\n");
+ return 0;
+ }
+
+ if (prv->idx == 0) return 0;
+
+ info = tapfds[prv->idx];
+
+ poll_wait(file, &info->wait, wait);
+ if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
+ flush_tlb_all();
+ RING_PUSH_REQUESTS(&info->ufe_ring);
+ return POLLIN | POLLRDNORM;
+ }
+ return 0;
}
-void blkdev_resume(void)
+void blktap_kick_user(int idx)
{
- ctrl_msg_t cmsg;
- blkif_fe_driver_status_t st;
+ tap_blkif_t *info;
- /* Send a driver-UP notification to the domain controller. */
- cmsg.type = CMSG_BLKIF_FE;
- cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS;
- cmsg.length = sizeof(blkif_fe_driver_status_t);
- st.status = BLKIF_DRIVER_STATUS_UP;
- memcpy(cmsg.msg, &st, sizeof(st));
- ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+ if (idx == 0) return;
+
+ info = tapfds[idx];
+
+ if (info != NULL) wake_up_interruptible(&info->wait);
+ return;
}
+
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+ blkif_request_t *req,
+ pending_req_t *pending_req);
+static void make_response(blkif_t *blkif, unsigned long id,
+ unsigned short op, int st);
+
+/******************************************************************
+ * misc small helpers
+ */
+static int req_increase(void)
+{
+ int i, j;
+ struct page *page;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&pending_free_lock, flags);
+
+ ret = -EINVAL;
+	if (mmap_alloc >= MAX_DYNAMIC_MEM || mmap_lock)
+ goto done;
+
+#ifdef __ia64__
+ extern unsigned long alloc_empty_foreign_map_page_range(
+ unsigned long pages);
+ mmap_start[mmap_alloc].start = (unsigned long)
+ alloc_empty_foreign_map_page_range(mmap_pages);
+#else /* ! ia64 */
+ page = balloon_alloc_empty_page_range(mmap_pages);
+ ret = -ENOMEM;
+	if (page == NULL) {
+		WPRINTK("%s: balloon_alloc_empty_page_range gave NULL\n",
+			__FUNCTION__);
+		goto done;
+	}
+
+ /* Pin all of the pages. */
+ for (i=0; i<mmap_pages; i++)
+ get_page(&page[i]);
+
+ mmap_start[mmap_alloc].start =
+ (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+ mmap_start[mmap_alloc].mpage = page;
+
#endif
-__initcall(xlblktap_init);
+	pending_reqs[mmap_alloc] = kzalloc(sizeof(pending_req_t) *
+					   MAX_PENDING_REQS, GFP_KERNEL);
+	pending_addrs[mmap_alloc] = kzalloc(sizeof(unsigned long) *
+					    mmap_pages, GFP_KERNEL);
+
+	ret = -ENOMEM;
+	if (!pending_reqs[mmap_alloc] || !pending_addrs[mmap_alloc]) {
+		kfree(pending_reqs[mmap_alloc]);
+		kfree(pending_addrs[mmap_alloc]);
+		pending_reqs[mmap_alloc] = NULL;
+		pending_addrs[mmap_alloc] = NULL;
+		WPRINTK("%s: out of memory\n", __FUNCTION__);
+		goto done;
+	}
+
+ ret = 0;
+
+ DPRINTK("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n",
+ __FUNCTION__, blkif_reqs, mmap_pages,
+ mmap_start[mmap_alloc].start);
+
+ BUG_ON(mmap_start[mmap_alloc].start == 0);
+
+ for (i = 0; i < mmap_pages; i++)
+ pending_addrs[mmap_alloc][i] =
+ mmap_start[mmap_alloc].start + (i << PAGE_SHIFT);
+
+ for (i = 0; i < MAX_PENDING_REQS ; i++) {
+ list_add_tail(&pending_reqs[mmap_alloc][i].free_list,
+ &pending_free);
+ pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
+ for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
+ BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc,
+ i, j));
+ }
+
+ mmap_alloc++;
+ DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
+ done:
+ spin_unlock_irqrestore(&pending_free_lock, flags);
+ return ret;
+}
+
+static void mmap_req_del(int mmap)
+{
+ int i;
+ struct page *page;
+
+	/*Spinlock already acquired*/
+	kfree(pending_reqs[mmap]);
+	kfree(pending_addrs[mmap]);
+	pending_reqs[mmap] = NULL;
+	pending_addrs[mmap] = NULL;
+
+#ifdef __ia64__
+ /*Not sure what goes here yet!*/
+#else
+
+ /* Unpin all of the pages. */
+ page = mmap_start[mmap].mpage;
+ for (i=0; i<mmap_pages; i++)
+ put_page(&page[i]);
+
+ balloon_dealloc_empty_page_range(mmap_start[mmap].mpage, mmap_pages);
+#endif
+
+	mmap_lock = 0;
+	mmap_alloc--;
+	DPRINTK("# MMAPs decreased to %d\n", mmap_alloc);
+}
+
+/*N.B. Currently unused - will be accessed via sysfs*/
+static void req_decrease(void)
+{
+ pending_req_t *req;
+ int i;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pending_free_lock, flags);
+
+ DPRINTK("Req decrease called.\n");
+ if (mmap_lock || mmap_alloc == 1)
+ goto done;
+
+ mmap_lock = 1;
+ mmap_inuse = MAX_PENDING_REQS;
+
+ /*Go through reqs and remove any that aren't in use*/
+ for (i = 0; i < MAX_PENDING_REQS ; i++) {
+ req = &pending_reqs[mmap_alloc-1][i];
+ if (req->inuse == 0) {
+ list_del(&req->free_list);
+ mmap_inuse--;
+ }
+ }
+ if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
+ done:
+ spin_unlock_irqrestore(&pending_free_lock, flags);
+ return;
+}
+
+static pending_req_t* alloc_req(void)
+{
+ pending_req_t *req = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pending_free_lock, flags);
+
+ if (!list_empty(&pending_free)) {
+ req = list_entry(pending_free.next, pending_req_t, free_list);
+ list_del(&req->free_list);
+ }
+
+ if (req) {
+ req->inuse = 1;
+ alloc_pending_reqs++;
+ }
+ spin_unlock_irqrestore(&pending_free_lock, flags);
+
+ return req;
+}
+
+static void free_req(pending_req_t *req)
+{
+ unsigned long flags;
+ int was_empty;
+
+ spin_lock_irqsave(&pending_free_lock, flags);
+
+ alloc_pending_reqs--;
+ req->inuse = 0;
+ if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
+ mmap_inuse--;
+ if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
+ spin_unlock_irqrestore(&pending_free_lock, flags);
+ return;
+ }
+ was_empty = list_empty(&pending_free);
+ list_add(&req->free_list, &pending_free);
+
+ spin_unlock_irqrestore(&pending_free_lock, flags);
+
+ if (was_empty)
+ wake_up(&pending_free_wq);
+}
+
+static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx, int
+ tapidx)
+{
+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+ unsigned int i, invcount = 0;
+ struct grant_handle_pair *khandle;
+ uint64_t ptep;
+ int ret, mmap_idx;
+ unsigned long kvaddr, uvaddr;
+
+ tap_blkif_t *info = tapfds[tapidx];
+
+ if (info == NULL) {
+ WPRINTK("fast_flush: Couldn't get info!\n");
+ return;
+ }
+ mmap_idx = req->mem_idx;
+
+ for (i = 0; i < req->nr_pages; i++) {
+ kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, k_idx, i);
+ uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
+
+ khandle = &pending_handle(mmap_idx, k_idx, i);
+ if (BLKTAP_INVALID_HANDLE(khandle)) {
+ WPRINTK("BLKTAP_INVALID_HANDLE\n");
+ continue;
+ }
+ gnttab_set_unmap_op(&unmap[invcount],
+ MMAP_VADDR(mmap_start[mmap_idx].start, k_idx, i),
+ GNTMAP_host_map, khandle->kernel);
+ invcount++;
+
+ if (create_lookup_pte_addr(
+ info->vma->vm_mm,
+ MMAP_VADDR(info->user_vstart, u_idx, i),
+ &ptep) !=0) {
+ WPRINTK("Couldn't get a pte addr!\n");
+ return;
+ }
+
+ gnttab_set_unmap_op(&unmap[invcount],
+ ptep, GNTMAP_host_map,
+ khandle->user);
+ invcount++;
+
+ BLKTAP_INVALIDATE_HANDLE(khandle);
+ }
+ ret = HYPERVISOR_grant_table_op(
+ GNTTABOP_unmap_grant_ref, unmap, invcount);
+ BUG_ON(ret);
+
+ if (info->vma != NULL)
+ zap_page_range(info->vma,
+ MMAP_VADDR(info->user_vstart, u_idx, 0),
+ req->nr_pages << PAGE_SHIFT, NULL);
+}
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+static void print_stats(blkif_t *blkif)
+{
+ printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d\n",
+ current->comm, blkif->st_oo_req,
+ blkif->st_rd_req, blkif->st_wr_req);
+ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+ blkif->st_rd_req = 0;
+ blkif->st_wr_req = 0;
+ blkif->st_oo_req = 0;
+}
+
+int tap_blkif_schedule(void *arg)
+{
+ blkif_t *blkif = arg;
+
+ blkif_get(blkif);
+
+ if (debug_lvl)
+ printk(KERN_DEBUG "%s: started\n", current->comm);
+
+ while (!kthread_should_stop()) {
+ wait_event_interruptible(
+ blkif->wq,
+ blkif->waiting_reqs || kthread_should_stop());
+ wait_event_interruptible(
+ pending_free_wq,
+ !list_empty(&pending_free) || kthread_should_stop());
+
+ blkif->waiting_reqs = 0;
+ smp_mb(); /* clear flag *before* checking for work */
+
+ if (do_block_io_op(blkif))
+ blkif->waiting_reqs = 1;
+
+ if (log_stats && time_after(jiffies, blkif->st_print))
+ print_stats(blkif);
+ }
+
+ if (log_stats)
+ print_stats(blkif);
+ if (debug_lvl)
+ printk(KERN_DEBUG "%s: exiting\n", current->comm);
+
+ blkif->xenblkd = NULL;
+ blkif_put(blkif);
+
+ return 0;
+}
+
+/******************************************************************
+ * COMPLETION CALLBACK -- Called by user level ioctl()
+ */
+
+static int blktap_read_ufe_ring(int idx)
+{
+ /* This is called to read responses from the UFE ring. */
+ RING_IDX i, j, rp;
+ blkif_response_t *resp;
+ blkif_t *blkif=NULL;
+ int pending_idx, usr_idx, mmap_idx;
+ pending_req_t *pending_req;
+ tap_blkif_t *info;
+
+ info = tapfds[idx];
+ if (info == NULL) {
+ return 0;
+ }
+
+ /* We currently only forward packets in INTERCEPT_FE mode. */
+ if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
+ return 0;
+
+ /* for each outstanding message on the UFEring */
+ rp = info->ufe_ring.sring->rsp_prod;
+ rmb();
+
+ for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
+ resp = RING_GET_RESPONSE(&info->ufe_ring, i);
+ ++info->ufe_ring.rsp_cons;
+
+		/*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
+		usr_idx = (int)resp->id;
+		if ((usr_idx < 0) || (usr_idx >= MAX_PENDING_REQS)) {
+			WPRINTK("Incorrect response id [%d]!\n", usr_idx);
+			continue;
+		}
+		pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
+		mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
+
+		if ((mmap_idx >= mmap_alloc) ||
+		    (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS)) {
+			WPRINTK("Incorrect req map [%d], "
+				"internal map [%d,%d (%d)]\n",
+				usr_idx, mmap_idx,
+				ID_TO_IDX(info->idx_map[usr_idx]),
+				MASK_PEND_IDX(
+					ID_TO_IDX(info->idx_map[usr_idx])));
+			continue;
+		}
+
+ pending_req = &pending_reqs[mmap_idx][pending_idx];
+ blkif = pending_req->blkif;
+
+ for (j = 0; j < pending_req->nr_pages; j++) {
+
+ unsigned long kvaddr, uvaddr;
+ struct page **map = info->vma->vm_private_data;
+ struct page *pg;
+ int offset;
+
+ uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
+ kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start,
+ pending_idx, j);
+
+ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+ ClearPageReserved(pg);
+ offset = (uvaddr - info->vma->vm_start)
+ >> PAGE_SHIFT;
+ map[offset] = NULL;
+ }
+ fast_flush_area(pending_req, pending_idx, usr_idx, idx);
+ make_response(blkif, pending_req->id, resp->operation,
+ resp->status);
+ info->idx_map[usr_idx] = INVALID_REQ;
+ blkif_put(pending_req->blkif);
+ free_req(pending_req);
+ }
+
+ return 0;
+}
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+static void blkif_notify_work(blkif_t *blkif)
+{
+ blkif->waiting_reqs = 1;
+ wake_up(&blkif->wq);
+}
+
+irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+{
+ blkif_notify_work(dev_id);
+ return IRQ_HANDLED;
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+static int print_dbug = 1;
+static int do_block_io_op(blkif_t *blkif)
+{
+ blkif_back_ring_t *blk_ring = &blkif->blk_ring;
+ blkif_request_t *req;
+ pending_req_t *pending_req;
+ RING_IDX rc, rp;
+ int more_to_do = 0;
+ tap_blkif_t *info;
+
+ rc = blk_ring->req_cons;
+ rp = blk_ring->sring->req_prod;
+ rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+ /*Check blkif has corresponding UE ring*/
+ if (blkif->dev_num == -1) {
+ /*oops*/
+ if (print_dbug) {
+ WPRINTK("Corresponding UE "
+ "ring does not exist!\n");
+ print_dbug = 0; /*We only print this message once*/
+ }
+ return 1;
+ }
+
+ info = tapfds[blkif->dev_num];
+ if (info == NULL || !info->dev_inuse) {
+ if (print_dbug) {
+ WPRINTK("Can't get UE info!\n");
+ print_dbug = 0;
+ }
+ return 1;
+ }
+
+ while (rc != rp) {
+
+ if (RING_FULL(&info->ufe_ring)) {
+ WPRINTK("RING_FULL! More to do\n");
+ more_to_do = 1;
+ break;
+ }
+
+ if (RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
+ WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
+ " More to do\n");
+ more_to_do = 1;
+ break;
+ }
+
+ pending_req = alloc_req();
+ if (NULL == pending_req) {
+ blkif->st_oo_req++;
+ more_to_do = 1;
+ break;
+ }
+
+ req = RING_GET_REQUEST(blk_ring, rc);
+ blk_ring->req_cons = ++rc; /* before make_response() */
+
+ switch (req->operation) {
+ case BLKIF_OP_READ:
+ blkif->st_rd_req++;
+ dispatch_rw_block_io(blkif, req, pending_req);
+ break;
+
+ case BLKIF_OP_WRITE:
+ blkif->st_wr_req++;
+ dispatch_rw_block_io(blkif, req, pending_req);
+ break;
+
+ default:
+ WPRINTK("unknown operation [%d]\n",
+ req->operation);
+ make_response(blkif, req->id, req->operation,
+ BLKIF_RSP_ERROR);
+ free_req(pending_req);
+ break;
+ }
+ }
+
+ blktap_kick_user(blkif->dev_num);
+
+ return more_to_do;
+}
+
+static void dispatch_rw_block_io(blkif_t *blkif,
+ blkif_request_t *req,
+ pending_req_t *pending_req)
+{
+	int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
+ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+ unsigned int nseg;
+ int ret, i;
+ tap_blkif_t *info = tapfds[blkif->dev_num];
+ uint64_t sector;
+
+ blkif_request_t *target;
+ int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
+ int usr_idx = GET_NEXT_REQ(info->idx_map);
+ uint16_t mmap_idx = pending_req->mem_idx;
+
+	/*Check we have space on user ring - should never fail*/
+	if (usr_idx == INVALID_REQ)
+		goto fail_response;
+
+ /* Check that number of segments is sane. */
+ nseg = req->nr_segments;
+ if ( unlikely(nseg == 0) ||
+ unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
+ WPRINTK("Bad number of segments in request (%d)\n", nseg);
+ goto fail_response;
+ }
+
+ /* Make sure userspace is ready. */
+ if (!info->ring_ok) {
+ WPRINTK("blktap: ring not ready for requests!\n");
+ goto fail_response;
+ }
+
+ if (RING_FULL(&info->ufe_ring)) {
+ WPRINTK("blktap: fe_ring is full, can't add "
+ "IO Request will be dropped. %d %d\n",
+ RING_SIZE(&info->ufe_ring),
+ RING_SIZE(&blkif->blk_ring));
+ goto fail_response;
+ }
+
+ pending_req->blkif = blkif;
+ pending_req->id = req->id;
+ pending_req->operation = operation;
+ pending_req->status = BLKIF_RSP_OKAY;
+ pending_req->nr_pages = nseg;
+ op = 0;
+ for (i = 0; i < nseg; i++) {
+ unsigned long uvaddr;
+ unsigned long kvaddr;
+ uint64_t ptep;
+ struct page *page;
+ uint32_t flags;
+
+ uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
+ kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start,
+ pending_idx, i);
+ page = virt_to_page(kvaddr);
+
+ sector = req->sector_number + (8*i);
+ if( (blkif->sectors > 0) && (sector >= blkif->sectors) ) {
+ WPRINTK("BLKTAP: Sector request greater"
+ "than size\n");
+ WPRINTK("BLKTAP: %s request sector"
+ "[%llu,%llu], Total [%llu]\n",
+ (req->operation ==
+ BLKIF_OP_WRITE ? "WRITE" : "READ"),
+ (long long unsigned) sector,
+ (long long unsigned) sector>>9,
+ blkif->sectors);
+ }
+
+ flags = GNTMAP_host_map;
+ if (operation == WRITE)
+ flags |= GNTMAP_readonly;
+ gnttab_set_map_op(&map[op], kvaddr, flags,
+ req->seg[i].gref, blkif->domid);
+ op++;
+
+ /* Now map it to user. */
+ ret = create_lookup_pte_addr(info->vma->vm_mm,
+ uvaddr, &ptep);
+ if (ret) {
+ WPRINTK("Couldn't get a pte addr!\n");
+ fast_flush_area(pending_req, pending_idx, usr_idx,
+ blkif->dev_num);
+ goto fail_flush;
+ }
+
+ flags = GNTMAP_host_map | GNTMAP_application_map
+ | GNTMAP_contains_pte;
+ if (operation == WRITE)
+ flags |= GNTMAP_readonly;
+ gnttab_set_map_op(&map[op], ptep, flags,
+ req->seg[i].gref, blkif->domid);
+ op++;
+ }
+
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
+ BUG_ON(ret);
+
+ for (i = 0; i < (nseg*2); i+=2) {
+ unsigned long uvaddr;
+ unsigned long kvaddr;
+ unsigned long offset;
+ struct page *pg;
+
+ uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
+ kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start,
+ pending_idx, i/2);
+
+ if (unlikely(map[i].status != 0)) {
+ WPRINTK("invalid kernel buffer -- "
+ "could not remap it\n");
+ goto fail_flush;
+ }
+
+ if (unlikely(map[i+1].status != 0)) {
+ WPRINTK("invalid user buffer -- "
+ "could not remap it\n");
+ goto fail_flush;
+ }
+
+ pending_handle(mmap_idx, pending_idx, i/2).kernel
+ = map[i].handle;
+ pending_handle(mmap_idx, pending_idx, i/2).user
+ = map[i+1].handle;
+ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
+ FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
+ offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
+ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+ ((struct page **)info->vma->vm_private_data)[offset] =
+ pg;
+ }
+ /* Mark mapped pages as reserved: */
+ for (i = 0; i < req->nr_segments; i++) {
+ unsigned long kvaddr;
+ struct page *pg;
+
+ kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start,
+ pending_idx, i);
+ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+ SetPageReserved(pg);
+ }
+
+ /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
+ info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);
+
+ blkif_get(blkif);
+ /* Finally, write the request message to the user ring. */
+ target = RING_GET_REQUEST(&info->ufe_ring,
+ info->ufe_ring.req_prod_pvt);
+ memcpy(target, req, sizeof(*req));
+ target->id = usr_idx;
+ info->ufe_ring.req_prod_pvt++;
+ return;
+
+ fail_flush:
+ WPRINTK("Reached Fail_flush\n");
+ fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
+ fail_response:
+ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+ free_req(pending_req);
+}
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+static void make_response(blkif_t *blkif, unsigned long id,
+ unsigned short op, int st)
+{
+ blkif_response_t *resp;
+ unsigned long flags;
+ blkif_back_ring_t *blk_ring = &blkif->blk_ring;
+ int more_to_do = 0;
+ int notify;
+
+ spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+ /* Place on the response ring for the relevant domain. */
+ resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
+ resp->id = id;
+ resp->operation = op;
+ resp->status = st;
+ blk_ring->rsp_prod_pvt++;
+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
+
+ if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
+ /*
+ * Tail check for pending requests. Allows frontend to avoid
+ * notifications if requests are already in flight (lower
+ * overheads and promotes batching).
+ */
+ RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
+	} else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
+		more_to_do = 1;
+	}
+ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+ if (more_to_do)
+ blkif_notify_work(blkif);
+ if (notify)
+ notify_remote_via_irq(blkif->irq);
+}
+
+static int __init blkif_init(void)
+{
+ int i,ret,blktap_dir;
+ tap_blkif_t *info;
+
+ if (!is_running_on_xen())
+ return -ENODEV;
+
+ INIT_LIST_HEAD(&pending_free);
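+	/* Start with two request-mapping areas (mmap_alloc == 2); see the
+	 * TODO above MAX_DYNAMIC_MEM about making this adjustable. */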
+ for(i = 0; i < 2; i++) {
+ ret = req_increase();
+ if (ret)
+ break;
+ }
+ if (i == 0)
+ return ret;
+
+ tap_blkif_interface_init();
+
+ alloc_pending_reqs = 0;
+
+ tap_blkif_xenbus_init();
+
+ /*Create the blktap devices, but do not map memory or waitqueue*/
+ for(i = 0; i < MAX_TAP_DEV; i++) translate_domid[i].domid = 0xFFFF;
+
+	ret = register_chrdev(BLKTAP_DEV_MAJOR, "blktap", &blktap_fops);
+	blktap_dir = devfs_mk_dir(NULL, "xen", 0, NULL);
+
+	if ((ret < 0) || (blktap_dir < 0)) {
+		WPRINTK("Couldn't register /dev/xen/blktap\n");
+		if (ret >= 0)
+			unregister_chrdev(BLKTAP_DEV_MAJOR, "blktap");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < MAX_TAP_DEV; i++) {
+		info = tapfds[i] = kzalloc(sizeof(tap_blkif_t), GFP_KERNEL);
+		if (tapfds[i] == NULL)
+			return -ENOMEM;
+ info->minor = i;
+ info->pid = 0;
+ info->blkif = NULL;
+
+ ret = devfs_mk_cdev(MKDEV(BLKTAP_DEV_MAJOR, i),
+ S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", i);
+
+ if(ret != 0) return -ENOMEM;
+ info->dev_pending = info->dev_inuse = 0;
+
+ DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i);
+ }
+
+ DPRINTK("Blktap device successfully created\n");
+
+ return 0;
+}
+
+module_init(blkif_init);
+
+MODULE_LICENSE("Dual BSD/GPL");