/******************************************************************************
 * drivers/xen/blktap/blktap.c
 *
 * Back-end driver for user level virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. Requests
 * are remapped to a user-space memory region.
 *
 * Based on the blkback driver code.
 *
 * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <asm/hypervisor.h>
#include "common.h"
#include <xen/balloon.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/gfp.h>
#include <linux/poll.h>
#include <asm/tlbflush.h>
#include <linux/devfs_fs_kernel.h>

#define MAX_TAP_DEV  100    /* the maximum number of tapdisk ring devices    */
#define MAX_DEV_NAME 100    /* the max tapdisk ring device name e.g. blktap0 */

/*
 * The maximum number of requests that can be outstanding at any time
 * is determined by
 *
 *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
 *
 * where mmap_alloc < MAX_DYNAMIC_MEM.
 *
 * TODO:
 * mmap_alloc is initialised to 2 and should be adjustable on the fly via
 * sysfs.
 */
#define MAX_DYNAMIC_MEM  64
#define MAX_PENDING_REQS 64
#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
#define MMAP_VADDR(_start, _req, _seg)                                  \
	(_start +                                                       \
	 ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
	 ((_seg) * PAGE_SIZE))

static int blkif_reqs = MAX_PENDING_REQS;
static int mmap_pages = MMAP_PAGES;

#define RING_PAGES 1    /* BLKTAP - immediately before the mmap area, we
			 * have a bunch of pages reserved for shared
			 * memory rings. */

/* Data struct associated with each of the tapdisk devices */
typedef struct tap_blkif {
	struct vm_area_struct *vma;   /* Shared memory area                  */
	unsigned long rings_vstart;   /* Kernel memory mapping               */
	unsigned long user_vstart;    /* User memory mapping                 */
	unsigned long dev_inuse;      /* One process opens device at a time. */
	unsigned long dev_pending;    /* In process of being opened          */
	unsigned long ring_ok;        /* make this ring->state               */
	blkif_front_ring_t ufe_ring;  /* Rings up to user space.
*/ wait_queue_head_t wait; /*for poll */ unsigned long mode; /*current switching mode */ int minor; /*Minor number for tapdisk device */ pid_t pid; /*tapdisk process id */ enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace shutdown */ unsigned long *idx_map; /*Record the user ring id to kern [req id, idx] tuple */ blkif_t *blkif; /*Associate blkif with tapdev */ } tap_blkif_t; /*Private data struct associated with the inode*/ typedef struct private_info { int idx; } private_info_t; /*Data struct handed back to userspace for tapdisk device to VBD mapping*/ typedef struct domid_translate { unsigned short domid; unsigned short busid; } domid_translate_t ; domid_translate_t translate_domid[MAX_TAP_DEV]; tap_blkif_t *tapfds[MAX_TAP_DEV]; static int __init set_blkif_reqs(char *str) { get_option(&str, &blkif_reqs); return 1; } __setup("blkif_reqs=", set_blkif_reqs); /* Run-time switchable: /sys/module/blktap/parameters/ */ static unsigned int log_stats = 0; static unsigned int debug_lvl = 0; module_param(log_stats, int, 0644); module_param(debug_lvl, int, 0644); /* * Each outstanding request that we've passed to the lower device layers has a * 'pending_req' allocated to it. Each buffer_head that completes decrements * the pendcnt towards zero. When it hits zero, the specified domain has a * response queued for it, with the saved 'id' passed back. */ typedef struct { blkif_t *blkif; unsigned long id; unsigned short mem_idx; int nr_pages; atomic_t pendcnt; unsigned short operation; int status; struct list_head free_list; int inuse; } pending_req_t; static pending_req_t *pending_reqs[MAX_PENDING_REQS]; static struct list_head pending_free; static DEFINE_SPINLOCK(pending_free_lock); static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq); static int alloc_pending_reqs; typedef unsigned int PEND_RING_IDX; static inline int MASK_PEND_IDX(int i) { return (i & (MAX_PENDING_REQS-1)); } static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) { return (req - pending_reqs[idx]); } #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) #define BLKBACK_INVALID_HANDLE (~0) typedef struct mmap_page { unsigned long start; struct page *mpage; } mmap_page_t; static mmap_page_t mmap_start[MAX_DYNAMIC_MEM]; static unsigned short mmap_alloc = 0; static unsigned short mmap_lock = 0; static unsigned short mmap_inuse = 0; static unsigned long *pending_addrs[MAX_DYNAMIC_MEM]; /****************************************************************** * GRANT HANDLES */ /* When using grant tables to map a frame for device access then the * handle returned must be used to unmap the frame. This is needed to * drop the ref count on the frame. 
*/ struct grant_handle_pair { grant_handle_t kernel; grant_handle_t user; }; static struct grant_handle_pair pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES]; #define pending_handle(_id, _idx, _i) \ (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \ + (_i)]) static int blktap_read_ufe_ring(int idx); /*local prototypes*/ #define BLKTAP_MINOR 0 /*/dev/xen/blktap resides at device number major=254, minor numbers begin at 0 */ #define BLKTAP_DEV_MAJOR 254 /* TODO: Make major number dynamic * * and create devices in the kernel * */ #define BLKTAP_DEV_DIR "/dev/xen" /* blktap IOCTLs: */ #define BLKTAP_IOCTL_KICK_FE 1 #define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */ #define BLKTAP_IOCTL_SETMODE 3 #define BLKTAP_IOCTL_SENDPID 4 #define BLKTAP_IOCTL_NEWINTF 5 #define BLKTAP_IOCTL_MINOR 6 #define BLKTAP_IOCTL_MAJOR 7 #define BLKTAP_QUERY_ALLOC_REQS 8 #define BLKTAP_IOCTL_FREEINTF 9 #define BLKTAP_IOCTL_PRINT_IDXS 100 /* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */ #define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */ #define BLKTAP_MODE_INTERCEPT_FE 0x00000001 #define BLKTAP_MODE_INTERCEPT_BE 0x00000002 /* unimp. */ #define BLKTAP_MODE_INTERPOSE \ (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE) static inline int BLKTAP_MODE_VALID(unsigned long arg) { return ((arg == BLKTAP_MODE_PASSTHROUGH ) || (arg == BLKTAP_MODE_INTERCEPT_FE) || (arg == BLKTAP_MODE_INTERPOSE )); } /* Requests passing through the tap to userspace are re-assigned an ID. * We must record a mapping between the BE [IDX,ID] tuple and the userspace * ring ID. */ static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx) { return ((fe_dom << 16) | MASK_PEND_IDX(idx)); } extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id) { return (PEND_RING_IDX)(id & 0x0000ffff); } extern inline int ID_TO_MIDX(unsigned long id) { return (int)(id >> 16); } #define INVALID_REQ 0xdead0000 /*TODO: Convert to a free list*/ static inline int GET_NEXT_REQ(unsigned long *idx_map) { int i; for (i = 0; i < MAX_PENDING_REQS; i++) if (idx_map[i] == INVALID_REQ) return i; return INVALID_REQ; } #define BLKTAP_INVALID_HANDLE(_g) \ (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF)) #define BLKTAP_INVALIDATE_HANDLE(_g) do { \ (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \ } while(0) /****************************************************************** * BLKTAP VM OPS */ static struct page *blktap_nopage(struct vm_area_struct *vma, unsigned long address, int *type) { /* * if the page has not been mapped in by the driver then return * NOPAGE_SIGBUS to the domain. 
*/ return NOPAGE_SIGBUS; } struct vm_operations_struct blktap_vm_ops = { nopage: blktap_nopage, }; /****************************************************************** * BLKTAP FILE OPS */ /*Function Declarations*/ static int get_next_free_dev(void); static int blktap_open(struct inode *inode, struct file *filp); static int blktap_release(struct inode *inode, struct file *filp); static int blktap_mmap(struct file *filp, struct vm_area_struct *vma); static int blktap_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); static unsigned int blktap_poll(struct file *file, poll_table *wait); struct miscdevice *set_misc(int minor, char *name, int dev); static struct file_operations blktap_fops = { .owner = THIS_MODULE, .poll = blktap_poll, .ioctl = blktap_ioctl, .open = blktap_open, .release = blktap_release, .mmap = blktap_mmap, }; static int get_next_free_dev(void) { tap_blkif_t *info; int i = 0, ret = -1; unsigned long flags; spin_lock_irqsave(&pending_free_lock, flags); while (i < MAX_TAP_DEV) { info = tapfds[i]; if ( (tapfds[i] != NULL) && (info->dev_inuse == 0) && (info->dev_pending == 0) ) { info->dev_pending = 1; ret = i; goto done; } i++; } done: spin_unlock_irqrestore(&pending_free_lock, flags); return ret; } int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif) { int i; for (i = 0; i < MAX_TAP_DEV; i++) if ( (translate_domid[i].domid == domid) && (translate_domid[i].busid == xenbus_id) ) { tapfds[i]->blkif = blkif; tapfds[i]->status = RUNNING; return i; } return -1; } void signal_tapdisk(int idx) { tap_blkif_t *info; struct task_struct *ptask; info = tapfds[idx]; if ( (idx > 0) && (idx < MAX_TAP_DEV) && (info->pid > 0) ) { ptask = find_task_by_pid(info->pid); if (ptask) { info->status = CLEANSHUTDOWN; } } info->blkif = NULL; return; } static int blktap_open(struct inode *inode, struct file *filp) { blkif_sring_t *sring; int idx = iminor(inode) - BLKTAP_MINOR; tap_blkif_t *info; private_info_t *prv; int i; if (tapfds[idx] == NULL) { WPRINTK("Unable to open device /dev/xen/blktap%d\n", idx); return -ENOMEM; } DPRINTK("Opening device /dev/xen/blktap%d\n",idx); info = tapfds[idx]; /*Only one process can access device at a time*/ if (test_and_set_bit(0, &info->dev_inuse)) return -EBUSY; info->dev_pending = 0; /* Allocate the fe ring. */ sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL); if (sring == NULL) goto fail_nomem; SetPageReserved(virt_to_page(sring)); SHARED_RING_INIT(sring); FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE); prv = kzalloc(sizeof(private_info_t),GFP_KERNEL); prv->idx = idx; filp->private_data = prv; info->vma = NULL; info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS, GFP_KERNEL); if (idx > 0) { init_waitqueue_head(&info->wait); for (i = 0; i < MAX_PENDING_REQS; i++) info->idx_map[i] = INVALID_REQ; } DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx); return 0; fail_nomem: return -ENOMEM; } static int blktap_release(struct inode *inode, struct file *filp) { int idx = iminor(inode) - BLKTAP_MINOR; tap_blkif_t *info; if (tapfds[idx] == NULL) { WPRINTK("Trying to free device that doesn't exist " "[/dev/xen/blktap%d]\n",idx); return -1; } info = tapfds[idx]; info->dev_inuse = 0; DPRINTK("Freeing device [/dev/xen/blktap%d]\n",idx); /* Free the ring page. 
*/
	ClearPageReserved(virt_to_page(info->ufe_ring.sring));
	free_page((unsigned long) info->ufe_ring.sring);

	/* Clear any active mappings and free foreign map table */
	if (info->vma) {
		zap_page_range(
			info->vma, info->vma->vm_start,
			info->vma->vm_end - info->vma->vm_start, NULL);
		info->vma = NULL;
	}

	if (filp->private_data)
		kfree(filp->private_data);

	if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
		kthread_stop(info->blkif->xenblkd);
		info->blkif->xenblkd = NULL;
		info->status = CLEANSHUTDOWN;
	}
	return 0;
}

/* Note on mmap:
 * We need to map pages to user space in a way that will allow the block
 * subsystem to set up direct IO to them. This couldn't be done before,
 * because there isn't really a sane way to translate a user virtual address
 * down to a physical address when the page belongs to another domain.
 *
 * My first approach was to map the page into kernel memory, add an entry
 * for it in the physical frame list (using alloc_lomem_region as in blkback)
 * and then attempt to map that page up to user space. This is disallowed
 * by xen though, which realizes that we don't really own the machine frame
 * underlying the physical page.
 *
 * The new approach is to provide explicit support for this in xen linux.
 * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
 * mapped from other VMs. vma->vm_private_data is set up as a mapping
 * from pages to actual page structs. There is a new clause in get_user_pages
 * that does the right thing for this sort of mapping.
 */
static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
{
	int size;
	struct page **map;
	int i;
	private_info_t *prv;
	tap_blkif_t *info;

	/* Retrieve the dev info */
	prv = (private_info_t *)filp->private_data;
	if (prv == NULL) {
		WPRINTK("blktap: mmap, retrieving idx failed\n");
		return -ENOMEM;
	}
	info = tapfds[prv->idx];

	vma->vm_flags |= VM_RESERVED;
	vma->vm_ops = &blktap_vm_ops;

	size = vma->vm_end - vma->vm_start;
	if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
		WPRINTK("you _must_ map exactly %d pages!\n",
			mmap_pages + RING_PAGES);
		return -EAGAIN;
	}

	size >>= PAGE_SHIFT;
	info->rings_vstart = vma->vm_start;
	info->user_vstart  = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);

	/* Map the ring pages to the start of the region and reserve it. */
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	if (remap_pfn_range(vma, vma->vm_start,
			    __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
			    PAGE_SIZE, vma->vm_page_prot)) {
		WPRINTK("Mapping user ring failed!\n");
		goto fail;
	}

	/* Mark this VM as containing foreign pages, and set up mappings. */
	map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
		      * sizeof(struct page *), GFP_KERNEL);
	if (map == NULL) {
		WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
		goto fail;
	}

	for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
		map[i] = NULL;

	vma->vm_private_data = map;
	vma->vm_flags |= VM_FOREIGN;

	info->vma = vma;
	info->ring_ok = 1;
	return 0;
 fail:
	/* Clear any active mappings. */
	zap_page_range(vma, vma->vm_start,
		       vma->vm_end - vma->vm_start, NULL);

	return -ENOMEM;
}

static int blktap_ioctl(struct inode *inode, struct file *filp,
                        unsigned int cmd, unsigned long arg)
{
	int idx = iminor(inode) - BLKTAP_MINOR;
	switch(cmd) {
	case BLKTAP_IOCTL_KICK_FE:
	{
		/* There are fe messages to process. */
		return blktap_read_ufe_ring(idx);
	}
	case BLKTAP_IOCTL_SETMODE:
	{
		tap_blkif_t *info = tapfds[idx];

		if ( (idx > 0) && (idx < MAX_TAP_DEV)
		     && (tapfds[idx] != NULL) ) {
			if (BLKTAP_MODE_VALID(arg)) {
				info->mode = arg;
				/* XXX: may need to flush rings here.
*/ DPRINTK("blktap: set mode to %lx\n", arg); return 0; } } return 0; } case BLKTAP_IOCTL_PRINT_IDXS: { tap_blkif_t *info = tapfds[idx]; if ( (idx > 0) && (idx < MAX_TAP_DEV) && (tapfds[idx] != NULL) ) { printk("User Rings: \n-----------\n"); printk("UF: rsp_cons: %2d, req_prod_prv: %2d " "| req_prod: %2d, rsp_prod: %2d\n", info->ufe_ring.rsp_cons, info->ufe_ring.req_prod_pvt, info->ufe_ring.sring->req_prod, info->ufe_ring.sring->rsp_prod); } return 0; } case BLKTAP_IOCTL_SENDPID: { tap_blkif_t *info = tapfds[idx]; if ( (idx > 0) && (idx < MAX_TAP_DEV) && (tapfds[idx] != NULL) ) { info->pid = (pid_t)arg; DPRINTK("blktap: pid received %d\n", info->pid); } return 0; } case BLKTAP_IOCTL_NEWINTF: { uint64_t val = (uint64_t)arg; domid_translate_t *tr = (domid_translate_t *)&val; int newdev; DPRINTK("NEWINTF Req for domid %d and bus id %d\n", tr->domid, tr->busid); newdev = get_next_free_dev(); if (newdev < 1) { WPRINTK("Error initialising /dev/xen/blktap - " "No more devices\n"); return -1; } translate_domid[newdev].domid = tr->domid; translate_domid[newdev].busid = tr->busid; return newdev; } case BLKTAP_IOCTL_FREEINTF: { unsigned long dev = arg; tap_blkif_t *info = NULL; if ( (dev > 0) && (dev < MAX_TAP_DEV) ) info = tapfds[dev]; if ( (info != NULL) && (info->dev_pending) ) info->dev_pending = 0; return 0; } case BLKTAP_IOCTL_MINOR: { unsigned long dev = arg; tap_blkif_t *info = NULL; if ( (dev > 0) && (dev < MAX_TAP_DEV) ) info = tapfds[dev]; if (info != NULL) return info->minor; else return -1; } case BLKTAP_IOCTL_MAJOR: return BLKTAP_DEV_MAJOR; case BLKTAP_QUERY_ALLOC_REQS: { WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n", alloc_pending_reqs, blkif_reqs); return (alloc_pending_reqs/blkif_reqs) * 100; } } return -ENOIOCTLCMD; } static unsigned int blktap_poll(struct file *file, poll_table *wait) { private_info_t *prv; tap_blkif_t *info; /*Retrieve the dev info*/ prv = (private_info_t *)file->private_data; if (prv == NULL) { WPRINTK(" poll, retrieving idx failed\n"); return 0; } if (prv->idx == 0) return 0; info = tapfds[prv->idx]; poll_wait(file, &info->wait, wait); if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) { flush_tlb_all(); RING_PUSH_REQUESTS(&info->ufe_ring); return POLLIN | POLLRDNORM; } return 0; } void blktap_kick_user(int idx) { tap_blkif_t *info; if (idx == 0) return; info = tapfds[idx]; if (info != NULL) wake_up_interruptible(&info->wait); return; } static int do_block_io_op(blkif_t *blkif); static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req, pending_req_t *pending_req); static void make_response(blkif_t *blkif, unsigned long id, unsigned short op, int st); /****************************************************************** * misc small helpers */ static int req_increase(void) { int i, j; struct page *page; unsigned long flags; int ret; spin_lock_irqsave(&pending_free_lock, flags); ret = -EINVAL; if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock) goto done; #ifdef __ia64__ extern unsigned long alloc_empty_foreign_map_page_range( unsigned long pages); mmap_start[mmap_alloc].start = (unsigned long) alloc_empty_foreign_map_page_range(mmap_pages); #else /* ! ia64 */ page = balloon_alloc_empty_page_range(mmap_pages); ret = -ENOMEM; if (page == NULL) { printk("%s balloon_alloc_empty_page_range gave NULL\n", __FUNCTION__); goto done; } /* Pin all of the pages. 
*/ for (i=0; iinuse == 0) { list_del(&req->free_list); mmap_inuse--; } } if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1); done: spin_unlock_irqrestore(&pending_free_lock, flags); return; } static pending_req_t* alloc_req(void) { pending_req_t *req = NULL; unsigned long flags; spin_lock_irqsave(&pending_free_lock, flags); if (!list_empty(&pending_free)) { req = list_entry(pending_free.next, pending_req_t, free_list); list_del(&req->free_list); } if (req) { req->inuse = 1; alloc_pending_reqs++; } spin_unlock_irqrestore(&pending_free_lock, flags); return req; } static void free_req(pending_req_t *req) { unsigned long flags; int was_empty; spin_lock_irqsave(&pending_free_lock, flags); alloc_pending_reqs--; req->inuse = 0; if (mmap_lock && (req->mem_idx == mmap_alloc-1)) { mmap_inuse--; if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1); spin_unlock_irqrestore(&pending_free_lock, flags); return; } was_empty = list_empty(&pending_free); list_add(&req->free_list, &pending_free); spin_unlock_irqrestore(&pending_free_lock, flags); if (was_empty) wake_up(&pending_free_wq); } static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx, int tapidx) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; unsigned int i, invcount = 0; struct grant_handle_pair *khandle; uint64_t ptep; int ret, mmap_idx; unsigned long kvaddr, uvaddr; tap_blkif_t *info = tapfds[tapidx]; if (info == NULL) { WPRINTK("fast_flush: Couldn't get info!\n"); return; } mmap_idx = req->mem_idx; for (i = 0; i < req->nr_pages; i++) { kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, k_idx, i); uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i); khandle = &pending_handle(mmap_idx, k_idx, i); if (BLKTAP_INVALID_HANDLE(khandle)) { WPRINTK("BLKTAP_INVALID_HANDLE\n"); continue; } gnttab_set_unmap_op(&unmap[invcount], MMAP_VADDR(mmap_start[mmap_idx].start, k_idx, i), GNTMAP_host_map, khandle->kernel); invcount++; if (create_lookup_pte_addr( info->vma->vm_mm, MMAP_VADDR(info->user_vstart, u_idx, i), &ptep) !=0) { WPRINTK("Couldn't get a pte addr!\n"); return; } gnttab_set_unmap_op(&unmap[invcount], ptep, GNTMAP_host_map, khandle->user); invcount++; BLKTAP_INVALIDATE_HANDLE(khandle); } ret = HYPERVISOR_grant_table_op( GNTTABOP_unmap_grant_ref, unmap, invcount); BUG_ON(ret); if (info->vma != NULL) zap_page_range(info->vma, MMAP_VADDR(info->user_vstart, u_idx, 0), req->nr_pages << PAGE_SHIFT, NULL); } /****************************************************************** * SCHEDULER FUNCTIONS */ static void print_stats(blkif_t *blkif) { printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d\n", current->comm, blkif->st_oo_req, blkif->st_rd_req, blkif->st_wr_req); blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); blkif->st_rd_req = 0; blkif->st_wr_req = 0; blkif->st_oo_req = 0; } int tap_blkif_schedule(void *arg) { blkif_t *blkif = arg; blkif_get(blkif); if (debug_lvl) printk(KERN_DEBUG "%s: started\n", current->comm); while (!kthread_should_stop()) { wait_event_interruptible( blkif->wq, blkif->waiting_reqs || kthread_should_stop()); wait_event_interruptible( pending_free_wq, !list_empty(&pending_free) || kthread_should_stop()); blkif->waiting_reqs = 0; smp_mb(); /* clear flag *before* checking for work */ if (do_block_io_op(blkif)) blkif->waiting_reqs = 1; if (log_stats && time_after(jiffies, blkif->st_print)) print_stats(blkif); } if (log_stats) print_stats(blkif); if (debug_lvl) printk(KERN_DEBUG "%s: exiting\n", current->comm); blkif->xenblkd = NULL; blkif_put(blkif); return 0; } 
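/******************************************************************
 * REQUEST ID BOOKKEEPING
 *
 * A request forwarded to userspace is given a fresh slot 'usr_idx' on the
 * user ring, and idx_map[usr_idx] records where the kernel-side state for
 * that request lives.  MAKE_ID() packs this as
 *
 *     [ mem_idx (mmap pool index) : upper 16 bits |
 *       masked pending-ring index : lower 16 bits ]
 *
 * so, for example, MAKE_ID(1, 3) == 0x00010003, ID_TO_MIDX() recovers 1 and
 * ID_TO_IDX() recovers 3.  (MAKE_ID()'s first parameter is declared domid_t,
 * but the caller in dispatch_rw_block_io() passes the mmap pool index.)
 * blktap_read_ufe_ring() below reverses this mapping to locate the
 * pending_req_t, unmap the grants and send the response to the frontend.
 */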
/****************************************************************** * COMPLETION CALLBACK -- Called by user level ioctl() */ static int blktap_read_ufe_ring(int idx) { /* This is called to read responses from the UFE ring. */ RING_IDX i, j, rp; blkif_response_t *resp; blkif_t *blkif=NULL; int pending_idx, usr_idx, mmap_idx; pending_req_t *pending_req; tap_blkif_t *info; info = tapfds[idx]; if (info == NULL) { return 0; } /* We currently only forward packets in INTERCEPT_FE mode. */ if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE)) return 0; /* for each outstanding message on the UFEring */ rp = info->ufe_ring.sring->rsp_prod; rmb(); for (i = info->ufe_ring.rsp_cons; i != rp; i++) { resp = RING_GET_RESPONSE(&info->ufe_ring, i); ++info->ufe_ring.rsp_cons; /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/ usr_idx = (int)resp->id; pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx])); mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]); if ( (mmap_idx >= mmap_alloc) || (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) ) WPRINTK("Incorrect req map" "[%d], internal map [%d,%d (%d)]\n", usr_idx, mmap_idx, ID_TO_IDX(info->idx_map[usr_idx]), MASK_PEND_IDX( ID_TO_IDX(info->idx_map[usr_idx]))); pending_req = &pending_reqs[mmap_idx][pending_idx]; blkif = pending_req->blkif; for (j = 0; j < pending_req->nr_pages; j++) { unsigned long kvaddr, uvaddr; struct page **map = info->vma->vm_private_data; struct page *pg; int offset; uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j); kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, pending_idx, j); pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); ClearPageReserved(pg); offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT; map[offset] = NULL; } fast_flush_area(pending_req, pending_idx, usr_idx, idx); make_response(blkif, pending_req->id, resp->operation, resp->status); info->idx_map[usr_idx] = INVALID_REQ; blkif_put(pending_req->blkif); free_req(pending_req); } return 0; } /****************************************************************************** * NOTIFICATION FROM GUEST OS. */ static void blkif_notify_work(blkif_t *blkif) { blkif->waiting_reqs = 1; wake_up(&blkif->wq); } irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) { blkif_notify_work(dev_id); return IRQ_HANDLED; } /****************************************************************** * DOWNWARD CALLS -- These interface with the block-device layer proper. */ static int print_dbug = 1; static int do_block_io_op(blkif_t *blkif) { blkif_back_ring_t *blk_ring = &blkif->blk_ring; blkif_request_t *req; pending_req_t *pending_req; RING_IDX rc, rp; int more_to_do = 0; tap_blkif_t *info; rc = blk_ring->req_cons; rp = blk_ring->sring->req_prod; rmb(); /* Ensure we see queued requests up to 'rp'. */ /*Check blkif has corresponding UE ring*/ if (blkif->dev_num == -1) { /*oops*/ if (print_dbug) { WPRINTK("Corresponding UE " "ring does not exist!\n"); print_dbug = 0; /*We only print this message once*/ } return 1; } info = tapfds[blkif->dev_num]; if (info == NULL || !info->dev_inuse) { if (print_dbug) { WPRINTK("Can't get UE info!\n"); print_dbug = 0; } return 1; } while (rc != rp) { if (RING_FULL(&info->ufe_ring)) { WPRINTK("RING_FULL! More to do\n"); more_to_do = 1; break; } if (RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) { WPRINTK("RING_REQUEST_CONS_OVERFLOW!" 
" More to do\n"); more_to_do = 1; break; } pending_req = alloc_req(); if (NULL == pending_req) { blkif->st_oo_req++; more_to_do = 1; break; } req = RING_GET_REQUEST(blk_ring, rc); blk_ring->req_cons = ++rc; /* before make_response() */ switch (req->operation) { case BLKIF_OP_READ: blkif->st_rd_req++; dispatch_rw_block_io(blkif, req, pending_req); break; case BLKIF_OP_WRITE: blkif->st_wr_req++; dispatch_rw_block_io(blkif, req, pending_req); break; default: WPRINTK("unknown operation [%d]\n", req->operation); make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); free_req(pending_req); break; } } blktap_kick_user(blkif->dev_num); return more_to_do; } static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req, pending_req_t *pending_req) { extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ; struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; unsigned int nseg; int ret, i; tap_blkif_t *info = tapfds[blkif->dev_num]; uint64_t sector; blkif_request_t *target; int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx); int usr_idx = GET_NEXT_REQ(info->idx_map); uint16_t mmap_idx = pending_req->mem_idx; /*Check we have space on user ring - should never fail*/ if(usr_idx == INVALID_REQ) goto fail_flush; /* Check that number of segments is sane. */ nseg = req->nr_segments; if ( unlikely(nseg == 0) || unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) { WPRINTK("Bad number of segments in request (%d)\n", nseg); goto fail_response; } /* Make sure userspace is ready. */ if (!info->ring_ok) { WPRINTK("blktap: ring not ready for requests!\n"); goto fail_response; } if (RING_FULL(&info->ufe_ring)) { WPRINTK("blktap: fe_ring is full, can't add " "IO Request will be dropped. %d %d\n", RING_SIZE(&info->ufe_ring), RING_SIZE(&blkif->blk_ring)); goto fail_response; } pending_req->blkif = blkif; pending_req->id = req->id; pending_req->operation = operation; pending_req->status = BLKIF_RSP_OKAY; pending_req->nr_pages = nseg; op = 0; for (i = 0; i < nseg; i++) { unsigned long uvaddr; unsigned long kvaddr; uint64_t ptep; struct page *page; uint32_t flags; uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i); kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, pending_idx, i); page = virt_to_page(kvaddr); sector = req->sector_number + (8*i); if( (blkif->sectors > 0) && (sector >= blkif->sectors) ) { WPRINTK("BLKTAP: Sector request greater" "than size\n"); WPRINTK("BLKTAP: %s request sector" "[%llu,%llu], Total [%llu]\n", (req->operation == BLKIF_OP_WRITE ? "WRITE" : "READ"), (long long unsigned) sector, (long long unsigned) sector>>9, blkif->sectors); } flags = GNTMAP_host_map; if (operation == WRITE) flags |= GNTMAP_readonly; gnttab_set_map_op(&map[op], kvaddr, flags, req->seg[i].gref, blkif->domid); op++; /* Now map it to user. 
*/ ret = create_lookup_pte_addr(info->vma->vm_mm, uvaddr, &ptep); if (ret) { WPRINTK("Couldn't get a pte addr!\n"); fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num); goto fail_flush; } flags = GNTMAP_host_map | GNTMAP_application_map | GNTMAP_contains_pte; if (operation == WRITE) flags |= GNTMAP_readonly; gnttab_set_map_op(&map[op], ptep, flags, req->seg[i].gref, blkif->domid); op++; } ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op); BUG_ON(ret); for (i = 0; i < (nseg*2); i+=2) { unsigned long uvaddr; unsigned long kvaddr; unsigned long offset; struct page *pg; uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2); kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, pending_idx, i/2); if (unlikely(map[i].status != 0)) { WPRINTK("invalid kernel buffer -- " "could not remap it\n"); goto fail_flush; } if (unlikely(map[i+1].status != 0)) { WPRINTK("invalid user buffer -- " "could not remap it\n"); goto fail_flush; } pending_handle(mmap_idx, pending_idx, i/2).kernel = map[i].handle; pending_handle(mmap_idx, pending_idx, i/2).user = map[i+1].handle; set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT; pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); ((struct page **)info->vma->vm_private_data)[offset] = pg; } /* Mark mapped pages as reserved: */ for (i = 0; i < req->nr_segments; i++) { unsigned long kvaddr; struct page *pg; kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, pending_idx, i); pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); SetPageReserved(pg); } /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/ info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx); blkif_get(blkif); /* Finally, write the request message to the user ring. */ target = RING_GET_REQUEST(&info->ufe_ring, info->ufe_ring.req_prod_pvt); memcpy(target, req, sizeof(*req)); target->id = usr_idx; info->ufe_ring.req_prod_pvt++; return; fail_flush: WPRINTK("Reached Fail_flush\n"); fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num); fail_response: make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); free_req(pending_req); } /****************************************************************** * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING */ static void make_response(blkif_t *blkif, unsigned long id, unsigned short op, int st) { blkif_response_t *resp; unsigned long flags; blkif_back_ring_t *blk_ring = &blkif->blk_ring; int more_to_do = 0; int notify; spin_lock_irqsave(&blkif->blk_ring_lock, flags); /* Place on the response ring for the relevant domain. */ resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt); resp->id = id; resp->operation = op; resp->status = st; blk_ring->rsp_prod_pvt++; RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify); if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) { /* * Tail check for pending requests. Allows frontend to avoid * notifications if requests are already in flight (lower * overheads and promotes batching). 
*/ RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do); } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) { more_to_do = 1; } spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); if (more_to_do) blkif_notify_work(blkif); if (notify) notify_remote_via_irq(blkif->irq); } static int __init blkif_init(void) { int i,ret,blktap_dir; tap_blkif_t *info; if (!is_running_on_xen()) return -ENODEV; INIT_LIST_HEAD(&pending_free); for(i = 0; i < 2; i++) { ret = req_increase(); if (ret) break; } if (i == 0) return ret; tap_blkif_interface_init(); alloc_pending_reqs = 0; tap_blkif_xenbus_init(); /*Create the blktap devices, but do not map memory or waitqueue*/ for(i = 0; i < MAX_TAP_DEV; i++) translate_domid[i].domid = 0xFFFF; ret = register_chrdev(BLKTAP_DEV_MAJOR,"blktap",&blktap_fops); blktap_dir = devfs_mk_dir(NULL, "xen", 0, NULL); if ( (ret < 0)||(blktap_dir < 0) ) { WPRINTK("Couldn't register /dev/xen/blktap\n"); return -ENOMEM; } for(i = 0; i < MAX_TAP_DEV; i++ ) { info = tapfds[i] = kzalloc(sizeof(tap_blkif_t),GFP_KERNEL); if(tapfds[i] == NULL) return -ENOMEM; info->minor = i; info->pid = 0; info->blkif = NULL; ret = devfs_mk_cdev(MKDEV(BLKTAP_DEV_MAJOR, i), S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", i); if(ret != 0) return -ENOMEM; info->dev_pending = info->dev_inuse = 0; DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i); } DPRINTK("Blktap device successfully created\n"); return 0; } module_init(blkif_init); MODULE_LICENSE("Dual BSD/GPL");
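/*
 * For reference, a minimal sketch of the userspace side of this interface,
 * reconstructed only from the ioctls and mmap layout defined above.  It is
 * not the actual tapdisk implementation: the blkif ring types/macros and
 * POSIX calls are assumed to be available to the process, constants such as
 * BLKTAP_IOCTL_*, BLKTAP_MODE_*, MMAP_PAGES, RING_PAGES and PAGE_SIZE must be
 * mirrored from this file, and the minor number (here 1) is assumed to have
 * been obtained via BLKTAP_IOCTL_NEWINTF.
 *
 *	int fd = open("/dev/xen/blktap1", O_RDWR);
 *	ioctl(fd, BLKTAP_IOCTL_SENDPID, getpid());
 *	ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE);
 *
 *	// blktap_mmap() insists on exactly MMAP_PAGES + RING_PAGES pages:
 *	// page 0 is the shared ring, the data area starts right after it.
 *	size_t len = (MMAP_PAGES + RING_PAGES) * PAGE_SIZE;
 *	char *area = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	blkif_sring_t *sring = (blkif_sring_t *)area;
 *	char *data = area + RING_PAGES * PAGE_SIZE;
 *
 *	blkif_back_ring_t ring;                   // userspace is the back end
 *	BACK_RING_INIT(&ring, sring, PAGE_SIZE);  // of the kernel's ufe_ring
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	while (poll(&pfd, 1, -1) > 0) {
 *		while (RING_HAS_UNCONSUMED_REQUESTS(&ring)) {
 *			blkif_request_t *req =
 *				RING_GET_REQUEST(&ring, ring.req_cons++);
 *			// Segment j of this request is backed by the page at
 *			//   data + (req->id * BLKIF_MAX_SEGMENTS_PER_REQUEST
 *			//           + j) * PAGE_SIZE
 *			// (req->id is the usr_idx from dispatch_rw_block_io()).
 *
 *			// ... perform the actual I/O here, then respond:
 *			blkif_response_t *rsp =
 *				RING_GET_RESPONSE(&ring, ring.rsp_prod_pvt++);
 *			rsp->id        = req->id;
 *			rsp->operation = req->operation;
 *			rsp->status    = BLKIF_RSP_OKAY;
 *		}
 *		RING_PUSH_RESPONSES(&ring);
 *		ioctl(fd, BLKTAP_IOCTL_KICK_FE, 0);  // runs blktap_read_ufe_ring()
 *	}
 */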