Merge to Fedora kernel-2.6.17-1.2187_FC5 patched with stable patch-2.6.17.13-vs2...
[linux-2.6.git] / drivers / xen / blktap / blktap.c
1 /******************************************************************************
2  * drivers/xen/blktap/blktap.c
3  * 
4  * Back-end driver for user level virtual block devices. This portion of the
5  * driver exports a 'unified' block-device interface that can be accessed
6  * by any operating system that implements a compatible front end. Requests
7  * are remapped to a user-space memory region.
8  *
9  * Based on the blkback driver code.
10  * 
11  * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
12  *
13  * This program is free software; you can redistribute it and/or
14  * modify it under the terms of the GNU General Public License version 2
15  * as published by the Free Software Foundation; or, when distributed
16  * separately from the Linux kernel or incorporated into other
17  * software packages, subject to the following license:
18  * 
19  * Permission is hereby granted, free of charge, to any person obtaining a copy
20  * of this source file (the "Software"), to deal in the Software without
21  * restriction, including without limitation the rights to use, copy, modify,
22  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
23  * and to permit persons to whom the Software is furnished to do so, subject to
24  * the following conditions:
25  * 
26  * The above copyright notice and this permission notice shall be included in
27  * all copies or substantial portions of the Software.
28  * 
29  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
34  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
35  * IN THE SOFTWARE.
36  */
37
38 #include <linux/spinlock.h>
39 #include <linux/kthread.h>
40 #include <linux/list.h>
41 #include <asm/hypervisor.h>
42 #include "common.h"
43 #include <xen/balloon.h>
44 #include <linux/kernel.h>
45 #include <linux/fs.h>
46 #include <linux/mm.h>
47 #include <linux/miscdevice.h>
48 #include <linux/errno.h>
49 #include <linux/major.h>
50 #include <linux/gfp.h>
51 #include <linux/poll.h>
52 #include <asm/tlbflush.h>
53 #include <linux/devfs_fs_kernel.h>
54
55 #define MAX_TAP_DEV 100     /*the maximum number of tapdisk ring devices    */
56 #define MAX_DEV_NAME 100    /*the max tapdisk ring device name e.g. blktap0 */
57
58 /*
59  * The maximum number of requests that can be outstanding at any time
60  * is determined by 
61  *
62  *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] 
63  *
64  * where mmap_alloc < MAX_DYNAMIC_MEM.
65  *
66  * TODO:
67  * mmap_alloc is initialised to 2 and should be adjustable on the fly via
68  * sysfs.
69  */
70 #define MAX_DYNAMIC_MEM 64
71 #define MAX_PENDING_REQS 64   
72 #define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
73 #define MMAP_VADDR(_start, _req,_seg)                                   \
74         (_start +                                                       \
75          ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
76          ((_seg) * PAGE_SIZE))
77 static int blkif_reqs = MAX_PENDING_REQS;
78 static int mmap_pages = MMAP_PAGES;
79
80 #define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
81                       * have a bunch of pages reserved for shared
82                       * memory rings.
83                       */
84
85 /*Data struct associated with each of the tapdisk devices*/
86 typedef struct tap_blkif {
87         struct vm_area_struct *vma;   /*Shared memory area                   */
88         unsigned long rings_vstart;   /*Kernel memory mapping                */
89         unsigned long user_vstart;    /*User memory mapping                  */
90         unsigned long dev_inuse;      /*One process opens device at a time.  */
91         unsigned long dev_pending;    /*In process of being opened           */
92         unsigned long ring_ok;        /*make this ring->state                */
93         blkif_front_ring_t ufe_ring;  /*Rings up to user space.              */
94         wait_queue_head_t wait;       /*for poll                             */
95         unsigned long mode;           /*current switching mode               */
96         int minor;                    /*Minor number for tapdisk device      */
97         pid_t pid;                    /*tapdisk process id                   */
98         enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace 
99                                                   shutdown                   */
100         unsigned long *idx_map;       /*Record the user ring id to kern 
101                                         [req id, idx] tuple                  */
102         blkif_t *blkif;               /*Associate blkif with tapdev          */
103 } tap_blkif_t;
104
105 /*Private data struct associated with the inode*/
106 typedef struct private_info {
107         int idx;
108 } private_info_t;
109
110 /*Data struct handed back to userspace for tapdisk device to VBD mapping*/
111 typedef struct domid_translate {
112         unsigned short domid;
113         unsigned short busid;
114 } domid_translate_t ;
115
116
117 domid_translate_t  translate_domid[MAX_TAP_DEV];
118 tap_blkif_t *tapfds[MAX_TAP_DEV];
119
120 static int __init set_blkif_reqs(char *str)
121 {
122         get_option(&str, &blkif_reqs);
123         return 1;
124 }
125 __setup("blkif_reqs=", set_blkif_reqs);
126
127 /* Run-time switchable: /sys/module/blktap/parameters/ */
128 static unsigned int log_stats = 0;
129 static unsigned int debug_lvl = 0;
130 module_param(log_stats, int, 0644);
131 module_param(debug_lvl, int, 0644);
132
133 /*
134  * Each outstanding request that we've passed to the lower device layers has a 
135  * 'pending_req' allocated to it. Each buffer_head that completes decrements 
136  * the pendcnt towards zero. When it hits zero, the specified domain has a 
137  * response queued for it, with the saved 'id' passed back.
138  */
139 typedef struct {
140         blkif_t       *blkif;
141         unsigned long  id;
142         unsigned short mem_idx;
143         int            nr_pages;
144         atomic_t       pendcnt;
145         unsigned short operation;
146         int            status;
147         struct list_head free_list;
148         int            inuse;
149 } pending_req_t;
150
151 static pending_req_t *pending_reqs[MAX_PENDING_REQS];
152 static struct list_head pending_free;
153 static DEFINE_SPINLOCK(pending_free_lock);
154 static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
155 static int alloc_pending_reqs;
156
157 typedef unsigned int PEND_RING_IDX;
158
159 static inline int MASK_PEND_IDX(int i) { 
160         return (i & (MAX_PENDING_REQS-1)); 
161 }
162
163 static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
164         return (req - pending_reqs[idx]);
165 }
166
167 #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
168
169 #define BLKBACK_INVALID_HANDLE (~0)
170
171 typedef struct mmap_page {
172         unsigned long start;
173         struct page *mpage;
174 } mmap_page_t;
175
176 static mmap_page_t mmap_start[MAX_DYNAMIC_MEM];
177 static unsigned short mmap_alloc = 0;
178 static unsigned short mmap_lock = 0;
179 static unsigned short mmap_inuse = 0;
180 static unsigned long *pending_addrs[MAX_DYNAMIC_MEM];
181
182 /******************************************************************
183  * GRANT HANDLES
184  */
185
186 /* When using grant tables to map a frame for device access then the
187  * handle returned must be used to unmap the frame. This is needed to
188  * drop the ref count on the frame.
189  */
190 struct grant_handle_pair
191 {
192         grant_handle_t kernel;
193         grant_handle_t user;
194 };
195
196 static struct grant_handle_pair 
197     pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
198 #define pending_handle(_id, _idx, _i) \
199     (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
200     + (_i)])
201
202
203 static int blktap_read_ufe_ring(int idx); /*local prototypes*/
204
205 #define BLKTAP_MINOR 0  /*/dev/xen/blktap resides at device number
206                           major=254, minor numbers begin at 0            */ 
207 #define BLKTAP_DEV_MAJOR 254         /* TODO: Make major number dynamic  *
208                                       * and create devices in the kernel *
209                                       */
210 #define BLKTAP_DEV_DIR  "/dev/xen"
211
212 /* blktap IOCTLs: */
213 #define BLKTAP_IOCTL_KICK_FE         1
214 #define BLKTAP_IOCTL_KICK_BE         2 /* currently unused */
215 #define BLKTAP_IOCTL_SETMODE         3
216 #define BLKTAP_IOCTL_SENDPID         4
217 #define BLKTAP_IOCTL_NEWINTF         5
218 #define BLKTAP_IOCTL_MINOR           6
219 #define BLKTAP_IOCTL_MAJOR           7
220 #define BLKTAP_QUERY_ALLOC_REQS      8
221 #define BLKTAP_IOCTL_FREEINTF        9
222 #define BLKTAP_IOCTL_PRINT_IDXS      100  
223
224 /* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
225 #define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
226 #define BLKTAP_MODE_INTERCEPT_FE     0x00000001
227 #define BLKTAP_MODE_INTERCEPT_BE     0x00000002  /* unimp.             */
228
229 #define BLKTAP_MODE_INTERPOSE \
230            (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
231
232
233 static inline int BLKTAP_MODE_VALID(unsigned long arg)
234 {
235         return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
236                 (arg == BLKTAP_MODE_INTERCEPT_FE) ||
237                 (arg == BLKTAP_MODE_INTERPOSE   ));
238 }
239
240 /* Requests passing through the tap to userspace are re-assigned an ID.
241  * We must record a mapping between the BE [IDX,ID] tuple and the userspace
242  * ring ID. 
243  */
244
245 static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
246 {
247         return ((fe_dom << 16) | MASK_PEND_IDX(idx));
248 }
249
250 extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
251 {
252         return (PEND_RING_IDX)(id & 0x0000ffff);
253 }
254
255 extern inline int ID_TO_MIDX(unsigned long id)
256 {
257         return (int)(id >> 16);
258 }
259
260 #define INVALID_REQ 0xdead0000
261
262 /*TODO: Convert to a free list*/
263 static inline int GET_NEXT_REQ(unsigned long *idx_map)
264 {
265         int i;
266         for (i = 0; i < MAX_PENDING_REQS; i++)
267                 if (idx_map[i] == INVALID_REQ) return i;
268
269         return INVALID_REQ;
270 }
271
272
273 #define BLKTAP_INVALID_HANDLE(_g) \
274     (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF))
275
276 #define BLKTAP_INVALIDATE_HANDLE(_g) do {       \
277     (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \
278     } while(0)
279
280
281 /******************************************************************
282  * BLKTAP VM OPS
283  */
284
285 static struct page *blktap_nopage(struct vm_area_struct *vma,
286                                   unsigned long address,
287                                   int *type)
288 {
289         /*
290          * if the page has not been mapped in by the driver then return
291          * NOPAGE_SIGBUS to the domain.
292          */
293
294         return NOPAGE_SIGBUS;
295 }
296
297 struct vm_operations_struct blktap_vm_ops = {
298         nopage:   blktap_nopage,
299 };
300
301 /******************************************************************
302  * BLKTAP FILE OPS
303  */
304  
305 /*Function Declarations*/
306 static int get_next_free_dev(void);
307 static int blktap_open(struct inode *inode, struct file *filp);
308 static int blktap_release(struct inode *inode, struct file *filp);
309 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
310 static int blktap_ioctl(struct inode *inode, struct file *filp,
311                         unsigned int cmd, unsigned long arg);
312 static unsigned int blktap_poll(struct file *file, poll_table *wait);
313
314 struct miscdevice *set_misc(int minor, char *name, int dev);
315
316 static struct file_operations blktap_fops = {
317         .owner   = THIS_MODULE,
318         .poll    = blktap_poll,
319         .ioctl   = blktap_ioctl,
320         .open    = blktap_open,
321         .release = blktap_release,
322         .mmap    = blktap_mmap,
323 };
324
325
326 static int get_next_free_dev(void)
327 {
328         tap_blkif_t *info;
329         int i = 0, ret = -1;
330         unsigned long flags;
331
332         spin_lock_irqsave(&pending_free_lock, flags);
333         
334         while (i < MAX_TAP_DEV) {
335                 info = tapfds[i];
336                 if ( (tapfds[i] != NULL) && (info->dev_inuse == 0)
337                         && (info->dev_pending == 0) ) {
338                         info->dev_pending = 1;
339                         ret = i;
340                         goto done;
341                 }
342                 i++;
343         }
344         
345 done:
346         spin_unlock_irqrestore(&pending_free_lock, flags);
347         return ret;
348 }
349
350 int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif) 
351 {
352         int i;
353                 
354         for (i = 0; i < MAX_TAP_DEV; i++)
355                 if ( (translate_domid[i].domid == domid)
356                     && (translate_domid[i].busid == xenbus_id) ) {
357                         tapfds[i]->blkif = blkif;
358                         tapfds[i]->status = RUNNING;
359                         return i;
360                 }
361         return -1;
362 }
363
364 void signal_tapdisk(int idx) 
365 {
366         tap_blkif_t *info;
367         struct task_struct *ptask;
368
369         info = tapfds[idx];
370         if ( (idx > 0) && (idx < MAX_TAP_DEV) && (info->pid > 0) ) {
371                 ptask = find_task_by_pid(info->pid);
372                 if (ptask) { 
373                         info->status = CLEANSHUTDOWN;
374                 }
375         }
376         info->blkif = NULL;
377         return;
378 }
379
380 static int blktap_open(struct inode *inode, struct file *filp)
381 {
382         blkif_sring_t *sring;
383         int idx = iminor(inode) - BLKTAP_MINOR;
384         tap_blkif_t *info;
385         private_info_t *prv;
386         int i;
387         
388         if (tapfds[idx] == NULL) {
389                 WPRINTK("Unable to open device /dev/xen/blktap%d\n",
390                        idx);
391                 return -ENOMEM;
392         }
393         DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
394         
395         info = tapfds[idx];
396         
397         /*Only one process can access device at a time*/
398         if (test_and_set_bit(0, &info->dev_inuse))
399                 return -EBUSY;
400
401         info->dev_pending = 0;
402             
403         /* Allocate the fe ring. */
404         sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
405         if (sring == NULL)
406                 goto fail_nomem;
407
408         SetPageReserved(virt_to_page(sring));
409     
410         SHARED_RING_INIT(sring);
411         FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
412         
413         prv = kzalloc(sizeof(private_info_t),GFP_KERNEL);
414         prv->idx = idx;
415         filp->private_data = prv;
416         info->vma = NULL;
417
418         info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS, 
419                                 GFP_KERNEL);
420         
421         if (idx > 0) {
422                 init_waitqueue_head(&info->wait);
423                 for (i = 0; i < MAX_PENDING_REQS; i++) 
424                         info->idx_map[i] = INVALID_REQ;
425         }
426
427         DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
428         return 0;
429
430  fail_nomem:
431         return -ENOMEM;
432 }
433
434 static int blktap_release(struct inode *inode, struct file *filp)
435 {
436         int idx = iminor(inode) - BLKTAP_MINOR;
437         tap_blkif_t *info;
438         
439         if (tapfds[idx] == NULL) {
440                 WPRINTK("Trying to free device that doesn't exist "
441                        "[/dev/xen/blktap%d]\n",idx);
442                 return -1;
443         }
444         info = tapfds[idx];
445         info->dev_inuse = 0;
446         DPRINTK("Freeing device [/dev/xen/blktap%d]\n",idx);
447
448         /* Free the ring page. */
449         ClearPageReserved(virt_to_page(info->ufe_ring.sring));
450         free_page((unsigned long) info->ufe_ring.sring);
451
452         /* Clear any active mappings and free foreign map table */
453         if (info->vma) {
454                 zap_page_range(
455                         info->vma, info->vma->vm_start, 
456                         info->vma->vm_end - info->vma->vm_start, NULL);
457                 info->vma = NULL;
458         }
459         
460         if (filp->private_data) kfree(filp->private_data);
461
462         if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
463                 kthread_stop(info->blkif->xenblkd);
464                 info->blkif->xenblkd = NULL;
465                 info->status = CLEANSHUTDOWN;
466         }       
467         return 0;
468 }
469
470
471 /* Note on mmap:
472  * We need to map pages to user space in a way that will allow the block
473  * subsystem set up direct IO to them.  This couldn't be done before, because
474  * there isn't really a sane way to translate a user virtual address down to a 
475  * physical address when the page belongs to another domain.
476  *
477  * My first approach was to map the page in to kernel memory, add an entry
478  * for it in the physical frame list (using alloc_lomem_region as in blkback)
479  * and then attempt to map that page up to user space.  This is disallowed
480  * by xen though, which realizes that we don't really own the machine frame
481  * underlying the physical page.
482  *
483  * The new approach is to provide explicit support for this in xen linux.
484  * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
485  * mapped from other vms.  vma->vm_private_data is set up as a mapping 
486  * from pages to actual page structs.  There is a new clause in get_user_pages
487  * that does the right thing for this sort of mapping.
488  */
489 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
490 {
491         int size;
492         struct page **map;
493         int i;
494         private_info_t *prv;
495         tap_blkif_t *info;
496
497         /*Retrieve the dev info*/
498         prv = (private_info_t *)filp->private_data;
499         if (prv == NULL) {
500                 WPRINTK("blktap: mmap, retrieving idx failed\n");
501                 return -ENOMEM;
502         }
503         info = tapfds[prv->idx];
504         
505         vma->vm_flags |= VM_RESERVED;
506         vma->vm_ops = &blktap_vm_ops;
507
508         size = vma->vm_end - vma->vm_start;
509         if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
510                 WPRINTK("you _must_ map exactly %d pages!\n",
511                        mmap_pages + RING_PAGES);
512                 return -EAGAIN;
513         }
514
515         size >>= PAGE_SHIFT;
516         info->rings_vstart = vma->vm_start;
517         info->user_vstart  = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
518     
519         /* Map the ring pages to the start of the region and reserve it. */
520         vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
521
522         if (remap_pfn_range(vma, vma->vm_start, 
523                             __pa(info->ufe_ring.sring) >> PAGE_SHIFT, 
524                             PAGE_SIZE, vma->vm_page_prot)) {
525                 WPRINTK("Mapping user ring failed!\n");
526                 goto fail;
527         }
528
529         /* Mark this VM as containing foreign pages, and set up mappings. */
530         map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
531                       * sizeof(struct page_struct*),
532                       GFP_KERNEL);
533         if (map == NULL) {
534                 WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
535                 goto fail;
536         }
537
538         for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
539                 map[i] = NULL;
540     
541         vma->vm_private_data = map;
542         vma->vm_flags |= VM_FOREIGN;
543
544         info->vma = vma;
545         info->ring_ok = 1;
546         return 0;
547  fail:
548         /* Clear any active mappings. */
549         zap_page_range(vma, vma->vm_start, 
550                        vma->vm_end - vma->vm_start, NULL);
551
552         return -ENOMEM;
553 }
554
555
556 static int blktap_ioctl(struct inode *inode, struct file *filp,
557                         unsigned int cmd, unsigned long arg)
558 {
559         int idx = iminor(inode) - BLKTAP_MINOR;
560         switch(cmd) {
561         case BLKTAP_IOCTL_KICK_FE: 
562         {
563                 /* There are fe messages to process. */
564                 return blktap_read_ufe_ring(idx);
565         }
566         case BLKTAP_IOCTL_SETMODE:
567         {
568                 tap_blkif_t *info = tapfds[idx];
569                 
570                 if ( (idx > 0) && (idx < MAX_TAP_DEV) 
571                      && (tapfds[idx] != NULL) ) 
572                 {
573                         if (BLKTAP_MODE_VALID(arg)) {
574                                 info->mode = arg;
575                                 /* XXX: may need to flush rings here. */
576                                 DPRINTK("blktap: set mode to %lx\n", 
577                                        arg);
578                                 return 0;
579                         }
580                 }
581                 return 0;
582         }
583         case BLKTAP_IOCTL_PRINT_IDXS:
584         {
585                 tap_blkif_t *info = tapfds[idx];
586                 
587                 if ( (idx > 0) && (idx < MAX_TAP_DEV) 
588                      && (tapfds[idx] != NULL) ) 
589                 {
590                         printk("User Rings: \n-----------\n");
591                         printk("UF: rsp_cons: %2d, req_prod_prv: %2d "
592                                 "| req_prod: %2d, rsp_prod: %2d\n",
593                                 info->ufe_ring.rsp_cons,
594                                 info->ufe_ring.req_prod_pvt,
595                                 info->ufe_ring.sring->req_prod,
596                                 info->ufe_ring.sring->rsp_prod);
597                 }
598                 return 0;
599         }
600         case BLKTAP_IOCTL_SENDPID:
601         {
602                 tap_blkif_t *info = tapfds[idx];
603                 
604                 if ( (idx > 0) && (idx < MAX_TAP_DEV) 
605                      && (tapfds[idx] != NULL) ) 
606                 {
607                         info->pid = (pid_t)arg;
608                         DPRINTK("blktap: pid received %d\n", 
609                                info->pid);
610                 }
611                 return 0;
612         }
613         case BLKTAP_IOCTL_NEWINTF:
614         {               
615                 uint64_t val = (uint64_t)arg;
616                 domid_translate_t *tr = (domid_translate_t *)&val;
617                 int newdev;
618
619                 DPRINTK("NEWINTF Req for domid %d and bus id %d\n", 
620                        tr->domid, tr->busid);
621                 newdev = get_next_free_dev();
622                 if (newdev < 1) {
623                         WPRINTK("Error initialising /dev/xen/blktap - "
624                                 "No more devices\n");
625                         return -1;
626                 }
627                 translate_domid[newdev].domid = tr->domid;
628                 translate_domid[newdev].busid = tr->busid;
629                 return newdev;
630         }
631         case BLKTAP_IOCTL_FREEINTF:
632         {
633                 unsigned long dev = arg;
634                 tap_blkif_t *info = NULL;
635
636                 if ( (dev > 0) && (dev < MAX_TAP_DEV) ) info = tapfds[dev];
637
638                 if ( (info != NULL) && (info->dev_pending) )
639                         info->dev_pending = 0;
640                 return 0;
641         }
642         case BLKTAP_IOCTL_MINOR:
643         {
644                 unsigned long dev = arg;
645                 tap_blkif_t *info = NULL;
646                 
647                 if ( (dev > 0) && (dev < MAX_TAP_DEV) ) info = tapfds[dev];
648                 
649                 if (info != NULL) return info->minor;
650                 else return -1;
651         }
652         case BLKTAP_IOCTL_MAJOR:
653                 return BLKTAP_DEV_MAJOR;
654
655         case BLKTAP_QUERY_ALLOC_REQS:
656         {
657                 WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n",
658                        alloc_pending_reqs, blkif_reqs);
659                 return (alloc_pending_reqs/blkif_reqs) * 100;
660         }
661         }
662         return -ENOIOCTLCMD;
663 }
664
665 static unsigned int blktap_poll(struct file *file, poll_table *wait)
666 {
667         private_info_t *prv;
668         tap_blkif_t *info;
669         
670         /*Retrieve the dev info*/
671         prv = (private_info_t *)file->private_data;
672         if (prv == NULL) {
673                 WPRINTK(" poll, retrieving idx failed\n");
674                 return 0;
675         }
676         
677         if (prv->idx == 0) return 0;
678         
679         info = tapfds[prv->idx];
680         
681         poll_wait(file, &info->wait, wait);
682         if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
683                 flush_tlb_all();
684                 RING_PUSH_REQUESTS(&info->ufe_ring);
685                 return POLLIN | POLLRDNORM;
686         }
687         return 0;
688 }
689
690 void blktap_kick_user(int idx)
691 {
692         tap_blkif_t *info;
693
694         if (idx == 0) return;
695         
696         info = tapfds[idx];
697         
698         if (info != NULL) wake_up_interruptible(&info->wait);
699         return;
700 }
701
702 static int do_block_io_op(blkif_t *blkif);
703 static void dispatch_rw_block_io(blkif_t *blkif,
704                                  blkif_request_t *req,
705                                  pending_req_t *pending_req);
706 static void make_response(blkif_t *blkif, unsigned long id, 
707                           unsigned short op, int st);
708
709 /******************************************************************
710  * misc small helpers
711  */
712 static int req_increase(void)
713 {
714         int i, j;
715         struct page *page;
716         unsigned long flags;
717         int ret;
718
719         spin_lock_irqsave(&pending_free_lock, flags);
720
721         ret = -EINVAL;
722         if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock) 
723                 goto done;
724
725 #ifdef __ia64__
726         extern unsigned long alloc_empty_foreign_map_page_range(
727                 unsigned long pages);
728         mmap_start[mmap_alloc].start = (unsigned long)
729                 alloc_empty_foreign_map_page_range(mmap_pages);
730 #else /* ! ia64 */
731         page = balloon_alloc_empty_page_range(mmap_pages);
732         ret = -ENOMEM;
733         if (page == NULL) {
734                 printk("%s balloon_alloc_empty_page_range gave NULL\n", __FUNCTION__);
735                 goto done;
736         }
737
738         /* Pin all of the pages. */
739         for (i=0; i<mmap_pages; i++)
740                 get_page(&page[i]);
741
742         mmap_start[mmap_alloc].start = 
743                 (unsigned long)pfn_to_kaddr(page_to_pfn(page));
744         mmap_start[mmap_alloc].mpage = page;
745
746 #endif
747
748         pending_reqs[mmap_alloc]  = kzalloc(sizeof(pending_req_t) *
749                                         blkif_reqs, GFP_KERNEL);
750         pending_addrs[mmap_alloc] = kzalloc(sizeof(unsigned long) *
751                                         mmap_pages, GFP_KERNEL);
752
753         ret = -ENOMEM;
754         if (!pending_reqs[mmap_alloc] || !pending_addrs[mmap_alloc]) {
755                 kfree(pending_reqs[mmap_alloc]);
756                 kfree(pending_addrs[mmap_alloc]);
757                 WPRINTK("%s: out of memory\n", __FUNCTION__); 
758                 ret = -ENOMEM;
759                 goto done;
760         }
761
762         ret = 0;
763
764         DPRINTK("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n",
765                 __FUNCTION__, blkif_reqs, mmap_pages, 
766                mmap_start[mmap_alloc].start);
767
768         BUG_ON(mmap_start[mmap_alloc].start == 0);
769
770         for (i = 0; i < mmap_pages; i++) 
771                 pending_addrs[mmap_alloc][i] = 
772                         mmap_start[mmap_alloc].start + (i << PAGE_SHIFT);
773
774         for (i = 0; i < MAX_PENDING_REQS ; i++) {
775                 list_add_tail(&pending_reqs[mmap_alloc][i].free_list, 
776                               &pending_free);
777                 pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
778                 for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
779                         BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc, 
780                                                                  i, j));
781         }
782
783         mmap_alloc++;
784         DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
785  done:
786         spin_unlock_irqrestore(&pending_free_lock, flags);
787         return ret;
788 }
789
790 static void mmap_req_del(int mmap)
791 {
792         int i;
793         struct page *page;
794
795         /*Spinlock already acquired*/
796         kfree(pending_reqs[mmap]);
797         kfree(pending_addrs[mmap]);
798
799 #ifdef __ia64__
800         /*Not sure what goes here yet!*/
801 #else
802
803         /* Unpin all of the pages. */
804         page = mmap_start[mmap].mpage;
805         for (i=0; i<mmap_pages; i++)
806                 put_page(&page[i]);
807
808         balloon_dealloc_empty_page_range(mmap_start[mmap].mpage, mmap_pages);
809 #endif
810
811         mmap_lock = 0;
812         DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
813         mmap_alloc--;
814 }
815
816 /*N.B. Currently unused - will be accessed via sysfs*/
817 static void req_decrease(void)
818 {
819         pending_req_t *req;
820         int i;
821         unsigned long flags;
822
823         spin_lock_irqsave(&pending_free_lock, flags);
824
825         DPRINTK("Req decrease called.\n");
826         if (mmap_lock || mmap_alloc == 1) 
827                 goto done;
828
829         mmap_lock = 1;
830         mmap_inuse = MAX_PENDING_REQS;
831         
832         /*Go through reqs and remove any that aren't in use*/
833         for (i = 0; i < MAX_PENDING_REQS ; i++) {
834                 req = &pending_reqs[mmap_alloc-1][i];
835                 if (req->inuse == 0) {
836                         list_del(&req->free_list);
837                         mmap_inuse--;
838                 }
839         }
840         if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
841  done:
842         spin_unlock_irqrestore(&pending_free_lock, flags);
843         return;
844 }
845
846 static pending_req_t* alloc_req(void)
847 {
848         pending_req_t *req = NULL;
849         unsigned long flags;
850
851         spin_lock_irqsave(&pending_free_lock, flags);
852
853         if (!list_empty(&pending_free)) {
854                 req = list_entry(pending_free.next, pending_req_t, free_list);
855                 list_del(&req->free_list);
856         }
857
858         if (req) {
859                 req->inuse = 1;
860                 alloc_pending_reqs++;
861         }
862         spin_unlock_irqrestore(&pending_free_lock, flags);
863
864         return req;
865 }
866
867 static void free_req(pending_req_t *req)
868 {
869         unsigned long flags;
870         int was_empty;
871
872         spin_lock_irqsave(&pending_free_lock, flags);
873
874         alloc_pending_reqs--;
875         req->inuse = 0;
876         if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
877                 mmap_inuse--;
878                 if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
879                 spin_unlock_irqrestore(&pending_free_lock, flags);
880                 return;
881         }
882         was_empty = list_empty(&pending_free);
883         list_add(&req->free_list, &pending_free);
884
885         spin_unlock_irqrestore(&pending_free_lock, flags);
886
887         if (was_empty)
888                 wake_up(&pending_free_wq);
889 }
890
891 static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx, int 
892                             tapidx)
893 {
894         struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
895         unsigned int i, invcount = 0;
896         struct grant_handle_pair *khandle;
897         uint64_t ptep;
898         int ret, mmap_idx;
899         unsigned long kvaddr, uvaddr;
900
901         tap_blkif_t *info = tapfds[tapidx];
902         
903         if (info == NULL) {
904                 WPRINTK("fast_flush: Couldn't get info!\n");
905                 return;
906         }
907         mmap_idx = req->mem_idx;
908
909         for (i = 0; i < req->nr_pages; i++) {
910                 kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, k_idx, i);
911                 uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
912
913                 khandle = &pending_handle(mmap_idx, k_idx, i);
914                 if (BLKTAP_INVALID_HANDLE(khandle)) {
915                         WPRINTK("BLKTAP_INVALID_HANDLE\n");
916                         continue;
917                 }
918                 gnttab_set_unmap_op(&unmap[invcount], 
919                         MMAP_VADDR(mmap_start[mmap_idx].start, k_idx, i), 
920                                     GNTMAP_host_map, khandle->kernel);
921                 invcount++;
922
923                 if (create_lookup_pte_addr(
924                     info->vma->vm_mm,
925                     MMAP_VADDR(info->user_vstart, u_idx, i), 
926                     &ptep) !=0) {
927                         WPRINTK("Couldn't get a pte addr!\n");
928                         return;
929                 }
930
931                 gnttab_set_unmap_op(&unmap[invcount], 
932                         ptep, GNTMAP_host_map,
933                         khandle->user);
934                 invcount++;
935             
936                 BLKTAP_INVALIDATE_HANDLE(khandle);
937         }
938         ret = HYPERVISOR_grant_table_op(
939                 GNTTABOP_unmap_grant_ref, unmap, invcount);
940         BUG_ON(ret);
941         
942         if (info->vma != NULL)
943                 zap_page_range(info->vma, 
944                                MMAP_VADDR(info->user_vstart, u_idx, 0), 
945                                req->nr_pages << PAGE_SHIFT, NULL);
946 }
947
948 /******************************************************************
949  * SCHEDULER FUNCTIONS
950  */
951
952 static void print_stats(blkif_t *blkif)
953 {
954         printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
955                current->comm, blkif->st_oo_req,
956                blkif->st_rd_req, blkif->st_wr_req);
957         blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
958         blkif->st_rd_req = 0;
959         blkif->st_wr_req = 0;
960         blkif->st_oo_req = 0;
961 }
962
963 int tap_blkif_schedule(void *arg)
964 {
965         blkif_t *blkif = arg;
966
967         blkif_get(blkif);
968
969         if (debug_lvl)
970                 printk(KERN_DEBUG "%s: started\n", current->comm);
971
972         while (!kthread_should_stop()) {
973                 wait_event_interruptible(
974                         blkif->wq,
975                         blkif->waiting_reqs || kthread_should_stop());
976                 wait_event_interruptible(
977                         pending_free_wq,
978                         !list_empty(&pending_free) || kthread_should_stop());
979
980                 blkif->waiting_reqs = 0;
981                 smp_mb(); /* clear flag *before* checking for work */
982
983                 if (do_block_io_op(blkif))
984                         blkif->waiting_reqs = 1;
985
986                 if (log_stats && time_after(jiffies, blkif->st_print))
987                         print_stats(blkif);
988         }
989
990         if (log_stats)
991                 print_stats(blkif);
992         if (debug_lvl)
993                 printk(KERN_DEBUG "%s: exiting\n", current->comm);
994
995         blkif->xenblkd = NULL;
996         blkif_put(blkif);
997
998         return 0;
999 }
1000
1001 /******************************************************************
1002  * COMPLETION CALLBACK -- Called by user level ioctl()
1003  */
1004
1005 static int blktap_read_ufe_ring(int idx)
1006 {
1007         /* This is called to read responses from the UFE ring. */
1008         RING_IDX i, j, rp;
1009         blkif_response_t *resp;
1010         blkif_t *blkif=NULL;
1011         int pending_idx, usr_idx, mmap_idx;
1012         pending_req_t *pending_req;
1013         tap_blkif_t *info;
1014         
1015         info = tapfds[idx];
1016         if (info == NULL) {
1017                 return 0;
1018         }
1019
1020         /* We currently only forward packets in INTERCEPT_FE mode. */
1021         if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
1022                 return 0;
1023
1024         /* for each outstanding message on the UFEring  */
1025         rp = info->ufe_ring.sring->rsp_prod;
1026         rmb();
1027         
1028         for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
1029                 resp = RING_GET_RESPONSE(&info->ufe_ring, i);
1030                 ++info->ufe_ring.rsp_cons;
1031
1032                 /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
1033                 usr_idx = (int)resp->id;
1034                 pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
1035                 mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
1036
1037                 if ( (mmap_idx >= mmap_alloc) || 
1038                    (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) )
1039                         WPRINTK("Incorrect req map"
1040                                "[%d], internal map [%d,%d (%d)]\n", 
1041                                usr_idx, mmap_idx, 
1042                                ID_TO_IDX(info->idx_map[usr_idx]),
1043                                MASK_PEND_IDX(
1044                                        ID_TO_IDX(info->idx_map[usr_idx])));
1045
1046                 pending_req = &pending_reqs[mmap_idx][pending_idx];
1047                 blkif = pending_req->blkif;
1048
1049                 for (j = 0; j < pending_req->nr_pages; j++) {
1050
1051                         unsigned long kvaddr, uvaddr;
1052                         struct page **map = info->vma->vm_private_data;
1053                         struct page *pg;
1054                         int offset; 
1055
1056                         uvaddr  = MMAP_VADDR(info->user_vstart, usr_idx, j);
1057                         kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, 
1058                                             pending_idx, j);
1059
1060                         pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1061                         ClearPageReserved(pg);
1062                         offset = (uvaddr - info->vma->vm_start) 
1063                                 >> PAGE_SHIFT;
1064                         map[offset] = NULL;
1065                 }
1066                 fast_flush_area(pending_req, pending_idx, usr_idx, idx); 
1067                 make_response(blkif, pending_req->id, resp->operation,
1068                               resp->status);
1069                 info->idx_map[usr_idx] = INVALID_REQ;
1070                 blkif_put(pending_req->blkif);
1071                 free_req(pending_req);
1072         }
1073                 
1074         return 0;
1075 }
1076
1077
1078 /******************************************************************************
1079  * NOTIFICATION FROM GUEST OS.
1080  */
1081
1082 static void blkif_notify_work(blkif_t *blkif)
1083 {
1084         blkif->waiting_reqs = 1;
1085         wake_up(&blkif->wq);
1086 }
1087
1088 irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
1089 {
1090         blkif_notify_work(dev_id);
1091         return IRQ_HANDLED;
1092 }
1093
1094
1095
1096 /******************************************************************
1097  * DOWNWARD CALLS -- These interface with the block-device layer proper.
1098  */
1099 static int print_dbug = 1;
1100 static int do_block_io_op(blkif_t *blkif)
1101 {
1102         blkif_back_ring_t *blk_ring = &blkif->blk_ring;
1103         blkif_request_t *req;
1104         pending_req_t *pending_req;
1105         RING_IDX rc, rp;
1106         int more_to_do = 0;
1107         tap_blkif_t *info;
1108
1109         rc = blk_ring->req_cons;
1110         rp = blk_ring->sring->req_prod;
1111         rmb(); /* Ensure we see queued requests up to 'rp'. */
1112
1113         /*Check blkif has corresponding UE ring*/
1114         if (blkif->dev_num == -1) {
1115                 /*oops*/
1116                 if (print_dbug) {
1117                         WPRINTK("Corresponding UE " 
1118                                "ring does not exist!\n");
1119                         print_dbug = 0; /*We only print this message once*/
1120                 }
1121                 return 1; 
1122         }
1123
1124         info = tapfds[blkif->dev_num];
1125         if (info == NULL || !info->dev_inuse) {
1126                 if (print_dbug) {
1127                         WPRINTK("Can't get UE info!\n");
1128                         print_dbug = 0;
1129                 }
1130                 return 1;
1131         }
1132
1133         while (rc != rp) {
1134                 
1135                 if (RING_FULL(&info->ufe_ring)) {
1136                         WPRINTK("RING_FULL! More to do\n");
1137                         more_to_do = 1;
1138                         break;
1139                 }
1140                 
1141                 if (RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
1142                         WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
1143                                " More to do\n");
1144                         more_to_do = 1;
1145                         break;          
1146                 }
1147
1148                 pending_req = alloc_req();
1149                 if (NULL == pending_req) {
1150                         blkif->st_oo_req++;
1151                         more_to_do = 1;
1152                         break;
1153                 }
1154
1155                 req = RING_GET_REQUEST(blk_ring, rc);
1156                 blk_ring->req_cons = ++rc; /* before make_response() */ 
1157
1158                 switch (req->operation) {
1159                 case BLKIF_OP_READ:
1160                         blkif->st_rd_req++;
1161                         dispatch_rw_block_io(blkif, req, pending_req);
1162                         break;
1163
1164                 case BLKIF_OP_WRITE:
1165                         blkif->st_wr_req++;
1166                         dispatch_rw_block_io(blkif, req, pending_req);
1167                         break;
1168
1169                 default:
1170                         WPRINTK("unknown operation [%d]\n",
1171                                 req->operation);
1172                         make_response(blkif, req->id, req->operation,
1173                                       BLKIF_RSP_ERROR);
1174                         free_req(pending_req);
1175                         break;
1176                 }
1177         }
1178                 
1179         blktap_kick_user(blkif->dev_num);
1180
1181         return more_to_do;
1182 }
1183
1184 static void dispatch_rw_block_io(blkif_t *blkif,
1185                                  blkif_request_t *req,
1186                                  pending_req_t *pending_req)
1187 {
1188         extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
1189         int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
1190         struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
1191         unsigned int nseg;
1192         int ret, i;
1193         tap_blkif_t *info = tapfds[blkif->dev_num];
1194         uint64_t sector;
1195         
1196         blkif_request_t *target;
1197         int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
1198         int usr_idx = GET_NEXT_REQ(info->idx_map);
1199         uint16_t mmap_idx = pending_req->mem_idx;
1200
1201         /*Check we have space on user ring - should never fail*/
1202         if(usr_idx == INVALID_REQ) goto fail_flush;
1203         
1204         /* Check that number of segments is sane. */
1205         nseg = req->nr_segments;
1206         if ( unlikely(nseg == 0) || 
1207             unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
1208                 WPRINTK("Bad number of segments in request (%d)\n", nseg);
1209                 goto fail_response;
1210         }
1211         
1212         /* Make sure userspace is ready. */
1213         if (!info->ring_ok) {
1214                 WPRINTK("blktap: ring not ready for requests!\n");
1215                 goto fail_response;
1216         }
1217
1218         if (RING_FULL(&info->ufe_ring)) {
1219                 WPRINTK("blktap: fe_ring is full, can't add "
1220                         "IO Request will be dropped. %d %d\n",
1221                         RING_SIZE(&info->ufe_ring),
1222                         RING_SIZE(&blkif->blk_ring));
1223                 goto fail_response;
1224         }
1225
1226         pending_req->blkif     = blkif;
1227         pending_req->id        = req->id;
1228         pending_req->operation = operation;
1229         pending_req->status    = BLKIF_RSP_OKAY;
1230         pending_req->nr_pages  = nseg;
1231         op = 0;
1232         for (i = 0; i < nseg; i++) {
1233                 unsigned long uvaddr;
1234                 unsigned long kvaddr;
1235                 uint64_t ptep;
1236                 struct page *page;
1237                 uint32_t flags;
1238
1239                 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
1240                 kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, 
1241                                     pending_idx, i);
1242                 page = virt_to_page(kvaddr);
1243
1244                 sector = req->sector_number + (8*i);
1245                 if( (blkif->sectors > 0) && (sector >= blkif->sectors) ) {
1246                         WPRINTK("BLKTAP: Sector request greater" 
1247                                "than size\n");
1248                         WPRINTK("BLKTAP: %s request sector" 
1249                                "[%llu,%llu], Total [%llu]\n",
1250                                (req->operation == 
1251                                 BLKIF_OP_WRITE ? "WRITE" : "READ"),
1252                                 (long long unsigned) sector,
1253                                 (long long unsigned) sector>>9,
1254                                 blkif->sectors);
1255                 }
1256
1257                 flags = GNTMAP_host_map;
1258                 if (operation == WRITE)
1259                         flags |= GNTMAP_readonly;
1260                 gnttab_set_map_op(&map[op], kvaddr, flags,
1261                                   req->seg[i].gref, blkif->domid);
1262                 op++;
1263
1264                 /* Now map it to user. */
1265                 ret = create_lookup_pte_addr(info->vma->vm_mm, 
1266                                              uvaddr, &ptep);
1267                 if (ret) {
1268                         WPRINTK("Couldn't get a pte addr!\n");
1269                         fast_flush_area(pending_req, pending_idx, usr_idx, 
1270                                         blkif->dev_num);
1271                         goto fail_flush;
1272                 }
1273
1274                 flags = GNTMAP_host_map | GNTMAP_application_map
1275                         | GNTMAP_contains_pte;
1276                 if (operation == WRITE)
1277                         flags |= GNTMAP_readonly;
1278                 gnttab_set_map_op(&map[op], ptep, flags,
1279                                   req->seg[i].gref, blkif->domid);
1280                 op++;
1281         }
1282
1283         ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
1284         BUG_ON(ret);
1285
1286         for (i = 0; i < (nseg*2); i+=2) {
1287                 unsigned long uvaddr;
1288                 unsigned long kvaddr;
1289                 unsigned long offset;
1290                 struct page *pg;
1291
1292                 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
1293                 kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, 
1294                                     pending_idx, i/2);
1295
1296                 if (unlikely(map[i].status != 0)) {
1297                         WPRINTK("invalid kernel buffer -- "
1298                                 "could not remap it\n");
1299                         goto fail_flush;
1300                 }
1301
1302                 if (unlikely(map[i+1].status != 0)) {
1303                         WPRINTK("invalid user buffer -- "
1304                                 "could not remap it\n");
1305                         goto fail_flush;
1306                 }
1307
1308                 pending_handle(mmap_idx, pending_idx, i/2).kernel 
1309                         = map[i].handle;
1310                 pending_handle(mmap_idx, pending_idx, i/2).user   
1311                         = map[i+1].handle;
1312                 set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
1313                         FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
1314                 offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
1315                 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1316                 ((struct page **)info->vma->vm_private_data)[offset] =
1317                         pg;
1318         }
1319         /* Mark mapped pages as reserved: */
1320         for (i = 0; i < req->nr_segments; i++) {
1321                 unsigned long kvaddr;
1322                 struct page *pg;
1323
1324                 kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, 
1325                                     pending_idx, i);
1326                 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1327                 SetPageReserved(pg);
1328         }
1329         
1330         /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
1331         info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);
1332
1333         blkif_get(blkif);
1334         /* Finally, write the request message to the user ring. */
1335         target = RING_GET_REQUEST(&info->ufe_ring,
1336                                   info->ufe_ring.req_prod_pvt);
1337         memcpy(target, req, sizeof(*req));
1338         target->id = usr_idx;
1339         info->ufe_ring.req_prod_pvt++;
1340         return;
1341
1342  fail_flush:
1343         WPRINTK("Reached Fail_flush\n");
1344         fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
1345  fail_response:
1346         make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
1347         free_req(pending_req);
1348
1349
1350
1351
1352 /******************************************************************
1353  * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
1354  */
1355
1356
1357 static void make_response(blkif_t *blkif, unsigned long id, 
1358                           unsigned short op, int st)
1359 {
1360         blkif_response_t *resp;
1361         unsigned long     flags;
1362         blkif_back_ring_t *blk_ring = &blkif->blk_ring;
1363         int more_to_do = 0;
1364         int notify;
1365
1366         spin_lock_irqsave(&blkif->blk_ring_lock, flags);
1367         /* Place on the response ring for the relevant domain. */ 
1368         resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
1369         resp->id        = id;
1370         resp->operation = op;
1371         resp->status    = st;
1372         blk_ring->rsp_prod_pvt++;
1373         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
1374
1375         if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
1376                 /*
1377                  * Tail check for pending requests. Allows frontend to avoid
1378                  * notifications if requests are already in flight (lower
1379                  * overheads and promotes batching).
1380                  */
1381                 RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
1382         } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
1383                 more_to_do = 1;
1384
1385         }       
1386         spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
1387         if (more_to_do)
1388                 blkif_notify_work(blkif);
1389         if (notify)
1390                 notify_remote_via_irq(blkif->irq);
1391 }
1392
1393 static int __init blkif_init(void)
1394 {
1395         int i,ret,blktap_dir;
1396         tap_blkif_t *info;
1397
1398         if (!is_running_on_xen())
1399                 return -ENODEV;
1400
1401         INIT_LIST_HEAD(&pending_free);
1402         for(i = 0; i < 2; i++) {
1403                 ret = req_increase();
1404                 if (ret)
1405                         break;
1406         }
1407         if (i == 0)
1408                 return ret;
1409
1410         tap_blkif_interface_init();
1411
1412         alloc_pending_reqs = 0;
1413
1414         tap_blkif_xenbus_init();
1415
1416         /*Create the blktap devices, but do not map memory or waitqueue*/
1417         for(i = 0; i < MAX_TAP_DEV; i++) translate_domid[i].domid = 0xFFFF;
1418
1419         ret = register_chrdev(BLKTAP_DEV_MAJOR,"blktap",&blktap_fops);
1420         blktap_dir = devfs_mk_dir(NULL, "xen", 0, NULL);
1421
1422         if ( (ret < 0)||(blktap_dir < 0) ) {
1423                 WPRINTK("Couldn't register /dev/xen/blktap\n");
1424                 return -ENOMEM;
1425         }       
1426         
1427         for(i = 0; i < MAX_TAP_DEV; i++ ) {
1428                 info = tapfds[i] = kzalloc(sizeof(tap_blkif_t),GFP_KERNEL);
1429                 if(tapfds[i] == NULL) return -ENOMEM;
1430                 info->minor = i;
1431                 info->pid = 0;
1432                 info->blkif = NULL;
1433
1434                 ret = devfs_mk_cdev(MKDEV(BLKTAP_DEV_MAJOR, i),
1435                         S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", i);
1436
1437                 if(ret != 0) return -ENOMEM;
1438                 info->dev_pending = info->dev_inuse = 0;
1439
1440                 DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i);
1441         }
1442         
1443         DPRINTK("Blktap device successfully created\n");
1444
1445         return 0;
1446 }
1447
1448 module_init(blkif_init);
1449
1450 MODULE_LICENSE("Dual BSD/GPL");