1 /******************************************************************************
2  * drivers/xen/blktap/blktap.c
3  * 
4  * Back-end driver for user level virtual block devices. This portion of the
5  * driver exports a 'unified' block-device interface that can be accessed
6  * by any operating system that implements a compatible front end. Requests
7  * are remapped to a user-space memory region.
8  *
9  * Based on the blkback driver code.
10  * 
11  * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
12  *
13  * This program is free software; you can redistribute it and/or
14  * modify it under the terms of the GNU General Public License version 2
15  * as published by the Free Software Foundation; or, when distributed
16  * separately from the Linux kernel or incorporated into other
17  * software packages, subject to the following license:
18  * 
19  * Permission is hereby granted, free of charge, to any person obtaining a copy
20  * of this source file (the "Software"), to deal in the Software without
21  * restriction, including without limitation the rights to use, copy, modify,
22  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
23  * and to permit persons to whom the Software is furnished to do so, subject to
24  * the following conditions:
25  * 
26  * The above copyright notice and this permission notice shall be included in
27  * all copies or substantial portions of the Software.
28  * 
29  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
34  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
35  * IN THE SOFTWARE.
36  */
37
38 #include <linux/spinlock.h>
39 #include <linux/kthread.h>
40 #include <linux/list.h>
41 #include <asm/hypervisor.h>
42 #include "common.h"
43 #include <xen/balloon.h>
44 #include <linux/kernel.h>
45 #include <linux/fs.h>
46 #include <linux/mm.h>
47 #include <linux/errno.h>
48 #include <linux/major.h>
49 #include <linux/gfp.h>
50 #include <linux/poll.h>
51 #include <linux/init.h>
52 #include <asm/tlbflush.h>
53
54 #define MAX_TAP_DEV 100     /*the maximum number of tapdisk ring devices    */
55 #define MAX_DEV_NAME 100    /*the max tapdisk ring device name e.g. blktap0 */
56
57
58 struct class *xen_class;
59 EXPORT_SYMBOL_GPL(xen_class);
60
61 /*
62  * Setup the xen class.  This should probably go in another file, but
63  * since blktap is the only user of it so far, it gets to keep it.
64  */
65 int setup_xen_class(void)
66 {
67         int ret;
68
69         if (xen_class)
70                 return 0;
71
72         xen_class = class_create(THIS_MODULE, "xen");
73         if (IS_ERR(xen_class)) {
74                 ret = PTR_ERR(xen_class);
75                 xen_class = NULL;
76                 return ret;
77         }
78         return 0;
79 }
80
81 /*
82  * The maximum number of requests that can be outstanding at any time
83  * is mmap_alloc * MAX_PENDING_REQS; servicing them can require up to
84  *
85  *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
86  *
87  * mapped pages, where mmap_alloc < MAX_DYNAMIC_MEM.
88  *
89  * TODO:
90  * mmap_alloc is initialised to 2 and should be adjustable on the fly via
91  * sysfs.
92  */
93 #define MAX_DYNAMIC_MEM 64
94 #define MAX_PENDING_REQS 64   
95 #define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
96 #define MMAP_VADDR(_start, _req,_seg)                                   \
97         (_start +                                                       \
98          ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
99          ((_seg) * PAGE_SIZE))
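/*
 * Worked example (illustrative; assumes the usual value of
 * BLKIF_MAX_SEGMENTS_PER_REQUEST == 11 and 4KB pages): one allocation
 * bucket provides MMAP_PAGES = 64 * 11 = 704 data pages, roughly 2.75MB
 * of tapdisk address space.  Request slot 3, segment 2 then maps at
 *
 *   MMAP_VADDR(user_vstart, 3, 2)
 *     = user_vstart + 3 * 11 * PAGE_SIZE + 2 * PAGE_SIZE
 *     = user_vstart + 143360
 */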
100 static int blkif_reqs = MAX_PENDING_REQS;
101 module_param(blkif_reqs, int, 0);
102
103 static int mmap_pages = MMAP_PAGES;
104
105 #define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
106                       * reserve RING_PAGES pages for the shared memory
107                       * ring.
108                       */
109
110 /*Data struct associated with each of the tapdisk devices*/
111 typedef struct tap_blkif {
112         struct vm_area_struct *vma;   /*Shared memory area                   */
113         unsigned long rings_vstart;   /*Kernel memory mapping                */
114         unsigned long user_vstart;    /*User memory mapping                  */
115         unsigned long dev_inuse;      /*One process opens device at a time.  */
116         unsigned long dev_pending;    /*In process of being opened           */
117         unsigned long ring_ok;        /*make this ring->state                */
118         blkif_front_ring_t ufe_ring;  /*Rings up to user space.              */
119         wait_queue_head_t wait;       /*for poll                             */
120         unsigned long mode;           /*current switching mode               */
121         int minor;                    /*Minor number for tapdisk device      */
122         pid_t pid;                    /*tapdisk process id                   */
123         enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace 
124                                                   shutdown                   */
125         unsigned long *idx_map;       /*Record the user ring id to kern 
126                                         [req id, idx] tuple                  */
127         blkif_t *blkif;               /*Associate blkif with tapdev          */
128         int sysfs_set;                /*Set if it has a class device.        */
129 } tap_blkif_t;
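/*
 * Request flow, in brief: a frontend request arrives on the blkif ring and
 * is picked up by do_block_io_op(), which takes a pending_req_t, grant-maps
 * each segment into both the kernel mmap area and the tapdisk process's
 * VMA, and forwards a re-ID'd copy of the request on the ufe_ring.  tapdisk
 * notices via poll(), services the I/O in user space, posts its response
 * and calls ioctl(BLKTAP_IOCTL_KICK_FE); blktap_read_ufe_ring() then tears
 * down the mappings and sends the final response back to the frontend.
 */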
130
131 /*Data struct handed back to userspace for tapdisk device to VBD mapping*/
132 typedef struct domid_translate {
133         unsigned short domid;
134         unsigned short busid;
135 } domid_translate_t;
136
137 static domid_translate_t  translate_domid[MAX_TAP_DEV];
138 static tap_blkif_t *tapfds[MAX_TAP_DEV];
139
140 /* Run-time switchable: /sys/module/blktap/parameters/ */
141 static int log_stats = 0;
142 static int debug_lvl = 0;
143 module_param(log_stats, int, 0644);
144 module_param(debug_lvl, int, 0644);
145
146 /*
147  * Each outstanding request that we've passed to the lower device layers has a 
148  * 'pending_req' allocated to it. Each buffer_head that completes decrements 
149  * the pendcnt towards zero. When it hits zero, the specified domain has a 
150  * response queued for it, with the saved 'id' passed back.
151  */
152 typedef struct {
153         blkif_t       *blkif;
154         unsigned long  id;
155         unsigned short mem_idx;
156         int            nr_pages;
157         atomic_t       pendcnt;
158         unsigned short operation;
159         int            status;
160         struct list_head free_list;
161         int            inuse;
162 } pending_req_t;
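/*
 * pending_req_t lifetime: taken off the pending_free list by alloc_req()
 * in do_block_io_op(), tracked via idx_map[] while the request is out with
 * tapdisk, and handed back by free_req() once blktap_read_ufe_ring() has
 * consumed the matching response.
 */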
163
164 static pending_req_t *pending_reqs[MAX_PENDING_REQS];
165 static struct list_head pending_free;
166 static DEFINE_SPINLOCK(pending_free_lock);
167 static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
168 static int alloc_pending_reqs;
169
170 typedef unsigned int PEND_RING_IDX;
171
172 static inline int MASK_PEND_IDX(int i) { 
173         return (i & (MAX_PENDING_REQS-1));
174 }
175
176 static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
177         return (req - pending_reqs[idx]);
178 }
179
180 #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
181
182 #define BLKBACK_INVALID_HANDLE (~0)
183
184 static struct page **foreign_pages[MAX_DYNAMIC_MEM];
185 static inline unsigned long idx_to_kaddr(
186         unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
187 {
188         unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
189         unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]);
190         return (unsigned long)pfn_to_kaddr(pfn);
191 }
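/*
 * Each foreign segment therefore has two linear addresses: the kernel one
 * computed here from foreign_pages[mmap_idx], and the user one computed by
 * MMAP_VADDR() inside the tapdisk mapping.  For example (assuming 11
 * segments per request), request 3 / segment 2 is foreign_pages[mmap_idx][35]
 * on the kernel side and MMAP_VADDR(user_vstart, 3, 2) on the user side.
 */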
192
193 static unsigned short mmap_alloc = 0;
194 static unsigned short mmap_lock = 0;
195 static unsigned short mmap_inuse = 0;
196
197 /******************************************************************
198  * GRANT HANDLES
199  */
200
201 /* When using grant tables to map a frame for device access then the
202  * handle returned must be used to unmap the frame. This is needed to
203  * drop the ref count on the frame.
204  */
205 struct grant_handle_pair
206 {
207         grant_handle_t kernel;
208         grant_handle_t user;
209 };
210
211 static struct grant_handle_pair 
212     pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
213 #define pending_handle(_id, _idx, _i) \
214     (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
215     + (_i)])
216
217
218 static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/
219
220 #define BLKTAP_MINOR 0  /*/dev/xen/blktap has a dynamic major */
221 #define BLKTAP_DEV_DIR  "/dev/xen"
222
223 static int blktap_major;
224
225 /* blktap IOCTLs: */
226 #define BLKTAP_IOCTL_KICK_FE         1
227 #define BLKTAP_IOCTL_KICK_BE         2 /* currently unused */
228 #define BLKTAP_IOCTL_SETMODE         3
229 #define BLKTAP_IOCTL_SENDPID         4
230 #define BLKTAP_IOCTL_NEWINTF         5
231 #define BLKTAP_IOCTL_MINOR           6
232 #define BLKTAP_IOCTL_MAJOR           7
233 #define BLKTAP_QUERY_ALLOC_REQS      8
234 #define BLKTAP_IOCTL_FREEINTF        9
235 #define BLKTAP_IOCTL_PRINT_IDXS      100  
236
237 /* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
238 #define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
239 #define BLKTAP_MODE_INTERCEPT_FE     0x00000001
240 #define BLKTAP_MODE_INTERCEPT_BE     0x00000002  /* unimp.             */
241
242 #define BLKTAP_MODE_INTERPOSE \
243            (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
244
245
246 static inline int BLKTAP_MODE_VALID(unsigned long arg)
247 {
248         return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
249                 (arg == BLKTAP_MODE_INTERCEPT_FE) ||
250                 (arg == BLKTAP_MODE_INTERPOSE   ));
251 }
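/*
 * Minimal sketch of the control handshake a tapdisk-like process might
 * perform against this interface.  Illustrative only: the real tapdisk /
 * blktaplib flow differs in detail, and the argument packing below simply
 * mirrors the little-endian domid_translate_t layout consumed by
 * BLKTAP_IOCTL_NEWINTF.
 *
 *   int ctrl = open("/dev/xen/blktap0", O_RDWR);
 *   unsigned long arg = (unsigned long)domid | ((unsigned long)busid << 16);
 *   int dev = ioctl(ctrl, BLKTAP_IOCTL_NEWINTF, arg);  // new device index
 *   char path[64];
 *   snprintf(path, sizeof(path), "/dev/xen/blktap%d", dev);
 *   int fd = open(path, O_RDWR);
 *   ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE);
 *   ioctl(fd, BLKTAP_IOCTL_SENDPID, getpid());
 */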
252
253 /* Requests passing through the tap to userspace are re-assigned an ID.
254  * We must record a mapping between the BE [IDX,ID] tuple and the userspace
255  * ring ID. 
256  */
257
258 static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
259 {
260         return ((fe_dom << 16) | MASK_PEND_IDX(idx));
261 }
262
263 static inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
264 {
265         return (PEND_RING_IDX)(id & 0x0000ffff);
266 }
267
268 static inline int ID_TO_MIDX(unsigned long id)
269 {
270         return (int)(id >> 16);
271 }
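/*
 * Example: with MAX_PENDING_REQS == 64, MAKE_ID(5, 67) masks the index down
 * to 3 and yields 0x00050003; ID_TO_MIDX() recovers 5 and ID_TO_IDX()
 * recovers 3.  Note that the "domid" slot is in fact used to carry the mmap
 * bucket (mem_idx) when entries are recorded in idx_map[].
 */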
272
273 #define INVALID_REQ 0xdead0000
274
275 /*TODO: Convert to a free list*/
276 static inline int GET_NEXT_REQ(unsigned long *idx_map)
277 {
278         int i;
279         for (i = 0; i < MAX_PENDING_REQS; i++)
280                 if (idx_map[i] == INVALID_REQ)
281                         return i;
282
283         return INVALID_REQ;
284 }
285
286
287 #define BLKTAP_INVALID_HANDLE(_g) \
288     ((((_g)->kernel) == 0xFFFF) && (((_g)->user) == 0xFFFF))
289
290 #define BLKTAP_INVALIDATE_HANDLE(_g) do {       \
291     (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \
292     } while(0)
293
294
295 /******************************************************************
296  * BLKTAP VM OPS
297  */
298
299 static struct page *blktap_nopage(struct vm_area_struct *vma,
300                                   unsigned long address,
301                                   int *type)
302 {
303         /*
304          * if the page has not been mapped in by the driver then return
305          * NOPAGE_SIGBUS to the domain.
306          */
307
308         return NOPAGE_SIGBUS;
309 }
310
311 struct vm_operations_struct blktap_vm_ops = {
312         .nopage = blktap_nopage,
313 };
314
315 /******************************************************************
316  * BLKTAP FILE OPS
317  */
318  
319 /*Function Declarations*/
320 static int get_next_free_dev(void);
321 static int blktap_open(struct inode *inode, struct file *filp);
322 static int blktap_release(struct inode *inode, struct file *filp);
323 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
324 static int blktap_ioctl(struct inode *inode, struct file *filp,
325                         unsigned int cmd, unsigned long arg);
326 static unsigned int blktap_poll(struct file *file, poll_table *wait);
327
328 static struct file_operations blktap_fops = {
329         .owner   = THIS_MODULE,
330         .poll    = blktap_poll,
331         .ioctl   = blktap_ioctl,
332         .open    = blktap_open,
333         .release = blktap_release,
334         .mmap    = blktap_mmap,
335 };
336
337
338 static int get_next_free_dev(void)
339 {
340         tap_blkif_t *info;
341         int i = 0, ret = -1;
342         unsigned long flags;
343
344         spin_lock_irqsave(&pending_free_lock, flags);
345         
346         while (i < MAX_TAP_DEV) {
347                 info = tapfds[i];
348                 if ( (tapfds[i] != NULL) && (info->dev_inuse == 0)
349                         && (info->dev_pending == 0) ) {
350                         info->dev_pending = 1;
351                         ret = i;
352                         goto done;
353                 }
354                 i++;
355         }
356         
357 done:
358         spin_unlock_irqrestore(&pending_free_lock, flags);
359
360         /*
361          * We are protected by having the dev_pending set.
362          */
363         if (ret >= 0 && !tapfds[ret]->sysfs_set && xen_class) {
364                 class_device_create(xen_class, NULL,
365                                     MKDEV(blktap_major, ret), NULL,
366                                     "blktap%d", ret);
367                 tapfds[ret]->sysfs_set = 1;
368         }
369         return ret;
370 }
371
372 int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif) 
373 {
374         int i;
375                 
376         for (i = 0; i < MAX_TAP_DEV; i++)
377                 if ( (translate_domid[i].domid == domid)
378                     && (translate_domid[i].busid == xenbus_id) ) {
379                         tapfds[i]->blkif = blkif;
380                         tapfds[i]->status = RUNNING;
381                         return i;
382                 }
383         return -1;
384 }
385
386 void signal_tapdisk(int idx) 
387 {
388         tap_blkif_t *info;
389         struct task_struct *ptask;
390         if ((idx <= 0) || (idx >= MAX_TAP_DEV) || (tapfds[idx] == NULL))
391                 return;
392         info = tapfds[idx];
393         if (info->pid > 0) {
394                 ptask = find_task_by_pid(info->pid);
395                 if (ptask)
396                         info->status = CLEANSHUTDOWN;
397         }
398         info->blkif = NULL;
399 }
400
401 static int blktap_open(struct inode *inode, struct file *filp)
402 {
403         blkif_sring_t *sring;
404         int idx = iminor(inode) - BLKTAP_MINOR;
405         tap_blkif_t *info;
406         int i;
407         
408         if (tapfds[idx] == NULL) {
409                 WPRINTK("Unable to open device /dev/xen/blktap%d\n",
410                        idx);
411                 return -ENOMEM;
412         }
413         DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
414         
415         info = tapfds[idx];
416         
417         /*Only one process can access device at a time*/
418         if (test_and_set_bit(0, &info->dev_inuse))
419                 return -EBUSY;
420
421         info->dev_pending = 0;
422             
423         /* Allocate the fe ring. */
424         sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
425         if (sring == NULL)
426                 goto fail_nomem;
427
428         SetPageReserved(virt_to_page(sring));
429     
430         SHARED_RING_INIT(sring);
431         FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
432         
433         filp->private_data = info;
434         info->vma = NULL;
435
436         info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS,
437                                 GFP_KERNEL);
438         if (info->idx_map == NULL) {
439                 ClearPageReserved(virt_to_page(sring));
440                 free_page((unsigned long)sring);
441                 info->dev_inuse = 0;
442                 goto fail_nomem;
443         }
444 
445         if (idx > 0) {
446                 init_waitqueue_head(&info->wait);
447                 for (i = 0; i < MAX_PENDING_REQS; i++)
448                         info->idx_map[i] = INVALID_REQ;
449         }
444
445         DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
446         return 0;
447
448  fail_nomem:
449         return -ENOMEM;
450 }
451
452 static int blktap_release(struct inode *inode, struct file *filp)
453 {
454         tap_blkif_t *info = filp->private_data;
455         
456         /* can this ever happen? - sdr */
457         if (!info) {
458                 WPRINTK("Trying to free device that doesn't exist "
459                        "[/dev/xen/blktap%d]\n",iminor(inode) - BLKTAP_MINOR);
460                 return -EBADF;
461         }
462         info->dev_inuse = 0;
463         DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);
464
465         /* Free the ring page. */
466         ClearPageReserved(virt_to_page(info->ufe_ring.sring));
467         free_page((unsigned long) info->ufe_ring.sring);
468
469         /* Clear any active mappings and free foreign map table */
470         if (info->vma) {
471                 zap_page_range(
472                         info->vma, info->vma->vm_start, 
473                         info->vma->vm_end - info->vma->vm_start, NULL);
474                 info->vma = NULL;
475         }
476         
477         if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
478                 kthread_stop(info->blkif->xenblkd);
479                 info->blkif->xenblkd = NULL;
480                 info->status = CLEANSHUTDOWN;
481         }       
482         return 0;
483 }
484
485
486 /* Note on mmap:
487  * We need to map pages to user space in a way that will allow the block
488  * subsystem set up direct IO to them.  This couldn't be done before, because
489  * there isn't really a sane way to translate a user virtual address down to a 
490  * physical address when the page belongs to another domain.
491  *
492  * My first approach was to map the page in to kernel memory, add an entry
493  * for it in the physical frame list (using alloc_lomem_region as in blkback)
494  * and then attempt to map that page up to user space.  This is disallowed
495  * by xen though, which realizes that we don't really own the machine frame
496  * underlying the physical page.
497  *
498  * The new approach is to provide explicit support for this in xen linux.
499  * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
500  * mapped from other vms.  vma->vm_private_data is set up as a mapping 
501  * from pages to actual page structs.  There is a new clause in get_user_pages
502  * that does the right thing for this sort of mapping.
503  */
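/*
 * Resulting layout of the tapdisk mapping (RING_PAGES == 1): one shared
 * ring page at rings_vstart, followed by mmap_pages data pages starting at
 * user_vstart and addressed via MMAP_VADDR().  A tapdisk-like process is
 * therefore expected to map the device roughly as follows (illustrative
 * only):
 *
 *   size_t len = (mmap_pages + RING_PAGES) * getpagesize();
 *   void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * Any other length is rejected below with -EAGAIN.
 */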
504 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
505 {
506         int size;
507         struct page **map;
508         int i;
509         tap_blkif_t *info = filp->private_data;
510
511         if (info == NULL) {
512                 WPRINTK("blktap: mmap, retrieving idx failed\n");
513                 return -ENOMEM;
514         }
515         
516         vma->vm_flags |= VM_RESERVED;
517         vma->vm_ops = &blktap_vm_ops;
518
519         size = vma->vm_end - vma->vm_start;
520         if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
521                 WPRINTK("you _must_ map exactly %d pages!\n",
522                        mmap_pages + RING_PAGES);
523                 return -EAGAIN;
524         }
525
526         size >>= PAGE_SHIFT;
527         info->rings_vstart = vma->vm_start;
528         info->user_vstart  = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
529     
530         /* Map the ring pages to the start of the region and reserve it. */
531         vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
532
533         if (remap_pfn_range(vma, vma->vm_start, 
534                             __pa(info->ufe_ring.sring) >> PAGE_SHIFT, 
535                             PAGE_SIZE, vma->vm_page_prot)) {
536                 WPRINTK("Mapping user ring failed!\n");
537                 goto fail;
538         }
539
540         /* Mark this VM as containing foreign pages, and set up mappings. */
541         map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
542                       * sizeof(struct page *),
543                       GFP_KERNEL);
544         if (map == NULL) {
545                 WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
546                 goto fail;
547         }
548
549         for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
550                 map[i] = NULL;
551     
552         vma->vm_private_data = map;
553         vma->vm_flags |= VM_FOREIGN;
554
555         info->vma = vma;
556         info->ring_ok = 1;
557         return 0;
558  fail:
559         /* Clear any active mappings. */
560         zap_page_range(vma, vma->vm_start, 
561                        vma->vm_end - vma->vm_start, NULL);
562
563         return -ENOMEM;
564 }
565
566
567 static int blktap_ioctl(struct inode *inode, struct file *filp,
568                         unsigned int cmd, unsigned long arg)
569 {
570         tap_blkif_t *info = filp->private_data;
571
572         switch(cmd) {
573         case BLKTAP_IOCTL_KICK_FE: 
574         {
575                 /* There are fe messages to process. */
576                 return blktap_read_ufe_ring(info);
577         }
578         case BLKTAP_IOCTL_SETMODE:
579         {
580                 if (info) {
581                         if (BLKTAP_MODE_VALID(arg)) {
582                                 info->mode = arg;
583                                 /* XXX: may need to flush rings here. */
584                                 DPRINTK("blktap: set mode to %lx\n", 
585                                        arg);
586                                 return 0;
587                         }
588                 }
589                 return 0;
590         }
591         case BLKTAP_IOCTL_PRINT_IDXS:
592         {
593                 if (info) {
594                         printk("User Rings: \n-----------\n");
595                         printk("UF: rsp_cons: %2d, req_prod_prv: %2d "
596                                 "| req_prod: %2d, rsp_prod: %2d\n",
597                                 info->ufe_ring.rsp_cons,
598                                 info->ufe_ring.req_prod_pvt,
599                                 info->ufe_ring.sring->req_prod,
600                                 info->ufe_ring.sring->rsp_prod);
601                 }
602                 return 0;
603         }
604         case BLKTAP_IOCTL_SENDPID:
605         {
606                 if (info) {
607                         info->pid = (pid_t)arg;
608                         DPRINTK("blktap: pid received %d\n", 
609                                info->pid);
610                 }
611                 return 0;
612         }
613         case BLKTAP_IOCTL_NEWINTF:
614         {               
615                 uint64_t val = (uint64_t)arg;
616                 domid_translate_t *tr = (domid_translate_t *)&val;
617                 int newdev;
618
619                 DPRINTK("NEWINTF Req for domid %d and bus id %d\n", 
620                        tr->domid, tr->busid);
621                 newdev = get_next_free_dev();
622                 if (newdev < 1) {
623                         WPRINTK("Error initialising /dev/xen/blktap - "
624                                 "No more devices\n");
625                         return -1;
626                 }
627                 translate_domid[newdev].domid = tr->domid;
628                 translate_domid[newdev].busid = tr->busid;
629                 return newdev;
630         }
631         case BLKTAP_IOCTL_FREEINTF:
632         {
633                 unsigned long dev = arg;
634                 unsigned long flags;
635
636                 /* Looking at another device */
637                 info = NULL;
638
639                 if ( (dev > 0) && (dev < MAX_TAP_DEV) )
640                         info = tapfds[dev];
641
642                 spin_lock_irqsave(&pending_free_lock, flags);
643                 if ( (info != NULL) && (info->dev_pending) )
644                         info->dev_pending = 0;
645                 spin_unlock_irqrestore(&pending_free_lock, flags);
646
647                 return 0;
648         }
649         case BLKTAP_IOCTL_MINOR:
650         {
651                 unsigned long dev = arg;
652
653                 /* Looking at another device */
654                 info = NULL;
655                 
656                 if ( (dev > 0) && (dev < MAX_TAP_DEV) )
657                         info = tapfds[dev];
658                 
659                 if (info != NULL)
660                         return info->minor;
661                 else
662                         return -1;
663         }
664         case BLKTAP_IOCTL_MAJOR:
665                 return blktap_major;
666
667         case BLKTAP_QUERY_ALLOC_REQS:
668         {
669                 WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n",
670                        alloc_pending_reqs, blkif_reqs);
671                 return (alloc_pending_reqs/blkif_reqs) * 100;
672         }
673         }
674         return -ENOIOCTLCMD;
675 }
676
677 static unsigned int blktap_poll(struct file *filp, poll_table *wait)
678 {
679         tap_blkif_t *info = filp->private_data;
680         
681         if (!info) {
682                 WPRINTK(" poll, retrieving idx failed\n");
683                 return 0;
684         }
685
686         /* do not work on the control device */
687         if (!info->minor)
688                 return 0;
689
690         poll_wait(filp, &info->wait, wait);
691         if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
692                 RING_PUSH_REQUESTS(&info->ufe_ring);
693                 return POLLIN | POLLRDNORM;
694         }
695         return 0;
696 }
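/*
 * The matching user-space service loop is roughly the following
 * (illustrative sketch; real tapdisk drives the mapped ring with the
 * blkif.h ring macros):
 *
 *   struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *   while (poll(&pfd, 1, -1) > 0) {
 *           // consume requests from the mapped ring page, do the I/O,
 *           // write responses back into the same ring, then ...
 *           ioctl(fd, BLKTAP_IOCTL_KICK_FE, 0);  // -> blktap_read_ufe_ring()
 *   }
 */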
697
698 void blktap_kick_user(int idx)
699 {
700         tap_blkif_t *info;
701
702         if (idx == 0)
703                 return;
704         
705         info = tapfds[idx];
706         
707         if (info != NULL)
708                 wake_up_interruptible(&info->wait);
709
710         return;
711 }
712
713 static int do_block_io_op(blkif_t *blkif);
714 static void dispatch_rw_block_io(blkif_t *blkif,
715                                  blkif_request_t *req,
716                                  pending_req_t *pending_req);
717 static void make_response(blkif_t *blkif, unsigned long id, 
718                           unsigned short op, int st);
719
720 /******************************************************************
721  * misc small helpers
722  */
723 static int req_increase(void)
724 {
725         int i, j;
726
727         if (mmap_alloc >= MAX_DYNAMIC_MEM || mmap_lock)
728                 return -EINVAL;
729
730         pending_reqs[mmap_alloc]  = kzalloc(sizeof(pending_req_t)
731                                             * blkif_reqs, GFP_KERNEL);
732         foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages);
733
734         if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc])
735                 goto out_of_memory;
736
737         DPRINTK("%s: reqs=%d, pages=%d\n",
738                 __FUNCTION__, blkif_reqs, mmap_pages);
739
740         for (i = 0; i < MAX_PENDING_REQS; i++) {
741                 list_add_tail(&pending_reqs[mmap_alloc][i].free_list, 
742                               &pending_free);
743                 pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
744                 for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
745                         BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc, 
746                                                                  i, j));
747         }
748
749         mmap_alloc++;
750         DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
751         return 0;
752
753  out_of_memory:
754         free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
755         kfree(pending_reqs[mmap_alloc]);
756         WPRINTK("%s: out of memory\n", __FUNCTION__);
757         return -ENOMEM;
758 }
759
760 static void mmap_req_del(int mmap)
761 {
762         BUG_ON(!spin_is_locked(&pending_free_lock));
763
764         kfree(pending_reqs[mmap]);
765         pending_reqs[mmap] = NULL;
766
767         free_empty_pages_and_pagevec(foreign_pages[mmap], mmap_pages);
768         foreign_pages[mmap] = NULL;
769
770         mmap_lock = 0;
771         mmap_alloc--;
772         DPRINTK("# MMAPs decreased to %d\n", mmap_alloc);
773 }
774
775 static pending_req_t* alloc_req(void)
776 {
777         pending_req_t *req = NULL;
778         unsigned long flags;
779
780         spin_lock_irqsave(&pending_free_lock, flags);
781
782         if (!list_empty(&pending_free)) {
783                 req = list_entry(pending_free.next, pending_req_t, free_list);
784                 list_del(&req->free_list);
785         }
786
787         if (req) {
788                 req->inuse = 1;
789                 alloc_pending_reqs++;
790         }
791         spin_unlock_irqrestore(&pending_free_lock, flags);
792
793         return req;
794 }
795
796 static void free_req(pending_req_t *req)
797 {
798         unsigned long flags;
799         int was_empty;
800
801         spin_lock_irqsave(&pending_free_lock, flags);
802
803         alloc_pending_reqs--;
804         req->inuse = 0;
805         if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
806                 mmap_inuse--;
807                 if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
808                 spin_unlock_irqrestore(&pending_free_lock, flags);
809                 return;
810         }
811         was_empty = list_empty(&pending_free);
812         list_add(&req->free_list, &pending_free);
813
814         spin_unlock_irqrestore(&pending_free_lock, flags);
815
816         if (was_empty)
817                 wake_up(&pending_free_wq);
818 }
819
820 static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx, int 
821                             tapidx)
822 {
823         struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
824         unsigned int i, invcount = 0;
825         struct grant_handle_pair *khandle;
826         uint64_t ptep;
827         int ret, mmap_idx;
828         unsigned long kvaddr, uvaddr;
829
830         tap_blkif_t *info = tapfds[tapidx];
831         
832         if (info == NULL) {
833                 WPRINTK("fast_flush: Couldn't get info!\n");
834                 return;
835         }
836         mmap_idx = req->mem_idx;
837
838         for (i = 0; i < req->nr_pages; i++) {
839                 kvaddr = idx_to_kaddr(mmap_idx, k_idx, i);
840                 uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
841
842                 khandle = &pending_handle(mmap_idx, k_idx, i);
843
844                 if (khandle->kernel != 0xFFFF) {
845                         gnttab_set_unmap_op(&unmap[invcount],
846                                             idx_to_kaddr(mmap_idx, k_idx, i),
847                                             GNTMAP_host_map, khandle->kernel);
848                         invcount++;
849                 }
850
851                 if (khandle->user != 0xFFFF) {
852                         if (create_lookup_pte_addr(
853                                 info->vma->vm_mm,
854                                 MMAP_VADDR(info->user_vstart, u_idx, i),
855                                 &ptep) !=0) {
856                                 WPRINTK("Couldn't get a pte addr!\n");
857                                 return;
858                         }
859
860                         gnttab_set_unmap_op(&unmap[invcount], ptep,
861                                 GNTMAP_host_map |
862                                 GNTMAP_application_map |
863                                 GNTMAP_contains_pte,
864                                 khandle->user);
865                         invcount++;
866                 }
867
868                 BLKTAP_INVALIDATE_HANDLE(khandle);
869         }
870         ret = HYPERVISOR_grant_table_op(
871                 GNTTABOP_unmap_grant_ref, unmap, invcount);
872         BUG_ON(ret);
873         
874         if (info->vma != NULL)
875                 zap_page_range(info->vma, 
876                                MMAP_VADDR(info->user_vstart, u_idx, 0), 
877                                req->nr_pages << PAGE_SHIFT, NULL);
878 }
879
880 /******************************************************************
881  * SCHEDULER FUNCTIONS
882  */
883
884 static void print_stats(blkif_t *blkif)
885 {
886         printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
887                current->comm, blkif->st_oo_req,
888                blkif->st_rd_req, blkif->st_wr_req);
889         blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
890         blkif->st_rd_req = 0;
891         blkif->st_wr_req = 0;
892         blkif->st_oo_req = 0;
893 }
894
895 int tap_blkif_schedule(void *arg)
896 {
897         blkif_t *blkif = arg;
898
899         blkif_get(blkif);
900
901         if (debug_lvl)
902                 printk(KERN_DEBUG "%s: started\n", current->comm);
903
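        /*
         * Two conditions gate each pass below: the frontend must have
         * signalled work (blkif->waiting_reqs) and at least one
         * pending_req_t must be free, since do_block_io_op() cannot
         * forward a request without one.
         */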
904         while (!kthread_should_stop()) {
905                 wait_event_interruptible(
906                         blkif->wq,
907                         blkif->waiting_reqs || kthread_should_stop());
908                 wait_event_interruptible(
909                         pending_free_wq,
910                         !list_empty(&pending_free) || kthread_should_stop());
911
912                 blkif->waiting_reqs = 0;
913                 smp_mb(); /* clear flag *before* checking for work */
914
915                 if (do_block_io_op(blkif))
916                         blkif->waiting_reqs = 1;
917
918                 if (log_stats && time_after(jiffies, blkif->st_print))
919                         print_stats(blkif);
920         }
921
922         if (log_stats)
923                 print_stats(blkif);
924         if (debug_lvl)
925                 printk(KERN_DEBUG "%s: exiting\n", current->comm);
926
927         blkif->xenblkd = NULL;
928         blkif_put(blkif);
929
930         return 0;
931 }
932
933 /******************************************************************
934  * COMPLETION CALLBACK -- Called by user level ioctl()
935  */
936
937 static int blktap_read_ufe_ring(tap_blkif_t *info)
938 {
939         /* This is called to read responses from the UFE ring. */
940         RING_IDX i, j, rp;
941         blkif_response_t *resp;
942         blkif_t *blkif=NULL;
943         int pending_idx, usr_idx, mmap_idx;
944         pending_req_t *pending_req;
945         
946         if (!info)
947                 return 0;
948
949         /* We currently only forward packets in INTERCEPT_FE mode. */
950         if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
951                 return 0;
952
953         /* for each outstanding message on the UFEring  */
954         rp = info->ufe_ring.sring->rsp_prod;
955         rmb();
956         
957         for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
958                 resp = RING_GET_RESPONSE(&info->ufe_ring, i);
959                 ++info->ufe_ring.rsp_cons;
960
961                 /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
962                 usr_idx = (int)resp->id;
963                 pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
964                 mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
965
966                 if ( (mmap_idx >= mmap_alloc) || 
967                    (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) )
968                         WPRINTK("Incorrect req map"
969                                "[%d], internal map [%d,%d (%d)]\n", 
970                                usr_idx, mmap_idx, 
971                                ID_TO_IDX(info->idx_map[usr_idx]),
972                                MASK_PEND_IDX(
973                                        ID_TO_IDX(info->idx_map[usr_idx])));
974
975                 pending_req = &pending_reqs[mmap_idx][pending_idx];
976                 blkif = pending_req->blkif;
977
978                 for (j = 0; j < pending_req->nr_pages; j++) {
979
980                         unsigned long kvaddr, uvaddr;
981                         struct page **map = info->vma->vm_private_data;
982                         struct page *pg;
983                         int offset;
984
985                         uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
986                         kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j);
987
988                         pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
989                         ClearPageReserved(pg);
990                         offset = (uvaddr - info->vma->vm_start) 
991                                 >> PAGE_SHIFT;
992                         map[offset] = NULL;
993                 }
994                 fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
995                 make_response(blkif, pending_req->id, resp->operation,
996                               resp->status);
997                 info->idx_map[usr_idx] = INVALID_REQ;
998                 blkif_put(pending_req->blkif);
999                 free_req(pending_req);
1000         }
1001                 
1002         return 0;
1003 }
1004
1005
1006 /******************************************************************************
1007  * NOTIFICATION FROM GUEST OS.
1008  */
1009
1010 static void blkif_notify_work(blkif_t *blkif)
1011 {
1012         blkif->waiting_reqs = 1;
1013         wake_up(&blkif->wq);
1014 }
1015
1016 irqreturn_t tap_blkif_be_int(int irq, void *dev_id)
1017 {
1018         blkif_notify_work(dev_id);
1019         return IRQ_HANDLED;
1020 }
1021
1022
1023
1024 /******************************************************************
1025  * DOWNWARD CALLS -- These interface with the block-device layer proper.
1026  */
1027 static int print_dbug = 1;
1028 static int do_block_io_op(blkif_t *blkif)
1029 {
1030         blkif_back_ring_t *blk_ring = &blkif->blk_ring;
1031         blkif_request_t req;
1032         pending_req_t *pending_req;
1033         RING_IDX rc, rp;
1034         int more_to_do = 0;
1035         tap_blkif_t *info;
1036
1037         rc = blk_ring->req_cons;
1038         rp = blk_ring->sring->req_prod;
1039         rmb(); /* Ensure we see queued requests up to 'rp'. */
1040
1041         /*Check blkif has corresponding UE ring*/
1042         if (blkif->dev_num == -1) {
1043                 /*oops*/
1044                 if (print_dbug) {
1045                         WPRINTK("Corresponding UE " 
1046                                "ring does not exist!\n");
1047                         print_dbug = 0; /*We only print this message once*/
1048                 }
1049                 return 0;
1050         }
1051
1052         info = tapfds[blkif->dev_num];
1053         if (info == NULL || !info->dev_inuse) {
1054                 if (print_dbug) {
1055                         WPRINTK("Can't get UE info!\n");
1056                         print_dbug = 0;
1057                 }
1058                 return 0;
1059         }
1060
1061         while (rc != rp) {
1062                 
1063                 if (RING_FULL(&info->ufe_ring)) {
1064                         WPRINTK("RING_FULL! More to do\n");
1065                         more_to_do = 1;
1066                         break;
1067                 }
1068                 
1069                 if (RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
1070                         WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
1071                                " More to do\n");
1072                         more_to_do = 1;
1073                         break;          
1074                 }
1075
1076                 pending_req = alloc_req();
1077                 if (NULL == pending_req) {
1078                         blkif->st_oo_req++;
1079                         more_to_do = 1;
1080                         break;
1081                 }
1082
1083                 memcpy(&req, RING_GET_REQUEST(blk_ring, rc), sizeof(req));
1084                 blk_ring->req_cons = ++rc; /* before make_response() */ 
1085
1086                 switch (req.operation) {
1087                 case BLKIF_OP_READ:
1088                         blkif->st_rd_req++;
1089                         dispatch_rw_block_io(blkif, &req, pending_req);
1090                         break;
1091
1092                 case BLKIF_OP_WRITE:
1093                         blkif->st_wr_req++;
1094                         dispatch_rw_block_io(blkif, &req, pending_req);
1095                         break;
1096
1097                 default:
1098                         WPRINTK("unknown operation [%d]\n",
1099                                 req.operation);
1100                         make_response(blkif, req.id, req.operation,
1101                                       BLKIF_RSP_ERROR);
1102                         free_req(pending_req);
1103                         break;
1104                 }
1105         }
1106                 
1107         blktap_kick_user(blkif->dev_num);
1108
1109         return more_to_do;
1110 }
1111
1112 static void dispatch_rw_block_io(blkif_t *blkif,
1113                                  blkif_request_t *req,
1114                                  pending_req_t *pending_req)
1115 {
1116         extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
1117         int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
1118         struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
1119         unsigned int nseg;
1120         int ret, i;
1121         tap_blkif_t *info = tapfds[blkif->dev_num];
1122         uint64_t sector;
1123         
1124         blkif_request_t *target;
1125         int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
1126         int usr_idx = GET_NEXT_REQ(info->idx_map);
1127         uint16_t mmap_idx = pending_req->mem_idx;
1128
1129         /* Check we have space on user ring - should never fail. */
1130         if (usr_idx == INVALID_REQ)
1131                 goto fail_response;
1132
1133         /* Check that number of segments is sane. */
1134         nseg = req->nr_segments;
1135         if ( unlikely(nseg == 0) || 
1136             unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
1137                 WPRINTK("Bad number of segments in request (%d)\n", nseg);
1138                 goto fail_response;
1139         }
1140         
1141         /* Make sure userspace is ready. */
1142         if (!info->ring_ok) {
1143                 WPRINTK("blktap: ring not ready for requests!\n");
1144                 goto fail_response;
1145         }
1146
1147         if (RING_FULL(&info->ufe_ring)) {
1148                 WPRINTK("blktap: fe_ring is full, can't add request; "
1149                         "IO will be dropped. %d %d\n",
1150                         RING_SIZE(&info->ufe_ring),
1151                         RING_SIZE(&blkif->blk_ring));
1152                 goto fail_response;
1153         }
1154
1155         pending_req->blkif     = blkif;
1156         pending_req->id        = req->id;
1157         pending_req->operation = operation;
1158         pending_req->status    = BLKIF_RSP_OKAY;
1159         pending_req->nr_pages  = nseg;
1160         op = 0;
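        /*
         * Each segment needs two grant-map operations: a GNTMAP_host_map
         * mapping into the kernel area at kvaddr, and a GNTMAP_contains_pte
         * mapping that installs the same frame into the tapdisk process's
         * page table at uvaddr -- hence up to nseg * 2 entries in map[].
         */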
1161         for (i = 0; i < nseg; i++) {
1162                 unsigned long uvaddr;
1163                 unsigned long kvaddr;
1164                 uint64_t ptep;
1165                 struct page *page;
1166                 uint32_t flags;
1167
1168                 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
1169                 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
1170                 page = virt_to_page(kvaddr);
1171
1172                 sector = req->sector_number + (8*i);
1173                 if( (blkif->sectors > 0) && (sector >= blkif->sectors) ) {
1174                         WPRINTK("BLKTAP: Sector request greater "
1175                                "than size\n");
1176                         WPRINTK("BLKTAP: %s request sector "
1177                                "[%llu,%llu], Total [%llu]\n",
1178                                (req->operation == 
1179                                 BLKIF_OP_WRITE ? "WRITE" : "READ"),
1180                                 (long long unsigned) sector,
1181                                 (long long unsigned) sector>>9,
1182                                 blkif->sectors);
1183                 }
1184
1185                 flags = GNTMAP_host_map;
1186                 if (operation == WRITE)
1187                         flags |= GNTMAP_readonly;
1188                 gnttab_set_map_op(&map[op], kvaddr, flags,
1189                                   req->seg[i].gref, blkif->domid);
1190                 op++;
1191
1192                 /* Now map it to user. */
1193                 ret = create_lookup_pte_addr(info->vma->vm_mm, 
1194                                              uvaddr, &ptep);
1195                 if (ret) {
1196                         WPRINTK("Couldn't get a pte addr!\n");
1197                         goto fail_flush;
1198                 }
1199
1200                 flags = GNTMAP_host_map | GNTMAP_application_map
1201                         | GNTMAP_contains_pte;
1202                 if (operation == WRITE)
1203                         flags |= GNTMAP_readonly;
1204                 gnttab_set_map_op(&map[op], ptep, flags,
1205                                   req->seg[i].gref, blkif->domid);
1206                 op++;
1207         }
1208
1209         ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
1210         BUG_ON(ret);
1211
1212         for (i = 0; i < (nseg*2); i+=2) {
1213                 unsigned long uvaddr;
1214                 unsigned long kvaddr;
1215                 unsigned long offset;
1216                 struct page *pg;
1217
1218                 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
1219                 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i/2);
1220
1221                 if (unlikely(map[i].status != 0)) {
1222                         WPRINTK("invalid kernel buffer -- "
1223                                 "could not remap it\n");
1224                         ret |= 1;
1225                         map[i].handle = 0xFFFF;
1226                 }
1227
1228                 if (unlikely(map[i+1].status != 0)) {
1229                         WPRINTK("invalid user buffer -- "
1230                                 "could not remap it\n");
1231                         ret |= 1;
1232                         map[i+1].handle = 0xFFFF;
1233                 }
1234
1235                 pending_handle(mmap_idx, pending_idx, i/2).kernel 
1236                         = map[i].handle;
1237                 pending_handle(mmap_idx, pending_idx, i/2).user   
1238                         = map[i+1].handle;
1239
1240                 if (ret)
1241                         continue;
1242
1243                 set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
1244                         FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
1245                 offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
1246                 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1247                 ((struct page **)info->vma->vm_private_data)[offset] =
1248                         pg;
1249         }
1250
1251         if (ret)
1252                 goto fail_flush;
1253
1254         /* Mark mapped pages as reserved: */
1255         for (i = 0; i < req->nr_segments; i++) {
1256                 unsigned long kvaddr;
1257                 struct page *pg;
1258
1259                 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
1260                 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1261                 SetPageReserved(pg);
1262         }
1263         
1264         /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
1265         info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);
1266
1267         blkif_get(blkif);
1268         /* Finally, write the request message to the user ring. */
1269         target = RING_GET_REQUEST(&info->ufe_ring,
1270                                   info->ufe_ring.req_prod_pvt);
1271         memcpy(target, req, sizeof(*req));
1272         target->id = usr_idx;
1273         info->ufe_ring.req_prod_pvt++;
1274         return;
1275
1276  fail_flush:
1277         WPRINTK("Reached Fail_flush\n");
1278         fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
1279  fail_response:
1280         make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
1281         free_req(pending_req);
1282 }
1283
1284
1285
1286 /******************************************************************
1287  * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
1288  */
1289
1290
1291 static void make_response(blkif_t *blkif, unsigned long id, 
1292                           unsigned short op, int st)
1293 {
1294         blkif_response_t *resp;
1295         unsigned long     flags;
1296         blkif_back_ring_t *blk_ring = &blkif->blk_ring;
1297         int more_to_do = 0;
1298         int notify;
1299
1300         spin_lock_irqsave(&blkif->blk_ring_lock, flags);
1301         /* Place on the response ring for the relevant domain. */ 
1302         resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
1303         resp->id        = id;
1304         resp->operation = op;
1305         resp->status    = st;
1306         blk_ring->rsp_prod_pvt++;
1307         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
1308
1309         if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
1310                 /*
1311                  * Tail check for pending requests. Allows frontend to avoid
1312                  * notifications if requests are already in flight (lower
1313                  * overheads and promotes batching).
1314                  */
1315                 RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
1316         } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
1317                 more_to_do = 1;
1318
1319         }       
1320         spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
1321         if (more_to_do)
1322                 blkif_notify_work(blkif);
1323         if (notify)
1324                 notify_remote_via_irq(blkif->irq);
1325 }
1326
1327 static int __init blkif_init(void)
1328 {
1329         int i, ret;
1330         tap_blkif_t *info;
1331
1332         if (!is_running_on_xen())
1333                 return -ENODEV;
1334
1335         INIT_LIST_HEAD(&pending_free);
1336         for(i = 0; i < 2; i++) {
1337                 ret = req_increase();
1338                 if (ret)
1339                         break;
1340         }
1341         if (i == 0)
1342                 return ret;
1343
1344         tap_blkif_interface_init();
1345
1346         alloc_pending_reqs = 0;
1347
1348         tap_blkif_xenbus_init();
1349
1350         /*Create the blktap devices, but do not map memory or waitqueue*/
1351         for(i = 0; i < MAX_TAP_DEV; i++) translate_domid[i].domid = 0xFFFF;
1352
1353         /* Dynamically allocate a major for this device */
1354         ret = register_chrdev(0, "blktap", &blktap_fops);
1355
1356         if (ret < 0) {
1357                 WPRINTK("Couldn't register /dev/xen/blktap\n");
1358                 return ret;
1359         }       
1360         
1361         blktap_major = ret;
1362
1363         for(i = 0; i < MAX_TAP_DEV; i++ ) {
1364                 info = tapfds[i] = kzalloc(sizeof(tap_blkif_t),GFP_KERNEL);
1365                 if(tapfds[i] == NULL)
1366                         return -ENOMEM;
1367                 info->minor = i;
1368                 info->pid = 0;
1369                 info->blkif = NULL;
1370
1371                 info->dev_pending = info->dev_inuse = 0;
1372
1373                 DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i);
1374         }
1375         
1376         /* Make sure the xen class exists */
1377         if (!setup_xen_class()) {
1378                 /*
1379                  * This will allow udev to create the blktap ctrl device.
1380                  * We only want to create blktap0 first.  We don't want
1381                  * to flood the sysfs system with needless blktap devices.
1382                  * We only create the device when a request of a new device is
1383                  * made.
1384                  */
1385                 class_device_create(xen_class, NULL,
1386                                     MKDEV(blktap_major, 0), NULL,
1387                                     "blktap0");
1388                 tapfds[0]->sysfs_set = 1;
1389         } else {
1390                 /* this is bad, but not fatal */
1391                 WPRINTK("blktap: sysfs xen_class not created\n");
1392         }
1393
1394         DPRINTK("Blktap device successfully created\n");
1395
1396         return 0;
1397 }
1398
1399 module_init(blkif_init);
1400
1401 MODULE_LICENSE("Dual BSD/GPL");