/******************************************************************************
 * drivers/xen/blktap/blktap.c
 *
 * Back-end driver for user level virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. Requests
 * are remapped to a user-space memory region.
 *
 * Based on the blkback driver code.
 *
 * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <asm/hypervisor.h>
#include "common.h"
#include <xen/balloon.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/gfp.h>
#include <linux/poll.h>
#include <asm/tlbflush.h>

#define MAX_TAP_DEV 100     /* the maximum number of tapdisk ring devices   */
#define MAX_DEV_NAME 100    /* the max tapdisk ring device name e.g. blktap0 */


struct class *xen_class;
EXPORT_SYMBOL_GPL(xen_class);

/*
 * Setup the xen class.  This should probably go in another file, but
 * since blktap is the only user of it so far, it gets to keep it.
 */
int setup_xen_class(void)
{
        int ret;

        if (xen_class)
                return 0;

        xen_class = class_create(THIS_MODULE, "xen");
        if (IS_ERR(xen_class)) {
                ret = PTR_ERR(xen_class);
                xen_class = NULL;
                return ret;
        }

        return 0;
}

/*
 * The maximum number of requests that can be outstanding at any time
 * is determined by
 *
 *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
 *
 * where mmap_alloc < MAX_DYNAMIC_MEM.
 *
 * TODO:
 * mmap_alloc is initialised to 2 and should be adjustable on the fly via
 * sysfs.
 */
#define MAX_DYNAMIC_MEM 64
#define MAX_PENDING_REQS 64
#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
#define MMAP_VADDR(_start, _req, _seg)                                  \
        (_start +                                                       \
         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
         ((_seg) * PAGE_SIZE))
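/*
 * Worked example (sketch, assuming the usual BLKIF_MAX_SEGMENTS_PER_REQUEST
 * of 11 from the block interface headers): user request 3, segment 2 lives
 * at user_vstart + (3 * 11 + 2) * PAGE_SIZE, and each mmap batch then spans
 * MMAP_PAGES = 64 * 11 = 704 data pages.
 */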
static int blkif_reqs = MAX_PENDING_REQS;
static int mmap_pages = MMAP_PAGES;

#define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
                      * have a bunch of pages reserved for shared
                      * memory rings.
                      */

/*Data struct associated with each of the tapdisk devices*/
typedef struct tap_blkif {
        struct vm_area_struct *vma;   /*Shared memory area                   */
        unsigned long rings_vstart;   /*Kernel memory mapping                */
        unsigned long user_vstart;    /*User memory mapping                  */
        unsigned long dev_inuse;      /*One process opens device at a time.  */
        unsigned long dev_pending;    /*In process of being opened           */
        unsigned long ring_ok;        /*make this ring->state                */
        blkif_front_ring_t ufe_ring;  /*Rings up to user space.              */
        wait_queue_head_t wait;       /*for poll                             */
        unsigned long mode;           /*current switching mode               */
        int minor;                    /*Minor number for tapdisk device      */
        pid_t pid;                    /*tapdisk process id                   */
        enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
                                                  shutdown                   */
        unsigned long *idx_map;       /*Record the user ring id to kern
                                        [req id, idx] tuple                  */
        blkif_t *blkif;               /*Associate blkif with tapdev          */
        int sysfs_set;                /*Set if it has a class device.        */
} tap_blkif_t;

/*Data struct handed back to userspace for tapdisk device to VBD mapping*/
typedef struct domid_translate {
        unsigned short domid;
        unsigned short busid;
} domid_translate_t;

static domid_translate_t translate_domid[MAX_TAP_DEV];
static tap_blkif_t *tapfds[MAX_TAP_DEV];

static int __init set_blkif_reqs(char *str)
{
        get_option(&str, &blkif_reqs);
        return 1;
}
__setup("blkif_reqs=", set_blkif_reqs);

/* Run-time switchable: /sys/module/blktap/parameters/ */
static unsigned int log_stats = 0;
static unsigned int debug_lvl = 0;
module_param(log_stats, int, 0644);
module_param(debug_lvl, int, 0644);

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
        blkif_t       *blkif;
        unsigned long  id;
        unsigned short mem_idx;
        int            nr_pages;
        atomic_t       pendcnt;
        unsigned short operation;
        int            status;
        struct list_head free_list;
        int            inuse;
} pending_req_t;

static pending_req_t *pending_reqs[MAX_PENDING_REQS];
static struct list_head pending_free;
static DEFINE_SPINLOCK(pending_free_lock);
static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
static int alloc_pending_reqs;

typedef unsigned int PEND_RING_IDX;

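/*
 * Pending-request index helpers.  MASK_PEND_IDX relies on MAX_PENDING_REQS
 * being a power of two; RTN_PEND_IDX recovers a request's slot within the
 * pending_reqs[] batch it was allocated from.
 */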
static inline int MASK_PEND_IDX(int i)
{
        return (i & (MAX_PENDING_REQS-1));
}

static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx)
{
        return (req - pending_reqs[idx]);
}

#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)

#define BLKBACK_INVALID_HANDLE (~0)

static struct page **foreign_pages[MAX_DYNAMIC_MEM];
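/*
 * Translate (mmap batch, request slot, segment) into the kernel virtual
 * address of the preallocated page backing that segment.
 */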
static inline unsigned long idx_to_kaddr(
        unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
{
        unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
        unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]);
        return (unsigned long)pfn_to_kaddr(pfn);
}

static unsigned short mmap_alloc = 0;
static unsigned short mmap_lock = 0;
static unsigned short mmap_inuse = 0;

/******************************************************************
 * GRANT HANDLES
 */

/* When using grant tables to map a frame for device access then the
 * handle returned must be used to unmap the frame. This is needed to
 * drop the ref count on the frame.
 */
struct grant_handle_pair
{
        grant_handle_t kernel;
        grant_handle_t user;
};

static struct grant_handle_pair
    pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
#define pending_handle(_id, _idx, _i) \
    (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
    + (_i)])


static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/

#define BLKTAP_MINOR 0  /* /dev/xen/blktap has a dynamic major */
#define BLKTAP_DEV_DIR  "/dev/xen"

static int blktap_major;

/* blktap IOCTLs: */
#define BLKTAP_IOCTL_KICK_FE         1
#define BLKTAP_IOCTL_KICK_BE         2 /* currently unused */
#define BLKTAP_IOCTL_SETMODE         3
#define BLKTAP_IOCTL_SENDPID         4
#define BLKTAP_IOCTL_NEWINTF         5
#define BLKTAP_IOCTL_MINOR           6
#define BLKTAP_IOCTL_MAJOR           7
#define BLKTAP_QUERY_ALLOC_REQS      8
#define BLKTAP_IOCTL_FREEINTF        9
#define BLKTAP_IOCTL_PRINT_IDXS      100

/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
#define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
#define BLKTAP_MODE_INTERCEPT_FE     0x00000001
#define BLKTAP_MODE_INTERCEPT_BE     0x00000002  /* unimp.             */

#define BLKTAP_MODE_INTERPOSE \
           (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)


static inline int BLKTAP_MODE_VALID(unsigned long arg)
{
        return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
                (arg == BLKTAP_MODE_INTERCEPT_FE) ||
                (arg == BLKTAP_MODE_INTERPOSE   ));
}

/* Requests passing through the tap to userspace are re-assigned an ID.
 * We must record a mapping between the BE [IDX,ID] tuple and the userspace
 * ring ID.
 */

static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
{
        return ((fe_dom << 16) | MASK_PEND_IDX(idx));
}

static inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
{
        return (PEND_RING_IDX)(id & 0x0000ffff);
}

static inline int ID_TO_MIDX(unsigned long id)
{
        return (int)(id >> 16);
}

#define INVALID_REQ 0xdead0000

/*TODO: Convert to a free list*/
static inline int GET_NEXT_REQ(unsigned long *idx_map)
{
        int i;
        for (i = 0; i < MAX_PENDING_REQS; i++)
                if (idx_map[i] == INVALID_REQ)
                        return i;

        return INVALID_REQ;
}


#define BLKTAP_INVALID_HANDLE(_g) \
    (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF))

#define BLKTAP_INVALIDATE_HANDLE(_g) do {       \
    (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \
    } while(0)


/******************************************************************
 * BLKTAP VM OPS
 */

static struct page *blktap_nopage(struct vm_area_struct *vma,
                                  unsigned long address,
                                  int *type)
{
        /*
         * if the page has not been mapped in by the driver then return
         * NOPAGE_SIGBUS to the domain.
         */

        return NOPAGE_SIGBUS;
}

struct vm_operations_struct blktap_vm_ops = {
        .nopage = blktap_nopage,
};

/******************************************************************
 * BLKTAP FILE OPS
 */

/*Function Declarations*/
static int get_next_free_dev(void);
static int blktap_open(struct inode *inode, struct file *filp);
static int blktap_release(struct inode *inode, struct file *filp);
static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
static int blktap_ioctl(struct inode *inode, struct file *filp,
                        unsigned int cmd, unsigned long arg);
static unsigned int blktap_poll(struct file *file, poll_table *wait);

static struct file_operations blktap_fops = {
        .owner   = THIS_MODULE,
        .poll    = blktap_poll,
        .ioctl   = blktap_ioctl,
        .open    = blktap_open,
        .release = blktap_release,
        .mmap    = blktap_mmap,
};

static int get_next_free_dev(void)
{
        tap_blkif_t *info;
        int i = 0, ret = -1;
        unsigned long flags;

        spin_lock_irqsave(&pending_free_lock, flags);

        while (i < MAX_TAP_DEV) {
                info = tapfds[i];
                if ( (tapfds[i] != NULL) && (info->dev_inuse == 0)
                        && (info->dev_pending == 0) ) {
                        info->dev_pending = 1;
                        ret = i;
                        goto done;
                }
                i++;
        }

done:
        spin_unlock_irqrestore(&pending_free_lock, flags);

        if (ret == -1)
                return ret;

        /*
         * We are protected by having the dev_pending set.
         */
        if (!tapfds[ret]->sysfs_set && xen_class) {
                class_device_create(xen_class, NULL,
                                    MKDEV(blktap_major, ret), NULL,
                                    "blktap%d", ret);
                tapfds[ret]->sysfs_set = 1;
        }
        return ret;
}

int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif)
{
        int i;

        for (i = 0; i < MAX_TAP_DEV; i++)
                if ( (translate_domid[i].domid == domid)
                    && (translate_domid[i].busid == xenbus_id) ) {
                        tapfds[i]->blkif = blkif;
                        tapfds[i]->status = RUNNING;
                        return i;
                }
        return -1;
}

void signal_tapdisk(int idx)
{
        tap_blkif_t *info;
        struct task_struct *ptask;

        if ( (idx < 0) || (idx >= MAX_TAP_DEV) )
                return;

        info = tapfds[idx];
        if ( (idx > 0) && (info->pid > 0) ) {
                ptask = find_task_by_pid(info->pid);
                if (ptask)
                        info->status = CLEANSHUTDOWN;
        }
        info->blkif = NULL;
        return;
}

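/*
 * Called when the user-space tapdisk process opens /dev/xen/blktapN.
 * Allocates the shared front-end ring page and the idx_map used to tie
 * user-ring ids back to [mmap batch, pending request] slots.
 */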
static int blktap_open(struct inode *inode, struct file *filp)
{
        blkif_sring_t *sring;
        int idx = iminor(inode) - BLKTAP_MINOR;
        tap_blkif_t *info;
        int i;

        if ( (idx < 0) || (idx >= MAX_TAP_DEV) || (tapfds[idx] == NULL) ) {
                WPRINTK("Unable to open device /dev/xen/blktap%d\n",
                       idx);
                return -ENOMEM;
        }
        DPRINTK("Opening device /dev/xen/blktap%d\n",idx);

        info = tapfds[idx];

        /*Only one process can access device at a time*/
        if (test_and_set_bit(0, &info->dev_inuse))
                return -EBUSY;

        info->dev_pending = 0;

        /* Allocate the fe ring. */
        sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
        if (sring == NULL)
                goto fail_nomem;

        SetPageReserved(virt_to_page(sring));

        SHARED_RING_INIT(sring);
        FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);

        filp->private_data = info;
        info->vma = NULL;

        info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS,
                                GFP_KERNEL);
        if (info->idx_map == NULL) {
                ClearPageReserved(virt_to_page(sring));
                free_page((unsigned long)sring);
                goto fail_nomem;
        }

        if (idx > 0) {
                init_waitqueue_head(&info->wait);
                for (i = 0; i < MAX_PENDING_REQS; i++)
                        info->idx_map[i] = INVALID_REQ;
        }

        DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
        return 0;

 fail_nomem:
        return -ENOMEM;
}

static int blktap_release(struct inode *inode, struct file *filp)
{
        tap_blkif_t *info = filp->private_data;

        /* can this ever happen? - sdr */
        if (!info) {
                WPRINTK("Trying to free device that doesn't exist "
                       "[/dev/xen/blktap%d]\n",iminor(inode) - BLKTAP_MINOR);
                return -EBADF;
        }
        info->dev_inuse = 0;
        DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);

        /* Free the ring page. */
        ClearPageReserved(virt_to_page(info->ufe_ring.sring));
        free_page((unsigned long) info->ufe_ring.sring);

        /* Free the idx_map allocated at open time. */
        kfree(info->idx_map);
        info->idx_map = NULL;

        /* Clear any active mappings and free foreign map table */
        if (info->vma) {
                zap_page_range(
                        info->vma, info->vma->vm_start,
                        info->vma->vm_end - info->vma->vm_start, NULL);
                info->vma = NULL;
        }

        if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
                kthread_stop(info->blkif->xenblkd);
                info->blkif->xenblkd = NULL;
                info->status = CLEANSHUTDOWN;
        }
        return 0;
}

/* Note on mmap:
 * We need to map pages to user space in a way that will allow the block
 * subsystem to set up direct IO to them.  This couldn't be done before,
 * because there isn't really a sane way to translate a user virtual address
 * down to a physical address when the page belongs to another domain.
 *
 * My first approach was to map the page into kernel memory, add an entry
 * for it in the physical frame list (using alloc_lomem_region as in blkback)
 * and then attempt to map that page up to user space.  This is disallowed
 * by xen though, which realizes that we don't really own the machine frame
 * underlying the physical page.
 *
 * The new approach is to provide explicit support for this in xen linux.
 * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
 * mapped from other VMs.  vma->vm_private_data is set up as a mapping
 * from pages to actual page structs.  There is a new clause in get_user_pages
 * that does the right thing for this sort of mapping.
 */
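/*
 * Userspace view (illustrative sketch only, not part of this driver):
 * tapdisk is expected to map the ring page plus the data area in a single
 * mmap() of exactly (RING_PAGES + MMAP_PAGES) pages, e.g.
 *
 *     size_t len = (1 + MMAP_PAGES) * page_size;
 *     void  *p   = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *                       blktap_fd, 0);
 *
 * Any other size is rejected below with -EAGAIN.
 */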
static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
{
        int size;
        struct page **map;
        int i;
        tap_blkif_t *info = filp->private_data;

        if (info == NULL) {
                WPRINTK("blktap: mmap, retrieving idx failed\n");
                return -ENOMEM;
        }

        vma->vm_flags |= VM_RESERVED;
        vma->vm_ops = &blktap_vm_ops;

        size = vma->vm_end - vma->vm_start;
        if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
                WPRINTK("you _must_ map exactly %d pages!\n",
                       mmap_pages + RING_PAGES);
                return -EAGAIN;
        }

        size >>= PAGE_SHIFT;
        info->rings_vstart = vma->vm_start;
        info->user_vstart  = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);

        /* Map the ring pages to the start of the region and reserve it. */
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

        if (remap_pfn_range(vma, vma->vm_start,
                            __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
                            PAGE_SIZE, vma->vm_page_prot)) {
                WPRINTK("Mapping user ring failed!\n");
                goto fail;
        }

        /* Mark this VM as containing foreign pages, and set up mappings. */
        map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
                      * sizeof(struct page *),
                      GFP_KERNEL);
        if (map == NULL) {
                WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
                goto fail;
        }

        for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
                map[i] = NULL;

        vma->vm_private_data = map;
        vma->vm_flags |= VM_FOREIGN;

        info->vma = vma;
        info->ring_ok = 1;
        return 0;
 fail:
        /* Clear any active mappings. */
        zap_page_range(vma, vma->vm_start,
                       vma->vm_end - vma->vm_start, NULL);

        return -ENOMEM;
}

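/*
 * Control-plane entry point used by tapdisk: process responses it has
 * posted (KICK_FE), switch the intercept mode, exchange the tapdisk pid
 * and device major/minor numbers, and create or release ring devices.
 */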
static int blktap_ioctl(struct inode *inode, struct file *filp,
                        unsigned int cmd, unsigned long arg)
{
        tap_blkif_t *info = filp->private_data;

        switch (cmd) {
        case BLKTAP_IOCTL_KICK_FE:
        {
                /* There are fe messages to process. */
                return blktap_read_ufe_ring(info);
        }
        case BLKTAP_IOCTL_SETMODE:
        {
                if (info) {
                        if (BLKTAP_MODE_VALID(arg)) {
                                info->mode = arg;
                                /* XXX: may need to flush rings here. */
                                DPRINTK("blktap: set mode to %lx\n",
                                       arg);
                                return 0;
                        }
                }
                return 0;
        }
        case BLKTAP_IOCTL_PRINT_IDXS:
        {
                if (info) {
                        printk("User Rings: \n-----------\n");
                        printk("UF: rsp_cons: %2d, req_prod_prv: %2d "
                                "| req_prod: %2d, rsp_prod: %2d\n",
                                info->ufe_ring.rsp_cons,
                                info->ufe_ring.req_prod_pvt,
                                info->ufe_ring.sring->req_prod,
                                info->ufe_ring.sring->rsp_prod);
                }
                return 0;
        }
        case BLKTAP_IOCTL_SENDPID:
        {
                if (info) {
                        info->pid = (pid_t)arg;
                        DPRINTK("blktap: pid received %d\n",
                               info->pid);
                }
                return 0;
        }
        case BLKTAP_IOCTL_NEWINTF:
        {
                uint64_t val = (uint64_t)arg;
                domid_translate_t *tr = (domid_translate_t *)&val;
                int newdev;

                DPRINTK("NEWINTF Req for domid %d and bus id %d\n",
                       tr->domid, tr->busid);
                newdev = get_next_free_dev();
                if (newdev < 1) {
                        WPRINTK("Error initialising /dev/xen/blktap - "
                                "No more devices\n");
                        return -1;
                }
                translate_domid[newdev].domid = tr->domid;
                translate_domid[newdev].busid = tr->busid;
                return newdev;
        }
        case BLKTAP_IOCTL_FREEINTF:
        {
                unsigned long dev = arg;
                unsigned long flags;

                /* Looking at another device */
                info = NULL;

                if ( (dev > 0) && (dev < MAX_TAP_DEV) )
                        info = tapfds[dev];

                spin_lock_irqsave(&pending_free_lock, flags);
                if ( (info != NULL) && (info->dev_pending) )
                        info->dev_pending = 0;
                spin_unlock_irqrestore(&pending_free_lock, flags);

                return 0;
        }
        case BLKTAP_IOCTL_MINOR:
        {
                unsigned long dev = arg;

                /* Looking at another device */
                info = NULL;

                if ( (dev > 0) && (dev < MAX_TAP_DEV) )
                        info = tapfds[dev];

                if (info != NULL)
                        return info->minor;
                else
                        return -1;
        }
        case BLKTAP_IOCTL_MAJOR:
                return blktap_major;

        case BLKTAP_QUERY_ALLOC_REQS:
        {
                WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n",
                       alloc_pending_reqs, blkif_reqs);
                return (alloc_pending_reqs * 100) / blkif_reqs;
        }
        }
        return -ENOIOCTLCMD;
}

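/*
 * poll() from tapdisk: publish any privately queued requests to the shared
 * user ring and report the fd readable when there is work waiting.
 */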
static unsigned int blktap_poll(struct file *filp, poll_table *wait)
{
        tap_blkif_t *info = filp->private_data;

        if (!info) {
                WPRINTK(" poll, retrieving idx failed\n");
                return 0;
        }

        /* do not work on the control device */
        if (!info->minor)
                return 0;

        poll_wait(filp, &info->wait, wait);
        if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
                RING_PUSH_REQUESTS(&info->ufe_ring);
                return POLLIN | POLLRDNORM;
        }
        return 0;
}

void blktap_kick_user(int idx)
{
        tap_blkif_t *info;

        if (idx == 0)
                return;

        info = tapfds[idx];

        if (info != NULL)
                wake_up_interruptible(&info->wait);

        return;
}

static int do_block_io_op(blkif_t *blkif);
static void dispatch_rw_block_io(blkif_t *blkif,
                                 blkif_request_t *req,
                                 pending_req_t *pending_req);
static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st);

/******************************************************************
 * misc small helpers
 */
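/*
 * Grow the pool: allocate another batch of pending_req_t structures plus an
 * empty pagevec whose pages will back the grant mappings for their segments.
 */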
static int req_increase(void)
{
        int i, j;

        if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock)
                return -EINVAL;

        pending_reqs[mmap_alloc]  = kzalloc(sizeof(pending_req_t)
                                            * blkif_reqs, GFP_KERNEL);
        foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages);

        if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc])
                goto out_of_memory;

        DPRINTK("%s: reqs=%d, pages=%d\n",
                __FUNCTION__, blkif_reqs, mmap_pages);

        for (i = 0; i < MAX_PENDING_REQS; i++) {
                list_add_tail(&pending_reqs[mmap_alloc][i].free_list,
                              &pending_free);
                pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
                for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
                        BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc,
                                                                 i, j));
        }

        mmap_alloc++;
        DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
        return 0;

 out_of_memory:
        free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
        kfree(pending_reqs[mmap_alloc]);
        WPRINTK("%s: out of memory\n", __FUNCTION__);
        return -ENOMEM;
}

static void mmap_req_del(int mmap)
{
        BUG_ON(!spin_is_locked(&pending_free_lock));

        kfree(pending_reqs[mmap]);
        pending_reqs[mmap] = NULL;

        free_empty_pages_and_pagevec(foreign_pages[mmap], mmap_pages);
        foreign_pages[mmap] = NULL;

        mmap_lock = 0;
        DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
        mmap_alloc--;
}

static pending_req_t* alloc_req(void)
{
        pending_req_t *req = NULL;
        unsigned long flags;

        spin_lock_irqsave(&pending_free_lock, flags);

        if (!list_empty(&pending_free)) {
                req = list_entry(pending_free.next, pending_req_t, free_list);
                list_del(&req->free_list);
        }

        if (req) {
                req->inuse = 1;
                alloc_pending_reqs++;
        }
        spin_unlock_irqrestore(&pending_free_lock, flags);

        return req;
}

static void free_req(pending_req_t *req)
{
        unsigned long flags;
        int was_empty;

        spin_lock_irqsave(&pending_free_lock, flags);

        alloc_pending_reqs--;
        req->inuse = 0;
        if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
                mmap_inuse--;
                if (mmap_inuse == 0)
                        mmap_req_del(mmap_alloc-1);
                spin_unlock_irqrestore(&pending_free_lock, flags);
                return;
        }
        was_empty = list_empty(&pending_free);
        list_add(&req->free_list, &pending_free);

        spin_unlock_irqrestore(&pending_free_lock, flags);

        if (was_empty)
                wake_up(&pending_free_wq);
}

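/*
 * Tear down both grant mappings (kernel and user) for every segment of a
 * request and zap the corresponding range of the tapdisk VMA.
 */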
static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx,
                            int tapidx)
{
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
        unsigned int i, invcount = 0;
        struct grant_handle_pair *khandle;
        uint64_t ptep;
        int ret, mmap_idx;
        unsigned long kvaddr, uvaddr;

        tap_blkif_t *info = tapfds[tapidx];

        if (info == NULL) {
                WPRINTK("fast_flush: Couldn't get info!\n");
                return;
        }
        mmap_idx = req->mem_idx;

        for (i = 0; i < req->nr_pages; i++) {
                kvaddr = idx_to_kaddr(mmap_idx, k_idx, i);
                uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);

                khandle = &pending_handle(mmap_idx, k_idx, i);
                if (BLKTAP_INVALID_HANDLE(khandle)) {
                        WPRINTK("BLKTAP_INVALID_HANDLE\n");
                        continue;
                }
                gnttab_set_unmap_op(&unmap[invcount],
                                    idx_to_kaddr(mmap_idx, k_idx, i),
                                    GNTMAP_host_map, khandle->kernel);
                invcount++;

                if (create_lookup_pte_addr(
                    info->vma->vm_mm,
                    MMAP_VADDR(info->user_vstart, u_idx, i),
                    &ptep) != 0) {
                        WPRINTK("Couldn't get a pte addr!\n");
                        return;
                }

                gnttab_set_unmap_op(&unmap[invcount],
                        ptep, GNTMAP_host_map,
                        khandle->user);
                invcount++;

                BLKTAP_INVALIDATE_HANDLE(khandle);
        }
        ret = HYPERVISOR_grant_table_op(
                GNTTABOP_unmap_grant_ref, unmap, invcount);
        BUG_ON(ret);

        if (info->vma != NULL)
                zap_page_range(info->vma,
                               MMAP_VADDR(info->user_vstart, u_idx, 0),
                               req->nr_pages << PAGE_SHIFT, NULL);
}

/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static void print_stats(blkif_t *blkif)
{
        printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
               current->comm, blkif->st_oo_req,
               blkif->st_rd_req, blkif->st_wr_req);
        blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
        blkif->st_rd_req = 0;
        blkif->st_wr_req = 0;
        blkif->st_oo_req = 0;
}

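/*
 * Per-blkif kernel thread: waits for guest notifications and for free
 * pending_req slots, then moves requests from the guest ring to the
 * user-space ring via do_block_io_op().
 */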
int tap_blkif_schedule(void *arg)
{
        blkif_t *blkif = arg;

        blkif_get(blkif);

        if (debug_lvl)
                printk(KERN_DEBUG "%s: started\n", current->comm);

        while (!kthread_should_stop()) {
                wait_event_interruptible(
                        blkif->wq,
                        blkif->waiting_reqs || kthread_should_stop());
                wait_event_interruptible(
                        pending_free_wq,
                        !list_empty(&pending_free) || kthread_should_stop());

                blkif->waiting_reqs = 0;
                smp_mb(); /* clear flag *before* checking for work */

                if (do_block_io_op(blkif))
                        blkif->waiting_reqs = 1;

                if (log_stats && time_after(jiffies, blkif->st_print))
                        print_stats(blkif);
        }

        if (log_stats)
                print_stats(blkif);
        if (debug_lvl)
                printk(KERN_DEBUG "%s: exiting\n", current->comm);

        blkif->xenblkd = NULL;
        blkif_put(blkif);

        return 0;
}

/******************************************************************
 * COMPLETION CALLBACK -- Called by user level ioctl()
 */

static int blktap_read_ufe_ring(tap_blkif_t *info)
{
        /* This is called to read responses from the UFE ring. */
        RING_IDX i, j, rp;
        blkif_response_t *resp;
        blkif_t *blkif = NULL;
        int pending_idx, usr_idx, mmap_idx;
        pending_req_t *pending_req;

        if (!info)
                return 0;

        /* We currently only forward packets in INTERCEPT_FE mode. */
        if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
                return 0;

        /* for each outstanding message on the UFEring  */
        rp = info->ufe_ring.sring->rsp_prod;
        rmb();

        for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
                resp = RING_GET_RESPONSE(&info->ufe_ring, i);
                ++info->ufe_ring.rsp_cons;

                /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
                usr_idx = (int)resp->id;
                pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
                mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);

                if ( (mmap_idx >= mmap_alloc) ||
                   (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) )
                        WPRINTK("Incorrect req map"
                               "[%d], internal map [%d,%d (%d)]\n",
                               usr_idx, mmap_idx,
                               ID_TO_IDX(info->idx_map[usr_idx]),
                               MASK_PEND_IDX(
                                       ID_TO_IDX(info->idx_map[usr_idx])));

                pending_req = &pending_reqs[mmap_idx][pending_idx];
                blkif = pending_req->blkif;

                for (j = 0; j < pending_req->nr_pages; j++) {

                        unsigned long kvaddr, uvaddr;
                        struct page **map = info->vma->vm_private_data;
                        struct page *pg;
                        int offset;

                        uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
                        kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j);

                        pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
                        ClearPageReserved(pg);
                        offset = (uvaddr - info->vma->vm_start)
                                >> PAGE_SHIFT;
                        map[offset] = NULL;
                }
                fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
                make_response(blkif, pending_req->id, resp->operation,
                              resp->status);
                info->idx_map[usr_idx] = INVALID_REQ;
                blkif_put(pending_req->blkif);
                free_req(pending_req);
        }

        return 0;
}


/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

static void blkif_notify_work(blkif_t *blkif)
{
        blkif->waiting_reqs = 1;
        wake_up(&blkif->wq);
}

irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
{
        blkif_notify_work(dev_id);
        return IRQ_HANDLED;
}

/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */
static int print_dbug = 1;
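/*
 * Consume requests from the guest's shared ring and forward them to the
 * user-space ring; returns nonzero if it had to stop early and more work
 * remains queued.
 */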
static int do_block_io_op(blkif_t *blkif)
{
        blkif_back_ring_t *blk_ring = &blkif->blk_ring;
        blkif_request_t *req;
        pending_req_t *pending_req;
        RING_IDX rc, rp;
        int more_to_do = 0;
        tap_blkif_t *info;

        rc = blk_ring->req_cons;
        rp = blk_ring->sring->req_prod;
        rmb(); /* Ensure we see queued requests up to 'rp'. */

        /*Check blkif has corresponding UE ring*/
        if (blkif->dev_num == -1) {
                /*oops*/
                if (print_dbug) {
                        WPRINTK("Corresponding UE "
                               "ring does not exist!\n");
                        print_dbug = 0; /*We only print this message once*/
                }
                return 0;
        }

        info = tapfds[blkif->dev_num];
        if (info == NULL || !info->dev_inuse) {
                if (print_dbug) {
                        WPRINTK("Can't get UE info!\n");
                        print_dbug = 0;
                }
                return 0;
        }

        while (rc != rp) {

                if (RING_FULL(&info->ufe_ring)) {
                        WPRINTK("RING_FULL! More to do\n");
                        more_to_do = 1;
                        break;
                }

                if (RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
                        WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
                               " More to do\n");
                        more_to_do = 1;
                        break;
                }

                pending_req = alloc_req();
                if (NULL == pending_req) {
                        blkif->st_oo_req++;
                        more_to_do = 1;
                        break;
                }

                req = RING_GET_REQUEST(blk_ring, rc);
                blk_ring->req_cons = ++rc; /* before make_response() */

                switch (req->operation) {
                case BLKIF_OP_READ:
                        blkif->st_rd_req++;
                        dispatch_rw_block_io(blkif, req, pending_req);
                        break;

                case BLKIF_OP_WRITE:
                        blkif->st_wr_req++;
                        dispatch_rw_block_io(blkif, req, pending_req);
                        break;

                default:
                        WPRINTK("unknown operation [%d]\n",
                                req->operation);
                        make_response(blkif, req->id, req->operation,
                                      BLKIF_RSP_ERROR);
                        free_req(pending_req);
                        break;
                }
        }

        blktap_kick_user(blkif->dev_num);

        return more_to_do;
}

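/*
 * Grant-map every segment of a guest request twice -- once into the kernel
 * address space and once into the tapdisk VMA (hence map[] holding two
 * entries per segment) -- then mirror the request onto the user ring.
 */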
static void dispatch_rw_block_io(blkif_t *blkif,
                                 blkif_request_t *req,
                                 pending_req_t *pending_req)
{
        extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
        int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
        struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
        unsigned int nseg;
        int ret, i;
        tap_blkif_t *info = tapfds[blkif->dev_num];
        uint64_t sector;

        blkif_request_t *target;
        int pending_idx = RTN_PEND_IDX(pending_req, pending_req->mem_idx);
        int usr_idx = GET_NEXT_REQ(info->idx_map);
        uint16_t mmap_idx = pending_req->mem_idx;

        /*Check we have space on user ring - should never fail*/
        if (usr_idx == INVALID_REQ)
                goto fail_flush;

        /* Check that number of segments is sane. */
        nseg = req->nr_segments;
        if ( unlikely(nseg == 0) ||
            unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
                WPRINTK("Bad number of segments in request (%d)\n", nseg);
                goto fail_response;
        }

        /* Make sure userspace is ready. */
        if (!info->ring_ok) {
                WPRINTK("blktap: ring not ready for requests!\n");
                goto fail_response;
        }

        if (RING_FULL(&info->ufe_ring)) {
                WPRINTK("blktap: fe_ring is full, can't add "
                        "IO Request will be dropped. %d %d\n",
                        RING_SIZE(&info->ufe_ring),
                        RING_SIZE(&blkif->blk_ring));
                goto fail_response;
        }

        pending_req->blkif     = blkif;
        pending_req->id        = req->id;
        pending_req->operation = operation;
        pending_req->status    = BLKIF_RSP_OKAY;
        pending_req->nr_pages  = nseg;
        op = 0;
        for (i = 0; i < nseg; i++) {
                unsigned long uvaddr;
                unsigned long kvaddr;
                uint64_t ptep;
                struct page *page;
                uint32_t flags;

                uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
                kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
                page = virt_to_page(kvaddr);

                sector = req->sector_number + (8*i);
                if ( (blkif->sectors > 0) && (sector >= blkif->sectors) ) {
                        WPRINTK("BLKTAP: Sector request greater "
                               "than size\n");
                        WPRINTK("BLKTAP: %s request sector "
                               "[%llu,%llu], Total [%llu]\n",
                               (req->operation ==
                                BLKIF_OP_WRITE ? "WRITE" : "READ"),
                                (long long unsigned) sector,
                                (long long unsigned) sector>>9,
                                blkif->sectors);
                }

                flags = GNTMAP_host_map;
                if (operation == WRITE)
                        flags |= GNTMAP_readonly;
                gnttab_set_map_op(&map[op], kvaddr, flags,
                                  req->seg[i].gref, blkif->domid);
                op++;

                /* Now map it to user. */
                ret = create_lookup_pte_addr(info->vma->vm_mm,
                                             uvaddr, &ptep);
                if (ret) {
                        WPRINTK("Couldn't get a pte addr!\n");
                        fast_flush_area(pending_req, pending_idx, usr_idx,
                                        blkif->dev_num);
                        goto fail_flush;
                }

                flags = GNTMAP_host_map | GNTMAP_application_map
                        | GNTMAP_contains_pte;
                if (operation == WRITE)
                        flags |= GNTMAP_readonly;
                gnttab_set_map_op(&map[op], ptep, flags,
                                  req->seg[i].gref, blkif->domid);
                op++;
        }

        ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
        BUG_ON(ret);

        for (i = 0; i < (nseg*2); i+=2) {
                unsigned long uvaddr;
                unsigned long kvaddr;
                unsigned long offset;
                struct page *pg;

                uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
                kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i/2);

                if (unlikely(map[i].status != 0)) {
                        WPRINTK("invalid kernel buffer -- "
                                "could not remap it\n");
                        goto fail_flush;
                }

                if (unlikely(map[i+1].status != 0)) {
                        WPRINTK("invalid user buffer -- "
                                "could not remap it\n");
                        goto fail_flush;
                }

                pending_handle(mmap_idx, pending_idx, i/2).kernel
                        = map[i].handle;
                pending_handle(mmap_idx, pending_idx, i/2).user
                        = map[i+1].handle;
                set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
                        FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
                offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
                pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
                ((struct page **)info->vma->vm_private_data)[offset] =
                        pg;
        }
        /* Mark mapped pages as reserved: */
        for (i = 0; i < req->nr_segments; i++) {
                unsigned long kvaddr;
                struct page *pg;

                kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
                pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
                SetPageReserved(pg);
        }

        /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
        info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);

        blkif_get(blkif);
        /* Finally, write the request message to the user ring. */
        target = RING_GET_REQUEST(&info->ufe_ring,
                                  info->ufe_ring.req_prod_pvt);
        memcpy(target, req, sizeof(*req));
        target->id = usr_idx;
        info->ufe_ring.req_prod_pvt++;
        return;

 fail_flush:
        WPRINTK("Reached Fail_flush\n");
        fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
 fail_response:
        make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
        free_req(pending_req);
}


/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */

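/*
 * Queue a response on the guest's shared ring and, if needed, notify the
 * front end via its event channel.
 */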
static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st)
{
        blkif_response_t *resp;
        unsigned long     flags;
        blkif_back_ring_t *blk_ring = &blkif->blk_ring;
        int more_to_do = 0;
        int notify;

        spin_lock_irqsave(&blkif->blk_ring_lock, flags);
        /* Place on the response ring for the relevant domain. */
        resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
        resp->id        = id;
        resp->operation = op;
        resp->status    = st;
        blk_ring->rsp_prod_pvt++;
        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);

        if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
                /*
                 * Tail check for pending requests. Allows frontend to avoid
                 * notifications if requests are already in flight (lower
                 * overheads and promotes batching).
                 */
                RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
        } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
                more_to_do = 1;
        }
        spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
        if (more_to_do)
                blkif_notify_work(blkif);
        if (notify)
                notify_remote_via_irq(blkif->irq);
}

static int __init blkif_init(void)
{
        int i, ret;
        tap_blkif_t *info;

        if (!is_running_on_xen())
                return -ENODEV;

        INIT_LIST_HEAD(&pending_free);
        for (i = 0; i < 2; i++) {
                ret = req_increase();
                if (ret)
                        break;
        }
        if (i == 0)
                return ret;

        tap_blkif_interface_init();

        alloc_pending_reqs = 0;

        tap_blkif_xenbus_init();

        /*Create the blktap devices, but do not map memory or waitqueue*/
        for (i = 0; i < MAX_TAP_DEV; i++)
                translate_domid[i].domid = 0xFFFF;

        /* Dynamically allocate a major for this device */
        ret = register_chrdev(0, "blktap", &blktap_fops);

        if (ret < 0) {
                WPRINTK("Couldn't register /dev/xen/blktap\n");
                return -ENOMEM;
        }

        blktap_major = ret;

        for (i = 0; i < MAX_TAP_DEV; i++) {
                info = tapfds[i] = kzalloc(sizeof(tap_blkif_t), GFP_KERNEL);
                if (tapfds[i] == NULL)
                        return -ENOMEM;
                info->minor = i;
                info->pid = 0;
                info->blkif = NULL;

                info->dev_pending = info->dev_inuse = 0;

                DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i);
        }

        /* Make sure the xen class exists */
        if (!setup_xen_class()) {
                /*
                 * This will allow udev to create the blktap ctrl device.
                 * We only want to create blktap0 first.  We don't want
                 * to flood the sysfs system with needless blktap devices.
                 * We only create the device when a request of a new device is
                 * made.
                 */
                class_device_create(xen_class, NULL,
                                    MKDEV(blktap_major, 0), NULL,
                                    "blktap0");
                tapfds[0]->sysfs_set = 1;
        } else {
                /* this is bad, but not fatal */
                WPRINTK("blktap: sysfs xen_class not created\n");
        }

        DPRINTK("Blktap device successfully created\n");

        return 0;
}

module_init(blkif_init);

MODULE_LICENSE("Dual BSD/GPL");