/******************************************************************************
 * arch/xen/drivers/blkif/backend/main.c
 * 
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A 
 * reference front-end implementation can be found in:
 *  arch/xen/drivers/blkif/frontend
 * 
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 */

#include "common.h"
#include <asm-xen/evtchn.h>
#ifdef CONFIG_XEN_BLKDEV_GRANT
#include <asm-xen/xen-public/grant_table.h>
#endif

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 * 
 * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
#define MAX_PENDING_REQS 64
#define BATCH_PER_DOMAIN 16

static unsigned long mmap_vstart;
#define MMAP_PAGES                                              \
    (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
#define MMAP_VADDR(_req,_seg)                                   \
    (mmap_vstart +                                              \
     ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +    \
     ((_seg) * PAGE_SIZE))
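
/*
 * Layout note: the mmap area is effectively a flat two-dimensional array of
 * page slots indexed [request][segment]. MMAP_VADDR(req, seg) selects slot
 * number (req * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg), so each in-flight
 * request owns a contiguous run of BLKIF_MAX_SEGMENTS_PER_REQUEST virtual
 * pages into which its guest buffers are mapped.
 */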

/*
 * Each outstanding request that we've passed to the lower device layers has a 
 * 'pending_req' allocated to it. Each buffer_head that completes decrements 
 * the pendcnt towards zero. When it hits zero, the specified domain has a 
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
    blkif_t       *blkif;
    unsigned long  id;
    int            nr_pages;
    atomic_t       pendcnt;
    unsigned short operation;
    int            status;
} pending_req_t;
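
/*
 * Lifecycle sketch: dispatch_rw_block_io() fills in a pending_req_t, sets
 * pendcnt to the number of buffer_heads/bios it submits, and takes a blkif
 * reference. Each completion callback funnels into __end_block_io_op(),
 * which decrements pendcnt; the final decrement unmaps the foreign pages,
 * queues the response and recycles the slot index onto pending_ring[].
 */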

/*
 * We can't allocate pending_req's in order, since they may complete out of 
 * order. We therefore maintain an allocation ring. This ring also indicates 
 * when enough work has been passed down -- at that point the allocation ring 
 * will be empty.
 */
static pending_req_t pending_reqs[MAX_PENDING_REQS];
static unsigned char pending_ring[MAX_PENDING_REQS];
static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
/* NB. We use a different index type to differentiate from shared blk rings. */
typedef unsigned int PEND_RING_IDX;
#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
static PEND_RING_IDX pending_prod, pending_cons;
#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
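
/*
 * Allocation protocol: a free slot index is taken from
 * pending_ring[MASK_PEND_IDX(pending_cons++)] at dispatch time, and returned
 * via pending_ring[MASK_PEND_IDX(pending_prod++)] (under pend_prod_lock) on
 * completion. Both indices are free-running, so NR_PENDING_REQS counts the
 * requests currently in flight; pending_prod is initialised a whole ring
 * ahead of pending_cons, meaning every slot starts out free.
 */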

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
static kmem_cache_t *buffer_head_cachep;
#else
static request_queue_t *plugged_queue;
static inline void flush_plugged_queue(void)
{
    request_queue_t *q = plugged_queue;
    if ( q != NULL )
    {
        if ( q->unplug_fn != NULL )
            q->unplug_fn(q);
        blk_put_queue(q);
        plugged_queue = NULL;
    }
}
#endif

#ifdef CONFIG_XEN_BLKDEV_GRANT
/* When using grant tables to map a frame for device access then the
 * handle returned must be used to unmap the frame. This is needed to
 * drop the ref count on the frame.
 */
static u16 pending_grant_handles[MMAP_PAGES];
#define pending_handle(_idx, _i) \
    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
#define BLKBACK_INVALID_HANDLE (0xFFFF)
#endif
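
/*
 * Note that pending_handle(idx, i) flattens its [request][segment] index the
 * same way as MMAP_VADDR(), so there is exactly one grant-handle slot per
 * mapped page in the mmap area; BLKBACK_INVALID_HANDLE marks slots with no
 * live mapping.
 */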

#ifdef CONFIG_XEN_BLKDEV_TAP_BE
/*
 * If the tap driver is used, we may get pages belonging to either the tap
 * or (more likely) the real frontend.  The backend must specify which domain
 * a given page belongs to in update_va_mapping though.  For the moment, 
 * the tap rewrites the ID field of the request to contain the request index
 * and the id of the real front end domain.
 */
#define BLKTAP_COOKIE 0xbeadfeed
static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
#endif
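
/*
 * (Presumably the tap packs the ID as (real_domid << 16) | request_index;
 * only the domid extraction in ID_TO_DOM() is relied upon here, and the low
 * bits are handed back to the tap untouched via make_response().)
 */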

static int do_block_io_op(blkif_t *blkif, int max_to_do);
static void dispatch_probe(blkif_t *blkif, blkif_request_t *req);
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
static void make_response(blkif_t *blkif, unsigned long id, 
                          unsigned short op, int st);

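/*
 * fast_flush_area() tears down the foreign mappings covering the first
 * nr_pages page slots of request 'idx': with grant tables it batches one
 * unmap operation per slot that still holds a valid handle; otherwise it
 * zaps the PTEs with a multicall, requesting a TLB flush only on the final
 * entry.
 */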
static void fast_flush_area(int idx, int nr_pages)
{
#ifdef CONFIG_XEN_BLKDEV_GRANT
    gnttab_op_t       aop[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    unsigned int      i, invcount = 0;
    u16               handle;

    for ( i = 0; i < nr_pages; i++ )
    {
        if ( BLKBACK_INVALID_HANDLE != ( handle = pending_handle(idx, i) ) )
        {
            aop[i].u.unmap_grant_ref.host_virt_addr = MMAP_VADDR(idx, i);
            aop[i].u.unmap_grant_ref.dev_bus_addr   = 0;
            aop[i].u.unmap_grant_ref.handle         = handle;
            pending_handle(idx, i) = BLKBACK_INVALID_HANDLE;
            invcount++;
        }
    }
    if ( unlikely(HYPERVISOR_grant_table_op(
                    GNTTABOP_unmap_grant_ref, aop, invcount)))
        BUG();
#else

    multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    int               i;

    for ( i = 0; i < nr_pages; i++ )
    {
        mcl[i].op = __HYPERVISOR_update_va_mapping;
        mcl[i].args[0] = MMAP_VADDR(idx, i);
        mcl[i].args[1] = 0;
        mcl[i].args[2] = 0;
    }

    mcl[nr_pages-1].args[2] = UVMF_TLB_FLUSH|UVMF_ALL;
    if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) )
        BUG();
#endif
}


/******************************************************************
 * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
 */

static struct list_head blkio_schedule_list;
static spinlock_t blkio_schedule_list_lock;

static int __on_blkdev_list(blkif_t *blkif)
{
    return blkif->blkdev_list.next != NULL;
}

static void remove_from_blkdev_list(blkif_t *blkif)
{
    unsigned long flags;
    if ( !__on_blkdev_list(blkif) ) return;
    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
    if ( __on_blkdev_list(blkif) )
    {
        list_del(&blkif->blkdev_list);
        blkif->blkdev_list.next = NULL;
        blkif_put(blkif);
    }
    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
}

static void add_to_blkdev_list_tail(blkif_t *blkif)
{
    unsigned long flags;
    if ( __on_blkdev_list(blkif) ) return;
    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
    if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) )
    {
        list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
        blkif_get(blkif);
    }
    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
}


/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);

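/*
 * blkio_schedule() is the body of the 'xenblkd' kernel thread. It sleeps
 * until there is both room in the pending-request pool and at least one
 * interface on blkio_schedule_list, then round-robins over the list, pulling
 * at most BATCH_PER_DOMAIN requests from each interface per pass so that one
 * busy domain cannot starve the others. Interfaces with work left over are
 * re-queued at the tail, and the accumulated batch is then pushed down to
 * the disc in one go.
 */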
static int blkio_schedule(void *arg)
{
    DECLARE_WAITQUEUE(wq, current);

    blkif_t          *blkif;
    struct list_head *ent;

    daemonize(
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
        "xenblkd"
#endif
        );

    for ( ; ; )
    {
        /* Wait for work to do. */
        add_wait_queue(&blkio_schedule_wait, &wq);
        set_current_state(TASK_INTERRUPTIBLE);
        if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || 
             list_empty(&blkio_schedule_list) )
            schedule();
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&blkio_schedule_wait, &wq);

        /* Queue up a batch of requests. */
        while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
                !list_empty(&blkio_schedule_list) )
        {
            ent = blkio_schedule_list.next;
            blkif = list_entry(ent, blkif_t, blkdev_list);
            blkif_get(blkif);
            remove_from_blkdev_list(blkif);
            if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
                add_to_blkdev_list_tail(blkif);
            blkif_put(blkif);
        }

        /* Push the batch through to disc. */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
        run_task_queue(&tq_disk);
#else
        flush_plugged_queue();
#endif
    }
}

static void maybe_trigger_blkio_schedule(void)
{
    /*
     * Needed so that two processes, who together make the following predicate
     * true, don't both read stale values and evaluate the predicate
     * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
     */
    smp_mb();

    if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
         !list_empty(&blkio_schedule_list) )
        wake_up(&blkio_schedule_wait);
}



/******************************************************************
 * COMPLETION CALLBACK -- Called as bh->b_end_io()
 */

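/*
 * __end_block_io_op() runs once per completed buffer_head/bio. Any error
 * fails the whole pending request; the final completion (pendcnt reaching
 * zero) unmaps the request's pages, sends the response to the frontend,
 * drops the blkif reference, returns the slot index to pending_ring[] and
 * pokes the scheduler in case it was waiting for a free slot.
 */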
static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
{
    unsigned long flags;

    /* An error fails the entire request. */
    if ( !uptodate )
    {
        DPRINTK("Buffer not up-to-date at end of operation\n");
        pending_req->status = BLKIF_RSP_ERROR;
    }

    if ( atomic_dec_and_test(&pending_req->pendcnt) )
    {
        int pending_idx = pending_req - pending_reqs;
        fast_flush_area(pending_idx, pending_req->nr_pages);
        make_response(pending_req->blkif, pending_req->id,
                      pending_req->operation, pending_req->status);
        blkif_put(pending_req->blkif);
        spin_lock_irqsave(&pend_prod_lock, flags);
        pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
        spin_unlock_irqrestore(&pend_prod_lock, flags);
        maybe_trigger_blkio_schedule();
    }
}

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
static void end_block_io_op(struct buffer_head *bh, int uptodate)
{
    __end_block_io_op(bh->b_private, uptodate);
    kmem_cache_free(buffer_head_cachep, bh);
}
#else
static int end_block_io_op(struct bio *bio, unsigned int done, int error)
{
    if ( done || error )
        __end_block_io_op(bio->bi_private, (done && !error));
    bio_put(bio);
    return error;
}
#endif


/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
{
    blkif_t *blkif = dev_id;
    add_to_blkdev_list_tail(blkif);
    maybe_trigger_blkio_schedule();
    return IRQ_HANDLED;
}



/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */

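/*
 * do_block_io_op() consumes up to max_to_do requests from the shared ring,
 * dispatching reads/writes and probes, and failing anything it does not
 * recognise. It stops early if the batch quota or the pending-request pool
 * is exhausted; its return value ("more to do") tells the scheduler whether
 * to re-queue this interface.
 */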
static int do_block_io_op(blkif_t *blkif, int max_to_do)
{
    blkif_back_ring_t *blk_ring = &blkif->blk_ring;
    blkif_request_t *req;
    RING_IDX i, rp;
    int more_to_do = 0;

    rp = blk_ring->sring->req_prod;
    rmb(); /* Ensure we see queued requests up to 'rp'. */

    for ( i = blk_ring->req_cons; 
         (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i);
          i++ )
    {
        if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
        {
            more_to_do = 1;
            break;
        }
        
        req = RING_GET_REQUEST(blk_ring, i);
        switch ( req->operation )
        {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
            dispatch_rw_block_io(blkif, req);
            break;

        case BLKIF_OP_PROBE:
            dispatch_probe(blkif, req);
            break;

        default:
            DPRINTK("error: unknown block io operation [%d]\n",
                    req->operation);
            make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
            break;
        }
    }

    blk_ring->req_cons = i;
    return more_to_do;
}

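/*
 * dispatch_probe() answers a BLKIF_OP_PROBE request: the frontend supplies a
 * single page-sized buffer, which is temporarily mapped here and filled by
 * vbd_probe() with an array of vdisk_t records describing the virtual disks
 * exported to that domain. The mapping is torn down again before the
 * response is sent.
 */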
static void dispatch_probe(blkif_t *blkif, blkif_request_t *req)
{
    int rsp = BLKIF_RSP_ERROR;
    int pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];

    /* We expect one buffer only. */
    if ( unlikely(req->nr_segments != 1) )
        goto out;

    /* Make sure the buffer is page-sized. */
    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
         (blkif_last_sect(req->frame_and_sects[0]) != 7) )
        goto out;

#ifdef CONFIG_XEN_BLKDEV_GRANT
    {
        gnttab_op_t     op;

        op.u.map_grant_ref.host_virt_addr = MMAP_VADDR(pending_idx, 0);
        op.u.map_grant_ref.flags = GNTMAP_host_map;
        op.u.map_grant_ref.ref = blkif_gref_from_fas(req->frame_and_sects[0]);
        op.u.map_grant_ref.dom = blkif->domid;

        if ( unlikely(HYPERVISOR_grant_table_op(
                        GNTTABOP_map_grant_ref, &op, 1)))
            BUG();

        if ( op.u.map_grant_ref.handle < 0 )
            goto out;

        pending_handle(pending_idx, 0) = op.u.map_grant_ref.handle;
    }
#else /* else CONFIG_XEN_BLKDEV_GRANT */

#ifdef CONFIG_XEN_BLKDEV_TAP_BE
    /* Grab the real frontend out of the probe message. */
    if (req->frame_and_sects[1] == BLKTAP_COOKIE) 
        blkif->is_blktap = 1;
#endif


#ifdef CONFIG_XEN_BLKDEV_TAP_BE
    if ( HYPERVISOR_update_va_mapping_otherdomain(
        MMAP_VADDR(pending_idx, 0),
        (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
        0, (blkif->is_blktap ? ID_TO_DOM(req->id) : blkif->domid) ) )
        
        goto out;
#else
    if ( HYPERVISOR_update_va_mapping_otherdomain(
        MMAP_VADDR(pending_idx, 0),
        (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
        0, blkif->domid) ) 
        
        goto out;
#endif
#endif /* endif CONFIG_XEN_BLKDEV_GRANT */
   
    rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0), 
                    PAGE_SIZE / sizeof(vdisk_t));

 out:
    fast_flush_area(pending_idx, 1);
    make_response(blkif, req->id, req->operation, rsp);
}

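/*
 * dispatch_rw_block_io() handles a read or write: it validates the segment
 * descriptors, maps the guest's buffer pages into the local mmap area (via
 * grant table or update_va_mapping_otherdomain, depending on configuration),
 * translates the virtual-device sector range with vbd_translate(), and then
 * submits the I/O -- as buffer_heads on 2.4 kernels, or as one or more bios
 * on 2.6. Completion is tracked through the pending_req_t allocated for the
 * request.
 */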
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
{
    extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
    int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
    unsigned long fas = 0;
    int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
    pending_req_t *pending_req;
#ifdef CONFIG_XEN_BLKDEV_GRANT
    gnttab_op_t       aop[BLKIF_MAX_SEGMENTS_PER_REQUEST];
#else
    unsigned long remap_prot;
    multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST];
#endif
    struct phys_req preq;
    struct { 
        unsigned long buf; unsigned int nsec;
    } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    unsigned int nseg;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
    struct buffer_head *bh;
#else
    struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    int nbio = 0;
    request_queue_t *q;
#endif

    /* Check that number of segments is sane. */
    nseg = req->nr_segments;
    if ( unlikely(nseg == 0) || 
         unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
    {
        DPRINTK("Bad number of segments in request (%d)\n", nseg);
        goto bad_descriptor;
    }

    preq.dev           = req->device;
    preq.sector_number = req->sector_number;
    preq.nr_sects      = 0;

#ifdef CONFIG_XEN_BLKDEV_GRANT
    for ( i = 0; i < nseg; i++ )
    {
        fas         = req->frame_and_sects[i];
        seg[i].nsec = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;

        if ( seg[i].nsec <= 0 )
            goto bad_descriptor;
        preq.nr_sects += seg[i].nsec;

        aop[i].u.map_grant_ref.host_virt_addr = MMAP_VADDR(pending_idx, i);

        aop[i].u.map_grant_ref.dom = blkif->domid;
        aop[i].u.map_grant_ref.ref = blkif_gref_from_fas(fas);
        aop[i].u.map_grant_ref.flags = ( GNTMAP_host_map   |
                                       ( ( operation == READ ) ?
                                             0 : GNTMAP_readonly ) );
    }

    if ( unlikely(HYPERVISOR_grant_table_op(
                    GNTTABOP_map_grant_ref, aop, nseg)))
        BUG();

    for ( i = 0; i < nseg; i++ )
    {
        if ( unlikely(aop[i].u.map_grant_ref.handle < 0) )
        {
            DPRINTK("invalid buffer -- could not remap it\n");
            fast_flush_area(pending_idx, nseg);
            goto bad_descriptor;
        }

        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
            FOREIGN_FRAME(aop[i].u.map_grant_ref.dev_bus_addr);

        pending_handle(pending_idx, i) = aop[i].u.map_grant_ref.handle;
    }
#endif

    for ( i = 0; i < nseg; i++ )
    {
        fas         = req->frame_and_sects[i];
#ifdef CONFIG_XEN_BLKDEV_GRANT
        seg[i].buf  = (aop[i].u.map_grant_ref.dev_bus_addr << PAGE_SHIFT) |
                      (blkif_first_sect(fas) << 9);
#else
        seg[i].buf  = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9);
        seg[i].nsec = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;
        if ( seg[i].nsec <= 0 )
            goto bad_descriptor;
        preq.nr_sects += seg[i].nsec;
#endif
    }

    if ( vbd_translate(&preq, blkif, operation) != 0 )
    {
        DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", 
                operation == READ ? "read" : "write", preq.sector_number,
                preq.sector_number + preq.nr_sects, preq.dev); 
        goto bad_descriptor;
    }

#ifndef CONFIG_XEN_BLKDEV_GRANT
    if ( operation == READ )
        remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
    else
        remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED;

    for ( i = 0; i < nseg; i++ )
    {
        mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
        mcl[i].args[0] = MMAP_VADDR(pending_idx, i);
        mcl[i].args[1] = (seg[i].buf & PAGE_MASK) | remap_prot;
        mcl[i].args[2] = 0;
        mcl[i].args[3] = blkif->domid;
#ifdef CONFIG_XEN_BLKDEV_TAP_BE
        if ( blkif->is_blktap )
            mcl[i].args[3] = ID_TO_DOM(req->id);
#endif
        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
            FOREIGN_FRAME(seg[i].buf >> PAGE_SHIFT);
    }

    BUG_ON(HYPERVISOR_multicall(mcl, nseg) != 0);

    for ( i = 0; i < nseg; i++ )
    {
        if ( unlikely(mcl[i].args[5] != 0) )
        {
            DPRINTK("invalid buffer -- could not remap it\n");
            fast_flush_area(pending_idx, nseg);
            goto bad_descriptor;
        }
    }
#endif /* end ifndef CONFIG_XEN_BLKDEV_GRANT */

    pending_req = &pending_reqs[pending_idx];
    pending_req->blkif     = blkif;
    pending_req->id        = req->id;
    pending_req->operation = operation;
    pending_req->status    = BLKIF_RSP_OKAY;
    pending_req->nr_pages  = nseg;

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)

    atomic_set(&pending_req->pendcnt, nseg);
    pending_cons++;
    blkif_get(blkif);

    for ( i = 0; i < nseg; i++ )
    {
        bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL);
        if ( unlikely(bh == NULL) )
        {
            __end_block_io_op(pending_req, 0);
            continue;
        }

        memset(bh, 0, sizeof (struct buffer_head));

        init_waitqueue_head(&bh->b_wait);
        bh->b_size          = seg[i].nsec << 9;
        bh->b_dev           = preq.dev;
        bh->b_rdev          = preq.dev;
        bh->b_rsector       = (unsigned long)preq.sector_number;
        bh->b_data          = (char *)MMAP_VADDR(pending_idx, i) +
            (seg[i].buf & ~PAGE_MASK);
        bh->b_page          = virt_to_page(MMAP_VADDR(pending_idx, i));
        bh->b_end_io        = end_block_io_op;
        bh->b_private       = pending_req;

        bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | 
            (1 << BH_Req) | (1 << BH_Launder);
        if ( operation == WRITE )
            bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate);

        atomic_set(&bh->b_count, 1);

        /* Dispatch a single request. We'll flush it to disc later. */
        generic_make_request(operation, bh);

        preq.sector_number += seg[i].nsec;
    }

#else

    for ( i = 0; i < nseg; i++ )
    {
        while ( (bio == NULL) ||
                (bio_add_page(bio,
                              virt_to_page(MMAP_VADDR(pending_idx, i)),
                              seg[i].nsec << 9,
                              seg[i].buf & ~PAGE_MASK) == 0) )
        {
            bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
            if ( unlikely(bio == NULL) )
            {
                for ( i = 0; i < (nbio-1); i++ )
                    bio_put(biolist[i]);
                fast_flush_area(pending_idx, nseg);
                goto bad_descriptor;
            }
                
            bio->bi_bdev    = preq.bdev;
            bio->bi_private = pending_req;
            bio->bi_end_io  = end_block_io_op;
            bio->bi_sector  = preq.sector_number;
        }

        preq.sector_number += seg[i].nsec;
    }

    if ( (q = bdev_get_queue(bio->bi_bdev)) != plugged_queue )
    {
        flush_plugged_queue();
        blk_get_queue(q);
        plugged_queue = q;
    }

    atomic_set(&pending_req->pendcnt, nbio);
    pending_cons++;
    blkif_get(blkif);

    for ( i = 0; i < nbio; i++ )
        submit_bio(operation, biolist[i]);

#endif

    return;

 bad_descriptor:
    make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
}



/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */


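/*
 * make_response() places a single response on the interface's shared ring
 * (under blk_ring_lock, since responses can be queued both from the xenblkd
 * thread and from I/O completion callbacks) and then notifies the frontend
 * through its event channel.
 */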
static void make_response(blkif_t *blkif, unsigned long id, 
                          unsigned short op, int st)
{
    blkif_response_t *resp;
    unsigned long     flags;
    blkif_back_ring_t *blk_ring = &blkif->blk_ring;

    /* Place on the response ring for the relevant domain. */ 
    spin_lock_irqsave(&blkif->blk_ring_lock, flags);
    resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
    resp->id        = id;
    resp->operation = op;
    resp->status    = st;
    wmb(); /* Ensure other side can see the response fields. */
    blk_ring->rsp_prod_pvt++;
    RING_PUSH_RESPONSES(blk_ring);
    spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

    /* Kick the relevant domain. */
    notify_via_evtchn(blkif->evtchn);
}

void blkif_deschedule(blkif_t *blkif)
{
    remove_from_blkdev_list(blkif);
}

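/*
 * blkif_init() only does real work in a domain that may act as a block
 * backend. It reserves the empty lowmem region used for mapping guest pages,
 * marks every pending-request slot free (pending_prod starts a full ring
 * ahead of pending_cons), starts the xenblkd thread and registers the
 * control interface.
 */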
static int __init blkif_init(void)
{
    int i;

    if ( !(xen_start_info.flags & SIF_INITDOMAIN) &&
         !(xen_start_info.flags & SIF_BLK_BE_DOMAIN) )
        return 0;

    blkif_interface_init();

    if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
        BUG();

    pending_cons = 0;
    pending_prod = MAX_PENDING_REQS;
    memset(pending_reqs, 0, sizeof(pending_reqs));
    for ( i = 0; i < MAX_PENDING_REQS; i++ )
        pending_ring[i] = i;
    
    spin_lock_init(&blkio_schedule_list_lock);
    INIT_LIST_HEAD(&blkio_schedule_list);

    if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 )
        BUG();

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
    buffer_head_cachep = kmem_cache_create(
        "buffer_head_cache", sizeof(struct buffer_head),
        0, SLAB_HWCACHE_ALIGN, NULL, NULL);
#endif

    blkif_ctrlif_init();
    
#ifdef CONFIG_XEN_BLKDEV_GRANT
    /* memset() cannot store a 16-bit pattern, and would only cover half the
     * array anyway; mark each handle invalid explicitly. */
    for ( i = 0; i < MMAP_PAGES; i++ )
        pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
    printk(KERN_ALERT "Blkif backend is using grant tables.\n");
#endif

#ifdef CONFIG_XEN_BLKDEV_TAP_BE
    printk(KERN_ALERT "NOTE: Blkif backend is running with tap support on!\n");
#endif

    return 0;
}

__initcall(blkif_init);