/******************************************************************************
 * arch/xen/drivers/blkif/backend/main.c
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  arch/xen/drivers/blkif/frontend
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <xen/balloon.h>
#include <asm/hypervisor.h>
#include <asm/hypercall.h>
#include "common.h"

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
 *
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
static int blkif_reqs = 64;
module_param_named(reqs, blkif_reqs, int, 0);
MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");

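/*
 * Example (illustrative): 'reqs' has permission 0, so it is a load-time
 * parameter only, e.g.
 *     modprobe blkback reqs=128
 * assuming the backend is built as a module named 'blkback'; when built in,
 * the equivalent boot parameter would be blkback.reqs=128.
 */
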
/* Run-time switchable: /sys/module/blkback/parameters/ */
static int log_stats = 0;
static int debug_lvl = 0;
module_param(log_stats, int, 0644);
module_param(debug_lvl, int, 0644);

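/*
 * Example (illustrative): both flags can be toggled at run time through
 * sysfs, e.g.
 *     echo 1 > /sys/module/blkback/parameters/log_stats
 * which makes the scheduler thread print per-interface request statistics.
 */
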
/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each bio that completes decrements 'pendcnt'
 * towards zero. When it hits zero, the specified domain has a response queued
 * for it, with the saved 'id' passed back.
 */
typedef struct {
        blkif_t       *blkif;
        unsigned long  id;
        int            nr_pages;
        atomic_t       pendcnt;
        unsigned short operation;
        int            status;
        struct list_head free_list;
} pending_req_t;

static pending_req_t *pending_reqs;
static struct list_head pending_free;
static DEFINE_SPINLOCK(pending_free_lock);
static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);

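/*
 * Lifecycle of a pending_req: alloc_req() takes one off 'pending_free',
 * dispatch_rw_block_io() fills it in and submits one or more bios that all
 * point back at it, each bio completion decrements 'pendcnt', and when the
 * last bio finishes the response is pushed to the frontend and the request
 * is returned to the pool with free_req().
 */
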
#define BLKBACK_INVALID_HANDLE (~0)

static struct page **pending_pages;
static grant_handle_t *pending_grant_handles;

static inline int vaddr_pagenr(pending_req_t *req, int seg)
{
        return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
}

static inline unsigned long vaddr(pending_req_t *req, int seg)
{
        unsigned long pfn = page_to_pfn(pending_pages[vaddr_pagenr(req, seg)]);
        return (unsigned long)pfn_to_kaddr(pfn);
}

#define pending_handle(_req, _seg) \
        (pending_grant_handles[vaddr_pagenr(_req, _seg)])

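/*
 * pending_pages and pending_grant_handles are flat arrays indexed by
 * vaddr_pagenr(): request k, segment s maps to slot
 * k * BLKIF_MAX_SEGMENTS_PER_REQUEST + s, so every in-flight request owns a
 * fixed window of pre-allocated pages and grant-handle slots.
 */
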
static int do_block_io_op(blkif_t *blkif);
static void dispatch_rw_block_io(blkif_t *blkif,
                                 blkif_request_t *req,
                                 pending_req_t *pending_req);
static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st);

/******************************************************************
 * misc small helpers
 */
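/*
 * alloc_req()/free_req() manage the fixed pool of pending_req_t structures.
 * alloc_req() returns NULL when the pool is exhausted (the caller counts an
 * 'out of requests' event and retries later); free_req() wakes
 * pending_free_wq only on the empty-to-non-empty transition so the scheduler
 * thread is not woken needlessly.
 */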
static pending_req_t *alloc_req(void)
{
        pending_req_t *req = NULL;
        unsigned long flags;

        spin_lock_irqsave(&pending_free_lock, flags);
        if (!list_empty(&pending_free)) {
                req = list_entry(pending_free.next, pending_req_t, free_list);
                list_del(&req->free_list);
        }
        spin_unlock_irqrestore(&pending_free_lock, flags);
        return req;
}

static void free_req(pending_req_t *req)
{
        unsigned long flags;
        int was_empty;

        spin_lock_irqsave(&pending_free_lock, flags);
        was_empty = list_empty(&pending_free);
        list_add(&req->free_list, &pending_free);
        spin_unlock_irqrestore(&pending_free_lock, flags);
        if (was_empty)
                wake_up(&pending_free_wq);
}

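/*
 * blkif->plug tracks the request queue behind the most recently submitted
 * bio. plug_queue() takes a reference on the new queue (dropping the old one
 * via unplug_queue() if the target device changed), and unplug_queue() kicks
 * the queue's unplug_fn once a whole batch has been submitted so the
 * requests start being serviced.
 */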
static void unplug_queue(blkif_t *blkif)
{
        if (blkif->plug == NULL)
                return;
        if (blkif->plug->unplug_fn)
                blkif->plug->unplug_fn(blkif->plug);
        blk_put_queue(blkif->plug);
        blkif->plug = NULL;
}

static void plug_queue(blkif_t *blkif, struct bio *bio)
{
        request_queue_t *q = bdev_get_queue(bio->bi_bdev);

        if (q == blkif->plug)
                return;
        unplug_queue(blkif);
        blk_get_queue(q);
        blkif->plug = q;
}

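/*
 * fast_flush_area() tears down the grant mappings of a finished (or failed)
 * request: every segment that still holds a valid handle is added to a
 * single batched GNTTABOP_unmap_grant_ref call and its slot is reset to
 * BLKBACK_INVALID_HANDLE.
 */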
static void fast_flush_area(pending_req_t *req)
{
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int i, invcount = 0;
        grant_handle_t handle;
        int ret;

        for (i = 0; i < req->nr_pages; i++) {
                handle = pending_handle(req, i);
                if (handle == BLKBACK_INVALID_HANDLE)
                        continue;
                /* Index by invcount so the unmap batch stays dense even if
                 * some handles were never set up. */
                gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
                                    GNTMAP_host_map, handle);
                pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
                invcount++;
        }

        ret = HYPERVISOR_grant_table_op(
                GNTTABOP_unmap_grant_ref, unmap, invcount);
        BUG_ON(ret);
}

/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static void print_stats(blkif_t *blkif)
{
        printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
               current->comm, blkif->st_oo_req,
               blkif->st_rd_req, blkif->st_wr_req);
        blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
        blkif->st_rd_req = 0;
        blkif->st_wr_req = 0;
        blkif->st_oo_req = 0;
}

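/*
 * blkif_schedule() is the per-interface kernel thread. It sleeps until the
 * frontend signals work (blkif->wq) and until at least one pending_req is
 * free, then drains the shared ring via do_block_io_op(), unplugs the queue
 * it touched, and optionally prints statistics every ten seconds.
 */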
int blkif_schedule(void *arg)
{
        blkif_t *blkif = arg;

        blkif_get(blkif);

        if (debug_lvl)
                printk(KERN_DEBUG "%s: started\n", current->comm);

        while (!kthread_should_stop()) {
                wait_event_interruptible(
                        blkif->wq,
                        blkif->waiting_reqs || kthread_should_stop());
                wait_event_interruptible(
                        pending_free_wq,
                        !list_empty(&pending_free) || kthread_should_stop());

                blkif->waiting_reqs = 0;
                smp_mb(); /* clear flag *before* checking for work */

                if (do_block_io_op(blkif))
                        blkif->waiting_reqs = 1;
                unplug_queue(blkif);

                if (log_stats && time_after(jiffies, blkif->st_print))
                        print_stats(blkif);
        }

        if (log_stats)
                print_stats(blkif);
        if (debug_lvl)
                printk(KERN_DEBUG "%s: exiting\n", current->comm);

        blkif->xenblkd = NULL;
        blkif_put(blkif);

        return 0;
}

/******************************************************************
 * COMPLETION CALLBACK -- Called as bio->bi_end_io()
 */

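/*
 * With the 2.6.18-era bio API, bi_end_io can be invoked for partial
 * completions; end_block_io_op() therefore ignores calls while bio->bi_size
 * is still non-zero and only finishes the request once the whole bio is
 * done. Any error fails the entire blkif request.
 */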
static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
{
        /* An error fails the entire request. */
        if (!uptodate) {
                DPRINTK("Buffer not up-to-date at end of operation\n");
                pending_req->status = BLKIF_RSP_ERROR;
        }

        if (atomic_dec_and_test(&pending_req->pendcnt)) {
                fast_flush_area(pending_req);
                make_response(pending_req->blkif, pending_req->id,
                              pending_req->operation, pending_req->status);
                blkif_put(pending_req->blkif);
                free_req(pending_req);
        }
}

static int end_block_io_op(struct bio *bio, unsigned int done, int error)
{
        if (bio->bi_size != 0)
                return 1;
        __end_block_io_op(bio->bi_private, !error);
        bio_put(bio);
        return error;
}

/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

static void blkif_notify_work(blkif_t *blkif)
{
        blkif->waiting_reqs = 1;
        wake_up(&blkif->wq);
}

irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
{
        blkif_notify_work(dev_id);
        return IRQ_HANDLED;
}

/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */

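/*
 * do_block_io_op() consumes requests from the shared ring: it snapshots the
 * producer index, issues a read barrier so the request contents are visible,
 * and then handles one request per free pending_req until it catches up with
 * the frontend. Returning non-zero tells the scheduler thread that work was
 * left behind (e.g. the pending_req pool ran dry) and it should re-run.
 */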
static int do_block_io_op(blkif_t *blkif)
{
        blkif_back_ring_t *blk_ring = &blkif->blk_ring;
        blkif_request_t *req;
        pending_req_t *pending_req;
        RING_IDX rc, rp;
        int more_to_do = 0;

        rc = blk_ring->req_cons;
        rp = blk_ring->sring->req_prod;
        rmb(); /* Ensure we see queued requests up to 'rp'. */

        while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {

                pending_req = alloc_req();
                if (NULL == pending_req) {
                        blkif->st_oo_req++;
                        more_to_do = 1;
                        break;
                }

                req = RING_GET_REQUEST(blk_ring, rc);
                blk_ring->req_cons = ++rc; /* before make_response() */

                switch (req->operation) {
                case BLKIF_OP_READ:
                        blkif->st_rd_req++;
                        dispatch_rw_block_io(blkif, req, pending_req);
                        break;
                case BLKIF_OP_WRITE:
                        blkif->st_wr_req++;
                        dispatch_rw_block_io(blkif, req, pending_req);
                        break;
                default:
                        DPRINTK("error: unknown block io operation [%d]\n",
                                req->operation);
                        make_response(blkif, req->id, req->operation,
                                      BLKIF_RSP_ERROR);
                        free_req(pending_req);
                        break;
                }
        }
        return more_to_do;
}

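/*
 * dispatch_rw_block_io() turns one ring request into real block I/O:
 * validate the segment count, map the guest's granted frames into this
 * domain, let vbd_translate() check access and rebase the sector number onto
 * the backing device, pack the segments into as few bios as possible, and
 * submit them with the pending_req as the shared completion context.
 */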
static void dispatch_rw_block_io(blkif_t *blkif,
                                 blkif_request_t *req,
                                 pending_req_t *pending_req)
{
        int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
        struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct phys_req preq;
        struct {
                unsigned long buf; unsigned int nsec;
        } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int nseg;
        struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        int ret, i, nbio = 0;

        /* Check that the number of segments is sane. */
        nseg = req->nr_segments;
        if (unlikely(nseg == 0) ||
            unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
                DPRINTK("Bad number of segments in request (%d)\n", nseg);
                goto fail_response;
        }

        preq.dev           = req->handle;
        preq.sector_number = req->sector_number;
        preq.nr_sects      = 0;

        pending_req->blkif     = blkif;
        pending_req->id        = req->id;
        pending_req->operation = operation;
        pending_req->status    = BLKIF_RSP_OKAY;
        pending_req->nr_pages  = nseg;

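        /*
         * Map every granted segment into this domain at the page reserved
         * for (pending_req, i). For a guest WRITE the backend only needs to
         * read the guest's data, so the grant is mapped read-only; READs
         * need a writable mapping so the data can be placed in the page.
         */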
        for (i = 0; i < nseg; i++) {
                uint32_t flags;

                seg[i].nsec = req->seg[i].last_sect -
                        req->seg[i].first_sect + 1;

                if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
                    (seg[i].nsec <= 0))
                        goto fail_response;
                preq.nr_sects += seg[i].nsec;

                flags = GNTMAP_host_map;
                if (operation == WRITE)
                        flags |= GNTMAP_readonly;
                gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
                                  req->seg[i].gref, blkif->domid);
        }

        ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
        BUG_ON(ret);

        for (i = 0; i < nseg; i++) {
                if (unlikely(map[i].status != 0)) {
                        DPRINTK("invalid buffer -- could not remap it\n");
                        goto fail_flush;
                }

                pending_handle(pending_req, i) = map[i].handle;
                set_phys_to_machine(__pa(vaddr(
                        pending_req, i)) >> PAGE_SHIFT,
                        FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
                seg[i].buf  = map[i].dev_bus_addr |
                        (req->seg[i].first_sect << 9);
        }

        if (vbd_translate(&preq, blkif, operation) != 0) {
                DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
                        operation == READ ? "read" : "write",
                        preq.sector_number,
                        preq.sector_number + preq.nr_sects, preq.dev);
                goto fail_flush;
        }

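        /*
         * Build the bios: keep adding segments to the current bio until
         * bio_add_page() refuses (queue limits reached), then start a new
         * one. Every bio completes through end_block_io_op() against the
         * same pending_req, so 'pendcnt' is simply the number of bios.
         */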
        for (i = 0; i < nseg; i++) {
                if (((int)preq.sector_number|(int)seg[i].nsec) &
                    ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
                        DPRINTK("Misaligned I/O request from domain %d",
                                blkif->domid);
                        goto fail_put_bio;
                }

                while ((bio == NULL) ||
                       (bio_add_page(bio,
                                     virt_to_page(vaddr(pending_req, i)),
                                     seg[i].nsec << 9,
                                     seg[i].buf & ~PAGE_MASK) == 0)) {
                        bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
                        if (unlikely(bio == NULL))
                                goto fail_put_bio;

                        bio->bi_bdev    = preq.bdev;
                        bio->bi_private = pending_req;
                        bio->bi_end_io  = end_block_io_op;
                        bio->bi_sector  = preq.sector_number;
                }

                preq.sector_number += seg[i].nsec;
        }

        plug_queue(blkif, bio);
        atomic_set(&pending_req->pendcnt, nbio);
        blkif_get(blkif);

        for (i = 0; i < nbio; i++)
                submit_bio(operation, biolist[i]);

        return;

 fail_put_bio:
        /* Release every bio allocated so far; the last slot may be NULL if
         * bio_alloc() failed. */
        for (i = 0; i < nbio; i++)
                if (biolist[i])
                        bio_put(biolist[i]);
 fail_flush:
        fast_flush_area(pending_req);
 fail_response:
        make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
        free_req(pending_req);
}

/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */

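/*
 * make_response() queues a single response on the interface's shared ring,
 * advances the private producer and, via RING_PUSH_RESPONSES_AND_CHECK_NOTIFY,
 * decides whether the frontend needs an event-channel notification. If more
 * requests are already waiting on the ring it re-kicks the scheduler thread
 * rather than waiting for another interrupt.
 */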
static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st)
{
        blkif_response_t *resp;
        unsigned long     flags;
        blkif_back_ring_t *blk_ring = &blkif->blk_ring;
        int more_to_do = 0;
        int notify;

        spin_lock_irqsave(&blkif->blk_ring_lock, flags);

        /* Place on the response ring for the relevant domain. */
        resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
        resp->id        = id;
        resp->operation = op;
        resp->status    = st;
        blk_ring->rsp_prod_pvt++;
        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);

        if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
                /*
                 * Tail check for pending requests. Allows frontend to avoid
                 * notifications if requests are already in flight (lower
                 * overheads and promotes batching).
                 */
                RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
        } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
                more_to_do = 1;
        }

        spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

        if (more_to_do)
                blkif_notify_work(blkif);
        if (notify)
                notify_remote_via_irq(blkif->irq);
}

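/*
 * Module initialisation: allocate blkif_reqs pending_req structures plus one
 * page and one grant-handle slot per possible segment
 * (blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST in total), mark every handle
 * invalid, thread the requests onto the free list, and register the
 * interface and xenbus machinery.
 */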
static int __init blkif_init(void)
{
        int i, mmap_pages;

        if (!is_running_on_xen())
                return -ENODEV;

        mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;

        pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
                                        blkif_reqs, GFP_KERNEL);
        pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
                                        mmap_pages, GFP_KERNEL);
        pending_pages         = alloc_empty_pages_and_pagevec(mmap_pages);

        if (!pending_reqs || !pending_grant_handles || !pending_pages)
                goto out_of_memory;

        for (i = 0; i < mmap_pages; i++)
                pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;

        blkif_interface_init();

        memset(pending_reqs, 0, blkif_reqs * sizeof(pending_reqs[0]));
        INIT_LIST_HEAD(&pending_free);

        for (i = 0; i < blkif_reqs; i++)
                list_add_tail(&pending_reqs[i].free_list, &pending_free);

        blkif_xenbus_init();

        return 0;

 out_of_memory:
        kfree(pending_reqs);
        kfree(pending_grant_handles);
        free_empty_pages_and_pagevec(pending_pages, mmap_pages);
        printk(KERN_ERR "%s: out of memory\n", __FUNCTION__);
        return -ENOMEM;
}

module_init(blkif_init);

MODULE_LICENSE("Dual BSD/GPL");