/******************************************************************************
 * arch/xen/drivers/blkif/backend/main.c
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  arch/xen/drivers/blkif/frontend
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <xen/balloon.h>
#include <asm/hypervisor.h>
#include <asm/hypercall.h>
#include "common.h"

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
 *
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
static int blkif_reqs = 64;
module_param_named(reqs, blkif_reqs, int, 0);
MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
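/*
 * Example (assuming the driver is loaded as the 'blkback' module, as the
 * sysfs path below suggests): 'modprobe blkback reqs=128' doubles the number
 * of in-flight requests.  With permissions 0 the parameter has no sysfs
 * entry, so it can only be set at load time.
 */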

/* Run-time switchable: /sys/module/blkback/parameters/ */
static unsigned int log_stats = 0;
static unsigned int debug_lvl = 0;
module_param(log_stats, int, 0644);
module_param(debug_lvl, int, 0644);
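/*
 * Example: 'echo 1 > /sys/module/blkback/parameters/log_stats' enables the
 * periodic per-thread statistics line printed by print_stats() below;
 * writing 0 disables it again.
 */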

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each bio that completes decrements the
 * pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
        blkif_t       *blkif;
        unsigned long  id;
        int            nr_pages;
        atomic_t       pendcnt;
        unsigned short operation;
        int            status;
        struct list_head free_list;
} pending_req_t;

static pending_req_t *pending_reqs;
static struct list_head pending_free;
static DEFINE_SPINLOCK(pending_free_lock);
static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);

#define BLKBACK_INVALID_HANDLE (~0)

static struct page **pending_pages;
static grant_handle_t *pending_grant_handles;

static inline int vaddr_pagenr(pending_req_t *req, int seg)
{
        return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
}

static inline unsigned long vaddr(pending_req_t *req, int seg)
{
        unsigned long pfn = page_to_pfn(pending_pages[vaddr_pagenr(req, seg)]);
        return (unsigned long)pfn_to_kaddr(pfn);
}

#define pending_handle(_req, _seg) \
        (pending_grant_handles[vaddr_pagenr(_req, _seg)])
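/*
 * Layout note: each pending_req_t owns a fixed window of
 * BLKIF_MAX_SEGMENTS_PER_REQUEST slots in pending_pages[] and
 * pending_grant_handles[], so request k, segment s lives at flat index
 * k * BLKIF_MAX_SEGMENTS_PER_REQUEST + s (see vaddr_pagenr() above and the
 * mmap_pages sizing in blkif_init() below).
 */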


static int do_block_io_op(blkif_t *blkif);
static void dispatch_rw_block_io(blkif_t *blkif,
                                 blkif_request_t *req,
                                 pending_req_t *pending_req);
static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st);

/******************************************************************
 * misc small helpers
 */
static pending_req_t* alloc_req(void)
{
        pending_req_t *req = NULL;
        unsigned long flags;

        spin_lock_irqsave(&pending_free_lock, flags);
        if (!list_empty(&pending_free)) {
                req = list_entry(pending_free.next, pending_req_t, free_list);
                list_del(&req->free_list);
        }
        spin_unlock_irqrestore(&pending_free_lock, flags);
        return req;
}

static void free_req(pending_req_t *req)
{
        unsigned long flags;
        int was_empty;

        spin_lock_irqsave(&pending_free_lock, flags);
        was_empty = list_empty(&pending_free);
        list_add(&req->free_list, &pending_free);
        spin_unlock_irqrestore(&pending_free_lock, flags);
        if (was_empty)
                wake_up(&pending_free_wq);
}
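/*
 * Note that free_req() only wakes pending_free_wq on the empty -> non-empty
 * transition; blkif_schedule() sleeps on that queue whenever the free list
 * is exhausted, so one wake-up per refill is sufficient.
 */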

static void unplug_queue(blkif_t *blkif)
{
        if (blkif->plug == NULL)
                return;
        if (blkif->plug->unplug_fn)
                blkif->plug->unplug_fn(blkif->plug);
        blk_put_queue(blkif->plug);
        blkif->plug = NULL;
}

static void plug_queue(blkif_t *blkif, struct bio *bio)
{
        request_queue_t *q = bdev_get_queue(bio->bi_bdev);

        if (q == blkif->plug)
                return;
        unplug_queue(blkif);
        blk_get_queue(q);
        blkif->plug = q;
}

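/*
 * Tear down every grant mapping still held by a request, batching all of the
 * unmap operations into a single GNTTABOP_unmap_grant_ref hypercall.
 */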
static void fast_flush_area(pending_req_t *req)
{
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int i, invcount = 0;
        grant_handle_t handle;
        int ret;

        for (i = 0; i < req->nr_pages; i++) {
                handle = pending_handle(req, i);
                if (handle == BLKBACK_INVALID_HANDLE)
                        continue;
                /* Pack valid entries densely so the first 'invcount' slots
                 * are exactly the ones passed to the hypercall. */
                gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
                                    GNTMAP_host_map, handle);
                pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
                invcount++;
        }

        ret = HYPERVISOR_grant_table_op(
                GNTTABOP_unmap_grant_ref, unmap, invcount);
        BUG_ON(ret);
}

/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static void print_stats(blkif_t *blkif)
{
        printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
               current->comm, blkif->st_oo_req,
               blkif->st_rd_req, blkif->st_wr_req);
        blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
        blkif->st_rd_req = 0;
        blkif->st_wr_req = 0;
        blkif->st_oo_req = 0;
}

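/*
 * Per-device kernel thread: sleep until the frontend has posted work *and* a
 * free pending_req is available, then drain the ring via do_block_io_op().
 * If requests had to be deferred for lack of a pending_req, waiting_reqs is
 * set again so the loop makes another pass without a further interrupt.
 */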
int blkif_schedule(void *arg)
{
        blkif_t *blkif = arg;

        blkif_get(blkif);

        if (debug_lvl)
                printk(KERN_DEBUG "%s: started\n", current->comm);

        while (!kthread_should_stop()) {
                wait_event_interruptible(
                        blkif->wq,
                        blkif->waiting_reqs || kthread_should_stop());
                wait_event_interruptible(
                        pending_free_wq,
                        !list_empty(&pending_free) || kthread_should_stop());

                blkif->waiting_reqs = 0;
                smp_mb(); /* clear flag *before* checking for work */

                if (do_block_io_op(blkif))
                        blkif->waiting_reqs = 1;
                unplug_queue(blkif);

                if (log_stats && time_after(jiffies, blkif->st_print))
                        print_stats(blkif);
        }

        if (log_stats)
                print_stats(blkif);
        if (debug_lvl)
                printk(KERN_DEBUG "%s: exiting\n", current->comm);

        blkif->xenblkd = NULL;
        blkif_put(blkif);

        return 0;
}

/******************************************************************
 * COMPLETION CALLBACK -- Called as bio->bi_end_io()
 */

static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
{
        /* An error fails the entire request. */
        if (!uptodate) {
                DPRINTK("Buffer not up-to-date at end of operation\n");
                pending_req->status = BLKIF_RSP_ERROR;
        }

        if (atomic_dec_and_test(&pending_req->pendcnt)) {
                fast_flush_area(pending_req);
                make_response(pending_req->blkif, pending_req->id,
                              pending_req->operation, pending_req->status);
                blkif_put(pending_req->blkif);
                free_req(pending_req);
        }
}

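/*
 * With this kernel's three-argument bi_end_io prototype the callback may be
 * invoked for partial completions; returning 1 while bio->bi_size is still
 * non-zero defers the real completion work until the whole bio has finished.
 */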
static int end_block_io_op(struct bio *bio, unsigned int done, int error)
{
        if (bio->bi_size != 0)
                return 1;
        __end_block_io_op(bio->bi_private, !error);
        bio_put(bio);
        return error;
}


/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

static void blkif_notify_work(blkif_t *blkif)
{
        blkif->waiting_reqs = 1;
        wake_up(&blkif->wq);
}

irqreturn_t blkif_be_int(int irq, void *dev_id)
{
        blkif_notify_work(dev_id);
        return IRQ_HANDLED;
}


/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */

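/*
 * Pull requests off the shared ring and dispatch them.  Returns non-zero if
 * the loop had to stop early (no free pending_req), i.e. there is still work
 * queued on the ring for the next pass of blkif_schedule().
 */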
static int do_block_io_op(blkif_t *blkif)
{
        blkif_back_ring_t *blk_ring = &blkif->blk_ring;
        blkif_request_t req;
        pending_req_t *pending_req;
        RING_IDX rc, rp;
        int more_to_do = 0;

        rc = blk_ring->req_cons;
        rp = blk_ring->sring->req_prod;
        rmb(); /* Ensure we see queued requests up to 'rp'. */

        while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {

                pending_req = alloc_req();
                if (NULL == pending_req) {
                        blkif->st_oo_req++;
                        more_to_do = 1;
                        break;
                }

                memcpy(&req, RING_GET_REQUEST(blk_ring, rc), sizeof(req));
                blk_ring->req_cons = ++rc; /* before make_response() */

                switch (req.operation) {
                case BLKIF_OP_READ:
                        blkif->st_rd_req++;
                        dispatch_rw_block_io(blkif, &req, pending_req);
                        break;
                case BLKIF_OP_WRITE:
                        blkif->st_wr_req++;
                        dispatch_rw_block_io(blkif, &req, pending_req);
                        break;
                default:
                        DPRINTK("error: unknown block io operation [%d]\n",
                                req.operation);
                        make_response(blkif, req.id, req.operation,
                                      BLKIF_RSP_ERROR);
                        free_req(pending_req);
                        break;
                }
        }
        return more_to_do;
}

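/*
 * Dispatch one read/write request: map the frontend's granted pages into
 * this domain, validate the extent against the virtual block device
 * (vbd_translate), pack the segments into as few bios as possible and
 * submit them.  The grant mappings are torn down in fast_flush_area() once
 * every bio has completed.
 */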
static void dispatch_rw_block_io(blkif_t *blkif,
                                 blkif_request_t *req,
                                 pending_req_t *pending_req)
{
        extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
        int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
        struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct phys_req preq;
        struct {
                unsigned long buf; unsigned int nsec;
        } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int nseg;
        struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        int ret, i, nbio = 0;

        /* Check that number of segments is sane. */
        nseg = req->nr_segments;
        if (unlikely(nseg == 0) ||
            unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
                DPRINTK("Bad number of segments in request (%d)\n", nseg);
                goto fail_response;
        }

        preq.dev           = req->handle;
        preq.sector_number = req->sector_number;
        preq.nr_sects      = 0;

        pending_req->blkif     = blkif;
        pending_req->id        = req->id;
        pending_req->operation = operation;
        pending_req->status    = BLKIF_RSP_OKAY;
        pending_req->nr_pages  = nseg;

        for (i = 0; i < nseg; i++) {
                uint32_t flags;

                seg[i].nsec = req->seg[i].last_sect -
                        req->seg[i].first_sect + 1;

                if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
                    (req->seg[i].last_sect < req->seg[i].first_sect))
                        goto fail_response;
                preq.nr_sects += seg[i].nsec;

                flags = GNTMAP_host_map;
                if (operation == WRITE)
                        flags |= GNTMAP_readonly;
                gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
                                  req->seg[i].gref, blkif->domid);
        }

        ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
        BUG_ON(ret);

        for (i = 0; i < nseg; i++) {
                if (unlikely(map[i].status != 0)) {
                        DPRINTK("invalid buffer -- could not remap it\n");
                        map[i].handle = BLKBACK_INVALID_HANDLE;
                        ret |= 1;
                }

                pending_handle(pending_req, i) = map[i].handle;

                if (ret)
                        continue;

                set_phys_to_machine(__pa(vaddr(
                        pending_req, i)) >> PAGE_SHIFT,
                        FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
                seg[i].buf  = map[i].dev_bus_addr |
                        (req->seg[i].first_sect << 9);
        }

        if (ret)
                goto fail_flush;

        if (vbd_translate(&preq, blkif, operation) != 0) {
                DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
                        operation == READ ? "read" : "write",
                        preq.sector_number,
                        preq.sector_number + preq.nr_sects, preq.dev);
                goto fail_flush;
        }

        for (i = 0; i < nseg; i++) {
                if (((int)preq.sector_number|(int)seg[i].nsec) &
                    ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
                        DPRINTK("Misaligned I/O request from domain %d\n",
                                blkif->domid);
                        goto fail_put_bio;
                }

                while ((bio == NULL) ||
                       (bio_add_page(bio,
                                     virt_to_page(vaddr(pending_req, i)),
                                     seg[i].nsec << 9,
                                     seg[i].buf & ~PAGE_MASK) == 0)) {
                        bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
                        if (unlikely(bio == NULL))
                                goto fail_put_bio;

                        bio->bi_bdev    = preq.bdev;
                        bio->bi_private = pending_req;
                        bio->bi_end_io  = end_block_io_op;
                        bio->bi_sector  = preq.sector_number;
                }

                preq.sector_number += seg[i].nsec;
        }

        plug_queue(blkif, bio);
        atomic_set(&pending_req->pendcnt, nbio);
        blkif_get(blkif);

        for (i = 0; i < nbio; i++)
                submit_bio(operation, biolist[i]);

        return;

 fail_put_bio:
        for (i = 0; i < (nbio-1); i++)
                bio_put(biolist[i]);
 fail_flush:
        fast_flush_area(pending_req);
 fail_response:
        make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
        free_req(pending_req);
}

/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */


static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st)
{
        blkif_response_t *resp;
        unsigned long     flags;
        blkif_back_ring_t *blk_ring = &blkif->blk_ring;
        int more_to_do = 0;
        int notify;

        spin_lock_irqsave(&blkif->blk_ring_lock, flags);

        /* Place on the response ring for the relevant domain. */
        resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
        resp->id        = id;
        resp->operation = op;
        resp->status    = st;
        blk_ring->rsp_prod_pvt++;
        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);

        if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
                /*
                 * Tail check for pending requests. Allows frontend to avoid
                 * notifications if requests are already in flight (lower
                 * overheads and promotes batching).
                 */
                RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
        } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
                more_to_do = 1;
        }
        spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

        if (more_to_do)
                blkif_notify_work(blkif);
        if (notify)
                notify_remote_via_irq(blkif->irq);
}

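/*
 * Module init: sized for blkif_reqs in-flight requests, which requires
 * blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST empty pages (and as many grant
 * handles) to serve as mapping slots for frontend-granted buffers.
 */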
static int __init blkif_init(void)
{
        int i, mmap_pages;

        if (!is_running_on_xen())
                return -ENODEV;

        mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;

        pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
                                        blkif_reqs, GFP_KERNEL);
        pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
                                        mmap_pages, GFP_KERNEL);
        pending_pages         = alloc_empty_pages_and_pagevec(mmap_pages);

        if (!pending_reqs || !pending_grant_handles || !pending_pages)
                goto out_of_memory;

        for (i = 0; i < mmap_pages; i++)
                pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;

        blkif_interface_init();

        /* Zero the whole pending_reqs array, not just a pointer's worth. */
        memset(pending_reqs, 0, sizeof(pending_reqs[0]) * blkif_reqs);
        INIT_LIST_HEAD(&pending_free);

        for (i = 0; i < blkif_reqs; i++)
                list_add_tail(&pending_reqs[i].free_list, &pending_free);

        blkif_xenbus_init();

        return 0;

 out_of_memory:
        kfree(pending_reqs);
        kfree(pending_grant_handles);
        free_empty_pages_and_pagevec(pending_pages, mmap_pages);
        printk(KERN_WARNING "%s: out of memory\n", __FUNCTION__);
        return -ENOMEM;
}

module_init(blkif_init);

MODULE_LICENSE("Dual BSD/GPL");