Merge to Fedora kernel-2.6.18-1.2224_FC5 patched with stable patch-2.6.18.1-vs2.0...
[linux-2.6.git] / drivers / xen / balloon / balloon.c
1 /******************************************************************************
2  * balloon.c
3  *
4  * Xen balloon driver - enables returning/claiming memory to/from Xen.
5  *
6  * Copyright (c) 2003, B Dragovic
7  * Copyright (c) 2003-2004, M Williamson, K Fraser
8  * Copyright (c) 2005 Dan M. Smith, IBM Corporation
9  * 
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public License version 2
12  * as published by the Free Software Foundation; or, when distributed
13  * separately from the Linux kernel or incorporated into other
14  * software packages, subject to the following license:
15  * 
16  * Permission is hereby granted, free of charge, to any person obtaining a copy
17  * of this source file (the "Software"), to deal in the Software without
18  * restriction, including without limitation the rights to use, copy, modify,
19  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
20  * and to permit persons to whom the Software is furnished to do so, subject to
21  * the following conditions:
22  * 
23  * The above copyright notice and this permission notice shall be included in
24  * all copies or substantial portions of the Software.
25  * 
26  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
27  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
28  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
29  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
30  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
31  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
32  * IN THE SOFTWARE.
33  */
34
35 #include <linux/kernel.h>
36 #include <linux/module.h>
37 #include <linux/sched.h>
38 #include <linux/errno.h>
39 #include <linux/mm.h>
40 #include <linux/mman.h>
41 #include <linux/smp_lock.h>
42 #include <linux/pagemap.h>
43 #include <linux/bootmem.h>
44 #include <linux/highmem.h>
45 #include <linux/vmalloc.h>
46 #include <xen/xen_proc.h>
47 #include <asm/hypervisor.h>
48 #include <xen/balloon.h>
49 #include <xen/interface/memory.h>
50 #include <asm/pgalloc.h>
51 #include <asm/pgtable.h>
52 #include <asm/uaccess.h>
53 #include <asm/tlb.h>
54 #include <linux/list.h>
55
56 #include <xen/xenbus.h>
57
58 #define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
59
60 #ifdef CONFIG_PROC_FS
61 static struct proc_dir_entry *balloon_pde;
62 #endif
63
64 static DECLARE_MUTEX(balloon_mutex);
65
66 /*
67  * Protects atomic reservation decrease/increase against concurrent increases.
68  * Also protects non-atomic updates of current_pages and driver_pages, and
69  * balloon lists.
70  */
71 DEFINE_SPINLOCK(balloon_lock);
72
73 /* We aim for 'current allocation' == 'target allocation'. */
74 static unsigned long current_pages;
75 static unsigned long target_pages;
76
77 /* We increase/decrease in batches which fit in a page */
78 static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
79
80 /* VM /proc information for memory */
81 extern unsigned long totalram_pages;
82
83 /* We may hit the hard limit in Xen. If we do then we remember it. */
84 static unsigned long hard_limit;
85
86 /*
87  * Drivers may alter the memory reservation independently, but they must
88  * inform the balloon driver so that we can avoid hitting the hard limit.
89  */
90 static unsigned long driver_pages;
91
92 /* List of ballooned pages, threaded through the mem_map array. */
93 static LIST_HEAD(ballooned_pages);
94 static unsigned long balloon_low, balloon_high;
95
96 /* Main work function, always executed in process context. */
97 static void balloon_process(void *unused);
98 static DECLARE_WORK(balloon_worker, balloon_process, NULL);
99 static struct timer_list balloon_timer;
100
101 /* When ballooning out (allocating memory to return to Xen) we don't really 
102    want the kernel to try too hard since that can trigger the oom killer. */
103 #define GFP_BALLOON \
104         (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC)
105
106 #define PAGE_TO_LIST(p) (&(p)->lru)
107 #define LIST_TO_PAGE(l) list_entry((l), struct page, lru)
108 #define UNLIST_PAGE(p)                          \
109         do {                                    \
110                 list_del(PAGE_TO_LIST(p));      \
111                 PAGE_TO_LIST(p)->next = NULL;   \
112                 PAGE_TO_LIST(p)->prev = NULL;   \
113         } while(0)
114
115 #define IPRINTK(fmt, args...) \
116         printk(KERN_INFO "xen_mem: " fmt, ##args)
117 #define WPRINTK(fmt, args...) \
118         printk(KERN_WARNING "xen_mem: " fmt, ##args)
119
120 /* balloon_append: add the given page to the balloon. */
121 static void balloon_append(struct page *page)
122 {
123         /* Lowmem is re-populated first, so highmem pages go at list tail. */
124         if (PageHighMem(page)) {
125                 list_add_tail(PAGE_TO_LIST(page), &ballooned_pages);
126                 balloon_high++;
127         } else {
128                 list_add(PAGE_TO_LIST(page), &ballooned_pages);
129                 balloon_low++;
130         }
131 }
132
133 /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
134 static struct page *balloon_retrieve(void)
135 {
136         struct page *page;
137
138         if (list_empty(&ballooned_pages))
139                 return NULL;
140
141         page = LIST_TO_PAGE(ballooned_pages.next);
142         UNLIST_PAGE(page);
143
144         if (PageHighMem(page))
145                 balloon_high--;
146         else
147                 balloon_low--;
148
149         return page;
150 }
151
152 static struct page *balloon_first_page(void)
153 {
154         if (list_empty(&ballooned_pages))
155                 return NULL;
156         return LIST_TO_PAGE(ballooned_pages.next);
157 }
158
159 static struct page *balloon_next_page(struct page *page)
160 {
161         struct list_head *next = PAGE_TO_LIST(page)->next;
162         if (next == &ballooned_pages)
163                 return NULL;
164         return LIST_TO_PAGE(next);
165 }
166
167 static void balloon_alarm(unsigned long unused)
168 {
169         schedule_work(&balloon_worker);
170 }
171
172 static unsigned long current_target(void)
173 {
174         unsigned long target = min(target_pages, hard_limit);
175         if (target > (current_pages + balloon_low + balloon_high))
176                 target = current_pages + balloon_low + balloon_high;
177         return target;
178 }
179
180 static int increase_reservation(unsigned long nr_pages)
181 {
182         unsigned long  pfn, i, flags;
183         struct page   *page;
184         long           rc;
185         struct xen_memory_reservation reservation = {
186                 .address_bits = 0,
187                 .extent_order = 0,
188                 .domid        = DOMID_SELF
189         };
190
191         if (nr_pages > ARRAY_SIZE(frame_list))
192                 nr_pages = ARRAY_SIZE(frame_list);
193
194         balloon_lock(flags);
195
196         page = balloon_first_page();
197         for (i = 0; i < nr_pages; i++) {
198                 BUG_ON(page == NULL);
199                 frame_list[i] = page_to_pfn(page);;
200                 page = balloon_next_page(page);
201         }
202
203         set_xen_guest_handle(reservation.extent_start, frame_list);
204         reservation.nr_extents   = nr_pages;
205         rc = HYPERVISOR_memory_op(
206                 XENMEM_populate_physmap, &reservation);
207         if (rc < nr_pages) {
208                 if (rc > 0) {
209                         int ret;
210
211                         /* We hit the Xen hard limit: reprobe. */
212                         reservation.nr_extents = rc;
213                         ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
214                                         &reservation);
215                         BUG_ON(ret != rc);
216                 }
217                 if (rc >= 0)
218                         hard_limit = current_pages + rc - driver_pages;
219                 goto out;
220         }
221
222         for (i = 0; i < nr_pages; i++) {
223                 page = balloon_retrieve();
224                 BUG_ON(page == NULL);
225
226                 pfn = page_to_pfn(page);
227                 BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
228                        phys_to_machine_mapping_valid(pfn));
229
230                 /* Update P->M and M->P tables. */
231                 set_phys_to_machine(pfn, frame_list[i]);
232                 xen_machphys_update(frame_list[i], pfn);
233
234                 /* Link back into the page tables if not highmem. */
235                 if (pfn < max_low_pfn) {
236                         int ret;
237                         ret = HYPERVISOR_update_va_mapping(
238                                 (unsigned long)__va(pfn << PAGE_SHIFT),
239                                 pfn_pte_ma(frame_list[i], PAGE_KERNEL),
240                                 0);
241                         BUG_ON(ret);
242                 }
243
244                 /* Relinquish the page back to the allocator. */
245                 ClearPageReserved(page);
246                 init_page_count(page);
247                 __free_page(page);
248         }
249
250         current_pages += nr_pages;
251         totalram_pages = current_pages;
252
253  out:
254         balloon_unlock(flags);
255
256         return 0;
257 }
258
/*
 * decrease_reservation: allocate up to @nr_pages pages from the kernel,
 * scrub them, and return the underlying machine frames to Xen.
 *
 * Returns nonzero ("need sleep") when the allocator could not supply the
 * full batch, so the caller backs off rather than fighting for memory.
 * Runs under balloon_mutex via balloon_process().
 */
static int decrease_reservation(unsigned long nr_pages)
{
	unsigned long  pfn, i, flags;
	struct page   *page;
	void          *v;
	int            need_sleep = 0;
	int ret;
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};

	/* Batch size is bounded by the one-page frame_list. */
	if (nr_pages > ARRAY_SIZE(frame_list))
		nr_pages = ARRAY_SIZE(frame_list);

	for (i = 0; i < nr_pages; i++) {
		/* GFP_BALLOON: don't try hard enough to wake the OOM killer. */
		if ((page = alloc_page(GFP_BALLOON)) == NULL) {
			nr_pages = i;
			need_sleep = 1;
			break;
		}

		pfn = page_to_pfn(page);
		frame_list[i] = pfn_to_mfn(pfn);

		if (!PageHighMem(page)) {
			/* Scrub, then zap the kernel linear mapping before
			 * the frame leaves this domain. */
			v = phys_to_virt(pfn << PAGE_SHIFT);
			scrub_pages(v, 1);
			ret = HYPERVISOR_update_va_mapping(
				(unsigned long)v, __pte_ma(0), 0);
			BUG_ON(ret);
		}
#ifdef CONFIG_XEN_SCRUB_PAGES
		else {
			/* Highmem pages are scrubbed through a transient
			 * kmap; any stale kmaps are flushed below. */
			v = kmap(page);
			scrub_pages(v, 1);
			kunmap(page);
		}
#endif
	}

	/* Ensure that ballooned highmem pages don't have kmaps. */
	kmap_flush_unused();
	flush_tlb_all();

	balloon_lock(flags);

	/* No more mappings: invalidate P2M and add to balloon. */
	for (i = 0; i < nr_pages; i++) {
		pfn = mfn_to_pfn(frame_list[i]);
		set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
		balloon_append(pfn_to_page(pfn));
	}

	set_xen_guest_handle(reservation.extent_start, frame_list);
	reservation.nr_extents   = nr_pages;
	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
	BUG_ON(ret != nr_pages);

	current_pages -= nr_pages;
	totalram_pages = current_pages;

	balloon_unlock(flags);

	return need_sleep;
}
326
327 /*
328  * We avoid multiple worker processes conflicting via the balloon mutex.
329  * We may of course race updates of the target counts (which are protected
330  * by the balloon lock), or with changes to the Xen hard limit, but we will
331  * recover from these in time.
332  */
333 static void balloon_process(void *unused)
334 {
335         int need_sleep = 0;
336         long credit;
337
338         down(&balloon_mutex);
339
340         do {
341                 credit = current_target() - current_pages;
342                 if (credit > 0)
343                         need_sleep = (increase_reservation(credit) != 0);
344                 if (credit < 0)
345                         need_sleep = (decrease_reservation(-credit) != 0);
346
347 #ifndef CONFIG_PREEMPT
348                 if (need_resched())
349                         schedule();
350 #endif
351         } while ((credit != 0) && !need_sleep);
352
353         /* Schedule more work if there is some still to be done. */
354         if (current_target() != current_pages)
355                 mod_timer(&balloon_timer, jiffies + HZ);
356
357         up(&balloon_mutex);
358 }
359
360 /* Resets the Xen limit, sets new target, and kicks off processing. */
361 static void set_new_target(unsigned long target)
362 {
363         /* No need for lock. Not read-modify-write updates. */
364         hard_limit   = ~0UL;
365         target_pages = target;
366         schedule_work(&balloon_worker);
367 }
368
369 static struct xenbus_watch target_watch =
370 {
371         .node = "memory/target"
372 };
373
374 /* React to a change in the target key */
375 static void watch_target(struct xenbus_watch *watch,
376                          const char **vec, unsigned int len)
377 {
378         unsigned long long new_target;
379         int err;
380
381         err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
382         if (err != 1) {
383                 /* This is ok (for domain0 at least) - so just return */
384                 return;
385         }
386
387         /* The given memory/target value is in KiB, so it needs converting to
388          * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
389          */
390         set_new_target(new_target >> (PAGE_SHIFT - 10));
391 }
392
393 static int balloon_init_watcher(struct notifier_block *notifier,
394                                 unsigned long event,
395                                 void *data)
396 {
397         int err;
398
399         err = register_xenbus_watch(&target_watch);
400         if (err)
401                 printk(KERN_ERR "Failed to set balloon watcher\n");
402
403         return NOTIFY_DONE;
404 }
405
406 #ifdef CONFIG_PROC_FS
407 static int balloon_write(struct file *file, const char __user *buffer,
408                          unsigned long count, void *data)
409 {
410         char memstring[64], *endchar;
411         unsigned long long target_bytes;
412
413         if (!capable(CAP_SYS_ADMIN))
414                 return -EPERM;
415
416         if (count <= 1)
417                 return -EBADMSG; /* runt */
418         if (count > sizeof(memstring))
419                 return -EFBIG;   /* too long */
420
421         if (copy_from_user(memstring, buffer, count))
422                 return -EFAULT;
423         memstring[sizeof(memstring)-1] = '\0';
424
425         target_bytes = memparse(memstring, &endchar);
426         set_new_target(target_bytes >> PAGE_SHIFT);
427
428         return count;
429 }
430
431 static int balloon_read(char *page, char **start, off_t off,
432                         int count, int *eof, void *data)
433 {
434         int len;
435
436         len = sprintf(
437                 page,
438                 "Current allocation: %8lu kB\n"
439                 "Requested target:   %8lu kB\n"
440                 "Low-mem balloon:    %8lu kB\n"
441                 "High-mem balloon:   %8lu kB\n"
442                 "Driver pages:       %8lu kB\n"
443                 "Xen hard limit:     ",
444                 PAGES2KB(current_pages), PAGES2KB(target_pages), 
445                 PAGES2KB(balloon_low), PAGES2KB(balloon_high),
446                 PAGES2KB(driver_pages));
447
448         if (hard_limit != ~0UL)
449                 len += sprintf(page + len, "%8lu kB\n", PAGES2KB(hard_limit));
450         else
451                 len += sprintf(page + len, "     ??? kB\n");
452
453         *eof = 1;
454         return len;
455 }
456 #endif
457
458 static struct notifier_block xenstore_notifier;
459
460 static int __init balloon_init(void)
461 {
462         unsigned long pfn;
463         struct page *page;
464
465         if (!is_running_on_xen())
466                 return -ENODEV;
467
468         IPRINTK("Initialising balloon driver.\n");
469
470         current_pages = min(xen_start_info->nr_pages, max_pfn);
471         totalram_pages = current_pages;
472         target_pages  = current_pages;
473         balloon_low   = 0;
474         balloon_high  = 0;
475         driver_pages  = 0UL;
476         hard_limit    = ~0UL;
477
478         init_timer(&balloon_timer);
479         balloon_timer.data = 0;
480         balloon_timer.function = balloon_alarm;
481     
482 #ifdef CONFIG_PROC_FS
483         if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) {
484                 WPRINTK("Unable to create /proc/xen/balloon.\n");
485                 return -1;
486         }
487
488         balloon_pde->read_proc  = balloon_read;
489         balloon_pde->write_proc = balloon_write;
490 #endif
491     
492         /* Initialise the balloon with excess memory space. */
493         for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
494                 page = pfn_to_page(pfn);
495                 if (!PageReserved(page))
496                         balloon_append(page);
497         }
498
499         target_watch.callback = watch_target;
500         xenstore_notifier.notifier_call = balloon_init_watcher;
501
502         register_xenstore_notifier(&xenstore_notifier);
503     
504         return 0;
505 }
506
507 subsys_initcall(balloon_init);
508
509 void balloon_update_driver_allowance(long delta)
510 {
511         unsigned long flags;
512
513         balloon_lock(flags);
514         driver_pages += delta;
515         balloon_unlock(flags);
516 }
517
/*
 * apply_to_page_range() callback: surrender the frame behind one kernel
 * PTE to Xen.  Clears the PTE, invalidates the P2M slot for that virtual
 * address, then issues a single-extent decrease_reservation.  Always
 * returns 0 so the range walk continues; a hypercall shortfall is fatal.
 */
static int dealloc_pte_fn(
	pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
{
	unsigned long mfn = pte_mfn(*pte);
	int ret;
	struct xen_memory_reservation reservation = {
		.nr_extents   = 1,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};
	set_xen_guest_handle(reservation.extent_start, &mfn);
	/* Unmap first so no live mapping exists when the frame is given up. */
	set_pte_at(&init_mm, addr, pte, __pte_ma(0));
	/* NOTE(review): __pa(addr) implies addr lies in the direct (lowmem)
	 * mapping — confirm callers only pass such addresses. */
	set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
	BUG_ON(ret != 1);
	return 0;
}
535
/*
 * Allocate @nr_pages kernel pages and detach their machine frames,
 * yielding "empty" pseudo-physical pages.  The struct pages stay with
 * the caller; the frames go back to Xen, so current_pages drops by one
 * per page emptied.
 *
 * Returns a kmalloc'ed vector of page pointers, or NULL on failure
 * (partial progress is undone by parking pages in the balloon).
 * Release with free_empty_pages_and_pagevec().
 */
struct page **alloc_empty_pages_and_pagevec(int nr_pages)
{
	unsigned long vaddr, flags;
	struct page *page, **pagevec;
	int i, ret;

	pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL);
	if (pagevec == NULL)
		return NULL;

	for (i = 0; i < nr_pages; i++) {
		/* GFP_KERNEL (no __GFP_HIGHMEM): page_address() below is
		 * valid without kmap. */
		page = pagevec[i] = alloc_page(GFP_KERNEL);
		if (page == NULL)
			goto err;

		vaddr = (unsigned long)page_address(page);

		scrub_pages(vaddr, 1);

		balloon_lock(flags);

		if (xen_feature(XENFEAT_auto_translated_physmap)) {
			/* Auto-translated guests have no foreign MFNs:
			 * hand the GMFN straight back to Xen. */
			unsigned long gmfn = page_to_pfn(page);
			struct xen_memory_reservation reservation = {
				.nr_extents   = 1,
				.extent_order = 0,
				.domid        = DOMID_SELF
			};
			set_xen_guest_handle(reservation.extent_start, &gmfn);
			ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
						   &reservation);
			if (ret == 1)
				ret = 0; /* success */
		} else {
			/* Clear the PTE and release the frame via
			 * dealloc_pte_fn(). */
			ret = apply_to_page_range(&init_mm, vaddr, PAGE_SIZE,
						  dealloc_pte_fn, NULL);
		}

		if (ret != 0) {
			/* This page still owns its frame: free it normally. */
			balloon_unlock(flags);
			__free_page(page);
			goto err;
		}

		totalram_pages = --current_pages;

		balloon_unlock(flags);
	}

 out:
	/* Kick the worker: it may now be able to repopulate from Xen. */
	schedule_work(&balloon_worker);
	flush_tlb_all();
	return pagevec;

 err:
	/* Pages already emptied cannot simply be freed (their frames are
	 * gone), so park them in the balloon instead. */
	balloon_lock(flags);
	while (--i >= 0)
		balloon_append(pagevec[i]);
	balloon_unlock(flags);
	kfree(pagevec);
	pagevec = NULL;
	goto out;
}
599
600 void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
601 {
602         unsigned long flags;
603         int i;
604
605         if (pagevec == NULL)
606                 return;
607
608         balloon_lock(flags);
609         for (i = 0; i < nr_pages; i++) {
610                 BUG_ON(page_count(pagevec[i]) != 1);
611                 balloon_append(pagevec[i]);
612         }
613         balloon_unlock(flags);
614
615         kfree(pagevec);
616
617         schedule_work(&balloon_worker);
618 }
619
620 void balloon_release_driver_page(struct page *page)
621 {
622         unsigned long flags;
623
624         balloon_lock(flags);
625         balloon_append(page);
626         driver_pages--;
627         balloon_unlock(flags);
628
629         schedule_work(&balloon_worker);
630 }
631
632 EXPORT_SYMBOL_GPL(balloon_update_driver_allowance);
633 EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec);
634 EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec);
635 EXPORT_SYMBOL_GPL(balloon_release_driver_page);
636
637 MODULE_LICENSE("Dual BSD/GPL");