1 /* ckrm_mem.c - Memory Resource Manager for CKRM
3 * Copyright (C) Chandra Seetharaman, IBM Corp. 2004
5 * Provides a Memory Resource controller for CKRM
7 * Latest version, more details at http://ckrm.sf.net
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
16 /* Code Description: TBD
20 #include <linux/module.h>
21 #include <linux/init.h>
22 #include <linux/slab.h>
23 #include <asm/errno.h>
24 #include <linux/list.h>
25 #include <linux/spinlock.h>
26 #include <linux/pagemap.h>
27 #include <linux/swap.h>
28 #include <linux/swapops.h>
29 #include <linux/cache.h>
30 #include <linux/percpu.h>
31 #include <linux/pagevec.h>
33 #include <linux/ckrm_mem_inline.h>
35 #include <asm/uaccess.h>
36 #include <asm/pgtable.h>
/* Name under which this resource controller registers with the CKRM core. */
#define MEM_NAME "mem"

#define CKRM_MEM_MAX_HIERARCHY 2 // allows only up to 2 levels - 0, 1 & 2

/* all 1-level memory_share_class are chained together */
static LIST_HEAD(ckrm_memclass_list);
/* Classes queued for page reclaim; consumed outside this file (exported). */
LIST_HEAD(ckrm_shrink_list);
EXPORT_SYMBOL(ckrm_shrink_list);
spinlock_t ckrm_mem_lock = SPIN_LOCK_UNLOCKED; // protects both lists above
EXPORT_SYMBOL(ckrm_mem_lock);
unsigned int ckrm_tot_lru_pages; // total # of pages in the system
// currently doesn't handle memory add/remove
EXPORT_SYMBOL(ckrm_tot_lru_pages);

/* System-level (root) memory class; assigned in mem_res_alloc(). */
static ckrm_mem_res_t *ckrm_mem_root_class;
/* NOTE(review): presumably the number of pages actually charged to
 * classes ("rlt" in mem_get_stats()) - incremented elsewhere; confirm. */
atomic_t ckrm_mem_real_count = ATOMIC_INIT(0);
EXPORT_SYMBOL(ckrm_mem_real_count);
/* Initialize rescls values
 * May be called on each rcfs unmount or as part of error recovery
 * to make share values sane.
 * Does not traverse hierarchy reinitializing children.
 */
/*
 * Recompute ckrm_tot_lru_pages as the sum of active + inactive + free
 * pages across zones.
 * NOTE(review): this listing is missing lines here (return type, braces,
 * the zone declaration and the for_each_zone() loop header) - restore
 * them from the original file before building.
 */
set_ckrm_tot_pages(void)
	int tot_lru_pages = 0;

	tot_lru_pages += zone->nr_active;
	tot_lru_pages += zone->nr_inactive;
	tot_lru_pages += zone->free_pages;
	ckrm_tot_lru_pages = tot_lru_pages;
/*
 * Reset a single memory class to its default share values: no explicit
 * guarantee/limit of its own (DONTCARE), default totals, and no page
 * accounting figures.  Zeroes the whole structure first.
 * NOTE(review): return type, braces and possibly other lines are missing
 * from this listing.
 */
mem_res_initcls_one(void *my_res)
	ckrm_mem_res_t *res = my_res;

	memset(res, 0, sizeof(ckrm_mem_res_t));

	/* Share bookkeeping defaults. */
	res->shares.my_guarantee = CKRM_SHARE_DONTCARE;
	res->shares.my_limit = CKRM_SHARE_DONTCARE;
	res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
	res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
	res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
	res->shares.cur_max_limit = 0;

	/* Page-count equivalents start out unconstrained as well. */
	res->pg_guar = CKRM_SHARE_DONTCARE;
	res->pg_limit = CKRM_SHARE_DONTCARE;
	res->pg_unused = CKRM_SHARE_DONTCARE;
/*
 * CKRM res_alloc callback: allocate and initialize a memory class for
 * core under parent.  Rejects classes deeper than CKRM_MEM_MAX_HIERARCHY,
 * a second root class, and a child created before the root exists.
 * Root class gets guarantee/limit equal to all LRU pages in the system.
 * NOTE(review): this listing is missing lines (return statements for the
 * error branches, NULL-check after kmalloc, res->core assignment, braces);
 * the visible code alone is not complete control flow.
 */
mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
	ckrm_mem_res_t *res, *parres;

	/* Controller not registered yet. */
	if (mem_rcbs.resid == -1) {

	parres = ckrm_get_res_class(parent, mem_rcbs.resid, ckrm_mem_res_t);
	if (parres && (parres->hier == CKRM_MEM_MAX_HIERARCHY)) {
		// allows only up to CKRM_MEM_MAX_HIERARCHY

	if (unlikely((parent == NULL) && (ckrm_mem_root_class != NULL))) {
		printk(KERN_ERR "MEM_RC: Only one root class is allowed\n");

	if (unlikely((parent != NULL) && (ckrm_mem_root_class == NULL))) {
		printk(KERN_ERR "MEM_RC: creating child class without root class\n");

	/* GFP_ATOMIC: callback may run in a context that must not sleep. */
	res = kmalloc(sizeof(ckrm_mem_res_t), GFP_ATOMIC);
	mem_res_initcls_one(res);
	res->parent = parent;
	spin_lock(&ckrm_mem_lock);
	list_add(&res->mcls_list, &ckrm_memclass_list);
	spin_unlock(&ckrm_mem_lock);
	if (parent == NULL) {
		// I am part of the root class. So, set the max to
		// number of pages available
		res->pg_guar = ckrm_tot_lru_pages;
		res->pg_unused = ckrm_tot_lru_pages;
		res->pg_limit = ckrm_tot_lru_pages;
		ckrm_mem_root_class = res;
	res->hier = parres->hier + 1;
	printk(KERN_ERR "mem_res_alloc: failed GFP_ATOMIC alloc\n");
/*
 * Recompute parres->shares.cur_max_limit as the maximum my_limit over
 * the parent's direct children, walking them under ckrm_lock_hier().
 *
 * It is the caller's responsibility to make sure that the parent only
 * has children that are to be accounted. i.e if a new child is added
 * this function should be called after it has been added, and if a
 * child is deleted this should be called after the child is removed.
 *
 * NOTE(review): the declaration/initialization of `maxlimit`, the
 * completion of the ckrm_get_res_class() call, return type and braces
 * are missing from this listing.
 */
child_maxlimit_changed_local(ckrm_mem_res_t *parres)
	ckrm_mem_res_t *childres;
	ckrm_core_class_t *child = NULL;

	// run thru parent's children and get the new max_limit of the parent
	ckrm_lock_hier(parres->core);
	while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
		childres = ckrm_get_res_class(child, mem_rcbs.resid,
		if (maxlimit < childres->shares.my_limit) {
			maxlimit = childres->shares.my_limit;
	ckrm_unlock_hier(parres->core);
	parres->shares.cur_max_limit = maxlimit;
/*
 * CKRM res_free callback: tear down a memory class.  Returns the class's
 * guarantee/limit to the parent, then unlinks it from ckrm_memclass_list.
 * NOTE(review): NULL checks, the kfree() of res, return type and braces
 * appear to be missing from this listing.
 */
mem_res_free(void *my_res)
	ckrm_mem_res_t *res = my_res;
	ckrm_mem_res_t *parres;

	parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);

	// return child's limit/guarantee to parent node
	child_guarantee_changed(&parres->shares, res->shares.my_guarantee, 0);
	child_maxlimit_changed_local(parres);

	res->shares.my_guarantee = 0;
	res->shares.my_limit = 0;
	spin_lock(&ckrm_mem_lock);
	list_del(&res->mcls_list);
	spin_unlock(&ckrm_mem_lock);
/*
 * Recalculate the guarantee and limit in # of pages... and propagate the
 * changes down to all descendants.
 * Caller is responsible for protecting res and for the integrity of parres.
 *
 * Converts the share ratios into absolute page counts:
 *   pg_guar  = my_guarantee / total_guarantee * parent's pg_guar
 *   pg_limit = my_limit / max_limit * parent's pg_limit
 *   pg_unused = unused_guarantee / total_guarantee * own pg_guar
 * DONTCARE propagates: if either factor is DONTCARE, the result is too.
 * NOTE(review): the parres == NULL (root) handling and closing braces are
 * missing from this listing; `self`/`par` setup visibly assumes parres.
 */
recalc_and_propagate(ckrm_mem_res_t * res, ckrm_mem_res_t * parres)
	ckrm_core_class_t *child = NULL;
	ckrm_mem_res_t *childres;
	int resid = mem_rcbs.resid;
	struct ckrm_shares *self = &res->shares;
	struct ckrm_shares *par = &parres->shares;

	// calculate pg_guar and pg_limit
	if (parres->pg_guar == CKRM_SHARE_DONTCARE ||
			self->my_guarantee == CKRM_SHARE_DONTCARE) {
		res->pg_guar = CKRM_SHARE_DONTCARE;
	} else if (par->total_guarantee) {
		/* 64-bit intermediate avoids overflow of pages * shares. */
		u64 temp = (u64) self->my_guarantee * parres->pg_guar;
		do_div(temp, par->total_guarantee);
		res->pg_guar = (int) temp;

	if (parres->pg_limit == CKRM_SHARE_DONTCARE ||
			self->my_limit == CKRM_SHARE_DONTCARE) {
		res->pg_limit = CKRM_SHARE_DONTCARE;
	} else if (par->max_limit) {
		u64 temp = (u64) self->my_limit * parres->pg_limit;
		do_div(temp, par->max_limit);
		res->pg_limit = (int) temp;

	// Calculate unused units
	if (res->pg_guar == CKRM_SHARE_DONTCARE) {
		res->pg_unused = CKRM_SHARE_DONTCARE;
	} else if (self->total_guarantee) {
		u64 temp = (u64) self->unused_guarantee * res->pg_guar;
		do_div(temp, self->total_guarantee);
		res->pg_unused = (int) temp;

	// propagate to children
	ckrm_lock_hier(res->core);
	while ((child = ckrm_get_next_child(res->core, child)) != NULL) {
		childres = ckrm_get_res_class(child, resid, ckrm_mem_res_t);
		recalc_and_propagate(childres, res);
	ckrm_unlock_hier(res->core);
/*
 * CKRM set_share_values callback: apply new share values to a class via
 * the core set_shares() helper, then refresh the parent's cur_max_limit
 * and re-derive page counts for the whole subtree.
 * NOTE(review): declaration of `rc`, NULL check on res, the final
 * `return rc;`, return type and braces are missing from this listing.
 */
mem_set_share_values(void *my_res, struct ckrm_shares *shares)
	ckrm_mem_res_t *res = my_res;
	ckrm_mem_res_t *parres;

	parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);

	rc = set_shares(shares, &res->shares, parres ? &parres->shares : NULL);

	if ((rc == 0) && (parres != NULL)) {
		child_maxlimit_changed_local(parres);
		recalc_and_propagate(parres, NULL);
/*
 * CKRM get_share_values callback: copy the class's current share
 * settings out to the caller.
 * NOTE(review): return type, braces, NULL check and return statement are
 * missing from this listing.
 */
mem_get_share_values(void *my_res, struct ckrm_shares *shares)
	ckrm_mem_res_t *res = my_res;

	*shares = res->shares;
/*
 * CKRM get_stats callback: emit the class's page accounting to a seq_file,
 * first as a compact one-liner, then in a human-readable block.
 * NOTE(review): several seq_printf() argument lines (the values for the
 * guaranteed/limit/available/lent/borrowed prints), braces and the return
 * are missing from this listing.
 */
mem_get_stats(void *my_res, struct seq_file *sfile)
	ckrm_mem_res_t *res = my_res;

	/* Compact machine-parsable summary line. */
	seq_printf(sfile, "tot %6d;gua %6d;lmt %6d;unu %6d;"
			"lnt %6d;bor %6d;rlt %6d\n", atomic_read(&res->pg_total),
			res->pg_guar, res->pg_limit, res->pg_unused, res->pg_lent,
			res->pg_borrowed, atomic_read(&ckrm_mem_real_count));

	/* Verbose human-readable block. */
	seq_printf(sfile, "----------- Memory Resource stats start -----------\n");
	seq_printf(sfile, "Number of pages used(including pages lent to children):"
			" %d\n", atomic_read(&res->pg_total));
	seq_printf(sfile, "Number of pages guaranteed: %d\n",
	seq_printf(sfile, "Maximum limit of pages: %d\n",
	seq_printf(sfile, "Total number of pages available"
			"(after serving guarantees to children): %d\n",
	seq_printf(sfile, "Number of pages lent to children: %d\n",
	seq_printf(sfile, "Number of pages borrowed from the parent: %d\n",
	seq_printf(sfile, "----------- Memory Resource stats end -----------\n");
/*
 * CKRM change_resclass callback: a task moved from class `old` to `new`.
 * Detach the task from its mm's peer list, and (per the visible tail)
 * re-add it and re-evaluate which class the mm should be charged to,
 * all under mm->peertask_lock.
 * NOTE(review): the body between the list_del_init() and the
 * list_for_each_entry() loop (and the loop's contents) is missing from
 * this listing, as are return type/braces; the logic shown is partial.
 */
mem_change_resclass(void *tsk, void *old, void *new)
	struct mm_struct *mm;
	struct task_struct *task = tsk, *t1;
	struct ckrm_mem_res *prev_mmcls;

	/* Nothing to do for kernel threads, no-op moves, or sentinel old. */
	if (!task->mm || (new == old) || (old == (void *) -1))

	mm = task->active_mm;
	spin_lock(&mm->peertask_lock);
	prev_mmcls = mm->memclass;

	list_del_init(&task->mm_peers);

	list_for_each_entry(t1, &mm->tasklist, mm_peers) {

	list_del_init(&task->mm_peers);
	list_add_tail(&task->mm_peers, &mm->tasklist);

	ckrm_mem_evaluate_mm(mm);
	spin_unlock(&mm->peertask_lock);
// config file is available only at the root level,
// so assuming my_res to be the system level class
/*
 * CKRM set_config callback: currently only logs the config string; no
 * configuration is actually parsed or applied.
 * NOTE(review): return type, braces and return value are missing from
 * this listing.
 */
mem_set_config(void *my_res, const char *cfgstr)
	ckrm_mem_res_t *res = my_res;

	printk(KERN_INFO "%s class of %s is called with config<%s>\n",
			MEM_NAME, res->core->name, cfgstr);
/*
 * CKRM show_config callback: print system-wide page totals (active,
 * inactive, free summed over all zones) to the seq_file.
 * NOTE(review): the `struct zone *zone;` declaration, braces and return
 * are missing from this listing; `res` is declared but the visible code
 * prints MEM_NAME rather than the class name.
 */
mem_show_config(void *my_res, struct seq_file *sfile)
	ckrm_mem_res_t *res = my_res;
	int active = 0, inactive = 0, fr = 0;

	for_each_zone(zone) {
		active += zone->nr_active;
		inactive += zone->nr_inactive;
		fr += zone->free_pages;

	seq_printf(sfile, "res=%s;tot_pages=%d,active=%d,inactive=%d,free=%d\n",
			MEM_NAME, ckrm_tot_lru_pages,active,inactive,fr);
/*
 * CKRM reset_stats callback: only logs the request; no statistics are
 * actually reset.
 * NOTE(review): return type, braces and return value are missing from
 * this listing.
 */
mem_reset_stats(void *my_res)
	ckrm_mem_res_t *res = my_res;

	printk(KERN_INFO " memclass of %s called for reset\n", res->core->name);
/*
 * Controller descriptor registered with the CKRM core; wires the
 * callbacks defined above into the "mem" resource controller.
 * NOTE(review): closing brace/semicolon line is missing from this listing.
 */
struct ckrm_res_ctlr mem_rcbs = {
	.res_name = MEM_NAME,
	.res_hdepth = CKRM_MEM_MAX_HIERARCHY,
	.res_alloc = mem_res_alloc,
	.res_free = mem_res_free,
	.set_share_values = mem_set_share_values,
	.get_share_values = mem_get_share_values,
	.get_stats = mem_get_stats,
	.change_resclass = mem_change_resclass,
	.show_config = mem_show_config,
	.set_config = mem_set_config,
	.reset_stats = mem_reset_stats,

EXPORT_SYMBOL(mem_rcbs);
/*
 * Module init: snapshot the system LRU page total, find the "taskclass"
 * classtype, and register this controller with it.  Returns 0 on success
 * or the negative resid from registration on failure.
 * NOTE(review): the `__init` return-type line, braces, the error return
 * for the missing classtype, and the `resid == -1` guard around
 * registration are missing from this listing.
 */
init_ckrm_mem_res(void)
	struct ckrm_classtype *clstype;
	int resid = mem_rcbs.resid;

	set_ckrm_tot_pages();
	clstype = ckrm_find_classtype_by_name("taskclass");
	if (clstype == NULL) {
		printk(KERN_INFO " Unknown ckrm classtype<taskclass>");

	resid = ckrm_register_res_ctlr(clstype, &mem_rcbs);
	mem_rcbs.classtype = clstype;
	return ((resid < 0) ? resid : 0);
/*
 * Module exit: unregister the controller from the CKRM core.
 * NOTE(review): the `__exit` return-type line and braces are missing
 * from this listing.
 */
exit_ckrm_mem_res(void)
	ckrm_unregister_res_ctlr(&mem_rcbs);

module_init(init_ckrm_mem_res)
module_exit(exit_ckrm_mem_res)
/*
 * OR `flag` into the reclaim_flags of parres and, recursively, of every
 * descendant class, walking children under ckrm_lock_hier().
 * NOTE(review): the completion of the ckrm_get_res_class() call, return
 * type and braces are missing from this listing.
 */
set_flags_of_children(ckrm_mem_res_t *parres, unsigned int flag)
	ckrm_mem_res_t *childres;
	ckrm_core_class_t *child = NULL;

	parres->reclaim_flags |= flag;
	ckrm_lock_hier(parres->core);
	while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
		childres = ckrm_get_res_class(child, mem_rcbs.resid,
		set_flags_of_children(childres, flag);
	ckrm_unlock_hier(parres->core);
// FIXME: more attention is needed to this function
/*
 * Classify how far a class's memory usage is over/under its guarantee
 * and limit, setting the CLS_OVER_* reclaim flags accordingly and
 * returning them.  `range` is the span between guarantee and limit;
 * thresholds are percentages of that range above the guarantee
 * (120%/110%/100%/75%).  A class both well over limit and lending many
 * pages also marks its whole subtree CLS_PARENT_OVER.
 * Classes with no limit set are skipped (flags cleared).
 * NOTE(review): return type, braces and the final `else` line before
 * `res->reclaim_flags = 0;` are missing from this listing.
 */
set_usage_flags(ckrm_mem_res_t *res)
	int tot_usage, cls_usage, range, guar;

	if (res->pg_limit == CKRM_SHARE_DONTCARE) {
		// No limit is set for the class. don't bother it
		res->reclaim_flags = 0;
		return res->reclaim_flags;

	tot_usage = atomic_read(&res->pg_total);
	cls_usage = tot_usage - res->pg_lent;	/* usage charged to self only */
	guar = (res->pg_guar > 0) ? res->pg_guar : 0;
	range = res->pg_limit - guar;

	/* Heavily over limit AND lending a lot: pressure the whole subtree. */
	if ((tot_usage > (guar + ((120 * range) / 100))) &&
			(res->pg_lent > (guar + ((25 * range) / 100)))) {
		set_flags_of_children(res, CLS_PARENT_OVER);

	if (cls_usage > (guar + ((110 * range) / 100))) {
		res->reclaim_flags |= CLS_OVER_110;
	} else if (cls_usage > (guar + range)) {
		res->reclaim_flags |= CLS_OVER_100;
	} else if (cls_usage > (guar + ((3 * range) / 4))) {
		res->reclaim_flags |= CLS_OVER_75;
	} else if (cls_usage > guar) {
		res->reclaim_flags |= CLS_OVER_GUAR;
		res->reclaim_flags = 0;

	return res->reclaim_flags;
/*
 * The functions ckrm_setup_reclamation(), ckrm_teardown_reclamation(),
 * ckrm_get_reclaim_bits() and the macro ckrm_kick_page() along with the
 * macros CLS_* define how the pages are reclaimed.
 * Keeping this logic behind these interfaces eliminates the need to
 * change the reclamation code in the VM if we want to change the logic.
 */
/*
 * Prepare for a reclaim pass: refresh the system page total, reset the
 * root class's page figures to it, re-derive all per-class page counts,
 * then compute and OR together the usage flags of every class.  Returns
 * the combined flag mask.
 * NOTE(review): the `ckrm_mem_res_t *res;` declaration, return type,
 * braces and the final `return ret;` are missing from this listing.
 */
ckrm_setup_reclamation(void)
	unsigned int ret = 0;

	spin_lock(&ckrm_mem_lock);
	set_ckrm_tot_pages();
	ckrm_mem_root_class->pg_guar = ckrm_tot_lru_pages;
	ckrm_mem_root_class->pg_unused = ckrm_tot_lru_pages;
	ckrm_mem_root_class->pg_limit = ckrm_tot_lru_pages;
	recalc_and_propagate(ckrm_mem_root_class, NULL);
	list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
		ret |= set_usage_flags(res);
	spin_unlock(&ckrm_mem_lock);
/*
 * End a reclaim pass: clear reclaim_flags on every memory class.
 * NOTE(review): the `ckrm_mem_res_t *res;` declaration, return type and
 * braces are missing from this listing.
 */
ckrm_teardown_reclamation(void)
	spin_lock(&ckrm_mem_lock);
	list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
		res->reclaim_flags = 0;
	spin_unlock(&ckrm_mem_lock);
/*
 * Select which reclaim-flag bits the VM should act on next.  CLS_SHRINK
 * takes priority; otherwise a mask is built (presumably from the highest
 * set bit `i` of *flags - TODO confirm against the original file) and the
 * remaining higher-priority bits are handed back in *extract.
 * NOTE(review): this listing is heavily truncated here - declarations of
 * i/j/mask, the scan that sets `i`, several returns, return type and
 * braces are all missing; the visible fragment alone is not meaningful
 * control flow.
 */
ckrm_get_reclaim_bits(unsigned int *flags, unsigned int *extract)
	if (*extract == 0 || *flags == 0) {

	if (*flags & CLS_SHRINK) {
		*extract = CLS_SHRINK;

	for (j = i-1; j > 0; j--) {
		mask = (mask<<1) | 1;

	*extract = (CLS_FLAGS_ALL & ~mask);
/*
 * Called when a class approaches its page limit.  Rate-limits shrink
 * attempts (counter reset every 10 seconds); once a class has been
 * kicked more than 10 times in the window, it is queued on
 * ckrm_shrink_list, marked MEM_NEAR_LIMIT, and (per the visible
 * for_each_zone tail) a reclaim is triggered once.
 * NOTE(review): the `struct zone *zone;` declaration, the shrink_count
 * increment, the body of the for_each_zone loop, return type and braces
 * are missing from this listing.
 */
ckrm_near_limit(ckrm_mem_res_t *cls)
	unsigned long now = jiffies;

	/* Already flagged (or no class): nothing to do. */
	if (!cls || ((cls->flags & MEM_NEAR_LIMIT) == MEM_NEAR_LIMIT)) {

	if ((cls->last_shrink + (10 * HZ)) < now) { // 10 seconds since last ?
		cls->last_shrink = now;
		cls->shrink_count = 0;

	if (cls->shrink_count > 10) {

	spin_lock(&ckrm_mem_lock);
	list_add(&cls->shrink_list, &ckrm_shrink_list);
	spin_unlock(&ckrm_mem_lock);
	cls->flags |= MEM_NEAR_LIMIT;
	for_each_zone(zone) {
		break; // only once is enough
/*
 * Re-evaluate which class an anonymous page should be charged to: walk
 * every VMA on the page's anon_vma (under anon_vma->lock), find the
 * owning mm whose memclass compares highest, and re-charge the page to
 * that class if it differs from the current one.
 * NOTE(review): the `mm = vma->vm_mm;` assignment inside the loop,
 * return type/value and braces are missing from this listing.
 */
ckrm_mem_evaluate_page_anon(struct page* page)
	ckrm_mem_res_t* pgcls = page_class(page);
	ckrm_mem_res_t* maxshareclass = NULL;
	/* For anon pages, page->mapping holds the anon_vma pointer. */
	struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
	struct vm_area_struct *vma;
	struct mm_struct* mm;

	spin_lock(&anon_vma->lock);
	BUG_ON(list_empty(&anon_vma->head));
	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
		if (!maxshareclass ||
				ckrm_mem_share_compare(maxshareclass, mm->memclass) < 0) {
			maxshareclass = mm->memclass;
	spin_unlock(&anon_vma->lock);

	if (maxshareclass && (pgcls != maxshareclass)) {
		ckrm_change_page_class(page, maxshareclass);
/*
 * Re-evaluate which class a file-backed page should be charged to: walk
 * the mapping's i_mmap prio tree for VMAs covering this page offset and
 * pick the highest-comparing memclass among the mapping mms.  Uses
 * spin_trylock on i_mmap_lock and bails out if contended.
 * NOTE(review): the NULL-mapping guard, the `mm = vma->vm_mm;` assignment
 * inside the loop, return type/value and braces are missing from this
 * listing.
 */
ckrm_mem_evaluate_page_file(struct page* page)
	ckrm_mem_res_t* pgcls = page_class(page);
	ckrm_mem_res_t* maxshareclass = NULL;
	struct address_space *mapping = page->mapping;
	struct vm_area_struct *vma = NULL;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct prio_tree_iter iter;
	struct mm_struct* mm;

	/* Best-effort: skip this page rather than spin on the mapping lock. */
	if (!spin_trylock(&mapping->i_mmap_lock))

	while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
					&iter, pgoff, pgoff)) != NULL) {
		if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,mm->memclass)<0)
			maxshareclass = mm->memclass;
	spin_unlock(&mapping->i_mmap_lock);

	if (maxshareclass && pgcls != maxshareclass) {
		ckrm_change_page_class(page, maxshareclass);
/*
 * Dispatch page re-evaluation to the anon or file variant (presumably
 * based on PageAnon()/mapping - the selecting condition is missing from
 * this listing, along with declarations, return type and braces).
 */
ckrm_mem_evaluate_page(struct page* page)
	changed = ckrm_mem_evaluate_page_anon(page);
	changed = ckrm_mem_evaluate_page_file(page);
/*
 * Walk the PTEs under one pmd for [address, end) and re-evaluate the
 * class of every present page.
 * NOTE(review): the `pte_t *pte;` declaration, the do/while loop opener,
 * the pte increment and pte_unmap, return type and braces are missing
 * from this listing.
 */
class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma,
		pmd_t* pmdir, unsigned long address, unsigned long end)
	unsigned long pmd_end;

	if (pmd_none(*pmdir))
	BUG_ON(pmd_bad(*pmdir));

	pte = pte_offset_map(pmdir,address);
	pmd_end = (address+PMD_SIZE)&PMD_MASK;

	if (pte_present(*pte)) {
		ckrm_mem_evaluate_page(pte_page(*pte));
	address += PAGE_SIZE;
	} while(address && (address<end));
/*
 * Walk the pmds under one pgd for [address, end), clamping end to the
 * pgd boundary, and descend into class_migrate_pmd() for each.
 * NOTE(review): the `pmd_t *pmd;` declaration, the do/while loop opener,
 * the `end = pgd_end;` clamp body, the pmd increment, return type and
 * braces are missing from this listing.
 */
class_migrate_pgd(struct mm_struct* mm, struct vm_area_struct* vma,
		pgd_t* pgdir, unsigned long address, unsigned long end)
	unsigned long pgd_end;

	if (pgd_none(*pgdir))
	BUG_ON(pgd_bad(*pgdir));

	pmd = pmd_offset(pgdir,address);
	pgd_end = (address+PGDIR_SIZE)&PGDIR_MASK;

	if (pgd_end && (end>pgd_end))

	class_migrate_pmd(mm,vma,pmd,address,end);
	address = (address+PMD_SIZE)&PMD_MASK;
	} while (address && (address<end));
/*
 * Walk one VMA pgd-by-pgd, re-evaluating the class of every mapped page
 * via class_migrate_pgd().
 * NOTE(review): the `pgd_t *pgdir;` declaration, `end = vma->vm_end;`,
 * the do/while opener, the pgdir increment, return type and braces are
 * missing from this listing.
 */
class_migrate_vma(struct mm_struct* mm, struct vm_area_struct* vma)
	unsigned long address, end;

	address = vma->vm_start;

	pgdir = pgd_offset(vma->vm_mm, address);
	class_migrate_pgd(mm,vma,pgdir,address,end);
	address = (address + PGDIR_SIZE) & PGDIR_MASK;
	} while(address && (address<end));
/* this function is called with mm->peertask_lock held */
/*
 * Decide which memory class an mm should be charged to: the
 * highest-comparing class among all tasks sharing the mm.  If that
 * differs from the current mm->memclass, swap the references
 * (get new / put old) and migrate every already-mapped page by walking
 * all VMAs under mmap_sem (read).
 * NOTE(review): the `maxshareclass = cls;` assignment in the first loop,
 * the VMA iteration opener for the migrate loop, return type and braces
 * are missing from this listing.
 */
ckrm_mem_evaluate_mm(struct mm_struct* mm)
	struct task_struct *task;
	struct ckrm_mem_res *maxshareclass = NULL;
	struct vm_area_struct *vma;

	if (list_empty(&mm->tasklist)) {
		/* We leave the mm->memclass untouched since we believe that one
		 * mm with no task associated will be deleted soon or attach
		 * with another task later.
		 */

	list_for_each_entry(task, &mm->tasklist, mm_peers) {
		ckrm_mem_res_t* cls = GET_MEM_CLASS(task);
		if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,cls)<0 )

	if (mm->memclass != (void *)maxshareclass) {
		/* Reference swap: take the new class before dropping the old. */
		mem_class_get(maxshareclass);
		mem_class_put(mm->memclass);
		mm->memclass = maxshareclass;

		/* Go through all VMA to migrate pages */
		down_read(&mm->mmap_sem);
		class_migrate_vma(mm, vma);
		up_read(&mm->mmap_sem);
/*
 * Charge a newly-added page: prefer the mm's memclass, falling back to
 * the current task's class.  Unmapped pages are re-charged outright;
 * already-mapped pages only move to a higher-comparing class.
 * NOTE(review): returns after each branch, return type and braces are
 * missing from this listing.
 */
ckrm_mem_evaluate_page_byadd(struct page* page, struct mm_struct* mm)
	ckrm_mem_res_t *pgcls = page_class(page);
	ckrm_mem_res_t *chgcls = mm->memclass ? mm->memclass : GET_MEM_CLASS(current);

	if (!chgcls || pgcls == chgcls)

	if (!page->mapcount) {
		ckrm_change_page_class(page, chgcls);

	if (ckrm_mem_share_compare(pgcls, chgcls) < 0) {
		ckrm_change_page_class(page, chgcls);
/*
 * Attach a task to an mm's peer list (used when an mm is first associated
 * with a task) and re-evaluate the mm's memory class if the task's class
 * differs.  Complains and recovers if the task is unexpectedly already on
 * a peer list.  All under mm->peertask_lock.
 * NOTE(review): return type, braces and return value are missing from
 * this listing.
 */
ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task)
	spin_lock(&mm->peertask_lock);
	if (!list_empty(&task->mm_peers)) {
		printk(KERN_ERR "CKRM_MEM: Task list should be empty, but is not!!\n");
		list_del_init(&task->mm_peers);
	list_add_tail(&task->mm_peers, &mm->tasklist);
	if (mm->memclass != GET_MEM_CLASS(task))
		ckrm_mem_evaluate_mm(mm);
	spin_unlock(&mm->peertask_lock);
814 MODULE_LICENSE("GPL");