/* ckrm_mem.c - Memory Resource Manager for CKRM
 *
 * Copyright (C) Chandra Seetharaman, IBM Corp. 2004
 *
 * Provides a Memory Resource controller for CKRM
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/* Code Description: TBD
 *
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <asm/errno.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/cache.h>
#include <linux/percpu.h>
#include <linux/pagevec.h>

#include <linux/ckrm_mem_inline.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>

#define MEM_NAME "mem"

#define CKRM_MEM_MAX_HIERARCHY 2 // maximum hierarchy depth of 2, i.e. levels 0, 1 and 2

/* all 1-level memory_share_class are chained together */
static LIST_HEAD(ckrm_memclass_list);
LIST_HEAD(ckrm_shrink_list);
EXPORT_SYMBOL(ckrm_shrink_list);
spinlock_t ckrm_mem_lock = SPIN_LOCK_UNLOCKED; // protects both lists above
EXPORT_SYMBOL(ckrm_mem_lock);
unsigned int ckrm_tot_lru_pages; // total # of pages in the system;
                                 // currently doesn't handle memory add/remove
EXPORT_SYMBOL(ckrm_tot_lru_pages);

static ckrm_mem_res_t *ckrm_mem_root_class;
atomic_t ckrm_mem_real_count = ATOMIC_INIT(0);
EXPORT_SYMBOL(ckrm_mem_real_count);
static void ckrm_mem_evaluate_all_pages(void);

/* Initialize rescls values
 * May be called on each rcfs unmount or as part of error recovery
 * to make share values sane.
 * Does not traverse hierarchy reinitializing children.
 */

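/*
 * Count the pages the controller manages: active + inactive + free
 * pages in every zone (2.6-era zone counters; memory hotplug is not
 * tracked, as noted above).
 */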
static void
set_ckrm_tot_pages(void)
{
        struct zone *zone;
        int tot_lru_pages = 0;

        for_each_zone(zone) {
                tot_lru_pages += zone->nr_active;
                tot_lru_pages += zone->nr_inactive;
                tot_lru_pages += zone->free_pages;
        }
        ckrm_tot_lru_pages = tot_lru_pages;
}

static void
mem_res_initcls_one(void *my_res)
{
        ckrm_mem_res_t *res = my_res;

        memset(res, 0, sizeof(ckrm_mem_res_t));

        res->shares.my_guarantee     = CKRM_SHARE_DONTCARE;
        res->shares.my_limit         = CKRM_SHARE_DONTCARE;
        res->shares.total_guarantee  = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
        res->shares.max_limit        = CKRM_SHARE_DFLT_MAX_LIMIT;
        res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
        res->shares.cur_max_limit    = 0;

        res->pg_guar = CKRM_SHARE_DONTCARE;
        res->pg_limit = CKRM_SHARE_DONTCARE;
        res->pg_unused = 0;
}

static void *
mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
{
        ckrm_mem_res_t *res, *parres;

        if (mem_rcbs.resid == -1) {
                return NULL;
        }

        parres = ckrm_get_res_class(parent, mem_rcbs.resid, ckrm_mem_res_t);
        if (parres && (parres->hier == CKRM_MEM_MAX_HIERARCHY)) {
                // allow only up to CKRM_MEM_MAX_HIERARCHY levels
                return NULL;
        }

        if (unlikely((parent == NULL) && (ckrm_mem_root_class != NULL))) {
                printk(KERN_ERR "MEM_RC: Only one root class is allowed\n");
                return NULL;
        }

        if (unlikely((parent != NULL) && (ckrm_mem_root_class == NULL))) {
                printk(KERN_ERR "MEM_RC: creating child class without root class\n");
                return NULL;
        }

        res = kmalloc(sizeof(ckrm_mem_res_t), GFP_ATOMIC);

        if (res) {
                mem_res_initcls_one(res);
                res->core = core;
                res->parent = parent;
                spin_lock(&ckrm_mem_lock);
                list_add(&res->mcls_list, &ckrm_memclass_list);
                spin_unlock(&ckrm_mem_lock);
                if (parent == NULL) {
                        // I am part of the root class. So, set the max to
                        // number of pages available
                        res->pg_guar = ckrm_tot_lru_pages;
                        res->pg_unused = ckrm_tot_lru_pages;
                        res->pg_limit = ckrm_tot_lru_pages;
                        res->hier = 0;
                        ckrm_mem_root_class = res;
                } else {
                        res->hier = parres->hier + 1;
                }
                mem_class_get(res);
        } else
                printk(KERN_ERR "mem_res_alloc: failed GFP_ATOMIC alloc\n");
        return res;
}

/*
 * It is the caller's responsibility to make sure that the parent only
 * has children that are to be accounted, i.e. if a new child is added,
 * this function should be called after it has been added, and if a
 * child is deleted, this should be called after the child is removed.
 */
static void
child_maxlimit_changed_local(ckrm_mem_res_t *parres)
{
        int maxlimit = 0;
        ckrm_mem_res_t *childres;
        ckrm_core_class_t *child = NULL;

        // run through parent's children and get the new max_limit of the parent
        ckrm_lock_hier(parres->core);
        while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
                childres = ckrm_get_res_class(child, mem_rcbs.resid,
                                ckrm_mem_res_t);
                if (maxlimit < childres->shares.my_limit) {
                        maxlimit = childres->shares.my_limit;
                }
        }
        ckrm_unlock_hier(parres->core);
        parres->shares.cur_max_limit = maxlimit;
}

static void
mem_res_free(void *my_res)
{
        ckrm_mem_res_t *res = my_res;
        ckrm_mem_res_t *parres;

        if (!res)
                return;

        res->shares.my_guarantee = 0;
        res->shares.my_limit = 0;
        res->pg_guar = 0;
        res->pg_limit = 0;
        res->pg_unused = 0;

        parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);
        // return child's limit/guarantee to parent node
        if (parres) {
                child_guarantee_changed(&parres->shares, res->shares.my_guarantee, 0);
                child_maxlimit_changed_local(parres);
        }
        ckrm_mem_evaluate_all_pages();
        res->core = NULL;

        spin_lock(&ckrm_mem_lock);
        list_del(&res->mcls_list);
        spin_unlock(&ckrm_mem_lock);
        mem_class_put(res);
        return;
}

/*
 * Recalculate the guarantee and limit in # of pages... and propagate the
 * same to children.
 * Caller is responsible for protecting res and for the integrity of parres.
 */
static void
recalc_and_propagate(ckrm_mem_res_t *res, ckrm_mem_res_t *parres)
{
        ckrm_core_class_t *child = NULL;
        ckrm_mem_res_t *childres;
        int resid = mem_rcbs.resid;
        struct ckrm_shares *self = &res->shares;

        if (parres) {
                struct ckrm_shares *par = &parres->shares;

                // calculate pg_guar and pg_limit
                //
                if (parres->pg_guar == CKRM_SHARE_DONTCARE ||
                                self->my_guarantee == CKRM_SHARE_DONTCARE) {
                        res->pg_guar = CKRM_SHARE_DONTCARE;
                } else if (par->total_guarantee) {
                        u64 temp = (u64) self->my_guarantee * parres->pg_guar;
                        do_div(temp, par->total_guarantee);
                        res->pg_guar = (int) temp;
                } else {
                        res->pg_guar = 0;
                }

                if (parres->pg_limit == CKRM_SHARE_DONTCARE ||
                                self->my_limit == CKRM_SHARE_DONTCARE) {
                        res->pg_limit = CKRM_SHARE_DONTCARE;
                } else if (par->max_limit) {
                        u64 temp = (u64) self->my_limit * parres->pg_limit;
                        do_div(temp, par->max_limit);
                        res->pg_limit = (int) temp;
                } else {
                        res->pg_limit = 0;
                }
        }
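
        /*
         * Example of the proportional math above (hypothetical numbers):
         * with parres->pg_guar = 1000 pages, par->total_guarantee = 100
         * and self->my_guarantee = 30, this class gets
         * pg_guar = 30 * 1000 / 100 = 300 pages; pg_limit scales the
         * same way against par->max_limit.
         */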
        // Calculate unused units
        if (res->pg_guar == CKRM_SHARE_DONTCARE) {
                res->pg_unused = CKRM_SHARE_DONTCARE;
        } else if (self->total_guarantee) {
                u64 temp = (u64) self->unused_guarantee * res->pg_guar;
                do_div(temp, self->total_guarantee);
                res->pg_unused = (int) temp;
        } else {
                res->pg_unused = 0;
        }

        // propagate to children
        ckrm_lock_hier(res->core);
        while ((child = ckrm_get_next_child(res->core, child)) != NULL) {
                childres = ckrm_get_res_class(child, resid, ckrm_mem_res_t);
                recalc_and_propagate(childres, res);
        }
        ckrm_unlock_hier(res->core);
        return;
}

static int
mem_set_share_values(void *my_res, struct ckrm_shares *shares)
{
        ckrm_mem_res_t *res = my_res;
        ckrm_mem_res_t *parres;
        int rc = -EINVAL;

        if (!res)
                return -EINVAL;

        parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);

        rc = set_shares(shares, &res->shares, parres ? &parres->shares : NULL);

        if ((rc == 0) && (parres != NULL)) {
                child_maxlimit_changed_local(parres);
                recalc_and_propagate(parres, NULL);
        }
        return rc;
}

static int
mem_get_share_values(void *my_res, struct ckrm_shares *shares)
{
        ckrm_mem_res_t *res = my_res;

        if (!res)
                return -EINVAL;
        *shares = res->shares;
        return 0;
}

static int
mem_get_stats(void *my_res, struct seq_file *sfile)
{
        ckrm_mem_res_t *res = my_res;

        if (!res)
                return -EINVAL;

#if 0
        seq_printf(sfile, "tot %6d;gua %6d;lmt %6d;unu %6d;"
                        "lnt %6d;bor %6d;rlt %6d\n", atomic_read(&res->pg_total),
                        res->pg_guar, res->pg_limit, res->pg_unused, res->pg_lent,
                        res->pg_borrowed, atomic_read(&ckrm_mem_real_count));
#endif

        seq_printf(sfile, "----------- Memory Resource stats start -----------\n");
        seq_printf(sfile, "Number of pages used (including pages lent to children):"
                        " %d\n", atomic_read(&res->pg_total));
        seq_printf(sfile, "Number of pages guaranteed: %d\n",
                        res->pg_guar);
        seq_printf(sfile, "Maximum limit of pages: %d\n",
                        res->pg_limit);
        seq_printf(sfile, "Total number of pages available "
                        "(after serving guarantees to children): %d\n",
                        res->pg_unused);
        seq_printf(sfile, "Number of pages lent to children: %d\n",
                        res->pg_lent);
        seq_printf(sfile, "Number of pages borrowed from the parent: %d\n",
                        res->pg_borrowed);
        seq_printf(sfile, "----------- Memory Resource stats end -----------\n");

        return 0;
}

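/*
 * Class-change callback: keep the task on its mm's peer list (or drop
 * it when the task is detaching), then re-evaluate which memclass the
 * mm as a whole should be charged to.
 */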
static void
mem_change_resclass(void *tsk, void *old, void *new)
{
        struct mm_struct *mm;
        struct task_struct *task = tsk, *t1;
        struct ckrm_mem_res *prev_mmcls;

        if (!task->mm || (new == old) || (old == (void *) -1))
                return;

        mm = task->active_mm;
        spin_lock(&mm->peertask_lock);
        prev_mmcls = mm->memclass;

        if (new == NULL) {
                list_del_init(&task->mm_peers);
        } else {
                int found = 0;
                list_for_each_entry(t1, &mm->tasklist, mm_peers) {
                        if (t1 == task) {
                                found++;
                                break;
                        }
                }
                if (!found) {
                        list_del_init(&task->mm_peers);
                        list_add_tail(&task->mm_peers, &mm->tasklist);
                }
        }

        spin_unlock(&mm->peertask_lock);
        ckrm_mem_evaluate_mm(mm);
        /*
        printk("chg_cls: task <%s:%d> mm %p oldmm %s newmm %s o %s n %s\n",
                task->comm, task->pid, mm, prev_mmcls ? prev_mmcls->core->name :
                "NULL", mm->memclass ? mm->memclass->core->name : "NULL",
                old ? ((ckrm_mem_res_t *) old)->core->name : "NULL",
                new ? ((ckrm_mem_res_t *) new)->core->name : "NULL");
        */
        return;
}

// config file is available only at the root level,
// so assuming my_res to be the system level class
static int
mem_set_config(void *my_res, const char *cfgstr)
{
        ckrm_mem_res_t *res = my_res;

        printk(KERN_INFO "%s class of %s is called with config<%s>\n",
                        MEM_NAME, res->core->name, cfgstr);
        return 0;
}

static int
mem_show_config(void *my_res, struct seq_file *sfile)
{
        struct zone *zone;
        ckrm_mem_res_t *res = my_res;
        int active = 0, inactive = 0, fr = 0;

        if (!res)
                return -EINVAL;

        for_each_zone(zone) {
                active += zone->nr_active;
                inactive += zone->nr_inactive;
                fr += zone->free_pages;
        }
        seq_printf(sfile, "res=%s;tot_pages=%d,active=%d,inactive=%d,free=%d\n",
                        MEM_NAME, ckrm_tot_lru_pages, active, inactive, fr);

        return 0;
}

static int
mem_reset_stats(void *my_res)
{
        ckrm_mem_res_t *res = my_res;
        printk(KERN_INFO " memclass of %s called for reset\n", res->core->name);
        return 0;
}

struct ckrm_res_ctlr mem_rcbs = {
        .res_name          = MEM_NAME,
        .res_hdepth        = CKRM_MEM_MAX_HIERARCHY,
        .resid             = -1,
        .res_alloc         = mem_res_alloc,
        .res_free          = mem_res_free,
        .set_share_values  = mem_set_share_values,
        .get_share_values  = mem_get_share_values,
        .get_stats         = mem_get_stats,
        .change_resclass   = mem_change_resclass,
        .show_config       = mem_show_config,
        .set_config        = mem_set_config,
        .reset_stats       = mem_reset_stats,
};

EXPORT_SYMBOL(mem_rcbs);

int __init
init_ckrm_mem_res(void)
{
        struct ckrm_classtype *clstype;
        int resid = mem_rcbs.resid;

        set_ckrm_tot_pages();
        clstype = ckrm_find_classtype_by_name("taskclass");
        if (clstype == NULL) {
                printk(KERN_INFO " Unknown ckrm classtype<taskclass>");
                return -ENOENT;
        }

        if (resid == -1) {
                resid = ckrm_register_res_ctlr(clstype, &mem_rcbs);
                if (resid != -1) {
                        mem_rcbs.classtype = clstype;
                }
        }
        return ((resid < 0) ? resid : 0);
}

void __exit
exit_ckrm_mem_res(void)
{
        ckrm_unregister_res_ctlr(&mem_rcbs);
        mem_rcbs.resid = -1;
}

module_init(init_ckrm_mem_res);
module_exit(exit_ckrm_mem_res);

static void
set_flags_of_children(ckrm_mem_res_t *parres, unsigned int flag)
{
        ckrm_mem_res_t *childres;
        ckrm_core_class_t *child = NULL;

        parres->reclaim_flags |= flag;
        ckrm_lock_hier(parres->core);
        while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
                childres = ckrm_get_res_class(child, mem_rcbs.resid,
                                ckrm_mem_res_t);
                set_flags_of_children(childres, flag);
        }
        ckrm_unlock_hier(parres->core);
        return;
}

// FIXME: more attention is needed to this function
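/*
 * Classify a class's usage into bands relative to its guarantee and
 * the guarantee-to-limit range: the CLS_OVER_* flags mark usage beyond
 * 25%, 50%, 75%, 100% and 110% of that range, so the reclaim logic can
 * target the most over-limit classes first.
 */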
static unsigned int
set_usage_flags(ckrm_mem_res_t *res)
{
        int tot_usage, cls_usage, range, guar;

        if (res->pg_limit == CKRM_SHARE_DONTCARE) {
                // No limit is set for the class; nothing to flag
                res->reclaim_flags = 0;
                return res->reclaim_flags;
        }

        tot_usage = atomic_read(&res->pg_total);
        cls_usage = tot_usage - res->pg_lent;
        guar = (res->pg_guar > 0) ? res->pg_guar : 0;
        range = res->pg_limit - guar;

        if ((tot_usage > (guar + ((110 * range) / 100))) &&
                                (res->pg_lent > (guar + ((25 * range) / 100)))) {
                set_flags_of_children(res, CLS_PARENT_OVER);
        }

        if (cls_usage > (guar + ((110 * range) / 100))) {
                res->reclaim_flags |= CLS_OVER_110;
        } else if (cls_usage > (guar + range)) {
                res->reclaim_flags |= CLS_OVER_100;
        } else if (cls_usage > (guar + ((3 * range) / 4))) {
                res->reclaim_flags |= CLS_OVER_75;
        } else if (cls_usage > (guar + (range / 2))) {
                res->reclaim_flags |= CLS_OVER_50;
        } else if (cls_usage > (guar + (range / 4))) {
                res->reclaim_flags |= CLS_OVER_25;
        } else if (cls_usage > guar) {
                res->reclaim_flags |= CLS_OVER_GUAR;
        } else {
                res->reclaim_flags = 0;
        }
        return res->reclaim_flags;
}

/*
 * The functions ckrm_setup_reclamation(), ckrm_teardown_reclamation(),
 * ckrm_get_reclaim_bits() and the macro ckrm_kick_page(), along with the
 * CLS_* macros, define how pages are reclaimed.
 * Keeping this logic behind these interfaces eliminates the need to
 * change the reclamation code in the VM when we want to change the logic.
 */
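/*
 * Sketch of the expected call sequence from the VM reclaim path (the
 * actual hooks live in the reclaim code, not in this file):
 *
 *	unsigned int flags, extract;
 *
 *	flags = ckrm_setup_reclamation();
 *	while (flags) {
 *		ckrm_get_reclaim_bits(&flags, &extract);
 *		// shrink pages of classes whose reclaim_flags
 *		// intersect 'extract', worst offenders first
 *	}
 *	ckrm_teardown_reclamation();
 */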
unsigned int
ckrm_setup_reclamation(void)
{
        ckrm_mem_res_t *res;
        unsigned int ret = 0;

        spin_lock(&ckrm_mem_lock);
        set_ckrm_tot_pages();
        ckrm_mem_root_class->pg_guar = ckrm_tot_lru_pages;
        ckrm_mem_root_class->pg_unused = ckrm_tot_lru_pages;
        ckrm_mem_root_class->pg_limit = ckrm_tot_lru_pages;
        recalc_and_propagate(ckrm_mem_root_class, NULL);
        list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
                ret |= set_usage_flags(res);
        }
        spin_unlock(&ckrm_mem_lock);
        return ret;
}

void
ckrm_teardown_reclamation(void)
{
        ckrm_mem_res_t *res;
        spin_lock(&ckrm_mem_lock);
        list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
                res->reclaim_flags = 0;
        }
        spin_unlock(&ckrm_mem_lock);
}

void
ckrm_get_reclaim_bits(unsigned int *flags, unsigned int *extract)
{
        int i, j, mask = 0;

        if (*flags == 0) {
                *extract = 0;
                return;
        }

        if (*flags & CLS_SHRINK) {
                *extract = CLS_SHRINK;
                *flags = 0;
                return;
        }

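        /*
         * Build a mask of all bits below the highest set flag; *extract
         * then carries the highest-severity flag plus any higher CLS_
         * bits, and those bits are cleared from *flags, so repeated
         * calls walk down the severity ladder.
         */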
        i = fls(*flags);
        for (j = i - 1; j > 0; j--) {
                mask = (mask << 1) | 1;
        }
        *extract = (CLS_FLAGS_ALL & ~mask);
        *flags &= ~*extract;
        return;
}

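/*
 * Called when a class crosses its page limit: rate-limited to at most
 * 10 shrink requests per 10-second window, the class is queued on
 * ckrm_shrink_list and kswapd is woken to start reclaim.
 */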
void
ckrm_at_limit(ckrm_mem_res_t *cls)
{
#ifndef AT_LIMIT_SUPPORT
#warning "ckrm_at_limit disabled due to problems with memory hog tests"
#else
        struct zone *zone;
        unsigned long now = jiffies;

        if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) ||
                        ((cls->flags & MEM_AT_LIMIT) == MEM_AT_LIMIT)) {
                return;
        }
        if ((cls->last_shrink + (10 * HZ)) < now) {
                // more than 10 seconds since the last shrink; reset the window
                cls->last_shrink = now;
                cls->shrink_count = 0;
        }
        cls->shrink_count++;
        if (cls->shrink_count > 10) {
                return;
        }
        spin_lock(&ckrm_mem_lock);
        list_add(&cls->shrink_list, &ckrm_shrink_list);
        spin_unlock(&ckrm_mem_lock);
        cls->flags |= MEM_AT_LIMIT;
        for_each_zone(zone) {
                wakeup_kswapd(zone);
                break; // waking one zone's kswapd is enough
        }
#endif // AT_LIMIT_SUPPORT
}

static int unmapped = 0, changed = 0, unchanged = 0, maxnull = 0,
           anovma = 0, fnovma = 0;
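
/*
 * Reassign an anonymous page to the mapper with the largest share: walk
 * every VMA on the page's anon_vma list and charge the page to the
 * memclass that compares highest.
 */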
static void
ckrm_mem_evaluate_page_anon(struct page *page)
{
        ckrm_mem_res_t *pgcls = page_class(page);
        ckrm_mem_res_t *maxshareclass = NULL;
        struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
        struct vm_area_struct *vma;
        struct mm_struct *mm;
        int v = 0;

        spin_lock(&anon_vma->lock);
        BUG_ON(list_empty(&anon_vma->head));
        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
                v++;
                mm = vma->vm_mm;
                if (!maxshareclass ||
                                ckrm_mem_share_compare(maxshareclass, mm->memclass) < 0) {
                        maxshareclass = mm->memclass;
                }
        }
        spin_unlock(&anon_vma->lock);
        if (!v)
                anovma++;

        if (!maxshareclass)
                maxnull++;
        if (maxshareclass && (pgcls != maxshareclass)) {
                ckrm_change_page_class(page, maxshareclass);
                changed++;
        } else
                unchanged++;
        return;
}

static void
ckrm_mem_evaluate_page_file(struct page *page)
{
        ckrm_mem_res_t *pgcls = page_class(page);
        ckrm_mem_res_t *maxshareclass = NULL;
        struct address_space *mapping = page->mapping;
        struct vm_area_struct *vma = NULL;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct prio_tree_iter iter;
        struct mm_struct *mm;
        int v = 0;

        if (!mapping)
                return;

        if (!spin_trylock(&mapping->i_mmap_lock))
                return;

        while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
                                        &iter, pgoff, pgoff)) != NULL) {
                v++;
                mm = vma->vm_mm;
                if (!maxshareclass ||
                                ckrm_mem_share_compare(maxshareclass, mm->memclass) < 0)
                        maxshareclass = mm->memclass;
        }
        spin_unlock(&mapping->i_mmap_lock);

        if (!v)
                fnovma++;
        if (!maxshareclass)
                maxnull++;

        if (maxshareclass && pgcls != maxshareclass) {
                ckrm_change_page_class(page, maxshareclass);
                changed++;
        } else
                unchanged++;
        return;
}

static void
ckrm_mem_evaluate_page(struct page *page)
{
        if (page->mapping) {
                if (PageAnon(page))
                        ckrm_mem_evaluate_page_anon(page);
                else
                        ckrm_mem_evaluate_page_file(page);
        } else
                unmapped++;
        return;
}

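/*
 * Debug/consistency pass: re-evaluate the class of every page on every
 * zone's LRU lists and compare each class's running per-zone counters
 * against a freshly computed count (tmp_cnt).
 */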
static void
ckrm_mem_evaluate_all_pages(void)
{
        struct page *page;
        struct zone *zone;
        int active = 0, inactive = 0, cleared = 0;
        int act_cnt, inact_cnt, idx;
        ckrm_mem_res_t *res;

        spin_lock(&ckrm_mem_lock);
        list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
                res->tmp_cnt = 0;
        }
        spin_unlock(&ckrm_mem_lock);

        for_each_zone(zone) {
                spin_lock_irq(&zone->lru_lock);
                list_for_each_entry(page, &zone->inactive_list, lru) {
                        ckrm_mem_evaluate_page(page);
                        inactive++;
                        page_class(page)->tmp_cnt++;
                        if (!test_bit(PG_ckrm_account, &page->flags))
                                cleared++;
                }
                list_for_each_entry(page, &zone->active_list, lru) {
                        ckrm_mem_evaluate_page(page);
                        active++;
                        page_class(page)->tmp_cnt++;
                        if (!test_bit(PG_ckrm_account, &page->flags))
                                cleared++;
                }
                spin_unlock_irq(&zone->lru_lock);
        }
        printk("all_pages: active %d inactive %d cleared %d\n",
                        active, inactive, cleared);
        spin_lock(&ckrm_mem_lock);
        list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
                act_cnt = 0; inact_cnt = 0; idx = 0;
                for_each_zone(zone) {
                        act_cnt += res->nr_active[idx];
                        inact_cnt += res->nr_inactive[idx];
                        idx++;
                }
                printk("all_pages: %s: tmp_cnt %d; act_cnt %d inact_cnt %d\n",
                        res->core->name, res->tmp_cnt, act_cnt, inact_cnt);
        }
        spin_unlock(&ckrm_mem_lock);

        // check all mm's in the system to see which memclass they are
        // attached to.
        return;
}

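/*
 * Migrate every resident page of a VMA to mm->memclass by walking the
 * (2.6-era, three-level) page tables: vma -> pgd -> pmd -> pte.
 */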
static /*inline*/ int
class_migrate_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                pmd_t *pmdir, unsigned long address, unsigned long end)
{
        pte_t *pte, *orig_pte;
        unsigned long pmd_end;

        if (pmd_none(*pmdir))
                return 0;
        BUG_ON(pmd_bad(*pmdir));

        orig_pte = pte = pte_offset_map(pmdir, address);
        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;

        do {
                if (pte_present(*pte)) {
                        BUG_ON(mm->memclass == NULL);
                        ckrm_change_page_class(pte_page(*pte), mm->memclass);
                        // ckrm_mem_evaluate_page(pte_page(*pte));
                }
                address += PAGE_SIZE;
                pte++;
        } while (address && (address < end));
        pte_unmap(orig_pte);
        return 0;
}

static /*inline*/ int
class_migrate_pgd(struct mm_struct *mm, struct vm_area_struct *vma,
                pgd_t *pgdir, unsigned long address, unsigned long end)
{
        pmd_t *pmd;
        unsigned long pgd_end;

        if (pgd_none(*pgdir))
                return 0;
        BUG_ON(pgd_bad(*pgdir));

        pmd = pmd_offset(pgdir, address);
        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;

        if (pgd_end && (end > pgd_end))
                end = pgd_end;

        do {
                class_migrate_pmd(mm, vma, pmd, address, end);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address && (address < end));
        return 0;
}

static /*inline*/ int
class_migrate_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
        pgd_t *pgdir;
        unsigned long address, end;

        address = vma->vm_start;
        end = vma->vm_end;

        pgdir = pgd_offset(vma->vm_mm, address);
        do {
                class_migrate_pgd(mm, vma, pgdir, address, end);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        } while (address && (address < end));
        return 0;
}


/* This function is called with mm->peertask_lock held. */
void
ckrm_mem_evaluate_mm(struct mm_struct *mm)
{
        struct task_struct *task;
        struct ckrm_mem_res *maxshareclass = NULL;
        struct vm_area_struct *vma;

        if (list_empty(&mm->tasklist)) {
                /* We leave mm->memclass untouched since we believe that an
                 * mm with no task associated will be deleted soon or attached
                 * to another task later.
                 */
                return;
        }

        list_for_each_entry(task, &mm->tasklist, mm_peers) {
                ckrm_mem_res_t *cls = GET_MEM_CLASS(task);
                if (!cls)
                        continue;
                if (!maxshareclass || ckrm_mem_share_compare(maxshareclass, cls) < 0)
                        maxshareclass = cls;
        }

        if (maxshareclass && (mm->memclass != (void *) maxshareclass)) {
                if (mm->memclass)
                        mem_class_put(mm->memclass);
                mm->memclass = maxshareclass;
                mem_class_get(maxshareclass);

                /* Go through all VMAs to migrate pages */
                down_read(&mm->mmap_sem);
                vma = mm->mmap;
                while (vma) {
                        class_migrate_vma(mm, vma);
                        vma = vma->vm_next;
                }
                up_read(&mm->mmap_sem);
        }
        return;
}

void
ckrm_init_mm_to_task(struct mm_struct *mm, struct task_struct *task)
{
        spin_lock(&mm->peertask_lock);
        if (!list_empty(&task->mm_peers)) {
                printk(KERN_ERR "CKRM_MEM: Task list should be empty, but is not!!\n");
                list_del_init(&task->mm_peers);
        }
        list_add_tail(&task->mm_peers, &mm->tasklist);
        spin_unlock(&mm->peertask_lock);
        if (mm->memclass != GET_MEM_CLASS(task))
                ckrm_mem_evaluate_mm(mm);
        return;
}

int
ckrm_memclass_valid(ckrm_mem_res_t *cls)
{
        ckrm_mem_res_t *tmp;

        spin_lock(&ckrm_mem_lock);
        list_for_each_entry(tmp, &ckrm_memclass_list, mcls_list) {
                if (tmp == cls) {
                        spin_unlock(&ckrm_mem_lock);
                        return 1;
                }
        }
        spin_unlock(&ckrm_mem_lock);
        return 0;
}

MODULE_LICENSE("GPL");