/* ckrm_mem.c - Memory Resource Manager for CKRM
 *
 * Copyright (C) Chandra Seetharaman, IBM Corp. 2004
 *
 * Provides a Memory Resource controller for CKRM
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/* Code Description: TBD
 *
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <asm/errno.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/cache.h>
#include <linux/percpu.h>
#include <linux/pagevec.h>

#include <linux/ckrm_mem_inline.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>

#define MEM_NAME "mem"

#define CKRM_MEM_MAX_HIERARCHY 2	// allows only up to 2 levels: 0, 1 & 2

/* all 1-level memory_share_classes are chained together */
static LIST_HEAD(ckrm_memclass_list);
LIST_HEAD(ckrm_shrink_list);
EXPORT_SYMBOL(ckrm_shrink_list);
spinlock_t ckrm_mem_lock = SPIN_LOCK_UNLOCKED; // protects both lists above
EXPORT_SYMBOL(ckrm_mem_lock);
unsigned int ckrm_tot_lru_pages; // total # of pages in the system
				 // (currently doesn't handle memory add/remove)
EXPORT_SYMBOL(ckrm_tot_lru_pages);

static ckrm_mem_res_t *ckrm_mem_root_class;
atomic_t ckrm_mem_real_count = ATOMIC_INIT(0);
EXPORT_SYMBOL(ckrm_mem_real_count);

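/* Sum the active, inactive and free pages across all zones into
 * ckrm_tot_lru_pages. */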
static void
set_ckrm_tot_pages(void)
{
	struct zone *zone;
	int tot_lru_pages = 0;

	for_each_zone(zone) {
		tot_lru_pages += zone->nr_active;
		tot_lru_pages += zone->nr_inactive;
		tot_lru_pages += zone->free_pages;
	}
	ckrm_tot_lru_pages = tot_lru_pages;
}

/* Initialize rescls values
 * May be called on each rcfs unmount or as part of error recovery
 * to make share values sane.
 * Does not traverse hierarchy reinitializing children.
 */
static void
mem_res_initcls_one(void *my_res)
{
	ckrm_mem_res_t *res = my_res;

	memset(res, 0, sizeof(ckrm_mem_res_t));

	res->shares.my_guarantee     = CKRM_SHARE_DONTCARE;
	res->shares.my_limit         = CKRM_SHARE_DONTCARE;
	res->shares.total_guarantee  = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
	res->shares.max_limit        = CKRM_SHARE_DFLT_MAX_LIMIT;
	res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
	res->shares.cur_max_limit    = 0;

	res->pg_guar = CKRM_SHARE_DONTCARE;
	res->pg_limit = CKRM_SHARE_DONTCARE;
	res->pg_unused = CKRM_SHARE_DONTCARE;
}

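/* Allocate and initialize a memory resource class for 'core'.
 * Enforces the single-root rule and the maximum hierarchy depth. */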
static void *
mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
{
	ckrm_mem_res_t *res, *parres;

	if (mem_rcbs.resid == -1) {
		return NULL;
	}

	parres = ckrm_get_res_class(parent, mem_rcbs.resid, ckrm_mem_res_t);
	if (parres && (parres->hier == CKRM_MEM_MAX_HIERARCHY)) {
		// allow only up to CKRM_MEM_MAX_HIERARCHY levels
		return NULL;
	}

	if (unlikely((parent == NULL) && (ckrm_mem_root_class != NULL))) {
		printk(KERN_ERR "MEM_RC: Only one root class is allowed\n");
		return NULL;
	}

	if (unlikely((parent != NULL) && (ckrm_mem_root_class == NULL))) {
		printk(KERN_ERR "MEM_RC: creating child class without root class\n");
		return NULL;
	}

	res = kmalloc(sizeof(ckrm_mem_res_t), GFP_ATOMIC);

	if (res) {
		mem_res_initcls_one(res);
		res->core = core;
		res->parent = parent;
		spin_lock(&ckrm_mem_lock);
		list_add(&res->mcls_list, &ckrm_memclass_list);
		spin_unlock(&ckrm_mem_lock);
		if (parent == NULL) {
			// I am the root class, so set the max to the
			// number of pages available in the system
			res->pg_guar = ckrm_tot_lru_pages;
			res->pg_unused = ckrm_tot_lru_pages;
			res->pg_limit = ckrm_tot_lru_pages;
			res->hier = 0;
			ckrm_mem_root_class = res;
		} else {
			res->hier = parres->hier + 1;
		}
		mem_class_get(res);
	} else {
		printk(KERN_ERR "mem_res_alloc: failed GFP_ATOMIC alloc\n");
	}
	return res;
}


/*
 * It is the caller's responsibility to make sure that the parent only
 * has children that are to be accounted, i.e., if a new child is added
 * this function should be called after it has been added, and if a
 * child is deleted this should be called after the child is removed.
 */
static void
child_maxlimit_changed_local(ckrm_mem_res_t *parres)
{
	int maxlimit = 0;
	ckrm_mem_res_t *childres;
	ckrm_core_class_t *child = NULL;

	// run through the parent's children and get the new max_limit of the parent
	ckrm_lock_hier(parres->core);
	while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
		childres = ckrm_get_res_class(child, mem_rcbs.resid,
				ckrm_mem_res_t);
		if (maxlimit < childres->shares.my_limit) {
			maxlimit = childres->shares.my_limit;
		}
	}
	ckrm_unlock_hier(parres->core);
	parres->shares.cur_max_limit = maxlimit;
}

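/* Tear down a memory resource class: return its guarantee to the
 * parent, recompute the parent's max limit, and unlink the class from
 * the global class list. */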
static void
mem_res_free(void *my_res)
{
	ckrm_mem_res_t *res = my_res;
	ckrm_mem_res_t *parres;

	if (!res)
		return;

	parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);

	// return child's limit/guarantee to parent node
	if (parres) {
		child_guarantee_changed(&parres->shares, res->shares.my_guarantee, 0);
		child_maxlimit_changed_local(parres);
	}
	res->shares.my_guarantee = 0;
	res->shares.my_limit = 0;
	spin_lock(&ckrm_mem_lock);
	list_del(&res->mcls_list);
	spin_unlock(&ckrm_mem_lock);
	mem_class_put(res);

	return;
}

/*
 * Recalculate the guarantee and limit in # of pages and propagate the
 * same to the children.
 * Caller is responsible for protecting res and for the integrity of parres.
 */
static void
recalc_and_propagate(ckrm_mem_res_t * res, ckrm_mem_res_t * parres)
{
	ckrm_core_class_t *child = NULL;
	ckrm_mem_res_t *childres;
	int resid = mem_rcbs.resid;
	struct ckrm_shares *self = &res->shares;

	if (parres) {
		struct ckrm_shares *par = &parres->shares;

		// calculate pg_guar and pg_limit
		if (parres->pg_guar == CKRM_SHARE_DONTCARE ||
				self->my_guarantee == CKRM_SHARE_DONTCARE) {
			res->pg_guar = CKRM_SHARE_DONTCARE;
		} else if (par->total_guarantee) {
			u64 temp = (u64) self->my_guarantee * parres->pg_guar;
			do_div(temp, par->total_guarantee);
			res->pg_guar = (int) temp;
		} else {
			res->pg_guar = 0;
		}

		if (parres->pg_limit == CKRM_SHARE_DONTCARE ||
				self->my_limit == CKRM_SHARE_DONTCARE) {
			res->pg_limit = CKRM_SHARE_DONTCARE;
		} else if (par->max_limit) {
			u64 temp = (u64) self->my_limit * parres->pg_limit;
			do_div(temp, par->max_limit);
			res->pg_limit = (int) temp;
		} else {
			res->pg_limit = 0;
		}
	}

	// calculate the unused units
	if (res->pg_guar == CKRM_SHARE_DONTCARE) {
		res->pg_unused = CKRM_SHARE_DONTCARE;
	} else if (self->total_guarantee) {
		u64 temp = (u64) self->unused_guarantee * res->pg_guar;
		do_div(temp, self->total_guarantee);
		res->pg_unused = (int) temp;
	} else {
		res->pg_unused = 0;
	}

	// propagate to children
	ckrm_lock_hier(res->core);
	while ((child = ckrm_get_next_child(res->core, child)) != NULL) {
		childres = ckrm_get_res_class(child, resid, ckrm_mem_res_t);
		recalc_and_propagate(childres, res);
	}
	ckrm_unlock_hier(res->core);
	return;
}

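/* Resource controller callback: validate and apply the new share
 * values, then recompute the page guarantees/limits of the affected
 * subtree. */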
static int
mem_set_share_values(void *my_res, struct ckrm_shares *shares)
{
	ckrm_mem_res_t *res = my_res;
	ckrm_mem_res_t *parres;
	int rc;

	if (!res)
		return -EINVAL;

	parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);

	rc = set_shares(shares, &res->shares, parres ? &parres->shares : NULL);

	if ((rc == 0) && (parres != NULL)) {
		child_maxlimit_changed_local(parres);
		recalc_and_propagate(parres, NULL);
	}
	return rc;
}

static int
mem_get_share_values(void *my_res, struct ckrm_shares *shares)
{
	ckrm_mem_res_t *res = my_res;

	if (!res)
		return -EINVAL;
	*shares = res->shares;
	return 0;
}

static int
mem_get_stats(void *my_res, struct seq_file *sfile)
{
	ckrm_mem_res_t *res = my_res;

	if (!res)
		return -EINVAL;

#if 0
	seq_printf(sfile, "tot %6d;gua %6d;lmt %6d;unu %6d;"
			"lnt %6d;bor %6d;rlt %6d\n", atomic_read(&res->pg_total),
			res->pg_guar, res->pg_limit, res->pg_unused, res->pg_lent,
			res->pg_borrowed, atomic_read(&ckrm_mem_real_count));
#endif

	seq_printf(sfile, "----------- Memory Resource stats start -----------\n");
	seq_printf(sfile, "Number of pages used (including pages lent to children):"
			" %d\n", atomic_read(&res->pg_total));
	seq_printf(sfile, "Number of pages guaranteed: %d\n",
			res->pg_guar);
	seq_printf(sfile, "Maximum limit of pages: %d\n",
			res->pg_limit);
	seq_printf(sfile, "Total number of pages available "
			"(after serving guarantees to children): %d\n",
			res->pg_unused);
	seq_printf(sfile, "Number of pages lent to children: %d\n",
			res->pg_lent);
	seq_printf(sfile, "Number of pages borrowed from the parent: %d\n",
			res->pg_borrowed);
	seq_printf(sfile, "----------- Memory Resource stats end -----------\n");

	return 0;
}

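/* Resource controller callback invoked when a task changes class:
 * keep the task on its mm's peer list and re-evaluate which class the
 * mm (and hence its pages) should be charged to. */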
static void
mem_change_resclass(void *tsk, void *old, void *new)
{
	struct mm_struct *mm;
	struct task_struct *task = tsk, *t1;
	struct ckrm_mem_res *prev_mmcls;

	if (!task->mm || (new == old) || (old == (void *) -1))
		return;

	mm = task->active_mm;
	spin_lock(&mm->peertask_lock);
	prev_mmcls = mm->memclass;

	if (new == NULL) {
		list_del_init(&task->mm_peers);
	} else {
		int found = 0;
		list_for_each_entry(t1, &mm->tasklist, mm_peers) {
			if (t1 == task) {
				found++;
				break;
			}
		}
		if (!found) {
			list_del_init(&task->mm_peers);
			list_add_tail(&task->mm_peers, &mm->tasklist);
		}
	}

	ckrm_mem_evaluate_mm(mm);
	spin_unlock(&mm->peertask_lock);
	return;
}

// the config file is available only at the root level,
// so my_res is assumed to be the system-level class
static int
mem_set_config(void *my_res, const char *cfgstr)
{
	ckrm_mem_res_t *res = my_res;

	printk(KERN_INFO "%s class of %s is called with config<%s>\n",
			MEM_NAME, res->core->name, cfgstr);
	return 0;
}

static int
mem_show_config(void *my_res, struct seq_file *sfile)
{
	struct zone *zone;
	ckrm_mem_res_t *res = my_res;
	int active = 0, inactive = 0, fr = 0;

	if (!res)
		return -EINVAL;

	for_each_zone(zone) {
		active += zone->nr_active;
		inactive += zone->nr_inactive;
		fr += zone->free_pages;
	}
	seq_printf(sfile, "res=%s;tot_pages=%d,active=%d,inactive=%d,free=%d\n",
			MEM_NAME, ckrm_tot_lru_pages, active, inactive, fr);

	return 0;
}

static int
mem_reset_stats(void *my_res)
{
	ckrm_mem_res_t *res = my_res;

	printk(KERN_INFO "memclass of %s called for reset\n", res->core->name);
	return 0;
}

struct ckrm_res_ctlr mem_rcbs = {
	.res_name          = MEM_NAME,
	.res_hdepth        = CKRM_MEM_MAX_HIERARCHY,
	.resid             = -1,
	.res_alloc         = mem_res_alloc,
	.res_free          = mem_res_free,
	.set_share_values  = mem_set_share_values,
	.get_share_values  = mem_get_share_values,
	.get_stats         = mem_get_stats,
	.change_resclass   = mem_change_resclass,
	.show_config       = mem_show_config,
	.set_config        = mem_set_config,
	.reset_stats       = mem_reset_stats,
};

EXPORT_SYMBOL(mem_rcbs);

int __init
init_ckrm_mem_res(void)
{
	struct ckrm_classtype *clstype;
	int resid = mem_rcbs.resid;

	set_ckrm_tot_pages();
	clstype = ckrm_find_classtype_by_name("taskclass");
	if (clstype == NULL) {
		printk(KERN_INFO "Unknown ckrm classtype<taskclass>\n");
		return -ENOENT;
	}

	if (resid == -1) {
		resid = ckrm_register_res_ctlr(clstype, &mem_rcbs);
		if (resid != -1) {
			mem_rcbs.classtype = clstype;
		}
	}
	return ((resid < 0) ? resid : 0);
}

void __exit
exit_ckrm_mem_res(void)
{
	ckrm_unregister_res_ctlr(&mem_rcbs);
	mem_rcbs.resid = -1;
}

module_init(init_ckrm_mem_res);
module_exit(exit_ckrm_mem_res);

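/* Recursively OR 'flag' into the reclaim_flags of parres and all of
 * its descendants. */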
static void
set_flags_of_children(ckrm_mem_res_t *parres, unsigned int flag)
{
	ckrm_mem_res_t *childres;
	ckrm_core_class_t *child = NULL;

	parres->reclaim_flags |= flag;
	ckrm_lock_hier(parres->core);
	while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
		childres = ckrm_get_res_class(child, mem_rcbs.resid,
				ckrm_mem_res_t);
		set_flags_of_children(childres, flag);
	}
	ckrm_unlock_hier(parres->core);
	return;
}

// FIXME: this function needs more attention
static unsigned int
set_usage_flags(ckrm_mem_res_t *res)
{
	int tot_usage, cls_usage, range, guar;

	if (res->pg_limit == CKRM_SHARE_DONTCARE) {
		// No limit is set for the class; don't bother it
		res->reclaim_flags = 0;
		return res->reclaim_flags;
	}

	tot_usage = atomic_read(&res->pg_total);
	cls_usage = tot_usage - res->pg_lent;
	guar = (res->pg_guar > 0) ? res->pg_guar : 0;
	range = res->pg_limit - guar;

	if ((tot_usage > (guar + ((120 * range) / 100))) &&
				(res->pg_lent > (guar + ((25 * range) / 100)))) {
		set_flags_of_children(res, CLS_PARENT_OVER);
	}

	if (cls_usage > (guar + ((110 * range) / 100))) {
		res->reclaim_flags |= CLS_OVER_110;
	} else if (cls_usage > (guar + range)) {
		res->reclaim_flags |= CLS_OVER_100;
	} else if (cls_usage > (guar + ((3 * range) / 4))) {
		res->reclaim_flags |= CLS_OVER_75;
	} else if (cls_usage > guar) {
		res->reclaim_flags |= CLS_OVER_GUAR;
	} else {
		res->reclaim_flags = 0;
	}
	return res->reclaim_flags;
}

/*
 * The functions ckrm_setup_reclamation(), ckrm_teardown_reclamation(),
 * ckrm_get_reclaim_bits() and the macro ckrm_kick_page(), along with the
 * CLS_* macros, define how pages are reclaimed.
 * Keeping this logic behind these interfaces eliminates the need to
 * change the reclamation code in the VM when we want to change the logic.
 */
unsigned int
ckrm_setup_reclamation(void)
{
	ckrm_mem_res_t *res;
	unsigned int ret = 0;

	spin_lock(&ckrm_mem_lock);
	set_ckrm_tot_pages();
	ckrm_mem_root_class->pg_guar = ckrm_tot_lru_pages;
	ckrm_mem_root_class->pg_unused = ckrm_tot_lru_pages;
	ckrm_mem_root_class->pg_limit = ckrm_tot_lru_pages;
	recalc_and_propagate(ckrm_mem_root_class, NULL);
	list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
		ret |= set_usage_flags(res);
	}
	spin_unlock(&ckrm_mem_lock);
	return ret;
}

void
ckrm_teardown_reclamation(void)
{
	ckrm_mem_res_t *res;

	spin_lock(&ckrm_mem_lock);
	list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
		res->reclaim_flags = 0;
	}
	spin_unlock(&ckrm_mem_lock);
}

void
ckrm_get_reclaim_bits(unsigned int *flags, unsigned int *extract)
{
	int i, j, mask = 0;

	if (*extract == 0 || *flags == 0) {
		return;
	}
	if (*flags & CLS_SHRINK) {
		*extract = CLS_SHRINK;
		*flags = 0;
		return;
	}

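	// extract the highest-priority flag in *flags: build a mask of all
	// bits below its most significant set bit, take every defined flag
	// bit at or above that position, and clear those bits from *flags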
	i = fls(*flags);
	for (j = i-1; j > 0; j--) {
		mask = (mask<<1) | 1;
	}
	*extract = (CLS_FLAGS_ALL & ~mask);
	*flags &= ~*extract;
	return;
}

void
ckrm_near_limit(ckrm_mem_res_t *cls)
{
	struct zone *zone;
	unsigned long now = jiffies;

	if (!cls || ((cls->flags & MEM_NEAR_LIMIT) == MEM_NEAR_LIMIT)) {
		return;
	}
	if ((cls->last_shrink + (10 * HZ)) < now) { // 10 seconds since the last shrink?
		cls->last_shrink = now;
		cls->shrink_count = 0;
	}
	cls->shrink_count++;
	if (cls->shrink_count > 10) {
		return;
	}
	spin_lock(&ckrm_mem_lock);
	list_add(&cls->shrink_list, &ckrm_shrink_list);
	spin_unlock(&ckrm_mem_lock);
	cls->flags |= MEM_NEAR_LIMIT;
	for_each_zone(zone) {
		wakeup_kswapd(zone);
		break; // waking up one kswapd is enough
	}
}

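/* Reassign an anonymous page to the class with the largest share
 * among all the mm's that map it. Returns 1 if the page's class was
 * changed. */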
static int
ckrm_mem_evaluate_page_anon(struct page* page)
{
	ckrm_mem_res_t* pgcls = page_class(page);
	ckrm_mem_res_t* maxshareclass = NULL;
	struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
	struct vm_area_struct *vma;
	struct mm_struct* mm;

	spin_lock(&anon_vma->lock);
	BUG_ON(list_empty(&anon_vma->head));
	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
		mm = vma->vm_mm;
		if (!maxshareclass ||
				ckrm_mem_share_compare(maxshareclass, mm->memclass) < 0) {
			maxshareclass = mm->memclass;
		}
	}
	spin_unlock(&anon_vma->lock);

	if (maxshareclass && (pgcls != maxshareclass)) {
		ckrm_change_page_class(page, maxshareclass);
		return 1;
	}
	return 0;
}

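/* Same for a file-backed page: walk the mapping's i_mmap prio tree to
 * find the mapping mm with the largest share; bail out if the
 * i_mmap_lock is contended. */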
static int
ckrm_mem_evaluate_page_file(struct page* page)
{
	ckrm_mem_res_t* pgcls = page_class(page);
	ckrm_mem_res_t* maxshareclass = NULL;
	struct address_space *mapping = page->mapping;
	struct vm_area_struct *vma = NULL;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct prio_tree_iter iter;
	struct mm_struct* mm;

	if (!mapping)
		return 0;

	if (!spin_trylock(&mapping->i_mmap_lock))
		return 0;

	while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
					&iter, pgoff, pgoff)) != NULL) {
		mm = vma->vm_mm;
		if (!maxshareclass ||
				ckrm_mem_share_compare(maxshareclass, mm->memclass) < 0)
			maxshareclass = mm->memclass;
	}
	spin_unlock(&mapping->i_mmap_lock);

	if (maxshareclass && pgcls != maxshareclass) {
		ckrm_change_page_class(page, maxshareclass);
		return 1;
	}
	return 0;
}

static int
ckrm_mem_evaluate_page(struct page* page)
{
	int changed = 0;

	if (page->mapping) {
		if (PageAnon(page))
			changed = ckrm_mem_evaluate_page_anon(page);
		else
			changed = ckrm_mem_evaluate_page_file(page);
	}
	return changed;
}

static inline int
class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma,
		pmd_t* pmdir, unsigned long address, unsigned long end)
{
	pte_t* pte;
	unsigned long pmd_end;

	if (pmd_none(*pmdir))
		return 0;
	BUG_ON(pmd_bad(*pmdir));

	pte = pte_offset_map(pmdir, address);
	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		if (pte_present(*pte)) {
			ckrm_mem_evaluate_page(pte_page(*pte));
		}
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
	pte_unmap(pte - 1);	// release the mapping taken by pte_offset_map()
	return 0;
}

static inline int
class_migrate_pgd(struct mm_struct* mm, struct vm_area_struct* vma,
		pgd_t* pgdir, unsigned long address, unsigned long end)
{
	pmd_t* pmd;
	unsigned long pgd_end;

	if (pgd_none(*pgdir))
		return 0;
	BUG_ON(pgd_bad(*pgdir));

	pmd = pmd_offset(pgdir, address);
	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;

	if (pgd_end && (end > pgd_end))
		end = pgd_end;

	do {
		class_migrate_pmd(mm, vma, pmd, address, end);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return 0;
}

static inline int
class_migrate_vma(struct mm_struct* mm, struct vm_area_struct* vma)
{
	pgd_t* pgdir;
	unsigned long address, end;

	address = vma->vm_start;
	end = vma->vm_end;

	pgdir = pgd_offset(vma->vm_mm, address);
	do {
		class_migrate_pgd(mm, vma, pgdir, address, end);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	} while (address && (address < end));
	return 0;
}

/* this function is called with mm->peertask_lock held */
void
ckrm_mem_evaluate_mm(struct mm_struct* mm)
{
	struct task_struct *task;
	struct ckrm_mem_res *maxshareclass = NULL;
	struct vm_area_struct *vma;

	if (list_empty(&mm->tasklist)) {
		/* We leave mm->memclass untouched since an mm with no
		 * associated task will either be deleted soon or get
		 * attached to another task later.
		 */
		return;
	}

	list_for_each_entry(task, &mm->tasklist, mm_peers) {
		ckrm_mem_res_t* cls = GET_MEM_CLASS(task);
		if (!cls)
			continue;
		if (!maxshareclass || ckrm_mem_share_compare(maxshareclass, cls) < 0)
			maxshareclass = cls;
	}

	if (mm->memclass != (void *)maxshareclass) {
		mem_class_get(maxshareclass);
		if (mm->memclass)
			mem_class_put(mm->memclass);
		mm->memclass = maxshareclass;

		/* Go through all VMAs to migrate pages */
		down_read(&mm->mmap_sem);
		vma = mm->mmap;
		while (vma) {
			class_migrate_vma(mm, vma);
			vma = vma->vm_next;
		}
		up_read(&mm->mmap_sem);
	}
	return;
}

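/* Called when a page is newly added to an mm: charge the page to the
 * mm's class (or the current task's class if the mm has none) when the
 * page is unmapped or that class has a larger share than the page's
 * current class. */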
void
ckrm_mem_evaluate_page_byadd(struct page* page, struct mm_struct* mm)
{
	ckrm_mem_res_t *pgcls = page_class(page);
	ckrm_mem_res_t *chgcls = mm->memclass ? mm->memclass : GET_MEM_CLASS(current);

	if (!chgcls || pgcls == chgcls)
		return;

	if (!page->mapcount) {
		ckrm_change_page_class(page, chgcls);
		return;
	}
	if (ckrm_mem_share_compare(pgcls, chgcls) < 0) {
		ckrm_change_page_class(page, chgcls);
		return;
	}
	return;
}

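/* Attach 'task' to the peer list of 'mm' and re-evaluate the mm's
 * memory class if it differs from the task's class. */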
void
ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task)
{
	spin_lock(&mm->peertask_lock);
	if (!list_empty(&task->mm_peers)) {
		printk(KERN_ERR "CKRM_MEM: Task list should be empty, but is not!!\n");
		list_del_init(&task->mm_peers);
	}
	list_add_tail(&task->mm_peers, &mm->tasklist);
	if (mm->memclass != GET_MEM_CLASS(task))
		ckrm_mem_evaluate_mm(mm);
	spin_unlock(&mm->peertask_lock);
	return;
}

MODULE_LICENSE("GPL");