disabled code for CKRM memory class shrinking
kernel/ckrm/ckrm_mem.c
/* ckrm_mem.c - Memory Resource Manager for CKRM
 *
 * Copyright (C) Chandra Seetharaman, IBM Corp. 2004
 *
 * Provides a Memory Resource controller for CKRM
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/* Code Description: TBD
 *
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <asm/errno.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/cache.h>
#include <linux/percpu.h>
#include <linux/pagevec.h>

#include <linux/ckrm_mem_inline.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>

#define MEM_NAME "mem"

#define CKRM_MEM_MAX_HIERARCHY 2 // limits the hierarchy to three levels: 0, 1 and 2

/* all 1-level memory_share_class structures are chained together */
static LIST_HEAD(ckrm_memclass_list);
LIST_HEAD(ckrm_shrink_list);
EXPORT_SYMBOL(ckrm_shrink_list);
spinlock_t ckrm_mem_lock = SPIN_LOCK_UNLOCKED; // protects both lists above
EXPORT_SYMBOL(ckrm_mem_lock);
unsigned int ckrm_tot_lru_pages; // total # of pages in the system
                                 // currently doesn't handle memory add/remove
EXPORT_SYMBOL(ckrm_tot_lru_pages);

static ckrm_mem_res_t *ckrm_mem_root_class;
atomic_t ckrm_mem_real_count = ATOMIC_INIT(0);
EXPORT_SYMBOL(ckrm_mem_real_count);
static void ckrm_mem_evaluate_all_pages(void);

/* Initialize rescls values
 * May be called on each rcfs unmount or as part of error recovery
 * to make share values sane.
 * Does not traverse hierarchy reinitializing children.
 */

static void
set_ckrm_tot_pages(void)
{
        struct zone *zone;
        int tot_lru_pages = 0;

        for_each_zone(zone) {
                tot_lru_pages += zone->nr_active;
                tot_lru_pages += zone->nr_inactive;
                tot_lru_pages += zone->free_pages;
        }
        ckrm_tot_lru_pages = tot_lru_pages;
}

static void
mem_res_initcls_one(void *my_res)
{
        ckrm_mem_res_t *res = my_res;

        memset(res, 0, sizeof(ckrm_mem_res_t));

        res->shares.my_guarantee     = CKRM_SHARE_DONTCARE;
        res->shares.my_limit         = CKRM_SHARE_DONTCARE;
        res->shares.total_guarantee  = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
        res->shares.max_limit        = CKRM_SHARE_DFLT_MAX_LIMIT;
        res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
        res->shares.cur_max_limit    = 0;

        res->pg_guar = CKRM_SHARE_DONTCARE;
        res->pg_limit = CKRM_SHARE_DONTCARE;
        res->pg_unused = 0;
}

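/*
 * Allocate and initialize a memory resource class for 'core'.
 * Only one root class is allowed, children may only be created under
 * an existing root, and the hierarchy depth is capped at
 * CKRM_MEM_MAX_HIERARCHY. The new class is chained on
 * ckrm_memclass_list under ckrm_mem_lock.
 */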
static void *
mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
{
        ckrm_mem_res_t *res, *parres;

        if (mem_rcbs.resid == -1) {
                return NULL;
        }

        parres = ckrm_get_res_class(parent, mem_rcbs.resid, ckrm_mem_res_t);
        if (parres && (parres->hier == CKRM_MEM_MAX_HIERARCHY)) {
                // only CKRM_MEM_MAX_HIERARCHY levels are allowed
                return NULL;
        }

        if (unlikely((parent == NULL) && (ckrm_mem_root_class != NULL))) {
                printk(KERN_ERR "MEM_RC: Only one root class is allowed\n");
                return NULL;
        }

        if (unlikely((parent != NULL) && (ckrm_mem_root_class == NULL))) {
                printk(KERN_ERR "MEM_RC: creating child class without root class\n");
                return NULL;
        }

        res = kmalloc(sizeof(ckrm_mem_res_t), GFP_ATOMIC);

        if (res) {
                mem_res_initcls_one(res);
                res->core = core;
                res->parent = parent;
                spin_lock(&ckrm_mem_lock);
                list_add(&res->mcls_list, &ckrm_memclass_list);
                spin_unlock(&ckrm_mem_lock);
                if (parent == NULL) {
                        // I am part of the root class. So, set the max to
                        // the number of pages available
                        res->pg_guar = ckrm_tot_lru_pages;
                        res->pg_unused = ckrm_tot_lru_pages;
                        res->pg_limit = ckrm_tot_lru_pages;
                        res->hier = 0;
                        ckrm_mem_root_class = res;
                } else {
                        res->hier = parres->hier + 1;
                }
                mem_class_get(res);
        } else
                printk(KERN_ERR "mem_res_alloc: failed GFP_ATOMIC alloc\n");
        return res;
}

/*
 * It is the caller's responsibility to make sure that the parent only
 * has children that are to be accounted, i.e. if a new child is added
 * this function should be called after it has been added, and if a
 * child is deleted this should be called after the child is removed.
 */
static void
child_maxlimit_changed_local(ckrm_mem_res_t *parres)
{
        int maxlimit = 0;
        ckrm_mem_res_t *childres;
        ckrm_core_class_t *child = NULL;

        // run thru parent's children and get the new max_limit of the parent
        ckrm_lock_hier(parres->core);
        while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
                childres = ckrm_get_res_class(child, mem_rcbs.resid,
                                ckrm_mem_res_t);
                // a child core may have no memory class attached
                if (childres && (maxlimit < childres->shares.my_limit)) {
                        maxlimit = childres->shares.my_limit;
                }
        }
        ckrm_unlock_hier(parres->core);
        parres->shares.cur_max_limit = maxlimit;
}

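/*
 * Tear down a memory resource class: return its guarantee to the
 * parent, recompute the parent's max limit, unchain it from
 * ckrm_memclass_list and drop its reference. All LRU pages are then
 * re-evaluated so that no page is left pointing at the dead class.
 */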
static void
mem_res_free(void *my_res)
{
        ckrm_mem_res_t *res = my_res;
        ckrm_mem_res_t *parres;

        if (!res)
                return;

        res->shares.my_guarantee = 0;
        res->shares.my_limit = 0;
        res->pg_guar = 0;
        res->pg_limit = 0;
        res->pg_unused = 0;

        parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);
        // return child's limit/guarantee to parent node
        if (parres) {
                child_guarantee_changed(&parres->shares, res->shares.my_guarantee, 0);
                child_maxlimit_changed_local(parres);
        }
        spin_lock(&ckrm_mem_lock);
        list_del(&res->mcls_list);
        spin_unlock(&ckrm_mem_lock);
        mem_class_put(res);
        ckrm_mem_evaluate_all_pages();
        return;
}

/*
 * Recalculate the guarantee and limit in # of pages... and propagate the
 * same to children.
 * Caller is responsible for protecting res and for the integrity of parres
 */
static void
recalc_and_propagate(ckrm_mem_res_t * res, ckrm_mem_res_t * parres)
{
        ckrm_core_class_t *child = NULL;
        ckrm_mem_res_t *childres;
        int resid = mem_rcbs.resid;
        struct ckrm_shares *self = &res->shares;

        if (parres) {
                struct ckrm_shares *par = &parres->shares;

                // calculate pg_guar and pg_limit
                //
                if (parres->pg_guar == CKRM_SHARE_DONTCARE ||
                                self->my_guarantee == CKRM_SHARE_DONTCARE) {
                        res->pg_guar = CKRM_SHARE_DONTCARE;
                } else if (par->total_guarantee) {
                        u64 temp = (u64) self->my_guarantee * parres->pg_guar;
                        do_div(temp, par->total_guarantee);
                        res->pg_guar = (int) temp;
                } else {
                        res->pg_guar = 0;
                }

                if (parres->pg_limit == CKRM_SHARE_DONTCARE ||
                                self->my_limit == CKRM_SHARE_DONTCARE) {
                        res->pg_limit = CKRM_SHARE_DONTCARE;
                } else if (par->max_limit) {
                        u64 temp = (u64) self->my_limit * parres->pg_limit;
                        do_div(temp, par->max_limit);
                        res->pg_limit = (int) temp;
                } else {
                        res->pg_limit = 0;
                }
        }

        // calculate unused units
        if (res->pg_guar == CKRM_SHARE_DONTCARE) {
                res->pg_unused = CKRM_SHARE_DONTCARE;
        } else if (self->total_guarantee) {
                u64 temp = (u64) self->unused_guarantee * res->pg_guar;
                do_div(temp, self->total_guarantee);
                res->pg_unused = (int) temp;
        } else {
                res->pg_unused = 0;
        }

        // propagate to children
        ckrm_lock_hier(res->core);
        while ((child = ckrm_get_next_child(res->core, child)) != NULL) {
                childres = ckrm_get_res_class(child, resid, ckrm_mem_res_t);
                // skip children that have no memory class attached
                if (childres)
                        recalc_and_propagate(childres, res);
        }
        ckrm_unlock_hier(res->core);
        return;
}

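/*
 * Update the share values of a class and, on success, recompute the
 * parent's max limit and propagate the new page guarantees/limits
 * down the hierarchy.
 */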
static int
mem_set_share_values(void *my_res, struct ckrm_shares *shares)
{
        ckrm_mem_res_t *res = my_res;
        ckrm_mem_res_t *parres;
        int rc;

        if (!res)
                return -EINVAL;

        parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);

        rc = set_shares(shares, &res->shares, parres ? &parres->shares : NULL);

        if ((rc == 0) && (parres != NULL)) {
                child_maxlimit_changed_local(parres);
                recalc_and_propagate(parres, NULL);
        }
        return rc;
}

static int
mem_get_share_values(void *my_res, struct ckrm_shares *shares)
{
        ckrm_mem_res_t *res = my_res;

        if (!res)
                return -EINVAL;
        *shares = res->shares;
        return 0;
}

static int
mem_get_stats(void *my_res, struct seq_file *sfile)
{
        ckrm_mem_res_t *res = my_res;

        if (!res)
                return -EINVAL;

#if 0
        seq_printf(sfile, "tot %6d;gua %6d;lmt %6d;unu %6d;"
                        "lnt %6d;bor %6d;rlt %6d\n", atomic_read(&res->pg_total),
                        res->pg_guar, res->pg_limit, res->pg_unused, res->pg_lent,
                        res->pg_borrowed, atomic_read(&ckrm_mem_real_count));
#endif

        seq_printf(sfile, "----------- Memory Resource stats start -----------\n");
        seq_printf(sfile, "Number of pages used (including pages lent to children):"
                        " %d\n", atomic_read(&res->pg_total));
        seq_printf(sfile, "Number of pages guaranteed: %d\n",
                        res->pg_guar);
        seq_printf(sfile, "Maximum limit of pages: %d\n",
                        res->pg_limit);
        seq_printf(sfile, "Total number of pages available "
                        "(after serving guarantees to children): %d\n",
                        res->pg_unused);
        seq_printf(sfile, "Number of pages lent to children: %d\n",
                        res->pg_lent);
        seq_printf(sfile, "Number of pages borrowed from the parent: %d\n",
                        res->pg_borrowed);
        seq_printf(sfile, "----------- Memory Resource stats end -----------\n");

        return 0;
}

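/*
 * Called when a task changes resource class. Keep the task on its
 * mm's peer list only while it belongs to a class, then re-evaluate
 * which class the mm (and its pages) should be charged to.
 */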
static void
mem_change_resclass(void *tsk, void *old, void *new)
{
        struct mm_struct *mm;
        struct task_struct *task = tsk, *t1;
        struct ckrm_mem_res *prev_mmcls;

        if (!task->mm || (new == old) || (old == (void *) -1))
                return;

        mm = task->active_mm;
        spin_lock(&mm->peertask_lock);
        prev_mmcls = mm->memclass;

        if (new == NULL) {
                list_del_init(&task->mm_peers);
        } else {
                int found = 0;
                list_for_each_entry(t1, &mm->tasklist, mm_peers) {
                        if (t1 == task) {
                                found++;
                                break;
                        }
                }
                if (!found) {
                        list_del_init(&task->mm_peers);
                        list_add_tail(&task->mm_peers, &mm->tasklist);
                }
        }

        spin_unlock(&mm->peertask_lock);
        ckrm_mem_evaluate_mm(mm);
        /*
        printk("chg_cls: task <%s:%d> mm %p oldmm %s newmm %s o %s n %s\n",
                task->comm, task->pid, mm, prev_mmcls ? prev_mmcls->core->name:
                "NULL", mm->memclass ? mm->memclass->core->name : "NULL",
                o ? o->core->name: "NULL", n ? n->core->name: "NULL");
        */
        return;
}

// config file is available only at the root level,
// so assuming my_res to be the system level class
static int
mem_set_config(void *my_res, const char *cfgstr)
{
        ckrm_mem_res_t *res = my_res;

        printk(KERN_INFO "%s class of %s is called with config<%s>\n",
                        MEM_NAME, res->core->name, cfgstr);
        return 0;
}

static int
mem_show_config(void *my_res, struct seq_file *sfile)
{
        struct zone *zone;
        ckrm_mem_res_t *res = my_res;
        int active = 0, inactive = 0, fr = 0;

        if (!res)
                return -EINVAL;

        for_each_zone(zone) {
                active += zone->nr_active;
                inactive += zone->nr_inactive;
                fr += zone->free_pages;
        }
        seq_printf(sfile, "res=%s;tot_pages=%d,active=%d,inactive=%d,free=%d\n",
                        MEM_NAME, ckrm_tot_lru_pages, active, inactive, fr);

        return 0;
}

static int
mem_reset_stats(void *my_res)
{
        ckrm_mem_res_t *res = my_res;

        printk(KERN_INFO " memclass of %s called for reset\n", res->core->name);
        return 0;
}

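/* callbacks registered with the CKRM core for the memory controller */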
struct ckrm_res_ctlr mem_rcbs = {
        .res_name          = MEM_NAME,
        .res_hdepth        = CKRM_MEM_MAX_HIERARCHY,
        .resid             = -1,
        .res_alloc         = mem_res_alloc,
        .res_free          = mem_res_free,
        .set_share_values  = mem_set_share_values,
        .get_share_values  = mem_get_share_values,
        .get_stats         = mem_get_stats,
        .change_resclass   = mem_change_resclass,
        .show_config       = mem_show_config,
        .set_config        = mem_set_config,
        .reset_stats       = mem_reset_stats,
};

EXPORT_SYMBOL(mem_rcbs);

int __init
init_ckrm_mem_res(void)
{
        struct ckrm_classtype *clstype;
        int resid = mem_rcbs.resid;

        set_ckrm_tot_pages();
        clstype = ckrm_find_classtype_by_name("taskclass");
        if (clstype == NULL) {
                printk(KERN_INFO " Unknown ckrm classtype<taskclass>\n");
                return -ENOENT;
        }

        if (resid == -1) {
                resid = ckrm_register_res_ctlr(clstype, &mem_rcbs);
                if (resid != -1) {
                        mem_rcbs.classtype = clstype;
                }
        }
        return ((resid < 0) ? resid : 0);
}

void __exit
exit_ckrm_mem_res(void)
{
        ckrm_unregister_res_ctlr(&mem_rcbs);
        mem_rcbs.resid = -1;
}

module_init(init_ckrm_mem_res)
module_exit(exit_ckrm_mem_res)

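/*
 * OR 'flag' into the reclaim flags of 'parres' and, recursively, of
 * every class below it in the hierarchy.
 */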
static void
set_flags_of_children(ckrm_mem_res_t *parres, unsigned int flag)
{
        ckrm_mem_res_t *childres;
        ckrm_core_class_t *child = NULL;

        parres->reclaim_flags |= flag;
        ckrm_lock_hier(parres->core);
        while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
                childres = ckrm_get_res_class(child, mem_rcbs.resid,
                                ckrm_mem_res_t);
                // skip children that have no memory class attached
                if (childres)
                        set_flags_of_children(childres, flag);
        }
        ckrm_unlock_hier(parres->core);
        return;
}

// FIXME: more attention is needed to this function
static unsigned int
set_usage_flags(ckrm_mem_res_t *res)
{
        int tot_usage, cls_usage, range, guar;

        if (res->pg_limit == CKRM_SHARE_DONTCARE) {
                // no limit is set for the class. don't bother it
                res->reclaim_flags = 0;
                return res->reclaim_flags;
        }

        tot_usage = atomic_read(&res->pg_total);
        cls_usage = tot_usage - res->pg_lent;
        guar = (res->pg_guar > 0) ? res->pg_guar : 0;
        range = res->pg_limit - guar;

        if ((tot_usage > (guar + ((110 * range) / 100))) &&
                        (res->pg_lent > (guar + ((25 * range) / 100)))) {
                set_flags_of_children(res, CLS_PARENT_OVER);
        }

        if (cls_usage > (guar + ((110 * range) / 100))) {
                res->reclaim_flags |= CLS_OVER_110;
        } else if (cls_usage > (guar + range)) {
                res->reclaim_flags |= CLS_OVER_100;
        } else if (cls_usage > (guar + ((3 * range) / 4))) {
                res->reclaim_flags |= CLS_OVER_75;
        } else if (cls_usage > (guar + (range / 2))) {
                res->reclaim_flags |= CLS_OVER_50;
        } else if (cls_usage > (guar + (range / 4))) {
                res->reclaim_flags |= CLS_OVER_25;
        } else if (cls_usage > guar) {
                res->reclaim_flags |= CLS_OVER_GUAR;
        } else {
                res->reclaim_flags = 0;
        }
        return res->reclaim_flags;
}

/*
 * The functions ckrm_setup_reclamation(), ckrm_teardown_reclamation(),
 * ckrm_get_reclaim_bits() and the macro ckrm_kick_page(), along with
 * the CLS_* macros, define how pages are reclaimed.
 * Keeping this logic behind these interfaces removes the need to
 * change the reclamation code in the VM when we want to change the
 * logic.
 */
unsigned int
ckrm_setup_reclamation(void)
{
        ckrm_mem_res_t *res;
        unsigned int ret = 0;

        spin_lock(&ckrm_mem_lock);
        set_ckrm_tot_pages();
        ckrm_mem_root_class->pg_guar = ckrm_tot_lru_pages;
        ckrm_mem_root_class->pg_unused = ckrm_tot_lru_pages;
        ckrm_mem_root_class->pg_limit = ckrm_tot_lru_pages;
        recalc_and_propagate(ckrm_mem_root_class, NULL);
        list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
                ret |= set_usage_flags(res);
        }
        spin_unlock(&ckrm_mem_lock);
        return ret;
}

void
ckrm_teardown_reclamation(void)
{
        ckrm_mem_res_t *res;

        spin_lock(&ckrm_mem_lock);
        list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
                res->reclaim_flags = 0;
        }
        spin_unlock(&ckrm_mem_lock);
}

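/*
 * Pick the reclaim flags the caller should act on: CLS_SHRINK wins
 * outright; otherwise extract the highest severity bit set in *flags
 * together with all higher CLS_* bits, and clear them from *flags so
 * each severity level is handed out only once.
 */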
void
ckrm_get_reclaim_bits(unsigned int *flags, unsigned int *extract)
{
        int i, j, mask = 0;

        if (*flags == 0) {
                *extract = 0;
                return;
        }

        if (*flags & CLS_SHRINK) {
                *extract = CLS_SHRINK;
                *flags = 0;
                return;
        }

        i = fls(*flags);
        // build a mask of the (i-1) bits below the highest set flag
        for (j = i - 1; j > 0; j--) {
                mask = (mask << 1) | 1;
        }
        *extract = (CLS_FLAGS_ALL & ~mask);
        *flags &= ~*extract;
        return;
}

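/*
 * Called when a class hits its page limit. Rate-limited to 10 shrink
 * attempts per 10-second window; queues the class on ckrm_shrink_list,
 * marks it MEM_AT_LIMIT and kicks kswapd to start reclaiming.
 */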
void
ckrm_at_limit(ckrm_mem_res_t *cls)
{
#ifndef AT_LIMIT_SUPPORT
#warning "ckrm_at_limit disabled due to problems with memory hog tests"
#else
        struct zone *zone;
        unsigned long now = jiffies;

        if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) ||
                        ((cls->flags & MEM_AT_LIMIT) == MEM_AT_LIMIT)) {
                return;
        }
        if ((cls->last_shrink + (10 * HZ)) < now) { // 10 seconds since last shrink?
                cls->last_shrink = now;
                cls->shrink_count = 0;
        }
        cls->shrink_count++;
        if (cls->shrink_count > 10) {
                return;
        }
        spin_lock(&ckrm_mem_lock);
        list_add(&cls->shrink_list, &ckrm_shrink_list);
        spin_unlock(&ckrm_mem_lock);
        cls->flags |= MEM_AT_LIMIT;
        for_each_zone(zone) {
                wakeup_kswapd(zone);
                break; // waking up one kswapd is enough
        }
#endif // AT_LIMIT_SUPPORT
}

// debug counters for the page re-evaluation code below
static int unmapped = 0, changed = 0, unchanged = 0, maxnull = 0,
           anovma = 0, fnovma = 0;
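/*
 * Re-evaluate which class an anonymous page should be charged to:
 * walk the page's anon_vma list and charge it to the best-share class
 * among the mm's mapping it.
 */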
static void
ckrm_mem_evaluate_page_anon(struct page* page)
{
        ckrm_mem_res_t* pgcls = page_class(page);
        ckrm_mem_res_t* maxshareclass = NULL;
        struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
        struct vm_area_struct *vma;
        struct mm_struct* mm;
        int v = 0;

        spin_lock(&anon_vma->lock);
        BUG_ON(list_empty(&anon_vma->head));
        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
                v++;
                mm = vma->vm_mm;
                if (!maxshareclass ||
                                ckrm_mem_share_compare(maxshareclass, mm->memclass) < 0) {
                        maxshareclass = mm->memclass;
                }
        }
        spin_unlock(&anon_vma->lock);
        if (!v)
                anovma++;

        if (!maxshareclass)
                maxnull++;
        if (maxshareclass && (pgcls != maxshareclass)) {
                ckrm_change_page_class(page, maxshareclass);
                changed++;
        } else
                unchanged++;
        return;
}

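/*
 * Same as above for pagecache pages: walk the i_mmap prio tree of the
 * page's mapping and charge the page to the best-share class. Bails
 * out (leaving the page where it is) if the i_mmap lock is contended.
 */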
static void
ckrm_mem_evaluate_page_file(struct page* page)
{
        ckrm_mem_res_t* pgcls = page_class(page);
        ckrm_mem_res_t* maxshareclass = NULL;
        struct address_space *mapping = page->mapping;
        struct vm_area_struct *vma = NULL;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct prio_tree_iter iter;
        struct mm_struct* mm;
        int v = 0;

        if (!mapping)
                return;

        if (!spin_trylock(&mapping->i_mmap_lock))
                return;

        while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
                                        &iter, pgoff, pgoff)) != NULL) {
                v++;
                mm = vma->vm_mm;
                if (!maxshareclass ||
                                ckrm_mem_share_compare(maxshareclass, mm->memclass) < 0)
                        maxshareclass = mm->memclass;
        }
        spin_unlock(&mapping->i_mmap_lock);

        if (!v)
                fnovma++;
        if (!maxshareclass)
                maxnull++;

        if (maxshareclass && pgcls != maxshareclass) {
                ckrm_change_page_class(page, maxshareclass);
                changed++;
        } else
                unchanged++;
        return;
}

static void
ckrm_mem_evaluate_page(struct page* page)
{
        if (page->mapping) {
                if (PageAnon(page))
                        ckrm_mem_evaluate_page_anon(page);
                else
                        ckrm_mem_evaluate_page_file(page);
        } else
                unmapped++;
        return;
}

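/*
 * Walk every LRU page in the system, re-evaluating its class, and
 * print per-class debug counts. Called when a class is freed so that
 * no page is left pointing at it.
 */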
static void
ckrm_mem_evaluate_all_pages(void)
{
        struct page *page;
        struct zone *zone;
        int active = 0, inactive = 0, cleared = 0;
        int act_cnt, inact_cnt, idx;
        ckrm_mem_res_t *res;

        spin_lock(&ckrm_mem_lock);
        list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
                res->tmp_cnt = 0;
        }
        spin_unlock(&ckrm_mem_lock);

        for_each_zone(zone) {
                spin_lock_irq(&zone->lru_lock);
                list_for_each_entry(page, &zone->inactive_list, lru) {
                        ckrm_mem_evaluate_page(page);
                        inactive++;
                        page_class(page)->tmp_cnt++;
                        if (!test_bit(PG_ckrm_account, &page->flags))
                                cleared++;
                }
                list_for_each_entry(page, &zone->active_list, lru) {
                        ckrm_mem_evaluate_page(page);
                        active++;
                        page_class(page)->tmp_cnt++;
                        if (!test_bit(PG_ckrm_account, &page->flags))
                                cleared++;
                }
                spin_unlock_irq(&zone->lru_lock);
        }
        printk("all_pages: active %d inactive %d cleared %d\n",
                        active, inactive, cleared);
        spin_lock(&ckrm_mem_lock);
        list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
                act_cnt = 0; inact_cnt = 0; idx = 0;
                for_each_zone(zone) {
                        act_cnt += res->nr_active[idx];
                        inact_cnt += res->nr_inactive[idx];
                        idx++;
                }
                printk("all_pages: %s: tmp_cnt %d; act_cnt %d inact_cnt %d\n",
                        res->core->name, res->tmp_cnt, act_cnt, inact_cnt);
        }
        spin_unlock(&ckrm_mem_lock);

        // check all mm's in the system to see which memclass they are
        // attached to.
        return;
}

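/*
 * Page-table walkers: the three functions below re-charge every
 * resident page of a VMA to the mm's current memory class, one pte
 * at a time.
 */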
static /*inline*/ int
class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma,
                pmd_t* pmdir, unsigned long address, unsigned long end)
{
        pte_t *pte, *orig_pte;
        unsigned long pmd_end;

        if (pmd_none(*pmdir))
                return 0;
        BUG_ON(pmd_bad(*pmdir));

        orig_pte = pte = pte_offset_map(pmdir, address);
        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;

        do {
                if (pte_present(*pte)) {
                        BUG_ON(mm->memclass == NULL);
                        ckrm_change_page_class(pte_page(*pte), mm->memclass);
                        // ckrm_mem_evaluate_page(pte_page(*pte));
                }
                address += PAGE_SIZE;
                pte++;
        } while (address && (address < end));
        pte_unmap(orig_pte);
        return 0;
}

static /*inline*/ int
class_migrate_pgd(struct mm_struct* mm, struct vm_area_struct* vma,
                pgd_t* pgdir, unsigned long address, unsigned long end)
{
        pmd_t* pmd;
        unsigned long pgd_end;

        if (pgd_none(*pgdir))
                return 0;
        BUG_ON(pgd_bad(*pgdir));

        pmd = pmd_offset(pgdir, address);
        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;

        if (pgd_end && (end > pgd_end))
                end = pgd_end;

        do {
                class_migrate_pmd(mm, vma, pmd, address, end);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address && (address < end));
        return 0;
}

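/* top level of the walk: iterate over the pgd entries of one VMA */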
static /*inline*/ int
class_migrate_vma(struct mm_struct* mm, struct vm_area_struct* vma)
{
        pgd_t* pgdir;
        unsigned long address, end;

        address = vma->vm_start;
        end = vma->vm_end;

        pgdir = pgd_offset(vma->vm_mm, address);
        do {
                class_migrate_pgd(mm, vma, pgdir, address, end);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        } while (address && (address < end));
        return 0;
}

/* callers in this file drop mm->peertask_lock before calling this
 * function, since it may sleep taking mm->mmap_sem */
void
ckrm_mem_evaluate_mm(struct mm_struct* mm)
{
        struct task_struct *task;
        struct ckrm_mem_res *maxshareclass = NULL;
        struct vm_area_struct *vma;

        if (list_empty(&mm->tasklist)) {
                /* We leave mm->memclass untouched since we believe that an
                 * mm with no task associated with it will either be deleted
                 * soon or be attached to another task later.
                 */
                return;
        }

        list_for_each_entry(task, &mm->tasklist, mm_peers) {
                ckrm_mem_res_t* cls = GET_MEM_CLASS(task);
                if (!cls)
                        continue;
                if (!maxshareclass || ckrm_mem_share_compare(maxshareclass, cls) < 0)
                        maxshareclass = cls;
        }

        if (maxshareclass && (mm->memclass != (void *)maxshareclass)) {
                if (mm->memclass)
                        mem_class_put(mm->memclass);
                mm->memclass = maxshareclass;
                mem_class_get(maxshareclass);

                /* Go through all VMAs to migrate pages */
                down_read(&mm->mmap_sem);
                vma = mm->mmap;
                while (vma) {
                        class_migrate_vma(mm, vma);
                        vma = vma->vm_next;
                }
                up_read(&mm->mmap_sem);
        }
        return;
}

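/*
 * Attach a task to an mm's peer list and re-evaluate the mm's class
 * if the task's class differs from the mm's current one.
 */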
void
ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task)
{
        spin_lock(&mm->peertask_lock);
        if (!list_empty(&task->mm_peers)) {
                printk(KERN_ERR "CKRM_MEM: Task list should be empty, but is not!!\n");
                list_del_init(&task->mm_peers);
        }
        list_add_tail(&task->mm_peers, &mm->tasklist);
        spin_unlock(&mm->peertask_lock);
        if (mm->memclass != GET_MEM_CLASS(task))
                ckrm_mem_evaluate_mm(mm);
        return;
}

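/* return 1 if 'cls' is still on the global memory class list */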
int
ckrm_memclass_valid(ckrm_mem_res_t *cls)
{
        ckrm_mem_res_t *tmp;

        spin_lock(&ckrm_mem_lock);
        list_for_each_entry(tmp, &ckrm_memclass_list, mcls_list) {
                if (tmp == cls) {
                        spin_unlock(&ckrm_mem_lock);
                        return 1;
                }
        }
        spin_unlock(&ckrm_mem_lock);
        return 0;
}

MODULE_LICENSE("GPL");