/* ckrm_mem.c - Memory Resource Manager for CKRM
 *
 * Copyright (C) Chandra Seetharaman, IBM Corp. 2004
 *
 * Provides a Memory Resource controller for CKRM
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/* Code Description:
 * Implements the CKRM memory resource controller: accounts LRU pages
 * against classes, converts share settings into per-class page
 * guarantees and limits, and cooperates with the VM to reclaim pages
 * from classes that exceed them.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <asm/errno.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/cache.h>
#include <linux/percpu.h>
#include <linux/pagevec.h>
#include <linux/jiffies.h>	/* time_after(), used in ckrm_at_limit() */

#include <linux/ckrm_mem_inline.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>

#define MEM_NAME "mem"

#define CKRM_MEM_MAX_HIERARCHY 2 // hierarchy depth of at most 2, i.e. levels 0, 1 and 2

/* all memory resource classes are chained together */
static LIST_HEAD(ckrm_memclass_list);
LIST_HEAD(ckrm_shrink_list);
EXPORT_SYMBOL(ckrm_shrink_list);
spinlock_t ckrm_mem_lock = SPIN_LOCK_UNLOCKED; // protects both lists above
EXPORT_SYMBOL(ckrm_mem_lock);
unsigned int ckrm_tot_lru_pages; // total # of (LRU + free) pages in the system;
                                 // currently doesn't handle memory add/remove
EXPORT_SYMBOL(ckrm_tot_lru_pages);

static ckrm_mem_res_t *ckrm_mem_root_class;
atomic_t ckrm_mem_real_count = ATOMIC_INIT(0);
EXPORT_SYMBOL(ckrm_mem_real_count);
static void ckrm_mem_evaluate_all_pages(void);

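/*
 * Take a snapshot of the system-wide LRU page count: the sum of
 * active, inactive and free pages over all zones. Recomputed at init
 * and at the start of each reclamation pass, since memory add/remove
 * is not otherwise tracked.
 */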
static void
set_ckrm_tot_pages(void)
{
        struct zone *zone;
        int tot_lru_pages = 0;

        for_each_zone(zone) {
                tot_lru_pages += zone->nr_active;
                tot_lru_pages += zone->nr_inactive;
                tot_lru_pages += zone->free_pages;
        }
        ckrm_tot_lru_pages = tot_lru_pages;
}

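/*
 * Initialize rescls values.
 * May be called on each rcfs unmount or as part of error recovery
 * to make share values sane: shares are reset to the defaults and the
 * page guarantee/limit figures to "don't care".
 * Does not traverse the hierarchy reinitializing children.
 */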
static void
mem_res_initcls_one(void *my_res)
{
        ckrm_mem_res_t *res = my_res;

        memset(res, 0, sizeof(ckrm_mem_res_t));

        res->shares.my_guarantee     = CKRM_SHARE_DONTCARE;
        res->shares.my_limit         = CKRM_SHARE_DONTCARE;
        res->shares.total_guarantee  = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
        res->shares.max_limit        = CKRM_SHARE_DFLT_MAX_LIMIT;
        res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
        res->shares.cur_max_limit    = 0;

        res->pg_guar = CKRM_SHARE_DONTCARE;
        res->pg_limit = CKRM_SHARE_DONTCARE;
        res->pg_unused = 0;
}

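/*
 * Allocate and initialize a memory resource class. Only one root
 * class is allowed and the hierarchy may be at most
 * CKRM_MEM_MAX_HIERARCHY levels deep; the root class starts out
 * owning all of the LRU pages in the system.
 */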
static void *
mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
{
        ckrm_mem_res_t *res, *parres;

        if (mem_rcbs.resid == -1) {
                return NULL;
        }

        parres = ckrm_get_res_class(parent, mem_rcbs.resid, ckrm_mem_res_t);
        if (parres && (parres->hier == CKRM_MEM_MAX_HIERARCHY)) {
                // allows only up to CKRM_MEM_MAX_HIERARCHY levels
                return NULL;
        }

        if (unlikely((parent == NULL) && (ckrm_mem_root_class != NULL))) {
                printk(KERN_ERR "MEM_RC: Only one root class is allowed\n");
                return NULL;
        }

        if (unlikely((parent != NULL) && (ckrm_mem_root_class == NULL))) {
                printk(KERN_ERR "MEM_RC: creating child class without root class\n");
                return NULL;
        }

        res = kmalloc(sizeof(ckrm_mem_res_t), GFP_ATOMIC);

        if (res) {
                mem_res_initcls_one(res);
                res->core = core;
                res->parent = parent;
                spin_lock(&ckrm_mem_lock);
                list_add(&res->mcls_list, &ckrm_memclass_list);
                spin_unlock(&ckrm_mem_lock);
                if (parent == NULL) {
                        // I am the root class. So, set the max to the
                        // number of pages available in the system.
                        res->pg_guar = ckrm_tot_lru_pages;
                        res->pg_unused = ckrm_tot_lru_pages;
                        res->pg_limit = ckrm_tot_lru_pages;
                        res->hier = 0;
                        ckrm_mem_root_class = res;
                } else {
                        res->hier = parres->hier + 1;
                }
                mem_class_get(res);
        } else
                printk(KERN_ERR "mem_res_alloc: failed GFP_ATOMIC alloc\n");
        return res;
}

/*
 * It is the caller's responsibility to make sure that the parent only
 * has children that are to be accounted. i.e., if a new child is added,
 * this function should be called after it has been added, and if a
 * child is deleted, this should be called after the child is removed.
 */
static void
child_maxlimit_changed_local(ckrm_mem_res_t *parres)
{
        int maxlimit = 0;
        ckrm_mem_res_t *childres;
        ckrm_core_class_t *child = NULL;

        // run through parent's children and get the new max_limit of the parent
        ckrm_lock_hier(parres->core);
        while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
                childres = ckrm_get_res_class(child, mem_rcbs.resid,
                                ckrm_mem_res_t);
                if (maxlimit < childres->shares.my_limit) {
                        maxlimit = childres->shares.my_limit;
                }
        }
        ckrm_unlock_hier(parres->core);
        parres->shares.cur_max_limit = maxlimit;
}

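/*
 * Tear down a memory resource class: hand its guarantee back to the
 * parent, recompute the parent's cur_max_limit, unlink the class from
 * ckrm_memclass_list and drop its reference. Pages still pointing at
 * the departing class are re-homed by ckrm_mem_evaluate_all_pages().
 */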
static void
mem_res_free(void *my_res)
{
        ckrm_mem_res_t *res = my_res;
        ckrm_mem_res_t *parres;

        if (!res)
                return;

        res->shares.my_guarantee = 0;
        res->shares.my_limit = 0;
        res->pg_guar = 0;
        res->pg_limit = 0;
        res->pg_unused = 0;

        parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);
        // return child's limit/guarantee to parent node
        if (parres) {
                child_guarantee_changed(&parres->shares, res->shares.my_guarantee, 0);
                child_maxlimit_changed_local(parres);
        }
        spin_lock(&ckrm_mem_lock);
        list_del(&res->mcls_list);
        spin_unlock(&ckrm_mem_lock);
        mem_class_put(res);
        ckrm_mem_evaluate_all_pages();
        return;
}

/*
 * Recalculate the guarantee and limit in # of pages... and propagate the
 * same to children.
 * Caller is responsible for protecting res and for the integrity of parres.
 */
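/*
 * The share-to-page conversion is a simple proportion; for example
 * (numbers illustrative): if the parent holds pg_guar = 1000 pages
 * with total_guarantee = 100, a child with my_guarantee = 25 gets
 * pg_guar = 25 * 1000 / 100 = 250 pages. The same proportion maps
 * my_limit/max_limit onto pg_limit, and unused_guarantee/
 * total_guarantee onto pg_unused.
 */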
static void
recalc_and_propagate(ckrm_mem_res_t * res, ckrm_mem_res_t * parres)
{
        ckrm_core_class_t *child = NULL;
        ckrm_mem_res_t *childres;
        int resid = mem_rcbs.resid;
        struct ckrm_shares *self = &res->shares;

        if (parres) {
                struct ckrm_shares *par = &parres->shares;

                // calculate pg_guar and pg_limit
                if (parres->pg_guar == CKRM_SHARE_DONTCARE ||
                                self->my_guarantee == CKRM_SHARE_DONTCARE) {
                        res->pg_guar = CKRM_SHARE_DONTCARE;
                } else if (par->total_guarantee) {
                        u64 temp = (u64) self->my_guarantee * parres->pg_guar;
                        do_div(temp, par->total_guarantee);
                        res->pg_guar = (int) temp;
                } else {
                        res->pg_guar = 0;
                }

                if (parres->pg_limit == CKRM_SHARE_DONTCARE ||
                                self->my_limit == CKRM_SHARE_DONTCARE) {
                        res->pg_limit = CKRM_SHARE_DONTCARE;
                } else if (par->max_limit) {
                        u64 temp = (u64) self->my_limit * parres->pg_limit;
                        do_div(temp, par->max_limit);
                        res->pg_limit = (int) temp;
                } else {
                        res->pg_limit = 0;
                }
        }

        // calculate unused units
        if (res->pg_guar == CKRM_SHARE_DONTCARE) {
                res->pg_unused = CKRM_SHARE_DONTCARE;
        } else if (self->total_guarantee) {
                u64 temp = (u64) self->unused_guarantee * res->pg_guar;
                do_div(temp, self->total_guarantee);
                res->pg_unused = (int) temp;
        } else {
                res->pg_unused = 0;
        }

        // propagate to children
        ckrm_lock_hier(res->core);
        while ((child = ckrm_get_next_child(res->core, child)) != NULL) {
                childres = ckrm_get_res_class(child, resid, ckrm_mem_res_t);
                recalc_and_propagate(childres, res);
        }
        ckrm_unlock_hier(res->core);
        return;
}

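/*
 * Set this class's share values. On success the change is folded back
 * into the parent: its cur_max_limit is recomputed and the new page
 * guarantees/limits are propagated down the subtree.
 */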
static int
mem_set_share_values(void *my_res, struct ckrm_shares *shares)
{
        ckrm_mem_res_t *res = my_res;
        ckrm_mem_res_t *parres;
        int rc;

        if (!res)
                return -EINVAL;

        parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t);

        rc = set_shares(shares, &res->shares, parres ? &parres->shares : NULL);

        if ((rc == 0) && (parres != NULL)) {
                child_maxlimit_changed_local(parres);
                recalc_and_propagate(parres, NULL);
        }
        return rc;
}

static int
mem_get_share_values(void *my_res, struct ckrm_shares *shares)
{
        ckrm_mem_res_t *res = my_res;

        if (!res)
                return -EINVAL;
        *shares = res->shares;
        return 0;
}

static int
mem_get_stats(void *my_res, struct seq_file *sfile)
{
        ckrm_mem_res_t *res = my_res;

        if (!res)
                return -EINVAL;

#if 0
        seq_printf(sfile, "tot %6d;gua %6d;lmt %6d;unu %6d;"
                        "lnt %6d;bor %6d;rlt %6d\n", atomic_read(&res->pg_total),
                        res->pg_guar, res->pg_limit, res->pg_unused, res->pg_lent,
                        res->pg_borrowed, atomic_read(&ckrm_mem_real_count));
#endif

        seq_printf(sfile, "----------- Memory Resource stats start -----------\n");
        seq_printf(sfile, "Number of pages used (including pages lent to children):"
                        " %d\n", atomic_read(&res->pg_total));
        seq_printf(sfile, "Number of pages guaranteed: %d\n",
                        res->pg_guar);
        seq_printf(sfile, "Maximum limit of pages: %d\n",
                        res->pg_limit);
        seq_printf(sfile, "Total number of pages available "
                        "(after serving guarantees to children): %d\n",
                        res->pg_unused);
        seq_printf(sfile, "Number of pages lent to children: %d\n",
                        res->pg_lent);
        seq_printf(sfile, "Number of pages borrowed from the parent: %d\n",
                        res->pg_borrowed);
        seq_printf(sfile, "----------- Memory Resource stats end -----------\n");

        return 0;
}

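/*
 * Class-change callback for a task. Keeps the task on its mm's peer
 * list (the tasks sharing this mm) unless the task is leaving the
 * class entirely, then lets ckrm_mem_evaluate_mm() re-pick the mm's
 * memory class from the remaining peers.
 */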
static void
mem_change_resclass(void *tsk, void *old, void *new)
{
        struct mm_struct *mm;
        struct task_struct *task = tsk, *t1;
        struct ckrm_mem_res *prev_mmcls;

        if (!task->mm || (new == old) || (old == (void *) -1))
                return;

        mm = task->active_mm;
        spin_lock(&mm->peertask_lock);
        prev_mmcls = mm->memclass;

        if (new == NULL) {
                list_del_init(&task->mm_peers);
        } else {
                int found = 0;
                list_for_each_entry(t1, &mm->tasklist, mm_peers) {
                        if (t1 == task) {
                                found++;
                                break;
                        }
                }
                if (!found) {
                        list_del_init(&task->mm_peers);
                        list_add_tail(&task->mm_peers, &mm->tasklist);
                }
        }

        spin_unlock(&mm->peertask_lock);
        ckrm_mem_evaluate_mm(mm);
        /*
        printk("chg_cls: task <%s:%d> mm %p oldmm %s newmm %s o %s n %s\n",
                task->comm, task->pid, mm, prev_mmcls ? prev_mmcls->core->name:
                "NULL", mm->memclass ? mm->memclass->core->name : "NULL",
                o ? o->core->name: "NULL", n ? n->core->name: "NULL");
        */
        return;
}

// the config file is available only at the root level,
// so my_res is assumed to be the system-level class
static int
mem_set_config(void *my_res, const char *cfgstr)
{
        ckrm_mem_res_t *res = my_res;

        printk(KERN_INFO "%s class of %s is called with config <%s>\n",
                        MEM_NAME, res->core->name, cfgstr);
        return 0;
}

static int
mem_show_config(void *my_res, struct seq_file *sfile)
{
        struct zone *zone;
        ckrm_mem_res_t *res = my_res;
        int active = 0, inactive = 0, fr = 0;

        if (!res)
                return -EINVAL;

        for_each_zone(zone) {
                active += zone->nr_active;
                inactive += zone->nr_inactive;
                fr += zone->free_pages;
        }
        seq_printf(sfile, "res=%s;tot_pages=%d,active=%d,inactive=%d,free=%d\n",
                        MEM_NAME, ckrm_tot_lru_pages, active, inactive, fr);

        return 0;
}

static int
mem_reset_stats(void *my_res)
{
        ckrm_mem_res_t *res = my_res;

        printk(KERN_INFO "memclass of %s called for reset\n", res->core->name);
        return 0;
}

struct ckrm_res_ctlr mem_rcbs = {
        .res_name          = MEM_NAME,
        .res_hdepth        = CKRM_MEM_MAX_HIERARCHY,
        .resid             = -1,
        .res_alloc         = mem_res_alloc,
        .res_free          = mem_res_free,
        .set_share_values  = mem_set_share_values,
        .get_share_values  = mem_get_share_values,
        .get_stats         = mem_get_stats,
        .change_resclass   = mem_change_resclass,
        .show_config       = mem_show_config,
        .set_config        = mem_set_config,
        .reset_stats       = mem_reset_stats,
};

EXPORT_SYMBOL(mem_rcbs);

int __init
init_ckrm_mem_res(void)
{
        struct ckrm_classtype *clstype;
        int resid = mem_rcbs.resid;

        set_ckrm_tot_pages();
        clstype = ckrm_find_classtype_by_name("taskclass");
        if (clstype == NULL) {
                printk(KERN_INFO "Unknown ckrm classtype <taskclass>\n");
                return -ENOENT;
        }

        if (resid == -1) {
                resid = ckrm_register_res_ctlr(clstype, &mem_rcbs);
                if (resid != -1) {
                        mem_rcbs.classtype = clstype;
                }
        }
        return ((resid < 0) ? resid : 0);
}

void __exit
exit_ckrm_mem_res(void)
{
        ckrm_unregister_res_ctlr(&mem_rcbs);
        mem_rcbs.resid = -1;
}

module_init(init_ckrm_mem_res);
module_exit(exit_ckrm_mem_res);

static void
set_flags_of_children(ckrm_mem_res_t *parres, unsigned int flag)
{
        ckrm_mem_res_t *childres;
        ckrm_core_class_t *child = NULL;

        parres->reclaim_flags |= flag;
        ckrm_lock_hier(parres->core);
        while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
                childres = ckrm_get_res_class(child, mem_rcbs.resid,
                                ckrm_mem_res_t);
                set_flags_of_children(childres, flag);
        }
        ckrm_unlock_hier(parres->core);
        return;
}

// FIXME: more attention is needed to this function
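/*
 * A worked example of the thresholds below (numbers illustrative):
 * with pg_guar = 100 and pg_limit = 200, range = 100. Class usage
 * above 210 pages (guar + 110% of range) sets CLS_OVER_110, above 200
 * CLS_OVER_100, above 175 CLS_OVER_75, above 150 CLS_OVER_50, above
 * 125 CLS_OVER_25 and above 100 CLS_OVER_GUAR. Independently, if
 * total usage (including lent pages) is past the 110% mark and the
 * pages lent out exceed guar + 25% of range, the class and its whole
 * subtree are marked CLS_PARENT_OVER.
 */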
static unsigned int
set_usage_flags(ckrm_mem_res_t *res)
{
        int tot_usage, cls_usage, range, guar;

        if (res->pg_limit == CKRM_SHARE_DONTCARE) {
                // no limit is set for the class; don't bother with it
                res->reclaim_flags = 0;
                return res->reclaim_flags;
        }

        tot_usage = atomic_read(&res->pg_total);
        cls_usage = tot_usage - res->pg_lent;
        guar = (res->pg_guar > 0) ? res->pg_guar : 0;
        range = res->pg_limit - guar;

        if ((tot_usage > (guar + ((110 * range) / 100))) &&
                                (res->pg_lent > (guar + ((25 * range) / 100)))) {
                set_flags_of_children(res, CLS_PARENT_OVER);
        }

        if (cls_usage > (guar + ((110 * range) / 100))) {
                res->reclaim_flags |= CLS_OVER_110;
        } else if (cls_usage > (guar + range)) {
                res->reclaim_flags |= CLS_OVER_100;
        } else if (cls_usage > (guar + ((3 * range) / 4))) {
                res->reclaim_flags |= CLS_OVER_75;
        } else if (cls_usage > (guar + (range / 2))) {
                res->reclaim_flags |= CLS_OVER_50;
        } else if (cls_usage > (guar + (range / 4))) {
                res->reclaim_flags |= CLS_OVER_25;
        } else if (cls_usage > guar) {
                res->reclaim_flags |= CLS_OVER_GUAR;
        } else {
                res->reclaim_flags = 0;
        }
        return res->reclaim_flags;
}

/*
 * The functions ckrm_setup_reclamation(), ckrm_teardown_reclamation(),
 * ckrm_get_reclaim_bits() and the macro ckrm_kick_page(), along with
 * the CLS_* macros, define how pages are reclaimed.
 * Keeping this logic behind these interfaces removes the need to
 * change the reclamation code in the VM when we want to change the
 * policy.
 */
unsigned int
ckrm_setup_reclamation(void)
{
        ckrm_mem_res_t *res;
        unsigned int ret = 0;

        spin_lock(&ckrm_mem_lock);
        set_ckrm_tot_pages();
        ckrm_mem_root_class->pg_guar = ckrm_tot_lru_pages;
        ckrm_mem_root_class->pg_unused = ckrm_tot_lru_pages;
        ckrm_mem_root_class->pg_limit = ckrm_tot_lru_pages;
        recalc_and_propagate(ckrm_mem_root_class, NULL);
        list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
                ret |= set_usage_flags(res);
        }
        spin_unlock(&ckrm_mem_lock);
        return ret;
}

void
ckrm_teardown_reclamation(void)
{
        ckrm_mem_res_t *res;

        spin_lock(&ckrm_mem_lock);
        list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
                res->reclaim_flags = 0;
        }
        spin_unlock(&ckrm_mem_lock);
}

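/*
 * Extract the most severe reclaim flags from *flags into *extract.
 * CLS_SHRINK trumps everything. Otherwise fls() finds the highest set
 * bit, and that flag plus every more-significant flag is extracted
 * while the milder bits stay in *flags; e.g., assuming the CLS_OVER_*
 * bits ascend with severity, CLS_OVER_75 | CLS_OVER_25 extracts
 * CLS_OVER_75 and above and leaves CLS_OVER_25 behind.
 */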
void
ckrm_get_reclaim_bits(unsigned int *flags, unsigned int *extract)
{
        int i, j, mask = 0;

        if (*extract == 0 || *flags == 0) {
                return;
        }
        if (*flags & CLS_SHRINK) {
                *extract = CLS_SHRINK;
                *flags = 0;
                return;
        }

        i = fls(*flags);
        for (j = i - 1; j > 0; j--) {
                mask = (mask << 1) | 1;
        }
        *extract = (CLS_FLAGS_ALL & ~mask);
        *flags &= ~*extract;
        return;
}

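/*
 * Called when a class reaches its page limit. Shrink requests are
 * rate-limited to ten per (roughly) ten-second window; within the
 * budget, the class is queued on ckrm_shrink_list, marked
 * MEM_AT_LIMIT and kswapd is woken to start reclaim.
 */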
void
ckrm_at_limit(ckrm_mem_res_t *cls)
{
        struct zone *zone;
        unsigned long now = jiffies;

        if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) ||
                        ((cls->flags & MEM_AT_LIMIT) == MEM_AT_LIMIT)) {
                return;
        }
        if (time_after(now, cls->last_shrink + (10 * HZ))) {
                // more than 10 seconds since the window opened? start a new one
                cls->last_shrink = now;
                cls->shrink_count = 0;
        }
        cls->shrink_count++;
        if (cls->shrink_count > 10) {
                return;
        }
        spin_lock(&ckrm_mem_lock);
        list_add(&cls->shrink_list, &ckrm_shrink_list);
        spin_unlock(&ckrm_mem_lock);
        cls->flags |= MEM_AT_LIMIT;
        for_each_zone(zone) {
                wakeup_kswapd(zone);
                break; // waking one zone's kswapd is enough
        }
}

static int unmapped = 0, changed = 0, unchanged = 0, maxnull = 0,
           anovma = 0, fnovma = 0;
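
/*
 * Recompute the class of an anonymous page: walk every vma on its
 * anon_vma list and give the page to the mapper with the "largest"
 * class as ordered by ckrm_mem_share_compare(). The static counters
 * above are debug statistics for these evaluation passes.
 */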
static void
ckrm_mem_evaluate_page_anon(struct page* page)
{
        ckrm_mem_res_t* pgcls = page_class(page);
        ckrm_mem_res_t* maxshareclass = NULL;
        struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
        struct vm_area_struct *vma;
        struct mm_struct* mm;
        int v = 0;

        spin_lock(&anon_vma->lock);
        BUG_ON(list_empty(&anon_vma->head));
        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
                v++;
                mm = vma->vm_mm;
                if (!maxshareclass ||
                                ckrm_mem_share_compare(maxshareclass, mm->memclass) < 0) {
                        maxshareclass = mm->memclass;
                }
        }
        spin_unlock(&anon_vma->lock);
        if (!v)
                anovma++;

        if (!maxshareclass)
                maxnull++;
        if (maxshareclass && (pgcls != maxshareclass)) {
                ckrm_change_page_class(page, maxshareclass);
                changed++;
        } else
                unchanged++;
        return;
}

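/*
 * The page-cache analogue of the function above: walk the vmas
 * mapping this file offset via the i_mmap prio tree. The mapping lock
 * is only tried, so a contended page is simply left in its current
 * class.
 */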
static void
ckrm_mem_evaluate_page_file(struct page* page)
{
        ckrm_mem_res_t* pgcls = page_class(page);
        ckrm_mem_res_t* maxshareclass = NULL;
        struct address_space *mapping = page->mapping;
        struct vm_area_struct *vma = NULL;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct prio_tree_iter iter;
        struct mm_struct* mm;
        int v = 0;

        if (!mapping)
                return;

        if (!spin_trylock(&mapping->i_mmap_lock))
                return;

        while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
                                        &iter, pgoff, pgoff)) != NULL) {
                v++;
                mm = vma->vm_mm;
                if (!maxshareclass ||
                                ckrm_mem_share_compare(maxshareclass, mm->memclass) < 0)
                        maxshareclass = mm->memclass;
        }
        spin_unlock(&mapping->i_mmap_lock);

        if (!v)
                fnovma++;
        if (!maxshareclass)
                maxnull++;

        if (maxshareclass && pgcls != maxshareclass) {
                ckrm_change_page_class(page, maxshareclass);
                changed++;
        } else
                unchanged++;
        return;
}

static void
ckrm_mem_evaluate_page(struct page* page)
{
        if (page->mapping) {
                if (PageAnon(page))
                        ckrm_mem_evaluate_page_anon(page);
                else
                        ckrm_mem_evaluate_page_file(page);
        } else
                unmapped++;
        return;
}

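/*
 * Walk the active and inactive LRU lists of every zone, re-evaluating
 * the class of each page, and print per-class counts for debugging.
 * Called when a class is freed, so that its pages find new owners.
 */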
static void
ckrm_mem_evaluate_all_pages(void)
{
        struct page *page;
        struct zone *zone;
        int active = 0, inactive = 0, cleared = 0;
        int act_cnt, inact_cnt, idx;
        ckrm_mem_res_t *res;

        spin_lock(&ckrm_mem_lock);
        list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
                res->tmp_cnt = 0;
        }
        spin_unlock(&ckrm_mem_lock);

        for_each_zone(zone) {
                spin_lock_irq(&zone->lru_lock);
                list_for_each_entry(page, &zone->inactive_list, lru) {
                        ckrm_mem_evaluate_page(page);
                        inactive++;
                        page_class(page)->tmp_cnt++;
                        if (!test_bit(PG_ckrm_account, &page->flags))
                                cleared++;
                }
                list_for_each_entry(page, &zone->active_list, lru) {
                        ckrm_mem_evaluate_page(page);
                        active++;
                        page_class(page)->tmp_cnt++;
                        if (!test_bit(PG_ckrm_account, &page->flags))
                                cleared++;
                }
                spin_unlock_irq(&zone->lru_lock);
        }
        printk("all_pages: active %d inactive %d cleared %d\n",
                        active, inactive, cleared);
        spin_lock(&ckrm_mem_lock);
        list_for_each_entry(res, &ckrm_memclass_list, mcls_list) {
                act_cnt = 0; inact_cnt = 0; idx = 0;
                for_each_zone(zone) {
                        act_cnt += res->nr_active[idx];
                        inact_cnt += res->nr_inactive[idx];
                        idx++;
                }
                printk("all_pages: %s: tmp_cnt %d; act_cnt %d inact_cnt %d\n",
                        res->core->name, res->tmp_cnt, act_cnt, inact_cnt);
        }
        spin_unlock(&ckrm_mem_lock);

        // TODO: check all mm's in the system to see which memclass they
        // are attached to.
        return;
}

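/*
 * The three helpers below walk a vma's page tables (pgd -> pmd ->
 * pte) and move every resident page into the mm's current memory
 * class. This is the slow path behind ckrm_mem_evaluate_mm() when an
 * mm changes class.
 */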
static /*inline*/ int
class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma,
                pmd_t* pmdir, unsigned long address, unsigned long end)
{
        pte_t *pte, *orig_pte;
        unsigned long pmd_end;

        if (pmd_none(*pmdir))
                return 0;
        BUG_ON(pmd_bad(*pmdir));

        orig_pte = pte = pte_offset_map(pmdir, address);
        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;

        do {
                if (pte_present(*pte)) {
                        BUG_ON(mm->memclass == NULL);
                        ckrm_change_page_class(pte_page(*pte), mm->memclass);
                        // ckrm_mem_evaluate_page(pte_page(*pte));
                }
                address += PAGE_SIZE;
                pte++;
        } while (address && (address < end));
        pte_unmap(orig_pte);
        return 0;
}

static /*inline*/ int
class_migrate_pgd(struct mm_struct* mm, struct vm_area_struct* vma,
                pgd_t* pgdir, unsigned long address, unsigned long end)
{
        pmd_t* pmd;
        unsigned long pgd_end;

        if (pgd_none(*pgdir))
                return 0;
        BUG_ON(pgd_bad(*pgdir));

        pmd = pmd_offset(pgdir, address);
        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;

        if (pgd_end && (end > pgd_end))
                end = pgd_end;

        do {
                class_migrate_pmd(mm, vma, pmd, address, end);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address && (address < end));
        return 0;
}

static /*inline*/ int
class_migrate_vma(struct mm_struct* mm, struct vm_area_struct* vma)
{
        pgd_t* pgdir;
        unsigned long address, end;

        address = vma->vm_start;
        end = vma->vm_end;

        pgdir = pgd_offset(vma->vm_mm, address);
        do {
                class_migrate_pgd(mm, vma, pgdir, address, end);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        } while (address && (address < end));
        return 0;
}

/* this function is called with mm->peertask_lock held */
void
ckrm_mem_evaluate_mm(struct mm_struct* mm)
{
        struct task_struct *task;
        struct ckrm_mem_res *maxshareclass = NULL;
        struct vm_area_struct *vma;

        if (list_empty(&mm->tasklist)) {
                /* We leave mm->memclass untouched, since an mm with no
                 * associated tasks will either be destroyed soon or be
                 * attached to another task later.
                 */
                return;
        }

        list_for_each_entry(task, &mm->tasklist, mm_peers) {
                ckrm_mem_res_t* cls = GET_MEM_CLASS(task);
                if (!cls)
                        continue;
                if (!maxshareclass || ckrm_mem_share_compare(maxshareclass, cls) < 0)
                        maxshareclass = cls;
        }

        if (maxshareclass && (mm->memclass != (void *)maxshareclass)) {
                if (mm->memclass)
                        mem_class_put(mm->memclass);
                mm->memclass = maxshareclass;
                mem_class_get(maxshareclass);

                /* go through all VMAs to migrate pages */
                down_read(&mm->mmap_sem);
                vma = mm->mmap;
                while (vma) {
                        class_migrate_vma(mm, vma);
                        vma = vma->vm_next;
                }
                up_read(&mm->mmap_sem);
        }
        return;
}

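/*
 * Attach a task to its mm's peer list, presumably when the task first
 * acquires the mm, and re-evaluate the mm's class if it differs from
 * the task's.
 */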
void
ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task)
{
        spin_lock(&mm->peertask_lock);
        if (!list_empty(&task->mm_peers)) {
                printk(KERN_ERR "CKRM_MEM: Task list should be empty, but is not!!\n");
                list_del_init(&task->mm_peers);
        }
        list_add_tail(&task->mm_peers, &mm->tasklist);
        spin_unlock(&mm->peertask_lock);
        if (mm->memclass != GET_MEM_CLASS(task))
                ckrm_mem_evaluate_mm(mm);
        return;
}

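/*
 * Check whether cls is still live, i.e. still on ckrm_memclass_list,
 * so that callers can validate a cached class pointer before using
 * it.
 */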
int
ckrm_memclass_valid(ckrm_mem_res_t *cls)
{
        ckrm_mem_res_t *tmp;

        spin_lock(&ckrm_mem_lock);
        list_for_each_entry(tmp, &ckrm_memclass_list, mcls_list) {
                if (tmp == cls) {
                        spin_unlock(&ckrm_mem_lock);
                        return 1;
                }
        }
        spin_unlock(&ckrm_mem_lock);
        return 0;
}

MODULE_LICENSE("GPL");