/* ckrm_mem.c - Memory Resource Manager for CKRM
 *
 * Copyright (C) Chandra Seetharaman, IBM Corp. 2004
 *
 * Provides a Memory Resource controller for CKRM
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/cache.h>
#include <linux/percpu.h>
#include <linux/pagevec.h>
#include <linux/parser.h>
#include <linux/ckrm_mem_inline.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/errno.h>

#define MEM_NAME "mem"

#define CKRM_MEM_MAX_HIERARCHY 2 // allows only up to 2 levels below the root, i.e. levels 0, 1 & 2

/* all memory resource classes are chained together */
LIST_HEAD(ckrm_memclass_list);
LIST_HEAD(ckrm_shrink_list);
spinlock_t ckrm_mem_lock; // protects both lists above
unsigned int ckrm_tot_lru_pages; // total # of pages in the system
                                 // currently doesn't handle memory add/remove
struct ckrm_mem_res *ckrm_mem_root_class;
atomic_t ckrm_mem_real_count = ATOMIC_INIT(0);
static void ckrm_mem_evaluate_all_pages(struct ckrm_mem_res *);
int ckrm_nr_mem_classes = 0;

EXPORT_SYMBOL_GPL(ckrm_memclass_list);
EXPORT_SYMBOL_GPL(ckrm_shrink_list);
EXPORT_SYMBOL_GPL(ckrm_mem_lock);
EXPORT_SYMBOL_GPL(ckrm_tot_lru_pages);
EXPORT_SYMBOL_GPL(ckrm_mem_root_class);
EXPORT_SYMBOL_GPL(ckrm_mem_real_count);
EXPORT_SYMBOL_GPL(ckrm_nr_mem_classes);

void
memclass_release(struct kref *kref)
{
        struct ckrm_mem_res *cls = container_of(kref, struct ckrm_mem_res, nr_users);
        BUG_ON(ckrm_memclass_valid(cls));
        kfree(cls);
}
EXPORT_SYMBOL_GPL(memclass_release);

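/*
 * Snapshot the system-wide page count used for share calculations
 * (active + inactive + free pages of every zone) into ckrm_tot_lru_pages.
 */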
static void
set_ckrm_tot_pages(void)
{
        struct zone *zone;
        int tot_lru_pages = 0;

        for_each_zone(zone) {
                tot_lru_pages += zone->nr_active;
                tot_lru_pages += zone->nr_inactive;
                tot_lru_pages += zone->free_pages;
        }
        ckrm_tot_lru_pages = tot_lru_pages;
}

/* Initialize rescls values
 * May be called on each rcfs unmount or as part of error recovery
 * to make share values sane.
 * Does not traverse hierarchy reinitializing children.
 */
static void
mem_res_initcls_one(struct ckrm_mem_res *res)
{
        int zindex = 0;
        struct zone *zone;

        memset(res, 0, sizeof(struct ckrm_mem_res));

        res->shares.my_guarantee     = CKRM_SHARE_DONTCARE;
        res->shares.my_limit         = CKRM_SHARE_DONTCARE;
        res->shares.total_guarantee  = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
        res->shares.max_limit        = CKRM_SHARE_DFLT_MAX_LIMIT;
        res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
        res->shares.cur_max_limit    = 0;

        res->pg_guar = CKRM_SHARE_DONTCARE;
        res->pg_limit = CKRM_SHARE_DONTCARE;

        INIT_LIST_HEAD(&res->shrink_list);
        INIT_LIST_HEAD(&res->mcls_list);

        for_each_zone(zone) {
                INIT_LIST_HEAD(&res->ckrm_zone[zindex].active_list);
                INIT_LIST_HEAD(&res->ckrm_zone[zindex].inactive_list);
                INIT_LIST_HEAD(&res->ckrm_zone[zindex].victim_list);
                res->ckrm_zone[zindex].nr_active = 0;
                res->ckrm_zone[zindex].nr_inactive = 0;
                res->ckrm_zone[zindex].zone = zone;
                res->ckrm_zone[zindex].memcls = res;
                zindex++;
        }

        res->pg_unused = 0;
        res->nr_dontcare = 1; // for default class
        kref_init(&res->nr_users);
}

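/*
 * Recompute the implicit guarantee handed to "don't care" children of
 * parres: the parent's unused (or implicit) guarantee is divided evenly
 * among all children whose guarantee is CKRM_SHARE_DONTCARE, and the
 * result is propagated down recursively.
 */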
static void
set_impl_guar_children(struct ckrm_mem_res *parres)
{
        ckrm_core_class_t *child = NULL;
        struct ckrm_mem_res *cres;
        int nr_dontcare = 1; // for default class
        int guar, impl_guar;
        int resid = mem_rcbs.resid;

        ckrm_lock_hier(parres->core);
        while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
                cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
                // treat NULL cres as don't care as that child is just being
                // created.
                // FIXME: need a better way to handle this case.
                if (!cres || cres->pg_guar == CKRM_SHARE_DONTCARE) {
                        nr_dontcare++;
                }
        }

        parres->nr_dontcare = nr_dontcare;
        guar = (parres->pg_guar == CKRM_SHARE_DONTCARE) ?
                        parres->impl_guar : parres->pg_unused;
        impl_guar = guar / parres->nr_dontcare;

        while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
                cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
                if (cres && cres->pg_guar == CKRM_SHARE_DONTCARE) {
                        cres->impl_guar = impl_guar;
                        set_impl_guar_children(cres);
                }
        }
        ckrm_unlock_hier(parres->core);
}

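/*
 * Debugging aid: walk the per-zone CKRM LRU lists of a class and count
 * the pages actually present on them.  The printks comparing the counts
 * against the cached nr_active/nr_inactive are compiled out by default.
 */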
void
check_memclass(struct ckrm_mem_res *res, char *str)
{
        int i, act = 0, inact = 0;
        struct zone *zone;
        struct ckrm_zone *ckrm_zone;
        struct list_head *pos;
        struct page *page;

#if 0
        printk("Check<%s> %s: total=%d\n",
                str, res->core->name, atomic_read(&res->pg_total));
#endif
        for (i = 0; i < MAX_NR_ZONES; i++) {
                act = 0; inact = 0;
                ckrm_zone = &res->ckrm_zone[i];
                zone = ckrm_zone->zone;
                spin_lock_irq(&zone->lru_lock);
                pos = ckrm_zone->inactive_list.next;
                while (pos != &ckrm_zone->inactive_list) {
                        page = list_entry(pos, struct page, lru);
                        pos = pos->next;
                        inact++;
                }
                pos = ckrm_zone->active_list.next;
                while (pos != &ckrm_zone->active_list) {
                        page = list_entry(pos, struct page, lru);
                        pos = pos->next;
                        act++;
                }
                spin_unlock_irq(&zone->lru_lock);
#if 0
                printk("Check<%s>(zone=%d): cached act %ld, inact %ld; counted act %d, inact %d\n",
                        str, i, ckrm_zone->nr_active, ckrm_zone->nr_inactive,
                        act, inact);
#endif
        }
}
EXPORT_SYMBOL_GPL(check_memclass);

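/*
 * Resource controller callback: allocate and initialize the memory
 * resource class for a new core class.  The root class gets all LRU
 * pages as guarantee and limit; a child starts out with an implicit
 * guarantee carved out of its parent's unused share.
 */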
static void *
mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
{
        struct ckrm_mem_res *res, *pres;

        if (mem_rcbs.resid == -1) {
                return NULL;
        }

        pres = ckrm_get_res_class(parent, mem_rcbs.resid, struct ckrm_mem_res);
        if (pres && (pres->hier == CKRM_MEM_MAX_HIERARCHY)) {
                printk(KERN_ERR "MEM_RC: only allows a hierarchy of %d\n",
                                                CKRM_MEM_MAX_HIERARCHY);
                return NULL;
        }

        if (unlikely((parent == NULL) && (ckrm_mem_root_class != NULL))) {
                printk(KERN_ERR "MEM_RC: Only one root class is allowed\n");
                return NULL;
        }

        if (unlikely((parent != NULL) && (ckrm_mem_root_class == NULL))) {
                printk(KERN_ERR "MEM_RC: child class with no root class!!\n");
                return NULL;
        }

        res = kmalloc(sizeof(struct ckrm_mem_res), GFP_ATOMIC);

        if (res) {
                mem_res_initcls_one(res);
                res->core = core;
                res->parent = parent;
                spin_lock_irq(&ckrm_mem_lock);
                list_add(&res->mcls_list, &ckrm_memclass_list);
                spin_unlock_irq(&ckrm_mem_lock);
                if (parent == NULL) {
                        // I am part of the root class. So, set the max to
                        // number of pages available
                        res->pg_guar = ckrm_tot_lru_pages;
                        res->pg_unused = ckrm_tot_lru_pages;
                        res->pg_limit = ckrm_tot_lru_pages;
                        res->hier = 0;
                        ckrm_mem_root_class = res;
                } else {
                        int guar;
                        res->hier = pres->hier + 1;
                        set_impl_guar_children(pres);
                        guar = (pres->pg_guar == CKRM_SHARE_DONTCARE) ?
                                pres->impl_guar : pres->pg_unused;
                        res->impl_guar = guar / pres->nr_dontcare;
                }
                ckrm_nr_mem_classes++;
        } else
                printk(KERN_ERR "MEM_RC: alloc: GFP_ATOMIC failed\n");
        return res;
}

/*
 * It is the caller's responsibility to make sure that the parent only
 * has children that are to be accounted, i.e. if a new child is added
 * this function should be called after it has been added, and if a
 * child is deleted this should be called after the child is removed.
 */
static void
child_maxlimit_changed_local(struct ckrm_mem_res *parres)
{
        int maxlimit = 0;
        struct ckrm_mem_res *childres;
        ckrm_core_class_t *child = NULL;

        // run thru parent's children and get the new max_limit of the parent
        ckrm_lock_hier(parres->core);
        while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
                childres = ckrm_get_res_class(child, mem_rcbs.resid,
                                struct ckrm_mem_res);
                if (childres && (maxlimit < childres->shares.my_limit)) {
                        maxlimit = childres->shares.my_limit;
                }
        }
        ckrm_unlock_hier(parres->core);
        parres->shares.cur_max_limit = maxlimit;
}

/*
 * Recalculate the guarantee and limit in number of pages and propagate
 * the new values to the children.
 * Caller is responsible for protecting res and for the integrity of parres.
 */
static void
recalc_and_propagate(struct ckrm_mem_res *res, struct ckrm_mem_res *parres)
{
        ckrm_core_class_t *child = NULL;
        struct ckrm_mem_res *cres;
        int resid = mem_rcbs.resid;
        struct ckrm_shares *self = &res->shares;

        if (parres) {
                struct ckrm_shares *par = &parres->shares;

                // calculate pg_guar and pg_limit
                //
                if (parres->pg_guar == CKRM_SHARE_DONTCARE ||
                                self->my_guarantee == CKRM_SHARE_DONTCARE) {
                        res->pg_guar = CKRM_SHARE_DONTCARE;
                } else if (par->total_guarantee) {
                        u64 temp = (u64) self->my_guarantee * parres->pg_guar;
                        do_div(temp, par->total_guarantee);
                        res->pg_guar = (int) temp;
                        res->impl_guar = CKRM_SHARE_DONTCARE;
                } else {
                        res->pg_guar = 0;
                        res->impl_guar = CKRM_SHARE_DONTCARE;
                }

                if (parres->pg_limit == CKRM_SHARE_DONTCARE ||
                                self->my_limit == CKRM_SHARE_DONTCARE) {
                        res->pg_limit = CKRM_SHARE_DONTCARE;
                } else if (par->max_limit) {
                        u64 temp = (u64) self->my_limit * parres->pg_limit;
                        do_div(temp, par->max_limit);
                        res->pg_limit = (int) temp;
                } else {
                        res->pg_limit = 0;
                }
        }

        // Calculate unused units
        if (res->pg_guar == CKRM_SHARE_DONTCARE) {
                res->pg_unused = CKRM_SHARE_DONTCARE;
        } else if (self->total_guarantee) {
                u64 temp = (u64) self->unused_guarantee * res->pg_guar;
                do_div(temp, self->total_guarantee);
                res->pg_unused = (int) temp;
        } else {
                res->pg_unused = 0;
        }

        // propagate to children
        ckrm_lock_hier(res->core);
        while ((child = ckrm_get_next_child(res->core, child)) != NULL) {
                cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
                if (cres)
                        recalc_and_propagate(cres, res);
        }
        ckrm_unlock_hier(res->core);
        return;
}

static void
mem_res_free(void *my_res)
{
        struct ckrm_mem_res *res = my_res;
        struct ckrm_mem_res *pres;

        if (!res)
                return;

        ckrm_mem_evaluate_all_pages(res);

        pres = ckrm_get_res_class(res->parent, mem_rcbs.resid,
                        struct ckrm_mem_res);

        if (pres) {
                child_guarantee_changed(&pres->shares,
                                res->shares.my_guarantee, 0);
                child_maxlimit_changed_local(pres);
                recalc_and_propagate(pres, NULL);
                set_impl_guar_children(pres);
        }

        res->shares.my_guarantee = 0;
        res->shares.my_limit = 0;
        res->pg_guar = 0;
        res->pg_limit = 0;
        res->pg_unused = 0;

        spin_lock_irq(&ckrm_mem_lock);
        list_del_init(&res->mcls_list);
        spin_unlock_irq(&ckrm_mem_lock);

        res->core = NULL;
        res->parent = NULL;
        kref_put(&res->nr_users, memclass_release);
        ckrm_nr_mem_classes--;
        return;
}

static int
mem_set_share_values(void *my_res, struct ckrm_shares *shares)
{
        struct ckrm_mem_res *res = my_res;
        struct ckrm_mem_res *parres;
        int rc;

        if (!res)
                return -EINVAL;

        parres = ckrm_get_res_class(res->parent, mem_rcbs.resid,
                        struct ckrm_mem_res);

        rc = set_shares(shares, &res->shares, parres ? &parres->shares : NULL);

        if ((rc == 0) && (parres != NULL)) {
                child_maxlimit_changed_local(parres);
                recalc_and_propagate(parres, NULL);
                set_impl_guar_children(parres);
        }

        return rc;
}

static int
mem_get_share_values(void *my_res, struct ckrm_shares *shares)
{
        struct ckrm_mem_res *res = my_res;

        if (!res)
                return -EINVAL;
        *shares = res->shares;
        return 0;
}

static int
mem_get_stats(void *my_res, struct seq_file *sfile)
{
        struct ckrm_mem_res *res = my_res;
        struct zone *zone;
        int active = 0, inactive = 0, fr = 0;

        if (!res)
                return -EINVAL;

        seq_printf(sfile, "--------- Memory Resource stats start ---------\n");
        if (res == ckrm_mem_root_class) {
                for_each_zone(zone) {
                        active += zone->nr_active;
                        inactive += zone->nr_inactive;
                        fr += zone->free_pages;
                }
                seq_printf(sfile, "System: tot_pages=%d,active=%d,inactive=%d"
                                ",free=%d\n", ckrm_tot_lru_pages,
                                active, inactive, fr);
        }
        seq_printf(sfile, "Number of pages used (including pages lent to"
                        " children): %d\n", atomic_read(&res->pg_total));
        seq_printf(sfile, "Number of pages guaranteed: %d\n",
                        res->pg_guar);
        seq_printf(sfile, "Maximum limit of pages: %d\n",
                        res->pg_limit);
        seq_printf(sfile, "Total number of pages available"
                        " (after serving guarantees to children): %d\n",
                        res->pg_unused);
        seq_printf(sfile, "Number of pages lent to children: %d\n",
                        res->pg_lent);
        seq_printf(sfile, "Number of pages borrowed from the parent: %d\n",
                        res->pg_borrowed);
        seq_printf(sfile, "---------- Memory Resource stats end ----------\n");

        return 0;
}

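/*
 * Resource controller callback invoked when a task moves between
 * classes.  Keep the task on its mm's peer list (unless it is leaving
 * the class system) and re-evaluate which class the mm as a whole
 * should be charged to.
 */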
static void
mem_change_resclass(void *tsk, void *old, void *new)
{
        struct mm_struct *mm;
        struct task_struct *task = tsk, *t1;
        struct ckrm_mem_res *prev_mmcls;

        if (!task->mm || (new == old) || (old == (void *) -1))
                return;

        mm = task->active_mm;
        spin_lock(&mm->peertask_lock);
        prev_mmcls = mm->memclass;

        if (new == NULL) {
                list_del_init(&task->mm_peers);
        } else {
                int found = 0;
                list_for_each_entry(t1, &mm->tasklist, mm_peers) {
                        if (t1 == task) {
                                found++;
                                break;
                        }
                }
                if (!found) {
                        list_del_init(&task->mm_peers);
                        list_add_tail(&task->mm_peers, &mm->tasklist);
                }
        }

        spin_unlock(&mm->peertask_lock);
        ckrm_mem_evaluate_mm(mm, (struct ckrm_mem_res *) new);
        return;
}

#define MEM_FAIL_OVER "fail_over"
#define MEM_SHRINK_AT "shrink_at"
#define MEM_SHRINK_TO "shrink_to"
#define MEM_SHRINK_COUNT "num_shrinks"
#define MEM_SHRINK_INTERVAL "shrink_interval"

int ckrm_mem_fail_over = 110;
int ckrm_mem_shrink_at = 90;
static int ckrm_mem_shrink_to = 80;
static int ckrm_mem_shrink_count = 10;
static int ckrm_mem_shrink_interval = 10;

EXPORT_SYMBOL_GPL(ckrm_mem_fail_over);
EXPORT_SYMBOL_GPL(ckrm_mem_shrink_at);

static int
mem_show_config(void *my_res, struct seq_file *sfile)
{
        struct ckrm_mem_res *res = my_res;

        if (!res)
                return -EINVAL;

        seq_printf(sfile, "res=%s,%s=%d,%s=%d,%s=%d,%s=%d,%s=%d\n",
                MEM_NAME,
                MEM_FAIL_OVER, ckrm_mem_fail_over,
                MEM_SHRINK_AT, ckrm_mem_shrink_at,
                MEM_SHRINK_TO, ckrm_mem_shrink_to,
                MEM_SHRINK_COUNT, ckrm_mem_shrink_count,
                MEM_SHRINK_INTERVAL, ckrm_mem_shrink_interval);

        return 0;
}

// The config file is available only at the root level,
// so my_res is assumed to be the system-level (root) class.
enum memclass_token {
        mem_fail_over,
        mem_shrink_at,
        mem_shrink_to,
        mem_shrink_count,
        mem_shrink_interval,
        mem_err
};

static match_table_t mem_tokens = {
        {mem_fail_over, MEM_FAIL_OVER "=%d"},
        {mem_shrink_at, MEM_SHRINK_AT "=%d"},
        {mem_shrink_to, MEM_SHRINK_TO "=%d"},
        {mem_shrink_count, MEM_SHRINK_COUNT "=%d"},
        {mem_shrink_interval, MEM_SHRINK_INTERVAL "=%d"},
        {mem_err, NULL},
};

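/*
 * Parse a comma-separated list of "token=value" settings written through
 * rcfs, e.g. "fail_over=120,shrink_at=85,shrink_to=70,num_shrinks=5,
 * shrink_interval=20".  Unknown tokens or out-of-range values yield
 * -EINVAL; valid settings are applied as they are parsed.
 */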
static int
mem_set_config(void *my_res, const char *cfgstr)
{
        char *p;
        struct ckrm_mem_res *res = my_res;
        int err = 0, val;

        if (!res)
                return -EINVAL;

        while ((p = strsep((char**)&cfgstr, ",")) != NULL) {
                substring_t args[MAX_OPT_ARGS];
                int token;
                if (!*p)
                        continue;

                token = match_token(p, mem_tokens, args);
                switch (token) {
                case mem_fail_over:
                        if (match_int(args, &val) || (val <= 0)) {
                                err = -EINVAL;
                        } else {
                                ckrm_mem_fail_over = val;
                        }
                        break;
                case mem_shrink_at:
                        if (match_int(args, &val) || (val <= 0)) {
                                err = -EINVAL;
                        } else {
                                ckrm_mem_shrink_at = val;
                        }
                        break;
                case mem_shrink_to:
                        if (match_int(args, &val) || (val < 0) || (val > 100)) {
                                err = -EINVAL;
                        } else {
                                ckrm_mem_shrink_to = val;
                        }
                        break;
                case mem_shrink_count:
                        if (match_int(args, &val) || (val <= 0)) {
                                err = -EINVAL;
                        } else {
                                ckrm_mem_shrink_count = val;
                        }
                        break;
                case mem_shrink_interval:
                        if (match_int(args, &val) || (val <= 0)) {
                                err = -EINVAL;
                        } else {
                                ckrm_mem_shrink_interval = val;
                        }
                        break;
                default:
                        err = -EINVAL;
                }
        }
        return err;
}

static int
mem_reset_stats(void *my_res)
{
        struct ckrm_mem_res *res = my_res;

        if (!res)
                return -EINVAL;
        printk(KERN_INFO "MEM_RC: reset stats called for class %s\n",
                                res->core->name);
        return 0;
}

struct ckrm_res_ctlr mem_rcbs = {
        .res_name          = MEM_NAME,
        .res_hdepth        = CKRM_MEM_MAX_HIERARCHY,
        .resid             = -1,
        .res_alloc         = mem_res_alloc,
        .res_free          = mem_res_free,
        .set_share_values  = mem_set_share_values,
        .get_share_values  = mem_get_share_values,
        .get_stats         = mem_get_stats,
        .change_resclass   = mem_change_resclass,
        .show_config       = mem_show_config,
        .set_config        = mem_set_config,
        .reset_stats       = mem_reset_stats,
};

EXPORT_SYMBOL_GPL(mem_rcbs);

int __init
init_ckrm_mem_res(void)
{
        struct ckrm_classtype *clstype;
        int resid = mem_rcbs.resid;

        set_ckrm_tot_pages();
        spin_lock_init(&ckrm_mem_lock);
        clstype = ckrm_find_classtype_by_name("taskclass");
        if (clstype == NULL) {
                printk(KERN_INFO "Unknown ckrm classtype <taskclass>\n");
                return -ENOENT;
        }

        if (resid == -1) {
                resid = ckrm_register_res_ctlr(clstype, &mem_rcbs);
                if (resid != -1) {
                        mem_rcbs.classtype = clstype;
                }
        }
        return ((resid < 0) ? resid : 0);
}

void __exit
exit_ckrm_mem_res(void)
{
        ckrm_unregister_res_ctlr(&mem_rcbs);
        mem_rcbs.resid = -1;
}

module_init(init_ckrm_mem_res);
module_exit(exit_ckrm_mem_res);

int
ckrm_mem_get_shrink_to(void)
{
        return ckrm_mem_shrink_to;
}

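/*
 * Called when a class has gone over its page limit.  Rate-limited to
 * ckrm_mem_shrink_count attempts per ckrm_mem_shrink_interval seconds,
 * it queues the class on ckrm_shrink_list, marks it MEM_AT_LIMIT and
 * wakes kswapd to start reclaiming its pages.
 */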
void
ckrm_at_limit(struct ckrm_mem_res *cls)
{
        struct zone *zone;
        unsigned long now = jiffies;

        if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) ||
                        ((cls->flags & MEM_AT_LIMIT) == MEM_AT_LIMIT)) {
                return;
        }
        if ((cls->last_shrink > now) /* jiffies wrapped around */ ||
                   (cls->last_shrink + (ckrm_mem_shrink_interval * HZ)) < now) {
                cls->last_shrink = now;
                cls->shrink_count = 0;
        }
        cls->shrink_count++;
        if (cls->shrink_count > ckrm_mem_shrink_count) {
                return;
        }
        spin_lock_irq(&ckrm_mem_lock);
        list_add(&cls->shrink_list, &ckrm_shrink_list);
        spin_unlock_irq(&ckrm_mem_lock);
        cls->flags |= MEM_AT_LIMIT;
        for_each_zone(zone) {
                wakeup_kswapd(zone);
                break; // only once is enough
        }
}

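/*
 * Decide which class an anonymous page should be charged to: walk the
 * anon_vma chain of vmas mapping the page and pick the mm whose class
 * has the largest share (falling back to the root class), then move the
 * page to that class if it is not already there.
 */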
static int
ckrm_mem_evaluate_page_anon(struct page* page)
{
        struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls;
        struct ckrm_mem_res* maxshareclass = NULL;
        struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
        struct vm_area_struct *vma;
        struct mm_struct* mm;
        int ret = 0;

        spin_lock(&anon_vma->lock);
        BUG_ON(list_empty(&anon_vma->head));
        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
                mm = vma->vm_mm;
                if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
                                mm->memclass) < 0) {
                        maxshareclass = mm->memclass;
                }
        }
        spin_unlock(&anon_vma->lock);

        if (!maxshareclass) {
                maxshareclass = ckrm_mem_root_class;
        }
        if (pgcls != maxshareclass) {
                ckrm_change_page_class(page, maxshareclass);
                ret = 1;
        }
        return ret;
}

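/*
 * Same idea for a file-backed page: look up the vmas mapping this file
 * offset through the mapping's prio tree and charge the page to the
 * mapping mm with the largest share.  The i_mmap_lock is only trylocked;
 * if it is contended the page is left where it is.
 */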
static int
ckrm_mem_evaluate_page_file(struct page* page)
{
        struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls;
        struct ckrm_mem_res* maxshareclass = NULL;
        struct address_space *mapping = page->mapping;
        struct vm_area_struct *vma = NULL;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct prio_tree_iter iter;
        struct mm_struct* mm;
        int ret = 0;

        if (!mapping)
                return 0;

        if (!spin_trylock(&mapping->i_mmap_lock))
                return 0;

        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap,
                                        pgoff, pgoff) {
                mm = vma->vm_mm;
                if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
                                mm->memclass) < 0)
                        maxshareclass = mm->memclass;
        }
        spin_unlock(&mapping->i_mmap_lock);

        if (!maxshareclass) {
                maxshareclass = ckrm_mem_root_class;
        }
        if (pgcls != maxshareclass) {
                ckrm_change_page_class(page, maxshareclass);
                ret = 1;
        }
        return ret;
}

static int
ckrm_mem_evaluate_page(struct page* page)
{
        int ret = 0;
        BUG_ON(page->ckrm_zone == NULL);
        if (page->mapping) {
                if (PageAnon(page))
                        ret = ckrm_mem_evaluate_page_anon(page);
                else
                        ret = ckrm_mem_evaluate_page_file(page);
        }
        return ret;
}

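/*
 * Re-evaluate every page currently on this class's per-zone LRU lists,
 * typically when the class is being torn down.  Pages that cannot be
 * re-attributed to another class are handed back to the root class.
 */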
static void
ckrm_mem_evaluate_all_pages(struct ckrm_mem_res* res)
{
        struct page *page;
        struct ckrm_zone *ckrm_zone;
        struct zone *zone;
        struct list_head *pos, *next;
        int i;

        check_memclass(res, "bef_eval_all_pgs");
        for (i = 0; i < MAX_NR_ZONES; i++) {
                ckrm_zone = &res->ckrm_zone[i];
                zone = ckrm_zone->zone;
                spin_lock_irq(&zone->lru_lock);
                pos = ckrm_zone->inactive_list.next;
                while (pos != &ckrm_zone->inactive_list) {
                        next = pos->next;
                        page = list_entry(pos, struct page, lru);
                        if (!ckrm_mem_evaluate_page(page))
                                ckrm_change_page_class(page,
                                                ckrm_mem_root_class);
                        pos = next;
                }
                pos = ckrm_zone->active_list.next;
                while (pos != &ckrm_zone->active_list) {
                        next = pos->next;
                        page = list_entry(pos, struct page, lru);
                        if (!ckrm_mem_evaluate_page(page))
                                ckrm_change_page_class(page,
                                                ckrm_mem_root_class);
                        pos = next;
                }
                spin_unlock_irq(&zone->lru_lock);
        }
        check_memclass(res, "aft_eval_all_pgs");
        return;
}

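/*
 * class_migrate_{pmd,pgd,vma} walk the page tables of a vma and move
 * every present, mapped page that is on a CKRM LRU list into the mm's
 * (new) memory class.
 */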
static inline int
class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma,
                pmd_t* pmdir, unsigned long address, unsigned long end)
{
        pte_t *pte;
        unsigned long pmd_end;

        if (pmd_none(*pmdir))
                return 0;
        BUG_ON(pmd_bad(*pmdir));

        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;

        do {
                pte = pte_offset_map(pmdir, address);
                if (pte_present(*pte)) {
                        struct page *page = pte_page(*pte);
                        BUG_ON(mm->memclass == NULL);
                        if (page->mapping && page->ckrm_zone) {
                                struct zone *zone = page->ckrm_zone->zone;
                                spin_lock_irq(&zone->lru_lock);
                                ckrm_change_page_class(page, mm->memclass);
                                spin_unlock_irq(&zone->lru_lock);
                        }
                }
                address += PAGE_SIZE;
                pte_unmap(pte);
                pte++;
        } while (address && (address < end));
        return 0;
}

static inline int
class_migrate_pgd(struct mm_struct* mm, struct vm_area_struct* vma,
                pgd_t* pgdir, unsigned long address, unsigned long end)
{
        pmd_t* pmd;
        unsigned long pgd_end;

        if (pgd_none(*pgdir))
                return 0;
        BUG_ON(pgd_bad(*pgdir));

        pmd = pmd_offset(pgdir, address);
        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;

        if (pgd_end && (end > pgd_end))
                end = pgd_end;

        do {
                class_migrate_pmd(mm, vma, pmd, address, end);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address && (address < end));
        return 0;
}

static inline int
class_migrate_vma(struct mm_struct* mm, struct vm_area_struct* vma)
{
        pgd_t* pgdir;
        unsigned long address, end;

        address = vma->vm_start;
        end = vma->vm_end;

        pgdir = pgd_offset(vma->vm_mm, address);
        do {
                class_migrate_pgd(mm, vma, pgdir, address, end);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        } while (address && (address < end));
        return 0;
}

/* this function is called with mm->peertask_lock held */
void
ckrm_mem_evaluate_mm(struct mm_struct* mm, struct ckrm_mem_res *def)
{
        struct task_struct *task;
        struct ckrm_mem_res *maxshareclass = def;
        struct vm_area_struct *vma;

        if (list_empty(&mm->tasklist)) {
                /* We leave mm->memclass untouched: an mm with no task
                 * attached is expected either to be freed soon or to be
                 * attached to another task later.
                 */
                return;
        }

        list_for_each_entry(task, &mm->tasklist, mm_peers) {
                struct ckrm_mem_res* cls = ckrm_get_mem_class(task);
                if (!cls)
                        continue;
                if (!maxshareclass ||
                                ckrm_mem_share_compare(maxshareclass, cls) < 0)
                        maxshareclass = cls;
        }

        if (maxshareclass && (mm->memclass != maxshareclass)) {
                if (mm->memclass) {
                        kref_put(&mm->memclass->nr_users, memclass_release);
                }
                mm->memclass = maxshareclass;
                kref_get(&maxshareclass->nr_users);

                /* Go through all VMAs to migrate pages */
                down_read(&mm->mmap_sem);
                vma = mm->mmap;
                while (vma) {
                        class_migrate_vma(mm, vma);
                        vma = vma->vm_next;
                }
                up_read(&mm->mmap_sem);
        }
        return;
}

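/*
 * Attach a task to an mm: put the task on the mm's peer list and, if the
 * task's class differs from the mm's current class, re-evaluate which
 * class the mm should belong to.
 */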
void
ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task)
{
        spin_lock(&mm->peertask_lock);
        if (!list_empty(&task->mm_peers)) {
                printk(KERN_ERR "MEM_RC: Task list NOT empty!! emptying...\n");
                list_del_init(&task->mm_peers);
        }
        list_add_tail(&task->mm_peers, &mm->tasklist);
        spin_unlock(&mm->peertask_lock);
        if (mm->memclass != ckrm_get_mem_class(task))
                ckrm_mem_evaluate_mm(mm, NULL);
        return;
}

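/*
 * A class is considered valid while it is still linked on
 * ckrm_memclass_list; once mem_res_free() has unlinked it, only the
 * outstanding kref keeps the structure alive.
 */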
int
ckrm_memclass_valid(struct ckrm_mem_res *cls)
{
        struct ckrm_mem_res *tmp;
        unsigned long flags;

        if (!cls || list_empty(&cls->mcls_list)) {
                return 0;
        }
        spin_lock_irqsave(&ckrm_mem_lock, flags);
        list_for_each_entry(tmp, &ckrm_memclass_list, mcls_list) {
                if (tmp == cls) {
                        spin_unlock_irqrestore(&ckrm_mem_lock, flags);
                        return 1;
                }
        }
        spin_unlock_irqrestore(&ckrm_mem_lock, flags);
        return 0;
}

MODULE_LICENSE("GPL");